Python数据分析第二周总结
Python数据分析第二周总结
一、数据分析五大步骤
0.加载数据
- read_csv
- read_excel
- read_sql
1.数据抽取
- 布尔索引
- query
- drop
2.数据清洗
- 缺失值:isnull, isna, notnull, notna, dropna, fillna
- 重复值:duplicated, drop_duplicates, nunique
- 异常值:replace, drop
- 检测异常值:Z-score
- IQR
- DBSCAN
- Isolation Forest
- 预处理:apply, transform, applymap
- str: extract(regexp), contains
- dt: year, quarter, month, weekday…
- to_datetime
- cut(data, bins), qcut(data, [0, 0.1, 0.25, 0.5, 0.75, 0.9, 1])
3.数据透视
- 分组:groupby —> agg —> 聚合函数
- 透视表:pivot_table —> index, columns, values, aggfunc
- 交叉表(了解):crosstab
- 排序:sort_values(by, ascending), sort_index(level)
- 取头部:nlargest(n, columns), nsmallest
4.可视化
- 绘图:plot
- kind —> line, scatter, pie, bar, barh, hist, box, …
- figsize
5.业务洞察
二、数据抽取和数据清洗
例一、读取kobe_data.csv文件
# Example 1: load the Kobe Bryant shot log and answer three questions about it.
# Bare expressions such as `df1` are notebook cells whose value is displayed.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df1 = pd.read_csv(r'C:\Users\wby\Desktop\data\csv\kobe_data.csv', index_col='shot_id')
df1

# Summary of the DataFrame (column dtypes, non-null counts, memory usage).
df1.info()

# Do not limit the number of displayed columns.  The fully qualified key is
# required: the short form 'max_columns' matches more than one option in
# current pandas and raises OptionError.
pd.set_option('display.max_columns', None)

# First five rows; head(n) defaults to n=5.
df1.head()

# Last five rows; tail(n) defaults to n=5.
df1.tail()

# The shot action Kobe used most often.
temp = df1.action_type + '-' + df1.combined_shot_type
temp.value_counts().index[0]

# The opponent Kobe faced in the most games: keep one row per game_id,
# then count opponents.
match_index = df1.game_id.drop_duplicates().index
temp = df1.opponent[match_index]
temp.value_counts().index[0]

# Equivalent one-liner for the same question.
df1.drop_duplicates('game_id').opponent.value_counts().index[0]

# Career points excluding free throws: the first character of shot_type is
# converted to the point value, and only made shots (shot_made_flag == 1) count.
df1.shot_type.apply(lambda x: int(x[0]))[df1.shot_made_flag == 1].sum()

# ---- Next section: Example 2 - connecting to a database ----
# Example 2: read tables from MySQL and practise null / duplicate handling.
import pymysql

# NOTE(review): host and credentials are hard-coded in the original notes;
# move them to configuration or environment variables before reusing this.
conn = pymysql.connect(host='47.104.31.138', port=3306,
                       user='guest', password='Guest.618',
                       database='hrs', charset='utf8mb4')
conn

emp_df = pd.read_sql('select * from tb_emp', conn, index_col='eno')
emp_df.info()

# Element-wise null test (isna is an alias of isnull).
emp_df.isnull()
emp_df.isna()

# Element-wise non-null test.
emp_df.notnull()
emp_df.notna()

# Number of nulls in each column.
emp_df.isnull().sum()

# Drop rows containing any null (default axis=0); returns a new frame.
emp_df.dropna()

# Fill nulls by interpolation.
emp_df.comm.interpolate()
emp_df['mgr'] = emp_df.mgr.fillna(-1).astype(np.int64)
emp_df['comm'] = emp_df.comm.interpolate()
emp_df
emp_df.info()

# The keyword is `index_col`; the original `index='dno'` is not accepted
# by read_sql.
dept_df = pd.read_sql('select * from tb_dept', conn, index_col='dno')
dept_df

# Append two rows so the duplicate checks below have something to find.
# The literals are restored from mis-encoded text in the original notes
# (presumably they repeat department names already in tb_dept - confirm).
dept_df.loc[50] = {'dname': '销售部', 'dloc': '长沙'}
dept_df.loc[60] = {'dname': '运维部', 'dloc': '绵阳'}
dept_df

# Boolean duplicate mask on 'dname'; keep='first' is the default.
dept_df.duplicated('dname')

# keep='last': the last occurrence is marked False.
dept_df.duplicated('dname', keep='last')

# keep='first': the first occurrence is marked False.
dept_df.duplicated('dname', keep='first')

# keep=False: every member of a duplicate group is marked True.
dept_df.duplicated('dname', keep=False)

# ---- Next section: Example 3 - reading a video site's operating data (Excel) ----
# Example 3: de-duplicate an Excel export, then build a small sample with two
# planted outliers for the outlier-detection section.
# The file name is restored from mis-encoded text - confirm it against the
# actual file on disk.
df2 = pd.read_excel(r'C:\Users\wby\Desktop\data\excel\某视频网站运营数据.xlsx')
df2.info()

# Shape after removing duplicates by video id / by title.
df2.drop_duplicates('video_id').shape
df2.drop_duplicates('title').shape

# Number of distinct values per column.
df2.nunique()

# 50 draws from N(110, 5), rounded up.
heights = np.ceil(np.random.normal(110, 5, 50))
heights

# Plant one high and one low outlier.
heights[-1] = 193
heights[0] = 80
heights

# Box plot; whis defaults to 1.5, widened to 3 here.
plt.boxplot(heights, whis=3)
plt.show()

# ---- Next section: finding outliers ----
四种方法:1. 1.5倍IQR;2. Z-score判定法;3. DBSCAN聚类;4. 孤立森林
正态分布
# Finding outliers.


def detect_outliers_iqr(data, whis=1.5):
    """Return the elements of *data* that lie outside the IQR fences.

    The fences are ``q1 - whis * iqr`` and ``q3 + whis * iqr``, where q1/q3
    are the 0.25/0.75 quantiles and ``iqr = q3 - q1``.

    :param data: numeric array-like; lists and tuples are converted to a
        NumPy array so the boolean mask below works on them too.
    :param whis: fence multiplier (1.5 by default).
    :return: the subset of *data* outside the fences.
    """
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    q1, q3 = np.quantile(data, [0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - whis * iqr, q3 + whis * iqr
    return data[(data < lower) | (data > upper)]


print(detect_outliers_iqr(heights))


def detect_outliers_zscore(data, threshold=3):
    """Return the elements of *data* whose absolute z-score exceeds *threshold*.

    This is the "3-sigma rule" when ``threshold=3``.

    :param data: numeric array-like; lists and tuples are converted to a
        NumPy array.
    :param threshold: absolute z-score above which a value is an outlier.
    :return: the subset of *data* flagged as outliers (empty when all values
        are identical).
    """
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)
    avg_value = np.mean(data)
    std_value = np.std(data)
    if std_value == 0:
        # All values are equal: nothing can be an outlier, and dividing by
        # zero would only produce NaNs and a runtime warning.
        return data[:0]
    z_score = np.abs((data - avg_value) / std_value)
    return data[z_score > threshold]


print(detect_outliers_zscore(heights))

# Replace the two planted outliers with 110 and redraw the box plot.
new_heights = pd.Series(heights).replace([80, 193], 110)
new_heights.plot(kind='box')

# Alternative: drop the first and last element (where the outliers were put).
plt.boxplot(heights[1:-1])
plt.show()

# ---- Next section: Example 3 - reading bilibili.csv ----
# Example 3: clean the url, watchnum and uptime columns of bilibili.csv.
bilibili_df = pd.read_csv(r'C:\Users\wby\Desktop\data\csv\bilibili.csv', encoding='gbk')
bilibili_df

# Clean up the URLs.
from urllib.parse import urljoin


def fix_url(url):
    """Strip the query string from *url* and resolve it against 'https:'.

    :param url: URL string, possibly with a '?...' query part.
    :return: the URL without its query part, joined onto the 'https:' base.
    """
    pos = url.rfind('?')
    if pos >= 0:
        url = url[:pos]
    return urljoin('https:', url)


bilibili_df['url'] = bilibili_df.url.apply(fix_url)
bilibili_df


# Clean up watchnum.
def handle_watch_num(watchnum):
    """Convert a view-count string with an optional unit suffix to an int.

    A trailing unit character multiplies the number: ten thousand (万/萬)
    or one hundred million (亿/億).  Both Simplified and Traditional forms
    are accepted.

    :param watchnum: string such as '3.5万' or '1200'.
    :return: the count as an integer.
    """
    unit_dict = {'万': 10000, '萬': 10000, '亿': 100000000, '億': 100000000}
    unit = unit_dict.get(watchnum[-1], 1)
    if watchnum[-1] in unit_dict:
        # The original sliced an undefined name (`watch`), raising NameError.
        watchnum = watchnum[:-1]
    # round() rather than int(): the float product can land just below the
    # intended integer, and truncation would then lose one unit.
    return round(float(watchnum) * unit)


bilibili_df['watchnum'] = bilibili_df.watchnum.apply(handle_watch_num)
bilibili_df
bilibili_df.info()

# Parse the upload time so year / month / day can be extracted via .dt.
bilibili_df['uptime'] = pd.to_datetime(bilibili_df.uptime)
bilibili_df.info()
bilibili_df.uptime.dt.year

# ---- Next section: Example 4 - reading lagou.csv ----
# Example 4: filter data-analysis jobs from lagou.csv and normalise the
# salary and work-experience columns.
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = 'STZhongsong'
plt.rcParams['axes.unicode_minus'] = False

# Plain-Python form of the notebook magic
# `%config InlineBackend.figure_format = 'svg'`; skipped outside IPython.
try:
    get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")
except NameError:
    pass

# Read only the columns that are needed.
lagou_df = pd.read_csv(
    r'C:\Users\wby\Desktop\data\csv\lagou.csv',
    usecols=['city', 'companyFullName', 'companyLabelList', 'companySize',
             'district', 'education', 'financeStage', 'industryField',
             'positionName', 'salary', 'workYear'],
)
lagou_df
lagou_df.info()
lagou_df[lagou_df.district.isnull()]

# 0. Keep only data-analysis positions.  The search text is restored from
# mis-encoded text; the garbled form also contained parentheses, which
# str.contains would have treated as regex groups.
temp_df = lagou_df[lagou_df.positionName.str.contains('数据分析')]
temp_df
index_nums = lagou_df[~lagou_df.positionName.str.contains('数据分析')].index
lagou_df.drop(index=index_nums, inplace=True)
lagou_df.shape

# 1. Replace the salary range with the mean of its lower and upper bound.
sals = (
    lagou_df.salary.str.extract(r'(\d+)[kK]?-(\d+)[kK]?')
    .astype(int)
    .mean(axis=1)
)
lagou_df['salary'] = sals
lagou_df


# 2. Replace the work-experience requirement with its lower bound, or 0.
def min_work_year(content):
    """Return the first integer found in *content*, or 0 if there is none.

    :param content: work-experience text.
    :return: the first run of digits as an int, else 0.
    """
    matcher = re.search(r'\d+', content)
    # The original referenced `matcher.group` without calling it, so int()
    # received a bound method and raised TypeError.
    return int(matcher.group()) if matcher else 0


lagou_df['workYear'] = lagou_df.workYear.apply(min_work_year)
lagou_df
lagou_df.salary.mean()
lagou_df.workYear.mean()

# ---- Next section: applying functions to a DataFrame ----
# apply / transform / element-wise mapping on a small random DataFrame.
temp_df = pd.DataFrame(data=np.random.randint(30, 101, (5, 3)))
temp_df.mean()
temp_df.apply(np.mean)
temp_df.apply([np.mean])
temp_df.apply([np.mean, np.std, np.max, np.min])

# Here the lambda receives a whole column (a Series).
temp_df.apply(lambda x: (x ** 0.5 * 10).astype(int))

# transform also passes a whole column, but the function must not reduce it.
# (The original misspelled the method as `transfrom`.)
temp_df.transform(lambda x: (x ** 0.5 * 10).astype(int))

# Here the lambda receives a single value.  DataFrame.applymap is
# deprecated in favour of DataFrame.map in recent pandas, so use whichever
# exists.  (The original call was also missing its closing parenthesis.)
elementwise = temp_df.map if hasattr(temp_df, 'map') else temp_df.applymap
elementwise(lambda x: int(x ** 0.5 * 10))

# Raises ValueError, because np.mean reduces each column to a scalar:
# temp_df.transform(np.mean)

# ---- Summary: ----
数据序列三个添加函数方法:apply, map, transform
DataFrame三个添加函数方法:apply, transform, applymap
transform的参数不能带规约性质,否则报错,传入几个参数,就返回几个结果
例五、读取2020年销售数据
# Example 5: 2020 sales data - derive date parts and revenue, then group
# and pivot.
# NOTE(review): file names and column names below are restored from
# mis-encoded text in the original notes; confirm them against the workbook.
# Columns are accessed with brackets because the garbled attribute names
# could not be parsed.
sales_df = pd.read_excel(r'C:\Users\wby\Desktop\2020年销售数据-2.xlsx')
sales_df
sales_df.info()

sales_df['年'] = sales_df['销售日期'].dt.year
sales_df['季度'] = sales_df['销售日期'].dt.quarter
sales_df['月'] = sales_df['销售日期'].dt.month
sales_df['销售额'] = sales_df['售价'] * sales_df['销售数量']
sales_df

# Write the enriched table back to Excel.
sales_df.to_excel(r'C:\Users\wby\Desktop\2020年销售数据-3.xlsx')

# 1. Monthly revenue.
sales_df.groupby('月')['销售额'].sum()
sales_df.groupby('月')['销售额'].agg([np.sum, np.max, np.min])

# 2. Revenue share by brand.
ser = sales_df.groupby('品牌')['销售额'].sum()
ser.plot(kind='pie', autopct='%.1f%%', counterclock=False)
plt.show()

# 3. Monthly revenue per sales region.
sales_df.groupby(['销售区域', '月'])['销售额'].sum()

# The same figures as a pivot table with a totals row and column.
pd.pivot_table(sales_df, index=['销售区域'], columns=['月'], values=['销售额'],
               aggfunc=np.sum, fill_value=0, margins=True, margins_name='总计')

# 4. Units sold per channel and brand.  (The original misspelled groupby.)
sales_df.groupby(['销售渠道', '品牌'])['销售数量'].sum()
pd.pivot_table(sales_df, index='销售渠道', columns='品牌', values='销售数量',
               aggfunc=np.sum)

# 5. Monthly share of units sold per price band.
# Bin the unit price into left-closed intervals.
bins = [0, 100, 300, 500, 800, 1200, 10000]
cate = pd.cut(sales_df['售价'], bins, right=False)
cate
sales_df['价格区间'] = cate
sales_df

temp_df = pd.pivot_table(sales_df, index='价格区间', columns='月',
                         values='销售数量', aggfunc=np.sum,
                         margins=True, margins_name='总计')
temp_df

# Column sums excluding the margin row and margin column.
ser = temp_df.iloc[:-1, :-1].sum()
# NOTE(review): `ser` has no entry for the margin column, so after label
# alignment that column becomes NaN in the result - confirm this is intended.
temp_df = np.round(temp_df.divide(ser) * 100, 2)
temp_df

# Render every cell as a percentage string.  (The original lambda had a
# stray quote.)  Mapping column by column avoids DataFrame.applymap, which
# is deprecated in recent pandas.
temp_df.apply(lambda col: col.map(lambda x: f'{x}%'))

# ---- Next section: Part 3 - other DataFrame methods ----
# explode: turn each element of a list-valued cell into its own row.
df1 = pd.DataFrame({
    'col1': [[1, 2, 3], [4, 5, 6]],
    'col2': [[0] * 3, [1] * 5],
})
df1
df1.col1.explode()
df1.explode('col1', ignore_index=True)
df1.col2.explode()
# Exploding both columns in sequence yields every (col1, col2) pairing
# within each original row.
df1.explode('col1').explode('col2')

# rolling: mean over a sliding window of three values.
ser1 = pd.Series(np.random.randint(10, 100, 10))
ser1
ser1.rolling(3).mean()

# ---- Next section: Index objects ----
1.范围索引
# Range index: label twelve random sales figures with months 1..12.
sales_data = np.random.randint(400, 1000, 12)
month_index = pd.RangeIndex(1, 13, name='月份')
ser = pd.Series(data=sales_data, index=month_index)
ser
ser.index

# ---- Next section: 2. Multi-level index ----
# Multi-level index: (student id, term) pairs built as a Cartesian product.
# Level and column names are restored from mis-encoded text in the
# original notes.
ids = np.arange(1001, 1006)
sms = ['期中', '期末']
index = pd.MultiIndex.from_product((ids, sms), names=['学号', '学期'])
courses = ['语文', '数学', '英语']
scores = np.random.randint(60, 101, (10, 3))
df = pd.DataFrame(data=scores, columns=courses, index=index)
df

# Move the second index level (the term) back into an ordinary column.
df.reset_index(level=1)

# ---- Next section: ways of selecting time-series data ----
# Ten evenly spaced timestamps between the two dates.
pd.date_range('2021-1-1', '2021-6-1', periods=10)

# One timestamp per week between the two dates.
temp = pd.date_range('2021-1-1', '2021-6-1', freq='W')
temp
temp - pd.DateOffset(days=2)
temp + pd.DateOffset(days=2)

import pandas_datareader as pdr

# Daily BIDU quotes from Stooq (requires network access), sorted by date.
baidu_df = pdr.get_data_stooq('BIDU', start='2021-11-1', end='2021-12-8')
baidu_df.sort_index(inplace=True)
baidu_df

# 5-row moving average of the closing price.
baidu_df.Close.rolling(5).mean()

baidu_df.Close.plot(kind='line')
plt.yticks(np.arange(100, 181, 10))
plt.show()

# Shift every row down by three positions, filling the vacated rows with 0.
baidu_df.shift(3, fill_value=0)

# Sample every 10 days; forward-fill when a date has no row.
baidu_df.asfreq('10D', method='ffill')

# Monthly mean.
# NOTE(review): recent pandas versions prefer the alias 'ME' over 'M' for
# month-end resampling - confirm against the installed version.
baidu_df.resample('1M').mean()

# Total volume per 10-day bucket.
baidu_df.resample('10D').Volume.sum()

# ---- Next section: Part 4 - plotting with pyecharts ----
例一、绘制柱状图
# Example 1: grouped bar chart with pyecharts.
from pyecharts.charts import Bar
from pyecharts import options as opts
# Built-in theme names are listed in pyecharts.globals.ThemeType.
from pyecharts.globals import ThemeType

# Labels and titles are restored from mis-encoded text in the original notes.
bar = Bar()
bar.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
bar.add_yaxis("商家A", [5, 20, 36, 10, 75, 90], color='blue')
bar.add_yaxis("商家B", [15, 6, 45, 20, 35, 66], color='red')
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="销售统计图", subtitle="服装销售统计图",
                              pos_left='12%')
)
bar.render_notebook()

# ---- Next section: Example 2 - candlestick (K-line) chart from stock data ----
# Example 2: candlestick chart of Alibaba's 2020 prices.
# The file name and chart title are restored from mis-encoded text -
# confirm the file name against the actual file on disk.
from pyecharts import options as opts
from pyecharts.charts import Kline

df1 = pd.read_excel(
    r'C:\Users\wby\Desktop\data\excel\阿里巴巴2020年股票数据.xlsx',
    usecols=['Date', 'Open', 'Close', 'Low', 'High'],
    index_col='Date',
)


def format_date(curr_date):
    """Format a date as two-digit 'yy/mm/dd' for the chart's x axis.

    :param curr_date: object with a ``strftime`` method.
    :return: the formatted date string.
    """
    return curr_date.strftime('%y/%m/%d')


# Kline expects each data item as [open, close, low, high].  Select the
# columns by name: the original positional reorder ([[2, 0, 3, 1]]) only
# produced this order for one particular column layout in the file.
df1 = df1[['Open', 'Close', 'Low', 'High']]
x = [format_date(d) for d in df1.index.tolist()]
y = np.round(df1.values, 2).tolist()

c = Kline()
c.add_xaxis(x)
c.add_yaxis(
    "kline",
    y,
    markline_opts=opts.MarkLineOpts(
        data=[opts.MarkLineItem(type_="max", value_dim="close")]
    ),
)
c.set_global_opts(
    xaxis_opts=opts.AxisOpts(is_scale=True),
    yaxis_opts=opts.AxisOpts(
        is_scale=True,
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
    title_opts=opts.TitleOpts(title="阿里巴巴2020年股票数据"),
    datazoom_opts=opts.DataZoomOpts(),
)
c.render_notebook()

# ---- Next section: Part 5 - scatter plots with seaborn ----
# Pairwise scatter plots of the tip amount against bill total and party size.
import ssl

# WARNING: this disables HTTPS certificate verification for the whole
# process.  NOTE(review): the data below is read from a local file, so this
# line looks unnecessary here - consider removing it.
ssl._create_default_https_context = ssl._create_unverified_context

import seaborn as sns

df = pd.read_csv(r'C:\Users\wby\Desktop\tips.csv')
# Encode the two categorical columns as 0/1.
df['sex'] = df.sex.apply(lambda x: 0 if x == 'Female' else 1)
df['smoker'] = df.smoker.apply(lambda x: 0 if x == 'No' else 1)

sns.set_theme(style="ticks")
sns.pairplot(df, y_vars="tip", x_vars=["total_bill", "size"])
总结
以上是生活随笔为你收集整理的Python数据分析第二周总结的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: Android 截图,截取指定view截
- 下一篇: python控制苹果手机触摸屏失灵怎么办