section8
本章節的目的是【明確目標用戶群】,以更好地服務現有用戶。
【知識點】
1.作圖
- 顯示中文
plt.rcParams['font.sans-serif'] = ['SimHei'] # 步驟一(替換sans-serif字體) plt.rcParams['axes.unicode_minus'] = False # 步驟二(解決坐標(biāo)軸負(fù)數(shù)的負(fù)號(hào)顯示問(wèn)題)
2.數據庫操作
- sqlalchemy 引擎
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
3.批量讀取文件
- os.walk()、os.path.join() 用法
4.groupby() 以及 agg() 的聯合使用,應對不同列使用不同的函數
- 按月統計
affc = {'payment':'sum', 'log_date':'count'} dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
- 修改列名
renam = {'log_date':'access_days'} dfm.rename(columns=renam, inplace=True)
5.KMeans 聚類的使用
- 單列的聚類(需要將單列應用 reshape(-1, 1) 格式化為 1 列;新版 pandas 中 Series.reshape 已棄用,應寫作 .values.reshape(-1, 1))
from sklearn.cluster import KMeans a47 = action['A47'].reshape(-1, 1) kms = KMeans(n_clusters=3).fit(a47)
- 聚類的標簽 labels_ 屬性
cluster = kms.labels_
- 將標簽添加至源數據中,運用 groupby() 查看分組情況
action['cluster'] = cluster action.groupby(['cluster'])['user_id'].count()
- 可視化分組
6.主成分分析
數據預處理
- 提取要進行主成分分析的列
paction = acc.iloc[:,3:(len(acc.columns)-1)]
- 提取要進行主成分分析的列
- 刪掉0值較多的列
cc = paction[paction==0].count(axis=0)/len(paction) cc.plot() dd = cc[cc<.9] #刪掉該列中90%以上都是0值的列 paction = paction[dd.index] paction.head()
- 刪掉0值較多的列
刪掉相關性較強的列
# 數(shù)據(jù)概覽 corp = paction.corr() sns.heatmap(corp) mask = np.array(corp) mask[np.tril_indices_from(mask)] = False # 畫(huà)下三角heatmap的方法 sns.heatmap(corp,mask=mask)# 通過(guò)下三角矩陣的方式,刪掉相關(guān)性較強(qiáng)的數(shù)據(jù)列 coll = corp.columns corp = pd.DataFrame(np.tril(corp, -1)) # 應(yīng)用 np.tril(m, -1) 函數(shù)獲取下三角,上三角數(shù)據(jù)全部置為0 corp.columns = coll pac2 = paction.loc[:,(corp.abs()<.8).all()] # 任何一個(gè)數(shù)都小于 0.8 的數(shù)據(jù) all() 函數(shù) pac2.head()
進行主成分分析
from sklearn.decomposition import PCA pca = PCA() pca.fit(pac2)redio = pca.explained_variance_ratio_ # pca.explained_variance_ratio_ 是PCA降維后的矩陣課解釋性比率 print(redio) print(pca.singular_values_) # singular_values_ 是奇異值矩陣
主成分的可解釋性曲線
recu = redio.cumsum() # 應(yīng)用 cumsum() 函數(shù)進(jìn)行逐數(shù)據(jù)累加 plt.plot(recu)
獲取降維后的數據以進行下一步
pca.set_params(n_components=10) # 設(shè)置 維度 為 10 pac3 = pd.DataFrame(pca.fit_transform(pac2)) # 使用fit_transform()函數(shù)訓(xùn)練并獲得降維后的數(shù)據(jù) pac3.head()
- 繼續應用 KMeans 進行聚類,得到所有用戶的 分類,然后再對每個分類中所有用戶的每個行為值取 平均
- 繼續應用相關性 刪除 相關性強的列,獲得最后 主要觀察指標
對主要觀察指標進行 雷達圖 展示
# 首先,對(duì)數(shù)據(jù)進(jìn)行標(biāo)準(zhǔn)化處理 from sklearn.preprocessing import scale ccccc = pd.DataFrame(scale(cccc)) ccccc.columns = cccc.columns# 畫(huà)圖 plt.figure(figsize=(8,8)) N = ccccc.shape[1] # 極坐標(biāo)的分割分?jǐn)?shù) angles = np.linspace(0, 2*np.pi, N, endpoint=False) # 設(shè)置雷達(dá)圖的角度,用于平分切開(kāi)一個(gè)圓面 angles = np.concatenate((angles,[angles[0]])) # 使雷達(dá)圖一圈封閉起來(lái) for i in range(len(ccccc)):values = ccccc.loc[i,:] # 構(gòu)造數(shù)據(jù)values = np.concatenate((values,[values[0]])) # 為了使雷達(dá)圖一圈封閉起來(lái)plt.polar(angles, values, 'o-', linewidth=2) # 繪制 plt.legend(ccccc.index, loc='lower right') plt.thetagrids(angles * 180/np.pi, labels=list(ccccc.columns)) # 添加極坐標(biāo)的標(biāo)簽 plt.title('重要指標(biāo)雷達(dá)圖呈現(xiàn)')
一、庫導入以及 matplotlib 顯示中文
import pandas as pd import numpy as np import pymysql from sqlalchemy import create_engine import matplotlib.pyplot as plt import seaborn as sns import missingno as msno import osplt.rcParams['font.sans-serif'] = ['SimHei'] # 步驟一(替換sans-serif字體) plt.rcParams['axes.unicode_minus'] = False # 步驟二(解決坐標(biāo)軸負(fù)數(shù)的負(fù)號(hào)顯示問(wèn)題) %matplotlib inline數(shù)據(jù)庫(kù)引擎
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')二、批量讀取文件
def read_files(path):df = pd.DataFrame()for root, dirs, files in os.walk(path):for file in files:rfile = os.path.join(root,file)if rfile.split('.')[-1] == 'tsv':rdf = pd.read_csv(rfile, sep='\t')df = df.append(rdf)return df action_path = 'data/sample-data/section8/daily/action/' dau_path = 'data/sample-data/section8/daily/dau/' dpu_path = 'data/sample-data/section8/daily/dpu/'action = read_files(action_path) dau = read_files(dau_path) dpu = read_files(dpu_path)查看數(shù)據(jù)完整性以及頭部信息
print(action.isnull().sum().sum()) print(action.shape) # print(action.info()) action.head() 0 (2653, 57)| 2013-10-31 | game-01 | 654133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 380 | 25655 | 0 | 0 | 0 | 0 | 0.0 | 46 |
| 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
| 2013-10-31 | game-01 | 709596 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 416 | 24817 | 0 | 0 | 0 | 0 | 0.0 | 2 |
| 2013-10-31 | game-01 | 525047 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | ... | 22 | 22 | 35200 | 6412 | 21 | 0 | 0 | 0 | 0.0 | 109 |
| 2013-10-31 | game-01 | 796908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 29 | 388 | 25444 | 1 | 0 | 0 | 0 | 0.0 | 64 |
5 rows × 57 columns
print(dau.isnull().sum().sum()) print(dau.shape) print(dau.info()) dau.head() 0 (509754, 3) <class 'pandas.core.frame.DataFrame'> Int64Index: 509754 entries, 0 to 2410 Data columns (total 3 columns): log_date 509754 non-null object app_name 509754 non-null object user_id 509754 non-null int64 dtypes: int64(1), object(2) memory usage: 15.6+ MB None| 2013-05-01 | game-01 | 608801 |
| 2013-05-01 | game-01 | 712453 |
| 2013-05-01 | game-01 | 776853 |
| 2013-05-01 | game-01 | 823486 |
| 2013-05-01 | game-01 | 113600 |
| 2013-05-01 | game-01 | 804005 | 571 |
| 2013-05-01 | game-01 | 793537 | 81 |
| 2013-05-01 | game-01 | 317717 | 81 |
| 2013-05-01 | game-01 | 317717 | 81 |
| 2013-05-01 | game-01 | 426525 | 324 |
三、數據預處理
1.合并 DAU DPU
df = pd.merge(dau, dpu[['log_date','user_id','payment']], how='left', on=['user_id','log_date']) df.head()| 2013-05-01 | game-01 | 608801 | NaN |
| 2013-05-01 | game-01 | 712453 | NaN |
| 2013-05-01 | game-01 | 776853 | NaN |
| 2013-05-01 | game-01 | 823486 | NaN |
| 2013-05-01 | game-01 | 113600 | NaN |
| 2013-05-01 | game-01 | 608801 | 0.0 | 0 |
| 2013-05-01 | game-01 | 712453 | 0.0 | 0 |
| 2013-05-01 | game-01 | 776853 | 0.0 | 0 |
| 2013-05-01 | game-01 | 823486 | 0.0 | 0 |
| 2013-05-01 | game-01 | 113600 | 0.0 | 0 |
2.按月統計
# 增加月份列 df['log_month'] = df['log_date'].apply(lambda x: x[0:7]) df.head()| 2013-05-01 | game-01 | 608801 | 0.0 | 0 | 2013-05 |
| 2013-05-01 | game-01 | 712453 | 0.0 | 0 | 2013-05 |
| 2013-05-01 | game-01 | 776853 | 0.0 | 0 | 2013-05 |
| 2013-05-01 | game-01 | 823486 | 0.0 | 0 | 2013-05 |
| 2013-05-01 | game-01 | 113600 | 0.0 | 0 | 2013-05 |
巧妙運用 groupby 以及 agg 函數,統計出用戶按月份的 消費情況 和 登錄次數
# 按月統(tǒng)計(jì) affc = {'payment':'sum', 'log_date':'count'} dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index() # 修改列明 renam = {'log_date':'access_days'} dfm.rename(columns=renam, inplace=True) dfm.head()| 2013-05 | 65 | 0.0 | 1 |
| 2013-05 | 115 | 0.0 | 1 |
| 2013-05 | 194 | 0.0 | 1 |
| 2013-05 | 426 | 0.0 | 4 |
| 2013-05 | 539 | 0.0 | 1 |
4.使用 KMeans 進行分類,得到排名靠前的用戶,即 重度用戶/中度用戶/輕度用戶
A47 列即是排行榜得分,從分布圖上看出,大部分用戶得分很低,符合冪律曲線
# action['A47'].hist(bins=50, figsize=(6,4)) <matplotlib.axes._subplots.AxesSubplot at 0x1c21d894240> sns.distplot(action['A47'],bins=50,kde=True) <matplotlib.axes._subplots.AxesSubplot at 0x1c21af07a58>對(duì) A47 列進(jìn)行聚類,分為3類
from sklearn.cluster import KMeansa47 = action['A47'].reshape(-1, 1)kms = KMeans(n_clusters=3).fit(a47) D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) insteadThis is separate from the ipykernel package so we can avoid doing imports until cluster = kms.labels_ kms.cluster_centers_ array([[ 9359.84787792],[ 69386.11297071],[185857.17948718]]) action['cluster'] = cluster action.head()| 2013-10-31 | game-01 | 654133 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 380 | 25655 | 0 | 0 | 0 | 0 | 0.0 | 46 | 0 |
| 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 | 2 |
| 2013-10-31 | game-01 | 709596 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 416 | 24817 | 0 | 0 | 0 | 0 | 0.0 | 2 | 0 |
| 2013-10-31 | game-01 | 525047 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | ... | 22 | 35200 | 6412 | 21 | 0 | 0 | 0 | 0.0 | 109 | 0 |
| 2013-10-31 | game-01 | 796908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 29 | 388 | 25444 | 1 | 0 | 0 | 0 | 0.0 | 64 | 0 |
5 rows × 58 columns
action.groupby(['cluster'])['user_id'].count() cluster 0 2096 1 479 2 78 Name: user_id, dtype: int64
圖上顯示,通過聚類分解后用戶分為 3 個類:0 表示輕度用戶,排行榜得分最少;1 表示中度用戶,排行榜得分居中;2 表示重度用戶,排行榜得分較高,而且用戶數量較少,符合實際情況。
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False) snsdf['user'] = range(len(snsdf)) sns.scatterplot(x='user',y='A47',hue='cluster',data=snsdf, palette='rainbow', alpha=.2) <matplotlib.axes._subplots.AxesSubplot at 0x1c21b9bf898> snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False) snsdf['user'] = range(len(snsdf))plt.figure(figsize=(8,5)) snsdf1 = snsdf.reset_index() snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用戶') snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用戶') snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:輕度用戶') plt.legend() plt.xlabel('用戶分布') plt.ylabel('排行榜得分') Text(0,0.5,'排行榜得分')限定排名靠前的用戶,即得分較高的重度和中度用戶,以便接下來(lái)進(jìn)行分析
acc = action[action['cluster']>=1] acc.head()| 2013-10-31 | game-01 | 425530 | 0 | 0 | 0 | 0 | 10 | 1 | 233 | ... | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 | 2 |
| 2013-10-31 | game-01 | 776120 | 0 | 0 | 0 | 0 | 9 | 0 | 0 | ... | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 | 2 |
| 2013-10-31 | game-01 | 276197 | 0 | 0 | 0 | 0 | 7 | 0 | 58 | ... | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 | 1 |
| 2013-10-31 | game-01 | 221572 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 | 1 |
| 2013-10-31 | game-01 | 692433 | 0 | 0 | 0 | 0 | 6 | 0 | 0 | ... | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 | 1 |
5 rows × 58 columns
5.主成分分析
獲取關鍵的參數
paction = acc.iloc[:,3:(len(acc.columns)-1)] paction.index=acc.user_id paction.head()| 0 | 0 | 0 | 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
| 0 | 0 | 0 | 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
| 0 | 0 | 0 | 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
| 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 54 columns
1.刪掉 0 值比較多的列
cc = paction[paction==0].count(axis=0)/len(paction) print(cc.head()) cc.plot() A1 1.000000 A2 0.926391 A3 1.000000 A4 0.994614 A5 0.055655 dtype: float64<matplotlib.axes._subplots.AxesSubplot at 0x1c21bbb1470> # cc[cc>.8] dd = cc[cc<.95] paction = paction[dd.index] paction.head()| 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | 19 | 2 | 19 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
| 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | 38 | 8 | 19 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
| 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | 15 | 3 | 11 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
| 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | 0 | 0 | 3 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
| 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | 0 | 0 | 2 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 32 columns
2.刪掉相關性較強的列
corp = paction.corr() plt.figure(figsize=(15,8)) sns.heatmap(corp) <matplotlib.axes._subplots.AxesSubplot at 0x1c21bc094a8>畫(huà)下三角heatmap,使用到的函數(shù)
mask = np.array(corp) mask[np.tril_indices_from(mask)] = False fig,ax = plt.subplots() fig.set_size_inches(15,8) sns.heatmap(corp,mask=mask) <matplotlib.axes._subplots.AxesSubplot at 0x1c21bc09400>
獲取矩陣的下三角使用 np.tril(m, -1);如果要獲取上三角的話,使用 np.triu(m, 1)
coll = corp.columns corp = pd.DataFrame(np.tril(corp, -1)) corp.columns = coll corp.head()| 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0.069744 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0.076185 | 0.178833 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0.158735 | 0.219395 | 0.371360 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 0.167200 | 0.186124 | 0.242025 | 0.803161 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 32 columns
pac2 = paction.loc[:,(corp.abs()<.7).all()] # 任何一個(gè)數(shù)都小于0.7 的數(shù)據(jù) pac2.head()| 0 | 19 | 2 | 19 | 0 | 0 | 0.5 | 23 | 0.92174 | 20 | 347 | 36 | 22 | 4 | 0.0 | 71 |
| 0 | 38 | 8 | 19 | 0 | 0 | 0.0 | 20 | 0.90256 | 38 | 684 | 37 | 15 | 0 | 0.0 | 312 |
| 0 | 15 | 3 | 11 | 0 | 0 | 0.0 | 10 | 0.92000 | 15 | 4226 | 15 | 0 | 8 | 0.0 | 95 |
| 0 | 0 | 0 | 3 | 0 | 0 | 0.0 | 2 | 0.85714 | 24 | 5792 | 4 | 0 | 0 | 0.0 | 21 |
| 0 | 0 | 0 | 2 | 0 | 0 | 0.0 | 11 | 0.73684 | 28 | 4549 | 16 | 8 | 0 | 0.0 | 154 |
進行主成分分析
from sklearn.decomposition import PCA pca = PCA() pca.fit(pac2) PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,svd_solver='auto', tol=0.0, whiten=False) redio = pca.explained_variance_ratio_ print(redio) print(pca.singular_values_) [9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-052.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-063.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-083.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09] [3.96183910e+04 1.73797668e+03 4.34684952e+02 2.96004755e+022.05284590e+02 1.55911168e+02 1.21032418e+02 8.30848288e+016.89599635e+01 3.62791414e+01 1.44027941e+01 1.24044853e+017.79687146e+00 6.80796010e+00 5.35458829e+00 3.44523057e+00] recu = redio.cumsum() print(recu) x = np.arange(len(recu)) plt.plot(recu, color='r') [0.9978438 0.99976405 0.99988417 0.99993987 0.99996666 0.999982120.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.999999910.99999994 0.99999997 0.99999999 1. ][<matplotlib.lines.Line2D at 0x1c21dadada0>]得到降維后的數(shù)據(jù)
pca.set_params(n_components=10) pac3 = pd.DataFrame(pca.fit_transform(pac2)) pacsse = pac3.copy() pac3.head()| 2706.266005 | -100.824346 | -1.874787 | -1.577536 | 12.481591 | -2.394320 | 9.770878 | 7.807535 | 0.021273 | -2.169596 |
| 2373.811140 | 147.314930 | -16.386795 | -8.428655 | 10.019577 | -3.004725 | 6.009771 | 0.961469 | -1.598531 | 2.144615 |
| -1171.733361 | -5.493081 | 0.744995 | 0.542033 | -0.785251 | -5.756412 | -1.012336 | -1.778067 | 7.256884 | 0.343277 |
| -2738.903900 | -50.468487 | 2.328491 | 2.965415 | -5.794347 | 11.891289 | 2.965366 | -1.182413 | 0.065619 | 1.245358 |
| -1493.642618 | 58.686385 | -10.807612 | 11.777973 | 7.664692 | 9.312968 | 4.376429 | 1.994214 | -1.568050 | 0.426246 |
6.KMeans 進行聚類
from sklearn.cluster import KMeanskm = KMeans(n_clusters=5) km.fit(pac3) KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',random_state=None, tol=0.0001, verbose=0) clu = km.labels_ pac3['clu'] = clu pac3.head()| 2706.266005 | -100.824346 | -1.874787 | -1.577536 | 12.481591 | -2.394320 | 9.770878 | 7.807535 | 0.021273 | -2.169596 | 0 |
| 2373.811140 | 147.314930 | -16.386795 | -8.428655 | 10.019577 | -3.004725 | 6.009771 | 0.961469 | -1.598531 | 2.144615 | 0 |
| -1171.733361 | -5.493081 | 0.744995 | 0.542033 | -0.785251 | -5.756412 | -1.012336 | -1.778067 | 7.256884 | 0.343277 | 1 |
| -2738.903900 | -50.468487 | 2.328491 | 2.965415 | -5.794347 | 11.891289 | 2.965366 | -1.182413 | 0.065619 | 1.245358 | 4 |
| -1493.642618 | 58.686385 | -10.807612 | 11.777973 | 7.664692 | 9.312968 | 4.376429 | 1.994214 | -1.568050 | 0.426246 | 1 |
#### palette 的顏色風格:
Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r
將分類后的類別添加至原數據中
pac4 = pac2.copy() pac4['cluster'] = list(pac3.clu) pac4.head()| 0 | 19 | 2 | 19 | 0 | 0 | 0.5 | 23 | 0.92174 | 20 | 347 | 36 | 22 | 4 | 0.0 | 71 | 0 |
| 0 | 38 | 8 | 19 | 0 | 0 | 0.0 | 20 | 0.90256 | 38 | 684 | 37 | 15 | 0 | 0.0 | 312 | 0 |
| 0 | 15 | 3 | 11 | 0 | 0 | 0.0 | 10 | 0.92000 | 15 | 4226 | 15 | 0 | 8 | 0.0 | 95 | 1 |
| 0 | 0 | 0 | 3 | 0 | 0 | 0.0 | 2 | 0.85714 | 24 | 5792 | 4 | 0 | 0 | 0.0 | 21 | 4 |
| 0 | 0 | 0 | 2 | 0 | 0 | 0.0 | 11 | 0.73684 | 28 | 4549 | 16 | 8 | 0 | 0.0 | 154 | 1 |
| 0.022222 | 0.322222 | 0.655556 | 0.167691 | 0.858193 | 27.600000 | 10.666667 | 2.011111 | 166.711111 |
| 0.079646 | 0.274336 | 0.362832 | 0.095231 | 0.844027 | 20.159292 | 3.008850 | 1.469027 | 102.106195 |
| 0.073770 | 0.377049 | 0.336066 | 0.070628 | 0.849343 | 24.737705 | 4.286885 | 1.844262 | 121.909836 |
| 0.018349 | 0.229358 | 0.284404 | 0.098252 | 0.845981 | 24.119266 | 5.266055 | 1.733945 | 146.871560 |
| 0.203252 | 0.292683 | 0.243902 | 0.063686 | 0.775076 | 18.983740 | 2.130081 | 0.975610 | 84.032520 |
| -0.855590 | 0.468859 | 1.918400 | 1.862020 | 0.785882 | 1.422970 | 1.867773 | 1.118457 | 1.424282 |
| 0.002962 | -0.503392 | -0.094337 | -0.104961 | 0.315530 | -0.940402 | -0.688647 | -0.381093 | -0.746672 |
| -0.084884 | 1.582038 | -0.278379 | -0.772826 | 0.492038 | 0.513827 | -0.261998 | 0.656909 | -0.081200 |
| -0.913505 | -1.416613 | -0.633601 | -0.022944 | 0.380387 | 0.317394 | 0.064879 | 0.351742 | 0.757602 |
| 1.851016 | -0.130892 | -0.912083 | -0.961289 | -1.973837 | -1.313789 | -0.982007 | -1.746015 | -1.354012 |
不進行預處理的降維
dfp = acc.iloc[:,3:(len(acc.columns)-1)] dfp.index=acc.user_id dfp.head()| 0 | 0 | 0 | 0 | 10 | 1 | 233 | 58.25 | 288 | 230 | ... | 19 | 20 | 180543 | 347 | 36 | 22 | 4 | 0 | 0.0 | 71 |
| 0 | 0 | 0 | 0 | 9 | 0 | 0 | 0.00 | 325 | 195 | ... | 19 | 38 | 142214 | 684 | 37 | 15 | 0 | 0 | 0.0 | 312 |
| 0 | 0 | 0 | 0 | 7 | 0 | 58 | 7.25 | 150 | 100 | ... | 15 | 15 | 54602 | 4226 | 15 | 0 | 8 | 0 | 0.0 | 95 |
| 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.00 | 40 | 14 | ... | 24 | 24 | 39891 | 5792 | 4 | 0 | 0 | 0 | 0.0 | 21 |
| 0 | 0 | 0 | 0 | 6 | 0 | 0 | 0.00 | 102 | 95 | ... | 15 | 28 | 50706 | 4549 | 16 | 8 | 0 | 0 | 0.0 | 154 |
5 rows × 54 columns
from sklearn.decomposition import PCApca = PCA(whiten=False) pca.fit(dfp) PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,svd_solver='auto', tol=0.0, whiten=False) retio = pca.explained_variance_ratio_ # print(retio) # print(pca.singular_values_) rec = retio.cumsum() print(rec) x = np.arange(len(rec)) plt.plot(rec, color='r') [0.9996008 0.99995245 0.99997489 0.99999016 0.9999933 0.999995640.99999759 0.99999838 0.99999897 0.9999995 0.99999962 0.999999720.99999979 0.99999986 0.9999999 0.99999993 0.99999996 0.999999970.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.999999990.99999999 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1.1. 1. 1. 1. 1. 1. ][<matplotlib.lines.Line2D at 0x1c21f406780>] pca.set_params(n_components=10) pacsse = pd.DataFrame(pca.fit_transform(dfp)) pacsse.head()| 94938.293061 | -342.891655 | -161.442878 | -199.616210 | 1.830692 | 73.107938 | 153.124982 | 124.440657 | -34.371612 | 46.548951 |
| 56613.313155 | -960.580156 | -38.560364 | -45.836571 | 13.670166 | 90.767620 | -145.846645 | -40.255134 | 10.508203 | 16.287863 |
| -31060.195159 | 388.005529 | -6.932692 | -0.948812 | -5.332728 | 18.237293 | 11.393467 | 14.689011 | -7.994909 | 32.398532 |
| -45806.252443 | 1579.357883 | -81.812845 | -96.488345 | -18.477649 | -90.059217 | 31.377291 | -22.865193 | -19.724837 | 16.293640 |
| -34963.135693 | 611.858506 | -18.187490 | -16.454233 | -5.597209 | -9.722257 | -63.112236 | -3.943266 | 7.222725 | -10.889839 |
手肘法獲取最優 K 值
from sklearn.cluster import KMeansdf_features = pacsse # 讀入數(shù)據(jù) # '利用SSE選擇k' SSE = [] # 存放每次結(jié)果的誤差平方和 for k in range(1,9):estimator = KMeans(n_clusters=k) # 構(gòu)造聚類器estimator.fit(df_features)SSE.append(estimator.inertia_) X = range(1,9) plt.xlabel('k') plt.ylabel('SSE') plt.plot(X,SSE,'o-') [<matplotlib.lines.Line2D at 0x1c2211cac50>]顯然,先標(biāo)準(zhǔn)化數(shù)據(jù)是不合適的
# 顯然,先標(biāo)準(zhǔn)化數(shù)據(jù)是不合適的df_features = pd.DataFrame(scale(pacsse)) SSE = [] for k in range(1,9):estimator = KMeans(n_clusters=k) estimator.fit(df_features)SSE.append(estimator.inertia_) X = range(1,9) plt.xlabel('k') plt.ylabel('SSE') plt.plot(X,SSE,'o-') [<matplotlib.lines.Line2D at 0x1c2213bc438>] km = KMeans(n_clusters=4) km.fit(pacsse) clu = km.labels_ pacsse['clu'] = clu pacsse.head()| 94938.293061 | -342.891655 | -161.442878 | -199.616210 | 1.830692 | 73.107938 | 153.124982 | 124.440657 | -34.371612 | 46.548951 | 2 |
| 56613.313155 | -960.580156 | -38.560364 | -45.836571 | 13.670166 | 90.767620 | -145.846645 | -40.255134 | 10.508203 | 16.287863 | 0 |
| -31060.195159 | 388.005529 | -6.932692 | -0.948812 | -5.332728 | 18.237293 | 11.393467 | 14.689011 | -7.994909 | 32.398532 | 1 |
| -45806.252443 | 1579.357883 | -81.812845 | -96.488345 | -18.477649 | -90.059217 | 31.377291 | -22.865193 | -19.724837 | 16.293640 | 1 |
| -34963.135693 | 611.858506 | -18.187490 | -16.454233 | -5.597209 | -9.722257 | -63.112236 | -3.943266 | 7.222725 | -10.889839 | 1 |
顯然,不進行預處理的數據聚類是有問題的,第一主成分和第二主成分 顯然是相關的
pac4 = pac2.copy() pac4['cluster'] = list(pacsse.clu) pac4.head()clu5 = pac4.groupby('cluster').mean() clu5.drop(columns='A53',inplace=True) c5cor = clu5.corr() plt.figure(figsize=(15,8)) sns.heatmap(c5cor,annot=True) <matplotlib.axes._subplots.AxesSubplot at 0x1c22145a4e0> ccrp = pd.DataFrame(np.tril(c5cor,-1)) ccrp.columns = clu5.columns cccc = clu5.loc[:,(ccrp.abs()<.95).all()] cccc| 3.398693 | 0.228758 | 1.810458 | 146.287582 |
| 1.938953 | 0.316860 | 1.433140 | 101.531977 |
| 4.592593 | 0.407407 | 1.870370 | 169.777778 |
| 2.166667 | 0.166667 | 1.666667 | 213.833333 |
| 0.352533 | -0.562784 | 0.684599 | -0.285229 |
| -1.021705 | 0.406288 | -1.555764 | -1.388557 |
| 1.476502 | 1.402249 | 1.040338 | 0.293858 |
| -0.807330 | -1.245753 | -0.169173 | 1.379928 |
轉載于:https://www.cnblogs.com/cvlas/p/9537532.html
總結
- 上一篇: ECstore报表不显示解决
- 下一篇: 自我介绍的四个套路