风控-数据分析
數據總體了解:
讀取數據集并了解數據集大小,原始特征維度;
通過info熟悉數據類型;
粗略查看數據集中各特征基本統計量;
缺失值和唯一值:
查看數據缺失值情況
查看唯一值特征情況
深入數據-查看數據類型
類別型數據
數值型數據
離散數值型數據
連續數值型數據
數據間相關關系
特征和特征之間關系
特征和目標變量之間關系
用pandas_profiling生成數據報告
# 1. Read files
# Standard data-analysis stack used throughout this credit-risk EDA.
# (The original paste flattened these onto one line, which is not valid Python.)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

# Silence library deprecation chatter so notebook output stays readable.
warnings.filterwarnings('ignore')
# 2. Overall information
# Load the train / test splits; paths are relative to the notebook's cwd.
data_train = pd.read_csv("./train.csv")
data_test_a = pd.read_csv('./testA.csv')
# Inspect sample counts and raw feature dimensionality.
data_test_a.shape    # -> (200000, 48)
data_train.shape     # -> (800000, 47)

# Raw feature names: loan terms, FICO ranges, income, plus anonymous n0-n14.
data_train.columns
# Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
#        'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
#        'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
#        'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
#        'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
#        'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
#        'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
#        'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
#        'n9', 'n10', 'n11', 'n12', 'n13', 'n14'], dtype='object')

# Dtypes and non-null counts: 33 float64, 9 int64, 5 object columns.
# The anonymous 'n*' features carry ~4-9% missing values; 'employmentLength'
# (object) is also ~6% missing.
data_train.info()

# Rough summary statistics (count/mean/std/min/quartiles/max) of the
# 42 numeric columns.
data_train.describe()

# Peek at the first and last three rows to sanity-check the content.
# FIX: DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement.
pd.concat([data_train.head(3), data_train.tail(3)])
# Inspect missing values across the training set.
print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing values.')
# -> 22 columns contain at least one missing value.

# Of those, check whether any column is more than 50% missing.
# (Dict comprehension replaces the original one-line for/if loop.)
have_null_feature_dict = (data_train.isnull().sum() / len(data_train)).to_dict()
fea_null_moreThanHalf = {key: value for key, value in have_null_feature_dict.items() if value > 0.5}
# fea_null_moreThanHalf == {} : no column is missing more than half its values.
# Worst offenders (from have_null_feature_dict): 'n11' ~8.7%,
# 'employmentLength' ~5.8%, most 'n*' columns ~5%.

# Visualize the per-column missing rate as a sorted bar chart.
missing = data_train.isnull().sum() / len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()

# Column-wise view: if a column is mostly NaN it contributes almost nothing
# to the label and can be considered for removal; small gaps are usually
# better filled by imputation.
- 另外可以橫向比較,如果在數據集中,某些樣本數據的大部分列都是缺失的且樣本足夠的情況下可以考慮刪除。
Tips: 比賽大殺器lgb模型可以自動處理缺失值,Task4會具體學習模型,到時再深入了解哦!
# Find features with a single unique value (zero variance) in train and test.
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]
one_value_fea_test = [col for col in data_test_a.columns if data_test_a[col].nunique() <= 1]
# Both return ['policyCode'].
# Summary: 22 of the 47 columns contain missing data, which is normal for
# real-world data. 'policyCode' has a single unique value (or is entirely
# missing) and is therefore useless as a predictor. The dataset mixes many
# continuous variables with a handful of categorical ones.
3.數值類型特征,對象類型特征
- 特征一般都是由類別型特征和數值型特征組成,而數值型特征又分為連續型和離散型。
- 類別型特征有時具有非數值關系,有時也具有數值關系。比如‘grade’中的等級A,B,C等,是否只是單純的分類,還是A優于其他要結合業務判斷
- 數值型特征本是可以直接入模的,但往往風控人員要對其做分箱,轉化為WOE編碼進而做標準評分卡等操作。從模型效果上來看,特征分箱主要是為了降低變量的復雜性,減少變量噪音對模型的影響,提高自變量和因變量的相關度。從而使模型更加穩定。
3.1數值型變量分析,數值型肯定是包括連續型變量和離散型變量的,找出來
- 劃分數值型變量中的連續變量和離散型變量
數值離散型變量分析
# Discrete numeric variables: inspect their cardinality with value_counts().
data_train['term'].value_counts()                # 2 levels: 3 (606902) / 5 (193098)
data_train['homeOwnership'].value_counts()       # 6 levels, heavily skewed toward 0/1/2
data_train['verificationStatus'].value_counts()  # 3 roughly balanced levels
data_train['initialListStatus'].value_counts()   # 2 levels: 0 (466438) / 1 (333562)
data_train['applicationType'].value_counts()     # 2 levels; ~98% are 0
data_train['policyCode'].value_counts()          # single value (1.0) for all rows -> useless
data_train['n11'].value_counts()                 # extremely imbalanced (729682 zeros); usefulness TBD
data_train['n12'].value_counts()                 # extremely imbalanced (757315 zeros); usefulness TBD
# Next: analysis of the continuous numeric variables.
# Visualize the distribution of every continuous numeric feature.
# FIX: sns.distplot() is deprecated (scheduled for removal in seaborn 0.14);
# sns.histplot(..., kde=True) is the supported equivalent.
# NOTE(review): num_seralFea is defined in an earlier (not shown) cell that
# splits numeric columns into serial/continuous vs discrete -- verify.
f = pd.melt(data_train, value_vars=num_seralFea)
g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.histplot, "value", kde=True)
# Check whether each variable looks normally distributed; if not, a log
# transform may bring it closer to normal -- re-plot after transforming.
- 如果想統一處理一批數據變標準化 必須把這些之前已經正態化的數據提出
- 正態化的原因:一些情況下正態非正態可以讓模型更快的收斂,一些模型要求數據正態(eg. GMM、KNN),保證數據不要過偏態即可,過于偏態可能會影響模型預測結果。
- 非數值類別型變量分析
總結:
- 上面我們用value_counts()等函數看了特征屬性的分布,但是圖表是概括原始信息最便捷的方式。
- 數無形時少直覺
- 同一份數據集,在不同的尺度刻畫上顯示出來的圖形反映的規律是不一樣的。python將數據轉化成圖表,但結論是否正確需要由你保證。
3.2變量分布可視化
單一變量分布可視化
# Single-variable distribution: top-20 employmentLength values (NaN included).
# FIX: passing data positionally to sns.barplot is deprecated since
# seaborn 0.12 -- use explicit x=/y= keyword arguments.
plt.figure(figsize=(8, 8))
emp_len_counts = data_train['employmentLength'].value_counts(dropna=False)[:20]
sns.barplot(x=emp_len_counts, y=emp_len_counts.keys())
plt.show()
# Next: visualize how a feature x is distributed for different label values y.
- 首先查看類別型變量在不同y值上的分布
- 其次查看連續型變量在不同y值上的分布
3.3時間格式數據處理及查看
# 3.3 Time-format features.
# issueDateDT = number of days between issueDate and the earliest date in
# the dataset (2007-06-01).
data_train['issueDate'] = pd.to_datetime(data_train['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data_train['issueDateDT'] = data_train['issueDate'].apply(lambda x: x - startdate).dt.days

# BUG FIX: the original converted data_train['issueDate'] a second time and
# assigned the result into data_test_a['issueDate'], silently misaligning /
# corrupting the test-set dates. Convert the test set's own column instead.
data_test_a['issueDate'] = pd.to_datetime(data_test_a['issueDate'], format='%Y-%m-%d')
data_test_a['issueDateDT'] = data_test_a['issueDate'].apply(lambda x: x - startdate).dt.days

plt.hist(data_train['issueDateDT'], label='train')
plt.hist(data_test_a['issueDateDT'], label='test')
plt.legend()
plt.title('Distribution of issueDateDT dates')
# The train and test issueDateDT ranges overlap heavily, so a purely
# time-based train/validation split would not be a wise choice here.
# 3.4 Pivot tables help us understand the data better.
# Pivot table: 'index' may list several keys, 'columns' is optional, and the
# aggregation aggfunc is applied to the columns listed in 'values'.
# Here: total loanAmnt per grade (rows) per issueDateDT (columns).
# FIX: passing np.sum as aggfunc triggers a FutureWarning in pandas 2.x;
# the string alias 'sum' is the supported spelling.
pivot = pd.pivot_table(
    data_train,
    index=['grade'],
    columns=['issueDateDT'],
    values=['loanAmnt'],
    aggfunc='sum',
)
pivot
# -> 7 grade rows x 139 date columns of summed loan amounts, NaN where a
#    grade issued no loans on that day; totals grow strongly over time.
3.5用pandas_profiling生成數據報告
# 3.5 Generate an automated EDA report with pandas_profiling.
# NOTE(review): the package has since been renamed to ydata-profiling; this
# import matches the name the article uses -- update if the new package is
# installed instead.
import pandas_profiling

pfr = pandas_profiling.ProfileReport(data_train)
pfr.to_file("./example.html")
# Summary follows.
- 上一篇: 打造你的专属埃菲尔塔
- 下一篇: 金融风控-- >客户流失预警模型-- >