當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

逻辑回归预测事件发生的概率_通过逻辑回归，对信用卡申请数据使用卡方分箱法预测违约率建模...

發布時間：2024/9/27 编程问答 28 豆豆

生活随笔收集整理的這篇文章主要介紹了逻辑回归预测事件发生的概率_通过逻辑回归，对信用卡申请数据使用卡方分箱法预测违约率建模... 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

一、建模步驟

(1)從數據中衍生特征

(2)對類別型變量和數值型變量進行補缺

(3)基于卡方分箱法對變量進行分箱

(4)WOE編碼后的單變量分析與多變量分析

(5)應用邏輯回歸模型

(6)尺度化

(7)模型預測能力

二、代碼

# -*- coding: utf-8 -*-
"""Credit-card application scorecard built with ChiMerge binning and logistic regression.

Pipeline (run top to bottom as a script):
    Step 0  read the three raw PPD training CSV files
    Step 1  derive time-window features from the login and user-update logs
    Step 2  drop constant / dominated / mostly-missing columns and impute the rest
    Step 3  bin every variable (bad-rate encoding + ChiMerge) and compute WOE / IV
    Step 4  WOE-encode, then filter by IV, pairwise correlation and VIF
    Step 5  fit the logistic regression with forward selection
    Step 6  convert predicted probabilities to scores
    Step 7  KS / AUC evaluation

The binning helpers (ChiMerge, AssignBin, CalcWOE, BadRateEncoding, MergeBad0,
MaximumBinPcnt, BadRateMonotone, Monotone_Merge, Prob2Score, KS, ROC_AUC) come
from the project-local module ``scorecard_functions``.
"""
import collections
import datetime
import itertools
import numbers
import pickle
import random
import sys
import warnings
from importlib import reload
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

# If a matplotlib backend problem occurs, uncomment:
# import matplotlib
# matplotlib.use('TkAgg')
# In Jupyter, enable inline plots with:  %matplotlib inline

from scorecard_functions import *


#######################################
# UDF: user-defined helper functions  #
#######################################
def TimeWindowSelection(df, daysCol, time_windows):
    """Count, for each time window, the rows whose day gap is within the window.

    :param df: the dataset containing the variable of days
    :param daysCol: the column of days
    :param time_windows: iterable of window lengths (in days)
    :return: dict mapping each window length to the number of rows with
        ``df[daysCol] <= window`` (a cumulative frequency)
    """
    return {tw: int((df[daysCol] <= tw).sum()) for tw in time_windows}


def DeivdedByZero(nominator, denominator):
    """Return ``nominator / denominator``, or 0 when the denominator is 0."""
    if denominator == 0:
        return 0
    return nominator * 1.0 / denominator


def ChangeContent(x):
    """Normalise an update-item name: upper-case it and map '_MOBILEPHONE' to '_PHONE'."""
    y = x.upper()
    if y == '_MOBILEPHONE':
        y = '_PHONE'
    return y


def MissingCategorial(df, x):
    """Return the fraction of missing values in column ``x`` of ``df``.

    :param df: the data frame
    :param x: the column name
    :return: share of rows whose value is missing
    """
    return df[x].isna().mean()


def MissingContinuous(df, x):
    """Return the fraction of missing (NaN) values in numeric column ``x`` of ``df``."""
    return df[x].isna().mean()


def MakeupRandom(x, sampledList):
    """Return ``x`` unchanged unless it is NaN, in which case draw a random element of ``sampledList``."""
    if x == x:
        return x
    return sampledList[random.randint(0, len(sampledList) - 1)]


#############################################################
# Step 0: initial work - read the data files                #
#############################################################
# Forward slashes work on Windows too; the original '\/' sequences were
# invalid string escapes.
folderOfData = 'F:/chen/download/creditcard/Chimerge/cyc/'
data1 = pd.read_csv(folderOfData + 'PPD_LogInfo_3_1_Training_Set.csv', header=0)
data2 = pd.read_csv(folderOfData + 'PPD_Training_Master_GBK_3_1_Training_Set.csv',
                    dtype={'target': np.int64}, header=0, encoding='gbk')
data3 = pd.read_csv(folderOfData + 'PPD_Userupdate_Info_3_1_Training_Set.csv', header=0)

#############################################################################################
# Step 1: derive features from PPD_LogInfo_3_1_Training_Set and
#         PPD_Userupdate_Info_3_1_Training_Set
#############################################################################################
# Flag whether the four city variables all match, then drop the raw columns.
data2['city_match'] = data2.apply(
    lambda x: int(x.UserInfo_2 == x.UserInfo_4 == x.UserInfo_8 == x.UserInfo_20), axis=1)
for city_col in ['UserInfo_2', 'UserInfo_4', 'UserInfo_8', 'UserInfo_20']:
    del data2[city_col]

# Parse the login / listing dates, compute the gap in days and inspect its distribution.
data1['logInfo'] = data1['LogInfo3'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
data1['Listinginfo'] = data1['Listinginfo1'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
data1['ListingGap'] = (data1['Listinginfo'] - data1['logInfo']).dt.days
plt.hist(data1['ListingGap'], bins=200)
plt.title('Days between login date and listing date')
ListingGap2 = data1['ListingGap'].map(lambda x: min(x, 365))
plt.hist(ListingGap2, bins=200)
plt.show()
timeWindows = TimeWindowSelection(data1, 'ListingGap', range(30, 361, 30))

# Use 180 days as the largest window.  For each window of 7/30/60/90/120/150/180
# days compute the total number of logins, the number of distinct login types,
# and the average count per login type.
time_window = [7, 30, 60, 90, 120, 150, 180]
var_list = ['LogInfo1', 'LogInfo2']
data1GroupbyIdx = pd.DataFrame({'Idx': data1['Idx'].drop_duplicates()})

for tw in time_window:
    data1['TruncatedLogInfo'] = data1['Listinginfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data1.loc[data1['logInfo'] >= data1['TruncatedLogInfo']]
    for var in var_list:
        count_col = str(var) + '_' + str(tw) + '_count'
        uniq_col = str(var) + '_' + str(tw) + '_unique'
        avg_col = str(var) + '_' + str(tw) + '_avg_count'

        # frequency of LogInfo1 / LogInfo2 inside the window
        count_stats = temp.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[count_col] = data1GroupbyIdx['Idx'].map(lambda x: count_stats.get(x, 0))

        # number of distinct values of LogInfo1 / LogInfo2 inside the window
        Idx_UserupdateInfo1 = temp[['Idx', var]].drop_duplicates()
        uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])[var].count().to_dict()
        data1GroupbyIdx[uniq_col] = data1GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

        # Average count per distinct value.  An Idx with no record inside the
        # window has 0 / 0 here, hence DeivdedByZero.
        data1GroupbyIdx[avg_col] = data1GroupbyIdx[[count_col, uniq_col]].apply(
            lambda x: DeivdedByZero(x[count_col], x[uniq_col]), axis=1)

data3['ListingInfo'] = data3['ListingInfo1'].map(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))
data3['UserupdateInfo'] = data3['UserupdateInfo2'].map(lambda x: datetime.datetime.strptime(x, '%Y/%m/%d'))
data3['ListingGap'] = (data3['ListingInfo'] - data3['UserupdateInfo']).dt.days
gap_counts = collections.Counter(data3['ListingGap'])
hist_ListingGap = np.histogram(data3['ListingGap'])
hist_ListingGap = pd.DataFrame({'Freq': hist_ListingGap[0], 'gap': hist_ListingGap[1][1:]})
hist_ListingGap['CumFreq'] = hist_ListingGap['Freq'].cumsum()
hist_ListingGap['CumPercent'] = hist_ListingGap['CumFreq'].map(
    lambda x: x * 1.0 / hist_ListingGap.iloc[-1]['CumFreq'])

# Unify different spellings (QQ vs qQ, Idnumber vs idNumber, MOBILEPHONE vs PHONE).
# Within each time slice compute
#   (1) the update frequency,
#   (2) the number of distinct updated item types,
#   (3) whether important items (IDNUMBER, HASBUYCAR, MARRIAGESTATUSID, PHONE)
#       were changed.
# An Idx without any update inside the window gets 0 for all of these; the
# earlier version fell back to the Idx value itself, which produced huge
# bogus feature values.
data3['UserupdateInfo1'] = data3['UserupdateInfo1'].map(ChangeContent)
data3GroupbyIdx = pd.DataFrame({'Idx': data3['Idx'].drop_duplicates()})

time_window = [7, 30, 60, 90, 120, 150, 180]
for tw in time_window:
    freq_col = 'UserupdateInfo_' + str(tw) + '_freq'
    uniq_col = 'UserupdateInfo_' + str(tw) + '_unique'
    avg_col = 'UserupdateInfo_' + str(tw) + '_avg_count'

    data3['TruncatedLogInfo'] = data3['ListingInfo'].map(lambda x: x + datetime.timedelta(-tw))
    temp = data3.loc[data3['UserupdateInfo'] >= data3['TruncatedLogInfo']]

    # frequency of updating
    freq_stats = temp.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx[freq_col] = data3GroupbyIdx['Idx'].map(lambda x: freq_stats.get(x, 0))

    # number of updated types
    Idx_UserupdateInfo1 = temp[['Idx', 'UserupdateInfo1']].drop_duplicates()
    uniq_stats = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].count().to_dict()
    data3GroupbyIdx[uniq_col] = data3GroupbyIdx['Idx'].map(lambda x: uniq_stats.get(x, 0))

    # average count of each type (0 when nothing was updated in the window)
    data3GroupbyIdx[avg_col] = data3GroupbyIdx[[freq_col, uniq_col]].apply(
        lambda x: DeivdedByZero(x[freq_col], x[uniq_col]), axis=1)

    # Whether the applicant changed items like IDNUMBER, HASBUYCAR,
    # MARRIAGESTATUSID, PHONE: collect the set of updated items per Idx.
    Idx_UserupdateInfo1_V2 = Idx_UserupdateInfo1.groupby(['Idx'])['UserupdateInfo1'].agg(set)
    for item in ['_IDNUMBER', '_HASBUYCAR', '_MARRIAGESTATUSID', '_PHONE']:
        item_dict = Idx_UserupdateInfo1_V2.map(lambda x: int(item in x)).to_dict()
        data3GroupbyIdx['UserupdateInfo_' + str(tw) + str(item)] = \
            data3GroupbyIdx['Idx'].map(lambda x: item_dict.get(x, 0))

# Combine the above features with raw features in PPD_Training_Master_GBK_3_1_Training_Set
allData = pd.concat([data2.set_index('Idx'),
                     data3GroupbyIdx.set_index('Idx'),
                     data1GroupbyIdx.set_index('Idx')], axis=1)
allData.to_csv(folderOfData + 'allData_0.csv', encoding='gbk')

#######################################################################
# Step 2: fill missing values of categorical and numerical variables  #
#######################################################################
allData = pd.read_csv(folderOfData + 'allData_0.csv', header=0, encoding='gbk')
allFeatures = list(allData.columns)
allFeatures.remove('target')
if 'Idx' in allFeatures:
    allFeatures.remove('Idx')
allFeatures.remove('ListingInfo')

# Drop constant columns and split the rest into numerical / categorical.
# A numeric column with very few distinct values is treated as categorical,
# since such values often encode categories.
numerical_var = []
for col in list(allFeatures):  # iterate over a copy: allFeatures is edited below
    # nunique(dropna=False) also catches an all-NaN column, which would
    # otherwise crash the value-count step further down.
    if allData[col].nunique(dropna=False) <= 1:
        print('delete {} from the dataset because it is a constant'.format(col))
        del allData[col]
        allFeatures.remove(col)
        continue
    # distinct non-missing values (NaN != NaN)
    uniq_valid_vals = list(set(i for i in allData[col] if i == i))
    if len(uniq_valid_vals) >= 10 and isinstance(uniq_valid_vals[0], numbers.Real):
        numerical_var.append(col)

categorical_var = [i for i in allFeatures if i not in numerical_var]

# For every variable record the share of its most frequent value and that value:
#   col_most_values = {'UserInfo_19': 0.909, ...}
#   col_large_value = {'UserInfo_19': '廣東省', ...}
records_count = allData.shape[0]
col_most_values, col_large_value = {}, {}
for col in allFeatures:
    # the index of value_count holds the distinct values of the column
    value_count = allData[col].groupby(allData[col]).count()
    col_most_values[col] = value_count.max() / records_count
    col_large_value[col] = value_count.idxmax()

col_most_values_df = pd.DataFrame.from_dict(col_most_values, orient='index')
col_most_values_df.columns = ['max percent']
col_most_values_df = col_most_values_df.sort_values(by='max percent', ascending=False)
top_pcnt = list(col_most_values_df[:500]['max percent'])
plt.bar(range(len(top_pcnt)), height=top_pcnt)
plt.title('Largest Percentage of Single Value in Each Variable')
plt.show()

# For columns whose majority value covers at least 90% of rows, compare the
# bad rate of the minority values against that of the majority value.
large_percent_cols = list(col_most_values_df[col_most_values_df['max percent'] >= 0.9].index)
bad_rate_diff = {}
for col in large_percent_cols:
    large_value = col_large_value[col]
    temp = allData[[col, 'target']].copy()  # copy: avoid writing into a slice of allData
    temp[col] = (temp[col] == large_value).astype(int)
    bad_rate = temp.groupby(col).mean()  # row 0: minority values, row 1: majority value
    if bad_rate.iloc[0]['target'] == 0:
        bad_rate_diff[col] = 0
        continue
    bad_rate_diff[col] = np.log(bad_rate.iloc[0]['target'] / bad_rate.iloc[1]['target'])

bad_rate_diff_sorted = sorted(bad_rate_diff.items(), key=lambda x: x[1], reverse=True)
bad_rate_diff_sorted_values = [x[1] for x in bad_rate_diff_sorted]
plt.bar(x=range(len(bad_rate_diff_sorted_values)), height=bad_rate_diff_sorted_values)
plt.title("log of bad rate ratio in large varaibles in which 90% is single value")
plt.show()

# The minority values do not show a significantly higher bad rate, so these
# dominated variables are dropped.
for col in large_percent_cols:
    if col in numerical_var:
        numerical_var.remove(col)
    else:
        categorical_var.remove(col)
    del allData[col]

# Categorical variables: delete when more than 80% is missing, otherwise treat
# "missing" as a special state.
missing_pcnt_threshould_1 = 0.8
for col in list(categorical_var):  # iterate over a copy: categorical_var is edited below
    missingRate = MissingCategorial(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_1:
        categorical_var.remove(col)
        del allData[col]
    elif missingRate > 0:
        uniq_valid_vals = list(set(i for i in allData[col] if i == i))
        if isinstance(uniq_valid_vals[0], numbers.Real):
            # numeric-coded category: mark missing with the special value -1
            missing_position = allData.loc[allData[col] != allData[col]][col].index
            allData.loc[missing_position, col] = -1
        else:
            # In this way we convert NaN to NAN, which is a string instead of np.nan
            allData[col] = allData[col].map(lambda x: str(x).upper())

allData_bk = allData.copy()

# Numerical variables: delete when more than 80% is missing, otherwise fill the
# gaps with values drawn at random from the observed ones.
missing_pcnt_threshould_2 = 0.8
deleted_var = []
for col in numerical_var:
    missingRate = MissingContinuous(allData, col)
    print('{0} has missing rate as {1}'.format(col, missingRate))
    if missingRate > missing_pcnt_threshould_2:
        deleted_var.append(col)
        print('we delete variable {} because of its high missing rate'.format(col))
    elif missingRate > 0:
        not_missing = list(allData.loc[allData[col] == allData[col]][col])
        missing_position = allData.loc[allData[col] != allData[col]][col].index
        n_missing = len(missing_position)
        if n_missing <= len(not_missing):
            not_missing_sample = random.sample(not_missing, n_missing)
        else:
            # random.sample cannot draw more items than the population holds;
            # fall back to sampling with replacement.
            not_missing_sample = random.choices(not_missing, k=n_missing)
        allData.loc[missing_position, col] = not_missing_sample
        missingRate2 = MissingContinuous(allData, col)
        print('missing rate after making up is:{}'.format(str(missingRate2)))

for col in deleted_var:
    numerical_var.remove(col)
    del allData[col]

allData.to_csv(folderOfData + 'allData_1.csv', header=True, encoding='gbk',
               columns=allData.columns, index=False)
allData = pd.read_csv(folderOfData + 'allData_1.csv', header=0, encoding='gbk')

###################################################
# Step 3: bin the variables with ChiMerge         #
###################################################
# For each categorical variable with more than 5 distinct values we use
# bad-rate encoding followed by ChiMerge.
trainData = pd.read_csv(folderOfData + 'allData_1.csv', header=0, encoding='gbk',
                        dtype={'target': np.int64})
allFeatures = list(trainData.columns)
allFeatures.remove('ListingInfo')
allFeatures.remove('target')
# NOTE(review): 'Idx' is not removed here, so the record ID is binned and can
# be selected as a model feature - confirm whether that is intended.
print("開始基于卡方分箱法對變量進行分箱")

# split the features into numerical and categorical
numerical_var = []
for var in allFeatures:
    uniq_vals = list(set(trainData[var]))
    if np.nan in uniq_vals:
        uniq_vals.remove(np.nan)
    if len(uniq_vals) >= 10 and isinstance(uniq_vals[0], numbers.Real):
        numerical_var.append(var)

categorical_var = [i for i in allFeatures if i not in numerical_var]

for col in categorical_var:
    # for Chinese character, upper() is not valid
    if col not in ['UserInfo_7', 'UserInfo_9', 'UserInfo_19']:
        trainData[col] = trainData[col].map(lambda x: str(x).upper())

# Categorical variables are handled as follows:
#   1. more than 5 distinct values: encode with the bad rate (then bin as numeric);
#   2. otherwise, if some value contains only good or only bad samples, merge bins.
deleted_features = []   # processed raw variables, kept out of later modelling
encoded_features = {}   # bad-rate encodings, needed again for test / production data
merged_features = {}    # merge schemes of categorical variables
var_IV = {}             # IV value per binned feature
var_WOE = {}            # WOE mapping per binned feature

for col in list(categorical_var):  # iterate over a copy: categorical_var is edited below
    print("現在正在處理類別型變量。。處理的變量是：")
    print('we are processing {}'.format(col))
    if len(set(trainData[col])) > 5:
        print(" now the len this category is > 5 , 需要用bad rate做編碼轉換成數值型變量，再分箱")
        print('{} is encoded with bad rate'.format(col))
        col0 = str(col) + '_encoding'

        # (1) compute the bad rate per value and encode with it;
        #     'encoding' is the encoded column, 'bad_rate' maps value -> bad rate
        encoding_result = BadRateEncoding(trainData, col, 'target')
        trainData[col0], br_encoding = encoding_result['encoding'], encoding_result['bad_rate']

        # (2) the encoded variable joins the numerical list for ChiMerge below
        numerical_var.append(col0)

        # (3) keep the encoding scheme for scoring test data
        encoded_features[col] = [col0, br_encoding]

        # (4) mark the raw variable as processed
        deleted_features.append(col)
    else:
        print("類別型變量處理中。本變量類別數少于或等于5")
        bad_bin = trainData.groupby([col])['target'].sum()
        # a value with zero bad samples needs special handling
        if min(bad_bin) == 0:
            print("說明出現了 0 壞樣本的特征，需要處理")
            print('{} has 0 bad sample!'.format(col))
            col1 = str(col) + '_mergeByBadRate'

            # (1) find the merge that makes every bin contain both good and bad samples
            mergeBin = MergeBad0(trainData, col, 'target')

            # (2) apply that merge
            trainData[col1] = trainData[col].map(mergeBin)
            maxPcnt = MaximumBinPcnt(trainData, col1)

            # drop the variable when merging leaves one bin with more than 90% of rows
            if maxPcnt > 0.9:
                print("說明合并后導致有箱占比超過90%，需要刪除。為什么能夠直接刪除？不允許計算10%的顯著性？")
                print('{} is deleted because of large percentage of single bin'.format(col))
                deleted_features.append(col)
                categorical_var.remove(col)
                del trainData[col]
                continue

            # (3) otherwise keep the merged variable
            print("說明合并后導致有箱占比沒有超過90%，不需要刪除。正常操作即可")
            merged_features[col] = [col1, mergeBin]
            WOE_IV = CalcWOE(trainData, col1, 'target')
            var_WOE[col1] = WOE_IV['WOE']
            var_IV[col1] = WOE_IV['IV']
            deleted_features.append(col)
        else:
            print("說明沒有 0 壞樣本的特征，按照最正常處理方式即可")
            WOE_IV = CalcWOE(trainData, col, 'target')
            var_WOE[col] = WOE_IV['WOE']
            var_IV[col] = WOE_IV['IV']

print("處理類別型變量工作結束！開始處理數值型變量")

var_cutoff = {}
for col in list(numerical_var):  # iterate over a copy: numerical_var is edited below
    print("正在處理數值型變量，變量是：")
    print("{} is in processing".format(col))
    col1 = str(col) + '_Bin'

    # (1) ChiMerge binning; keep the cut-off points of every variable.
    #     The missing-value marker -1 is a special attribute and is not binned.
    if -1 in set(trainData[col]):
        special_attribute = [-1]
    else:
        special_attribute = []
    cutOffPoints = ChiMerge(trainData, col, 'target', special_attribute=special_attribute)
    var_cutoff[col] = cutOffPoints
    trainData[col1] = trainData[col].map(
        lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))

    # (2) check whether the bad rate is monotone
    BRM = BadRateMonotone(trainData, col1, 'target', special_attribute=special_attribute)
    if not BRM:
        if special_attribute == []:
            bin_merged = Monotone_Merge(trainData, 'target', col1)
            removed_index = []
            for merged_group in bin_merged:
                if len(merged_group) > 1:
                    indices = [int(b.replace('Bin ', '')) for b in merged_group]
                    # every bin of a merged group except the last loses its cut-off point
                    removed_index = removed_index + indices[0:-1]
            removed_point = [cutOffPoints[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints.remove(p)
            var_cutoff[col] = cutOffPoints
            trainData[col1] = trainData[col].map(
                lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        else:
            cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
            temp = trainData.loc[~trainData[col].isin(special_attribute)]
            bin_merged = Monotone_Merge(temp, 'target', col1)
            # NOTE(review): in the published code the accumulation
            # "removed_index = removed_index + indices[0:-1]" is commented out
            # in this branch, so no cut-off point is removed and the bins stay
            # non-monotone.  Behaviour is kept as published; confirm whether
            # the merge should be applied here as in the branch above.
            removed_index = []
            removed_point = [cutOffPoints2[k] for k in removed_index]
            for p in removed_point:
                cutOffPoints2.remove(p)
            cutOffPoints2 = cutOffPoints2 + special_attribute
            var_cutoff[col] = cutOffPoints2
            trainData[col1] = trainData[col].map(
                lambda x: AssignBin(x, cutOffPoints2, special_attribute=special_attribute))

    # (3) drop the variable when one bin holds more than 90% of the rows
    maxPcnt = MaximumBinPcnt(trainData, col1)
    if maxPcnt > 0.9:
        deleted_features.append(col)
        numerical_var.remove(col)
        print('we delete {} because the maximum bin occupies more than 90%'.format(col))
        continue

    WOE_IV = CalcWOE(trainData, col1, 'target')
    var_IV[col] = WOE_IV['IV']
    var_WOE[col] = WOE_IV['WOE']

print("數值型變量工作結束")
print("Step3 基于卡方分箱法對變量進行分箱, 工作完成后的結果")
print("var_WOE = ")
print(var_WOE)
print("var_IV = ")
print(var_IV)
print("merged_features = ")
print(merged_features)

trainData.to_csv(folderOfData + 'allData_2.csv', header=True, encoding='gbk',
                 columns=trainData.columns, index=False)
print("var_cutoff = ")
print(var_cutoff)

with open(folderOfData + 'var_WOE.pkl', "wb") as f:
    pickle.dump(var_WOE, f)
with open(folderOfData + 'var_IV.pkl', "wb") as f:
    pickle.dump(var_IV, f)
with open(folderOfData + 'var_cutoff.pkl', "wb") as f:
    pickle.dump(var_cutoff, f)
with open(folderOfData + 'merged_features.pkl', "wb") as f:
    pickle.dump(merged_features, f)

#########################################################################
# Step 4: single- and multi-variable analysis after WOE encoding        #
#########################################################################
trainData = pd.read_csv(folderOfData + 'allData_2.csv', header=0, encoding='gbk')

# These pickles were written by Step 3 of this very script.
with open(folderOfData + 'var_WOE.pkl', "rb") as f:
    var_WOE = pickle.load(f)
with open(folderOfData + 'var_IV.pkl', "rb") as f:
    var_IV = pickle.load(f)
with open(folderOfData + 'var_cutoff.pkl', "rb") as f:
    var_cutoff = pickle.load(f)
with open(folderOfData + 'merged_features.pkl', "rb") as f:
    merged_features = pickle.load(f)

# Fields that look numeric but are really categorical are converted to strings
# so that their values match the keys of the WOE mapping.
num2str = ['SocialNetwork_13', 'SocialNetwork_12', 'UserInfo_6', 'UserInfo_5', 'UserInfo_10',
           'UserInfo_11', 'UserInfo_12', 'UserInfo_13', 'UserInfo_17']
for col in num2str:
    trainData[col] = trainData[col].map(lambda x: str(x))

for col in var_WOE.keys():
    print(col)
    col2 = str(col) + "_WOE"
    if col in var_cutoff.keys():
        # numerical variable: assign the ChiMerge bin first, then look up its WOE
        cutOffPoints = var_cutoff[col]
        special_attribute = []
        if -1 in cutOffPoints:
            special_attribute = [-1]
        binValue = trainData[col].map(
            lambda x: AssignBin(x, cutOffPoints, special_attribute=special_attribute))
        trainData[col2] = binValue.map(lambda x: var_WOE[col][x])
    else:
        trainData[col2] = trainData[col].map(lambda x: var_WOE[col][x])

trainData.to_csv(folderOfData + 'allData_3.csv', header=True, encoding='gbk',
                 columns=trainData.columns, index=False)

### (i) keep the variables whose IV exceeds the threshold
trainData = pd.read_csv(folderOfData + 'allData_3.csv', header=0, encoding='gbk')
all_IV = sorted(var_IV.values(), reverse=True)
plt.bar(x=range(len(all_IV)), height=all_IV)
plt.title("IV sorted")
plt.show()
iv_threshould = 0.02
varByIV = [k for k, v in var_IV.items() if v > iv_threshould]

### (ii) check the pairwise linear correlation of the WOE-encoded variables
var_IV_selected = {k: var_IV[k] for k in varByIV}
var_IV_sorted = sorted(var_IV_selected.items(), key=lambda d: d[1], reverse=True)
var_IV_sorted = [i[0] for i in var_IV_sorted]
removed_var = []
roh_thresould = 0.6
for i in range(len(var_IV_sorted) - 1):
    if var_IV_sorted[i] in removed_var:
        continue
    x1 = var_IV_sorted[i] + "_WOE"
    for j in range(i + 1, len(var_IV_sorted)):
        if var_IV_sorted[j] in removed_var:
            continue
        x2 = var_IV_sorted[j] + "_WOE"
        roh = np.corrcoef([trainData[x1], trainData[x2]])[0, 1]
        if abs(roh) >= roh_thresould:
            print('the correlation coeffient between {0} and {1} is {2}'.format(x1, x2, str(roh)))
            if var_IV[var_IV_sorted[i]] > var_IV[var_IV_sorted[j]]:
                removed_var.append(var_IV_sorted[j])
            else:
                removed_var.append(var_IV_sorted[i])
                # variable i is gone: stop comparing the remaining ones against it
                break
var_IV_sortet_2 = [i for i in var_IV_sorted if i not in removed_var]

### (iii) check whether any variable has VIF > 10 against all the others
for i in range(len(var_IV_sortet_2)):
    x0 = np.array(trainData[var_IV_sortet_2[i] + '_WOE'])
    X_Col = [k + '_WOE' for k in var_IV_sortet_2 if k != var_IV_sortet_2[i]]
    # plain ndarray: scikit-learn does not accept np.matrix input
    X = trainData[X_Col].values
    regr = LinearRegression()
    clr = regr.fit(X, x0)
    x_pred = clr.predict(X)
    R2 = 1 - ((x_pred - x0) ** 2).sum() / ((x0 - x0.mean()) ** 2).sum()
    vif = 1 / (1 - R2)
    if vif > 10:
        print("Warning: the vif for {0} is {1}".format(var_IV_sortet_2[i], vif))

#########################################
# Step 5: apply logistic regression     #
#########################################
# var_IV_sortet_2 holds the variable names
multi_analysis = [i + '_WOE' for i in var_IV_sortet_2]
y = trainData['target']
X = trainData[multi_analysis].copy()
X['intercept'] = 1
LR = sm.Logit(y, X).fit()
summary = LR.summary2()
pvals = LR.pvalues.to_dict()
params = LR.params.to_dict()

# Some variables are not significant: test each of them alone.
# 'intercept' is excluded because it is not a column of trainData.
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1 and k != 'intercept'}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
varLargeP = [i[0] for i in varLargeP]
p_value_list = {}
for var in varLargeP:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    p_value_list[var] = LR.pvalues[var]
for k, v in p_value_list.items():
    print("{0} has p-value of {1} in univariate regression".format(k, v))

# Some coefficients are positive: check the sign of each in a univariate fit.
varPositive = [k for k, v in params.items() if v >= 0 and k != 'intercept']
coef_list = {}
for var in varPositive:
    X_temp = trainData[var].copy().to_frame()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    coef_list[var] = LR.params[var]
for k, v in coef_list.items():
    print("{0} has coefficient of {1} in univariate regression".format(k, v))

# Forward selection: add a variable only if, after adding it, every p-value
# stays below 0.1 and every non-intercept coefficient is negative.
selected_var = [multi_analysis[0]]
for var in multi_analysis[1:]:
    try_vars = selected_var + [var]
    X_temp = trainData[try_vars].copy()
    X_temp['intercept'] = 1
    LR = sm.Logit(y, X_temp).fit()
    pvals = LR.pvalues
    params = LR.params.drop('intercept')
    if pvals.max() < 0.1 and params.max() < 0:
        selected_var.append(var)

# Refit on the accepted variables only; the last model fitted inside the loop
# may contain a candidate that was rejected.
X_temp = trainData[selected_var].copy()
X_temp['intercept'] = 1
LR = sm.Logit(y, X_temp).fit()
print(LR.summary2())

y_pred = LR.predict(X_temp)
y_result = pd.DataFrame({'y_pred': y_pred, 'y_real': list(trainData['target'])})
roc_auc_score_result = roc_auc_score(trainData['target'], y_pred)
print("roc_auc_score_result = ", roc_auc_score_result)

################################
# Step 6: scaling to scores    #
################################
scores = Prob2Score(y_pred, 200, 100)
plt.title("Score Distribution")
plt.xlabel("Score")
plt.ylabel("Quantity")
plt.hist(scores, bins=100)
plt.show()
print("完成分數轉換！ ")
print("Step 6 結束！")

#######################################
# Step 7: evaluation (KS / ROC AUC)   #
#######################################
print("開始 step 7")
warnings.filterwarnings('ignore')


def proba_conversion(x, threshold3=0.3):
    """Return 1 when ``x`` is at least ``threshold3``, otherwise 0."""
    return 1 if x >= threshold3 else 0


scores = Prob2Score(y_pred, 200, 100)
trainData['predication'] = scores
scorecard = pd.DataFrame({'y_pred': y_pred, 'y_real': list(trainData['target']), 'score': scores})
ks_result = KS(scorecard, 'score', 'y_real')
print("ks_result = ", ks_result)

# ROC_AUC(df, score, target)
roc_auc_score_result2 = ROC_AUC(trainData, 'predication', 'target')
# the same metric straight from scikit-learn
roc_auc_score_result_directly = roc_auc_score(trainData['target'], y_pred)
print("roc_auc_score = ", roc_auc_score_result_directly)

plt.rcParams['font.sans-serif'] = ['SimHei']


def plot_precision_recall():
    """Plot the precision-recall curve.

    NOTE(review): reads the module-level names ``recall`` and ``precision``,
    which this script never defines - the caller must create them first
    (presumably via ``precision_recall_curve``; confirm).
    """
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.plot(recall, precision, linewidth=2)
    plt.xlim([0.0, 1])
    plt.ylim([0.0, 1.05])
    plt.xlabel('召回率')
    plt.ylabel('精確率')
    plt.title('精確率 - 召回率曲線')
    plt.show()


def show_metrics():
    """Print precision, recall and F1 computed from a 2x2 confusion matrix.

    NOTE(review): reads the module-level name ``cm``, which this script never
    defines - the caller must create it first (presumably via
    ``confusion_matrix``; confirm).
    """
    tp = cm[1, 1]
    fn = cm[1, 0]
    fp = cm[0, 1]
    tn = cm[0, 0]
    print('精確率：{:.3f}'.format(tp / (tp + fp)))
    print('召回率：{:.3f}'.format(tp / (tp + fn)))
    print('F1 值：{:.3f}'.format(
        2 * (((tp / (tp + fp)) * (tp / (tp + fn))) / ((tp / (tp + fp)) + (tp / (tp + fn))))))


def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw confusion matrix ``cm`` as a heat map with the count written in each cell.

    :param cm: 2-D array of counts
    :param classes: tick labels for both axes
    :param normalize: accepted for compatibility; not used by this function
    :param title: figure title
    :param cmap: matplotlib colour map
    """
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # cells above half of the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


print("結束")

三、運行結果

下面圖片是代碼在 Jupyter Notebook 上運行后展示的結果

四、具體運行過程

具體運行過程，可以在碼云(https://gitee.com/)上關注用戶號：abcgz

創作挑戰賽新人創作獎勵來咯，堅持創作打卡瓜分現金大獎

總結

以上是生活随笔為你收集整理的逻辑回归预测事件发生的概率_通过逻辑回归，对信用卡申请数据使用卡方分箱法预测违约率建模...的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：编写一个函数，该函数能判断一个英文句子s
下一篇： oracle怎么查语句消耗时常,Orac

日韩av黄I国产麻豆传媒I国产91av视频在线观看I日韩一区二区三区在线看I美女国产在线I麻豆视频国产在线观看I成人黄色短片

编程问答

逻辑回归预测事件发生的概率_通过逻辑回归，对信用卡申请数据使用卡方分箱法预测违约率建模...

總結