當前位置：首頁 > 编程语言 > python >内容正文

python

python分箱统计个数_【数据处理】python变量分箱常见手法：分类型、数值型、卡方、自定义...

發布時間：2024/1/23 python 27 豆豆

生活随笔收集整理的這篇文章主要介紹了 python分箱统计个数_【数据处理】python变量分箱常见手法：分类型、数值型、卡方、自定义...，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

"""

分箱邏輯：

1.類別型特征：

1)類別數在5個以下，可以直接根據類別來分箱 (binning_cate)

2)類別數在5個以上，建議做降基處理，再根據降基后的類別做分箱

2.數值型特征：

1)離散型數值特征(特征value的變動幅度較小)：

若特征value的非重復計數在5個以下，可以直接根據非重復計數值來分箱(binning_cate)

若特征value的非重復計數在5個以上，建議根據業務解釋或者數據分布做自定義分箱(binning_self)

2)連續型數值特征(特征value的變動幅度較大)：

可以用卡方分箱或自定義分箱。(binning_num,binning_self)

PS：部分特征使用卡方分箱時可能會報錯，建議將這些特征改為手動自定義分箱(binning_self)

3.特征有缺失：

1)缺失率在5%以下，可以先對缺失做填充處理再分箱(binning_num)

2)缺失率在5%以上，建議將缺失當作一個類別來分箱(binning_sparse_col)

4.稀疏特征分箱

建議將稀疏值(一般為0)單獨分為一箱，剩下的值做卡方或者自定義分箱(binning_sparse_col)

"""

def binning_cate(df, col, target):
    """Evaluate a feature by treating every distinct value as its own bin.

    Rows of ``df`` are grouped by the raw values of ``col``; for each group
    the sample count, bad count, good count, bad rate and WOE are computed,
    and the feature-level IV (sum of the per-bin IV contributions) is
    repeated on every row.

    Args:
        df: pandas DataFrame containing both ``col`` and ``target``.
        col: Name of the feature column to bin.
        target: Name of the label column. It is summed to obtain the number
            of bad samples, so it is presumably a 0/1 flag with 1 = bad --
            confirm against the caller.

    Returns:
        A DataFrame with one row per distinct value of ``col`` and the
        columns '特征名', '分箱結果', '樣本數', '黑樣本數', '白樣本數',
        '逾期用戶占比', 'WOE', 'IV', in that order.

    Note:
        No smoothing is applied: a bin that contains no bad samples or no
        good samples yields an infinite WOE, which makes IV infinite too.
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    labels = df.groupby([col], as_index=True)[target]
    stats = pd.DataFrame({'樣本數': labels.count(), '黑樣本數': labels.sum()})
    stats['白樣本數'] = stats['樣本數'] - stats['黑樣本數']
    stats['逾期用戶占比'] = stats['黑樣本數'] / stats['樣本數']

    # Share of all bad / all good samples that fall into each bin.
    bad_share = stats['黑樣本數'] / bad
    good_share = stats['白樣本數'] / good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is a single number for the feature; it is broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    # The group key comes back as a column named after the feature.
    bin_df = stats.reset_index().rename(columns={col: '分箱結果'})
    bin_df.insert(0, '特征名', col)
    return bin_df

def binning_self(df, col, target, cut=None, right_border=True):
    """Evaluate a numeric feature using caller-supplied bin edges.

    ``col`` is discretised with ``pd.cut`` using ``cut`` as the edges; the
    per-bin sample count, bad count, good count, bad rate and WOE are
    computed, the feature-level IV is repeated on every row, and the values
    returned by ``cal_ks`` are appended as extra columns.

    Args:
        df: pandas DataFrame containing both ``col`` and ``target``.
        col: Name of the feature column to bin.
        target: Name of the label column. It is summed to obtain the number
            of bad samples, so it is presumably a 0/1 flag with 1 = bad --
            confirm against the caller.
        cut: Custom bin edges, forwarded to ``pd.cut`` as ``bins``. The
            default ``None`` is not a usable value; callers are expected to
            pass the edges explicitly.
        right_border: Forwarded to ``pd.cut`` as ``right``. ``True`` gives
            left-open / right-closed intervals, ``False`` gives
            left-closed / right-open intervals.

    Returns:
        A DataFrame with one row per interval and the columns '特征名',
        '分箱結果', '樣本數', '黑樣本數', '白樣本數', '逾期用戶占比', 'WOE',
        'IV', '準確率', '召回率', '打擾率', 'KS'.

    Note:
        No smoothing is applied, so an interval with no bad or no good
        samples does not produce a finite WOE.
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    bucket = pd.cut(df[col], cut, right=right_border)
    # ``bucket`` is categorical. Pin observed=False so that intervals with no
    # samples still get a row, whatever the installed pandas default is.
    labels = df.groupby(bucket, observed=False)[target]
    stats = pd.DataFrame({'樣本數': labels.count(), '黑樣本數': labels.sum()})
    stats['白樣本數'] = stats['樣本數'] - stats['黑樣本數']
    stats['逾期用戶占比'] = stats['黑樣本數'] / stats['樣本數']

    # Share of all bad / all good samples that fall into each bin.
    bad_share = stats['黑樣本數'] / bad
    good_share = stats['白樣本數'] / good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is a single number for the feature; it is broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    # The group key comes back as a column named after the feature.
    bin_df = stats.reset_index().rename(columns={col: '分箱結果'})
    bin_df.insert(0, '特征名', col)

    # cal_ks is defined outside this block; it is evaluated on the raw
    # (un-binned) feature and its four results are attached as columns.
    # NOTE(review): presumably these are scalars -- confirm in cal_ks.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['準確率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打擾率'] = fpr
    bin_df['KS'] = ks
    return bin_df

def binning_num(df, target, col, max_bin=None, min_binpct=None):
    """Evaluate a numeric feature using cut points produced by ``ChiMerge``.

    The cut points returned by ``ChiMerge`` are extended with -inf and +inf
    so every non-missing value falls into some interval, ``col`` is
    discretised with ``pd.cut`` (left-open / right-closed intervals), and
    the per-bin statistics, the feature-level IV and the ``cal_ks`` results
    are assembled into one table.

    Caution: the positional order here is ``(df, target, col)``, which is
    the reverse of ``binning_cate`` / ``binning_self`` (``df, col, target``).

    Args:
        df: pandas DataFrame containing both ``col`` and ``target``.
        target: Name of the label column. It is summed to obtain the number
            of bad samples, so it is presumably a 0/1 flag with 1 = bad --
            confirm against the caller.
        col: Name of the feature column to bin.
        max_bin: Maximum number of bins; forwarded unchanged to ``ChiMerge``.
        min_binpct: Minimum share of all samples per bin; forwarded
            unchanged to ``ChiMerge``.

    Returns:
        A DataFrame with one row per interval and the columns '特征名',
        '分箱結果', '樣本數', '黑樣本數', '白樣本數', '逾期用戶占比', 'WOE',
        'IV', '準確率', '召回率', '打擾率', 'KS'.

    Note:
        No smoothing is applied, so an interval with no bad or no good
        samples does not produce a finite WOE.
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    # ChiMerge is defined outside this block.
    # NOTE(review): assumed to return an ascending list of interior cut
    # points -- confirm. A new list is built so its result is not mutated.
    inner_cuts = ChiMerge(df, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut = [float('-inf')] + list(inner_cuts) + [float('inf')]

    bucket = pd.cut(df[col], cut)
    # ``bucket`` is categorical. Pin observed=False so that intervals with no
    # samples still get a row, whatever the installed pandas default is.
    labels = df.groupby(bucket, observed=False)[target]
    stats = pd.DataFrame({'樣本數': labels.count(), '黑樣本數': labels.sum()})
    stats['白樣本數'] = stats['樣本數'] - stats['黑樣本數']
    stats['逾期用戶占比'] = stats['黑樣本數'] / stats['樣本數']

    # Share of all bad / all good samples that fall into each bin.
    bad_share = stats['黑樣本數'] / bad
    good_share = stats['白樣本數'] / good
    stats['WOE'] = np.log(bad_share / good_share)
    # IV is a single number for the feature; it is broadcast to every row.
    stats['IV'] = ((bad_share - good_share) * stats['WOE']).sum()

    # The group key comes back as a column named after the feature.
    bin_df = stats.reset_index().rename(columns={col: '分箱結果'})
    bin_df.insert(0, '特征名', col)

    # cal_ks is defined outside this block; it is evaluated on the raw
    # (un-binned) feature and its four results are attached as columns.
    # NOTE(review): presumably these are scalars -- confirm in cal_ks.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['準確率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打擾率'] = fpr
    bin_df['KS'] = ks
    return bin_df

def binning_sparse_col(df, target, col, max_bin=None, min_binpct=None, sparse_value=None):
    """Evaluate a sparse feature: one bin for the sparse value, ChiMerge for the rest.

    Rows whose ``col`` equals ``sparse_value`` form a dedicated bin
    ``(-inf, sparse_value]``. The remaining rows are binned with cut points
    from ``ChiMerge``, bounded below by ``sparse_value`` and above by +inf.
    Both parts use the bad / good totals of the whole ``df`` as denominators,
    so their IV contributions can be summed into one feature-level IV
    (rounded to 3 decimals).

    Caution: the positional order here is ``(df, target, col)``, which is
    the reverse of ``binning_cate`` / ``binning_self`` (``df, col, target``).

    Args:
        df: pandas DataFrame containing both ``col`` and ``target``.
        target: Name of the label column. It is summed to obtain the number
            of bad samples, so it is presumably a 0/1 flag with 1 = bad --
            confirm against the caller.
        col: Name of the feature column to bin.
        max_bin: Maximum number of bins; forwarded unchanged to ``ChiMerge``.
        min_binpct: Minimum share of all samples per bin; forwarded
            unchanged to ``ChiMerge``.
        sparse_value: The value that gets its own bin. It is used as a
            ``pd.cut`` edge, so the default ``None`` is not a usable value;
            callers are expected to pass a number (typically 0).

    Returns:
        A DataFrame with a fresh 0..n-1 index, the sparse bin first, and the
        columns '特征名', '分箱結果', '樣本數', '黑樣本數', '白樣本數',
        '逾期用戶占比', 'WOE', 'IV', '準確率', '召回率', '打擾率', 'KS'.

    Note:
        No smoothing is applied, so a bin with no bad or no good samples
        does not produce a finite WOE.
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    # Split off the sparse value.
    # NOTE(review): the match is an equality test, so a NaN sparse_value
    # never matches; missing values presumably have to be filled with a
    # sentinel number before calling this function -- confirm.
    is_sparse = df[col] == sparse_value
    sparse_part = df[is_sparse]
    dense_part = df[~is_sparse]

    sparse_bucket = pd.cut(sparse_part[col], [float('-inf'), sparse_value])
    sparse_stats = _sparse_bin_stats(sparse_part, sparse_bucket, target, bad, good)

    # Chi-square binning for everything that is not the sparse value.
    # ChiMerge is defined outside this block.
    # NOTE(review): assumed to return an ascending list of interior cut
    # points -- confirm. A new list is built so its result is not mutated.
    inner_cuts = ChiMerge(dense_part, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut = [sparse_value] + list(inner_cuts) + [float('inf')]
    # NOTE(review): the lowest edge is sparse_value itself, so rows with a
    # value below sparse_value fall outside every interval and are left out
    # of the per-bin counts -- confirm this is intended.
    dense_bucket = pd.cut(dense_part[col], cut)
    dense_stats = _sparse_bin_stats(dense_part, dense_bucket, target, bad, good)

    # Both parts were reset to a 0-based index; ignore_index avoids duplicate
    # row labels in the combined table.
    bin_df = pd.concat([sparse_stats, dense_stats], axis=0, ignore_index=True)
    bin_df['IV'] = bin_df['bin_iv'].sum().round(3)
    bin_df = bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1)
    bin_df = bin_df.rename(columns={col: '分箱結果'})
    bin_df.insert(0, '特征名', col)

    # cal_ks is defined outside this block; it is evaluated on the raw
    # (un-binned) feature and its four results are attached as columns.
    # NOTE(review): presumably these are scalars -- confirm in cal_ks.
    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['準確率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打擾率'] = fpr
    bin_df['KS'] = ks
    return bin_df


def _sparse_bin_stats(part, bucket, target, bad, good):
    """Return per-bin counts, bad rate, WOE and IV contribution for ``part``.

    ``bad`` and ``good`` are the totals of the full data set, not of ``part``.
    """
    # ``bucket`` is categorical. Pin observed=False so that intervals with no
    # samples still get a row, whatever the installed pandas default is.
    labels = part.groupby(bucket, observed=False)[target]
    stats = pd.DataFrame({'樣本數': labels.count(), '黑樣本數': labels.sum()})
    stats['白樣本數'] = stats['樣本數'] - stats['黑樣本數']
    stats['逾期用戶占比'] = stats['黑樣本數'] / stats['樣本數']
    stats['badattr'] = stats['黑樣本數'] / bad
    stats['goodattr'] = stats['白樣本數'] / good
    stats['WOE'] = np.log(stats['badattr'] / stats['goodattr'])
    stats['bin_iv'] = (stats['badattr'] - stats['goodattr']) * stats['WOE']
    return stats.reset_index()

總結

以上是生活随笔為你收集整理的python分箱统计个数_【数据处理】python变量分箱常见手法：分类型、数值型、卡方、自定义...的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： mysql无法连接server 2008
下一篇： websocket python爬虫_p