python 特征选择卡方_特征选择
2020-01-10
皮爾遜相關(guān)系數(shù)
image.png
# Pearson correlation: measure linear association between the target and each
# numeric feature via the absolute Pearson coefficient, and keep the top
# num_feats features by that criterion.
def cor_selector(X, y, num_feats):
    """Select the num_feats columns of X most correlated with y.

    Parameters
    ----------
    X : pandas.DataFrame of numeric features.
    y : array-like target, same length as X.
    num_feats : int, number of features to keep.

    Returns
    -------
    cor_support : list of bool, one flag per column of X (True = selected).
    cor_feature : list of the selected column names.
    """
    cor_list = []
    feature_name = X.columns.tolist()
    # Correlation of each feature with y.
    for col in feature_name:
        cor_list.append(np.corrcoef(X[col], y)[0, 1])
    # Constant columns give NaN correlation; treat them as uncorrelated.
    cor_list = [0 if np.isnan(c) else c for c in cor_list]
    # Names of the num_feats columns with the largest |correlation|.
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # Boolean mask aligned with X.columns: True if the column was selected.
    cor_support = [name in cor_feature for name in feature_name]
    return cor_support, cor_feature
# Run the Pearson selector on the dataset.
cor_support, cor_feature = cor_selector(X, y, num_feats)
print(str(len(cor_feature)), 'selected features')

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
from sklearn.datasets import load_iris

iris = load_iris()
# SelectKBest keeps the K best features and returns the reduced data.
# Its first argument is a score function that takes the feature matrix and the
# target vector and returns per-feature (score, p-value) arrays -- here the
# Pearson correlation. The parameter k is the number of features to keep.
def multivariate_pearsonr(X, y):
    """Score function for SelectKBest: per-column |Pearson r| and p-value.

    Parameters
    ----------
    X : 2-D array of shape (n_samples, n_features).
    y : 1-D target vector of length n_samples.

    Returns
    -------
    (scores, pvalues) : numpy arrays with one entry per column of X;
        scores are absolute correlations so SelectKBest ranks by strength,
        not sign.
    """
    scores, pvalues = [], []
    # pearsonr operates on 1-D vectors, so iterate over the columns of X.
    for r, p in map(lambda col: pearsonr(col, y), X.T):
        scores.append(abs(r))
        pvalues.append(p)
    return (np.array(scores), np.array(pvalues))
# Rank the iris features by absolute Pearson correlation and keep the best two.
pearson_kbest = SelectKBest(score_func=multivariate_pearsonr, k=2)
Xt_pearson = pearson_kbest.fit_transform(iris.data, iris.target)
print(Xt_pearson)
卡方分布
要求特征取值非負(fù)(通常先做 Min-Max 縮放到 [0, 1]);適用于分類(lèi)目標(biāo),并不限于二分類(lèi)。
計(jì)算目標(biāo)與非負(fù)特征之間的卡方統(tǒng)計(jì)量,只選取卡方值最大的變量。
image.png
假設(shè)自變量有N種取值,因變量有M種取值,考慮自變量等于i且因變量等于j的樣本頻數(shù)的觀察值與期望的差距,構(gòu)建統(tǒng)計(jì)量:
# [image in the original post: the chi-square statistic formula]
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the K best features by chi-square score; returns the reduced data.
SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# chi2 requires non-negative features, so scale everything into [0, 1] first.
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
# Boolean mask over the columns of X: True where the feature was kept.
chi_support = chi_selector.get_support()
chi_feature = X.loc[:, chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')
遞歸特征消除
# Recursive feature elimination: repeatedly drop the least important features
# according to the base model, until num_feats remain.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_selector = RFE(estimator=LogisticRegression(),
                   n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:, rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination; returns the data restricted to the kept features.
# estimator is the base model; n_features_to_select is how many features to keep.
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)
套索:SelectFromModel
Lasso和RF都有自己的特征選擇方法。Lasso正則化器強(qiáng)制許多特征權(quán)重為零from sklearn.feature_selection import Select
FromModelfrom sklearn.linear_model import LogisticRegression
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"),
max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
基于樹(shù)形結(jié)構(gòu):SelectFromModel
使用隨機(jī)森林,根據(jù)特征的重要性來(lái)選擇特征, 使用每個(gè)決策樹(shù)中的節(jié)點(diǎn)雜質(zhì)來(lái)計(jì)算特征的重要性。隨機(jī)森林中,最終的特征重要性是所有決策樹(shù)特征重要性的平均值。from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
embeded_rf_selector =
SelectFromModel(RandomForestClassifier(n_estimators=100),
max_features=num_feats)
embeded_rf_selector.fit(X, y)e
mbeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
結(jié)合GBDT模型from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
#GBDT作為基模型的特征選擇
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
# Any estimator exposing a feature_importances_ attribute works here,
# e.g. LightGBM or XGBoost.
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

# Fixed: the original assigned this to `gbc` but then referenced `lgbc`
# (NameError); use one consistent name.
lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                      num_leaves=32, colsample_bytree=0.2,
                      reg_alpha=3, reg_lambda=1, min_split_gain=0.01,
                      min_child_weight=40)
embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X, y)
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:, embeded_lgb_support].columns.tolist()
# Fixed: the original print was missing its closing parenthesis.
print(str(len(embeded_lgb_feature)), 'selected features')
總結(jié)
# Put all selection results together into one comparison table.
feature_selection_df = pd.DataFrame({'Feature': feature_name,
                                     'Pearson': cor_support,
                                     'Chi-2': chi_support,
                                     'RFE': rfe_support,
                                     'Logistics': embeded_lr_support,
                                     'Random Forest': embeded_rf_support,
                                     'LightGBM': embeded_lgb_support})
# Count how many methods selected each feature. Sum only the boolean method
# columns -- summing the string 'Feature' column fails on modern pandas.
feature_selection_df['Total'] = feature_selection_df.drop(columns='Feature').sum(axis=1)
# Sort by vote count (then name) and show the top num_feats features.
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df.head(num_feats)
functionComputeSMA(data,window_size)
https://www.jianshu.com/p/ddcc51dfc578
總結(jié)
以上是生活随笔為你收集整理的python 特征选择卡方_特征选择的全部?jī)?nèi)容,希望文章能夠幫你解決所遇到的問(wèn)題。
- 上一篇: oppo快充数据线多少钱
- 下一篇: java添加窗体_添加的窗体