Contents
1. Data Preview
2. Splitting the Dataset
3. Binary Classification
4. Performance Evaluation
  4.1 Cross-Validation
  4.2 Precision and Recall
  4.3 The Receiver Operating Characteristic (ROC) Curve
5. Multiclass Classification
6. Error Analysis
These are reading notes for *Hands-On Machine Learning with Scikit-Learn and TensorFlow* (with reference to the Chinese translation).
The dataset is MNIST: 70,000 images of handwritten digits (see the MNIST dataset download).
1. Data Preview
```python
from scipy.io import loadmat

data = loadmat('mnist-original.mat')
data
```

```
{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Sun Mar 30 03:19:02 2014',
 '__version__': '1.0',
 '__globals__': [],
 'mldata_descr_ordering': array([[array(['label'], dtype='<U5'),
                                  array(['data'], dtype='<U4')]], dtype=object),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
                [0, 0, 0, ..., 0, 0, 0],
                [0, 0, 0, ..., 0, 0, 0],
                ...,
                [0, 0, 0, ..., 0, 0, 0],
                [0, 0, 0, ..., 0, 0, 0],
                [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'label': array([[0., 0., 0., ..., 9., 9., 9.]])}
```

```python
data.keys()
# dict_keys(['__header__', '__version__', '__globals__', 'mldata_descr_ordering', 'data', 'label'])
```
What do the labels look like? Each label is the digit shown in the corresponding image:
```python
y = data['label'].ravel()
y
# array([0., 0., 0., ..., 9., 9., 9.])
```
And the image data: 70,000 images of 784 pixels each (28×28). In the .mat file the array is stored as 784×70000, so we transpose it:
```python
data['data']
# array([[0, 0, 0, ..., 0, 0, 0],
#        [0, 0, 0, ..., 0, 0, 0],
#        ...,
#        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

import pandas as pd

X = pd.DataFrame(data['data'].T)  # transpose to 70000 rows x 784 columns
X
```
```python
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

some_digit = np.array(X.iloc[36000])
some_digit_img = some_digit.reshape(28, 28)
plt.imshow(some_digit_img, interpolation='nearest')
plt.show()
```
It looks like a 5, and `y[36000]` returns 5.0, so it is indeed a 5.
2. Splitting the Dataset
The MNIST dataset is already split into a training set (the first 60,000 images) and a test set (the last 10,000 images):
```python
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
```
The dataset is ordered by label (0–9), so we shuffle the training set. This avoids cross-validation folds that are missing some digit entirely, and it also helps because some algorithms are sensitive to the order of the training samples:
```python
import numpy as np

shuffle_idx = np.random.permutation(60000)
X_train, y_train = X_train.iloc[shuffle_idx], y_train[shuffle_idx]
X_train
```
3. Binary Classification
We train a binary classifier with stochastic gradient descent (SGD) to predict whether a digit is a 5:
```python
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=1)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])
# array([ True])
```
4. Performance Evaluation
4.1 Cross-Validation
```python
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train.iloc[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train.iloc[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
```

```
0.9464
0.9472
0.9659
```
```python
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')
```
Now write a classifier that simply predicts "not 5" for every input:
```python
from sklearn.base import BaseEstimator

class Not5(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

not5_clf = Not5()
cross_val_score(not5_clf, X_train, y_train_5, cv=3, scoring='accuracy')
```
Because only about 10% of the images are 5s, always guessing that an image is not a 5 is right about 90% of the time.
This demonstrates why accuracy is generally not a good performance measure, especially when dealing with skewed datasets where some classes are much more frequent than others.
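We can check the class balance directly (a quick sanity check using the `y_train_5` array from above):

```python
# About 10% of the training labels are True (a 5), so a constant
# "not 5" answer is correct roughly 90% of the time.
print(y_train_5.mean())      # fraction of 5s, roughly 0.1
print(1 - y_train_5.mean())  # accuracy of always predicting "not 5", roughly 0.9
```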
4.2 Precision and Recall
Since accuracy is not a good performance metric here, we turn to the confusion matrix and the precision and recall metrics derived from it:
```python
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)
# array([[52625,  1954],
#        [  856,  4565]], dtype=int64)
```
Precision, recall, and the F1 score (the harmonic mean of the two):
$$F_1 = \frac{2}{\frac{1}{\text{precision}} + \frac{1}{\text{recall}}} = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} = \frac{TP}{TP + \frac{FN + FP}{2}}$$
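Plugging the counts from the confusion matrix above into these formulas gives a quick worked example:

```python
# TP, FP, FN taken from the confusion matrix printed above.
TP, FP, FN = 4565, 1954, 856
precision = TP / (TP + FP)                          # ~0.700
recall = TP / (TP + FN)                             # ~0.842
f1 = 2 * precision * recall / (precision + recall)  # ~0.765
print(precision, recall, f1)
```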
```python
from sklearn.metrics import precision_score, recall_score

precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)

from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)
```
Which metric to favor depends on the application:
- Filtering content suitable for children: we want high precision, so that of everything marked as suitable, the proportion that is actually suitable is very high, protecting children as much as possible.
- A video alarm that detects danger: we want high recall, so that no dangerous case is reported as safe.
- The F1 score requires both to be fairly high.
The precision/recall trade-off:
- Raising the decision threshold increases precision and lowers recall.
- Lowering the decision threshold increases recall and lowers precision.
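As a minimal illustration of this trade-off (using the fitted `sgd_clf` and `some_digit` from above), `decision_function()` returns the raw score, so we can apply any threshold ourselves instead of the default of 0:

```python
y_score = sgd_clf.decision_function([some_digit])
print(y_score > 0)        # default threshold: classified as a 5
print(y_score > 200000)   # a sufficiently high threshold may flip it to "not 5"
```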
```python
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method='decision_function')

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="best")
    plt.ylim([0, 1])

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
```
We can also plot precision against recall directly:
```python
def plot_precision_recall(precisions, recalls):
    plt.plot(recalls[:-1], precisions[:-1], "b--", label="Recalls VS Precisions")
    plt.xlabel("Recalls")
    plt.ylabel("Precisions")
    plt.legend(loc="best")
    plt.ylim([0, 1])

plot_precision_recall(precisions, recalls)
plt.show()
```
Find the threshold where precision reaches 90%; the corresponding recall is about 52%:
```python
threshold_90_precision = thresholds[np.argmax(precisions >= 0.9)]
y_train_pred_90 = (y_scores >= threshold_90_precision)
precision_score(y_train_5, y_train_pred_90)
recall_score(y_train_5, y_train_pred_90)
```
4.3 The Receiver Operating Characteristic (ROC) Curve
The ROC curve plots the true positive rate (TPR, another name for recall) against the false positive rate (FPR, the proportion of negative instances incorrectly classified as positive).
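In terms of the confusion-matrix counts:

$$TPR = \frac{TP}{TP + FN}, \qquad FPR = \frac{FP}{FP + TN}$$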
```python
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plot_roc_curve(fpr, tpr)
plt.show()
```
A good way to compare classifiers is to measure the area under the ROC curve (AUC): the closer the area is to 1, the better.
A perfect classifier has a ROC AUC of 1, while a purely random classifier has a ROC AUC of 0.5.
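A tiny sanity check of those two extremes on synthetic data (an illustration, not from the book):

```python
from sklearn.metrics import roc_auc_score
import numpy as np

# Perfectly ranked scores give AUC = 1.0
print(roc_auc_score([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9]))

# Random scores give AUC close to 0.5
rng = np.random.RandomState(0)
print(roc_auc_score(np.repeat([0, 1], 500), rng.rand(1000)))
```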
```python
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)
```
```python
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")
help(RandomForestClassifier.predict_proba)
```
```python
y_scores_forest = y_probas_forest[:, 1]  # probability of the positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="best")
plt.show()

roc_auc_score(y_train_5, y_scores_forest)
```
5. Multiclass Classification
Some algorithms (such as random forests and naive Bayes) can handle multiclass problems directly; others (such as SVMs or linear classifiers) are strictly binary.
Binary classifiers can nevertheless be combined to perform multiclass classification.
For the digit-prediction task above:
- One approach is to train 10 binary classifiers, one per digit ("is it n?" for n = 0–9). To classify an image, run all 10 classifiers and pick the class with the highest decision score. This is called the one-versus-all (OvA) strategy, also known as one-versus-rest (OvR).
- Another strategy is to train a binary classifier for every pair of digits: one for 0 vs 1, one for 0 vs 2, one for 1 vs 2, and so on. This is the one-versus-one (OvO) strategy. For N classes you must train N*(N-1)/2 classifiers, and the class that wins the most pairwise duels is chosen.

The main advantage of OvO is that each classifier only needs to be trained on the part of the training set containing the two classes it must distinguish. Even so, for most binary classifiers OvA is the better choice; a sketch of it follows below.
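A minimal sketch of forcing OvA/OvR (mirroring the OvO example shown later): Scikit-Learn's `OneVsRestClassifier` wraps any binary classifier and trains one estimator per class.

```python
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

# Assumes the X_train / y_train / some_digit variables defined above.
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=1))
ovr_clf.fit(X_train, y_train)
print(ovr_clf.predict([some_digit]))
print(len(ovr_clf.estimators_))  # 10 estimators, one per digit
```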
```python
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
```
The SGD classifier detects that this is a multiclass task and automatically trains 10 binary classifiers, each producing its own decision score:
```python
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
# array([[ -3868.24582957, -27686.91834291, -11576.99227803,  -1167.01579458,
#         -21161.58664081,   1445.95448704, -20347.02376541, -11273.60667573,
#         -19012.16864028, -12849.63656789]])

np.argmax(some_digit_scores)
sgd_clf.classes_
sgd_clf.classes_[5]
```
Label 5 gets the highest decision score, so the prediction is 5.
To force Scikit-Learn to use the OvO or OvA strategy, use the OneVsOneClassifier or OneVsRestClassifier class and pass a binary classifier to its constructor:
```python
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=1))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)  # 45 classifiers: 10 * 9 / 2
```
A random forest needs none of these strategies: it can perform multiclass classification directly:
```python
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
forest_clf.predict_proba([some_digit])
```
6. Error Analysis
6.1 Inspecting the Confusion Matrix
Make predictions with cross_val_predict(), then call the confusion_matrix() function:
```python
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
conf_mat = confusion_matrix(y_train, y_train_pred)
conf_mat
```

```
array([[5777,    0,   24,   21,   10,   19,   22,    4,   36,   10],
       [   3, 6478,   48,   46,   12,   25,   12,   14,   83,   21],
       [  91,   71, 5088,  235,   38,   40,   77,   64,  235,   19],
       [  56,   26,  185, 5376,    6,  160,   32,   62,  143,   85],
       [  41,   34,   69,   49, 5055,   36,   64,   40,  174,  280],
       [  95,   27,   66,  430,   65, 4243,   98,   23,  275,   99],
       [ 101,   20,   82,   14,   31,   98, 5501,    3,   58,   10],
       [  38,   27,   88,   79,   47,   21,    5, 5650,   33,  277],
       [  61,  130,   96,  469,   31,  240,   40,   39, 4587,  158],
       [  51,   34,   50,  250,  141,   80,    1,  330,  289, 4723]],
      dtype=int64)
```
```python
plt.matshow(conf_mat, cmap=plt.cm.gray)
```
The white cells are mostly on the main diagonal, meaning most images were classified correctly. The cell for digit 5 is a bit darker than the others, which could mean either that there are fewer 5s in the dataset or that the classifier does not perform as well on 5s. To separate the two effects, divide each row of the confusion matrix by the number of images in that class:
```python
row_sums = conf_mat.sum(axis=1, keepdims=True)
norm_conf_mat = conf_mat / row_sums       # error rates per true class
np.fill_diagonal(norm_conf_mat, 0)        # keep only the errors
plt.matshow(norm_conf_mat, cmap=plt.cm.gray)
plt.show()
```
Keeping only the misclassified cells and looking at the error distribution, we can see that digits are most often misclassified as 3, 8, or 9.
Let's pull out the 3 vs 5 predictions and analyze them:
```python
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=plt.cm.binary, **options)
    plt.axis("off")

cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]  # 3s classified as 3
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]  # 3s classified as 5
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]  # 5s classified as 3
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]  # 5s classified as 5

plt.figure(figsize=(8, 8))
plt.subplot(221); plot_digits(np.array(X_aa[:25]), images_per_row=5)
plt.subplot(222); plot_digits(np.array(X_ab[:25]), images_per_row=5)
plt.subplot(223); plot_digits(np.array(X_ba[:25]), images_per_row=5)
plt.subplot(224); plot_digits(np.array(X_bb[:25]), images_per_row=5)
plt.show()
```
The reason: 3s and 5s differ in only a few pixels, so the model easily confuses them.
The main difference between 3s and 5s is the position of the small line that joins the top stroke to the bottom arc. If you draw a 3 with the junction slightly shifted to the left, the classifier is likely to classify it as a 5, and vice versa. In other words, this classifier is quite sensitive to image shifting and rotation. One way to reduce the 3/5 confusion is therefore to preprocess the images to ensure they are well centered and not too rotated; this would probably help reduce other errors as well.
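As a minimal sketch of such preprocessing (an illustration, not the book's code; it assumes `scipy.ndimage` is available and reuses the `some_digit` array from above), we can re-center a digit on its intensity center of mass:

```python
import numpy as np
from scipy.ndimage import center_of_mass, shift

def center_digit(flat_image):
    """Shift a flattened 28x28 digit so its center of mass lands mid-image."""
    img = np.asarray(flat_image, dtype=float).reshape(28, 28)
    cy, cx = center_of_mass(img)              # intensity-weighted centroid
    return shift(img, (13.5 - cy, 13.5 - cx)).reshape(784)

some_digit_centered = center_digit(some_digit)
```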