Scikit-Learn 机器学习笔记 -- MNIST
生活随笔
收集整理的這篇文章主要介紹了
Scikit-Learn 机器学习笔记 -- MNIST
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
Scikit-Learn 機器學習筆記 – MNIST
參考文檔: handson-ml
"""MNIST digit-classification walkthrough with scikit-learn.

Covers a single-digit binary SGD classifier (with cross-validated metrics,
precision/recall and ROC plots), multiclass SGD in one-vs-rest and one-vs-one
form, a random forest, and an error analysis of the SGD classifier on
standardized inputs.

NOTE(review): the non-ASCII message strings printed below appear to carry
extraction artifacts (inline pinyin in parentheses). They are kept
byte-identical here -- confirm the intended wording against the original
article before changing them.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler


def load_dataset():
    """Load MNIST and return ``(X_train, X_test, y_train, y_test)``.

    The first 60000 samples form the training set and the remainder the test
    set. The training set is shuffled with an unseeded random permutation, so
    the sample order differs between runs.
    """
    # fetch_mldata() (and the mldata.org service behind it) is no longer
    # available in scikit-learn; OpenML's 'mnist_784' is the replacement.
    # as_frame=False keeps plain NumPy arrays so positional slicing and
    # reshape() below keep working.
    mnist = fetch_openml('mnist_784', version=1, as_frame=False,
                         data_home='dataset')
    X = mnist['data']
    # OpenML delivers the labels as strings; fetch_mldata() returned floats,
    # so cast to float64 to keep `y == number` comparisons and printed
    # predictions behaving as before.
    y = mnist['target'].astype(np.float64)

    n_train = 60000
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    # Shuffle so every cross-validation fold sees a mix of digits.
    shuffle_index = np.random.permutation(n_train)
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

    print('load mnist successfully\n', 'X_train shape is: ', X_train.shape,
          'X_test shape is:', X_test.shape)
    return X_train, X_test, y_train, y_test


def show_data(dataset, labels, index):
    """Print the label of ``dataset[index]`` and display it as a 28x28 image."""
    sample_img = dataset[index].reshape(28, 28)
    print('The label of this image is:', labels[index])
    plt.imshow(sample_img)
    plt.axis('off')
    plt.show()


def single_number_classify(X_train, y_train, number):
    """Fit a binary SGD classifier that detects one digit.

    Returns:
        ``(sgd_clf, y_train_i)`` -- the fitted classifier and the boolean
        target vector (True where the label equals ``number``).
    """
    # Rebuild the labels: True for the chosen digit, False for all others.
    y_train_i = (y_train == number)
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train_i)
    return sgd_clf, y_train_i


def snc_predict(sgd_clf, samples):
    """Print and return the binary classifier's predictions for ``samples``."""
    predict = sgd_clf.predict(samples)
    print(' Predicted as:', predict)
    return predict


def snc_assess(sgd_clf, X_train, y_train_i):
    """Assess the single-digit binary classifier with 3-fold cross-validation.

    Works on the training data only (validation folds, not the test set).
    Prints the accuracy scores, the confusion matrix, and precision / recall /
    F1, then plots precision and recall against the decision threshold,
    followed by the ROC curve.
    """
    # K-fold cross-validation accuracy, K = 3.
    crs = cross_val_score(sgd_clf, X_train, y_train_i, cv=3,
                          scoring="accuracy")
    print('3折交叉驗(yàn)證精度為:', crs)

    # Confusion matrix: one row per actual class, one column per predicted
    # class, each cell a sample count.
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_i, cv=3)
    confu_matrix = confusion_matrix(y_train_i, y_train_pred)
    print('單數(shù)字二分類器混淆矩陣為:', confu_matrix)

    # Precision, recall and F1 score.
    precision = precision_score(y_train_i, y_train_pred)
    recall = recall_score(y_train_i, y_train_pred)
    f1_sco = f1_score(y_train_i, y_train_pred)
    print('準(zhǔn)確率為:', precision, '召回率為', recall, 'F1值為:', f1_sco)

    # Decision scores (rather than hard predictions) are needed to sweep the
    # threshold.
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_i, cv=3,
                                 method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y_train_i,
                                                             y_scores)

    # precisions/recalls are sliced with [:-1] so their lengths match
    # `thresholds` when plotted.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    plt.show()

    # ROC curve: true positive rate (i.e. recall) against false positive rate.
    fpr, tpr, _roc_thresholds = roc_curve(y_train_i, y_scores)
    plt.plot(fpr, tpr, linewidth=2, label=None)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()


def number_classify_ova(X_train, y_train, sample_index=100):
    """Fit a multiclass SGD classifier (default one-vs-all / one-vs-rest).

    Prints the prediction and the per-class decision scores for
    ``X_train[sample_index]`` and returns the fitted classifier.
    """
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train)

    sample = X_train[sample_index]
    predict = sgd_clf.predict([sample])
    # Scores of this sample for every class.
    digit_scores = sgd_clf.decision_function([sample])
    print('OvA的隨機(jī)梯度下降分類器預(yù)測結(jié)果為:', predict, '該樣本的各類得分:', digit_scores)
    return sgd_clf


def number_classify_ovo(X_train, y_train, sample_index=100):
    """Fit a one-vs-one multiclass SGD classifier.

    Prints the prediction for ``X_train[sample_index]`` and returns the fitted
    classifier.
    """
    ovo_sgd_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_sgd_clf.fit(X_train, y_train)

    sample = X_train[sample_index]
    predict = ovo_sgd_clf.predict([sample])
    print('OvO的隨機(jī)梯度下降分類器預(yù)測結(jié)果為:', predict)
    return ovo_sgd_clf


def number_classify_rf(X_train, y_train, sample_index=100):
    """Fit a random-forest multiclass classifier.

    Prints the prediction for ``X_train[sample_index]`` and returns the fitted
    classifier.
    """
    forest_clf = RandomForestClassifier(random_state=42)
    forest_clf.fit(X_train, y_train)

    sample = X_train[sample_index]
    predict = forest_clf.predict([sample])
    print('隨機(jī)森林預(yù)測分類器結(jié)果為', predict)
    return forest_clf


def input_scaled_sgd(sgd_clf, X_train, y_train):
    """Evaluate ``sgd_clf`` on standardized inputs and plot its error matrix.

    Standardizes ``X_train`` with ``StandardScaler``, prints the 3-fold
    cross-validation accuracy and the confusion matrix, then displays the
    row-normalized confusion matrix with its diagonal zeroed.
    """
    # Standard scaling of the training set (cast to float64 first).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

    scores = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3,
                             scoring="accuracy")
    print('輸入正則化后的手寫數(shù)字隨機(jī)梯度下降多分類器的3折交叉驗(yàn)證精度為:', scores)

    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    print('輸入正則化后的手寫數(shù)字隨機(jī)梯度下降多分類器的混淆矩陣:\n', conf_mx)

    # Convert counts to the share each cell has within its row.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Zero the diagonal so only the errors remain; bright cells are the
    # worst confusions.
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()


if __name__ == '__main__':
    # Load the dataset.
    X_train, X_test, y_train, y_test = load_dataset()

    # Show one sample of the dataset.
    # show_data(X_train, y_train, 100)

    # Build the single-digit binary SGD classifier.
    # sgd_clf, y_train_i = single_number_classify(X_train, y_train, 5)

    # Try the single-digit binary classifier on a few samples.
    # snc_predict(sgd_clf, X_train[:3])

    # Assess the single-digit binary classifier.
    # snc_assess(sgd_clf, X_train, y_train_i)

    # Multiclass SGD classifier, OvA.
    sgd_clf_ova = number_classify_ova(X_train, y_train)

    # Multiclass SGD classifier, OvO.
    # number_classify_ovo(X_train, y_train)

    # Multiclass random-forest classifier.
    # number_classify_rf(X_train, y_train)

    # Assess the multiclass SGD classifier on standardized inputs.
    input_scaled_sgd(sgd_clf_ova, X_train, y_train)
總結(jié)
以上是生活随笔為你收集整理的Scikit-Learn 机器学习笔记 -- MNIST的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 深入学习二叉树(二) 线索二叉树
- 下一篇: 博途变量类型_PLC数据类型(UDT)