當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Scikit-Learn 机器学习笔记 -- MNIST

發布時間：2025/3/12 编程问答 20 豆豆

生活随笔收集整理的這篇文章主要介紹了《Scikit-Learn 机器学习笔记 -- MNIST》，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

Scikit-Learn 機器學習筆記 – MNIST

參考文檔： handson-ml

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler


def load_dataset():
    """Download (or load from cache) MNIST and return a shuffled train/test split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The first 60000 samples
        form the training set (shuffled with ``np.random.permutation``); the
        remaining samples form the test set.
    """
    # fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22
    # (mldata.org is offline); fetch_openml is the supported replacement.
    # as_frame=False asks for plain NumPy arrays so positional slicing works.
    mnist = fetch_openml('mnist_784', version=1, data_home='dataset', as_frame=False)
    X = np.asarray(mnist['data'])
    # OpenML delivers the labels as strings; convert to numbers so that
    # comparisons such as ``y_train == 5`` behave as they did with mldata.
    y = np.asarray(mnist['target']).astype(np.float64)

    n_train = 60000
    X_train, X_test, y_train, y_test = X[:n_train], X[n_train:], y[:n_train], y[n_train:]

    # Shuffle the training set so cross-validation folds are not order-dependent.
    shuffle_index = np.random.permutation(len(X_train))
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
    print('load mnist successfully\n', 'X_train shape is: ', X_train.shape, 'X_test shape is:', X_test.shape)
    return X_train, X_test, y_train, y_test


def show_data(dataset, labels, index):
    """Print the label of sample ``index`` and display it as a 28x28 image."""
    sample = dataset[index]
    sample_img = sample.reshape(28, 28)
    print('The label of this image is:', labels[index])
    plt.imshow(sample_img)
    plt.axis('off')
    plt.show()


def single_number_classify(X_train, y_train, number):
    """Train an SGD binary classifier that detects a single digit.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        number: The digit treated as the positive class.

    Returns:
        Tuple ``(sgd_clf, y_train_i)``: the fitted classifier and the boolean
        label vector (True where the label equals ``number``).
    """
    # Rebuild the labels: True for the target digit, False for everything else.
    y_train_i = (y_train == number)
    # y_test_i = (y_test == number)
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train_i)
    return sgd_clf, y_train_i


def snc_predict(sgd_clf, samples):
    """Print the single-digit binary classifier's predictions for ``samples``."""
    predict = sgd_clf.predict(samples)
    print(' Predicted as:', predict)


def snc_assess(sgd_clf, X_train, y_train_i):
    """Evaluate the single-digit binary classifier with 3-fold cross-validation.

    The evaluation uses held-out folds of the training data, not the test set.
    Prints accuracy, the confusion matrix, precision, recall and F1, then plots
    the precision/recall-versus-threshold curves and the ROC curve.
    """
    # 3-fold cross-validated accuracy.
    crs = cross_val_score(sgd_clf, X_train, y_train_i, cv=3, scoring="accuracy")
    print('3折交叉驗證精度為：', crs)

    # Confusion matrix: each row is an actual class, each column a predicted
    # class, and each value is a sample count.
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_i, cv=3)
    confu_matrix = confusion_matrix(y_train_i, y_train_pred)
    print('單數字二分類器混淆矩陣為：', confu_matrix)

    # Precision, recall and F1 score.
    precision = precision_score(y_train_i, y_train_pred)
    recall = recall_score(y_train_i, y_train_pred)
    f1_sco = f1_score(y_train_i, y_train_pred)
    print('準確率為：', precision, '召回率為', recall, 'F1值為：', f1_sco)

    # Decision scores are needed to sweep the classification threshold.
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_i, cv=3, method="decision_function")
    precisions, recalls, thresholds = precision_recall_curve(y_train_i, y_scores)

    # precisions/recalls carry one more entry than thresholds; drop the last
    # value so the arrays line up for plotting.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    plt.show()

    # ROC curve: true positive rate (recall) against false positive rate.
    fpr, tpr, thresholds = roc_curve(y_train_i, y_scores)
    plt.plot(fpr, tpr, linewidth=2, label=None)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()


def number_classify_ova(X_train, y_train):
    """Train a multiclass SGD classifier (SGDClassifier's default multiclass strategy).

    Predicts training sample 100, prints the prediction together with the
    per-class decision scores, and returns the fitted classifier.
    """
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(X_train, y_train)

    sample = X_train[100]
    predict = sgd_clf.predict([sample])
    # Score of this sample for every class.
    digit_scores = sgd_clf.decision_function([sample])
    print('OvA的隨機梯度下降分類器預測結果為：', predict, '該樣本的各類得分：', digit_scores)
    return sgd_clf


def number_classify_ovo(X_train, y_train):
    """Train a one-vs-one multiclass SGD classifier and print its prediction for sample 100."""
    ovo_sgd_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_sgd_clf.fit(X_train, y_train)

    sample = X_train[100]
    predict = ovo_sgd_clf.predict([sample])
    print('OvO的隨機梯度下降分類器預測結果為：', predict)


def number_classify_rf(X_train, y_train):
    """Train a random-forest multiclass classifier and print its prediction for sample 100."""
    forest_clf = RandomForestClassifier(random_state=42)
    forest_clf.fit(X_train, y_train)

    sample = X_train[100]
    predict = forest_clf.predict([sample])
    print('隨機森林預測分類器結果為', predict)


def input_scaled_sgd(sgd_clf, X_train, y_train):
    """Evaluate ``sgd_clf`` on standardized inputs and plot its error pattern.

    Scales the training set with ``StandardScaler``, prints the 3-fold
    cross-validated accuracy and confusion matrix, then shows the row-normalized
    confusion matrix with its diagonal zeroed.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

    scores = cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
    print('輸入正則化后的手寫數字隨機梯度下降多分類器的3折交叉驗證精度為:', scores)

    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    print('輸入正則化后的手寫數字隨機梯度下降多分類器的混淆矩陣:\n', conf_mx)

    # Convert counts to per-row proportions so classes of different sizes
    # are comparable.
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    # Zero the diagonal so only the misclassifications (bright cells) stand out.
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()


if __name__ == '__main__':
    # Load the dataset.
    X_train, X_test, y_train, y_test = load_dataset()

    # Optional: show one sample of the dataset.
    # show_data(X_train, y_train, 100)

    # Optional: build, try out and evaluate the single-digit binary classifier.
    # sgd_clf, y_train_i = single_number_classify(X_train, y_train, 5)
    # snc_predict(sgd_clf, X_train[:3])
    # snc_assess(sgd_clf, X_train, y_train_i)

    # Multiclass SGD classifier, OvA.
    sgd_clf_ova = number_classify_ova(X_train, y_train)

    # Optional: multiclass SGD classifier, OvO.
    # number_classify_ovo(X_train, y_train)

    # Optional: random-forest multiclass classifier.
    # number_classify_rf(X_train, y_train)

    # Evaluate the multiclass SGD classifier on standardized inputs.
    input_scaled_sgd(sgd_clf_ova, X_train, y_train)

總結

以上是生活随笔為你收集整理的Scikit-Learn 机器学习笔记 -- MNIST的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：深入学习二叉树(二) 线索二叉树
下一篇： Python 程序打包 -- 使用pyi