
Naive Bayes (NaiveBayes) classification and prediction for Chinese text on a small dataset

Published: 2023/12/20

This post walks through Naive Bayes classification and prediction for Chinese text on a small dataset; it is shared here as a reference.

Reposted from 相國大人's blog:

http://blog.csdn.net/github_36326955/article/details/54891204

I am keeping it here as a note.

Just run the scripts in the order 1, 2, 3, 4:

1.py(corpus_segment.py)


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: corpus_segment.py
@time: 2017/2/5 15:28
@software: PyCharm
"""
import sys
import os
import jieba

# Configure a UTF-8 output environment (Python 2 only)
reload(sys)
sys.setdefaultencoding('utf-8')

# Save content to a file
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)
'''
The two lines above use the with-statement syntax added in Python 2.6,
which spares us the tedious close/try handling; Python 2.5 needs
"from __future__ import with_statement". Beginners can read
http://zhoutall.com/archives/325 to learn more.
'''

# Read a file
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the unsegmented corpus
    seg_path: path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # list all subdirectories of corpus_path
    '''
    Each subdirectory name is a category name. For example, in
    train_corpus/art/21.txt, 'train_corpus/' is corpus_path and
    'art' is one member of catelist.
    '''
    # Walk every directory (category) and its files
    for mydir in catelist:
        '''
        Here mydir is the 'art' part of train_corpus/art/21.txt
        (i.e., one category in catelist).
        '''
        class_path = corpus_path + mydir + "/"  # category subdirectory, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # corresponding output directory, e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):  # create the output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # all texts of one category in the unsegmented corpus
        '''
        For 21.txt, 22.txt, 23.txt ... in train_corpus/art/,
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # iterate over all files in the category directory
            fullname = class_path + file_path  # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # read the file content
            '''
            At this point content holds every character of the original text,
            including redundant spaces, blank lines, carriage returns and so on.
            Next we strip these irrelevant characters, leaving compact text
            separated only by punctuation.
            '''
            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")  # remove blank lines and extra spaces
            content_seg = jieba.cut(content)  # segment the file content
            savefile(seg_dir + file_path, " ".join(content_seg))  # save the result to the segmented-corpus directory

    print "Chinese corpus segmentation finished!!!"

if __name__ == "__main__":
    # Segment the training set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"  # unsegmented corpus path
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path, output of this program
    corpus_segment(corpus_path, seg_path)

    # Segment the test set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"  # unsegmented corpus path
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path, output of this program
    corpus_segment(corpus_path, seg_path)
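To get a feel for what jieba.cut returns before running the whole pipeline, here is a minimal sketch (the sample sentence is made up; any Chinese text works):

# -*- coding: UTF-8 -*-
import jieba

sample = u"朴素贝叶斯针对小数据集中文文本分类预测"  # hypothetical sample sentence
# jieba.cut returns a generator of tokens; joining them with spaces gives
# exactly the format 1.py writes into the *_corpus_seg folders
print(" ".join(jieba.cut(sample)))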


2.py(corpus2Bunch.py)


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: corpus2Bunch.py
@time: 2017/2/7 7:41
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os  # built-in module for file and directory operations; we will use os.listdir
import cPickle as pickle  # import cPickle under the alias pickle
'''
Python also ships a module literally named pickle; reusing the name here
does no harm. For cPickle vs. pickle, see the author's other post:
http://blog.csdn.net/github_36326955/article/details/54882506
The code below uses cPickle's dump function.
'''
from sklearn.datasets.base import Bunch
# No need to study this deeply for now; just remember that importing the
# Bunch data structure works like this. Later posts cover sklearn in detail.

def _readfile(path):
    '''Read a file.'''
    # The leading underscore marks this as a private function by convention
    # only; it can still be called from outside. It merely improves readability.
    with open(path, "rb") as fp:  # the with-as syntax was introduced earlier; no further comments on it
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # subdirectories of seg_path, i.e. the category names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method that extends the original list
    with a new list (addlist).
    '''
    # Walk all files under every directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # category subdirectory path
        file_list = os.listdir(class_path)  # all files under class_path
        for file_path in file_list:  # iterate over the category's files
            fullname = class_path + file_path  # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
            '''
            append(element) is a Python list method that appends a single
            element to the list; note the difference from extend().
            '''
    # Persist the bunch to wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Building the text objects finished!!!"

if __name__ == "__main__":
    # Bunch-ify the training set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # where the Bunch is stored, program output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path, program input
    corpus2Bunch(wordbag_path, seg_path)

    # Bunch-ify the test set:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # where the Bunch is stored, program output
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path, program input
    corpus2Bunch(wordbag_path, seg_path)
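If you are curious what Bunch actually is: it is just a dict subclass whose keys are also readable as attributes, which is why the script can write bunch.label and bunch["label"] interchangeably. A minimal sketch:

from sklearn.datasets.base import Bunch

b = Bunch(target_name=[], label=[])
b.label.append("art")  # attribute access...
print(b["label"])      # ...and key access see the same list: ['art']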


3.py(TFIDF_space.py)


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: TFIDF_space.py
@time: 2017/2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # Test set: reuse the training vocabulary so both matrices share columns
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # Training set: learn the vocabulary from scratch
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    _writebunchobj(space_path, tfidfspace)
    print "The tf-idf vector space instance was created successfully!!!"

if __name__ == '__main__':
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input file
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # input file
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # output file
    vector_space(stopword_path, train_bunch_path, space_path)

    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # input file, produced above
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # input file
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"  # output file
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
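The important design choice in 3.py is the train_tfidf_path branch: the test set is vectorized with the training vocabulary, so both TF-IDF matrices share the same columns and the classifier in 4.py can consume them directly. A toy sketch of the same idea (the documents are made up):

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["stock market economy", "painting museum art"]  # hypothetical segmented docs
test_docs = ["market news economy"]

train_vec = TfidfVectorizer()
train_tdm = train_vec.fit_transform(train_docs)

# Reuse the training vocabulary so the test matrix has identical columns
test_vec = TfidfVectorizer(vocabulary=train_vec.vocabulary_)
test_tdm = test_vec.fit_transform(test_docs)

assert train_tdm.shape[1] == test_tdm.shape[1]  # same feature space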


4.py(NBayes_Predict.py)


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@qq.com
@file: NBayes_Predict.py
@time: 2017/2/8 12:21
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes

# Read a bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set
trainpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# Load the test set
testpath = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train the classifier on the bag-of-words vectors and labels.
# alpha is the additive (Laplace/Lidstone) smoothing parameter, not an
# iteration count; smaller alpha means less smoothing.
clf = MultinomialNB(alpha=0.01).fit(train_set.tdm, train_set.label)

# Predict the classification results
predicted = clf.predict(test_set.tdm)

for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual category:", flabel, " --> predicted category:", expct_cate

print "Prediction finished!!!"

# Compute the classification metrics:
from sklearn import metrics
def metrics_result(actual, predict):
    print 'precision:{0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall:{0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
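Since smaller alpha values smooth less (which can help on small datasets but risks overfitting), it may be worth tuning. A quick sketch that could be appended to 4.py, reusing the train_set and test_set loaded above:

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Try a few smoothing strengths and compare weighted F1 on the test set
for alpha in (1.0, 0.1, 0.01, 0.001):
    clf_a = MultinomialNB(alpha=alpha).fit(train_set.tdm, train_set.label)
    pred_a = clf_a.predict(test_set.tdm)
    print("alpha=%s f1=%.3f" % (alpha, metrics.f1_score(test_set.label, pred_a, average='weighted')))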


A few notes on usage:

1. Run the four scripts above in order.

2. The data must be laid out exactly as in the reposted blog: each folder name is a category name, which the code picks up automatically.

3. After each full run, delete the train_corpus_seg and test_corpus_seg folders before the next run; otherwise results left over from the previous run will contaminate the new prediction.

Likewise, delete these two folders whenever you switch to a different Chinese dataset. In short, the first step before running this code is to check that these two folders are empty (a cleanup sketch follows below). Of course, if this is your very first run, the folders do not exist yet, so there is nothing to check.
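A small sketch that automates this check (the project root below is the same one hard-coded in the scripts; adjust it to your own layout):

import os
import shutil

project_root = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/"
for folder in ("train_corpus_seg", "test_corpus_seg"):
    path = os.path.join(project_root, folder)
    if os.path.exists(path):
        shutil.rmtree(path)  # drop stale segmentation output before re-running 1.py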

A further strength of the original post is that even on a small dataset (fewer than 1,000 samples, 10-fold cross-validation), prediction accuracy can reach 60%-70%.
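The scripts themselves only do a single train/test split; if you want a 10-fold cross-validation figure like the one mentioned above, one way is to cross-validate on the training TF-IDF matrix. A sketch, assuming scikit-learn >= 0.18 (for model_selection) and that every category has at least 10 documents; train_set is the bunch loaded in 4.py:

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# 10-fold cross-validation on the training matrix, scored by weighted F1
scores = cross_val_score(MultinomialNB(alpha=0.01), train_set.tdm, train_set.label,
                         cv=10, scoring='f1_weighted')
print("mean f1 over 10 folds: %.3f" % scores.mean())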


Input/output relationships between the programs

(The original diagram is not reproduced here. In short: 1.py writes the segmented corpora train_corpus_seg/ and test_corpus_seg/; 2.py packs them into train_set.dat and test_set.dat; 3.py converts those into tfidfspace.dat and testspace.dat; 4.py reads both to train the classifier and predict.)