朴素贝叶斯--实战分析
生活随笔
收集整理的這篇文章主要介紹了
朴素贝叶斯--实战分析
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
概率論及貝葉斯決策理論的一些知識請參閱相關書籍和博客
https://blog.csdn.net/amds123/article/details/70173402
這里給出源碼及解析。
1. 使用python進行文本分類
# -*- coding: utf-8 -*-
"""
Naive Bayes text classifier (set-of-words model) on a toy "abusive post" corpus.

Created on Mon Aug 14 21:40:38 2017

@author: LiLong
"""
from numpy import *


def loadDataSet():
    """Return the toy corpus (tokenized posts) and its hand-labelled classes."""
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Manually annotated labels: 1 = abusive, 0 = normal.
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Build the vocabulary: a list of every unique word across all documents."""
    # BUG FIX: this initialization was commented out in the original,
    # which raises NameError on the first union below.
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union adds any new words
    return list(vocabSet)  # callers index into it, so convert to a list


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 vector over vocabList (set-of-words model)."""
    returnVec = [0] * len(vocabList)  # same length as the vocabulary
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # presence flag, not a count
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Train naive Bayes.

    trainMatrix   -- array of per-document word vectors
    trainCategory -- array of per-document class labels (0/1)
    Returns (log p(w|c=0), log p(w|c=1), p(c=1)).
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior p(c=1)
    # Laplace-style smoothing (counts start at 1, denominators at 2)
    # so that no word ever gets probability zero.
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]          # per-word counts for class 1
            p1Denom += sum(trainMatrix[i])   # total words seen in class 1
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # log() avoids floating-point underflow and turns products into sums
    # for the classification step.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: argmax_c [ log p(w|c) + log p(c) ]."""
    # Element-wise product then sum == dot product of the word vector
    # with the per-word log-probabilities.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """End-to-end demo: train on the toy corpus, classify two test posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    # trainNB0 expects numpy arrays so the += vector arithmetic works.
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))


if __name__ == '__main__':  # FIX: guard the entry point instead of running on import
    testingNB()
runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') vocabSet set(['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']) ['love', 'my', 'dalmation'] classified as: 0 ['stupid', 'garbage'] classified as: 1這里需要注意幾點:
- 假設所有的詞都相互獨立,使用了條件獨立性假設
- 將每個詞出現與否作為一個特征,即詞集模型
- p1 = sum(vec2Classify * p1Vec) + log(pClass1)這句有點類似求期望
2. 使用樸素貝葉斯過濾垃圾郵件
# -*- coding: utf-8 -*- """ Created on Mon Aug 14 21:40:38 2017@author: LiLong """ from numpy import * #import feedparserdef createVocabList(dataSet): vocabSet = set([]) # 創建一個空集,set()確保元素的唯一性for document in dataSet: # dataset形如[[],[],[],.....]vocabSet = vocabSet | set(document) # 兩個集合的并集,既是添加新詞集合#print 'vocabSet',vocabSet # 得到的是一個集合return list(vocabSet) # 得到的是一個列表,在此需要轉換為列表# 詞集模型 def setOfWords2Vec(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量,并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = 1 # 出現設置為1,為詞集模型else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 詞袋模型 def bagOfWords2VecMN(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量,并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = +1 # 出現就加一else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 樸素貝葉斯訓練函數 def trainNB0(trainMatrix,trainCategory): #trainCategory每篇文檔類別標簽所構成的向量numTrainDocs = len(trainMatrix) #訓練文檔的數目numWords = len(trainMatrix[0]) # 每篇文檔的詞向量pAbusive = sum(trainCategory)/float(numTrainDocs) # 侮辱性文檔的頻率p0Num = ones(numWords); p1Num = ones(numWords) # 初始化:設為1和2為了消除概率為0的影響p0Denom = 2.0; p1Denom = 2.0 for i in range(numTrainDocs): if trainCategory[i] == 1: # 如果該文檔相應的標簽是1,計算p(w|1)p1Num += trainMatrix[i] #兩向量相加,侮辱性的詞語個數累加p1Denom += sum(trainMatrix[i]) #同一個向量的元素相加,得到標簽1的侮辱詞總個數else: # 計算p(w|0)p0Num += trainMatrix[i]p0Denom += sum(trainMatrix[i])p1Vect = log(p1Num/p1Denom) # 為了避免下溢,同時也是為分類時的運算做準備p0Vect = log(p0Num/p0Denom) return p0Vect,p1Vect,pAbusive# 樸素貝葉斯分類函數 # vec2Classify是要分類的向量 def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):# p(Ci|W)<=>p(W|Ci)p(Ci)--->log(p(Ci|W))<=>log(p(W|Ci))+log(p(Ci))#sum()列表對應元素相乘,再相加(有點類似求期望)p1 = sum(vec2Classify * p1Vec) + log(pClass1) p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)if p1 > p0:return 1else: return 
0# 測試函數,封裝了所有操作 def testingNB():listOPosts,listClasses = loadDataSet() # 載入文檔和標簽myVocabList = createVocabList(listOPosts) # 得到詞匯表,即文檔中不重復的詞列表trainMat=[]for postinDoc in listOPosts: # 得到所有詞條的詞向量trainMat.append(setOfWords2Vec(myVocabList, postinDoc))# 得到整篇文檔的侮辱性詞條向量的概率以及兩個類別的概率p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #在此必須轉換為numpy的array()testEntry = ['love', 'my', 'dalmation']thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) # 只要是數組,就必須array()print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)testEntry = ['stupid', 'garbage']thisDoc = array(setOfWords2Vec(myVocabList, testEntry))print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)# 文件解析 def textParse(bigString): import relistOfTokens = re.split(r'\W*', bigString) # 切分文本# 去掉少于兩個字符的字符串return [tok.lower() for tok in listOfTokens if len(tok) > 2]# 垃圾郵件測試函數 def spamTest():docList=[]; classList = []; fullText =[]for i in range(1,26):# 一種重要的讀取路徑下的文件的有效方法,打開文件并讀取文件內容wordList = textParse(open('spam/%d.txt' % i).read()) docList.append(wordList) # 添加形成[[],[]...]fullText.extend(wordList) # 添加形成[.....]classList.append(1) # 類別1# 讀取另一個文件wordList = textParse(open('ham/%d.txt' % i).read())docList.append(wordList)fullText.extend(wordList)classList.append(0) #類別0#print 'classList:',classList# 創建詞匯表(doclist存儲所有的類別,50個),得到不重復的所有字符串的列表vocabList = createVocabList(docList) #print 'vocabList:',vocabListtrainingSet = range(50); testSet=[] for i in range(10): # 隨機構建訓練集randIndex = int(random.uniform(0,len(trainingSet))) # 0到50的一個隨機整數testSet.append(trainingSet[randIndex])del(trainingSet[randIndex]) print 'testSet:',testSet # 其中10個被選為測試集print 'trainingSet:',trainingSet # 剩下的40個為訓練集# 構建訓練集詞條向量 trainMat=[]; trainClasses = []for docIndex in trainingSet:trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) #trainMat是數組trainClasses.append(classList[docIndex]) # 相應的類別# 樸素貝葉斯訓練函數p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))errorCount = 0# 測試for docIndex in testSet: wordVector = 
bagOfWords2VecMN(vocabList, docList[docIndex])if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:errorCount += 1print "classification error",docList[docIndex] # 輸出相應的判斷錯誤的詞表print 'the error rate is:',float(errorCount)/len(testSet)#return vocabList,fullText# 函數入口 #testingNB() spamTest()結果:
runfile('C:/Users/LiLong/Desktop/Bayesian/debug.py', wdir='C:/Users/LiLong/Desktop/Bayesian') testSet: [34, 23, 8, 10, 40, 13, 21, 14, 2, 20] trainingSet: [0, 1, 3, 4, 5, 6, 7, 9, 11, 12, 15, 16, 17, 18, 19, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49] the error rate is: 0.0runfile('C:/Users/LiLong/Desktop/Bayesian/debug.py', wdir='C:/Users/LiLong/Desktop/Bayesian') testSet: [31, 15, 23, 8, 12, 27, 10, 3, 13, 1] trainingSet: [0, 2, 4, 5, 6, 7, 9, 11, 14, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] classification error ['oem', 'adobe', 'microsoft', 'softwares', 'fast', 'order', 'and', 'download', 'microsoft', 'office', 'professional', 'plus', '2007', '2010', '129', 'microsoft', 'windows', 'ultimate', '119', 'adobe', 'photoshop', 'cs5', 'extended', 'adobe', 'acrobat', 'pro', 'extended', 'windows', 'professional', 'thousand', 'more', 'titles'] the error rate is: 0.1結果是兩次的運行效果,因為電子有郵件是隨機選擇的,所以每次的輸出結果可能有些差別,也可以重復多次,然后求平均值,降低錯誤率。
注意:
- 這里用到的是詞袋模型
數據選擇用的是留存交叉驗證
其他:
3. 使用樸素貝葉斯從個人廣告中獲取區域傾向
我用的是spyder,自身沒有帶feedparser,所以首先安裝feedparser:
下載安裝包 :feedparser-5.2.1
基于spyder平臺安裝:打開spyder後,tools–>open command prompt,打開控制台後,cd進入下載包的位置,運行python setup.py install。然後在cmd下 輸入pip list 查看已安裝的包,如果是比較老的版本用pip freeze。
結果:
runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.45runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.35runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.15runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), 
(u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.15如果注釋掉用于移除高頻詞的那幾行代碼,會發現錯誤率有所改變,由此可以看出最具表征性的詞在詞匯表中的重要性,也即是特征的重要性。。
runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 the error rate is: 0.3runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 660 the error rate is: 0.35runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 660 the error rate is: 0.3同時錯誤率要遠高于垃圾郵件的錯誤率,由于這里關注的是單詞概率而不是實際分類,此問題不是很嚴重
4. 最具表征性的詞匯顯示函數
# 最具表征性的詞匯顯示函數 def getTopWords(ny,sf):import operatorvocabList,p0V,p1V=localWords(ny,sf)topNY=[]; topSF=[]for i in range(len(p0V)):if p0V[i] > -4.0 : topSF.append((vocabList[i],p0V[i])) #設定閾值if p1V[i] > -4.0 : topNY.append((vocabList[i],p1V[i]))sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)#print 'sortedSF:',sortedSFprint "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"for item in sortedSF:print item[0]sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"for item in sortedNY:print item[0]結果:
runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 695 the error rate is: 0.45 SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF** and for the NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY** and for have the you在此處我把閾值改為了-0.4,可以看出滿足的詞匯就比較少,但是更最具表征性的詞匯。。。
附:列表存儲元組的用法
tt=[('and', 91), ('for', 60)]tt[0] Out[30]: ('and', 91)for i in tt:print i[0]and for總結
以上是生活随笔為你收集整理的朴素贝叶斯--实战分析的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 决策树—ID3(源码解析)
- 下一篇: 矩阵求导公式,及MathJax公式编辑