當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

朴素贝叶斯--实战分析

發布時間：2024/9/20 编程问答 36 豆豆

生活随笔收集整理的這篇文章主要介紹了朴素贝叶斯--实战分析小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

概率論及貝葉斯決策理論的一些知識請參閱相關書籍和博客
https://blog.csdn.net/amds123/article/details/70173402

這里給出源碼及解析。

1. 使用python進行文本分類

# -*- coding: utf-8 -*- """ Created on Mon Aug 14 21:40:38 2017@author: LiLong """ from numpy import *# 創建實驗樣本 def loadDataSet():# 詞條集合postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],['stop', 'posting', 'stupid', 'worthless', 'garbage'],['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]# 類別標簽集合，這里是人工標注的classVec = [0,1,0,1,0,1] return postingList,classVecdef createVocabList(dataSet): #vocabSet = set([]) # 創建一個空集，set()確保元素的唯一性for document in dataSet:vocabSet = vocabSet | set(document) # 兩個集合的并集，既是添加新詞集合print 'vocabSet',vocabSet #得到的是一個集合return list(vocabSet) # 得到的是一個列表，在此需要轉換為列表def setOfWords2Vec(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量，并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = 1 # 出現設置為1，為詞集模型else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 樸素貝葉斯訓練函數 def trainNB0(trainMatrix,trainCategory): #trainCategory每篇文檔類別標簽所構成的向量numTrainDocs = len(trainMatrix) #訓練文檔的數目numWords = len(trainMatrix[0]) # 每篇文檔的詞向量pAbusive = sum(trainCategory)/float(numTrainDocs) # 侮辱性文檔的頻率p0Num = ones(numWords); p1Num = ones(numWords) # 初始化：設為1和2為了消除概率為0的影響p0Denom = 2.0; p1Denom = 2.0 for i in range(numTrainDocs):if trainCategory[i] == 1: # 如果該文檔相應的標簽是1p1Num += trainMatrix[i] #兩向量相加，侮辱性的詞語個數累加p1Denom += sum(trainMatrix[i]) #同一個向量的元素相加，得到標簽1的侮辱詞總個數else:p0Num += trainMatrix[i]p0Denom += sum(trainMatrix[i])p1Vect = log(p1Num/p1Denom) # 為了避免下溢,同時也是為分類時的運算做準備p0Vect = log(p0Num/p0Denom) return p0Vect,p1Vect,pAbusive# 樸素貝葉斯分類函數 # vec2Classify是要分類的向量 def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):# p(Ci|W)<=>p(W|Ci)p(Ci)--->log(p(Ci|W))<=>log(p(W|Ci))+log(p(Ci))#sum()列表對應元素相乘，再相加(有點類似求期望)p1 = sum(vec2Classify * p1Vec) + log(pClass1) p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)if p1 > p0:return 1else: return 0def testingNB():listOPosts,listClasses = loadDataSet() # 載入文檔和標簽myVocabList = createVocabList(listOPosts) # 得到詞匯表，即文檔中不重復的詞列表trainMat=[]for postinDoc in listOPosts: # 得到所有詞條的詞向量trainMat.append(setOfWords2Vec(myVocabList, postinDoc))# 得到整篇文檔的侮辱性詞條向量的概率以及兩個類別的概率p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))testEntry = ['love', 'my', 'dalmation']thisDoc = array(setOfWords2Vec(myVocabList, testEntry))print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)testEntry = ['stupid', 'garbage']thisDoc = array(setOfWords2Vec(myVocabList, testEntry))print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)# 函數入口 testingNB()

運行結果：

runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') vocabSet set(['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']) ['love', 'my', 'dalmation'] classified as: 0 ['stupid', 'garbage'] classified as: 1

這里需要注意幾點：

假設所有的詞都相互獨立，使用了條件獨立性假設
將每個詞出現與否作為一個特征，即詞集模型
p1 = sum(vec2Classify * p1Vec) + log(pClass1)這句有點類似求期望

2. 使用樸素貝葉斯過濾垃圾郵件

# -*- coding: utf-8 -*- """ Created on Mon Aug 14 21:40:38 2017@author: LiLong """ from numpy import * #import feedparserdef createVocabList(dataSet): vocabSet = set([]) # 創建一個空集，set()確保元素的唯一性for document in dataSet: # dataset形如[[],[],[],.....]vocabSet = vocabSet | set(document) # 兩個集合的并集，既是添加新詞集合#print 'vocabSet',vocabSet # 得到的是一個集合return list(vocabSet) # 得到的是一個列表，在此需要轉換為列表# 詞集模型 def setOfWords2Vec(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量，并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = 1 # 出現設置為1，為詞集模型else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 詞袋模型 def bagOfWords2VecMN(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量，并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = +1 # 出現就加一else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 樸素貝葉斯訓練函數 def trainNB0(trainMatrix,trainCategory): #trainCategory每篇文檔類別標簽所構成的向量numTrainDocs = len(trainMatrix) #訓練文檔的數目numWords = len(trainMatrix[0]) # 每篇文檔的詞向量pAbusive = sum(trainCategory)/float(numTrainDocs) # 侮辱性文檔的頻率p0Num = ones(numWords); p1Num = ones(numWords) # 初始化：設為1和2為了消除概率為0的影響p0Denom = 2.0; p1Denom = 2.0 for i in range(numTrainDocs): if trainCategory[i] == 1: # 如果該文檔相應的標簽是1，計算p(w|1)p1Num += trainMatrix[i] #兩向量相加，侮辱性的詞語個數累加p1Denom += sum(trainMatrix[i]) #同一個向量的元素相加，得到標簽1的侮辱詞總個數else: # 計算p(w|0)p0Num += trainMatrix[i]p0Denom += sum(trainMatrix[i])p1Vect = log(p1Num/p1Denom) # 為了避免下溢,同時也是為分類時的運算做準備p0Vect = log(p0Num/p0Denom) return p0Vect,p1Vect,pAbusive# 樸素貝葉斯分類函數 # vec2Classify是要分類的向量 def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):# p(Ci|W)<=>p(W|Ci)p(Ci)--->log(p(Ci|W))<=>log(p(W|Ci))+log(p(Ci))#sum()列表對應元素相乘，再相加(有點類似求期望)p1 = sum(vec2Classify * p1Vec) + log(pClass1) p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)if p1 > p0:return 1else: return 0# 測試函數，封裝了所有操作 def testingNB():listOPosts,listClasses = loadDataSet() # 載入文檔和標簽myVocabList = createVocabList(listOPosts) # 得到詞匯表，即文檔中不重復的詞列表trainMat=[]for postinDoc in listOPosts: # 得到所有詞條的詞向量trainMat.append(setOfWords2Vec(myVocabList, postinDoc))# 得到整篇文檔的侮辱性詞條向量的概率以及兩個類別的概率p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #在此必須轉換為numpy的array()testEntry = ['love', 'my', 'dalmation']thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) # 只要是數組，就必須array()print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)testEntry = ['stupid', 'garbage']thisDoc = array(setOfWords2Vec(myVocabList, testEntry))print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)# 文件解析 def textParse(bigString): import relistOfTokens = re.split(r'\W*', bigString) # 切分文本# 去掉少于兩個字符的字符串return [tok.lower() for tok in listOfTokens if len(tok) > 2]# 垃圾郵件測試函數 def spamTest():docList=[]; classList = []; fullText =[]for i in range(1,26):# 一種重要的讀取路徑下的文件的有效方法,打開文件并讀取文件內容wordList = textParse(open('spam/%d.txt' % i).read()) docList.append(wordList) # 添加形成[[],[]...]fullText.extend(wordList) # 添加形成[.....]classList.append(1) # 類別1# 讀取另一個文件wordList = textParse(open('ham/%d.txt' % i).read())docList.append(wordList)fullText.extend(wordList)classList.append(0) #類別0#print 'classList:',classList# 創建詞匯表(doclist存儲所有的類別，50個),得到不重復的所有字符串的列表vocabList = createVocabList(docList) #print 'vocabList:',vocabListtrainingSet = range(50); testSet=[] for i in range(10): # 隨機構建訓練集randIndex = int(random.uniform(0,len(trainingSet))) # 0到50的一個隨機整數testSet.append(trainingSet[randIndex])del(trainingSet[randIndex]) print 'testSet:',testSet # 其中10個被選為測試集print 'trainingSet:',trainingSet # 剩下的40個為訓練集# 構建訓練集詞條向量 trainMat=[]; trainClasses = []for docIndex in trainingSet:trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) #trainMat是數組trainClasses.append(classList[docIndex]) # 相應的類別# 樸素貝葉斯訓練函數p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))errorCount = 0# 測試for docIndex in testSet: wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:errorCount += 1print "classification error",docList[docIndex] # 輸出相應的判斷錯誤的詞表print 'the error rate is:',float(errorCount)/len(testSet)#return vocabList,fullText# 函數入口 #testingNB() spamTest()

結果：

runfile('C:/Users/LiLong/Desktop/Bayesian/debug.py', wdir='C:/Users/LiLong/Desktop/Bayesian') testSet: [34, 23, 8, 10, 40, 13, 21, 14, 2, 20] trainingSet: [0, 1, 3, 4, 5, 6, 7, 9, 11, 12, 15, 16, 17, 18, 19, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49] the error rate is: 0.0runfile('C:/Users/LiLong/Desktop/Bayesian/debug.py', wdir='C:/Users/LiLong/Desktop/Bayesian') testSet: [31, 15, 23, 8, 12, 27, 10, 3, 13, 1] trainingSet: [0, 2, 4, 5, 6, 7, 9, 11, 14, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] classification error ['oem', 'adobe', 'microsoft', 'softwares', 'fast', 'order', 'and', 'download', 'microsoft', 'office', 'professional', 'plus', '2007', '2010', '129', 'microsoft', 'windows', 'ultimate', '119', 'adobe', 'photoshop', 'cs5', 'extended', 'adobe', 'acrobat', 'pro', 'extended', 'windows', 'professional', 'thousand', 'more', 'titles'] the error rate is: 0.1

結果是兩次的運行效果，因為電子有郵件是隨機選擇的，所以每次的輸出結果可能有些差別，也可以重復多次，然后求平均值，降低錯誤率。

注意：

這里用到的是詞袋模型
數據選擇用的是留存交叉驗證

其他：

>>> [1,0,1]+[0,0,0] [1, 0, 1, 0, 0, 0] >>> t= array([1,0,1]) >>> m=array([0,0,1]) >>> t+m array([1, 0, 2]) >>> t= array([1,0,2]) >>> t*t array([1, 0, 4]) >>> t= array([[1,0,2],[1,0,2]]) >>> m=array([2,0,2]) >>> t*m array([[2, 0, 4],[2, 0, 4]])

3. 使用樸素貝葉斯從個人廣告中獲取區域傾向

我用的是spyder，自身沒有帶feedparser，所以首先安裝feedparser：

下載安裝包：feedparser-5.2.1
基于spyder平臺安裝：打開spyder后，tools–>open command prompt,打開控制臺后，cd進入下載包的位置，運行python setup.py install。然后在cmd下輸入pip list 查看已安裝的包，如果是比較老的版本用pipi freeze。

# -*- coding: utf-8 -*- """ Created on Mon Aug 14 21:40:38 2017@author: LiLong """ from numpy import * #import feedparser# 創建實驗樣本 def loadDataSet():# 詞條集合postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],['stop', 'posting', 'stupid', 'worthless', 'garbage'],['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]# 類別標簽集合，這里是人工標注的classVec = [0,1,0,1,0,1] return postingList,classVecdef createVocabList(dataSet): #vocabSet = set([]) # 創建一個空集，set()確保元素的唯一性for document in dataSet: # dataset形如[[],[],[],.....]vocabSet = vocabSet | set(document) # 兩個集合的并集，既是添加新詞集合#print 'vocabSet',vocabSet # 得到的是一個集合return list(vocabSet) # 得到的是一個列表，在此需要轉換為列表# 詞集模型 def setOfWords2Vec(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量，并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = 1 # 出現設置為1，為詞集模型else: print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 詞袋模型 def bagOfWords2VecMN(vocabList, inputSet): # 輸入的詞組轉換為向量returnVec = [0]*len(vocabList) # 創建一個列表向量，并且和詞匯表等長for word in inputSet:if word in vocabList: # 判斷單詞是否在詞匯表中if...in....returnVec[vocabList.index(word)] = +1 # 出現就加一else: pass#print "the word: %s is not in my Vocabulary!" % wordreturn returnVec # 返回輸入文本的詞向量,每個都是等長的# 樸素貝葉斯訓練函數 def trainNB0(trainMatrix,trainCategory): #trainCategory每篇文檔類別標簽所構成的向量numTrainDocs = len(trainMatrix) #訓練文檔的數目numWords = len(trainMatrix[0]) # 每篇文檔的詞向量pAbusive = sum(trainCategory)/float(numTrainDocs) # 侮辱性文檔的頻率p0Num = ones(numWords); p1Num = ones(numWords) # 初始化：設為1和2為了消除概率為0的影響p0Denom = 2.0; p1Denom = 2.0 for i in range(numTrainDocs): if trainCategory[i] == 1: # 如果該文檔相應的標簽是1，計算p(w|1)p1Num += trainMatrix[i] #兩向量相加，侮辱性的詞語個數累加p1Denom += sum(trainMatrix[i]) #同一個向量的元素相加，得到標簽1的侮辱詞總個數else: # 計算p(w|0)p0Num += trainMatrix[i]p0Denom += sum(trainMatrix[i])p1Vect = log(p1Num/p1Denom) # 為了避免下溢,同時也是為分類時的運算做準備p0Vect = log(p0Num/p0Denom) return p0Vect,p1Vect,pAbusive# 樸素貝葉斯分類函數 # vec2Classify是要分類的向量 def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):# p(Ci|W)<=>p(W|Ci)p(Ci)--->log(p(Ci|W))<=>log(p(W|Ci))+log(p(Ci))#sum()列表對應元素相乘，再相加(有點類似求期望)p1 = sum(vec2Classify * p1Vec) + log(pClass1) p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)if p1 > p0:return 1else: return 0# 測試函數，封裝了所有操作 def testingNB():listOPosts,listClasses = loadDataSet() # 載入文檔和標簽myVocabList = createVocabList(listOPosts) # 得到詞匯表，即文檔中不重復的詞列表trainMat=[]for postinDoc in listOPosts: # 得到所有詞條的詞向量trainMat.append(setOfWords2Vec(myVocabList, postinDoc))# 得到整篇文檔的侮辱性詞條向量的概率以及兩個類別的概率p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #在此必須轉換為numpy的array()testEntry = ['love', 'my', 'dalmation']thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) # 只要是數組，就必須array()print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)testEntry = ['stupid', 'garbage']thisDoc = array(setOfWords2Vec(myVocabList, testEntry))print testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb)# 文件解析 def textParse(bigString): import relistOfTokens = re.split(r'\W*', bigString) # 切分文本# 去掉少于兩個字符的字符串return [tok.lower() for tok in listOfTokens if len(tok) > 2]# 垃圾郵件測試函數 def spamTest():docList=[]; classList = []; fullText =[]for i in range(1,26):# 一種重要的讀取路徑下的文件的有效方法,打開文件并讀取文件內容wordList = textParse(open('spam/%d.txt' % i).read()) docList.append(wordList) # 添加形成[[],[]...]fullText.extend(wordList) # 添加形成[.....]classList.append(1) # 類別1# 讀取另一個文件wordList = textParse(open('ham/%d.txt' % i).read())docList.append(wordList)fullText.extend(wordList)classList.append(0) #類別0#print 'classList:',classList# 創建詞匯表(doclist存儲所有的類別，50個),得到不重復的所有字符串的列表vocabList = createVocabList(docList) #print 'vocabList:',vocabListtrainingSet = range(50); testSet=[] for i in range(10): # 隨機構建訓練集randIndex = int(random.uniform(0,len(trainingSet))) # 0到50的一個隨機整數testSet.append(trainingSet[randIndex])del(trainingSet[randIndex]) print 'testSet:',testSet # 其中10個被選為測試集print 'trainingSet:',trainingSet # 剩下的40個為訓練集# 構建訓練集詞條向量 trainMat=[]; trainClasses = []for docIndex in trainingSet:trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex])) #trainMat是數組trainClasses.append(classList[docIndex]) # 相應的類別# 樸素貝葉斯訓練函數p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))errorCount = 0# 測試for docIndex in testSet: wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:errorCount += 1print "classification error",docList[docIndex] # 輸出相應的判斷錯誤的詞表print 'the error rate is:',float(errorCount)/len(testSet)#return vocabList,fullText# 計算出現頻率 def calcMostFreq(vocabList,fullText):import operatorfreqDict = {}for token in vocabList:freqDict[token]=fullText.count(token) # token在列表fullText中的數量sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True) return sortedFreq[:30] # 類似spamTest()的功能 def localWords(feed1,feed0):import feedparserdocList=[]; classList = []; fullText =[]minLen = min(len(feed1['entries']),len(feed0['entries'])) # entris是一個listprint 'minLen:',minLenfor i in range(minLen):wordList = textParse(feed1['entries'][i]['summary'])docList.append(wordList)fullText.extend(wordList)classList.append(1) #NY is class 1wordList = textParse(feed0['entries'][i]['summary'])docList.append(wordList)fullText.extend(wordList)classList.append(0)vocabList = createVocabList(docList) #create vocabulary# top30Words是[(u'and', 91),.....]的形式top30Words = calcMostFreq(vocabList,fullText) print 'top30Words:',top30Wordsfor pairW in top30Words:if pairW[0] in vocabList: vocabList.remove(pairW[0]) # 移除排序最高的30個單詞trainingSet = range(2*minLen); testSet=[] # 建立測試集for i in range(20):randIndex = int(random.uniform(0,len(trainingSet)))testSet.append(trainingSet[randIndex])del(trainingSet[randIndex]) trainMat=[]; trainClasses = []for docIndex in trainingSet:trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))trainClasses.append(classList[docIndex])p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))errorCount = 0for docIndex in testSet: wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:errorCount += 1print 'the error rate is: ',float(errorCount)/len(testSet)return vocabList,p0V,p1V# 函數入口 #testingNB() #spamTest() ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss') # 必須函數外導入 sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss') vocabList,psF,pNY=localWords(ny,sf)

結果：

runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.45runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.35runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.15runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 top30Words: [(u'and', 90), (u'you', 54), (u'for', 51), (u'indian', 35), (u'looking', 32), (u'who', 32), (u'the', 29), (u'with', 28), (u'have', 25), (u'can', 21), (u'male', 19), (u'female', 17), (u'your', 17), (u'that', 14), (u'not', 13), (u'just', 13), (u'like', 13), (u'here', 11), (u'out', 11), (u'are', 11), (u'good', 10), (u'married', 10), (u'but', 10), (u'single', 10), (u'area', 10), (u'woman', 9), (u'want', 9), (u'friend', 9), (u'bay', 9), (u'about', 9)] the error rate is: 0.15

如果注釋掉用于移除高頻詞的那幾行代碼，會發現錯誤率有所改變，由此可以看出最具表征性的詞在詞匯表中的重要性，也即是特征的重要性。。

runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 the error rate is: 0.3runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 660 the error rate is: 0.35runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 660 the error rate is: 0.3

同時錯誤率要遠高于垃圾郵件的錯誤率，由于這里關注的是單詞概率而不是實際分類，此問題不是很嚴重

4. 最具表征性的詞匯顯示函數

# 最具表征性的詞匯顯示函數 def getTopWords(ny,sf):import operatorvocabList,p0V,p1V=localWords(ny,sf)topNY=[]; topSF=[]for i in range(len(p0V)):if p0V[i] > -4.0 : topSF.append((vocabList[i],p0V[i])) #設定閾值if p1V[i] > -4.0 : topNY.append((vocabList[i],p1V[i]))sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)#print 'sortedSF:',sortedSFprint "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"for item in sortedSF:print item[0]sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"for item in sortedNY:print item[0]

結果：

runfile('C:/Users/LiLong/Desktop/Bayesian/bayesian.py', wdir='C:/Users/LiLong/Desktop/Bayesian') minLen: 25 695 the error rate is: 0.45 SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF** and for the NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY** and for have the you

在此處我把閾值改為了-0.4，可以看出滿足的詞匯就比較少，但是更最具表征性的詞匯。。。

附：列表存儲元組的用法

tt=[('and', 91), ('for', 60)]tt[0] Out[30]: ('and', 91)for i in tt:print i[0]and for

總結

以上是生活随笔為你收集整理的朴素贝叶斯--实战分析的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：决策树—ID3（源码解析）
下一篇：矩阵求导公式，及MathJax公式编辑