决策树(Decision Tree)的 Python 实现
from math import log
import operator


def createDataSet():
    """Return a toy dataset and its feature labels.

    Each row is [no-surfacing?, has-flippers?, class-label]; the last
    column is the class label ('yes' / 'no').
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # feature names, in column order
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in dataSet.

    The class label is taken from the last element of each row.
    Returns 0.0 for a single-class dataset.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]          # last field is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)   # H = -sum(p * log2(p))
    return shannonEnt
計算所有屬性值得信息增益,并計算最佳劃分方法
傳入的參數(shù),dataSet是樣本矩陣,axis是第axis的位置,value是該位置的值
例:tree.splitDataSet(dataSet,1,1):求dataSet中第2個位置是1的數(shù)組
’‘’
def splitDataSet(dataSet, axis, value): retDataSet = []for featVec in dataSet: #遍歷樣本矩陣if featVec[axis] == value: #如果該行中axis位置的值為valuereducedFeatVec = featVec[:axis] #從第0個位置開始截取到axis位置reducedFeatVec.extend(featVec[axis+1:]) #再從第axis+1的位置截取該行,添加屬性到之前的屬性后面retDataSet.append(reducedFeatVec) #一個個的添加reducedFeatVec數(shù)組return retDataSetdef chooseBestFeatureToSplit(dataSet):numFeatures = len(dataSet[0]) - 1 #因為最后一行是存的類標簽,這里是屬性數(shù)baseEntropy = calcShannonEnt(dataSet) #原始熵bestInfoGain = 0.0; bestFeature = -1for i in range(numFeatures): #遍歷除了類標簽的樣本矩陣featList = [example[i] for example in dataSet] #example[0]=[1,1,1,0,0],example[1]=[1,1,0,1,1]uniqueVals = set(featList) #set集合有去重作用,所以uniqueVals=[0,1]newEntropy = 0.0for value in uniqueVals: #遍歷uniqueVals數(shù)組subDataSet = splitDataSet(dataSet, i, value) #劃分數(shù)據(jù)集prob = len(subDataSet)/float(len(dataSet))newEntropy += prob * calcShannonEnt(subDataSet) #計算劃分后的熵infoGain = baseEntropy - newEntropy #計算信息增益if (infoGain > bestInfoGain): #比較當前最大的信息增益bestInfoGain = infoGain #始終選擇最大值bestFeature = i #返回最大值下的劃分屬性return bestFeature #返回信息增益最大的劃分屬性‘’‘
我們已經(jīng)做到了尋找劃分數(shù)據(jù)集的最佳屬性,接下來就是遞歸調用,不斷劃分下去。劃分到什么時候結束呢?這里有兩個依據(jù),第一,劃分后的某個數(shù)據(jù)集中所有數(shù)據(jù)都同屬于一類,這個時候就沒必要再劃分了,再者,由于這里所講的決策樹是消耗屬性的,所以當所有屬性都用完了,劃分也就停止了。如果所有屬性都用完了,某堆數(shù)據(jù)集中的數(shù)據(jù)仍不統(tǒng)一,解決方法就是少數(shù)服從多數(shù)
’‘’
def majorityCnt(classList): #少數(shù)服從多數(shù)classCount={}for vote in classList:if vote not in classCount.keys(): classCount[vote] = 0classCount[vote] += 1 #該類標簽下數(shù)據(jù)個數(shù)+1sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True) #對標簽數(shù)據(jù)個數(shù)降序排序return sortedClassCount[0][0] #返回數(shù)據(jù)表最大的那個類標簽def createTree(dataSet,labels): #相當于主函數(shù)classList = [example[-1] for example in dataSet] #保存類標簽數(shù)組,classList=[‘yes’,’yes’,’no’,’no’,’no’]if classList.count(classList[0]) == len(classList): #如果classList全為‘yes’,說明所有數(shù)據(jù)同屬一類return classList[0] #所以停止劃分if len(dataSet[0]) == 1: #如果dataSet第一維的長度為1return majorityCnt(classList) #執(zhí)行少數(shù)服從多數(shù)程序bestFeat = chooseBestFeatureToSplit(dataSet) #找到信息增益最大的屬性bestFeatLabel = labels[bestFeat] #找到該屬性的類標簽myTree = {bestFeatLabel:{}} #以多級字典的形式展示樹,類似多層json結構del(labels[bestFeat]) #在labels數(shù)組中刪除用來劃分的類標簽featValues = [example[bestFeat] for example in dataSet] #把dataSet矩陣中屬于用來劃分的類標簽的屬性保存咋featValues數(shù)組中uniqueVals = set(featValues) #去掉數(shù)組中重復的值for value in uniqueVals:subLabels = labels[:] #拷貝數(shù)組labels,使其不會丟失掉它的屬性myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels) #循環(huán)craeteTree函數(shù)return myTree #返回樹 def classify(inputTree,featLabels,testVec):firstStr = inputTree.keys()[0]secondDict = inputTree[firstStr]featIndex = featLabels.index(firstStr)key = testVec[featIndex]valueOfFeat = secondDict[key]if isinstance(valueOfFeat, dict): classLabel = classify(valueOfFeat, featLabels, testVec)else: classLabel = valueOfFeatreturn classLabeldef storeTree(inputTree,filename):import picklefw = open(filename,'w')pickle.dump(inputTree,fw)fw.close()def grabTree(filename):import picklefr = open(filename)return pickle.load(fr)**
總結
- 上一篇: 作者:刘阳(1988-),男,军事医学科
- 下一篇: 使用命名空间解决名字冲突