# C4.5 / ID3 decision tree — complete code (a self-practice Python implementation)
import numpy as np
class DecisionTree:
    """Decision tree classifier supporting the ID3 and C4.5 split criteria.

    Usage:
        - Create an instance: ``clf = DecisionTree(mode='C4.5')`` (mode is
          'C4.5' or 'ID3'; default 'C4.5').
        - Train: ``clf.fit(X, y)`` where X, y are np.ndarray (categorical
          feature values).
        - Predict: ``clf.predict(X)`` where X is an np.ndarray (one sample
          or a 2-D batch).
        - Visualize: ``clf.show()``.
    """

    def __init__(self, mode='C4.5'):
        # Fitted tree (nested dicts keyed by 'x<i>' feature names); None
        # until fit() succeeds.
        self._tree = None
        if mode in ('C4.5', 'ID3'):
            self._mode = mode
        else:
            raise Exception('mode should be C4.5 or ID3')

    def _calcEntropy(self, y):
        """Return the Shannon entropy (base 2) of the label vector ``y``."""
        num = y.shape[0]
        # Frequency of each distinct label.
        labelCounts = {}
        for label in y:
            labelCounts[label] = labelCounts.get(label, 0) + 1
        entropy = 0.0
        for count in labelCounts.values():
            prob = float(count) / num
            entropy -= prob * np.log2(prob)
        return entropy

    def _splitDataSet(self, X, y, index, value):
        """Return the subset of (X, y) whose feature ``index`` equals
        ``value``, with that feature column removed from X."""
        rows = [i for i in range(X.shape[0]) if X[i, index] == value]
        cols = [j for j in range(X.shape[1]) if j != index]
        return X[rows][:, cols], y[rows]

    def _chooseBestFeatureToSplit_ID3(self, X, y):
        """ID3 criterion: return the index of the feature with the largest
        information gain (entropy before split minus weighted entropy after).

        Returns -1 when no feature gives a strictly positive gain.
        """
        numFeatures = X.shape[1]
        oldEntropy = self._calcEntropy(y)
        bestInfoGain = 0.0
        bestFeatureIndex = -1
        for i in range(numFeatures):
            # Weighted entropy of the partition induced by feature i.
            newEntropy = 0.0
            for value in set(X[:, i]):
                sub_X, sub_y = self._splitDataSet(X, y, i, value)
                prob = len(sub_y) / float(len(y))
                newEntropy += prob * self._calcEntropy(sub_y)
            infoGain = oldEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeatureIndex = i
        return bestFeatureIndex

    def _chooseBestFeatureToSplit_C45(self, X, y):
        """C4.5 criterion: like ID3 but maximizes the gain *ratio*
        (information gain divided by the split information), which
        penalizes features with many distinct values.

        Returns -1 when no feature gives a strictly positive gain ratio.
        """
        numFeatures = X.shape[1]
        oldEntropy = self._calcEntropy(y)
        bestGainRatio = 0.0
        bestFeatureIndex = -1
        for i in range(numFeatures):
            newEntropy = 0.0
            splitInformation = 0.0
            for value in set(X[:, i]):
                sub_X, sub_y = self._splitDataSet(X, y, i, value)
                prob = len(sub_y) / float(len(y))
                newEntropy += prob * self._calcEntropy(sub_y)
                splitInformation -= prob * np.log2(prob)
            # A single-valued feature has splitInformation == 0; skip it to
            # avoid division by zero (same as the classic C4.5 treatment).
            if splitInformation == 0.0:
                continue
            gainRatio = (oldEntropy - newEntropy) / splitInformation
            if gainRatio > bestGainRatio:
                bestGainRatio = gainRatio
                bestFeatureIndex = i
        return bestFeatureIndex

    def _majorityCnt(self, labelList):
        """Return the most frequent label in ``labelList`` (first seen wins
        ties)."""
        labelCount = {}
        for vote in labelList:
            labelCount[vote] = labelCount.get(vote, 0) + 1
        # Python 3 fix: dict.iteritems() no longer exists; max() over
        # items() replaces the old sort-then-take-first idiom.
        return max(labelCount.items(), key=lambda kv: kv[1])[0]

    def _createTree(self, X, y, featureIndex):
        """Recursively build the decision tree as nested dicts.

        ``featureIndex`` is a tuple of 'x<i>' names mapping the columns of
        the (shrinking) X back to the original feature indices.
        """
        labelList = list(y)
        # All labels identical: return that label as a leaf.
        if labelList.count(labelList[0]) == len(labelList):
            return labelList[0]
        # No features left to split on: return the majority label.
        if len(featureIndex) == 0:
            return self._majorityCnt(labelList)
        # Pick the best split feature according to the configured criterion.
        if self._mode == 'C4.5':
            bestFeatIndex = self._chooseBestFeatureToSplit_C45(X, y)
        else:  # 'ID3' (only other value __init__ allows)
            bestFeatIndex = self._chooseBestFeatureToSplit_ID3(X, y)
        # NOTE(review): when no feature has positive gain the chooser
        # returns -1 and the last feature is used; confirm this fallback
        # is the intended behavior.
        bestFeatStr = featureIndex[bestFeatIndex]
        featureIndex = tuple(f for f in featureIndex if f != bestFeatStr)
        # The tree is a dict: best feature name -> {feature value -> subtree}.
        myTree = {bestFeatStr: {}}
        for value in set(X[:, bestFeatIndex]):
            sub_X, sub_y = self._splitDataSet(X, y, bestFeatIndex, value)
            myTree[bestFeatStr][value] = self._createTree(sub_X, sub_y,
                                                          featureIndex)
        return myTree

    def fit(self, X, y):
        """Build the tree from training data X (2-D) and labels y (1-D).

        Returns self to allow chaining: ``clf.fit(X, y).predict(X)``.
        Raises TypeError if X, y cannot be converted to np.ndarray.
        """
        if not (isinstance(X, np.ndarray) and isinstance(y, np.ndarray)):
            try:
                X = np.array(X)
                y = np.array(y)
            except Exception:
                raise TypeError("numpy.ndarray required for X,y")
        # Name features by their original column index so splits can be
        # mapped back to sample positions at prediction time.
        featureIndex = tuple('x' + str(i) for i in range(X.shape[1]))
        self._tree = self._createTree(X, y, featureIndex)
        return self  # allow chaining: clf.fit().predict()

    def predict(self, X):
        """Predict labels for one sample (1-D X) or a batch (2-D X).

        Raises NotFittedError if fit() has not been called.
        NOTE(review): a feature value never seen in training raises
        KeyError during tree walk — confirm whether callers rely on that.
        """
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, call `fit` first")
        if not isinstance(X, np.ndarray):
            try:
                X = np.array(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        def _classify(tree, sample):
            # Walk the nested-dict tree recursively for one sample.
            featIndex = next(iter(tree))  # Py3 fix: dict.keys()[0] invalid
            secondDict = tree[featIndex]
            # 'x<i>' -> original column i of the sample.
            key = sample[int(featIndex[1:])]
            valueOfKey = secondDict[key]
            if isinstance(valueOfKey, dict):
                return _classify(valueOfKey, sample)
            return valueOfKey

        if len(X.shape) == 1:
            return _classify(self._tree, X)
        return np.array([_classify(self._tree, X[i])
                         for i in range(X.shape[0])])

    def show(self):
        """Plot the fitted tree (requires the project's treePlotter module)."""
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, call `fit` first")
        # Plot the tree using matplotlib (via the local helper module).
        import treePlotter
        treePlotter.createPlot(self._tree)
class NotFittedError(Exception):
    """Raised when an estimator is used before ``fit`` has been called."""
# 總結
# 以上是為你收集整理的 python C4.5 完整代码（python 实现 C4.5/ID3 自我练习）的全部內容，
# 希望文章能夠幫你解決所遇到的問題。
# - 上一篇: iOS 14 忘记屏幕使用时间密码怎么办
# - 下一篇: python 绘制横向堆积柱状图