# C4.5 / ID3 decision tree — complete code (a self-practice Python implementation)
import numpy as np
class DecisionTree:
    """Decision tree classifier supporting the ID3 and C4.5 split criteria.

    Usage:
        - Create an instance: ``clf = DecisionTree(mode='C4.5')`` (mode is
          'C4.5' or 'ID3'; default 'C4.5').
        - Train: ``clf.fit(X, y)`` where X, y are np.ndarray (categorical
          feature values).
        - Predict: ``clf.predict(X)`` where X is an np.ndarray (one sample
          or a 2-D batch).
        - Visualize: ``clf.show()``.
    """

    def __init__(self, mode='C4.5'):
        # Fitted tree (nested dicts keyed by 'x<i>' feature names); None
        # until fit() succeeds.
        self._tree = None
        if mode in ('C4.5', 'ID3'):
            self._mode = mode
        else:
            raise Exception('mode should be C4.5 or ID3')

    def _calcEntropy(self, y):
        """Return the Shannon entropy (base 2) of the label vector ``y``."""
        num = y.shape[0]
        # Frequency of each distinct label.
        labelCounts = {}
        for label in y:
            labelCounts[label] = labelCounts.get(label, 0) + 1
        entropy = 0.0
        for count in labelCounts.values():
            prob = float(count) / num
            entropy -= prob * np.log2(prob)
        return entropy

    def _splitDataSet(self, X, y, index, value):
        """Return the subset of (X, y) whose feature ``index`` equals
        ``value``, with that feature column removed from X."""
        rows = [i for i in range(X.shape[0]) if X[i, index] == value]
        cols = [j for j in range(X.shape[1]) if j != index]
        return X[rows][:, cols], y[rows]

    def _chooseBestFeatureToSplit_ID3(self, X, y):
        """ID3 criterion: return the index of the feature with the largest
        information gain (entropy before split minus weighted entropy after).

        Returns -1 when no feature gives a strictly positive gain.
        """
        numFeatures = X.shape[1]
        oldEntropy = self._calcEntropy(y)
        bestInfoGain = 0.0
        bestFeatureIndex = -1
        for i in range(numFeatures):
            # Weighted entropy of the partition induced by feature i.
            newEntropy = 0.0
            for value in set(X[:, i]):
                sub_X, sub_y = self._splitDataSet(X, y, i, value)
                prob = len(sub_y) / float(len(y))
                newEntropy += prob * self._calcEntropy(sub_y)
            infoGain = oldEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeatureIndex = i
        return bestFeatureIndex

    def _chooseBestFeatureToSplit_C45(self, X, y):
        """C4.5 criterion: like ID3 but maximizes the gain *ratio*
        (information gain divided by the split information), which
        penalizes features with many distinct values.

        Returns -1 when no feature gives a strictly positive gain ratio.
        """
        numFeatures = X.shape[1]
        oldEntropy = self._calcEntropy(y)
        bestGainRatio = 0.0
        bestFeatureIndex = -1
        for i in range(numFeatures):
            newEntropy = 0.0
            splitInformation = 0.0
            for value in set(X[:, i]):
                sub_X, sub_y = self._splitDataSet(X, y, i, value)
                prob = len(sub_y) / float(len(y))
                newEntropy += prob * self._calcEntropy(sub_y)
                splitInformation -= prob * np.log2(prob)
            # A single-valued feature has splitInformation == 0; skip it to
            # avoid division by zero (same as the classic C4.5 treatment).
            if splitInformation == 0.0:
                continue
            gainRatio = (oldEntropy - newEntropy) / splitInformation
            if gainRatio > bestGainRatio:
                bestGainRatio = gainRatio
                bestFeatureIndex = i
        return bestFeatureIndex

    def _majorityCnt(self, labelList):
        """Return the most frequent label in ``labelList`` (first seen wins
        ties)."""
        labelCount = {}
        for vote in labelList:
            labelCount[vote] = labelCount.get(vote, 0) + 1
        # Python 3 fix: dict.iteritems() no longer exists; max() over
        # items() replaces the old sort-then-take-first idiom.
        return max(labelCount.items(), key=lambda kv: kv[1])[0]

    def _createTree(self, X, y, featureIndex):
        """Recursively build the decision tree as nested dicts.

        ``featureIndex`` is a tuple of 'x<i>' names mapping the columns of
        the (shrinking) X back to the original feature indices.
        """
        labelList = list(y)
        # All labels identical: return that label as a leaf.
        if labelList.count(labelList[0]) == len(labelList):
            return labelList[0]
        # No features left to split on: return the majority label.
        if len(featureIndex) == 0:
            return self._majorityCnt(labelList)
        # Pick the best split feature according to the configured criterion.
        if self._mode == 'C4.5':
            bestFeatIndex = self._chooseBestFeatureToSplit_C45(X, y)
        else:  # 'ID3' (only other value __init__ allows)
            bestFeatIndex = self._chooseBestFeatureToSplit_ID3(X, y)
        # NOTE(review): when no feature has positive gain the chooser
        # returns -1 and the last feature is used; confirm this fallback
        # is the intended behavior.
        bestFeatStr = featureIndex[bestFeatIndex]
        featureIndex = tuple(f for f in featureIndex if f != bestFeatStr)
        # The tree is a dict: best feature name -> {feature value -> subtree}.
        myTree = {bestFeatStr: {}}
        for value in set(X[:, bestFeatIndex]):
            sub_X, sub_y = self._splitDataSet(X, y, bestFeatIndex, value)
            myTree[bestFeatStr][value] = self._createTree(sub_X, sub_y,
                                                          featureIndex)
        return myTree

    def fit(self, X, y):
        """Build the tree from training data X (2-D) and labels y (1-D).

        Returns self to allow chaining: ``clf.fit(X, y).predict(X)``.
        Raises TypeError if X, y cannot be converted to np.ndarray.
        """
        if not (isinstance(X, np.ndarray) and isinstance(y, np.ndarray)):
            try:
                X = np.array(X)
                y = np.array(y)
            except Exception:
                raise TypeError("numpy.ndarray required for X,y")
        # Name features by their original column index so splits can be
        # mapped back to sample positions at prediction time.
        featureIndex = tuple('x' + str(i) for i in range(X.shape[1]))
        self._tree = self._createTree(X, y, featureIndex)
        return self  # allow chaining: clf.fit().predict()

    def predict(self, X):
        """Predict labels for one sample (1-D X) or a batch (2-D X).

        Raises NotFittedError if fit() has not been called.
        NOTE(review): a feature value never seen in training raises
        KeyError during tree walk — confirm whether callers rely on that.
        """
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, call `fit` first")
        if not isinstance(X, np.ndarray):
            try:
                X = np.array(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        def _classify(tree, sample):
            # Walk the nested-dict tree recursively for one sample.
            featIndex = next(iter(tree))  # Py3 fix: dict.keys()[0] invalid
            secondDict = tree[featIndex]
            # 'x<i>' -> original column i of the sample.
            key = sample[int(featIndex[1:])]
            valueOfKey = secondDict[key]
            if isinstance(valueOfKey, dict):
                return _classify(valueOfKey, sample)
            return valueOfKey

        if len(X.shape) == 1:
            return _classify(self._tree, X)
        return np.array([_classify(self._tree, X[i])
                         for i in range(X.shape[0])])

    def show(self):
        """Plot the fitted tree (requires the project's treePlotter module)."""
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, call `fit` first")
        # Plot the tree using matplotlib (via the local helper module).
        import treePlotter
        treePlotter.createPlot(self._tree)
class NotFittedError(Exception):
    """Raised when an estimator is used before ``fit`` has been called."""
# 總結
# 以上是為你收集整理的 python C4.5 完整代码（python 实现 C4.5/ID3 自我练习）的全部內容，
# 希望文章能夠幫你解決所遇到的問題。
# - 上一篇: iOS 14 忘记屏幕使用时间密码怎么办
# - 下一篇: python 绘制横向堆积柱状图