基于ID3決策樹算法的實(shí)現(xiàn)(Python版)
實(shí)例如下:
# -*- coding:utf-8 -*- from numpy import * import numpy as np import pandas as pd from math import log import operator #計(jì)算數(shù)據(jù)集的香農(nóng)熵 def calcShannonEnt(dataSet): numEntries=len(dataSet) labelCounts={} #給所有可能分類創(chuàng)建字典 for featVec in dataSet: currentLabel=featVec[-1] if currentLabel not in labelCounts.keys(): labelCounts[currentLabel]=0 labelCounts[currentLabel]+=1 shannonEnt=0.0 #以2為底數(shù)計(jì)算香農(nóng)熵 for key in labelCounts: prob = float(labelCounts[key])/numEntries shannonEnt-=prob*log(prob,2) return shannonEnt #對(duì)離散變量劃分?jǐn)?shù)據(jù)集,取出該特征取值為value的所有樣本 def splitDataSet(dataSet,axis,value): retDataSet=[] for featVec in dataSet: if featVec[axis]==value: reducedFeatVec=featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet #對(duì)連續(xù)變量劃分?jǐn)?shù)據(jù)集,direction規(guī)定劃分的方向, #決定是劃分出小于value的數(shù)據(jù)樣本還是大于value的數(shù)據(jù)樣本集 def splitContinuousDataSet(dataSet,axis,value,direction): retDataSet=[] for featVec in dataSet: if direction==0: if featVec[axis]>value: reducedFeatVec=featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) else: if featVec[axis]<=value: reducedFeatVec=featVec[:axis] reducedFeatVec.extend(featVec[axis+1:]) retDataSet.append(reducedFeatVec) return retDataSet #選擇最好的數(shù)據(jù)集劃分方式 def chooseBestFeatureToSplit(dataSet,labels): numFeatures=len(dataSet[0])-1 baseEntropy=calcShannonEnt(dataSet) bestInfoGain=0.0 bestFeature=-1 bestSplitDict={} for i in range(numFeatures): featList=[example[i] for example in dataSet] #對(duì)連續(xù)型特征進(jìn)行處理 if type(featList[0]).__name__=='float' or type(featList[0]).__name__=='int': #產(chǎn)生n-1個(gè)候選劃分點(diǎn) sortfeatList=sorted(featList) splitList=[] for j in range(len(sortfeatList)-1): splitList.append((sortfeatList[j]+sortfeatList[j+1])/2.0) bestSplitEntropy=10000 slen=len(splitList) #求用第j個(gè)候選劃分點(diǎn)劃分時(shí),得到的信息熵,并記錄最佳劃分點(diǎn) for j in range(slen): value=splitList[j] newEntropy=0.0 subDataSet0=splitContinuousDataSet(dataSet,i,value,0) subDataSet1=splitContinuousDataSet(dataSet,i,value,1) prob0=len(subDataSet0)/float(len(dataSet)) newEntropy+=prob0*calcShannonEnt(subDataSet0) prob1=len(subDataSet1)/float(len(dataSet)) newEntropy+=prob1*calcShannonEnt(subDataSet1) if newEntropy<bestSplitEntropy: bestSplitEntropy=newEntropy bestSplit=j #用字典記錄當(dāng)前特征的最佳劃分點(diǎn) bestSplitDict[labels[i]]=splitList[bestSplit] infoGain=baseEntropy-bestSplitEntropy #對(duì)離散型特征進(jìn)行處理 else: uniqueVals=set(featList) newEntropy=0.0 #計(jì)算該特征下每種劃分的信息熵 for value in uniqueVals: subDataSet=splitDataSet(dataSet,i,value) prob=len(subDataSet)/float(len(dataSet)) newEntropy+=prob*calcShannonEnt(subDataSet) infoGain=baseEntropy-newEntropy if infoGain>bestInfoGain: bestInfoGain=infoGain bestFeature=i #若當(dāng)前節(jié)點(diǎn)的最佳劃分特征為連續(xù)特征,則將其以之前記錄的劃分點(diǎn)為界進(jìn)行二值化處理 #即是否小于等于bestSplitValue if type(dataSet[0][bestFeature]).__name__=='float' or type(dataSet[0][bestFeature]).__name__=='int': bestSplitValue=bestSplitDict[labels[bestFeature]] labels[bestFeature]=labels[bestFeature]+'<='+str(bestSplitValue) for i in range(shape(dataSet)[0]): if dataSet[i][bestFeature]<=bestSplitValue: dataSet[i][bestFeature]=1 else: dataSet[i][bestFeature]=0 return bestFeature #特征若已經(jīng)劃分完,節(jié)點(diǎn)下的樣本還沒有統(tǒng)一取值,則需要進(jìn)行投票 def majorityCnt(classList): classCount={} for vote in classList: if vote not in classCount.keys(): classCount[vote]=0 classCount[vote]+=1 return max(classCount) #主程序,遞歸產(chǎn)生決策樹 def createTree(dataSet,labels,data_full,labels_full): classList=[example[-1] for example in dataSet] if classList.count(classList[0])==len(classList): return classList[0] if len(dataSet[0])==1: return majorityCnt(classList) bestFeat=chooseBestFeatureToSplit(dataSet,labels) bestFeatLabel=labels[bestFeat] myTree={bestFeatLabel:{}} featValues=[example[bestFeat] for example in dataSet] uniqueVals=set(featValues) if type(dataSet[0][bestFeat]).__name__=='str': currentlabel=labels_full.index(labels[bestFeat]) featValuesFull=[example[currentlabel] for example in data_full] uniqueValsFull=set(featValuesFull) del(labels[bestFeat]) #針對(duì)bestFeat的每個(gè)取值,劃分出一個(gè)子樹。 for value in uniqueVals: subLabels=labels[:] if type(dataSet[0][bestFeat]).__name__=='str': uniqueValsFull.remove(value) myTree[bestFeatLabel][value]=createTree(splitDataSet\ (dataSet,bestFeat,value),subLabels,data_full,labels_full) if type(dataSet[0][bestFeat]).__name__=='str': for value in uniqueValsFull: myTree[bestFeatLabel][value]=majorityCnt(classList) return myTree import matplotlib.pyplot as plt decisionNode=dict(boxstyle="sawtooth",fc="0.8") leafNode=dict(boxstyle="round4",fc="0.8") arrow_args=dict(arrowstyle="<-") #計(jì)算樹的葉子節(jié)點(diǎn)數(shù)量 def getNumLeafs(myTree): numLeafs=0 firstSides = list(myTree.keys()) firstStr=firstSides[0] secondDict=myTree[firstStr] for key in secondDict.keys(): if type(secondDict[key]).__name__=='dict': numLeafs+=getNumLeafs(secondDict[key]) else: numLeafs+=1 return numLeafs #計(jì)算樹的最大深度 def getTreeDepth(myTree): maxDepth=0 firstSides = list(myTree.keys()) firstStr=firstSides[0] secondDict=myTree[firstStr] for key in secondDict.keys(): if type(secondDict[key]).__name__=='dict': thisDepth=1+getTreeDepth(secondDict[key]) else: thisDepth=1 if thisDepth>maxDepth: maxDepth=thisDepth return maxDepth #畫節(jié)點(diǎn) def plotNode(nodeTxt,centerPt,parentPt,nodeType): createPlot.ax1.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',\ xytext=centerPt,textcoords='axes fraction',va="center", ha="center",\ bbox=nodeType,arrowprops=arrow_args) #畫箭頭上的文字 def plotMidText(cntrPt,parentPt,txtString): lens=len(txtString) xMid=(parentPt[0]+cntrPt[0])/2.0-lens*0.002 yMid=(parentPt[1]+cntrPt[1])/2.0 createPlot.ax1.text(xMid,yMid,txtString) def plotTree(myTree,parentPt,nodeTxt): numLeafs=getNumLeafs(myTree) depth=getTreeDepth(myTree) firstSides = list(myTree.keys()) firstStr=firstSides[0] cntrPt=(plotTree.x0ff+(1.0+float(numLeafs))/2.0/plotTree.totalW,plotTree.y0ff) plotMidText(cntrPt,parentPt,nodeTxt) plotNode(firstStr,cntrPt,parentPt,decisionNode) secondDict=myTree[firstStr] plotTree.y0ff=plotTree.y0ff-1.0/plotTree.totalD for key in secondDict.keys(): if type(secondDict[key]).__name__=='dict': plotTree(secondDict[key],cntrPt,str(key)) else: plotTree.x0ff=plotTree.x0ff+1.0/plotTree.totalW plotNode(secondDict[key],(plotTree.x0ff,plotTree.y0ff),cntrPt,leafNode) plotMidText((plotTree.x0ff,plotTree.y0ff),cntrPt,str(key)) plotTree.y0ff=plotTree.y0ff+1.0/plotTree.totalD def createPlot(inTree): fig=plt.figure(1,facecolor='white') fig.clf() axprops=dict(xticks=[],yticks=[]) createPlot.ax1=plt.subplot(111,frameon=False,**axprops) plotTree.totalW=float(getNumLeafs(inTree)) plotTree.totalD=float(getTreeDepth(inTree)) plotTree.x0ff=-0.5/plotTree.totalW plotTree.y0ff=1.0 plotTree(inTree,(0.5,1.0),'') plt.show() df=pd.read_csv('watermelon_4_3.csv') data=df.values[:,1:].tolist() data_full=data[:] labels=df.columns.values[1:-1].tolist() labels_full=labels[:] myTree=createTree(data,labels,data_full,labels_full) print(myTree) createPlot(myTree)
最終結(jié)果如下:
{'texture': {'blur': 0, 'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}}, 'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}
得到的決策樹如下:
參考資料:
《機(jī)器學(xué)習(xí)實(shí)戰(zhàn)》
《機(jī)器學(xué)習(xí)》周志華著
以上這篇基于ID3決策樹算法的實(shí)現(xiàn)(Python版)就是小編分享給大家的全部?jī)?nèi)容了,希望能給大家一個(gè)參考,也希望大家多多支持腳本之家。
相關(guān)文章
基于python實(shí)現(xiàn)開箱即用的桌面時(shí)鐘
這篇文章主要為大家詳細(xì)介紹了如何基于python實(shí)現(xiàn)開箱一個(gè)即用的桌面時(shí)鐘,文中的示例代碼講解詳細(xì),具有一定的借鑒價(jià)值,需要的小伙伴可以參考下2023-12-12Python使用機(jī)器學(xué)習(xí)模型實(shí)現(xiàn)溫度預(yù)測(cè)詳解
使用?Python?可以使用機(jī)器學(xué)習(xí)模型進(jìn)行溫度預(yù)測(cè)。常用的模型有回歸分析、隨機(jī)森林等。本文就來和大家來了具體實(shí)現(xiàn)方法,希望對(duì)大家有所幫助2023-01-01Python實(shí)現(xiàn)的序列化和反序列化二叉樹算法示例
這篇文章主要介紹了Python實(shí)現(xiàn)的序列化和反序列化二叉樹算法,結(jié)合實(shí)例形式分析了Python二叉樹的構(gòu)造、遍歷、序列化、反序列化等相關(guān)操作技巧,需要的朋友可以參考下2019-03-03python基于OpenCV模塊實(shí)現(xiàn)視頻流數(shù)據(jù)切割為圖像幀數(shù)據(jù)(流程分析)
這篇文章主要介紹了python基于OpenCV模塊實(shí)現(xiàn)視頻流數(shù)據(jù)切割為圖像幀數(shù)據(jù),這里今天主要是實(shí)踐一下視頻流數(shù)據(jù)的預(yù)處理工作,需要的朋友可以參考下2022-05-05