注:大部分参考《机器学习实战》,有空再来加上注释
决策树任务总结:有n条训练数据,每一条数据格式为[属性1,属性2,…,属性k,结果i],即数据为n*(k+1)的矩阵。
根据这n条数据生成一棵决策树,当来一条新数据时,能够根据k个属性,代入决策树预测出结果。
决策树是树状,叶子节点是结果,非叶子节点是决策节点,每一个决策节点是对某个属性的判断。
而选择哪一个属性作为当前划分属性,则是比较每一个属性划分前后信息熵变化的差异,选差异最大的作为当前划分属性。


trees.py

import math
import operator
# Compute the Shannon entropy (information entropy) of a data set.
def calcInformationEntropy(dataSet):
    """Return the Shannon entropy (in bits, log base 2) of *dataSet*.

    dataSet: list of rows, each row [feat1, ..., featK, classLabel];
    only the last column (the class label) is used.
    """
    numOfDataset = float(len(dataSet))
    # Count each class label in one pass. The original called
    # list.count() once per unique label, which is O(n^2) overall.
    labelCounts = {}
    for onepice in dataSet:
        label = onepice[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    Entropy = 0.0
    for countValue in labelCounts.values():
        prob = float(countValue) / numOfDataset
        # Entropy H = -sum(p * log2(p)) over all class probabilities.
        Entropy += -1 * prob * math.log(prob, 2)
    return Entropy
# Partition the data on one value of the feature at featureIndex.
def splitDataSet(dataSet, featureIndex, value):
    """Return the rows whose feature at *featureIndex* equals *value*,
    with that feature column removed from each returned row.
    """
    subDataSet = []
    for row in dataSet:
        if row[featureIndex] != value:
            continue
        # Keep everything except the column we just matched on.
        reduced = row[:featureIndex] + row[featureIndex + 1:]
        subDataSet.append(reduced)
    return subDataSet
# Pick the feature whose split yields the largest information gain.
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    dataSet: list of rows [feat1, ..., featK, classLabel].
    Always returns a valid feature index (the original returned -1 when
    no feature produced a positive gain, which made the caller split on
    the class-label column).
    """
    preEntropy = calcInformationEntropy(dataSet)
    numOfFeatures = len(dataSet[0]) - 1
    # Default to feature 0 so a valid index is returned even when every
    # gain is zero; best_gain=-1.0 guarantees the first feature wins first.
    bestFeature = 0
    maxEntropyGain = -1.0
    for i in range(numOfFeatures):
        uniqueFeatures = set(example[i] for example in dataSet)
        # Weighted average entropy of the partitions induced by feature i.
        postEntropy = 0.0
        for value in uniqueFeatures:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = float(len(subDataSet)) / float(len(dataSet))
            postEntropy += prob * calcInformationEntropy(subDataSet)
        entropyGain = preEntropy - postEntropy
        # Strict '>' keeps the earliest feature on ties, as before.
        if entropyGain > maxEntropyGain:
            maxEntropyGain = entropyGain
            bestFeature = i
    return bestFeature
# Majority vote over the remaining class labels once features run out.
def majorityCnt(classList):
    """Return the most frequent class label in *classList*.

    Used when all features are exhausted but the remaining samples still
    carry mixed class labels.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.iteritems() was Python 2 only and raises AttributeError on
    # Python 3; items() works on both.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# Build the decision tree recursively from the training samples.
def createTree(dataSet, labels):
    """Recursively construct an ID3 decision tree.

    dataSet: rows of [feat1, ..., featK, classLabel].
    labels: the human-readable names of the K feature columns.
    Returns either a class label (leaf) or a nested dict
    {featureName: {featureValue: subtree, ...}}.
    """
    classes = [row[-1] for row in dataSet]
    # All samples share one class: this branch is a pure leaf.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    # Only the class column remains: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classes)
    best = chooseBestFeatureToSplit(dataSet)
    bestLabel = labels[best]
    # The chosen feature's column is removed by splitDataSet, so drop
    # its name from the label list handed to the recursive calls.
    remainingLabels = labels[:best] + labels[best + 1:]
    branches = {}
    for value in set(row[best] for row in dataSet):
        branches[value] = createTree(splitDataSet(dataSet, best, value), remainingLabels)
    return {bestLabel: branches}
# Walk the tree for one sample (recursive helper for prediction).
def decision(Tree, inputFeature):
    """Descend *Tree* using the feature values in *inputFeature*
    (a dict mapping feature name -> value) until a leaf is reached,
    and return the leaf's class label.
    """
    rootLabel = next(iter(Tree))
    branch = Tree[rootLabel][inputFeature[rootLabel]]
    # A dict means another decision node; anything else is a leaf label.
    if not isinstance(branch, dict):
        return branch
    return decision(branch, inputFeature)
# Predict the class of one sample given its raw feature vector.
def prediction(Tree, inputFeatureVec, labelsVec):
    """Classify one sample with the trained tree.

    inputFeatureVec: the sample's feature values, in the same order as
    labelsVec (the feature names). Returns the predicted class label,
    or the string "error input" when the two lengths differ.
    """
    if len(labelsVec) != len(inputFeatureVec):
        return "error input"
    # decision() navigates by feature name, so pair names with values.
    inputDict = dict(zip(labelsVec, inputFeatureVec))
    return decision(Tree, inputDict)
# Persist the trained tree to disk.
def storeTree(inputTree, filename):
    """Serialize *inputTree* to *filename* with pickle.

    The file must be opened in binary mode: on Python 3 pickle.dump
    into a text-mode handle raises TypeError. The with-statement also
    guarantees the handle is closed on error.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
# Load a previously stored tree from disk.
def grabTree(filename):
    """Deserialize and return the tree pickled in *filename*.

    The file must be opened in binary mode ('rb'): on Python 3
    pickle.load from a text-mode handle fails. The original also leaked
    the file handle; the with-statement closes it.
    NOTE(review): pickle.load is unsafe on untrusted files — only load
    trees this program stored itself.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

test.py

import trees
import pandas as pd

# Load the tab-separated lenses data set (no header row).
df = pd.read_csv("lenses.txt", header=None, sep='\t')
labels = ['age', 'prescript', 'astigmstic', 'tearRate']
# Convert every DataFrame row into a plain Python list.
dataSet = [list(df.loc[i]) for i in range(len(df))]
# Train on all rows except the last; use the last row's features
# (dropping its class label) as a held-out query.
myTree = trees.createTree(dataSet[:-1], labels)
result = trees.prediction(myTree, dataSet[-1][:-1], labels)