效果图:
原始数据文件:
lense.txt
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
treestore.py代码
存储恢复tree
#-*-coding:utf-8-*-
def storeTree(inputTree, filename):
    """Serialize a decision tree to disk with pickle.

    Args:
        inputTree: nested-dict decision tree.
        filename: destination file path.
    Returns:
        None.
    """
    import pickle
    # Binary mode ('wb') is required: pickle writes bytes, and text mode
    # raises "TypeError: write() argument must be str, not bytes".
    # `with` guarantees the handle is closed even if dump() raises
    # (the original left the file open on an exception path).
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load a pickled decision tree from disk.

    Args:
        filename: path previously written by storeTree().
    Returns:
        The unpickled tree object.

    NOTE(security): pickle.load must only be used on trusted files —
    unpickling attacker-controlled data can execute arbitrary code.
    """
    import pickle
    # Binary mode ('rb') must match storeTree's 'wb'; text mode can raise
    # UnicodeDecodeError when decoding the raw pickle bytes.
    # Fix: the original returned pickle.load(fr) without ever closing fr,
    # leaking the file handle; `with` closes it deterministically.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
treeplot1.py绘制tree图像代码
# _*_ coding: UTF-8 _*_
import matplotlib.pyplot as plt
"""Helpers for drawing the decision tree with matplotlib."""
decisionNode = dict(boxstyle="sawtooth", fc="0.8")  # box style for internal (decision) nodes
leafNode = dict(boxstyle="round4", fc="0.8")  # box style for leaf nodes
arrow_args = dict(arrowstyle="<-")  # arrow style for parent->child edges
# 计算树的叶子节点数量
def getNumLeafs(myTree):
    """Recursively count the leaf nodes of a nested-dict decision tree.

    A child that is itself a dict is a subtree; anything else is a leaf.
    """
    root_label = list(myTree.keys())[0]
    branches = myTree[root_label]
    return sum(
        getNumLeafs(child) if isinstance(child, dict) else 1
        for child in branches.values()
    )
# 计算树的最大深度
def getTreeDepth(myTree):
    """Return the maximum depth (number of decision levels) of the tree.

    A leaf child contributes depth 1; a subtree contributes 1 plus its
    own recursive depth. An empty branch dict yields 0, as in the original.
    """
    root_label = list(myTree.keys())[0]
    branches = myTree[root_label]
    child_depths = [
        1 + getTreeDepth(child) if isinstance(child, dict) else 1
        for child in branches.values()
    ]
    return max(child_depths, default=0)
# 画出节点
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one node box at centerPt with an arrow coming from parentPt.

    Coordinates are in axes-fraction units on the axes created by createPlot().
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
# 标箭头上的文字
def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString at the midpoint of the edge between two nodes.

    The x midpoint is nudged left proportionally to the text length so
    the label sits roughly centered on the edge.
    """
    mid_x = (parentPt[0] + cntrPt[0]) / 2.0 - len(txtString) * 0.002
    mid_y = (parentPt[1] + cntrPt[1]) / 2.0
    createPlot.ax1.text(mid_x, mid_y, txtString)
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw subtree ``myTree`` below point ``parentPt``.

    Relies on function attributes initialized by createPlot():
      plotTree.totalW / plotTree.totalD -- total leaf count / tree depth
        (used to scale x / y spacing);
      plotTree.x0ff / plotTree.y0ff -- the running drawing cursor, mutated
        as the traversal proceeds.

    Args:
        myTree: nested dict {feature: {value: subtree-or-label}}.
        parentPt: (x, y) of the parent node, in axes-fraction coordinates.
        nodeTxt: label for the edge from the parent to this node.
    """
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)  # NOTE(review): computed but never used
    firstStr = list(myTree.keys())[0]
    # Center this node horizontally over the span of its own leaves.
    cntrPt = (plotTree.x0ff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.y0ff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step one level down before drawing the children.
    plotTree.y0ff = plotTree.y0ff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            # Internal node: recurse with this node as the new parent.
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # Leaf: advance the x cursor, draw the leaf box and edge label.
            plotTree.x0ff = plotTree.x0ff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.x0ff, plotTree.y0ff), cntrPt, leafNode)
            plotMidText((plotTree.x0ff, plotTree.y0ff), cntrPt, str(key))
    # Restore the y cursor when backing out of this level.
    plotTree.y0ff = plotTree.y0ff + 1.0 / plotTree.totalD
def createPlot(inTree):
    """Set up the figure and draw the whole decision tree, then show it.

    Args:
        inTree: nested dict {feature: {value: subtree-or-label}}.
    Returns:
        None (displays a matplotlib window).
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide both axes' ticks
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Seed plotTree's layout state: total width/depth scale the spacing,
    # and the cursors start half a leaf-slot left of the origin, at the top.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.x0ff = -0.5 / plotTree.totalW
    plotTree.y0ff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


if __name__ == '__main__':
    # Fix: the original guard had an empty body, which is a SyntaxError.
    # Minimal self-test: plot a tiny hand-built tree.
    createPlot({'tearRate': {'reduced': 'no lenses', 'normal': 'soft'}})
id3的决策树代码
from math import log
import operator
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import treeplot1
import treestore
# 计算数据的熵(entropy)-原始熵
def dataentropy(data, feat):
    """Shannon entropy of the class labels (last element of each row).

    Args:
        data: list of rows; each row's last element is its class label.
        feat: NOTE(review) accepted but never used; kept so existing
            callers that pass it keep working.
    Returns:
        The entropy in bits: -sum(p * log2(p)) over the label frequencies.
    """
    total = len(data)
    counts = {}
    for row in data:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    result = 0.0
    for count in counts.values():
        p = float(count) / total
        result -= p * log(p, 2)
    return result
# 处理后导入数据数据
def Importdata(datafile):
    """Load the Excel dataset and quantize its Chinese text columns.

    Args:
        datafile: path to an .xls/.xlsx file (use read_csv for csv input).
    Returns:
        (data, labels): data is a list of quantized rows (features plus the
        class column last); labels is the list of quantized feature names.
    """
    dataa = pd.read_excel(datafile)
    # Map each raw text value to its numeric code.
    productDict = {'高': 1, '一般': 2, '低': 3, '帅': 1, '丑': 3, '胖': 3, '瘦': 1, '是': 1, '否': 0}
    column_pairs = [
        ('income', '收入'),
        ('hight', '身高'),
        ('look', '长相'),
        ('shape', '体型'),
        ('is_meet', '是否见面'),
    ]
    for new_col, raw_col in column_pairs:
        dataa[new_col] = dataa[raw_col].map(productDict)
    # Keep only the quantized columns (drop the original text columns).
    data = dataa.iloc[:, 5:].values.tolist()
    # Feature names are the quantized columns minus the class column.
    labels = dataa.iloc[0:0, 5:-1].columns.values.tolist()
    return data, labels
# 按某个特征value分类后的数据
def splitData(data, i, value):
    """Return the rows whose i-th feature equals value, with column i removed.

    Args:
        data: list of feature rows.
        i: index of the feature to split on.
        value: feature value selecting the subset.
    Returns:
        New list of rows (originals are not mutated), each missing column i.

    Fix: the original shadowed the function's own name with a local list
    (`splitData = []`), which would break any recursive/self reference and
    is a latent bug; the local is renamed and the loop is a comprehension.
    """
    return [row[:i] + row[i + 1:] for row in data if row[i] == value]
# 选择最优的分类特征
def BestSplit(data):
    """Pick the feature index with the largest ID3 information gain.

    Args:
        data: rows whose last element is the class label.
    Returns:
        Index of the best feature, or -1 if no split improves on gain 0.
    """
    feature_count = len(data[0]) - 1  # last column is the class label
    base_entropy = dataentropy(data, -1)
    best_gain, best_feature = 0, -1
    for idx in range(feature_count):
        values = {row[idx] for row in data}
        # Conditional entropy after partitioning on this feature.
        cond_entropy = 0.0
        for val in values:
            subset = splitData(data, idx, val)
            weight = len(subset) / float(len(data))
            cond_entropy += weight * dataentropy(subset, idx)
        gain = base_entropy - cond_entropy
        if gain > best_gain:
            best_gain, best_feature = gain, idx
    return best_feature
# 按分类后类别数量排序,取数量较大的
def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties resolve to the label encountered first, matching the original's
    stable descending sort.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first-encountered maximum, same tie behavior as
    # a stable sort in reverse order followed by taking element [0].
    return max(tally.items(), key=operator.itemgetter(1))[0]
# 建树
def createTree(data, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        data: rows whose last element is the class label.
        labels: feature names aligned with the feature columns.
    Returns:
        Either a class label (leaf) or {feature: {value: subtree}}.

    Fix: the original deleted entries from the caller's `labels` list,
    forcing callers to re-create it after every call (see the main script's
    workaround); we now work on a private copy, which is backward compatible.
    """
    labels = labels[:]  # private copy so the caller's list is never mutated
    classList = [row[-1] for row in data]
    # Base case 1: all rows share one label -> pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: no features left -> majority vote.
    if len(data[0]) == 1:
        return majorityCnt(classList)
    bestFeat = BestSplit(data)  # feature with the largest information gain
    bestLab = labels[bestFeat]
    myTree = {bestLab: {}}  # tree stored as nested dicts
    del labels[bestFeat]  # safe: only our copy is modified
    featValues = [row[bestFeat] for row in data]
    for value in set(featValues):
        # Recurse on each partition; pass a fresh copy of the labels.
        myTree[bestLab][value] = createTree(splitData(data, bestFeat, value), labels[:])
    return myTree
# 选择最优的分类特征C.45算法
def BestSplitc45(data):
    """Pick the feature index with the largest C4.5 gain ratio.

    Args:
        data: rows whose last element is the class label.
    Returns:
        Index of the best feature, or -1 if every feature is skipped.

    Bug fixed: the original computed the split information AFTER the value
    loop as dataentropy(subData, i), i.e. the class entropy of only the
    LAST value's subset. C4.5's split information is the entropy of the
    partition itself: -sum(|Dv|/|D| * log2(|Dv|/|D|)) over all values v,
    which is now accumulated inside the loop.
    """
    numFea = len(data[0]) - 1  # last column is the class label
    baseEnt = dataentropy(data, -1)  # entropy before any split
    bestGainRate = 0
    bestFeat = -1
    for i in range(numFea):
        uniqueVals = set(row[i] for row in data)
        newEnt = 0.0  # conditional entropy after splitting on feature i
        splitInfo = 0.0  # split information of the partition
        for value in uniqueVals:
            subData = splitData(data, i, value)
            prob = len(subData) / float(len(data))
            newEnt += prob * dataentropy(subData, i)
            if prob > 0:
                splitInfo -= prob * log(prob, 2)
        info = baseEnt - newEnt  # information gain
        # splitInfo == 0 means the feature has a single value everywhere;
        # the gain ratio is undefined, so skip this feature.
        if splitInfo == 0:
            continue
        gainRate = info / splitInfo
        if gainRate > bestGainRate:
            bestGainRate = gainRate
            bestFeat = i
    return bestFeat
def classify(inputTree, featLabels, testVec):
    """Classify one sample by walking the decision tree.

    Args:
        inputTree: nested-dict tree {feature: {value: subtree-or-label}}.
        featLabels: feature names, positionally aligned with testVec.
        testVec: the sample's feature values.
    Returns:
        The predicted class label, or None when the sample's value has no
        matching branch (the original raised UnboundLocalError here).
    """
    # The root key is the feature this level splits on.
    firstStr = list(inputTree.keys())[0]
    # Its value maps each feature value to a subtree or a leaf label.
    secondDict = inputTree[firstStr]
    # Translate the feature name into the sample's index.
    featIndex = featLabels.index(firstStr)
    classLabel = None  # fix: defined even if no branch matches testVec
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                # Internal node: keep descending.
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # Leaf: its value is the prediction.
                classLabel = secondDict[key]
    return classLabel
if __name__ == '__main__':
    # Load the lens dataset: tab-separated rows, last column is the label.
    # `with` closes the file (the original leaked the handle).
    with open("lense.txt") as fr:
        lenses = [inst.strip().split("\t") for inst in fr.readlines()]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRte"]
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)  # the fitted decision-tree model
    # Round-trip the tree through pickle storage and verify by printing.
    treestore.storeTree(lensesTree, 'classifierStorage.txt')
    lensesTree1 = treestore.grabTree('classifierStorage.txt')
    print(lensesTree1)
    treeplot1.createPlot(lensesTree)
    # Classify one sample. Labels are re-supplied because the original
    # createTree mutated the list; harmless either way.
    lensesLabels = ["age", "prescript", "astigmatic", "tearRte"]
    glasstype = classify(lensesTree, lensesLabels, ['presbyopic', 'hyper', 'no', 'normal'])
    print(glasstype)