1. Association Rule Mining Algorithms
The association rule mining system lets the user choose either of the two classic algorithms, Apriori or FP-Growth, and outputs the frequent itemsets and strong association rules that are found. The input file is imported from the local machine, and the minimum support count and minimum confidence parameters can be set freely. An example of the expected input format is sketched below.
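For illustration only, a hypothetical input file (one transaction per line, items separated by commas, which is the format that loadDataSet() in section 5 expects) might look like this:

bread,milk
bread,diaper,beer,eggs
milk,diaper,beer,cola
bread,milk,diaper,beer
bread,milk,diaper,cola

With a minimum support count of 3 and a minimum confidence of 0.7, {diaper, beer} would be reported as a frequent itemset (it occurs in 3 of the 5 transactions), and beer --> diaper as a strong rule (confidence 3/3 = 1.0).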
2. Design Idea of the Apriori Algorithm
The Apriori algorithm is essentially an iterative, level-wise search: frequent k-itemsets are found by generating candidate itemsets and testing them against the data, level by level. Its characteristic cost is that the database must be scanned once for every level of frequent itemsets produced. A minimal sketch of this idea follows.
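The sketch below is for illustration only and is not the code used by the system (that code is the apriori()/aprioriGen()/scanD() methods of GUI.py in section 5); it uses a simpler, unpruned candidate-generation step, but shows the level-wise loop and the one database scan per level:

from itertools import combinations

def apriori_sketch(transactions, min_support_count):
    # level 1: every distinct item is a candidate 1-itemset
    transactions = [frozenset(t) for t in transactions]
    candidates = list({frozenset([i]) for t in transactions for i in t})
    frequent = {}
    k = 1
    while candidates:
        # one scan of the database per level: count each candidate's support
        counts = {c: sum(1 for t in transactions if c <= t) for c in candidates}
        level = {c: n for c, n in counts.items() if n >= min_support_count}
        frequent.update(level)
        # join step: build (k+1)-item candidates from the frequent k-itemsets
        keys = list(level)
        candidates = list({a | b for a, b in combinations(keys, 2) if len(a | b) == k + 1})
        k += 1
    return frequent

if __name__ == '__main__':
    data = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b', 'c']]
    print(apriori_sketch(data, 2))  # frequent itemsets with support count >= 2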
3. Design Idea of the FP-Growth Algorithm
The FP-Growth algorithm stores the dataset in a dedicated structure called an FP-tree and needs only two passes over the dataset to discover all frequent patterns. Frequent itemset discovery is split into two phases: the first phase builds the FP-tree, and the second phase mines the frequent itemsets out of the FP-tree. A short usage sketch based on the fp.py module from section 5 is shown below.
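The two phases map directly onto the fp.py module listed in section 5: createFPTree() performs the two scans and builds the tree together with its header table, and mineFPTree() recursively mines conditional FP-trees. A minimal usage sketch, assuming fp.py is importable and using a small hand-written toy dataset:

import fp

# hypothetical toy dataset; each inner list is one transaction
dataSet = [['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], ['b', 'c']]
frozenData = fp.transfer2FrozenDataSet(dataSet)

# phase 1: build the FP-tree and its header table (two passes over the data)
fptree, headPointTable = fp.createFPTree(frozenData, 2)

# phase 2: recursively mine conditional FP-trees for frequent patterns
frequentPatterns = {}
fp.mineFPTree(headPointTable, set(), frequentPatterns, 2)
print(frequentPatterns)

# derive strong association rules from the frequent patterns
rules = []
fp.rulesGenerator(frequentPatterns, 0.7, rules)
print(rules)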
4. User Interface
1) Click the "导入文件" (import file) button to load a file, as shown in Figure 4-1:
Figure 4-1 Importing a file into the association rule mining system
2) Select the Apriori algorithm radio button; the loaded file is processed and the algorithm runs. The result is shown in Figure 4-2:
Figure 4-2 Apriori algorithm run in the association rule mining system
3) Building on step 2), click the "清屏" (clear screen) button, change the minimum support count and minimum confidence, and click "Apriori算法" to run again. The result is shown in Figure 4-3:
Figure 4-3 Apriori algorithm rerun with modified parameters
4) Select the FP-Growth algorithm radio button; the loaded file is processed and the algorithm runs. The result is shown in Figure 4-4:
Figure 4-4 FP-Growth algorithm run in the association rule mining system
5) Building on step 4), click the "清屏" (clear screen) button, change the minimum support count and minimum confidence as shown in Figure 4-5, and click "FP-Growth算法" to run again. The result is shown in Figure 4-6:
Figure 4-5 Clearing the display of the association rule mining system
Figure 4-6 FP-Growth algorithm rerun with modified parameters
6) Click "退出" (exit) to clear the console and quit the system, as shown in Figure 4-7:
Figure 4-7 Exiting the association rule mining system
5. Source Code
The development environment is Spyder, the implementation language is Python 3.7, and the GUI is built with tkinter.
1) Main GUI script: GUI.py
# -*- coding: utf-8 -*-
import sys
import fp
import tkinter as tk
from tkinter import filedialog
from tkinter import scrolledtext
from tkinter import messagebox
class GUI(object):
    # Lay out the interface
    def __init__(self):
        # Set up the main window
        self.window=tk.Tk()
        self.window.title('关联规则挖掘系统')
        self.window.geometry('1150x550')
        # "Import file" button
        self.botton1=tk.Button(self.window, text='导入文件',bg='green',fg='white', font=('楷体', 12, 'bold'), width=8, height=1,command=self.openfile)
        self.botton1.place(x=70,y=60)
        # Parameter labels
        self.label2=tk.Label(self.window, text='最小支持数',bg='light blue',fg='white', font=('楷体', 16, 'bold'), width=10, height=1).place(x=10,y=160)
        self.label3=tk.Label(self.window, text='最小置信度',bg='light blue',fg='white', font=('楷体', 16, 'bold'), width=10, height=1).place(x=10,y=220)
        # Heading for the imported-file display
        self.label4=tk.Label(self.window, text='导入文件内容如下',font=('楷体', 16, 'bold'), width=16, height=1).place(x=260,y=20)
        # Scrolled text box that shows the imported file
        self.text1=scrolledtext.ScrolledText(self.window, height=28, width=23,font=('楷体', 13))
        self.text1.place(x=250,y=60)
        self.text1.bind("<Button-1>",self.clear)
        # Heading for the frequent itemsets and strong association rules
        self.label5=tk.Label(self.window, text='频繁项集和强关联规则',font=('楷体', 16, 'bold'), width=20, height=1).place(x=700,y=20)
        # Scrolled text box that shows the mining results
        self.text2=scrolledtext.ScrolledText(self.window, height=28, width=60,font=('楷体', 10))
        self.text2.place(x=550,y=60)
        self.text2.bind("<Button-1>",self.clear)
        # self.text2.bind("<Button-1>",self.run)
        # Entry box that shows the path of the imported file
        self.var0=tk.StringVar()
        self.entry1=tk.Entry(self.window, show=None, width='25', font=('Arial', 10), textvariable=self.var0)
        self.entry1.place(x=10,y=100)
        # Minimum support count, set by the user (default 3)
        self.var1=tk.StringVar()
        self.var1.set('3')
        self.entry2=tk.Entry(self.window, show=None, width='3', font=('Arial', 16), textvariable=self.var1)
        self.entry2.place(x=180,y=160)
        # Minimum confidence, set by the user (default 0.7)
        self.var2=tk.StringVar()
        self.var2.set('0.7')
        self.entry3=tk.Entry(self.window, show=None, width='3', font=('Arial', 16), textvariable=self.var2)
        self.entry3.place(x=180,y=220)
        # Radio buttons for choosing the algorithm
        self.btnlist=tk.IntVar()
        self.radiobtn1=tk.Radiobutton(self.window, variable=self.btnlist, value=0, text='Apriori算法', font=('bold'), command=self.runApriori)
        self.radiobtn1.place(x=30,y=290)
        self.radiobtn2=tk.Radiobutton(self.window, variable=self.btnlist, value=1,text='FP-Growth算法', font=('bold'), command=self.runFPGrowth)
        self.radiobtn2.place(x=30,y=330)
        self.btnlist.set(0)
        # "Run" button (commented out; each radio button runs its algorithm directly)
        # self.btn1=tk.Button(self.window, bg='green',fg='white', text='运行', font=('楷体', 12,'bold'), width=6, height=1, command=self.run)
        # self.btn1.place(x=80,y=360)
        # "Clear screen" button
        self.btn2=tk.Button(self.window, bg='green',fg='white', text='清屏', font=('楷体', 12,'bold'), width=6, height=1)
        self.btn2.place(x=80,y=390)
        self.btn2.bind("<Button-1>",self.clear)
        # "Exit" button
        self.btn3=tk.Button(self.window, bg='green',fg='white', text='退出', font=('楷体', 12,'bold'), width=6, height=1)
        self.btn3.place(x=80,y=450)
        self.btn3.bind("<Button-1>",self.close)
        # Enter the main event loop
        self.window.mainloop()
    # Clear the two display boxes
    def clear(self,event):
        # Clearing the file path and parameter entries as well would break repeated runs, so these lines stay commented out
        # self.entry1.delete(0,tk.END)
        # self.entry2.delete(0,tk.END)
        # self.entry3.delete(0,tk.END)
        self.text1.delete("1.0",tk.END)
        self.text2.delete("1.0",tk.END)
    # Exit the system after confirmation
    def close(self,event):
        e=messagebox.askokcancel('询问','确定退出系统吗?')
        if e==True:
            self.window.destroy()
    def __del__(self):
        # Restore sys.stdout and sys.stderr
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
    # Read the parameter values (support count and confidence) from their entry boxes and return them as numbers
    def getDataSupport(self):
        entry_num1 = float(self.var1.get())
        return entry_num1
    def getDataConfidence(self):
        entry_num2 = float(self.var2.get())
        return entry_num2
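    # Open a file dialog and write the chosen path into the path entry box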
    def openfile(self):
        nameFile = filedialog.askopenfilename(title='打开文件', filetypes=[('csv', '*.csv'),('txt', '*.txt')])
        self.entry1.insert('insert', nameFile)
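    # Return the file path currently held in the path entry box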
    def getnamefile(self):
        namefile=self.var0.get()
        return namefile
    # Read the imported file and convert it into a list of transactions
    def loadDataSet(self):
        nameFile=self.getnamefile()
        with open(nameFile,"r",encoding='utf-8') as myfile:
            data=myfile.read()
        self.text1.insert("1.0",data)
        self.text1.see("end")
        list_result=data.split("\n")  # split the file into individual lines on "\n"
        length=len(list_result)
        for i in range(length):
            list_result[i]=list_result[i].split(",")  # items in the csv file are separated by commas
        return list_result
    def runApriori(self):
        dataSet = self.loadDataSet()
        minSupport = self.getDataSupport()
        L,suppData = self.apriori(dataSet,minSupport)
        minConf = self.getDataConfidence()
        rules = self.generateRules(L,suppData,minConf)
        s='#######################Apriori算法##########################\n'
        self.text2.insert('insert',s)
        t1='\n频繁项集:\n'
        self.text2.insert('insert',t1)
        self.text2.insert('insert',L)
        t2='\n\n强关联规则:\n'
        self.text2.insert('insert',t2)
        for line in rules:
            r =str(line[0]) + '-->' + str(line[1]) + '置信度:' + str(line[2]) + '\n'
            self.text2.insert('insert',r)
    def runFPGrowth(self):
        dataSet = self.loadDataSet()
        frozenDataSet = fp.transfer2FrozenDataSet(dataSet)
        minSupport = self.getDataSupport()
        s='#######################FP_Growth算法########################\n'
        self.text2.insert('insert',s)
        t='\nFP树:\n'
        self.text2.insert('insert',t)
        fptree, headPointTable = fp.createFPTree(frozenDataSet, minSupport)
        # reset the module-level display buffers so repeated runs do not accumulate old trees
        fp.res1.clear()
        fp.res2.clear()
        fp.res3.clear()
        fptree.disp()
        self.text2.insert('insert',fptree.display())
        frequentPatterns = {}
        prefix = set([])
        fp.mineFPTree(headPointTable, prefix, frequentPatterns, minSupport)
        t1='\n频繁项集:\n'
        self.text2.insert('insert',t1)
        t2=frequentPatterns
        self.text2.insert('insert',t2)
        minConf = self.getDataConfidence()
        rules = []
        fp.rulesGenerator(frequentPatterns, minConf, rules)
        t3='\n\n强关联规则:\n'
        self.text2.insert('insert',t3)
        for line in rules:
            r =str(line[0]) + '-->' + str(line[1]) + '置信度:' + str(line[2]) + '\n'
            self.text2.insert('insert',r)
    # Create C1, the list of all candidate itemsets of size 1
    def createC1(self,dataSet):
        C1 = []
        for transaction in dataSet:
            for item in transaction:
                if not [item] in C1:
                    C1.append([item])
        C1.sort()
        return list(map(frozenset,C1))  # build an immutable frozenset for each candidate in C1
    # Scan the dataset and return the candidates in Ck that meet minSupport, together with their support counts
    def scanD(self,D, Ck, minSupport):
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    if can not in ssCnt:
                        ssCnt[can] = 1
                    else:
                        ssCnt[can] += 1
        # numItems = float(len(D))
        retList = []
        supportData = {}
        for key in ssCnt:
            # support = ssCnt[key] / numItems  # relative support; this system uses absolute support counts instead
            support = ssCnt[key]
            if support >= minSupport:
                retList.insert(0,key)
            supportData[key] = support
        return retList, supportData
    # Generate the candidate itemsets Ck from Lk-1
    def aprioriGen(self,Lk, k):
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):  # merge two sets when their first k-2 items are identical
            for j in range(i + 1, lenLk):
                L1 = list(Lk[i])[:k - 2]
                L2 = list(Lk[j])[:k - 2]
                L1.sort()
                L2.sort()
                if L1 == L2:
                    retList.append(Lk[i] | Lk[j])
        return retList
    # Main Apriori routine: iterate level by level until no further frequent itemsets are found
    def apriori(self,dataSet, minSupport):
        minSupport = self.getDataSupport()
        C1 = self.createC1(dataSet)
        D = list(map(set, dataSet))
        L1, supportData = self.scanD(D, C1, minSupport)
        L = [L1]
        k = 2
        while (len(L[k - 2]) > 0):
            Ck = self.aprioriGen(L[k - 2], k)
            Lk, supK = self.scanD(D, Ck, minSupport)  # scan the dataset to derive Lk from Ck
            supportData.update(supK)
            L.append(Lk)
            k += 1
        return L, supportData
    # Generate association rules from the frequent itemsets
    def generateRules(self,L, supportData, minConf):
        minConf = self.getDataConfidence()
        bigRuleList = []
        for i in range(1, len(L)):
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]
                if (i > 1):
                    self.rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
                else:
                    self.calcConf(freqSet, H1, supportData, bigRuleList, minConf)
        return bigRuleList
    # Compute confidence values and keep the rules that meet minConf
    def calcConf(self,freqSet, H, supportData, brl, minConf):
        minConf = self.getDataConfidence()
        prunedH = []
        for conseq in H:
            conf = supportData[freqSet]/supportData[freqSet-conseq]
            if conf >= minConf:
                # print (freqSet-conseq,'-->',conseq,'conf:',conf)
                brl.append((freqSet-conseq, conseq, conf))
                prunedH.append(conseq)
        return prunedH
    # Recursively generate rules with larger consequents from a frequent itemset
    def rulesFromConseq(self,freqSet, H, supportData, brl, minConf):
        minConf = self.getDataConfidence()
        m = len(H[0])
        if (len(freqSet) > (m + 1)):
            Hmp1 = self.aprioriGen(H, m+1)
            Hmp1 = self.calcConf(freqSet, Hmp1, supportData, brl, minConf)
            if (len(Hmp1) > 1):
                self.rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
if __name__ == '__main__':
    GUI()
2) Imported module: fp.py
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 10:48:56 2019
@author: 29493
"""
#import GUI
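# Wrap each transaction as a frozenset so it can be used as a dictionary key; each record is mapped to a count of 1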
def transfer2FrozenDataSet(dataSet):
    frozenDataSet = {}
    for elem in dataSet:
        frozenDataSet[frozenset(elem)] = 1
    return frozenDataSet
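# Module-level buffers filled by TreeNode.disp() and rendered as text by TreeNode.display()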
res1=[]
res2=[]
res3=[]
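# One FP-tree node: item name, count, parent link, a link to the next node holding the same item, and child nodes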
class TreeNode:
    def __init__(self, nodeName, count, nodeParent):
        self.nodeName = nodeName
        self.count = count
        self.nodeParent = nodeParent
        self.nextSimilarItem = None
        self.children = {}
    def increaseC(self, count):
        self.count += count
    def disp(self, ind=1):
        res1.append(self.nodeName)
        res2.append(self.count)
        res3.append(ind)
        for child in self.children.values():
            child.disp(ind + 1)
    def display(self):
        s=''
        for i in range(0,len(res1)):
            s+=' ' * res3[i]+res1[i]+' '+str(res2[i])+'\n'
        return s
def createFPTree(frozenDataSet, minSupport):
    # first scan of the dataset: drop items whose support is below minSupport
    headPointTable = {}
    for items in frozenDataSet:
        for item in items:
            headPointTable[item] = headPointTable.get(item, 0) + frozenDataSet[items]
    headPointTable = {k:v for k,v in headPointTable.items() if v >= minSupport}
    frequentItems = set(headPointTable.keys())
    if len(frequentItems) == 0: return None, None
    for k in headPointTable:
        headPointTable[k] = [headPointTable[k], None]
    fptree = TreeNode("null", 1, None)
    # second scan of the dataset: keep only the frequent items of each record, ordered by support, and insert them into the tree
    for items,count in frozenDataSet.items():
        frequentItemsInRecord = {}
        for item in items:
            if item in frequentItems:
                frequentItemsInRecord[item] = headPointTable[item][0]
        if len(frequentItemsInRecord) > 0:
            orderedFrequentItems = [v[0] for v in sorted(frequentItemsInRecord.items(), key=lambda v:v[1], reverse = True)]
            updateFPTree(fptree, orderedFrequentItems, headPointTable, count)
    return fptree, headPointTable
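# Insert one support-ordered transaction into the FP-tree, sharing prefixes and incrementing counts along the way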
def updateFPTree(fptree, orderedFrequentItems, headPointTable, count):
    #handle the first item
    if orderedFrequentItems[0] in fptree.children:
        fptree.children[orderedFrequentItems[0]].increaseC(count)
    else:
        fptree.children[orderedFrequentItems[0]] = TreeNode(orderedFrequentItems[0], count, fptree)
        #update headPointTable
        if headPointTable[orderedFrequentItems[0]][1] == None:
            headPointTable[orderedFrequentItems[0]][1] = fptree.children[orderedFrequentItems[0]]
        else:
            updateHeadPointTable(headPointTable[orderedFrequentItems[0]][1], fptree.children[orderedFrequentItems[0]])
    #handle other items except the first item
    if(len(orderedFrequentItems) > 1):
        updateFPTree(fptree.children[orderedFrequentItems[0]], orderedFrequentItems[1::], headPointTable, count)
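# Append targetNode to the end of the node-link chain for its item in the header table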
def updateHeadPointTable(headPointBeginNode, targetNode):
    while(headPointBeginNode.nextSimilarItem != None):
        headPointBeginNode = headPointBeginNode.nextSimilarItem
    headPointBeginNode.nextSimilarItem = targetNode
def mineFPTree(headPointTable, prefix, frequentPatterns, minSupport):
    #for each item in headPointTable, find conditional prefix path, create conditional fptree, then iterate until there is only one element in conditional fptree
    headPointItems = [v[0] for v in sorted(headPointTable.items(), key = lambda v:v[1][0])]
    if(len(headPointItems) == 0): return
    for headPointItem in headPointItems:
        newPrefix = prefix.copy()
        newPrefix.add(headPointItem)
        support = headPointTable[headPointItem][0]
        frequentPatterns[frozenset(newPrefix)] = support
        prefixPath = getPrefixPath(headPointTable, headPointItem)
        if(prefixPath != {}):
            conditionalFPtree, conditionalHeadPointTable = createFPTree(prefixPath, minSupport)
            if conditionalHeadPointTable != None:
                mineFPTree(conditionalHeadPointTable, newPrefix, frequentPatterns, minSupport)
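# Collect the conditional pattern base of one item: every prefix path leading to a node of that item, weighted by that node's count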
def getPrefixPath(headPointTable, headPointItem):
    prefixPath = {}
    beginNode = headPointTable[headPointItem][1]
    prefixs = ascendTree(beginNode)
    if((prefixs != [])):
        prefixPath[frozenset(prefixs)] = beginNode.count
    while(beginNode.nextSimilarItem != None):
        beginNode = beginNode.nextSimilarItem
        prefixs = ascendTree(beginNode)
        if (prefixs != []):
            prefixPath[frozenset(prefixs)] = beginNode.count
    return prefixPath
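# Climb from a node up to the root and collect the item names along the path (excluding the node itself and the root)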
def ascendTree(treeNode):
    prefixs = []
    while((treeNode.nodeParent != None) and (treeNode.nodeParent.nodeName != 'null')):
        treeNode = treeNode.nodeParent
        prefixs.append(treeNode.nodeName)
    return prefixs
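# Generate strong association rules from every frequent itemset that contains at least two items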
def rulesGenerator(frequentPatterns, minConf, rules):
    for frequentset in frequentPatterns:
        if(len(frequentset) > 1):
            getRules(frequentset,frequentset, rules, frequentPatterns, minConf)
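# Return a frozenset equal to the given set with the single element str removed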
def removeStr(set, str):
    tempSet = []
    for elem in set:
        if(elem != str):
            tempSet.append(elem)
    tempFrozenSet = frozenset(tempSet)
    return tempFrozenSet
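# Recursively split currentset into antecedent/consequent pairs and keep rules whose confidence reaches minConf, avoiding duplicates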
def getRules(frequentset,currentset, rules, frequentPatterns, minConf):
    for frequentElem in currentset:
        subSet = removeStr(currentset, frequentElem)
        confidence = frequentPatterns[frequentset] / frequentPatterns[subSet]
        if (confidence >= minConf):
            flag = False
            for rule in rules:
                if(rule[0] == subSet and rule[1] == frequentset - subSet):
                    flag = True
            if(flag == False):
                rules.append((subSet, frequentset - subSet, confidence))
            if(len(subSet) >= 2):
                getRules(frequentset, subSet, rules, frequentPatterns, minConf)