# 对以前的图书推荐系统加注释的代码如下:
import math
import os
import random
from collections import defaultdict
from operator import itemgetter

import pandas as pd
random.seed(0) # Seed the global RNG so the random train/test split is the same on every run
class UserBasedCF(object):
    """User-based collaborative-filtering recommender for songs.

    Typical use: ``generate_dataset`` splits a play log into train and test
    sets, ``calc_user_sim`` builds the user-user similarity matrix from the
    training set, ``recommend`` ranks unheard songs for one user, and
    ``evaluate`` reports precision, recall, coverage and popularity.
    """

    def __init__(self, n_sim_user=20, n_rec_song=10):
        """Create an empty recommender.

        Args:
            n_sim_user: How many of the most similar users are consulted
                when recommending for one user.
            n_rec_song: Maximum number of songs returned per recommendation.
        """
        self.trainset = {}  # user -> {song: play count}, used for training
        self.testset = {}  # user -> {song: play count}, held out for evaluation
        self.n_sim_user = n_sim_user
        self.n_rec_song = n_rec_song
        self.user_sim_mat = {}  # user -> {other user: similarity}
        self.song_popular = {}  # song -> number of training users who played it
        self.song_count = 0  # number of distinct songs in the training set
        print('Similar user number = %d' % self.n_sim_user)
        print('recommended movie number = %d' % self.n_rec_song)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 when denominator is 0."""
        if not denominator:
            return 0.0
        return numerator / (1.0 * denominator)

    def generate_dataset(self, pivot=0.7, filename="userBehavior.csv"):
        """Load the play log and randomly split it into train and test sets.

        Every row goes to the training set with probability ``pivot`` and to
        the test set otherwise. The draw uses the module-level ``random``
        generator, so seeding it beforehand makes the split reproducible.

        Args:
            pivot: Probability that a row is assigned to the training set.
            filename: CSV file to read. It must provide the ``UserID``,
                ``SongName`` and ``count`` columns.
        """
        try:
            data = pd.read_csv(filename, encoding="utf-8", on_bad_lines="skip")
        except TypeError:
            # pandas < 1.3 does not know ``on_bad_lines``; use the legacy
            # spelling of the same "skip malformed rows" option there.
            data = pd.read_csv(filename, encoding="utf-8", error_bad_lines=False)

        trainset_len = 0
        testset_len = 0
        # Walking the three needed columns in lockstep avoids building one
        # Series object per row, which is what ``iterrows`` does.
        for user, song, count in zip(data["UserID"], data["SongName"], data["count"]):
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[song] = int(count)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[song] = int(count)
                testset_len += 1
        print('split training set and test set succ')
        print('train set = %s' % trainset_len)
        print('test set = %s' % testset_len)

    def calc_user_sim(self):
        """Build the user-user similarity matrix from the training set.

        The similarity of users u and v is the number of songs both have in
        the training set divided by ``sqrt(len(songs_u) * len(songs_v))``.
        Also fills ``song_popular`` and ``song_count``. Results of an earlier
        call are discarded, so the method can safely be called again.
        """
        print('building movie-users inverse table...')
        # Start clean: otherwise a second call would add raw co-occurrence
        # counts on top of similarities that were already normalised.
        self.user_sim_mat.clear()
        self.song_popular.clear()

        # Inverted index: song -> set of users who have it in the training set.
        song2users = defaultdict(set)
        for user, songs in self.trainset.items():
            for song in songs:
                song2users[song].add(user)
        # A song's popularity is simply the size of its listener set.
        for song, listeners in song2users.items():
            self.song_popular[song] = len(listeners)
        print('build song-users inverse table succ')
        self.song_count = len(song2users)
        print('total song number = %d' % self.song_count)

        usersim_mat = self.user_sim_mat
        print('building user co-rated movies matrix...')
        for listeners in song2users.values():
            for u in listeners:
                # Every training user gets a row, even one with no neighbours.
                co_counts = usersim_mat.setdefault(u, defaultdict(int))
                for v in listeners:
                    if u != v:
                        co_counts[v] += 1
        print('build user co-rated songs matrix succ')

        print('calculating user similarity matrix...')
        simfactor_count = 0
        for u, related_users in usersim_mat.items():
            n_u = len(self.trainset[u])
            # Only existing keys are overwritten, so mutating during
            # iteration does not change the dict's size.
            for v, count in related_users.items():
                related_users[v] = count / math.sqrt(n_u * len(self.trainset[v]))
                simfactor_count += 1
        print('calculate user similarity matrix(similarity factor) succ')
        print('Total similarity factor number = %d' % simfactor_count)

    def recommend(self, user):
        """Rank songs the user has not heard yet.

        The ``n_sim_user`` most similar users are consulted; each of their
        training songs that ``user`` does not have scores the sum of the
        similarities of the users who contributed it.

        Args:
            user: User to recommend for.

        Returns:
            A list of at most ``n_rec_song`` ``(song, score)`` tuples, best
            score first. Empty when the user is unknown or has no neighbours.
        """
        K = self.n_sim_user
        N = self.n_rec_song
        # ``get`` keeps unknown users from raising KeyError.
        listened_songs = self.trainset.get(user, {})
        neighbours = sorted(self.user_sim_mat.get(user, {}).items(),
                            key=itemgetter(1), reverse=True)[:K]
        rank = {}
        for similar_user, similarity_factor in neighbours:
            for song in self.trainset[similar_user]:
                if song in listened_songs:
                    continue
                rank[song] = rank.get(song, 0) + similarity_factor
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Print and return precision, recall, coverage and popularity.

        Recommendations are generated for every training user and compared
        with that user's test-set songs.

        Returns:
            Tuple ``(precision, recall, coverage, popularity)``. A metric
            whose denominator is zero is reported as 0.0.
        """
        print('Evaluation start...')
        N = self.n_rec_song
        hit = 0  # recommended songs that appear in the user's test set
        rec_count = 0  # nominal number of recommendation slots (N per user)
        test_count = 0  # test-set songs of the evaluated users
        all_rec_songs = set()  # every distinct song recommended, for coverage
        popular_sum = 0  # sum of log(1 + popularity) over recommended songs
        for user in self.trainset:
            test_songs = self.testset.get(user, {})
            for song, _ in self.recommend(user):
                if song in test_songs:
                    hit += 1
                all_rec_songs.add(song)
                popular_sum += math.log(1 + self.song_popular[song])
            # N is counted even when fewer songs were returned, as before.
            rec_count += N
            test_count += len(test_songs)
        precision = self._ratio(hit, rec_count)
        recall = self._ratio(hit, test_count)
        coverage = self._ratio(len(all_rec_songs), self.song_count)
        popularity = self._ratio(popular_sum, rec_count)
        print('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' % (
            precision, recall, coverage, popularity))
        return precision, recall, coverage, popularity
def main():
    """Run the whole pipeline: split the data, build similarities, print metrics."""
    recommender = UserBasedCF()
    recommender.generate_dataset()
    recommender.calc_user_sim()
    recommender.evaluate()


if __name__ == '__main__':
    main()