写在之前

本书涉及的源程序和数据都可以在以下网站中找到:http://guidetodatamining.com/ 这本书理论比较简单,书中错误较少,动手锻炼较多,如果每个代码都自己写出来,收获不少。总结:适合入门。欢迎转载,转载请注明出处,如有问题欢迎指正。合集地址:https://www.zybuluo.com/hainingwyx/note/559139

协同过滤

相似用户评判标准:曼哈顿距离、欧氏距离、明氏距离。

 
  1. # Manhattan.py

  2. users = {

  3. "Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5,

  4. "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5,

  5. "Vampire Weekend": 2.0},

  6. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,

  7. "Phoenix": 2.0,"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},

  8. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,

  9. "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},

  10. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,

  11. "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,

  12. "Vampire Weekend": 2.0},

  13. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,

  14. "The Strokes": 4.0, "Vampire Weekend": 1.0},

  15. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,

  16. "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,

  17. "Vampire Weekend": 4.0},

  18. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,

  19. "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},

  20. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,

  21. "Slightly Stoopid": 2.5, "The Strokes": 3.0}

  22.        }

  23.  

  24. def manhattan(rating1, rating2):

  25.    """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries

  26.       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""

  27.    distance = 0

  28.    commonRatings = False

  29.    for key in rating1:

  30.        if key in rating2:

  31.            distance += abs(rating1[key] - rating2[key])

  32.            commonRatings = True

  33.    if commonRatings:

  34.        return distance

  35.    else:

  36.        return -1 #Indicates no ratings in common

  37.  

  38. def computeNearestNeighbor(username, users):

  39.    """creates a sorted list of users based on their distance to username"""

  40.    distances = []

  41.    for user in users:

  42.        if user != username:

  43.            distance = manhattan(users[user], users[username])

  44.            distances.append((distance, user))

  45.    # sort based on distance -- closest first

  46.    distances.sort()

  47.    return distances

  48.  

  49. def recommend(username, users):

  50.    """Give list of recommendations"""

  51.    # first find nearest neighbor

  52.    nearest = computeNearestNeighbor(username, users)[0][1]

  53.    print nearest

  54.    recommendations = []

  55.    # now find bands neighbor rated that user didn't

  56.    neighborRatings = users[nearest]

  57.    userRatings = users[username]

  58.    for artist in neighborRatings:

  59.        if not artist in userRatings:

  60.            recommendations.append((artist, neighborRatings[artist]))

  61.    # using the fn sorted for variety - sort is more efficient

  62.    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)

 
  1. # -*- coding: utf-8 -*-

  2. from math import sqrt

  3.  

  4. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},

  5. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},

  6. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},

  7. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},

  8. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},

  9. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},

  10. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},

  11. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}

  12. }

  13.  

  14. #明氏距离

  15. def minkowski(rating1,rating2,r):

  16.  distance=0

  17.  commonRatings=False

  18.  for key in rating1:

  19.    if key in rating2:

  20.      distance += pow(abs(rating1[key]-rating2[key]),r)

  21.      commonRatings=True

  22.      distance = pow(distance,1.0/r)

  23.    if commonRatings:

  24.      return distance

  25.    else:

  26.      return -1 #Indicates no ratings in common

  27.  

  28. def computeNearestNeighbor(username, users):

  29.  """creates a sorted list of users based on their distance to username"""

  30.  distances = []

  31.  for user in users:

  32.    if user != username:

  33.      distance = minkowski(users[user], users[username],3)

  34.      distances.append((distance, user))

  35.      # sort based on distance -- closest first

  36.      distances.sort()

  37.      return distances

  38.  

  39. def recommend(username, users):

  40.  """Give list of recommendations"""

  41.  # first find nearest neighbor

  42.  nearest = computeNearestNeighbor(username, users)[0][1]

  43.  print nearest

  44.  recommendations = []

  45.  # now find bands neighbor rated that user didn't

  46.  neighborRatings = users[nearest]

  47.  userRatings = users[username]

  48.  for artist in neighborRatings:

  49.    if not artist in userRatings:

  50.      recommendations.append((artist, neighborRatings[artist]))

  51.      # using the fn sorted for variety - sort is more efficient

  52.  return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)

但是上述距离度量存在一个问题:不同用户的评分尺度可能不同,两人的评分之间可能只相差一个常数,而他们的爱好实际上是相同的。这种情况下可以使用皮尔逊相关系数:

 
  1. # Pearson.py

  2. from math import sqrt

  3.  

  4. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},

  5. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},

  6. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},

  7. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},

  8. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},

  9. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},

  10. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},

  11. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}

  12. }

  13.  

  14. # 这里为了简单使用近似代替

  15. def pearson(rating1,rating2):

  16.  sum_xy=0

  17.  sum_x=0

  18.  sum_y=0

  19.  sum_x2=0

  20.  sum_y2=0

  21.  n=0

  22.  for key in rating1:

  23.    if key in rating2:

  24.      n += 1

  25.      x = rating1[key]

  26.      y = rating2[key]

  27.      sum_xy += x*y

  28.      sum_x += x

  29.      sum_y += y

  30.      sum_x2 += x**2

  31.      sum_y2 += y**2

  32.  denominnator = sqrt(sum_x2-(sum_x**2)/n)*sqrt(sum_y2-(sum_y**2)/n)

  33.  if denominnator == 0:

  34.    return 0

  35.  else:

  36.    return (sum_xy-(sum_x*sum_y)/n)/denominnator

  37.  

  38. def cos_like(rating1,rating2):

  39.  innerProd=0

  40.  vector_x=0

  41.  vectoy_y=0

  42.  for key in rating1:

  43.    if key in rating2:

  44.      x=rating1[key]

  45.      y=rating2[key]

  46.      innerProd += x*y

  47.      vector_x += x**2

  48.      vectoy_y += y**2

  49.  denominnator = sqrt(vector_x) * sqrt(vectoy_y)

  50.  if denominnator == 0:

  51.    return 0

  52.  else:

  53.    return innerProd / denominnator

余弦相似度:$$\cos(x, y)=\frac{x \cdot y}{\|x\| \times \|y\|}$$ 总结:如果数据稠密,使用欧氏距离;如果数据稀疏,使用余弦相似度;如果不同用户的评级范围(评分尺度)不同,使用皮尔逊相关系数。但是,如果仅仅基于一个最相似的用户进行推荐,该用户个人的怪癖也会被一并推荐。

k近邻:利用 k 个最相似的用户共同确定推荐结果,k 的取值和具体应用有关。利用皮尔逊相关系数来确定每个近邻的影响因子(权重)。

 
  1. # A dictionary of movie critics and their ratings of a small

  2. # set of movies

  3. critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,

  4. 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,

  5. 'The Night Listener': 3.0},

  6. 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,

  7. 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,

  8. 'You, Me and Dupree': 3.5},

  9. 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,

  10. 'Superman Returns': 3.5, 'The Night Listener': 4.0},

  11. 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,

  12. 'The Night Listener': 4.5, 'Superman Returns': 4.0,

  13. 'You, Me and Dupree': 2.5},

  14. 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,

  15. 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,

  16. 'You, Me and Dupree': 2.0},

  17. 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,

  18. 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},

  19. 'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

  20.  

  21.  

  22. from math import sqrt

  23.  

  24. # Returns a distance-based similarity score for person1 and person2

  25. def sim_distance(prefs,person1,person2):

  26.  # Get the list of shared_items

  27.  si={}

  28.  for item in prefs[person1]:

  29.    if item in prefs[person2]: si[item]=1

  30.  

  31.  # if they have no ratings in common, return 0

  32.  if len(si)==0: return 0

  33.  

  34.  # Add up the squares of all the differences

  35.  sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)

  36.  for item in prefs[person1] if item in prefs[person2]])

  37.  

  38.  return 1/(1+sum_of_squares)

  39.  

  40. # Returns the Pearson correlation coefficient for p1 and p2

  41. def sim_pearson(prefs,p1,p2):

  42.  # Get the list of mutually rated items

  43.  si={}

  44.  for item in prefs[p1]:

  45.  if item in prefs[p2]: si[item]=1

  46.  

  47.  # if they are no ratings in common, return 0

  48.  if len(si)==0: return 0

  49.  

  50.  # Sum calculations

  51.  n=len(si)

  52.  

  53.  # Sums of all the preferences

  54.  sum1=sum([prefs[p1][it] for it in si])

  55.  sum2=sum([prefs[p2][it] for it in si])

  56.  

  57.  # Sums of the squares

  58.  sum1Sq=sum([pow(prefs[p1][it],2) for it in si])

  59.  sum2Sq=sum([pow(prefs[p2][it],2) for it in si])

  60.  

  61.  # Sum of the products

  62.  pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])

  63.  

  64.  # Calculate r (Pearson score)

  65.  num=pSum-(sum1*sum2/n)

  66.  den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))

  67.  if den==0: return 0

  68.  

  69.  r=num/den

  70.  

  71.  return r

  72.  

  73. # Returns the best matches for person from the prefs dictionary.

  74. # Number of results and similarity function are optional params.

  75. def topMatches(prefs,person,n=5,similarity=sim_pearson):

  76.  scores=[(similarity(prefs,person,other),other)

  77.  for other in prefs if other!=person]

  78.    scores.sort()

  79.    scores.reverse()

  80.  return scores[0:n]

  81.  

  82. # Gets recommendations for a person by using a weighted average

  83. # of every other user's rankings

  84. def getRecommendations(prefs,person,similarity=sim_pearson):

  85.  totals={}

  86.  simSums={}

  87.  for other in prefs:

  88.    # don't compare me to myself

  89.    if other==person: continue

  90.      sim=similarity(prefs,person,other)

  91.  

  92.    # ignore scores of zero or lower

  93.    if sim<=0: continue

  94.  for item in prefs[other]:

  95.  

  96.    # only score movies I haven't seen yet

  97.    if item not in prefs[person] or prefs[person][item]==0:

  98.    # Similarity * Score

  99.      totals.setdefault(item,0)

  100.      totals[item]+=prefs[other][item]*sim

  101.    # Sum of similarities

  102.      simSums.setdefault(item,0)

  103.      simSums[item]+=sim

  104.  

  105. # Create the normalized list

  106.  rankings=[(total/simSums[item],item) for item,total in totals.items()]

  107.  

  108. # Return the sorted list

  109.  rankings.sort()

  110.  rankings.reverse()

  111.  return rankings

  112.  

  113. def transformPrefs(prefs):

  114.  result={}

  115.  for person in prefs:

  116.  for item in prefs[person]:

  117.    result.setdefault(item,{})

  118.  

  119.  # Flip item and person

  120.  result[item][person]=prefs[person][item]

  121.  return result

  122.  

  123.  

  124. def calculateSimilarItems(prefs,n=10):

  125.  # Create a dictionary of items showing which other items they

  126.  # are most similar to.

  127.  result={}

  128.  # Invert the preference matrix to be item-centric

  129.  itemPrefs=transformPrefs(prefs)

  130.  c=0

  131.  for item in itemPrefs:

  132.  # Status updates for large datasets

  133.  c+=1

  134.  if c%100==0: print "%d / %d" % (c,len(itemPrefs))

  135.  # Find the most similar items to this one

  136.  scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)

  137.  result[item]=scores

  138.  return result

  139.  

  140. def getRecommendedItems(prefs,itemMatch,user):

  141.  userRatings=prefs[user]

  142.  scores={}

  143.  totalSim={}

  144.  # Loop over items rated by this user

  145.  for (item,rating) in userRatings.items( ):

  146.  

  147.  # Loop over items similar to this one

  148.  for (similarity,item2) in itemMatch[item]:

  149.  

  150.    # Ignore if this user has already rated this item

  151.    if item2 in userRatings: continue

  152.    # Weighted sum of rating times similarity

  153.    scores.setdefault(item2,0)

  154.    scores[item2]+=similarity*rating

  155.    # Sum of all the similarities

  156.    totalSim.setdefault(item2,0)

  157.    totalSim[item2]+=similarity

  158.  

  159.  # Divide each total score by total weighting to get an average

  160.  rankings=[(score/totalSim[item],item) for item,score in scores.items( )]

  161.  

  162.  # Return the rankings from highest to lowest

  163.  rankings.sort( )

  164.  rankings.reverse( )

  165.  return rankings

  166.  

  167. def loadMovieLens(path='C:\Users\WangYixin\Desktop\PCI_Code\PCI_Code Folder\chapter2\data'):

  168.  # Get movie titles

  169.  movies={}

  170.  for line in open(path+'/u.item'):

  171.    (id,title)=line.split('|')[0:2]

  172.    movies[id]=title

  173.  

  174.    # Load data

  175.    prefs={}

  176.  for line in open(path+'/u.data'):

  177.    (user,movieid,rating,ts)=line.split('\t')

  178.    prefs.setdefault(user,{})

  179.    prefs[user][movies[movieid]]=float(rating)

  180.  return prefs

 
  1. # -*- coding: utf-8 -*-

  2. # 推荐类

  3. import codecs

  4. from math import sqrt

  5.  

  6. users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,

  7. "Norah Jones": 4.5, "Phoenix": 5.0,

  8. "Slightly Stoopid": 1.5,

  9. "The Strokes": 2.5, "Vampire Weekend": 2.0},

  10.  

  11. "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5,

  12. "Deadmau5": 4.0, "Phoenix": 2.0,

  13. "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},

  14.  

  15. "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,

  16. "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,

  17. "Slightly Stoopid": 1.0},

  18.  

  19. "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,

  20. "Deadmau5": 4.5, "Phoenix": 3.0,

  21. "Slightly Stoopid": 4.5, "The Strokes": 4.0,

  22. "Vampire Weekend": 2.0},

  23.  

  24. "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,

  25. "Norah Jones": 4.0, "The Strokes": 4.0,

  26. "Vampire Weekend": 1.0},

  27.  

  28. "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,

  29. "Norah Jones": 5.0, "Phoenix": 5.0,

  30. "Slightly Stoopid": 4.5, "The Strokes": 4.0,

  31. "Vampire Weekend": 4.0},

  32.  

  33. "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,

  34. "Norah Jones": 3.0, "Phoenix": 5.0,

  35. "Slightly Stoopid": 4.0, "The Strokes": 5.0},

  36.  

  37. "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,

  38. "Phoenix": 4.0, "Slightly Stoopid": 2.5,

  39. "The Strokes": 3.0}

  40. }

  41.  

  42.  

  43.  

  44. class recommender:

  45.  

  46.  def __init__(self, data, k=1, metric='pearson', n=5):

  47.    """ initialize recommender

  48.    currently, if data is dictionary the recommender is initialized

  49.    to it.

  50.    For all other data types of data, no initialization occurs

  51.    k is the k value for k nearest neighbor

  52.    metric is which distance formula to use

  53.    n is the maximum number of recommendations to make"""

  54.    self.k = k

  55.    self.n = n

  56.    self.username2id = {}

  57.    self.userid2name = {}

  58.    self.productid2name = {}

  59.    # for some reason I want to save the name of the metric

  60.    self.metric = metric

  61.    if self.metric == 'pearson':

  62.      self.fn = self.pearson

  63.    #

  64.    # if data is dictionary set recommender data to it

  65.    #

  66.    if type(data).__name__ == 'dict':

  67.      self.data = data

  68.  

  69.  def convertProductID2name(self, id):

  70.    """Given product id number return product name"""

  71.    if id in self.productid2name:

  72.      return self.productid2name[id]

  73.    else:

  74.      return id

  75.  

  76.  

  77.  def userRatings(self, id, n):

  78.    """Return n top ratings for user with id"""

  79.    print ("Ratings for " + self.userid2name[id])

  80.    ratings = self.data[id]

  81.    print(len(ratings))

  82.    ratings = list(ratings.items())

  83.    ratings = [(self.convertProductID2name(k), v)

  84.    for (k, v) in ratings]

  85.      # finally sort and return

  86.      ratings.sort(key=lambda artistTuple: artistTuple[1],

  87.      reverse = True)

  88.      ratings = ratings[:n]

  89.    for rating in ratings:

  90.      print("%s\t%i" % (rating[0], rating[1]))

  91.  

  92.  

  93.  

  94.  

  95.  def loadBookDB(self, path=''):

  96.    """loads the BX book dataset. Path is where the BX files are

  97.    located"""

  98.    self.data = {}

  99.    i = 0

  100.    #

  101.    # First load book ratings into self.data

  102.    #

  103.    f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')

  104.    f.readline() #read the title

  105.    for line in f:

  106.      i += 1

  107.      #separate line into fields

  108.      fields = line.split(';') # still with ""

  109.      user = fields[0].strip('"') #delete “ in the fields

  110.      book = fields[1].strip('"')

  111.      rating = int(fields[2].strip().strip('"'))

  112.      if user in self.data:

  113.        currentRatings = self.data[user]

  114.      else:

  115.        currentRatings = {}

  116.        currentRatings[book] = rating

  117.        self.data[user] = currentRatings

  118.        #line = f.readline()

  119.    f.close()

  120.    #

  121.    # Now load books into self.productid2name

  122.    # Books contains isbn, title, and author among other fields

  123.    #

  124.    f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')

  125.    for line in f:

  126.      i += 1

  127.      #separate line into fields

  128.      fields = line.split(';')

  129.      isbn = fields[0].strip('"')

  130.      title = fields[1].strip('"')

  131.      author = fields[2].strip().strip('"')

  132.      title = title + ' by ' + author

  133.      self.productid2name[isbn] = title

  134.    f.close()

  135.    #

  136.    # Now load user info into both self.userid2name and

  137.    # self.username2id

  138.    #

  139.    f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')

  140.    for line in f:

  141.      i += 1

  142.      #print(line)

  143.      #separate line into fields

  144.      fields = line.split(';')

  145.      userid = fields[0].strip('"')

  146.      location = fields[1].strip('"')

  147.      if len(fields) > 3:

  148.        age = fields[2].strip().strip('"')

  149.      else:

  150.        age = 'NULL'

  151.      if age != 'NULL':

  152.        value = location + ' (age: ' + age + ')'

  153.      else:

  154.        value = location

  155.        self.userid2name[userid] = value

  156.        self.username2id[location] = userid

  157.    f.close()

  158.    print(i)

  159.  

  160.  

  161.  def pearson(self, rating1, rating2):

  162.    sum_xy = 0

  163.    sum_x = 0

  164.    sum_y = 0

  165.    sum_x2 = 0

  166.    sum_y2 = 0

  167.    n = 0

  168.    for key in rating1:

  169.      if key in rating2:

  170.        n += 1

  171.        x = rating1[key]

  172.        y = rating2[key]

  173.        sum_xy += x * y

  174.        sum_x += x

  175.        sum_y += y

  176.        sum_x2 += pow(x, 2)

  177.        sum_y2 += pow(y, 2)

  178.  if n == 0:

  179.    return 0

  180.  # now compute denominator

  181.  denominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)

  182.  * sqrt(sum_y2 - pow(sum_y, 2) / n))

  183.  if denominator == 0:

  184.    return 0

  185.  else:

  186.    return (sum_xy - (sum_x * sum_y) / n) / denominator

  187.  

  188.  

  189.  def computeNearestNeighbor(self, username):

  190.    """creates a sorted list of users based on their distance to

  191.    username"""

  192.    distances = []

  193.    for instance in self.data:

  194.      if instance != username:

  195.        distance = self.fn(self.data[username],

  196.        self.data[instance])

  197.        distances.append((instance, distance))

  198.  # sort based on distance -- closest first

  199.  distances.sort(key=lambda artistTuple: artistTuple[1],

  200.  reverse=True)

  201.  return distances

  202.  

  203.  def recommend(self, user):

  204.    """Give list of recommendations"""

  205.    recommendations = {}

  206.    # first get list of users ordered by nearness

  207.    nearest = self.computeNearestNeighbor(user)

  208.    # now get the ratings for the user

  209.    userRatings = self.data[user]

  210.    # determine the total distance

  211.    totalDistance = 0.0

  212.    for i in range(self.k):

  213.      totalDistance += nearest[i][1]

  214.      # now iterate through the k nearest neighbors

  215.      # accumulating their ratings

  216.    for i in range(self.k):

  217.      # compute slice of pie

  218.      weight = nearest[i][1] / totalDistance

  219.      # get the name of the person

  220.      name = nearest[i][0]

  221.      # get the ratings for this person

  222.      neighborRatings = self.data[name]

  223.      # get the name of the person

  224.      # now find bands neighbor rated that user didn't

  225.    for artist in neighborRatings:

  226.      if not artist in userRatings:

  227.        if artist not in recommendations:

  228.          recommendations[artist] = (neighborRatings[artist] * weight)

  229.        else:

  230.          recommendations[artist] = (recommendations[artist] + neighborRatings[artist] * weight)

  231.      # now make list from dictionary

  232.    recommendations = list(recommendations.items())

  233.    recommendations = [(self.convertProductID2name(k), v)

  234.    for (k, v) in recommendations]

  235.      # finally sort and return

  236.      recommendations.sort(key=lambda artistTuple: artistTuple[1],

  237.      reverse = True)

  238.      # Return the first n items

  239.  return recommendations[:self.n]

  240.  

  241.  ############test code############

  242.  #r = recommender(users)

  243.  #r.recommend('Jordyn')

  244.  #r.recommend('Hailey')

  245.  #r.loadBookDB('BX-CSV-Dump/')

  246.  #r.recommend('171118')

  247.  #r.userRatings('171118', 5)