-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyelprecommender.py
140 lines (116 loc) · 4.4 KB
/
yelprecommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import os
import os.path
import math
from surprise import (Reader, Dataset, NormalPredictor, BaselineOnly,
accuracy, SVD, SVDpp, NMF, KNNBasic, KNNWithMeans,
KNNWithZScore, KNNBaseline)
from surprise.model_selection import (cross_validate, KFold,
train_test_split, GridSearchCV,
RandomizedSearchCV)
# Always make it pretty.
plt.style.use('ggplot')
# ``%matplotlib inline`` is IPython-only syntax and is a SyntaxError when this
# file is run or imported as a plain Python module. Issue the same magic
# through the IPython API when a shell is available, and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:  # get_ipython only exists inside IPython/Jupyter
    pass
sns.set_style(style="whitegrid")
def inner_to_raw_iid(row):
    '''
    Translate the inner item id stored in a row into its raw item id.

    The lookup is delegated to the module-level ``trainset`` object via its
    ``to_raw_iid`` method.

    Arguments:
        row: a dataframe row (or any mapping) exposing an 'iid' entry that
            holds the inner item id
    Returns:
        the raw item id reported by ``trainset.to_raw_iid``
    '''
    return trainset.to_raw_iid(row['iid'])
def inner_to_raw_uid(row):
    '''
    Translate the inner user id stored in a row into its raw user id.

    The lookup is delegated to the module-level ``trainset`` object via its
    ``to_raw_uid`` method.

    Arguments:
        row: a dataframe row (or any mapping) exposing a 'uid' entry that
            holds the inner user id
    Returns:
        the raw user id reported by ``trainset.to_raw_uid``
    '''
    return trainset.to_raw_uid(row['uid'])
def get_top_n(predictions, n=10):
    '''
    Build the top-N recommendation list for every user in a set of
    predictions.

    Arguments:
        predictions (iterable of 5-tuples): each entry unpacks as
            (user id, item id, true rating, estimated rating, details),
            e.g. the Prediction objects returned by an algorithm's test
            method.
        n (int): how many recommendations to keep per user. Default is 10.
    Returns:
        A defaultdict keyed by user id whose values are lists of
        (item id, estimated rating) tuples, ordered from highest to lowest
        estimate and truncated to at most n entries.
    '''
    top_n = defaultdict(list)

    # Group every (item, estimate) pair under the user it belongs to.
    for prediction in predictions:
        uid, iid, true_r, est, _ = prediction
        top_n[uid].append((iid, est))

    # Rank each user's pairs by estimate, best first, and keep the first n.
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n
def find_and_compare(user_id, top_n):
    '''
    Returns the businesses a user has already reviewed together with the
    businesses recommended to that user, so the two can be compared.

    Reads the module-level dataframes ``utility_3`` (filtered on its
    'user_id' column) and ``business_df`` (joined on 'business_id').

    Arguments:
        user_id (string): user_id string
        top_n (dict): top_n dictionary generated from get_top_n; the value
            for user_id is a list of (business id, predicted rating) pairs
    Returns:
        reviewed_businesses_df (dataframe): dataframe of businesses reviewed
            by the given user, with columns 'business_name', 'review_stars'
            and 'categories'
        recommended_df (dataframe): dataframe of recommended restaurants,
            with columns 'business_name', 'predicted' and 'categories'
    '''
    # Businesses the user has reviewed, enriched with business details.
    reviewed_df = utility_3[utility_3['user_id'] == user_id]
    merged_reviewed_df = reviewed_df.merge(right=business_df,
                                           how='left',
                                           on='business_id')
    reviewed_businesses_df = merged_reviewed_df[['business_name',
                                                 'review_stars',
                                                 'categories']]

    # Build the recommendation frame in one shot rather than appending one
    # row at a time through .loc, which re-allocates on every insert and
    # leaves the 'predicted' column with object dtype.
    top_n_df = pd.DataFrame(list(top_n[user_id]),
                            columns=['business_id', 'predicted'])
    top_n_businesses_df = top_n_df.merge(right=business_df,
                                         how='left',
                                         on='business_id')
    recommended_df = top_n_businesses_df[['business_name',
                                          'predicted',
                                          'categories']]
    return reviewed_businesses_df, recommended_df
def get_user_totals(uid):
    '''
    Count the items rated by a user in the module-level ``trainset_3``.

    Arguments:
        uid (string): the raw id of the user
    Returns:
        (int): the length of the user's entry in ``trainset_3.ur``, or 0
            when a ValueError is raised during the lookup (the user was not
            part of the trainset)
    '''
    try:
        inner_uid = trainset_3.to_inner_uid(uid)
        user_ratings = trainset_3.ur[inner_uid]
        total = len(user_ratings)
    except ValueError:  # user was not part of the trainset
        total = 0
    return total
def get_item_totals(iid):
    '''
    Count the users that rated an item in the module-level ``trainset_3``.

    Arguments:
        iid (string): the raw id of the item
    Returns:
        (int): the length of the item's entry in ``trainset_3.ir``, or 0
            when a ValueError is raised during the lookup (the item was not
            part of the trainset)
    '''
    try:
        inner_iid = trainset_3.to_inner_iid(iid)
        item_ratings = trainset_3.ir[inner_iid]
        total = len(item_ratings)
    except ValueError:  # item was not part of the trainset
        total = 0
    return total