-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_ratings.py
316 lines (255 loc) · 10.9 KB
/
scrape_ratings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/usr/bin/env python
"""
A script for scraping ratings from the Voobly website.
The list of players must be stored in a file named `players.csv`.
This file starts with a header line, then each following line is the player name
then the voobly profile link.
For example:
```
player-name, voobly-profile-link
TWest,https://www.voobly.com/profile/view/123684015
robo_boro,https://www.voobly.com/profile/view/123905987
smarthy_,https://www.voobly.com/profile/view/124230162
Pete26196,https://www.voobly.com/profile/view/123685133
AkeNo,https://www.voobly.com/profile/view/123723545
```
The output is saved in a file `ratings.csv`.
Each line contains a player name followed by the current and highest rating on each selected ladder (4 ratings with the default ladders), separated by commas.
"""
import csv
import sys
import argparse
import requests
from bs4 import BeautifulSoup
# --- Input: the list of players to scrape -----------------------------------
# Path of the csv file that lists the players and their profile links.
PLAYERS_FILE_PATH = 'players.csv'
# Shown when players.csv does not exist.
PLAYERS_FILE_NOT_FOUND = "The file 'players.csv' does not exist."
# Shown when players.csv exists but reading it raises an OSError.
PLAYERS_FILE_ERROR_MSG = "Cannot read 'players.csv'."

# --- Output: scraped ratings ------------------------------------------------
# Path of the csv file that receives the ratings.
OUT_FILE_PATH = 'ratings.csv'
# Shown when ratings.csv cannot be written.
WRITE_ERROR_MSG = "Cannot write to 'ratings.csv'."
# First column title of the ratings csv header.
RATINGS_HEADER_START = 'Player Name'

# --- Output: players whose uid could not be used ----------------------------
# Path of the csv file that receives the invalid players and uids.
INVALID_FILE_PATH = 'invalid.csv'
# Shown when at least one uid is invalid; format with the number of players.
INVALID_UID_MSG = "{} invalid player uid(s), writing to 'invalid.csv'."
# Shown when invalid.csv cannot be written.
WRITE_ERROR_INVALID = "Cannot write to 'invalid.csv'"

# --- Voobly -----------------------------------------------------------------
# Shown when the login request does not get past the login page.
VOOBLY_LOGIN_FAIL_MSG = (
    'Cannot log in to Voobly.'
    ' Check your username and password.'
)
# Ladder name -> Voobly ladder id.
LADDERS = {
    'RM - Clans': 14,
    'CS - Europe': 100,
    'Beginners RM': 101,
    'Beginners DM': 102,
    'RM - 1v1': 131,
    'RM - Team Games': 132,
    'RM AoFE': 143,
    'AoFE Overall': 144,
    'AoFE RM - 1v1': 145,
    'AoFE RM - TG': 146,
    'VCOM Clan Wars': 148,
    'AoFE Castle Blood': 149,
    'AoFE CS': 150,
    'VCOM Ladder': 151,
    'DM TG': 162,
    'DM 1v1': 163,
}
# Login page; fetched first so the session picks up its cookies.
VOOBLY_LOGIN_URL = 'https://www.voobly.com/login'
# Endpoint that receives the login POST request.
VOOBLY_LOGIN_AUTH_URL = VOOBLY_LOGIN_URL + '/auth'
# Template of a player's ratings page on one ladder.
# Fill in with str.format(uid=<user id>, lid=<ladder id>).
RATINGS_BASE_URL = (
    'https://www.voobly.com/profile/view/{uid}'
    '/Ratings/games/profile/{uid}/{lid}'
)
def load_players(fname=None):
    """
    Returns a dictionary mapping each player name to that player's uids.

    The first column of every row is the player name; each remaining column
    is a Voobly profile url from which a uid is parsed. A header row (first
    cell equal to 'player-name') is skipped when it is the first row, and
    blank lines are ignored. If a name appears on several rows, the last row
    wins.

    Args:
        fname: The file path to the players file. Defaults to
            PLAYERS_FILE_PATH when None.
    Returns:
        A dict mapping a string player name to a list of that player's
        string Voobly user ids. Empty if the file holds no players.
    Raises:
        FileNotFoundError: If the file fname does not exist.
        OSError: If fname cannot be read.
        ValueError: If a profile url is incorrectly formatted.
    """
    if fname is None:
        fname = PLAYERS_FILE_PATH
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends.
    with open(fname, newline='') as player_file:
        # csv.reader yields an empty list for a blank line; indexing such a
        # row would raise IndexError, so drop those rows up front.
        rows = [row for row in csv.reader(player_file) if row]
    if rows and rows[0][0] == 'player-name':
        rows = rows[1:]  # skip header if present
    return {row[0]: [parse_id(url) for url in row[1:]] for row in rows}
def write_ratings(player_ratings, ladders, fname=None):
    """
    Writes the ratings table to fname.

    The first line is a header: RATINGS_HEADER_START followed by a
    'Current <ladder>' and a 'Highest <ladder>' column for every ladder.
    Each following line is a player name followed by that player's ratings.
    Fields are joined with ', '.

    Args:
        player_ratings: A dictionary mapping a string player name to a list
            of rating strings, in the same order as the header columns
            (current then highest, for each ladder in turn).
        ladders: A list of string ladder names.
        fname: The file path to the output file. Defaults to OUT_FILE_PATH
            when None.
    Raises:
        OSError: If fname cannot be written to.
    """
    separator = ', '
    target = OUT_FILE_PATH if fname is None else fname
    with open(target, 'w') as output_file:
        columns = [RATINGS_HEADER_START] + [
            prefix + ladder
            for ladder in ladders
            for prefix in ('Current ', 'Highest ')
        ]
        output_file.write(separator.join(columns) + '\n')
        for name in player_ratings:
            joined = separator.join(player_ratings[name])
            output_file.write('{}, {}\n'.format(name, joined))
def parse_id(voobly_url):
    """
    Returns the player user id from a voobly url.

    A Voobly url has the format 'www.voobly.com/profile/view/uid', where
    the uid is the user's id number. The url may optionally have text
    prepended or appended (including a trailing slash), as long as it
    contains this string. The uid is the path segment that directly follows
    the first 'view' segment.

    Example URLs:
        www.voobly.com/profile/view/123684015
        https://www.voobly.com/profile/view/123684015
        https://www.voobly.com/profile/view/123684015/
        https://www.voobly.com/profile/view/123684015/Ratings/games/profile/123684015/131

    Note: this method simply parses the URL to obtain the uid, it does not
    check whether a Voobly profile with that uid exists.

    Args:
        voobly_url: A voobly url string.
    Returns:
        The player user id parsed from the url, as a string of digits with
        surrounding whitespace removed.
    Raises:
        ValueError: If the url is not correctly formatted.
    """
    error_msg = "The url '{}' is incorrectly formatted.".format(voobly_url)
    try:
        segments = voobly_url.split('/')
        uid = segments[segments.index('view') + 1]
    except (ValueError, IndexError) as e:
        # No 'view' segment, or nothing after it.
        raise ValueError(error_msg) from e
    # Whitespace around a csv cell must not end up inside the uid.
    uid = uid.strip()
    # int() would also accept signs, underscores and padding; a uid that is
    # later pasted into a url must consist of plain ASCII digits only.
    if not uid or not set(uid) <= set('0123456789'):
        raise ValueError(error_msg)
    return uid
def get_ratings(sess, uid_list, lid):
    """
    Returns the current and highest ratings of a player on the given ladder.

    The ratings page of every uid is fetched and the best current rating and
    the best highest rating over all of the accounts are kept. A rating cell
    with no text is not considered (the original code treats an empty
    current rating as an account with 0 games). A rating for which no account
    provides a value defaults to 1600, which also covers an empty uid_list.

    Args:
        sess: The current session logged in to access Voobly profiles.
        uid_list: A list of string Voobly user ids.
        lid: Ladder id, the id of a Voobly ladder, must be a value in LADDERS.
    Returns:
        Two strings: current_rating, highest_rating.
    Raises:
        ValueError: If a player uid is invalid. The ValueError contains the
            invalid uid as a message.
    """
    default_rating = 1600  # used when no account provides a rating
    # None (rather than a magic -1) marks "no rating seen yet", so the
    # sentinel can never leak into the returned strings.
    best = {'Current Rating': None, 'Highest Rating': None}
    for uid in uid_list:
        ratings_url = RATINGS_BASE_URL.format(uid=uid, lid=lid)
        ratings_response = sess.get(ratings_url)
        soup = BeautifulSoup(ratings_response.content, 'html.parser')
        if soup.title.get_text() == 'Page Not Found':
            raise ValueError("{}".format(uid))
        for label in ('Current Rating', 'Highest Rating'):
            # The rating is the text of the element following the label cell.
            text = soup.find('td', text=label).find_next().get_text().strip()
            if not text:
                continue  # blank cell: nothing to compare for this account
            rating = int(text)
            if best[label] is None or rating > best[label]:
                best[label] = rating
    current, highest = (
        default_rating if best[label] is None else best[label]
        for label in ('Current Rating', 'Highest Rating')
    )
    return str(current), str(highest)
def parse_args(args):
    """
    Parses args.

    Args:
        args: List of strings to parse.
    Returns:
        An object containing the parsed arguments. The object has three fields:
            username: Voobly username string.
            password: Voobly password string.
            ladders: List of string names of Voobly ladders from which to
                pull ratings. Defaults to 'RM - 1v1' and 'RM - Team Games'
                when --ladders is not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('username', help='Voobly account username.')
    parser.add_argument('password', help='Voobly account password.')
    # nargs='*' always collects the values into a list (even a single one),
    # so the parsed result needs no str-to-list conversion afterwards.
    parser.add_argument(
        '--ladders', nargs='*',
        default=['RM - 1v1', 'RM - Team Games'],
        # Offer the ladder names ordered by their ladder id.
        choices=sorted(LADDERS, key=LADDERS.get),
        help='Select the ladders from which you want ratings.')
    return parser.parse_args(args)
def main(args):
    """
    Runs the script, loading player ratings from the Voobly website and saving
    them in `ratings.csv`.

    Steps: parse the arguments, load the players file, log in to Voobly,
    fetch the ratings of every player on every selected ladder, write the
    ratings file, then write the players with an invalid uid to
    `invalid.csv`. Every failure prints a message and ends the run early.

    Args:
        args: Usually sys.argv[1:].
    """
    # Shown when a request to Voobly fails at the network level.
    network_error_msg = 'Cannot connect to Voobly. Check your connection.'

    parsed = parse_args(args)
    try:
        player_profiles = load_players()  # player name -> list of uids
    except FileNotFoundError:
        print(PLAYERS_FILE_NOT_FOUND)
        return  # terminate when no players are present
    except OSError:
        print(PLAYERS_FILE_ERROR_MSG)
        return  # terminate when player data cannot be read
    except ValueError as e:
        print(e)
        return  # terminate when player data contains an invalid url

    ratings = {}  # maps a player name to their list of ratings
    invalid_players = {}  # maps a player name to their invalid uid
    try:
        with requests.Session() as sess:
            # initial login page get to populate cookies
            sess.get(VOOBLY_LOGIN_URL)
            login_data = {'username': parsed.username,
                          'password': parsed.password}
            hdr = {'referer': VOOBLY_LOGIN_AUTH_URL}
            login_response = sess.post(VOOBLY_LOGIN_AUTH_URL, data=login_data,
                                       headers=hdr)
            # Voobly login failed if title of the result is 'Login'
            login_soup = BeautifulSoup(login_response.content, 'html.parser')
            if login_soup.title.get_text() == 'Login':
                print(VOOBLY_LOGIN_FAIL_MSG)
                return  # terminate if Voobly login failed

            for player, uid_list in player_profiles.items():
                player_ratings = []
                try:
                    for ladder in parsed.ladders:
                        lid = LADDERS[ladder]
                        current, highest = get_ratings(sess, uid_list, lid)
                        player_ratings.extend((current, highest))
                except ValueError as err:
                    # keep the player out of the good output
                    invalid_players[player] = str(err)
                else:
                    ratings[player] = player_ratings
    except requests.RequestException:
        # Handle network failures like every other error in this function:
        # report and stop instead of ending in a traceback.
        print(network_error_msg)
        return  # terminate when Voobly cannot be reached

    try:
        write_ratings(ratings, parsed.ladders)
    except OSError:
        print(WRITE_ERROR_MSG)
        return  # terminate if the ratings cannot be written

    if not invalid_players:
        return
    try:
        print(INVALID_UID_MSG.format(len(invalid_players)))
        with open(INVALID_FILE_PATH, 'w') as bad_uid_file:
            for player, uid in invalid_players.items():
                bad_uid_file.write('{},{}\n'.format(player, uid))
    except OSError:
        print(WRITE_ERROR_INVALID)
        return  # terminate if the invalid uids cannot be written
# Run the scraper only when this file is executed as a script (not when it is
# imported), passing the command-line arguments without the script name.
if __name__ == '__main__':
    main(sys.argv[1:])