# parse_data.py
import pandas as pd
import os, sys, collections
from math import log2 as log
def parse_data(season_file, teams_file, sos_file, output_dir="data"):
    """Turn raw game results and strength-of-schedule data into two CSV files.

    Two files are written into ``output_dir``:

    * ``season_results.csv`` -- two rows per game (one from the winner's
      point of view, one from the loser's) with the columns listed in
      ``team_data_cols``. ``GameNum`` is the 1-based count of games the team
      has played so far in that season, in file order.
    * ``sos_results.csv`` -- one row per strength-of-schedule entry with the
      columns ``Year``, ``TeamID``, ``SOS`` and ``AdjWin``. ``AdjWin`` is a
      weighted win fraction in which each game is weighted by ``log2`` of the
      team's game number, so a team's first game carries zero weight.

    School names in the strength-of-schedule file are matched to team IDs by
    lower-casing them and comparing against the ``TeamNameSpelling`` column,
    retrying with periods removed and/or hyphens replaced by spaces. When a
    spelling occurs more than once in the spellings file the first
    occurrence is used.

    Strength-of-schedule entries that cannot be turned into a row (no
    matching spelling, no games recorded for that team in that year, or a
    total game weight of zero) are skipped and reported once on stderr
    instead of aborting the run.

    Args:
        season_file: CSV of game results with ``Season``, ``WTeamID``,
            ``LTeamID`` and ``W``/``L``-prefixed ``Score``, ``FGA``, ``FGM``,
            ``FTA``, ``OR``, ``TO`` and ``Ast`` columns.
        teams_file: latin-1 encoded CSV with ``TeamNameSpelling`` and
            ``TeamID`` columns.
        sos_file: CSV with ``SCHOOL``, ``YEAR`` and ``SOS`` columns.
        output_dir: Directory that receives the two output files; created
            if it does not exist. Defaults to ``"data"``.

    Returns:
        None. The results are the two CSV files.
    """
    # Read the comma-delimited inputs; the header row supplies column names.
    season_data = pd.read_csv(season_file, delimiter=',')
    team_info = pd.read_csv(teams_file, delimiter=',', encoding="latin1")
    sos_info = pd.read_csv(sos_file, delimiter=',')

    team_data_cols = [
        "Year",
        "TeamID",
        "GameNum",
        "Points",
        "FGA",
        "FGM",
        "FTA",
        "OR",
        "TO",
        "Ast",
        "OpPoints",
        "OpFGA",
        "OpFGM",
        "OpFTA",
        "OpOR",
        "OpTO",
        "OpAst",
        "Result",
    ]
    # Input column suffixes, in the order of the Points..Ast output columns.
    stat_fields = ("Score", "FGA", "FGM", "FTA", "OR", "TO", "Ast")

    # Rows are collected in a list and converted once at the end; growing a
    # DataFrame one ``.loc`` assignment at a time is quadratic.
    team_rows = []
    # Keyed by (season, team) so counts restart per season without relying
    # on the rows being sorted by season.
    game_count = collections.Counter()
    # season -> team -> [weighted wins, weighted games]
    win_percentage = {}
    for game in season_data.itertuples(index=False):
        season = game.Season
        w_key = (season, game.WTeamID)
        l_key = (season, game.LTeamID)
        w_stats = [getattr(game, "W" + field) for field in stat_fields]
        l_stats = [getattr(game, "L" + field) for field in stat_fields]

        team_rows.append(
            [season, game.WTeamID, game_count[w_key] + 1, *w_stats, *l_stats, 1]
        )
        team_rows.append(
            [season, game.LTeamID, game_count[l_key] + 1, *l_stats, *w_stats, 0]
        )
        game_count[w_key] += 1
        game_count[l_key] += 1

        season_record = win_percentage.setdefault(season, {})
        w_record = season_record.setdefault(game.WTeamID, [0, 0])
        l_record = season_record.setdefault(game.LTeamID, [0, 0])
        # Weight each game by log2 of its game number for that team.
        w_weight = log(game_count[w_key])
        w_record[0] += w_weight
        w_record[1] += w_weight
        l_record[1] += log(game_count[l_key])

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    team_data = pd.DataFrame(team_rows, columns=team_data_cols)
    team_data.to_csv(os.path.join(output_dir, "season_results.csv"), index=False)

    sos_cols = [
        "Year",
        "TeamID",
        "SOS",
        "AdjWin",
    ]

    # Build the spelling -> TeamID lookup once instead of scanning the
    # spellings table for every strength-of-schedule row.
    spelling_to_id = {}
    for spelling, team_id in zip(team_info['TeamNameSpelling'], team_info['TeamID']):
        spelling_to_id.setdefault(spelling, team_id)

    sos_rows = []
    bad = set()  # (year, school) entries that could not be converted to a row
    for item in sos_info.itertuples(index=False):
        name = item.SCHOOL.lower()
        # Spelling variants, tried in order until one matches.
        candidates = (
            name,
            name.replace(".", ""),
            name.replace("-", " "),
            name.replace(".", "").replace("-", " "),
        )
        record = None
        tid = None
        for candidate in candidates:
            if candidate in spelling_to_id:
                tid = int(spelling_to_id[candidate])
                record = win_percentage.get(item.YEAR, {}).get(tid)
                break
        # A zero denominator means the team's only game was its first one
        # (weight log2(1) == 0), so no weighted win fraction is defined.
        if record is None or record[1] == 0:
            bad.add((item.YEAR, item.SCHOOL))
            continue
        sos_rows.append([item.YEAR, tid, item.SOS, record[0] / record[1]])

    sos_data = pd.DataFrame(sos_rows, columns=sos_cols)
    sos_data.to_csv(os.path.join(output_dir, "sos_results.csv"), index=False)

    if bad:
        print(
            "parse_data: skipped %d strength-of-schedule entries: %s"
            % (len(bad), sorted(bad, key=str)),
            file=sys.stderr,
        )
    return
if __name__ == "__main__":
    # Script entry point: run the parser on the default input files under
    # data/ (game results, team-name spellings, strength of schedule).
    parse_data(
        "data/RegularSeasonDetailedResults.csv",
        "data/TeamSpellings.csv",
        "data/StrengthOfSchedule.csv",
    )