-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProject_TicketsByTime.py
149 lines (77 loc) · 4.68 KB
/
Project_TicketsByTime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn
# Load the raw parking-ticket export
df = pd.read_csv("./Parking_Tickets.csv")
# DateIssued is split positionally: its first 10 characters become Date, and
# the characters from index 11 up to (not including) the last 5 become Time
issued_stamp = df['DateIssued']
df['Date'] = issued_stamp.str[:10]
df['Time'] = issued_stamp.str[11:-5]
# Hour component of Time, rendered through the "{:02}" format
hour_of_time = pd.to_datetime(df['Time'], format= '%H:%M:%S' ).dt.hour
df['Hour'] = hour_of_time.map("{:02}".format)
# Rows whose Time is exactly midnight take their hour from the first two
# characters of TimeIssued; every other row keeps the Hour computed above
is_midnight = df.eval("Time == '00:00:00'")
df['Hour'] = np.where(is_midnight, df['TimeIssued'].str[:2], df['Hour'])
# Parse Date into datetimes; unparseable values are coerced rather than raising
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format = True)
# Split the tickets around today's date. Both comparisons are strict, so rows
# dated exactly today end up in neither `future` nor `current`/`past`.
date_current = pd.to_datetime(datetime.date.today())
future = df[df['Date'] > date_current]
# `current` feeds the by-hour plot and `past` feeds the by-year plots. Each is
# an explicit copy so that the column assignments made on them later operate
# on an independent frame rather than on a slice of df (which would trigger
# pandas' SettingWithCopyWarning).
current = df[df['Date'] < date_current].copy()
past = df[df['Date'] < date_current].copy()
# Hour has been handled as text up to here; convert it for numeric comparison
current['Hour'] = pd.to_numeric(current['Hour'])
# Rows with an hour of 24 or more are set aside in future_hour and removed
# from current
future_hour = current[current['Hour'] >= 24]
current = current[current['Hour'] < 24]
# Only Hour plus one countable column is needed for the grouping, so the
# listed columns are dropped (RecordID is deliberately kept)
current = current.drop(columns=[
    'TicketNumber', 'DateIssued', 'StreetName', 'TimeIssued', 'StreetNumber',
    'LicenseState', 'WaiverRequestDate', 'WaiverGrantedDate', 'AppealDate',
    'AppealGrantedDate', 'ViolationDescription', 'Location',
    'LicensePlateAnon', 'Date', 'Time', 'AppealStatus',
])
grouped_by_hour = current.groupby('Hour')
# Number of rows per Hour, reported for each remaining column
grouped_by_hour_counts = grouped_by_hour.agg(np.size)
# Plot it! axhour is the Axes returned by plot(); the y label is set in a
# separate call so axhour is not rebound to set_ylabel()'s return value.
axhour = grouped_by_hour_counts.plot(legend=False, title="Total Tickets by Hour")
axhour.set_ylabel("Total Number of Tickets")
fighour = axhour.get_figure()
fighour.savefig('Tickets by Hour.png', dpi=1000)
# Clear the current pyplot figure before the next plot is drawn
# (originally noted as "just for TDD purposes")
plt.clf()
# Derive Year and Month from the parsed Date. assign() returns a new frame,
# so the columns are never written into what may be a slice of df (avoids
# pandas' SettingWithCopyWarning).
past = past.assign(Year=past['Date'].dt.year, Month=past['Date'].dt.month)
# Keep only years after 1999: this removes 1999 and anything earlier, which
# seem erroneous
past = past[past['Year'] > 1999]
# Count tickets per year and plot the counts
grouped_by_year = past.groupby(['Year']).size()
# axyear is the Axes returned by plot(); the y label is set in a separate
# call so axyear is not rebound to set_ylabel()'s return value.
axyear = grouped_by_year.plot(title="Number of Tickets by Year", xticks=[2000,2004,2008,2012,2016,2020])
axyear.set_ylabel("Number of Tickets")
figyear = axyear.get_figure()
figyear.savefig('Tickets by Year.png', dpi=1000)
# NOTE(review): this strips spaces from `current`, not from `past`. `current`
# is not used by the grouping below, so the replacement does not affect the
# violation names that get plotted. Confirm whether it was meant for
# past['ViolationDescription']; changing that would alter the legend labels,
# so it is left as-is here. Written as a rebinding instead of inplace=True so
# it never mutates a frame derived from a slice.
current = current.replace(' ', '', regex=True)
# Count rows per (Year, ViolationDescription); every remaining column holds
# the same per-group row count
groupby_year_viol = past.groupby(['Year','ViolationDescription']).agg(np.size).reset_index()
# Keep only the 5 most frequent violations of each year (for easier reading of
# the graph): sort ascending by count within each year, then take the last 5
# rows per year. All rows are ranked; the earlier `.head(50)` pre-filter kept
# only the first 50 rows of each year in group-key order, so a frequent
# violation outside those 50 could never appear in the result.
top5 = groupby_year_viol.sort_values(['Year','RecordID'])
top5 = top5.groupby('Year').tail(5)
# Drop unnecessary columns; TicketNumber carries the per-group count
top5 = top5[['Year','ViolationDescription','TicketNumber']]
# Rename columns
top5.columns = ['Year','ViolationDescription','TotalTickets']
# Shorter alias for the grouped dataframe, for easier test/coding
gyv = top5
# Initialize seaborn and a hand-picked list of distinctive colors.
# The longer commented-out list is not needed while only the top 5 are shown,
# but is kept for displaying all ViolationDescriptions.
seaborn.set(style='ticks')
# distinct_colors = ['#000000','#FF0000','#00FF00','#0000FF','#FF00FF','#00FFFF','#800000','#008000','#000080','#808000','#800080','#008080','#C0C0C0','#808080','#9999FF','#993366','#FFFFCC','#660066','#FF8080','#0066CC','#FF6600','#FFFF00', '#003300','#993366','#339966','#FF99CC','#FFFF99']
some_colors = ['#59FFA0', '#FF4242', '#FB62F6', '#645DD7', '#41BA75', '#FF9797', '#FCA9FA', '#AAA6E9', '#2E2B62', '#741E1E']
# Use the hand-picked colors as the seaborn palette
seaborn.set_palette(some_colors)
# Scatter TotalTickets against Year, colour-coded by ViolationDescription,
# with explicit xticks and a legend
fg = seaborn.FacetGrid(data=gyv, hue='ViolationDescription', aspect=1.5)
fg.fig.suptitle("Top 5 Violations Each Year")
fg.map(plt.scatter, 'Year', 'TotalTickets').add_legend().set(xticks=[2000,2004,2008,2012,2016,2020])
# NOTE(review): no file extension is given here, unlike the other two saved
# plots; confirm the resulting file name is the intended one
fg.savefig("Top 5 Violations Each Year", dpi=1000)