-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgtalk.py
104 lines (85 loc) · 3.01 KB
/
gtalk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from HTMLParser import HTMLParser
from datetime import datetime
from datetime import timedelta
import json
import os
class GTalkParser(HTMLParser):
    """Collect chat messages from saved chat-log HTML.

    The parser is a small state machine: the value of each start tag's
    ``class`` attribute selects what the following text means (speaker name,
    message text, per-message time, conversation start date or start time).
    Each time text arrives while a message is pending, the message is stamped
    with an integer ``ts`` and appended to the module-level ``out_events``
    list, which must exist before ``feed()`` is called.
    """

    # Message being assembled: {'name': ..., 'message': ...}; None when idle.
    message = None
    # Short name of the most recently recognised speaker.
    currentUser = None
    # Current text interpretation; 0 matches no known state, so text seen
    # before the first recognised class attribute is ignored.
    state = 0
    # Conversation start date text with a trailing space, ready to be joined
    # with the start time text.
    datestr = ''
    # Naive datetime of the most recently time-stamped event.
    starttime = None

    # ``class`` attribute value -> parser state.
    _CLASS_STATES = {
        'aO1': 'user',
        'aOT': 'message',
        'aSy': 'time',
        'aOM': 'startdate',
        'aZc': 'starttime',
    }

    # Display name found in the HTML -> short name stored in the output.
    _USER_ALIASES = {
        'Brangi Brangelina': u'Mela',
        'Kris Jurgowski': u'Kris',
    }

    def handle_starttag(self, tag, attrs):
        """Switch state according to the tag's ``class`` attribute.

        Tags without a ``class`` attribute leave the state untouched; an
        unrecognised class value resets it to ``''`` so the following text
        is ignored.
        """
        for name, value in attrs:
            if name == 'class':
                self.state = self._CLASS_STATES.get(value, '')

    def handle_data(self, data):
        """Consume a run of text according to the current state."""
        data = data.strip()
        if not data:
            return

        if self.state == 'user':
            alias = self._USER_ALIASES.get(data)
            if alias is None:
                # Unknown speaker: report it and keep the previous speaker.
                print(data)
            else:
                self.currentUser = alias
        elif self.state == 'message':
            if not self.message:
                self.message = {'name': self.currentUser, 'message': data}
            else:
                # Further text runs of the same message are joined by a space.
                self.message['message'] += " " + data
        elif self.state == 'time':
            # The clock is advanced even when no message is pending.
            self.update_with_time(data)
            if not self.message:
                return
            self.message['ts'] = self.convert_to_timestamp(self.starttime)
            out_events.append(self.message)
            self.message = None
        elif self.state == 'startdate':
            self.datestr = data + " "
        elif self.state == 'starttime':
            # No zone suffix / %Z here: strptime only recognises UTC, GMT and
            # the host's own zone names for %Z (so a fixed 'EST' fails on
            # other hosts), and the parsed datetime is naive either way.
            self.starttime = datetime.strptime(
                self.datestr + data, '%A, %B %d, %Y %I:%M %p')

    def convert_to_timestamp(self, time):
        """Return whole seconds between ``time`` and the local-time epoch.

        ``time`` is a naive datetime; the reference point is
        ``datetime.fromtimestamp(0)``, i.e. the epoch expressed in the
        host's local time.
        """
        return int((time - datetime.fromtimestamp(0)).total_seconds())

    def update_with_time(self, timestr):
        """Advance ``self.starttime`` to the clock time in ``timestr``.

        ``timestr`` must look like ``'3:04 PM'``. Only hour and minute are
        given, so events within the same minute are kept in order by adding
        one second each, and a clock time earlier than the current one is
        taken to mean the conversation crossed midnight.
        """
        parts = timestr.split(':')
        hour = int(parts[0])
        parts = parts[1].split(' ')
        minute = int(parts[0])
        midday = parts[1]

        # 12-hour -> 24-hour: 12 AM is hour 0, 12 PM stays 12.
        if hour == 12:
            hour = 0 if midday == 'AM' else 12
        elif midday == 'PM':
            hour += 12

        newtime = self.starttime.replace(hour=hour, minute=minute)
        if newtime == self.starttime:
            # Same minute as the previous event: bump by a second so the
            # resulting timestamps stay strictly increasing.
            newtime = newtime + timedelta(seconds=1)
        else:
            newtime = newtime.replace(second=0)
        # NOTE(review): 60+ events within one minute push the clock into the
        # next minute, after which another event in the original minute looks
        # "earlier" and gets a day added below -- confirm whether that matters.
        if newtime < self.starttime:
            newtime += timedelta(days=1)
        self.starttime = newtime
# Shared sink: GTalkParser.handle_data appends each finished message here.
out_events = []
parser = GTalkParser()

# Feed every saved conversation through the one parser instance. Parser state
# carries over from file to file, so the listing is sorted to make the run
# deterministic (os.listdir order is arbitrary).
# To process a single export instead, feed just one file,
# e.g. data/Hangouts/2014-05-09.html.
hangouts_dir = 'data/Hangouts'
for name in sorted(os.listdir(hangouts_dir)):
    with open(os.path.join(hangouts_dir, name)) as html_file:
        # <wbr> tags are removed before parsing so they do not split a word
        # into separate text runs.
        parser.feed(html_file.read().replace("<wbr>", ""))

# Order all messages by their timestamp before writing them out.
sorted_out = sorted(out_events, key=lambda k: float(k[u'ts']))
with open('data/brangi_old.json', 'w') as out_file:
    json.dump(sorted_out, out_file, indent=0)