-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathohn.py
79 lines (65 loc) · 3.39 KB
/
ohn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
from lxml import html
from lxml.html.soupparser import fromstring
from hashlib import sha1
from re import split
import requests, sys, math
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter startup, so the module is reloaded to get it back. Switching the
# default from ASCII to UTF-8 lets the implicit str/unicode conversions in this
# script (string concatenation, hashing, file writes) handle non-ASCII text.
# NOTE(review): this is a process-wide setting, not local to this module.
reload(sys)
sys.setdefaultencoding('utf-8')
class Spidey:
    """Scraper for the archived Optimal Health Network live-chat logs.

    ``crawl`` walks from the archive index page to the per-year listing pages
    and then to each chat page, writing every chat line to a TSV file.
    The module targets Python 2 (it relies on ``reload(sys)``).
    """

    def _fetch(self, url):
        """Download *url* with ``requests`` and return the raw response body."""
        return requests.get(url).content

    def _absolute(self, base, href):
        """Return *href* as a URL that ``requests`` can fetch.

        ``requests`` rejects URLs without a scheme, so protocol-relative
        (``//host/...``) and server-relative (``/path``) hrefs are completed
        using *base*; anything else is returned unchanged.
        """
        if href.startswith('//'):
            return 'https:' + href
        if href.startswith('/'):
            return base + href
        return href

    def _get_yearly_listings(self, url):
        """Return the ``<a class="subcategory_link">`` elements found at *url*."""
        tree = html.fromstring(self._fetch(url))
        return tree.xpath("//a[@class='subcategory_link']")

    def _get_chat_listings(self, url):
        """Return one ``[href, title, text_nodes]`` entry per chat link at *url*.

        Chat links are the anchors matching ``//font/b/a``. ``href`` and
        ``title`` are the stripped attribute values; ``text_nodes`` is the list
        of text nodes inside the anchor (it may be empty).
        """
        tree = html.fromstring(self._fetch(url))
        listings = []
        for anchor in tree.xpath("//font/b/a"):
            href = anchor.get('href')
            title = anchor.get('title')
            # An anchor missing either attribute cannot be followed or
            # labelled; skip it instead of failing on None.strip().
            if href is None or title is None:
                continue
            listings.append([href.strip(), title.strip(), anchor.xpath('.//text()')])
        return listings

    def _get_chats(self, url):
        """Return ``[timestamp, username, message]`` for each chat row at *url*.

        Rows are the ``<tr>`` children of tables that follow an ``<h1>``. A row
        is kept only when it has a timestamp cell (style contains
        ``color:#888;``), an ``<em>`` username and a ``<strong>`` message; only
        the first text node of each is used. The username is punycode-encoded.
        """
        tree = html.soupparser.fromstring(self._fetch(url), features='html.parser')
        posts = []
        for row in tree.xpath("//table[preceding::h1]/tr"):
            timestamp = row.xpath(".//td[contains(@style,'color:#888;')]/text()")
            username = row.xpath(".//td/em/text()")
            message = row.xpath(".//td/strong/text()")
            if timestamp and username and message:
                posts.append([timestamp[0], username[0].encode('punycode'), message[0]])
        return posts

    def crawl(self, dir='ohn'):
        """Scrape every archived chat and write the lines to ``<dir>/chats.tsv``.

        Columns: thread_id, chat_title, date, timestamp, user, message, where
        ``thread_id`` is the SHA-1 hex digest of the chat title plus date.
        *dir* must already exist.
        """
        base = 'https://web.archive.org'
        main_url = base + '/web/20130129082319/http://www.optimalhealthnetwork.com/Alternative-Health-Live-Chat-Log-Archive-s/196.htm'
        year_listings = self._get_yearly_listings(main_url)
        # The context manager closes the file even if a request or parse
        # raises part-way through the crawl.
        with open(dir + '/chats.tsv', 'w') as out:
            out.write('thread_id\tchat_title\tdate\ttimestamp\tuser\tmessage\n')
            for year_listing in year_listings:
                year_href = year_listing.get('href')
                if not year_href:
                    continue
                chat_listings = self._get_chat_listings(self._absolute(base, year_href))
                for chat_url, chat_title, title_text in chat_listings:
                    # Without link text there is nothing to derive the date from.
                    if not title_text:
                        continue
                    # The first 21 characters of the link text are dropped --
                    # presumably a fixed prefix before the date; TODO confirm.
                    date = title_text[0][21:].strip()
                    # Hash the UTF-8 bytes explicitly rather than relying on the
                    # process-wide default encoding.
                    thread_id = sha1((chat_title + date).encode('utf-8')).hexdigest()
                    for timestamp, username, message in self._get_chats(self._absolute(base, chat_url)):
                        out.write('\t'.join([thread_id, chat_title, date, timestamp, username, message]) + '\n')
class TestSpidey(object):
    """Checks that each Spidey scraping helper returns the expected item count.

    Every test calls the helper on a fixed web.archive.org snapshot URL, so the
    tests need network access.
    """

    def test_get_yearly_listings(self):
        """The archive index snapshot yields 9 yearly listing links."""
        index_url = 'https://web.archive.org/web/20130129082319/http://www.optimalhealthnetwork.com/Alternative-Health-Live-Chat-Log-Archive-s/196.htm'
        found = Spidey()._get_yearly_listings(index_url)
        assert len(found) == 9

    def test_get_chat_listings(self):
        """The 2012 listing snapshot yields 4 chat links."""
        year_url = 'https://web.archive.org/web/20120212091416/https://www.optimalhealthnetwork.com/Alternative-Health-Live-Chat-Log-Archive-2012-s/793.htm'
        found = Spidey()._get_chat_listings(year_url)
        assert len(found) == 4

    def test_get_chats(self):
        """The essential-oils chat snapshot yields 102 chat lines."""
        chat_url = 'https://web.archive.org/web/20120209041230/http://www.optimalhealthnetwork.com/Healing-with-Essential-Oils-Alternative-Health-Live-Chat-s/794.htm'
        found = Spidey()._get_chats(chat_url)
        assert len(found) == 102
if __name__ == '__main__':
    # Executed as a script: run the full crawl with the default output directory.
    spider = Spidey()
    spider.crawl()