healthse.py
# -*- coding: utf-8 -*-
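# Scraper for Medical Sciences Stack Exchange (https://medicalsciences.stackexchange.com):
# crawls the question listings, then exports questions, answers, and comments to TSV files.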
from lxml import html
from lxml.html.soupparser import fromstring
from hashlib import sha1
from re import split
from requests.exceptions import ConnectionError
import requests, sys, math, time
import pandas as pd
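# Python 2 hack: make UTF-8 the default codec so implicit str/unicode conversions don't raise.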
reload(sys)
sys.setdefaultencoding('utf-8')
class Spidey:
    paging = 50
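
    # Return the number of listing pages, derived from the total question count shown on the listing.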
    def _get_pages(self, url):
        try:
            page = requests.get(url)
        except ConnectionError as ce:
            print ce
            return 0
        tree = html.fromstring(page.content)
        num_questions = tree.xpath("//div[contains(@class,'mr12')]/text()")
        if num_questions:
            num_questions = num_questions[2].split()[0].strip()
        else:
            return 0
        num_questions = int(num_questions.replace(',', ''))
        extra = num_questions % self.paging
        if extra > 0:
            extra = 1
        return (num_questions / self.paging) + extra
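
    # Collect the question URLs on one listing page, returned as a comma-separated string.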
    def _get_question_links(self, base, url):
        question_links = ""
        try:
            page = requests.get(base + url)
        except ConnectionError as ce:
            print ce
            return question_links
        tree = html.fromstring(page.content)
        list_questions = tree.xpath("//div[@class='summary']/h3/a[@class='question-hyperlink']")
        if len(list_questions) == 0:
            return question_links  # keep the return type consistent (empty string, not an empty list)
        for q in list_questions:
            question_links += base + q.get('href') + ","
        return question_links
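
    # Extract title, body, timestamp, author, votes, favorites, and tags from a question page;
    # returns None if no title is found.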
    def _get_question_details(self, tree):
        qtitle = tree.xpath("//h1[@itemprop='name']/a/text()")
        if qtitle:
            qtitle = qtitle[0]
        else:
            return None
        qbody = tree.xpath("//div[contains(@class,'postcell')]/div/p/text()")
        qbody = ' '.join(qbody).replace('\n', '<br />').replace('\r', '<br />').replace('\t', ' ')
        qbody = qbody.encode('punycode')
        qtimestamp = tree.xpath("//span[@class='relativetime']")
        if qtimestamp:
            qtimestamp = qtimestamp[0].get('title')
        else:
            qtimestamp = ''
        quser = tree.xpath("//div[@class='user-details']/a/text()")
        if quser:
            quser = quser[0]
        else:
            quser = ''
        qvote = tree.xpath("//div[@itemprop='upvoteCount']/text()")
        if qvote:
            qvote = int(qvote[0])
        else:
            qvote = ''
        qfavorite = tree.xpath("//div[contains(@class,'js-favorite-count')]/text()")
        if qfavorite:
            qfavorite = int(qfavorite[0])
        else:
            qfavorite = 0
        qtags = tree.xpath("//div[contains(@class,'post-taglist')]/div/a/text()")
        qtags = ', '.join(qtags)
        return [qtitle, qbody, qtimestamp, quser, qvote, qfavorite, qtags]
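
    # Collect the comments attached to the post (question or answer) with the given id.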
    def _get_comments(self, tree, qid):
        comments = tree.xpath("//div[@id='comments-" + qid + "']/ul/li/div[contains(@class,'comment-text')]/div[contains(@class,'comment-body')]")
        all_comments = []
        for cmt in comments:
            cbody = cmt.xpath(".//span[@class='comment-copy']")
            if cbody:
                cbody = cbody[0].text_content().replace('\n', '<br />').replace('\r', '<br />').replace('\t', ' ')
            else:
                continue
            cauthor = cmt.xpath(".//a[@class='comment-user']/text()")
            if cauthor:
                cauthor = cauthor[0]
            else:
                cauthor = ""
            cdate = cmt.xpath(".//span[@class='comment-date']/span")
            if cdate:
                cdate = cdate[0].get('title')
            else:
                cdate = ''
            all_comments.append([qid, cbody, cdate, cauthor])
        return all_comments
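
    # Extract every answer on a question page: id, share URL, body, timestamp, author, and votes.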
    def _get_question_answers(self, base, tree):
        all_answers = []
        answers = tree.xpath("//div[contains(@class,'answercell')]")
        for ans in answers:
            aid = ans.xpath(".//div/div/div[@class='post-menu']/a[contains(@class,'js-share-link')]")
            if not aid:
                continue
            aurl = base + aid[0].get('href')
            aid = aid[0].get('href')[3:]  # Skip /a/
            abody = ans.xpath(".//div[@class='post-text']/p/text()")
            abody = ' '.join(abody).replace('\n', '<br />').replace('\r', '<br />').replace('\t', ' ')
            auser = ans.xpath(".//div[@class='user-details']/a/text()")
            if auser:
                auser = auser[0]
            else:
                auser = ""
            atime = ans.xpath(".//div[@class='user-action-time']/span[@class='relativetime']")
            if atime:
                atime = atime[0].get('title')
            else:
                atime = ''
            avotes = tree.xpath("//div[@data-answerid='" + aid + "']/div/div/div[contains(@class,'js-voting-container')]/div[@itemprop='upvoteCount']/text()")
            if avotes:
                avotes = avotes[0]
            else:
                avotes = '0'
            all_answers.append([aid, aurl, abody, atime, auser, avotes])
        return all_answers
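
    # Crawl every question page and write questions, question comments, answers, and answer
    # comments as TSV files under the output directory (which must already exist).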
    def crawl(self, dir='healthse'):
        base = 'https://medicalsciences.stackexchange.com'
        query = '/questions?pagesize=' + str(self.paging) + '&sort=newest&page='
        pages = self._get_pages(base + query + '1')
        question_urls = ""
        for i in range(1, pages + 1):
            question_urls += self._get_question_links(base, query + str(i))
        question_urls = question_urls[:-1].split(',')
        counts = 0
        questions_df = pd.DataFrame(columns=['question_id', 'question_url', 'question_title', 'question_body', 'time_stamp', 'username', 'votes', 'favorites', 'tags'])
        qcomments_df = pd.DataFrame(columns=['question_id', 'comment', 'time_stamp', 'username'])
        answers_df = pd.DataFrame(columns=['question_id', 'answer_id', 'answer_url', 'answer', 'time_stamp', 'username', 'votes'])
        acomments_df = pd.DataFrame(columns=['question_id', 'answer_id', 'comment', 'time_stamp', 'username'])
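
        # Visit each question page in turn and append rows to the four DataFrames.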
        for qurl in question_urls:
            time.sleep(1)  # Pause to avoid rate limits
            try:
                page = requests.get(qurl)
            except ConnectionError as ce:
                print ce
                time.sleep(60)  # On a connection error, back off and skip this question
                continue       # otherwise `page` would be undefined or stale below
            counts += 1
            tree = html.soupparser.fromstring(page.content, features='html.parser')
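            # The question id is the path segment immediately after '/questions/' in the URL.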
            sep = '/questions/'
            st = qurl.find(sep)
            ed = qurl[st + len(sep):]
            qid = ed[:ed.find('/')]
            q = self._get_question_details(tree)
            if q is None:
                continue
            questions_df.loc[len(questions_df)] = [qid] + [qurl] + q
            qcmts = self._get_comments(tree, qid)
            for qc in qcmts:
                qcomments_df.loc[len(qcomments_df)] = qc
            qans = self._get_question_answers(base, tree)
            ans_comments = []
            for qa in qans:
                answers_df.loc[len(answers_df)] = [qid] + qa
                ans_comments.extend(self._get_comments(tree, qa[0]))
            for ac in ans_comments:
                acomments_df.loc[len(acomments_df)] = [qid] + ac
        questions_df.to_csv(path_or_buf=dir + '/questions.tsv', sep='\t', index=False, encoding='utf-8')
        qcomments_df.to_csv(path_or_buf=dir + '/question_comments.tsv', sep='\t', index=False, encoding='utf-8')
        answers_df.to_csv(path_or_buf=dir + '/answers.tsv', sep='\t', index=False, encoding='utf-8')
        acomments_df.to_csv(path_or_buf=dir + '/answer_comments.tsv', sep='\t', index=False, encoding='utf-8')
        return counts
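

# Lightweight checks that exercise the scraper against the live site.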
class TestSpidey(object):
    def _get_tree(self, url):
        page = requests.get(url)
        return html.soupparser.fromstring(page.content, features='html.parser')
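
    # The expected values below (page, vote, and comment counts) were recorded at a point in
    # time and may drift as the site changes.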
    def test_get_pages(self):
        assert Spidey()._get_pages('https://medicalsciences.stackexchange.com/questions?pagesize=50&sort=newest&page=1') == 120

    def test_get_question_links(self):
        assert len(Spidey()._get_question_links('https://medicalsciences.stackexchange.com', '/questions?pagesize=50&sort=newest&page=1').split(',')) == 51

    def test_get_question_details1(self):
        assert Spidey()._get_question_details(self._get_tree('https://medicalsciences.stackexchange.com/questions/15473/can-diabetes-relieve-symptoms-of-hemophilia'))[5] == 0

    def test_get_question_details2(self):
        assert Spidey()._get_question_details(self._get_tree('https://medicalsciences.stackexchange.com/questions/15473/can-diabetes-relieve-symptoms-of-hemophilia'))[4] == 0

    def test_get_question_details3(self):
        assert Spidey()._get_question_details(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'))[5] == 4

    def test_get_question_details4(self):
        assert Spidey()._get_question_details(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'))[4] == 20

    def test_get_comments1(self):
        assert len(Spidey()._get_comments(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'), '3585')) == 3

    def test_get_comments2(self):
        assert len(Spidey()._get_comments(self._get_tree('https://medicalsciences.stackexchange.com/questions/43/what-is-the-cause-of-type-1-diabetes'), '43')) == 0

    def test_get_comments3(self):
        assert Spidey()._get_comments(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'), '3585')[1][3] == 'dakre18'

    def test_get_answer_comments1(self):
        assert len(Spidey()._get_comments(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'), '4010')) == 0

    def test_get_answer_comments2(self):
        assert len(Spidey()._get_comments(self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'), '12521')) == 1

    def test_get_question_answers1(self):
        assert len(Spidey()._get_question_answers('https://medicalsciences.stackexchange.com', self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'))) == 2

    def test_get_question_answers2(self):
        assert Spidey()._get_question_answers('https://medicalsciences.stackexchange.com', self._get_tree('https://medicalsciences.stackexchange.com/questions/3585/is-eating-a-meal-with-2000-calories-at-once-any-different-from-eating-4-times-at'))[0][5] == '14'

    def test_get_question_answers3(self):
        assert len(Spidey()._get_question_answers('https://medicalsciences.stackexchange.com', self._get_tree('https://medicalsciences.stackexchange.com/questions/15473/can-diabetes-relieve-symptoms-of-hemophilia'))) == 0

    def test_all(self):
        self.test_get_pages()
        self.test_get_question_links()
        self.test_get_question_details1()
        self.test_get_question_details2()
        self.test_get_question_details3()
        self.test_get_question_details4()
        self.test_get_comments1()
        self.test_get_comments2()
        self.test_get_comments3()
        self.test_get_answer_comments1()
        self.test_get_answer_comments2()
        self.test_get_question_answers1()
        self.test_get_question_answers2()
        self.test_get_question_answers3()

if __name__ == '__main__':
    TestSpidey().test_all()
    Spidey().crawl()