build-dataset.py
import urllib.request

from bs4 import BeautifulSoup


def spider(name, found_titles, url, found):
    """Recursively crawl pages on the site, collecting titles and keywords."""
    try:
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html.parser')
        title = soup.title.string.lower()
        # Pull the comma-separated keyword list from the page's meta tag.
        keywords = soup.select('meta[name="keywords"]')[0]['content'].lower().split(', ')
        if name in keywords:
            keywords.remove(name)
        # Keep only the keywords that actually appear in the page title.
        cleaned_keywords = [k for k in keywords if k in title]
        if cleaned_keywords and title not in found_titles:
            found_titles.append(title)
            print(title)
            print(cleaned_keywords)
            # Append one tab-separated record: title, then space-joined keywords
            # (spaces inside a keyword become underscores).
            with open('keyword-data.txt', 'a') as f:
                f.write(
                    title + "\t" + ' '.join(
                        k.replace(' ', '_') for k in cleaned_keywords
                    ) + "\n"
                )
        # Follow every on-site link that has not been visited yet.
        for a in soup.select('a[href]'):
            b = a['href'].replace('#replies', '')
            if 'https://' + name + '.com' in b and b not in found:
                found.append(b)
                spider(name, found_titles, b, found)
    except Exception:
        # Skip pages that fail to download or parse.
        pass


def main():
    name = 'lifehacker'
    start_url = 'https://lifehacker.com/im-doordash-ceo-tony-xu-and-this-is-how-i-work-1821196705'
    spider(name, [], start_url, [start_url])


if __name__ == "__main__":
    main()
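
Running the script crawls outward from start_url and appends one tab-separated record per article to keyword-data.txt. A minimal sketch of reading that file back, assuming only the record layout written by spider() above:

# Read keyword-data.txt: each line is "title<TAB>keyword1 keyword2 ...",
# with spaces inside a keyword replaced by underscores.
with open('keyword-data.txt') as f:
    for line in f:
        title, keywords = line.rstrip('\n').split('\t')
        print(title, '->', keywords.split(' '))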