# crawler example
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen
from re import findall


class Collector(HTMLParser):
    'an HTML parser that collects the text of a web page and the absolute HTTP links found on it'

    def __init__(self, url):
        'initializes the parser and the containers for links and text'
        HTMLParser.__init__(self)
        self.url = url
        self.links = []
        self.text = ''

    def handle_starttag(self, tag, attrs):
        'collects the href attribute of every anchor tag'
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    # resolve relative links against the page URL
                    absolute = urljoin(self.url, attr[1])
                    if absolute[:4] == 'http':
                        self.links.append(absolute)

    def handle_data(self, data):
        'accumulates the concatenated text of the page'
        self.text += data

    def getLinks(self):
        'returns the links'
        return self.links

    def getdata(self):
        'returns the text'
        return self.text
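
# A minimal standalone sketch of how Collector can be used on its own
# (assumes network access; the URL below is a stand-in, not from the
# original program):
#
#   page = urlopen('https://example.com').read().decode()
#   collector = Collector('https://example.com')
#   collector.feed(page)
#   print(collector.getLinks())       # absolute HTTP links found on the page
#   print(collector.getdata()[:200])  # first part of the page text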

# build the analyzer
class Crawler(object):
    'crawls the links reachable from a start URL, printing word frequencies and links per page'

    def __init__(self):
        'initializes the set of visited URLs'
        self.visited = set()

    def freq(self, content):
        'counts the frequency of every word in the content'
        # populate a dictionary with words, incrementing the count
        # every time a word is seen again
        dictionary = {}
        pattern = '[a-zA-Z]+'
        words = findall(pattern, content)
        for w in words:
            if w in dictionary:
                dictionary[w] += 1
            else:
                dictionary[w] = 1
        return dictionary
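
    # Note: the standard library offers an equivalent in one call; the
    # sketch below (not part of the original program) matches freq():
    #   from collections import Counter
    #   Counter(findall('[a-zA-Z]+', content))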

    def analyze(self, url):
        'prints the word frequencies and links of a page and returns the list of URLs found on it'
        print('\nVisiting ', url)
        content = urlopen(url).read().decode()
        collector = Collector(url)
        collector.feed(content)
        urls = collector.getLinks()
        content = collector.getdata()
        frequency = self.freq(content)
        print('\n{:50} {:10} {:5}'.format('URL', 'Word', 'Count'))
        for word in frequency:
            print('\n{:50} {:10} {:5}'.format(url, word, frequency[word]))
        for link in urls:
            print('\n{:50} {:10}'.format(url, link))
        return urls

    def crawl2(self, url):
        'a recursive web crawler that calls analyze() on every visited web page'
        self.visited.add(url)
        links = self.analyze(url)
        for link in links:
            if link not in self.visited:
                try:
                    self.crawl2(link)
                except Exception:
                    # skip pages that cannot be fetched or parsed
                    pass
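
    # Note: recursion depth is bounded by Python's recursion limit
    # (1000 frames by default), so very deep link chains can raise
    # RecursionError. A sketch of an iterative variant with an explicit
    # stack (not part of the original program) that avoids this:
    #
    #   stack = [url]
    #   while stack:
    #       u = stack.pop()
    #       if u in self.visited:
    #           continue
    #       self.visited.add(u)
    #       try:
    #           stack.extend(self.analyze(u))
    #       except Exception:
    #           pass
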
# test
c = Crawler()
c.crawl2('https://facweb.cdm.depaul.edu/ahecktma/one.html')