import random
import re
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import lxml.html as lh
from lxml import etree

# crawl the Facebook people directory for a single letter
LETTER = "a"  # directory letter to crawl
START_PAGE = "http://www.facebook.com/directory/people/" + LETTER.upper()
EXTRACT_PAGES = r"directory/people/" + LETTER.upper()  # pages to parse for data
ONLY_FOLLOW = r"directory/people/" + LETTER.upper()    # links worth following
DOWNLOAD_DELAY = 1
PAUSE_TEXT = "Security Check Required | Facebook"

ONLY_FOLLOW_regex = re.compile(ONLY_FOLLOW).search
EXTRACT_regex = re.compile(EXTRACT_PAGES).search

urls_didcrawl = {}             # urls already fetched
urls_willcrawl = [START_PAGE]  # crawl frontier

# return only the links that match the ONLY_FOLLOW pattern
def getOnlyFollowURLs(urls):
    return [l for l in urls if ONLY_FOLLOW_regex(l)]

# fetch a url with the driver, pausing until any captcha / rate-limit page is cleared
def getPage(url, driver):
    page_source = PAUSE_TEXT
    while PAUSE_TEXT in page_source:
        driver.get(url)
        page_source = driver.page_source
        if PAUSE_TEXT in page_source:
            breakCaptcha(driver)
            page_source = driver.page_source
        else:
            print "rate limit / captcha overcome."
        if PAUSE_TEXT in page_source:
            print "Clear the CAPTCHA or wait until you burn out the rate limit."
            # poll until the page has changed
            while PAUSE_TEXT in page_source:
                time.sleep(2)
                page_source = driver.page_source
    return page_source

# queue urls that have not been crawled yet
def addNewURLsToCrawl(urls):
    for url in urls:
        if url not in urls_didcrawl:
            urls_willcrawl.append(url)

# hook for navigating around a captcha / rate limit
def breakCaptcha(driver):
    '''
    Use this to wait out the rate limit, or to pause on raw_input()
    (or other logic) while you solve the captcha in the browser.
    '''
    return
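
# A minimal sketch of one possible breakCaptcha implementation, assuming the
# crawl is run interactively and the check can be solved in the open browser:
#
#     def breakCaptcha(driver):
#         raw_input("Solve the security check in the browser, then press Enter...")
#         return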

# called for every page whose url matches EXTRACT_PAGES
def parsePage(url, urls, page_source, lhtml, driver):
    '''
    Use url, urls, page_source, lhtml, and the selenium driver
    to pull out the content rendered via javascript.
    '''
    return
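
# A minimal sketch of what parsePage might return, assuming the directory
# listing is plain <a> links (the xpath and the dict keys are illustrative,
# not the real page structure):
#
#     def parsePage(url, urls, page_source, lhtml, driver):
#         items = []
#         for link in lhtml.xpath("//a"):
#             href = link.get("href")
#             if href and EXTRACT_regex(href):
#                 items.append({"name": link.text_content().strip(), "url": href})
#         return items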

driver = webdriver.Firefox()

# crawl until the frontier is empty
while len(urls_willcrawl) > 0:
    url = urls_willcrawl.pop(0)
    try:
        page_source = getPage(url, driver)
        lhtml = lh.fromstring(page_source)  # using lxml because it's faster
    except Exception, R:
        print R
        urls_didcrawl[url] = 1
        continue
    urls = list(set(lhtml.xpath("//a/@href")))
    # if this url should be extracted, parse the page for data
    if EXTRACT_regex(url) is not None:
        items = parsePage(url, urls, page_source, lhtml, driver)
    urls_didcrawl[url] = 1
    addNewURLsToCrawl(getOnlyFollowURLs(urls))
    time.sleep(DOWNLOAD_DELAY * random.random() * 2)
    print 'crawled:', len(urls_didcrawl), '/', len(urls_willcrawl), url
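
# close the browser once the crawl frontier is empty
driver.quit()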