-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript2
67 lines (47 loc) · 2.01 KB
/
script2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import requests
import re
# The list of companies is loaded from a CSV file below.
import csv
# Build the list of companies to look up. Each entry is a one-element list
# holding the first column of a row from the input CSV.
companies = []
# The `with` block closes the input file as soon as every row has been read;
# previously the handle was left open for the life of the script.
with open('companies - Sheet3.csv', 'rt') as f:
    reader = csv.reader(f)
    for row in reader:
        # Echo each raw row so progress is visible on the console.
        print(row)
        # Skip empty rows, which have no first column to read.
        if row:
            companies.append([row[0]])
# Root address of a company page; the company name is appended to it.
base_url = "http://www.linkedin.com/company/"

# Headers sent with every request. The user-agent value makes the script
# identify itself as the Chrome browser. That is a spoof: without it LinkedIn
# does not return the page we want, because they don't want to be scraped.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/41.0.2228.0 Safari/537.36'
    ),
}

# Regular expression that pulls the founding year out of the page source.
# [0-9] is a character class matching a single digit, * repeats the previous
# item any number of times, and the surrounding parentheses capture whatever
# run of digits was matched so it can be read back as group 1.
pattern = "<h4>Founded</h4>\n<p>([0-9]*)</p>"
def getfounded(name, timeout=30):
    """Fetch a company page and extract the founding year from its HTML.

    Args:
        name: Text appended to ``base_url`` to form the page address.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot stall the script
            forever.

    Returns:
        The digits captured by ``pattern`` as a string, or ``None`` when the
        page does not match ``pattern`` or the captured group is empty.
    """
    url = base_url + name
    print("requesting " + url)
    # Request the page, sending the module-level headers with it.
    result = requests.get(url, headers=headers, timeout=timeout)
    # Run the regex against the page's html source.
    matches = re.search(pattern, result.text)
    if matches is None:
        return None
    # The capture group is [0-9]*, which also matches zero digits; report an
    # empty capture as "not found" instead of returning an empty string.
    return matches.group(1) or None
# Look up the founding year of every company and write the results to
# processed.csv. The `with` block guarantees the output file is flushed and
# closed even if a lookup raises part-way through the list.
with open('processed.csv', 'wt') as outfile:
    writer = csv.writer(outfile)
    for company in companies:
        print("fetching company " + company[0])
        # Normalise the name: lower-case it and drop commas and apostrophes.
        lowername = company[0].lower().replace(',', '').replace('\'', '')
        # Try the name with spaces removed first; if that finds nothing,
        # retry with spaces turned into dashes.
        year = (getfounded(lowername.replace(' ', '')) or
                getfounded(lowername.replace(' ', '-')))
        # A truthiness check treats an empty captured string like None, so a
        # blank match is not recorded as a founding year.
        if year:
            print("founded in: " + year)
            company.append(year)
        # NOTE(review): indentation was lost in the reviewed copy; this
        # assumes every company is written, with the year column present
        # only when one was found. Confirm against the original.
        writer.writerow(company)