indeed.py
import time

import requests
import bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

from data_processing import dataProcessing, functionsCommonToAllSites


class indeedScraper:
    # Dictionary that saves each job role and its job description as a key-value pair
    jobrole_description_dict = {}
    # jd_scraped_count: counts the number of job descriptions scraped
    jd_scraped_count = 0
    # Counts the number of duplicate JDs scraped
    duplicate_count = 0

    def __init__(self, jd_count, job_role, file_name_to_save, stats_filename,
                 job_place="United States", experience_level="", search_format=""):
        '''
        Initializes a new indeedScraper instance (Python calls this method automatically when an object is created).
        jd_count : number of job descriptions to scrape
        experience_level : job experience level (Entry Level, Mid Level, Senior Level)
        job_role : job role to scrape, e.g. Machine Learning Engineer, Data Scientist, DevOps Engineer
        job_place : country/city where the job is located
        search_format : search format, one of: title:(), title:"", ""
        file_name_to_save : name of the file in which to save the scraped data
        stats_filename : name of the file in which to save scraping statistics
        '''
        self.jd_count = jd_count
        self.experience_level = experience_level
        self.job_role = job_role
        self.job_place = job_place
        self.search_format = search_format
        self.file_name_to_save = file_name_to_save
        self.stats_filename = stats_filename
        # Object for the data processing class
        self.data_processing = dataProcessing(
            file_name_to_save=self.file_name_to_save,
            jd_count=self.jd_count,
            job_role=self.job_role,
            stats_filename=self.stats_filename,
            job_site="Indeed",
            experience_level=self.experience_level,
            job_place=self.job_place,
            search_format=self.search_format,
        )
        # Object for functions common to all sites
        self.common_functions_obj = functionsCommonToAllSites()

    def indeed_get_content(self, url_content_list, indeed_url):
        '''
        Scrapes the job description behind each URL received from the
        indeed_scrape() function and saves it to jobrole_description_dict.
        url_content_list : list of job URLs (from the indeed_scrape() function)
        job_title : title of the scraped job
        job_description : the job description
        company_name : the company name
        jd_key : key under which the job description is saved in the dictionary
        '''
        for url_content in url_content_list:
            if indeedScraper.jd_scraped_count < self.jd_count:
                try:
                    # Title of the job description
                    job_title = url_content.text.strip()
                    # Request the page
                    res = requests.get(indeed_url + url_content.get('href'))
                    # Check the status
                    res.raise_for_status()
                    # Look for the job description text
                    soup = bs4.BeautifulSoup(res.text, features="html.parser")
                    job_description = soup.find('div', class_='jobsearch-jobDescriptionText')
                    # soup.find() returns None (not "") when the element is missing
                    if job_description is None:
                        print("job_description not found in url: %s" % (indeed_url + url_content.get('href')))
                        continue
                    # Find the company name
                    try:
                        company_url = soup.find('div', class_="icl-u-lg-mr--sm icl-u-xs-mr--xs")
                        jd_key = job_title + '@' + company_url.text
                        # If the job key is unique,
                        # store the job description in the dictionary
                        if jd_key not in indeedScraper.jobrole_description_dict.keys():
                            indeedScraper.jobrole_description_dict[jd_key] = job_description
                            indeedScraper.jd_scraped_count += 1
                            print(jd_key, indeedScraper.jd_scraped_count)
                        else:
                            print("Cannot save duplicate key: %s" % jd_key)
                            indeedScraper.duplicate_count += 1
                    except Exception as e:
                        print("Warning! Could not find company name")
                        print(e)
                except Exception as e:
                    print('Error: Problem downloading url: %s' % (indeed_url + url_content.get('href')))
                    print(e)
                # Delay between two requests
                time.sleep(1)
            else:
                break

    def indeed_scrape(self):
        '''
        Scrapes the URLs of job posts from indeed.com and passes them to the
        indeed_get_content() function.
        indeed_url : URL of the website to scrape
        job_keyword : job role combined with the search format
        search_job_elem : reference to the job-title search element
        search_place_elem : reference to the job-place search element
        url_content_list : URLs of all the jobs on a page
        next_element : element used to move to the next page
        close_popup : element used to close the pop-up appearing on the website
        tic : time at which the script starts
        download_start_time : time elapsed from opening the Selenium browser to starting the JD downloads
        '''
        # Site URL
        indeed_url = "https://www.indeed.com"
        # Start timer
        tic = time.perf_counter()
        # Site URL with US locality
        url = "https://www.indeed.com/stc?_ga=2.97591007.1313838538.1592977347-878814794.1592977347"
        # Job keyword according to the input search format
        job_keyword = ""
        if self.search_format == 'title:()':
            job_keyword = 'title:(' + self.job_role + ')'
        elif self.search_format == 'title:""':
            job_keyword = 'title:"' + self.job_role + '"'
        elif self.search_format == '""':
            job_keyword = '"' + self.job_role + '"'
        else:
            job_keyword = self.job_role
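        # For example, with job_role = "Data Scientist" the branches above yield:
        #   'title:()'  -> title:(Data Scientist)
        #   'title:""'  -> title:"Data Scientist"
        #   '""'        -> "Data Scientist"
        #   otherwise   -> Data Scientist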
        try:
            # Open the Chrome browser
            browser = webdriver.Chrome(ChromeDriverManager().install())
            browser.get(url)
            self.common_functions_obj.wait_until_page_load(browser=browser, elem_key='text-input-what', key_type="id", wait_time=60)
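            # wait_until_page_load comes from functionsCommonToAllSites in the local
            # data_processing module (not shown here); it is assumed to wrap Selenium's
            # WebDriverWait/expected_conditions for the given locator and timeout.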
            # Enter the job keyword
            search_job_elem = browser.find_element_by_id('text-input-what')
            search_job_elem.send_keys(job_keyword)
            # Enter the place
            search_place_elem = browser.find_element_by_id('text-input-where')
            search_place_elem.send_keys(Keys.CONTROL + 'a')
            search_place_elem.send_keys(Keys.DELETE)
            search_place_elem.send_keys(self.job_place)
            # Search
            search_place_elem.send_keys(Keys.ENTER)
            # Experience-level filter
            if self.experience_level != "":
                self.common_functions_obj.wait_until_page_load(browser=browser, elem_key='filter-experience-level', key_type="id", wait_time=60)
                dropdown = browser.find_element_by_id("filter-experience-level")
                dropdown.click()
                browser.find_element_by_partial_link_text(self.experience_level).click()
            download_start_time = time.perf_counter() - tic
            # Download job descriptions until jd_count is reached
            while indeedScraper.jd_scraped_count < self.jd_count:
                # Wait until the job cards are visible, for at most 100 seconds
                self.common_functions_obj.wait_until_page_load(browser=browser, elem_key="//div[@class='jobsearch-SerpJobCard unifiedRow row result clickcard']", key_type="xpath", wait_time=100)
                try:
                    # Find all job description URLs on the page
                    soup = bs4.BeautifulSoup(browser.page_source, features="html.parser")
                    url_content_list = soup.select('div.jobsearch-SerpJobCard.unifiedRow.row.result.clickcard > h2 > a')
                    # Get the job descriptions and store them in the dictionary
                    self.indeed_get_content(url_content_list, indeed_url)
                except Exception as e:
                    print("Error! Couldn't extract URLs!")
                    print(e)
                # Next tab (pagination)
                try:
                    next_element = browser.find_element_by_css_selector("[aria-label=Next]")
                    browser.execute_script("arguments[0].click();", next_element)
                except Exception:
                    try:
                        # Close the pop-up dialog
                        self.common_functions_obj.wait_until_page_load(browser=browser, elem_key="[aria-label=Close]", key_type="css_selector", wait_time=60)
                        close_popup = browser.find_element_by_css_selector("[aria-label=Close]")
                        close_popup.click()
                        self.common_functions_obj.wait_until_page_load(browser=browser, elem_key="[aria-label=Next]", key_type="css_selector", wait_time=100)
                        next_element = browser.find_element_by_css_selector("[aria-label=Next]")
                        browser.execute_script("arguments[0].click();", next_element)
                    except Exception as e:
                        print("Next element not found")
                        break
                # Save the unprocessed data and stats to csv after every page
                self.output_to_file(tic, download_start_time, script_status="Not Finished")
            # Save the unprocessed data and stats to csv once the loop finishes
            self.output_to_file(tic, download_start_time, script_status="Finished")
        except Exception as e:
            print(e)
        browser.quit()

    def output_to_file(self, tic, download_start_time, script_status):
        '''
        toc : time at which the script ends
        total_runtime : total time for which the script runs
        time_per_jd : average time per scraped job description
        '''
        # Save the unprocessed data to csv
        toc = time.perf_counter()
        total_runtime = toc - tic
        # Guard against division by zero when no JDs have been scraped yet
        time_per_jd = (total_runtime - download_start_time) / float(max(indeedScraper.jd_scraped_count, 1))
        # Downloaded JDs are passed to the function below for processing and saving to csv
        self.data_processing.data_processing(jobrole_description_dict=indeedScraper.jobrole_description_dict, jd_scraped_count=indeedScraper.jd_scraped_count)
        # Save the statistics to the stats file
        self.data_processing.log_stats(jd_scraped_count=indeedScraper.jd_scraped_count, total_runtime=total_runtime, time_per_jd=time_per_jd, duplicate_count=indeedScraper.duplicate_count, script_status=script_status)
        indeedScraper.jobrole_description_dict = {}
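

# Usage sketch (hypothetical file names and values; dataProcessing and
# functionsCommonToAllSites must be importable from the local data_processing
# module, and Chrome must be installed for webdriver_manager to drive it):
if __name__ == "__main__":
    scraper = indeedScraper(
        jd_count=50,                          # stop after 50 unique JDs
        job_role="Data Scientist",            # role typed into the search box
        file_name_to_save="indeed_jds.csv",   # hypothetical output file
        stats_filename="indeed_stats.csv",    # hypothetical stats file
        job_place="United States",
        experience_level="Entry Level",
        search_format='title:""',             # searches for title:"Data Scientist"
    )
    scraper.indeed_scrape()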