# structurizer.py
import re
import sys
import time

from bs4 import BeautifulSoup, element
from selenium import webdriver
from selenium.webdriver.common.by import By
'''
This class is meant to be used with the Jobbing GUI.
It scrapes a well-structured website by finding data
whose tag-name structure matches that of a given
sample string.
'''
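# Typical usage (a sketch; the URL, company, and sample string below are
# illustrative, not taken from a real page):
#   site = Website("https://example.com/careers?page=1", "Example Co")
#   path = site.get_path("Software Engineer")   # e.g. "body.div.section.a"
#   jobs = site.get_by_path(path)               # [(job title, application link), ...]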
class Website:
    '''
    Initialize global and local variables
    url: URL of the website to scrape
    company: name of the company the job board belongs to
    '''
    def __init__(self, url=None, company=None):
        self.__url = url  # The URL of the website to scrape.
        # only do the time-consuming browser work when a URL is given
        if self.__url:
            self.set_url(url)
        # state used by the get_by_path method
        self.link = None
        self.past_links = []
        self.company = company
    '''
    Getters and Setters for the url (string)
    '''
    def get_url(self):
        return self.__url
    def set_url(self, url):
        self.__url = url  # keep get_url consistent when set_url is called directly
        # setting up the webdriver
        self.CrOptions = webdriver.ChromeOptions()
        self.CrOptions.add_argument('--headless')
        self.CrOptions.add_argument('--log-level=3')  # fatal errors only
        # attempt to suppress the "DevTools listening" line
        self.CrOptions.add_experimental_option('excludeSwitches', ['enable-logging'])
        # self.CrOptions.add_argument('--no-sandbox')  # may resolve tab-crash problems
        # self.CrOptions.add_argument('--disable-gpu')  # may resolve GpuChannelMsg_CreateCommandBuffer errors
        self.CrOptions.add_argument('--mute-audio')  # to resolve issues with MediaEvents
        # setting up the browser and getting the URL after JavaScript has loaded
        self.browser = webdriver.Chrome(options=self.CrOptions, service_log_path="NUL")
        self.browser.get(url)
        time.sleep(1)
        page = 1
        self.html = self.browser.page_source
        # creating a soup with the HTML as the ingredient
        self.soup = BeautifulSoup(self.html, "html.parser")
        # splicing in the body tag's innerHTML from all the rest of the pages (2 to n)
        while str(page + 1) in self.soup.body.stripped_strings:
            # click on the element whose text is exactly the next page number
            to_click = self.browser.find_elements(By.XPATH, f"//*[text()='{page + 1}']")[0]
            self.browser.execute_script('arguments[0].click();', to_click)
            time.sleep(0.5)
            # updating the html and soup variables
            head, tail = self.html.split("</body>", 1)
            body_html = self.browser.find_element(By.TAG_NAME, "body").get_attribute('innerHTML')
            self.html = head + body_html + "</body>" + tail
            self.soup = BeautifulSoup(self.html, "html.parser")
            page += 1
            sys.stdout.write("\r")
            sys.stdout.write(f"Page: {page}")
            sys.stdout.flush()
        sys.stdout.write("\r")
        # closing and quitting the browser's processes
        self.browser.close()
        self.browser.quit()
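    # A sketch of what set_url produces (URL illustrative): for
    # Website("https://example.com/jobs?page=1"), the loop above assumes the
    # pager exposes plain numeric labels ("2", "3", ...) as clickable text, and
    # self.html ends up holding page 1's document with the <body> innerHTML of
    # pages 2..n spliced in before the closing </body>, so a single soup covers
    # the whole job board.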
    '''
    Getters and Setters for the HTML (string)
    '''
    def get_html(self):
        return self.html
    def set_html(self, html):
        # note: this does not rebuild self.soup
        self.html = html
    '''
    Getter for the unique visible strings in the HTML's body
    '''
    def get_text(self):
        return list(set(self.soup.body.stripped_strings))
        # return [string.strip() for string in self.soup.body.get_text().split("\n") if string.strip() != "" and not "<" in string]
    '''
    Getter for the BeautifulSoup object pertaining to the data in this website
    '''
    def get_soup(self):
        return self.soup
    '''
    Return the count of tags by tag name.
    # Not necessary and may be deleted in the future
    '''
    def count_tags(self, tag, start=0, end=-1):
        return self.html[start:end].count("<" + tag)
    '''
    Given a string that is located in the web page, return the tag-name path
    taken to get to that string.
    Sample return:
    body.div.div.section.a
    matcher: the string for which to get a navigable path
    '''
    def get_path(self, matcher):
        # guard against strings that never appear in the fetched HTML
        if matcher not in self.html:
            return None
        # also return None if the string is longer than 5 words,
        # so as to filter out some garbage
        if len(matcher.split(" ")) > 5:
            return None
        # escape regex metacharacters so re treats every character as a literal
        matcher = re.escape(matcher)
        elems = self.soup.find_all(string=re.compile(matcher))
        if len(elems) == 0:
            return None
        path = ""
        for elem in elems:
            for parent in elem.parents:
                path = parent.name + "." + path
                if parent.name == "body":
                    break
            if not path.startswith("body"):
                path = ""  # the match sits outside <body>; try the next one
            else:
                break
        return path[0:-1]  # drop the trailing dot
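    # A sketch of get_path in use (values illustrative, not from a real page):
    #   path = site.get_path("Executive Assistant")
    #   # -> "body.div.div.section.a" if the title sits in an <a> nested that way
    #   # -> None if the string is absent or longer than five words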
    '''
    Given a path, find all strings whose tag-name path matches it.
    path: path returned by the get_path method
    current: default None (changes to the body tag), the element from which to look
    '''
    def get_by_path(self, path, current=None):
        if path is None:
            return None
        path = path.split(".")
        matches = []
        if len(path) == 0:
            matches.append("Completed Recursion")
            return matches
        # default to the body tag when no starting element is given
        if not current:
            current = self.soup.find("body")
        if len(current.contents) > 0:
            for child in current.contents:
                if child.name == "a":
                    try:
                        self.link = child["href"]
                    except KeyError:
                        # if the <a> tag has no href attribute, assume no link
                        self.link = None
                if len(path) == 1 and type(child) == element.NavigableString:
                    # keep the string only if it comes with a usable, unseen link
                    if (self.link is not None and self.link != ""
                            and self.link not in self.past_links
                            and self.link != "javascript:void(0);"
                            and self.link != "javascript:void(0)"
                            and self.link[0] != "#"
                            and str(child).strip() != ""):  ## assumption: there are href links
                        matches.append((child, self.link))
                        self.past_links.append(self.link)
                    ## print error message
                    # if self.link == None:
                    #     print("Error: Unable to get the application link for", str(child).strip() + ".")
                    # print("Error: Unable to add", str(child).strip() + ".")
                    self.link = None
                elif len(path) == 1 and type(child) != element.NavigableString:
                    continue
                elif child.name == path[1]:
                    for string in self.get_by_path(".".join(path[1:]), child):
                        matches.append(string)
        return matches
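    # A sketch of get_by_path's return shape (values illustrative):
    #   site.get_by_path("body.div.section.a")
    #   # -> [("Executive Assistant", "https://example.com/apply/123"), ...]
    #   # i.e. (NavigableString, href) pairs, deduplicated via self.past_links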
if __name__ == "__main__":
    url = "https://www.hubspot.com/careers/jobs?page=1"
    company = "HubSpot"
    # sample = "Executive Assistant"
    # setup_start = time.time()
    webpage = Website(url, company)
    # print(webpage.get_text())
    # start = time.time()
    # path = webpage.get_path(sample)
    # all_commons = []
    # if path:
    #     print("I found the sample job! Looking for more jobs!")
    #     all_commons = webpage.get_by_path(path)
    #     end = time.time()
    #     # Print at most the first 10 results of the page (fewer if fewer jobs were found)
    #     length = len(all_commons)
    #     if length > 10:
    #         length = 10
    #     print("I have listed below the first", length, "job(s).\n")
    #     i = 1
    #     for job in all_commons[:length]:
    #         print(i, job[0])
    #         i += 1
    # else:
    #     print("No record of a", sample, "job in this job board.")
    # end = time.time()
    # print("\n\nIt took", (end - start), "second(s) to search for the jobs!")
    # print("I found", len(all_commons), "jobs from the company:", webpage.company + ".")
    # print((start - setup_start), "seconds to connect to the webpage on Chrome.")