# scraping.py
import os
import re

import yaml
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


def set_driver():
    """Prepare a headless Chrome webdriver (expects GOOGLE_CHROME_BIN and
    CHROMEDRIVER_PATH in the environment, e.g. under a Heroku buildpack)."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"), options=options)
    # driver = webdriver.Chrome(options=options)  # local run: chromedriver on PATH
    driver.set_window_size(1920, 1080)
    return driver
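
# Note: this module uses the Selenium 3 API (executable_path=..., the
# find_element(s)_by_xpath helpers). A minimal sketch of the same driver
# setup under Selenium 4+, where Service replaces executable_path and
# lookups become driver.find_element(By.XPATH, ...). This assumes
# selenium>=4 is installed, which is not what this module targets:
#
#   from selenium.webdriver.chrome.service import Service
#   from selenium.webdriver.common.by import By
#   service = Service(os.environ.get("CHROMEDRIVER_PATH"))
#   driver = webdriver.Chrome(service=service, options=options)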


def open_website(driver, url):
    """Open the website; fail fast if the page did not load."""
    driver.get(url)
    if driver.title == "":
        raise AssertionError(f"Error accessing '{url}'")
    return driver


def get_metadata(key):
    """Get url and xpath locations for `key` from the metadata yaml file."""
    with open("scraping_metadata.yaml") as f:
        data = yaml.safe_load(f)
    # relies on each entry listing "url" before "xpath"
    return data[key].values()
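
# Expected shape of scraping_metadata.yaml. The real file is not shown here,
# so this is an illustrative sketch: the keys are inferred from the
# `url, xpath = get_metadata(...)` unpacking and the xpath lookups below;
# the url and xpath values themselves are hypothetical.
#
#   kalendar:
#     url: https://example.ac.id/kalender-akademik
#     xpath:
#       title: "//h1"
#       table: "//table[1]"
#       file_url: "//a[contains(@href, '.pdf')]"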


def write_to_yaml(data, filename):
    """Save a scraping result to a yaml file under scrape_files/data/."""
    yaml_path = f"scrape_files/data/{filename}.yaml"
    with open(yaml_path, "w") as yaml_file:
        yaml.dump(data, yaml_file)
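
# The output directories must already exist or the writers above raise
# FileNotFoundError. A small guard, using the paths this module writes to:
#
#   os.makedirs("scrape_files/data", exist_ok=True)
#   os.makedirs("scrape_files/img", exist_ok=True)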


def screenshot_element(driver, element, img_path):
    """Screenshot a single element from the web page."""
    driver.execute_script("window.scrollTo(0, 475)")  # scroll so the element is in view
    screenshot = element.screenshot_as_png
    with open(img_path, "wb") as file:
        file.write(screenshot)


def scraping_kalendar_akademik():
    """Scrape the academic calendar: caption, table screenshot, and file url."""
    driver = set_driver()
    url, xpath = get_metadata("kalendar")
    try:
        open_website(driver, url)
        caption = driver.find_element_by_xpath(xpath["title"]).text
        table = driver.find_element_by_xpath(xpath["table"])
        url_file = driver.find_element_by_xpath(xpath["file_url"]).get_attribute("href")
        img_path = "scrape_files/img/kalendar_akademik.png"
        data = {"caption": f"<b>{caption}</b>",
                "img_path": img_path,
                "url_file": url_file}
        screenshot_element(driver, table, img_path)
        write_to_yaml(data, "kalendar")
        print("Successfully scraped 'kalendar'")
    except Exception as e:
        print(f"Failed scraping 'kalendar': {e}")
    finally:
        driver.quit()  # quit on every path so the browser never leaks


def scraping_jam():
    """Scrape the class-hours table as a screenshot."""
    driver = set_driver()
    url, xpath = get_metadata("jam")
    try:
        open_website(driver, url)
        table = driver.find_elements_by_xpath(xpath["table"])[0]
        img_path = "scrape_files/img/jam.png"
        data = {"img_path": img_path}
        screenshot_element(driver, table, img_path)
        write_to_yaml(data, "jam")
        print("Successfully scraped 'jam'")
    except Exception as e:
        print(f"Failed scraping 'jam': {e}")
    finally:
        driver.quit()


def scraping_jadwal_kuliah(class_or_lecturer):
    """Scrape the course schedule for a class or lecturer.

    Returns (img_path, caption), or None when no schedule table matches."""
    driver = set_driver()
    url, xpath = get_metadata("jadwal_kuliah")
    try:
        open_website(driver, url)
        form_input = driver.find_element_by_xpath(xpath["form_input"])
        form_submit = driver.find_element_by_xpath(xpath["form_submit"])
        form_input.send_keys(class_or_lecturer)
        form_submit.click()
        try:
            table = driver.find_element_by_xpath(xpath["table"])
        except NoSuchElementException:  # no schedule table for this input
            return None
        title = driver.find_elements_by_xpath(xpath["title"])[0].text
        valid_from = driver.find_element_by_xpath(xpath["valid_from"]).text
        caption = f"<b>{title}</b>\n\nUntuk Input : <b>{class_or_lecturer.upper()}</b>\n{valid_from}"
        img_path = f"scrape_files/img/{class_or_lecturer.replace(' ', '_')}_jadwal.png"
        screenshot_element(driver, table, img_path)
        return img_path, caption
    except Exception as e:
        print(f"Failed scraping 'jadwal_kuliah': {e}")
    finally:
        driver.quit()


def scraping_berita():
    """Scrape news posts: id, title, url, date, and cleaned content."""
    driver = set_driver()
    url, xpath = get_metadata("berita")
    try:
        open_website(driver, url)
        title_url = driver.find_elements_by_xpath(xpath["title_and_url"])
        date = driver.find_elements_by_xpath(xpath["date"])
        post_title = [post.text for post in title_url]
        post_url = [post.get_attribute("href") for post in title_url]
        post_id = [re.search(r"berita/(\d+)", post).group(1) for post in post_url]
        post_date = [post.text for post in date]
        post_content = []
        for post_link in post_url:  # scrape every post
            driver.get(post_link)
            page_content = driver.find_elements_by_xpath(xpath["page_content"])
            content = page_content[0].text
            # strip attachment names ending in .doc/.pdf from the post text
            content = re.sub(r"[ \-\w()\d]+(?:\.doc|\.pdf)", "", content).strip()
            post_content.append(content)
        contents = zip(post_id, post_title, post_url, post_date, post_content)
        data = {}
        for pid, title, link, posted, content in contents:
            data[pid] = {"title": title, "url": link, "date": posted, "content": content}
        write_to_yaml(data, "berita")
        print("Successfully scraped 'berita'")
    except Exception as e:
        print(f"Failed scraping 'berita': {e}")
    finally:
        driver.quit()


def scraping_loker():
    """Scrape job postings: id, title, url, and posted date."""
    driver = set_driver()
    url, xpath = get_metadata("loker")
    try:
        open_website(driver, url)
        elements = driver.find_elements_by_xpath(xpath["title_and_url"])
        post_title = [element.text for element in elements]
        post_url = [element.get_attribute("href") for element in elements]
        post_id = [re.search(r"node/(\d+)", link).group(1) for link in post_url]
        post_date = []
        for post_link in post_url:  # scrape every post
            driver.get(post_link)
            elements = driver.find_elements_by_xpath("//span[@class='meta submitted']")
            date_posted = re.search(r"\d{2}/\d{2}/\d{4}", elements[0].text)[0]
            post_date.append(date_posted)
        contents = zip(post_id, post_title, post_url, post_date)
        data = {}
        for pid, title, link, posted in contents:
            data[pid] = {"date": posted, "title": title, "url": link}
        write_to_yaml(data, "loker")
        print("Successfully scraped 'loker'")
    except Exception as e:
        print(f"Failed scraping 'loker': {e}")
    finally:
        driver.quit()


def update_all():
    """Update all scraped data."""
    scraping_berita()
    scraping_loker()
    scraping_jam()
    scraping_kalendar_akademik()


if __name__ == "__main__":
    print("Testing scraping...")
    update_all()
    scraping_jadwal_kuliah("3ka17")