-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcamx.py
142 lines (125 loc) · 6.06 KB
/
camx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import csv
import os
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as ec
# import winsound
# ssl._create_default_https_context = ssl._create_unverified_context
# Output CSV that receives one row per scraped contact card.
filename = "camx_2022-extracted.csv"
# Header row: exactly one name per written column. No padding spaces and no
# trailing comma, so the header has the same seven columns as every data row.
headers = "Firmenname,Website,Adresse,Land,Phone/Fax,Kategorien,Beschreibung\n"
# Text file that is read line by line; each line is opened as a directory page.
loopfile = "camx_dir_urls.txt"
# Wait pauses: inclusive lower/upper bounds (in seconds) for the random delays.
waittimefrom_main, waittimeto_main = 12, 15  # pause after each directory page
waittimefrom_card, waittimeto_card = 10, 12  # pause after opening a contact card
# BROWSER PROFILE SETTINGS BEGIN
# loc_adblock = 'uBlock0_1.45.3rc5.firefox.signed.xpi'  # ad-blocker file path
opts = Options()
# Firefox preferences applied to every browser window started with `opts`.
for _pref_name, _pref_value in (
    ("javascript.enabled", True),  # JavaScript stays switched on (value is True)
    ("permissions.default.image", 2),  # presumably blocks image loading - confirm
    ("plugin.state.flash", 0),  # disable Flash
    ("toolkit.telemetry.unified", False),  # disable telemetry
):
    opts.set_preference(_pref_name, _pref_value)
# 'normal' page load strategy; the alternative 'eager' means DOM ready, but not yet images.
opts.page_load_strategy = 'normal'
# opts.add_argument("-headless")  # option for headless browser
# BROWSER PROFILE SETTINGS END
def _scroll_until_height_stable(driver):
    """Scroll *driver*'s page down until its document height stops growing.

    Each round scrolls twice by 1200 px with a one-second pause after each
    scroll and then compares the document height with the previous round.
    """
    def _page_height():
        return driver.execute_script("return document.body.scrollHeight")

    previous_height = _page_height()
    while True:
        for _ in range(2):
            driver.execute_script("window.scrollBy(0, 1200)")
            time.sleep(1)
        current_height = _page_height()
        if current_height <= previous_height:
            break
        previous_height = current_height
    time.sleep(1)


def _collect_card_urls(driver):
    """Return the href of every contact-card link found on the directory page."""
    # NOTE(review): `card-Title` is not quoted, so XPath reads it as a child
    # element name rather than a string literal. The filter was presumably
    # meant to test a class name - confirm against the page before changing it.
    anchors = driver.find_elements(
        by=By.XPATH, value="//h3[contains(text(),card-Title)]/child::a"
    )
    hrefs = (anchor.get_attribute("href") for anchor in anchors)
    # Anchors without an href yield None, which cannot be opened later.
    return [href for href in hrefs if href]


def _best_effort(label, reader, fallback):
    """Return ``reader()``, or *fallback* if it raises.

    A failure is reported on stdout under *label* together with the exception.
    """
    try:
        return reader()
    except Exception as exc:  # deliberate catch-all: a missing field must not abort the run
        print("Ausnahme:", label)
        print(exc)
        return fallback


def _read_address(driver):
    """Return ``(address, country)`` taken from the card's address paragraph."""
    # NOTE(review): `showcase-address` is unquoted as well (see _collect_card_urls).
    block = driver.find_element(
        by=By.XPATH, value='//p[contains(text(), showcase-address)][*]'
    ).text
    lines = block.split("\n")
    country = lines[-1]
    # Only the first line is kept as the address, as before.
    # NOTE(review): middle lines of longer addresses are dropped - confirm intended.
    address = lines[0] if len(lines) > 1 else ""
    return address, country


def _read_contact(driver):
    """Return ``(website, phone_fax)``: first list item, then the rest joined by newlines."""
    contact = driver.find_element(by=By.CLASS_NAME, value='showcase-web-phone')
    items = [li.text for li in contact.find_elements(by=By.TAG_NAME, value="li")]
    website = items[0]  # IndexError on an empty list is handled by the caller
    return website, "\n".join(items[1:])


def _read_categories(driver):
    """Return the text of the last child element of the products container."""
    container = driver.find_element(by=By.ID, value='js-vue-products')
    return container.find_elements(by=By.XPATH, value="*")[-1].text


def _read_description(driver):
    """Return the last text line of the description container."""
    text = driver.find_element(by=By.ID, value='js-vue-description').text
    return text.split("\n")[-1]


def _extract_card_row(driver):
    """Scrape the contact card currently loaded in *driver* into one CSV row.

    Every field is extracted independently. A field that cannot be read is
    reported and left empty, so no value can leak over from the previous card.
    """
    firmenname = _best_effort(
        "Firmenname",
        lambda: driver.find_element(by=By.XPATH, value="//h1").text,
        "",
    )
    address, country = _best_effort("Adresse", lambda: _read_address(driver), ("", ""))
    website, phone_fax = _best_effort("Kontakt", lambda: _read_contact(driver), ("", ""))
    categories = _best_effort("Kategorie", lambda: _read_categories(driver), "")
    description = _best_effort("Beschreibung", lambda: _read_description(driver), "")
    return [firmenname, website, address, country, phone_fax, categories, description]


# Read all directory URLs up front: strips the trailing newline of each line,
# skips blank lines and makes the real page total available for progress output.
with open(loopfile, encoding='utf-8', errors='replace') as linkfile:
    page_urls = [line.strip() for line in linkfile if line.strip()]

# The file is opened in append mode, so only a new or empty file gets a header.
write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

driver_card = webdriver.Firefox(options=opts)  # window for the contact card
try:
    driver_card.maximize_window()
    # newline="" lets the csv module control line endings itself.
    with open(filename, "a", encoding="utf-8", newline="") as f:
        if write_header:
            f.write(headers)
        # csv.writer quotes every field and escapes embedded quotes.
        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
        for page_nr, page in enumerate(page_urls):
            driver_main = webdriver.Firefox(options=opts)  # window for the directory page
            try:
                driver_main.maximize_window()
                print('\nÖffne Verzeichnisseite', page)
                driver_main.get(page)
                _scroll_until_height_stable(driver_main)
                url_extr_list = _collect_card_urls(driver_main)
            finally:
                # quit() ends the whole browser session, also when an error occurred.
                driver_main.quit()
            # Extract the contact cards.
            for idx, card_url in enumerate(url_extr_list):
                print("iteration: ", idx + 1, "of", len(url_extr_list),
                      "on page", page_nr + 1, "of", len(page_urls))
                print("Öffne Kontaktkarte:", card_url)
                driver_card.get(card_url)
                time.sleep(random.randint(waittimefrom_card, waittimeto_card))
                writer.writerow(_extract_card_row(driver_card))
            time.sleep(random.randint(waittimefrom_main, waittimeto_main))
finally:
    driver_card.quit()
print("Finished")
# winsound.Beep(1000, 2000)  # beep when finished
# os.system("shutdown -s -t 10")  # shut down the computer; the last number is the timer