-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadDMExperience_selenium.py
95 lines (82 loc) · 2.99 KB
/
DownloadDMExperience_selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#Code for finding and printing webpages
#Need to have installed with webdriver in path. Instructions at http://selenium-python.readthedocs.io/installation.html
#Pre-load stuff
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import os
import urllib2
import pdfkit
driver = webdriver.Firefox()
#Working Directory
savedir = "C:\DM Experience_Chris Perkins\WOTC_Archive"
os.chdir(savedir)
#Get to Starting URL
url = "http://dnd.wizards.com/articles"
driver.get(url)
assert "All articles" in driver.title
#Remove Popup if there
time.sleep(2)
try:
alert = driver.find_element_by_css_selector("a[class='fsrDeclineButton']")
alert.click()
except:
print('No Alert')
#Navigate to The Dungeon Master Experience
dropdown = driver.find_element_by_class_name('tag')
dropdown.send_keys('The Dungeon Master Experience', Keys.ENTER)
#Remove Popup if there
time.sleep(2)
try:
alert = driver.find_element_by_css_selector("a[class='fsrDeclineButton']")
alert.click()
except:
print('No Alert')
#Accept Cookies Thing
cookiebutton = driver.find_element_by_class_name('agree-button')
cookiebutton.click()
#Open all See More Articles (Click 8 times)
totclicks = 8 #Sometimes 8, sometimes 9
morearticles = driver.find_element_by_css_selector("a[class$='more-button']") #Find More Articles Button
#Stupid For Loop doesn't work on WOTC lame website
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
time.sleep(1)
morearticles.click()
all_links = driver.find_elements_by_link_text('Chris Perkins')
base_site = "http://dnd.wizards.com"
for link in all_links:
#Non Selenium Solution
#Extract URL Link Info
soup = BeautifulSoup(link.get_attribute('outerHTML'),'lxml')
slnk = soup.findAll('a')
end_site = slnk[0].get('href')
#Go to new URL and pull out HTML
page = urllib2.urlopen(base_site + end_site)
psoup = BeautifulSoup(page, 'lxml')
pheader = psoup.find_all(id='page-header')
particle = psoup.find_all(class_="main-content article")
#Convert to Strings
headerstr = unicode.join(u'\n', map(unicode, pheader))
articlestr = unicode.join(u'\n', map(unicode, particle))
cleanarticle = articlestr.replace(u'\u2019', '\'').replace(u'\xa0', u' ').replace(u'\u201c', '"').replace(u'\u201d','"').replace(u'\u2014','--').replace(u'\xc3\xa9','e').encode('ascii','ignore')
#Put together with background color and save
fullarticle = ['<body style="background-color:#E5DECE; font: 30px Arial;">' +
headerstr + cleanarticle + '</body>'] #Add color & font to texts
name = end_site.split('/')[-1]
pdfkit.from_string(fullarticle[0], name+'.pdf')
driver.close() #Close Selenium Driver.