percona-slides-downloader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Download all slides from a Percona Live conference: point this program at the
Percona Live slides index page and it will download every slide deck it finds.
Requires Python 3.6+
Downloaded slides are saved in a subdirectory of the current directory named
percona_live_<year>_slides
Install:
    python3 -m pip install percona-slides-downloader
Usage:
    ./percona-slides-downloader.py -u <url> [-t <threads>]
Options:
    -u <url>      the slides index page
    -t <threads>  number of download threads [default: 100]
Examples:
    ./percona-slides-downloader.py -u 'http://www.percona.com/live/17/resources/slides'
'''
import os
import logging
from concurrent import futures
from bs4 import BeautifulSoup
import requests
from docopt import docopt
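# third-party dependencies: requests, beautifulsoup4, lxml (the parser handed
# to BeautifulSoup below) and docopt must be installed for this script to run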
__author__ = '刀尖红叶'
__version__ = '0.1'
# configure logging
logging.captureWarnings(True)
# use splitext, not rstrip('.py'), which would also strip trailing 'p'/'y' characters
log_file = os.path.splitext(os.path.realpath(__file__))[0] + '.debug.log'
logging.basicConfig(filename=log_file, level=logging.DEBUG,
                    format='%(levelname)s: %(asctime)s, %(filename)s, line:%(lineno)d: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
# log a separator line sized to the terminal width minus the log prefix
columns, rows = os.get_terminal_size()
extra_length = columns - 47
logging.debug('-' * extra_length)
# get args
args = docopt(__doc__, version=__version__)
url_index = args['-u']
# logging.debug(f'args: {args}')
# module-level settings
workers = int(args['-t'])  # docopt yields strings; max_workers must be an int
url_prefix = 'http://www.percona.com'
href_c = set()  # deduplicated session page URLs
# the year is the 5th path segment of the index URL, e.g. '17' in /live/17/resources/slides
year = '20' + url_index.split('/')[4]
dirname = f'percona_live_{year}_slides'
os.makedirs(dirname, exist_ok=True)  # portable replacement for shelling out to `mkdir -p`
def get_session_url(url):
    '''Collect session page URLs from an index page, following pagination.'''
    global href_c
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "lxml")
    list_a = soup.find_all('a')
    for link in list_a:
        # session pages look like /live/<year>/sessions/<session-name>
        if link.get('href', None):
            if '/live/' in link['href'] and '/sessions/' in link['href']:
                href_c.add(url_prefix + link['href'])
        # recurse into the next page of results, if there is one
        if link.get('title', None):
            if link['title'] == 'Go to next page':
                get_session_url(url_prefix + link['href'])
def get_slide(url):
    '''Download the PDF embedded in a session page's <object> tag.'''
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "lxml")
    if soup.object is None:  # some session pages embed no slides; skip them
        return
    url_pdf = soup.object['data']
    file_name = url_pdf.split('/')[-1]
    with open(f'./{dirname}/{file_name}', 'wb') as f:
        f.write(requests.get(url_pdf, timeout=10).content)
# main entrance
if __name__ == '__main__':
    get_session_url(url_index)
    # logging.debug(f'href_c: {href_c}, len: {len(href_c)}')
    # download all slides concurrently; draining the map surfaces worker exceptions
    with futures.ThreadPoolExecutor(max_workers=workers) as executor:
        for _ in executor.map(get_slide, href_c):
            pass
    # logging.debug(f'href_c: {href_c}')