# sem_09_191_parsing.py
import requests                               # download pages
from bs4 import BeautifulSoup                 # parse the downloaded HTML
from selenium import webdriver                # drive a real browser
from selenium.webdriver.common.by import By   # locator strategies (needed for the Selenium 4 API below)
import pandas as pd
url = 'http://books.toscrape.com/catalogue/page-2.html'
response = requests.get(url)
response.status_code           # 200 means the page was returned successfully
response.elapsed               # how long the request took
html_text = response.content   # raw bytes of the page
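
# Hedged aside (not part of the original seminar flow): requests does not raise on
# HTTP errors by itself, so a quick status check before parsing can save debugging
# an empty or error page.
if response.status_code != 200:
    print('Request failed with status', response.status_code)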
tree = BeautifulSoup(html_text, 'html.parser')   # name the parser explicitly to avoid the bs4 warning
tree.body                      # the whole <body> tag
tree.body.h3                   # the first <h3> inside <body>
tree.body.h3.a                 # the <a> link inside that <h3>
element = tree.body.h3.a
element.text                   # text of the link (the book title)
element.get('href')            # value of the href attribute (a relative link)
all_price_elements = tree.find_all('p', {'class': 'price_color'})   # every price paragraph on the page
all_price_elements
all_price_elements[0].text
prices = [float(pc.text[1:]) for pc in all_price_elements]   # drop the currency sign, keep the number
prices
currency = [pc.text[:1] for pc in all_price_elements]        # the currency sign itself
currency
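
# Side note, a sketch rather than part of the seminar: the same price paragraphs can
# be located with a CSS selector; select() and find_all() return the same elements here.
same_price_elements = tree.select('p.price_color')
assert len(same_price_elements) == len(all_price_elements)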
articles = tree.find_all('article')               # one <article> per book
articles[0].h3.a.text
titles = [article.h3.a.text for article in articles]
titles
prefix = 'http://books.toscrape.com/catalogue/'   # links in the HTML are relative, so prepend the catalogue URL
prefix + articles[0].h3.a.get('href')
links = [prefix + article.h3.a.get('href') for article in articles]
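
# Aside (a sketch, not from the original code): urllib.parse.urljoin builds the same
# absolute URLs and is more robust if the relative links ever change form.
from urllib.parse import urljoin
links_via_urljoin = [urljoin(prefix, article.h3.a.get('href')) for article in articles]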
all_rate_elements = tree.find_all('p', {'class': 'star-rating'})
all_rate_elements
all_rate_elements[0].get('class')[1]              # the second class is the rating word, e.g. 'Three'
ratings = [rate_element.get('class')[1] for rate_element in all_rate_elements]
ratings
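
# A minimal sketch (not in the original code): the site stores the rating as an English
# word in the class name ('One' .. 'Five'); a small dict turns it into a number.
rating_words = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
numeric_ratings = [rating_words.get(word) for word in ratings]   # None for any unexpected word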
books = pd.DataFrame({'price': prices,
                      'currency': currency,
                      'title': titles,
                      'link': links,
                      'rating': ratings})
books
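
# Optional sketch: save the table to disk; the file name here is illustrative.
books.to_csv('books_page_2.csv', index=False)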
[i ** 2 for i in [2, 6, 9]]   # quick list-comprehension refresher
# now let's make a function!
def get_books_info(html_text):
    """
    Parse the HTML of a catalogue page such as http://books.toscrape.com/catalogue/page-2.html
    and return a DataFrame with one row per book.
    """
    tree = BeautifulSoup(html_text, 'html.parser')
    all_price_elements = tree.find_all('p', {'class': 'price_color'})
    prices = [float(pc.text[1:]) for pc in all_price_elements]
    currency = [pc.text[:1] for pc in all_price_elements]
    articles = tree.find_all('article')
    titles = [article.h3.a.text for article in articles]
    prefix = 'http://books.toscrape.com/catalogue/'
    links = [prefix + article.h3.a.get('href') for article in articles]
    all_rate_elements = tree.find_all('p', {'class': 'star-rating'})
    ratings = [rate_element.get('class')[1] for rate_element in all_rate_elements]
    books = pd.DataFrame({'price': prices,
                          'currency': currency,
                          'title': titles,
                          'link': links,
                          'rating': ratings})
    return books
url = "http://books.toscrape.com/catalogue/page-36.html"
response = requests.get(url)
get_books_info(response.content)
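
# A sketch of collecting several catalogue pages with the function above; the page
# range and the one-second pause are illustrative choices, not from the seminar.
import time

pages = []
for page_number in range(1, 4):                         # first three catalogue pages
    page_url = f'http://books.toscrape.com/catalogue/page-{page_number}.html'
    page_response = requests.get(page_url)
    pages.append(get_books_info(page_response.content))
    time.sleep(1)                                       # be polite to the server
all_books = pd.concat(pages, ignore_index=True)
all_books.shape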
# Windows: needs geckodriver on PATH (or chromedriver together with webdriver.Chrome())
# macOS: in Safari, enable "Allow Remote Automation" before using webdriver.Safari()
lisichka = webdriver.Firefox()                    # open a Firefox window ("lisichka" = little fox)
url = 'https://www.google.com/'
lisichka.get(url)                                 # navigate to the page
sf = lisichka.find_element(By.NAME, 'q')          # the search field (find_element_by_name is gone in Selenium 4)
sf.click()
sf.send_keys('ВШЭ')                               # type the query
knopka = lisichka.find_element(By.NAME, 'btnK')   # the "Google Search" button
knopka.click()
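
# Hedged sketch: instead of reading page_source immediately, wait until the results
# page has loaded. WebDriverWait and expected_conditions are standard Selenium helpers;
# waiting for the query to appear in the page title is an assumption about Google's
# results page, not something from the seminar.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
WebDriverWait(lisichka, 10).until(EC.title_contains('ВШЭ'))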
html_text = lisichka.page_source                  # full HTML of the results page
html_text
lisichka.close()                                  # close the browser window
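
# A closing sketch (an assumption, not in the original script): the HTML grabbed through
# Selenium can be handed to BeautifulSoup and parsed exactly like the requests pages.
results_tree = BeautifulSoup(html_text, 'html.parser')
print(results_tree.title.text if results_tree.title else 'no <title> found')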