# scraper.py
import re
import utility
def run():
    # (1) scrape the websites by calling the scrape_data(url) method from utility
    # retrieve a total of 200 soups: one positive class (100 soups) and two negative classes (50 soups each)
    # (the utility module is not included in this file; hedged sketches of its helpers appear at the end)
    movie_url = 'https://www.rottentomatoes.com/top/bestofrt/'  # Rotten Tomatoes (movie reviews - positive class)
    documentary_url = 'https://www.rottentomatoes.com/top/bestofrt/top_100_documentary_movies/'  # Rotten Tomatoes (documentary reviews - related to the positive class)
    restaurant_url = 'https://www.theworlds50best.com/list/1-50'  # The World's 50 Best Restaurants (restaurant reviews - unrelated to the positive class)
    movie_soup = utility.scrape_data(movie_url)
    documentary_soup = utility.scrape_data(documentary_url)
    restaurant_soup = utility.scrape_data(restaurant_url)
    # 100 positive soups
    movie_links = []
    for link in movie_soup.body.find_all('a', href=re.compile(r'^/m/.+'), class_='unstyled articleLink'):
        movie_links.append('https://www.rottentomatoes.com' + link.get('href'))  # '/m/(title)'
    movie_size = len(movie_links)
    # 50 negative soups (documentaries: take links 51-100 so the slice yields exactly 50)
    documentary_links = []
    for link in documentary_soup.body.find_all('a', href=re.compile(r'^/m/.+'), class_='unstyled articleLink')[50:100]:
        documentary_links.append('https://www.rottentomatoes.com' + link.get('href'))  # '/m/(title)'
    documentary_size = len(documentary_links)
    # 50 negative soups
    restaurant_links = []
    for link in restaurant_soup.find_all('a', class_='item')[:50]:
        restaurant_links.append('https://www.theworlds50best.com' + link.get('href'))  # '/the-list/(id)/(name)'
    restaurant_size = len(restaurant_links)
    pos_soup = [utility.scrape_data(url) for url in movie_links]
    pos_size = len(pos_soup)
    neg_soup = [utility.scrape_data(url) for url in (documentary_links + restaurant_links)]
    neg_size = len(neg_soup)
    # (2a) strip HTML markup and character-encoding artifacts before writing the data to a CSV file
    # collect the set of tag names that occur under each soup's body tag, for manual inspection
    pos_tags = []
    for soup in pos_soup:
        pos_tags.append({child.name for child in soup.body.descendants if child.name is not None})
    neg_tags = []
    for soup in neg_soup:
        neg_tags.append({child.name for child in soup.body.descendants if child.name is not None})
    # inspect pos_tags/neg_tags for each soup and decide which tags to remove
    for soup in (pos_soup + neg_soup):
        for div in soup.find_all('div', id=['movies_sidebar']):
            div.decompose()
        for div in soup.find_all('div', class_=['modal-dialog', 'rate-and-review-widget__module']):
            div.decompose()
        for section in soup.find_all('section', class_=['mop-ratings-wrap__row js-scoreboard-container', 'panel panel-rt panel-box']):
            section.decompose()
    pos_tags_to_remove = ['span', 'button', 'script', 'noscript', 'style', 'input', 'table', 'img', 'form', 'nav', 'header', 'footer']
    neg_tags_to_remove = ['span', 'button', 'script', 'noscript', 'style', 'input', 'table', 'img', 'form', 'nav', 'header', 'footer', 'iframe', 'hr', 'br']
    # remove the irrelevant tags from each soup by calling the remove_tags(tags, soup) method from utility
    pos_docs = [utility.remove_tags(pos_tags_to_remove, soup) for soup in pos_soup]
    neg_docs = [utility.remove_tags(neg_tags_to_remove, soup) for soup in neg_soup]
    # join the lists (a total of 200 documents)
    docs = pos_docs + neg_docs
    doc_size = len(docs)
    # (2b) create a DataFrame with pos/neg labels by calling the build_corpus(docs, labels) method from utility
    labels = ['movie_review'] * len(pos_docs) + ['not_movie_review'] * len(neg_docs)
    df = utility.build_corpus(docs, labels)
    # remove tabs, carriage returns, new lines and specific Unicode special characters
    # by applying the clean_doc method from utility to each document
    df['document'] = df['document'].apply(utility.clean_doc)
    # print(df)
    # export the DataFrame to a CSV file
    df.to_csv('data.csv', encoding='utf-8', index=False)
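
if __name__ == '__main__':
    run()

# ---------------------------------------------------------------------------
# The utility module imported above is not shown here. The sketches below are
# hedged reconstructions of what its helpers might look like, based only on
# how run() uses them; every name and detail is an assumption, not the actual
# implementation.
# ---------------------------------------------------------------------------

# A minimal sketch of utility.scrape_data(url): fetch a page and parse it into
# a BeautifulSoup object. Assumes the requests and beautifulsoup4 packages.
def sketch_scrape_data(url):
    import requests
    from bs4 import BeautifulSoup
    # a browser-like User-Agent, since some sites reject the default one
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(response.text, 'html.parser')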
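
# A minimal sketch of utility.remove_tags(tags, soup): decompose every tag in
# the given list, then return the remaining visible text of the page body.
# Whether the real helper returns a string or a soup is an assumption.
def sketch_remove_tags(tags, soup):
    # find_all accepts a list of tag names, so one pass covers the whole list
    for element in soup.body.find_all(tags):
        element.decompose()
    return soup.body.get_text(separator=' ')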
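
# A minimal sketch of utility.build_corpus(docs, labels): pair each document
# with its label in a two-column DataFrame. run() accesses df['document'], so
# that column name is grounded; 'label' is an assumed name.
def sketch_build_corpus(docs, labels):
    import pandas as pd
    return pd.DataFrame({'document': docs, 'label': labels})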
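
# A minimal sketch of utility.clean_doc(text): collapse tabs, carriage returns
# and newlines into spaces and drop a few Unicode special characters, per the
# comment in run(). It receives one document string at a time via df.apply;
# the exact character set removed is an assumption.
def sketch_clean_doc(text):
    text = re.sub(r'[\t\r\n]+', ' ', text)  # whitespace control characters
    text = re.sub(r'[\u00a0\u2018\u2019\u201c\u201d\u2013\u2014\u2026]', '', text)  # assumed special characters
    return re.sub(r' {2,}', ' ', text).strip()  # collapse leftover runs of spaces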