-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapers.py
40 lines (29 loc) · 1.54 KB
/
scrapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import bs4 as bs
import urllib.request
import re
def article_scraper(url):
scraped_data = urllib.request.urlopen(url)
article = scraped_data.read()
parsed_article = bs.BeautifulSoup(article,'lxml')
paragraphs = news_check(url, parsed_article)
article_text = ''
for p in paragraphs:
article_text += p.text
# put unwanted words to be replaced in the dictionary
badwords = {'Facebook': '', 'Twitter': '', 'Email': '', 'LinkedIn': '', 'WhatsApp': '', 'Messenger': '', 'Pinterest': '', 'Share via': '', 'Share on': '', 'Read more': '', 'SUBSCRIBE': ''}
# this section of code replaces substrings that matches the key with the keyvalue from the dictionary
badwords = dict((re.escape(k), v) for k, v in badwords.items())
pattern = re.compile("|".join(badwords.keys()))
cleaned_text = pattern.sub(lambda m: badwords[re.escape(m.group(0))], article_text)
return cleaned_text
def news_check(url, parsed_article):
if 'theverge' in url:
return parsed_article.find_all('div', class_='c-entry-content')
elif 'theguardian' in url:
return parsed_article.find_all('div', class_='content__article-body from-content-api js-article__body')
elif 'abcnews' in url:
return parsed_article.find_all('div', class_='StoryBody__main--1VIBe fonts__tiemposTextRegular--1u6HI')
elif 'ctvnews' in url:
return parsed_article.find_all('div', class_='articleBody')
elif 'nbcnews' in url:
return parsed_article.find_all('div', class_='body___2BbXy publico-txt f4 f5-m lh-copy gray-100')