-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsinglePageCrawler.py
52 lines (42 loc) · 1.1 KB
/
singlePageCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
import copy
import re
def tag_visible(element):
    """Report whether a parsed text node counts as visible page text.

    A node is treated as hidden when its parent's tag name is one of the
    names listed below, or when the node itself is a ``Comment``.

    Args:
        element: A node exposing ``parent.name``.

    Returns:
        ``False`` for hidden nodes, ``True`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent test runs first; the comment test is only reached for nodes
    # whose parent is not in the hidden list.
    is_hidden = (
        element.parent.name in hidden_parents
        or isinstance(element, Comment)
    )
    return not is_hidden
def getPageContent(url, timeout=10):
    """Download a page with an HTTP GET and return its body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host.

    Returns:
        The response text when the status code is 200. Otherwise ``-1``,
        after printing a short diagnostic message.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers malformed URLs, connection failures and timeouts, while
        # letting KeyboardInterrupt / SystemExit propagate (a bare
        # ``except:`` would swallow those as well).
        print("Bad URL")
        return -1

    if response.status_code != 200:
        print("Problem accessing page data.")
        return -1
    return response.text
def getText(content):
    """Extract the visible text of an HTML document as a single string.

    Args:
        content: HTML markup to parse.

    Returns:
        The text nodes accepted by ``tag_visible``, each stripped of
        surrounding whitespace and joined with single spaces.
    """
    # The markup is only read by the parser, so no defensive copy is made
    # (deep-copying a string just returns the same object anyway).
    soup = BeautifulSoup(content, 'html.parser')
    # find_all(string=True) is the current spelling of findAll(text=True)
    # and matches the find_all call used for link extraction.
    text_nodes = soup.find_all(string=True)
    return " ".join(node.strip() for node in text_nodes if tag_visible(node))
def getLinks(content):
    """Collect the ``href`` value of every anchor tag in an HTML document.

    Args:
        content: HTML markup to parse.

    Returns:
        A list of ``href`` attribute values, in document order. Anchors
        without an ``href`` attribute are skipped.
    """
    # The markup is only read by the parser, so no defensive copy is made.
    soup = BeautifulSoup(content, 'html.parser')
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]
def mainFunc(url):
    """Fetch a page and return its visible text together with its links.

    Args:
        url: Address of the page to process.

    Returns:
        ``(text, links)`` on success, or ``(-1, -1)`` when
        ``getPageContent`` reports failure by returning ``-1``.
    """
    page = getPageContent(url)
    if page == -1:
        # Propagate the failure sentinel in both tuple slots.
        return -1, -1
    return getText(page), getLinks(page)