forked from nathanu98/ScammerPhoneNumbers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimpleScraper.py
38 lines (33 loc) · 1.41 KB
/
simpleScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# TODO pip install beautifulsoup4, requests, lxml
from bs4 import BeautifulSoup
import time
import requests
# TODO Set these variables
# First and last page numbers to scrape (both inclusive).
startPageNum = 1
endPageNum = 1
# The page number is appended directly to this URL, so this only works if
# there is a simple page counter in the url.
baseUrl = "https://www.nomorobo.com/lookup?page="
outputFileName = "ScammerNumbers.txt"
# Pause between page requests so the web server on the other end is not
# stressed; 10 seconds is extremely conservative.
sleepSeconds = 10
# Upper bound (in seconds) on how long a single request may block. Without a
# timeout, requests.get() can wait forever on an unresponsive server.
requestTimeoutSeconds = 30

# Counts pages actually fetched and written, so the final summary is accurate
# regardless of which page number the scan started on.
pagesScanned = 0

# w - will overwrite data in the file.
# a - will append the new data to the end of the file.
# The with-statement closes the file even if a request or parse step raises.
with open(outputFileName, "w") as f:
    for pageNum in range(startPageNum, endPageNum + 1):
        print("scanning page " + str(pageNum))
        url = baseUrl + str(pageNum)
        # Retrieves the page; raise on HTTP errors (4xx/5xx) instead of
        # silently parsing an error page and reporting it as a success.
        response = requests.get(url, timeout=requestTimeoutSeconds)
        response.raise_for_status()
        # Sets the page up in BeautifulSoup
        soup = BeautifulSoup(response.text, 'lxml')
        # This for loop is where you will need to make changes depending on
        # the content that you wish to scrape from the webpage (arguably the
        # most difficult part), which will heavily depend on what
        # webpage/content you are scraping.
        for number in soup.find_all('nobr'):
            # TODO
            f.write(number.text.strip() + "\n")
        pagesScanned += 1
        if pageNum < endPageNum:
            print("page " + str(pageNum) + " scanned successfully now sleeping "
                  + str(sleepSeconds) + " seconds...")
            # Only sleep between requests; there is nothing to wait for after
            # the last page.
            time.sleep(sleepSeconds)
        else:
            print("page " + str(pageNum) + " scanned successfully")

print("Successfully scanned " + str(pagesScanned) + " pages")