wiki-crawler.py
import requests
import time
from bs4 import BeautifulSoup
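# The two third-party libraries above are typically installed with:
#   pip install requests beautifulsoup4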

# This function finds the first Wikipedia link in the body of a given Wikipedia url
def find_first_link(url):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # The article body sits inside the "mw-content-text" / "mw-parser-output" containers
    content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")

    # Scan only the top-level paragraphs and grab the href of the first direct anchor tag
    article_link = None
    for element in content_div.find_all("p", recursive=False):
        if element.find("a", recursive=False):
            article_link = element.find("a", recursive=False).get('href')
            break

    if not article_link:
        return

    # Wikipedia hrefs are relative (e.g. "/wiki/Language"), so prepend the domain
    first_link = 'https://en.wikipedia.org' + article_link
    return first_link
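
# A minimal usage sketch for find_first_link, kept as a comment so the script's
# behavior is unchanged; the example article below is an illustrative assumption
# and the exact link returned depends on the live page content:
#
#   link = find_first_link("https://en.wikipedia.org/wiki/Language")
#   print(link)  # an absolute URL of the form "https://en.wikipedia.org/wiki/..."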

# This function checks if our Wikipedia crawler has hit the target url, crossed the max number of steps, or has hit a loop.
def continue_crawl(search_history, target_url, max_links=25):
    if search_history[-1] == target_url:
        # target_url[25:] drops "https://en.wikipedia.org/" and leaves e.g. "wiki/Language"
        print("\nWe've reached your specified target url - " + target_url[25:] + "\n")
        print("Wiki crawler ending...")
        print("Wiki crawler ended.\n")
        return False
    elif len(search_history) > max_links:
        print("\nOur search history has hit the " + str(max_links) + " article barrier.\n")
        print("Wiki crawler ending...")
        print("Wiki crawler ended.\n")
        return False
    elif len(search_history) != len(set(search_history)):
        # A duplicate entry in the history means the crawl has entered a cycle
        print("\nWe've hit a cycle.")
        # search_history[-1][30:] drops "https://en.wikipedia.org/wiki/" and leaves the article title
        print("The " + search_history[-1][30:] + " article is the article that repeats.\n")
        print("Wiki crawler ending...")
        print("Wiki crawler ended.\n")
        return False
    else:
        return True
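
# A behavior sketch for continue_crawl, comment-only, with illustrative URLs
# assumed for the example:
#
#   history = ["https://en.wikipedia.org/wiki/Language"]
#   continue_crawl(history, "https://en.wikipedia.org/wiki/Language")   # -> False (target reached)
#
#   history = ["https://en.wikipedia.org/wiki/Dog",
#              "https://en.wikipedia.org/wiki/Cat",
#              "https://en.wikipedia.org/wiki/Dog"]
#   continue_crawl(history, "https://en.wikipedia.org/wiki/Language")   # -> False (cycle detected)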

# This is where the program begins running
start_url = "https://en.wikipedia.org/wiki/Special:Random"  # leads to a random Wikipedia page
target_url = "https://en.wikipedia.org/wiki/Language"
article_chain = [start_url]  # the list of articles crawled
article_count = 0  # the number of articles crawled.

print("\nYour start url is: " + start_url)
print("Your target url is: " + target_url)
print("\nWiki crawler starting...\n")

while continue_crawl(article_chain, target_url):
    print(str(article_count) + ": " + article_chain[-1])

    first_link = find_first_link(article_chain[-1])
    if not first_link:
        print("\nThe article has no links.\n")
        print("Wiki crawler ending...")
        print("Wiki crawler ended.\n")
        break

    article_chain.append(first_link)  # add the first link found to article_chain
    article_count += 1  # increment the article count
    time.sleep(2)  # delay for about two seconds