-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
79 lines (66 loc) · 1.71 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from sys import argv
import singlePageCrawler
import databaseStuff
import general
# Upper bound on how many links crawlTheLinks visits before it stops.
MAX_COUNT = 10
# Prefix that crawlTheLinks prepends to links beginning with '/'.
MAIN_URL = 'https://en.wikipedia.org'
def crawlTheLinks(link, text):
    """Crawl up to MAX_COUNT queued links, collecting page texts and links.

    Each queued link is fetched with ``singlePageCrawler.mainFunc``. Every
    successful result is added to the returned lists and passed to
    ``databaseStuff.insert`` under ``'wiki_text'`` and ``'wiki_links'``.
    Links discovered on crawled pages are queued behind the seed links.

    Args:
        link: Iterable of link strings found on the seed page.
        text: Text of the seed page as a single string, or an iterable of
            texts that were already collected.

    Returns:
        A ``(texts, links)`` tuple: the list of page texts (seed first) and
        the flat list of every link that was queued, visited or not.
    """
    links = list(link)
    # A plain string must be kept whole: list.extend() would break it into
    # single characters, which would later be counted as separate "words".
    texts = [text] if isinstance(text, str) else list(text)

    # ``links`` grows while it is iterated, so newly found links are visited
    # after the seed links; MAX_COUNT caps the total number of visits.
    for count, webpageLink in enumerate(links, start=1):
        # Links starting with '/' get the site prefix. startswith() also
        # copes with an empty link, where indexing [0] raised IndexError.
        if webpageLink.startswith('/'):
            webpageLink = MAIN_URL + webpageLink
        print('Crawling: ', webpageLink)
        pageText, pageLinks = singlePageCrawler.mainFunc(webpageLink)
        # A text of -1 is skipped; mainFunc appears to use it to signal that
        # the page could not be crawled.
        if pageText != -1:
            texts.append(pageText)
            # extend (not append) keeps the queue a flat list of link strings
            # instead of nesting a whole list as a single queue entry.
            links.extend(pageLinks)
            databaseStuff.insert({'url': webpageLink, 'value': pageText}, 'wiki_text')
            databaseStuff.insert({'url': webpageLink, 'value': pageLinks}, 'wiki_links')
        if count == MAX_COUNT:
            break
    return texts, links
def getTheData():
    """Create the tables, crawl the Wikipedia main page and follow its links.

    Returns:
        A ``(texts, links)`` tuple as produced by ``crawlTheLinks``. If the
        main page itself cannot be crawled (``mainFunc`` gives -1 as the
        text), two empty lists are returned so callers can still unpack the
        result.
    """
    databaseStuff.createTables()
    seedUrl = 'https://en.wikipedia.org/wiki/Main_Page'
    text, link = singlePageCrawler.mainFunc(seedUrl)
    if text == -1:
        # Without this guard the function reached its end with texts/links
        # never assigned, which broke the caller's tuple unpacking.
        print('Could not crawl: ', seedUrl)
        return [], []
    return crawlTheLinks(link, text)
def countEachWord(texts):
    """Tally every token of the given texts.

    Each text is split on single space characters only, so consecutive
    spaces produce empty-string tokens, which are tallied like any other.

    Args:
        texts: Iterable of strings.

    Returns:
        A dict mapping each token to the number of times it occurred.
    """
    tally = {}
    for document in texts:
        for token in document.split(' '):
            tally[token] = tally.get(token, 0) + 1
    return tally
def useTheWebsite():
    """Crawl the website, count the collected words and write them out.

    The counts are handed to ``general.writeDictToFile`` with the file name
    ``'wordCount.txt'``.
    """
    crawledTexts, _crawledLinks = getTheData()
    general.writeDictToFile('wordCount.txt', countEachWord(crawledTexts))
def countWordsDB(textTable):
words = {}
for text in textTable:
seperated = text['value'].split(' ')
for word in seperated:
if word in words:
words[word] += 1
else:
words[word] = 1
return words
def useTheDatabase():
    """Count the words of the rows read from 'wiki_text' and write them out.

    The counts are handed to ``general.writeDictToFile`` with the file name
    ``'wordCount.txt'``.
    """
    storedRows = databaseStuff.readTable('wiki_text')
    general.writeDictToFile('wordCount.txt', countWordsDB(storedRows))
if __name__ == '__main__':
    # Run the first mode whose flag appears on the command line; '--website'
    # is checked before '--db', and nothing runs when neither flag is given.
    for flag, runMode in (('--website', useTheWebsite), ('--db', useTheDatabase)):
        if flag in argv:
            runMode()
            break