-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGeneratingCorpus.py
123 lines (105 loc) · 3.81 KB
/
GeneratingCorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Importing required libraries
import os
import re
from bs4 import BeautifulSoup
# Declaring global variables
# Folder names, used relative to the current working directory
INPUT_DIRECTORY = "URL_CONTENTS"
OUTPUT_DIRECTORY = "CORPUS"
# Location of the input folder under the working directory at import time
INPUT_FOLDER = "/".join((os.getcwd(), INPUT_DIRECTORY))
# Function to generate link and content dictionary
def getFileContents():
    """Read every file found in INPUT_FOLDER into a dictionary.

    Returns:
        dict: maps each file name, cut at its first ".", to the complete
        text of that file.
    """
    fileDictionary = {}
    for fileName in os.listdir(INPUT_FOLDER):
        # The key is everything before the first dot ("a.b.txt" -> "a")
        key = fileName.split(".")[0]
        # Open from the same folder that was listed, and let the context
        # manager close the handle as soon as the file has been read
        with open(os.path.join(INPUT_FOLDER, fileName), "r") as handle:
            fileDictionary[key] = handle.read()
    return fileDictionary
# Function to clean the files
def cleanTextFiles():
    """Trim trailing sections and the table of contents from each HTML file.

    For every entry returned by getFileContents():
      * the content is cut just before the "See also" headline, or, when
        that headline is absent, just before the "References" headline;
      * if a table-of-contents div is present, everything from its opening
        tag through the second "</div>" after it is removed.

    Returns:
        dict: the dictionary from getFileContents() with each value
        replaced by its trimmed content.
    """
    seeAlsoMarker = '<span class="mw-headline" id="See_also">'
    referencesMarker = '<span class="mw-headline" id="References">'
    tocMarker = '<div class="toc" id="toc">'
    closingDiv = '</div>'

    fileDictionary = getFileContents()
    for file in fileDictionary:
        fileContent = fileDictionary[file]

        # "See also" takes precedence; "References" is only the fallback
        cutAt = fileContent.find(seeAlsoMarker)
        if cutAt == -1:
            cutAt = fileContent.find(referencesMarker)
        if cutAt != -1:
            fileContent = fileContent[:cutAt]

        tocStart = fileContent.find(tocMarker)
        if tocStart != -1:
            firstClose = fileContent.find(closingDiv, tocStart + 1)
            secondClose = -1
            if firstClose != -1:
                secondClose = fileContent.find(closingDiv, firstClose + 1)
            # Only cut when both closing tags were found; otherwise the
            # slice offsets would be computed from -1 and corrupt the text
            if secondClose != -1:
                # +7 skips the 6-character "</div>" plus one more
                # character (presumably the newline that follows it)
                fileContent = fileContent[:tocStart] + fileContent[secondClose + 7:]

        fileDictionary[file] = fileContent
    return fileDictionary
# Function to check whether the given word/text is float
def isFloat(word):
    """Tell whether word parses as a float once "." and "," are removed.

    All dots and commas are dropped before parsing, so "1,000" and
    "1.2.3" both count as numbers.

    Returns:
        bool: True when float() accepts the stripped text, else False.
    """
    stripped = re.sub(r'[.,]', '', word)
    try:
        float(stripped)
    except ValueError:
        return False
    else:
        return True
# Function to remove punctuations
def removePunctuations(word):
    """Strip one trailing and then one leading "." or "," from word.

    At most one character is removed from each end; an empty or falsy
    word is returned unchanged.
    """
    marks = ('.', ',')
    # Trailing mark first, so a lone "." becomes "" before the second test
    if word and word[-1] in marks:
        word = word[:-1]
    if word and word[0] in marks:
        word = word[1:]
    return word
# Function to extract text
def extractText():
    """Turn the cleaned HTML of every file into a list of word tokens.

    For each entry returned by cleanTextFiles(), the text of the <title>
    tag, of the first <h1> tag and of every <div id="bodyContent"> is
    combined, each character matched by the symbol pattern is replaced by
    a space, and the result is split on whitespace. Tokens accepted by
    isFloat() are kept unchanged; all others go through
    removePunctuations().

    A missing <title> or <h1> contributes an empty string instead of
    raising AttributeError.

    Returns:
        dict: the dictionary from cleanTextFiles() with each HTML string
        replaced by its list of tokens.
    """
    # Compiled once for all files; the raw string matches the same
    # character set as before without relying on invalid string escapes
    symbols = re.compile(r'[_!@\s#$%=+~()}{\][^?&*:;\/|<>"\']')

    texts = cleanTextFiles()
    for text in texts:
        soup = BeautifulSoup(texts[text], "html.parser")

        # find() returns None when the tag does not exist
        titleTag = soup.find('title')
        headerTag = soup.find('h1')
        title = titleTag.text if titleTag is not None else ""
        header = headerTag.text if headerTag is not None else ""

        body = "".join(
            div.text for div in soup.findAll('div', {'id': 'bodyContent'})
        )

        fullContent = title + " " + header + " " + body
        words = symbols.sub(' ', fullContent).split()
        texts[text] = [
            word if isFloat(word) else removePunctuations(word)
            for word in words
        ]
    return texts
# Function to lower-case the extracted words and join them into one text per file
def convertToText():
    """Join the tokens of every file into one lower-case string.

    Each token from extractText() is lower-cased and followed by a single
    space, so a non-empty result ends with a trailing space.

    Returns:
        dict: the dictionary from extractText() with each token list
        replaced by the joined text.
    """
    fileDictionary = extractText()
    for name in fileDictionary:
        tokens = fileDictionary[name]
        fileDictionary[name] = "".join(token.lower() + " " for token in tokens)
    return fileDictionary
# Function to write the content to a file
def writeFile(name, content):
    """Write content to "<name>.txt" inside OUTPUT_DIRECTORY.

    The output directory is created when it does not exist. If the
    directory listing already contains "<name>.txt", the content is
    written to "<name>1.txt" instead.

    Args:
        name: file name without the ".txt" extension.
        content: value to store; converted with str() before writing.
    """
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    filename = name + ".txt"
    if filename in os.listdir(OUTPUT_DIRECTORY):
        # Name already taken: fall back to the "1"-suffixed variant
        filename = name + "1" + ".txt"

    # The context manager closes the handle even if write() raises
    with open(OUTPUT_DIRECTORY + "/" + filename, "w") as handle:
        handle.write(str(content))
# Main function
def main():
    """Convert every input file to text and write each result to a file."""
    for link, text in convertToText().items():
        writeFile(link, text)
# Run the pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()