-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbmr.py
159 lines (123 loc) · 4.97 KB
/
bmr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import json
import os
import nltk
from nltk.stem import PorterStemmer
swl = []
dic = {}
docid = []
i_index = {}
p_index = {}
title = {}
ps = PorterStemmer()
# Read stop word from file and split them
# and store each word in swl list
def readStopWord():
try:
global swl
swl = open('Stopword-List.txt', 'r').read().split()
except Exception as e:
print(e)
# Find the all file name in the directory, extract docid and store it on docid variable
def AllFileInDir():
global docid
# list of file name in the ShortStories dir
file_names = os.listdir("ShortStories/")
# split the name of file to get docid
docid = [int(fn.split('.')[0]) for fn in file_names]
try:
docid.sort() # sort the docids
except Exception as e:
print(e)
# Remove punctuation from stories
def removePunctuation(words):
words = words.replace("n’t", " not").replace("’ll", " will").replace("’m", " am").replace(
"’ve", " have").replace("’re", " are").replace("’d", " had").replace("it’s", "it is").replace(
"he’s", "he is").replace("she’s", "she is").replace("that’s", "that is").replace("who’s", "who is").replace(
"to-morrow", "tomorrow").replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace(
"’", "").replace("—", " ").replace("“", "").replace("”", "").replace("‘", "").replace(
"'", "").replace(",", "").replace("!", "").replace(".", "").replace(":", "").replace(
";", "").replace("?", "").replace("-", " ").replace("*", "")
# .replace("¯", "").replace(
# "ã", "a").replace("ª", "a").replace("©", "c").replace("§", "s").replace("¨", "")
return words
# Reads all stories from files one by one and stemm the tokens
def readFilesAndStemm():
for x in docid:
f = open("ShortStories/"+str(x)+".txt", 'r',
encoding='utf8') # read file one by one
title[x] = f.readline().replace(" \n", "").replace("\n", "")
f.seek(0)
f = f.read()
# casefolding and removing punctuations and tokenize (words list)
f = removePunctuation(f.lower()).split()
# Stemming words and storing list of words in a dictionary agains their doc id
dic[x] = [ps.stem(word) if word not in swl else word for word in f]
# making inverted index and positional index
creatInvertedandPositionalIndex()
# print(dic)
# take word and docid and add them in Inverted Index
def InvertedIndex(word, docid):
if word not in i_index: # if word is not in the inverted index then add it
i_index[word] = []
i_index[word].append(docid)
else: # if doc id is not in the list then append it againt the given word/key
if docid not in i_index[word]:
i_index[word].append(docid)
# take word, docid and their position in document and add them in Inverted Index
def positionIndex(word, docid, position):
if word not in p_index: # if word is not in the positional index then add it also add its doc id
p_index[word] = {}
p_index[word][docid] = []
else: # if doc id is not in the list then append it againt the given word/key
if docid not in p_index[word]:
p_index[word][docid] = []
# if position not in p_index[word][docid]:
p_index[word][docid].append(position)
# Inverted index and Positional index creation
def creatInvertedandPositionalIndex():
for docid in dic.keys():
for position, word in enumerate(dic[docid]):
if word in swl:
continue
InvertedIndex(word, docid)
positionIndex(word, docid, position)
# Write Both indexes to seperatre files
def WriteIndexesToFile():
json_ii = json.dumps(i_index)
json_pi = json.dumps(p_index)
json_title = json.dumps(title)
iiFile = open('InvertedIndex.json', 'w', encoding='utf8')
piFile = open('PositionalIndex.json', 'w', encoding='utf8')
titleFile = open('Titels.json', 'w', encoding='utf8')
iiFile.write(json_ii)
piFile.write(json_pi)
titleFile.write(json_title)
iiFile.close()
piFile.close()
titleFile.close()
# Reading indexes from their respective file and saving them in global dictionaries
def ReadIndexesFromFile():
global i_index
global p_index
global title
try:
iiFile = open('InvertedIndex.json', 'r', encoding='utf8')
piFile = open('PositionalIndex.json', 'r', encoding='utf8')
titleFile = open('Titels.json', 'r', encoding='utf8')
i_index = json.loads(iiFile.read())
p_index = json.loads(piFile.read())
title = json.loads(titleFile.read())
iiFile.close()
piFile.close()
titleFile.close()
if (not i_index) or (not p_index):
readFilesAndStemm()
WriteIndexesToFile()
except Exception as e:
print(e)
readFilesAndStemm()
WriteIndexesToFile()
def main():
readStopWord()
AllFileInDir()
ReadIndexesFromFile()