-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.py
170 lines (144 loc) · 6.78 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import utils
import string
import numpy as np
# DO NOT MODIFY CLASS NAME
class Indexer:
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def __init__(self, config):
self.docs_index = {}
self.inverted_idx = {}
self.postingDict = {}
self.config = config
self.lower_case_words = {}
self.docs_counter = 0
self.total_docs_len = 0
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def add_new_doc(self, document):
"""
This function perform indexing process for a document object.
Saved information is captures via two dictionaries ('inverted index' and 'posting')
:param document: a document need to be indexed.
:return: -
"""
document_dictionary = document.term_doc_dictionary
if not document_dictionary:
return
self.docs_counter += 1
document_dictionary = self.capital_letters(document_dictionary) # get dictionary according to lower and upper
doc_max_term = ''
max_tf = max(list(document_dictionary.values())) # Get the most frequent used value
terms_with_one_occurrence = 0
number_of_curses = 0
for term in document_dictionary:
if document_dictionary[term] == 1:
terms_with_one_occurrence += 1
if term == "*CENSORED*":
number_of_curses += 1
self.docs_index[document.tweet_id] = [0, max_tf, document.doc_length, terms_with_one_occurrence,
number_of_curses, np.zeros(50)]
self.total_docs_len += document.doc_length
# Go over each term in the doc
for term in document_dictionary.keys():
try:
if not doc_max_term and document_dictionary[term] == max_tf:
doc_max_term = term
self.docs_index[document.tweet_id].append(doc_max_term)
# Update inverted index and posting
if term not in self.inverted_idx.keys():
# check if term was already added as upper (and should now be lower)
if term.islower() and term.upper() in self.inverted_idx:
# remove upper term and update it as a lower term
self.inverted_idx[term] = self.inverted_idx[term.upper()]
self.inverted_idx[term]["freq"] += 1
self.inverted_idx[term]["df"] += 1
self.inverted_idx[term]["posting_list"][document.tweet_id] = [
document_dictionary[term] / document.doc_length]
self.inverted_idx.pop(term.upper())
else: # term is not in the dictionary in any form (case)
self.inverted_idx[term] = {"freq": 1,
"df": 1,
"posting_list": {document.tweet_id: [
document_dictionary[term] / document.doc_length]}}
else:
# term is in the inverted_index
self.inverted_idx[term]["freq"] += 1
self.inverted_idx[term]["df"] += 1
self.inverted_idx[term]["posting_list"][document.tweet_id] = [
document_dictionary[term] / document.doc_length]
# self.postingDict[term].append((document.tweet_id, document_dictionary[term]))
except Exception as e:
print(str(e))
print('problem with the following key {}'.format(term[0]))
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def load_index(self, fn):
"""
Loads a pre-computed index (or indices) so we can answer queries.
Input:
fn - file name of pickled index.
"""
return utils.load_objects(fn)[0]
# DO NOT MODIFY THIS SIGNATURE
# You can change the internal implementation as you see fit.
def save_index(self, fn):
"""
Saves a pre-computed index (or indices) so we can save our work.
Input:
fn - file name of pickled index.
"""
index_with_docs_index = [self.inverted_idx, self.docs_index]
utils.save_objects(index_with_docs_index, fn)
# feel free to change the signature and/or implementation of this function
# or drop altogether.
def _is_term_exist(self, term):
"""
Checks if a term exist in the dictionary.
"""
return term in self.inverted_idx
# feel free to change the signature and/or implementation of this function
# or drop altogether.
def get_term_posting_list(self, term):
"""
Return the posting list from the index for a term.
"""
return self.inverted_idx[term]["posting_list"] if self._is_term_exist(term) else []
def capital_letters(self, document_dictionary):
"""
:param document_dictionary: dictionary holding the tokenized term and the amount of its appearances
:return: document_dictionary_new: will hold the same terms as the original only with caps or lower letters
accordingly
"""
document_dictionary_new = {} # dictionary with words saved correctly by capitals
for term in document_dictionary:
if not term:
continue
if term[0] not in string.ascii_letters:
document_dictionary_new[term] = document_dictionary[term]
continue
if term[0].islower():
document_dictionary_new[term] = document_dictionary[term]
else: # term is upper case
lower_term = term.lower()
if lower_term in self.inverted_idx:
document_dictionary_new[lower_term] = document_dictionary[term]
else: # giving it a chance as upper case
document_dictionary_new[term.upper()] = document_dictionary[term]
if not document_dictionary_new:
return document_dictionary
return document_dictionary_new
def get_inverted_index(self):
return self.inverted_idx
def get_docs_index(self):
return self.docs_index
def get_docs_count(self):
return self.docs_counter
def __getitem__(self, term_and_item):
if term_and_item[0] in self.inverted_idx:
return self.inverted_idx[term_and_item[0]][term_and_item[1]]
elif [term_and_item[0]] in self.docs_index:
return self.docs_index[[term_and_item[0]]][term_and_item[1]]
else:
# The item is not in either the idx and docs_index
return None