-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
executable file
·135 lines (123 loc) · 4.44 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from score import *
from math import log, sqrt
# Weights combined in findRocchioScore.
# ALPHA scales the term's frequency in the original query.
ALPHA = 1
# BETA scales the term's count in the relevant documents (added to the score).
BETA = 0.75
# GAMMA scales the term's count in the non-relevant documents (subtracted).
GAMMA = 0.15
def generateInvertedIndex():
    """Build an inverted index from ``./data/doc.txt``.

    Every non-blank line of the file is expected to contain a document id,
    a tab, and the whitespace-separated text of that document.

    Returns:
        A dict mapping each term to a dict ``{doc_id: count}``, where
        ``count`` is how many times the term occurs in that document.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line has no tab separator.
    """
    invertedIndex = {}
    # The context manager closes the file even when a line fails to parse.
    with open('./data/doc.txt') as doc_file:
        for line in doc_file:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) holds no document.
                continue
            # Split on the first tab only: any later tab belongs to the text
            # and is handled as ordinary whitespace by text.split().
            doc_id, text = line.split('\t', 1)
            for word in text.split():
                postings = invertedIndex.setdefault(word, {})
                postings[doc_id] = postings.get(doc_id, 0) + 1
    return invertedIndex
def queryFrequency(query, invertedIndex):
    """Return a term -> frequency dict for *query* over the index vocabulary.

    Args:
        query: whitespace-separated query string.
        invertedIndex: iterable of index terms (iterated for its keys).

    Returns:
        A dict holding the occurrence count of every query token, plus an
        entry of 0 for every index term that does not occur in the query.
    """
    counts = {}
    for token in query.split():
        counts[token] = counts.get(token, 0) + 1
    # Pad with zeros so every index term has an entry.
    for indexedTerm in invertedIndex:
        counts.setdefault(indexedTerm, 0)
    return counts
def calculateDocsCount(doc, docIndex):
    """Add the term counts of document *doc* to *docIndex*.

    Scans ``./data/doc.txt`` (one ``doc_id<TAB>text`` record per non-blank
    line) and, for every line whose id equals *doc*, increments the count of
    each whitespace-separated term of its text.

    Args:
        doc: id of the document to count.
        docIndex: dict of term -> count; updated in place.

    Returns:
        The same *docIndex* object that was passed in.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line has no tab separator.
    """
    # The context manager closes the file even when a line fails to parse.
    with open('./data/doc.txt') as doc_file:
        for line in doc_file:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) holds no document.
                continue
            # Split on the first tab only; later tabs are part of the text.
            doc_id, text = line.split('\t', 1)
            if doc_id != doc:
                continue
            for term in text.split():
                docIndex[term] = docIndex.get(term, 0) + 1
    return docIndex
def findDocs(k, sortedBM25Score, invertedIndex, relevancy):
    """Aggregate term counts over the relevant or non-relevant documents.

    The first *k* entries of *sortedBM25Score* are treated as relevant and
    all remaining entries (from index *k* onward) as non-relevant.

    Args:
        k: number of top-ranked documents regarded as relevant.
        sortedBM25Score: sequence of ``(doc_id, score)`` pairs in ranked order.
        invertedIndex: iterable of index terms; each term missing from the
            aggregated counts is added with a count of 0.
        relevancy: ``"Relevant"`` or ``"Non-Relevant"``.

    Returns:
        A dict of term -> summed count over the selected documents, or
        ``None`` when *relevancy* is neither of the two accepted values.
    """
    # Slicing clamps to the sequence bounds, so a k larger than the ranking
    # no longer raises IndexError; negative k means "no relevant documents".
    cut = max(k, 0)
    if relevancy == "Relevant":
        selected = sortedBM25Score[:cut]
    elif relevancy == "Non-Relevant":
        # Start at index k (not k + 1) so the document right after the
        # relevant set is not silently dropped from both sets.
        selected = sortedBM25Score[cut:]
    else:
        return None

    termCounts = {}
    for doc, _score in selected:
        termCounts = calculateDocsCount(doc, termCounts)
    # Give every index term an entry so later lookups never miss.
    for term in invertedIndex:
        termCounts.setdefault(term, 0)
    return termCounts
def findRelDocMagnitude(docIndex):
    """Return the Euclidean length of the count vector stored in *docIndex*.

    Args:
        docIndex: dict of term -> numeric count.

    Returns:
        The square root of the sum of squared counts, as a float
        (0.0 for an empty dict).
    """
    squaredTotal = 0
    for count in docIndex.values():
        squaredTotal += float(count ** 2)
    return float(sqrt(squaredTotal))
def findNonRelDocMagnitude(docIndex):
    """Return the Euclidean length of the count vector stored in *docIndex*.

    Args:
        docIndex: dict of term -> numeric count.

    Returns:
        The square root of the sum of squared counts, as a float
        (0.0 for an empty dict).
    """
    squares = [float(docIndex[term] ** 2) for term in docIndex]
    # Accumulate left to right so rounding matches a plain running sum.
    total = 0
    for value in squares:
        total += value
    return float(sqrt(total))
def findRocchioScore(term, queryFreq, relDocMag, relIndex, nonRelMag, nonRelIndex):
    """Compute the Rocchio weight of *term*.

    The score is
    ``ALPHA * queryFreq[term] + (BETA / relDocMag) * relIndex[term]
    - (GAMMA / nonRelMag) * nonRelIndex[term]``.

    Args:
        term: the term to score; must be a key of all three dicts.
        queryFreq: dict of term -> frequency in the query.
        relDocMag: magnitude used to normalise the relevant counts.
        relIndex: dict of term -> count in the relevant documents.
        nonRelMag: magnitude used to normalise the non-relevant counts.
        nonRelIndex: dict of term -> count in the non-relevant documents.

    Returns:
        The numeric Rocchio score of *term*.

    Raises:
        KeyError: if *term* is missing from any of the three dicts.
    """
    queryPart = ALPHA * queryFreq[term]
    # A zero magnitude means that document set contributes nothing; skip the
    # division instead of raising ZeroDivisionError.
    relPart = (BETA / relDocMag) * relIndex[term] if relDocMag else 0.0
    nonRelPart = (GAMMA / nonRelMag) * nonRelIndex[term] if nonRelMag else 0.0
    return queryPart + relPart - nonRelPart
def findNewQuery(query, k, sortedBM25Score, invertedIndex, expansionTerms=5):
    """Expand *query* with the highest-scoring Rocchio terms.

    Scores every term of *invertedIndex* with findRocchioScore, treating the
    top *k* entries of *sortedBM25Score* as relevant and the rest as
    non-relevant, then appends the best-scoring terms that are not already
    query tokens.

    Args:
        query: whitespace-separated query string.
        k: number of top-ranked documents regarded as relevant.
        sortedBM25Score: sequence of ``(doc_id, score)`` pairs in ranked order.
        invertedIndex: dict keyed by index term.
        expansionTerms: how many top-scoring terms to consider (default 5).

    Returns:
        The original query followed by the selected new terms, separated by
        single spaces.
    """
    queryFreq = queryFrequency(query, invertedIndex)
    relIndex = findDocs(k, sortedBM25Score, invertedIndex, "Relevant")
    relDocMag = findRelDocMagnitude(relIndex)
    nonRelIndex = findDocs(k, sortedBM25Score, invertedIndex, "Non-Relevant")
    nonRelMag = findNonRelDocMagnitude(nonRelIndex)

    updatedQuery = {
        term: findRocchioScore(term, queryFreq, relDocMag, relIndex,
                               nonRelMag, nonRelIndex)
        for term in invertedIndex
    }
    # Slicing copes with fewer than expansionTerms candidates.
    topTerms = sorted(updatedQuery.items(), key=lambda x: x[1],
                      reverse=True)[:max(expansionTerms, 0)]

    # Compare against whole query tokens: a substring test on the raw string
    # would wrongly treat "cat" as already present in "category".
    queryTokens = set(query.split())
    additions = [term for term, _score in topTerms if term not in queryTokens]
    return " ".join([query] + additions)
#invertedIndex = generateInvertedIndex()
#print(invertedIndex)
def getReduceIndex(query, invertedIndex):
    """Restrict *invertedIndex* to the terms that occur in *query*.

    Args:
        query: whitespace-separated query string.
        invertedIndex: dict of term -> postings.

    Returns:
        A dict with one entry per distinct query term, in order of first
        occurrence. Terms present in *invertedIndex* map to their existing
        postings object; absent terms map to a new empty dict.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    distinctTerms = dict.fromkeys(query.split())
    return {
        term: invertedIndex[term] if term in invertedIndex else {}
        for term in distinctTerms
    }