# -*- coding: utf-8 -*-
# tfidf_vector_extraction.py (formerly TFIDF_final_scores_newlaptop)
# Same pipeline as the previous version, but reuses already-extracted information
# to build the word vectors used for clustering.
# =============================================================================
# Required inputs:
#   - a folder of report .docx files
#   - a folder of dataset .csv files
#   - the SQL correlation table (corr), already created
# Outputs:
#   - tf-idf score for each word within each document
#   - a bag-of-words ranking for each document
#   - both written directly to SQL tables
# =============================================================================
# Uses a lemmatizer with a manually implemented tf-idf, then keeps the 15 most
# common words after discarding tf-idf scores below 0.0019.
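#
# The TF-IDF scores computed below follow the standard definition (matching the helper
# functions in this file): tf(w, d) = count(w, d) / len(d); idf(w) = log10(N / df(w)),
# where N is the number of documents and df(w) the number of documents containing w;
# and tfidf(w, d) = tf(w, d) * idf(w).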
# importing useful libraries
import mysql.connector # connect to database
import nltk # preprocessing text
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter, defaultdict
import numpy as np  # numerical operations
import docxpy  # text extraction from .docx files
import math
from utils.utils_ml import id_to_name
from pathlib import Path
import argparse
from utils.feature_extraction_utils import delete_table
def compute_tf(wordDict: dict, words_number: int):
    """
    Create a dictionary of TF scores for one document.
    @param wordDict: dictionary mapping each word to its number of appearances within the current document
    @param words_number: total number of words in the document after lemmatization and stop-word removal
    @return: dictionary of TF scores for the document
    """
    tf_dict = {}
    for word, count in wordDict.items():
        # for each word in the document, the TF score is its count divided by the document length
        tf_dict[word] = count / float(words_number)
    return tf_dict
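
# A toy sanity check (hypothetical counts, not taken from the real reports):
# compute_tf({'flow': 3, 'pump': 1}, 4) -> {'flow': 0.75, 'pump': 0.25}
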
def compute_idf(documents_list):
    """
    Generate the IDF dictionary for the list of existing documents.
    @param documents_list: list of dictionaries, one per document, each mapping word -> number of appearances
    @return idf_dict: dictionary of IDF scores for each word in the corpus
    """
    n = len(documents_list)  # number of documents in the corpus <=> number of dictionaries in the list
    idf_dict = dict.fromkeys(documents_list[0].keys(), 0)  # initialise each word's document count to 0
    for document in documents_list:
        for word, number_of_appearances in document.items():
            if number_of_appearances > 0:
                idf_dict[word] += 1  # count one more document containing the word
    for word, number_of_documents in idf_dict.items():
        # for each word in the corpus, the IDF score is log10(total documents / documents containing the word)
        idf_dict[word] = math.log10(n / float(number_of_documents))
    return idf_dict
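
# A toy sanity check on a two-document corpus (hypothetical counts): 'pump' appears in
# both documents and 'flow' in only one, so
# compute_idf([{'flow': 3, 'pump': 1}, {'flow': 0, 'pump': 2}])
#   -> {'flow': log10(2/1) ~ 0.301, 'pump': log10(2/2) = 0.0}
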
def compute_tf_idf(tf_dict: dict, idf_dict: dict):
    """
    Create the dictionary of TF-IDF scores for each word in the current document.
    @param tf_dict: TF dictionary for the current document
    @param idf_dict: IDF dictionary for the whole corpus
    @return tf_idf: dictionary of TF-IDF scores for each word in the current document
    """
    tf_idf = {}
    for word, tf_word_score in tf_dict.items():
        tf_idf[word] = tf_word_score * idf_dict[word]
    return tf_idf
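
# Continuing the toy example above: combining the first document's TF scores with the
# corpus IDF scores gives
# compute_tf_idf({'flow': 0.75, 'pump': 0.25}, {'flow': 0.301, 'pump': 0.0})
#   -> {'flow': ~0.226, 'pump': 0.0}
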
def word_lemmatization(word: str, word_pos, lemmatizer):
    """
    Take a word and its part of speech and return the lemmatized version of the word.
    @param word: word to be lemmatized
    @param word_pos: part-of-speech tag of the given word
    @param lemmatizer: WordNetLemmatizer instance used for lemmatization
    @return: lemmatized version of the given word
    """
    # map the Penn Treebank tag to the corresponding WordNet part of speech
    if 'NN' in word_pos:
        x = wordnet.NOUN
    elif 'VB' in word_pos:
        x = wordnet.VERB
    elif 'JJ' in word_pos:
        x = wordnet.ADJ
    else:
        x = wordnet.ADV
    return lemmatizer.lemmatize(word, x)
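
# Illustrative calls (assuming the NLTK WordNet data has been downloaded):
# word_lemmatization('pumps', 'NNS', WordNetLemmatizer()) -> 'pump'
# word_lemmatization('running', 'VBG', WordNetLemmatizer()) -> 'run'
# Any tag that is not NN*/VB*/JJ* falls through to the adverb case.
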
def text_preprocessing(file_path: Path):
    """
    Extract the text of a report, pre-process it and add a part-of-speech tag to each non-stop word.
    @param file_path: file location
    @return: list of (word, POS) tuples
    """
    text = docxpy.process(str(file_path))
    file_words = [w for w in word_tokenize(text.lower()) if w.isalpha()]  # tokenize and discard non-words
    no_stops = [w for w in file_words if w not in stopwords.words('english')]  # discard stopwords
    vec = nltk.pos_tag(no_stops)  # add part-of-speech tagging
    return vec
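
# For a report containing the sentence "The pumps were running continuously", the result
# would look roughly like [('pumps', 'NNS'), ('running', 'VBG'), ('continuously', 'RB')]:
# the text is lower-cased, punctuation and stopwords ('the', 'were') are dropped, and the
# remaining tokens are POS-tagged (exact tags depend on the NLTK tagger model).
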
def create_corpus(report_names: list, lemmatizer, report_folder_location: Path):
    """
    Generate the lemmatized word lists and the corpus-wide word set for all documents.
    @param report_names: list of report names
    @param lemmatizer: WordNetLemmatizer used to lemmatize all words in all documents
    based on their part of speech (POS)
    @param report_folder_location: folder containing the report .docx files
    @return number_of_words_per_document: list of the number of lemmatized words in each doc
    @return new: list of strings, each containing one document's lemmatized words joined by spaces
    @return word_set: set of representative words for all documents
    @return lemmatized_document_words: dictionary of lists of lemmatized words for each doc
    """
    number_of_words_per_document = []
    new = []
    word_set = []
    word_and_pos = defaultdict()  # dictionary of lists of (word, POS) tuples for each document
    lemmatized_document_words = defaultdict(list)
    for report_name in report_names:
        report_path = report_folder_location / report_name[0]
        word_and_pos[report_name] = text_preprocessing(report_path)
        for word, pos in word_and_pos[report_name]:
            lemmatized_document_words[report_name].append(word_lemmatization(word, pos, lemmatizer))
        number_of_words_per_document.append(len(lemmatized_document_words[report_name]))
        # create a new string containing all lemmatized, preprocessed words separated by spaces
        string_of_lemmatized_words = ' '.join(lemmatized_document_words[report_name])
        new.append(string_of_lemmatized_words)  # append the joined string to the list covering all documents
        # add the document's unique lemmatized words to the corpus-wide word set
        word_set = set(word_set).union(set(lemmatized_document_words[report_name]))
    return number_of_words_per_document, new, word_set, lemmatized_document_words
def create_document_wordsets(report_names: list,
                             word_set: set,
                             lemmatized_document_words: dict):
    """
    Count how many times each word of the corpus word set appears in each document.
    @param report_names: list of report names
    @param word_set: set of most relevant words describing all documents
    @param lemmatized_document_words: dictionary of lists of lemmatized words for each doc
    @return document_word_vectors: list of word-count dictionaries, one per document, over the word set
    """
    document_word_vectors = []  # list of the documents' word vectors
    for report_name in report_names:
        # create a dictionary with 0 for each word in the corpus; this will be the document's word vector
        corpus_dictionary_for_report = dict.fromkeys(word_set, 0)
        for word in lemmatized_document_words[report_name]:
            corpus_dictionary_for_report[word] += 1
        # add the document's word vector (actually a dictionary) to the list of vectors
        document_word_vectors.append(corpus_dictionary_for_report)
    return document_word_vectors
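
# For example, if word_set were {'flow', 'pump', 'valve'} and one report's lemmatized words
# were ['pump', 'pump', 'flow'], that report's entry in document_word_vectors would be
# {'flow': 1, 'pump': 2, 'valve': 0} (toy values for illustration).
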
def create_tf_idf_document_vectors(document_word_vectors: list,
                                   report_names: list,
                                   number_of_words_per_document: list):
    """
    Take each document's word-count vector over the word set and calculate the TF-IDF
    word vector for each document.
    @param document_word_vectors: list of word-count dictionaries over the word set
    @param report_names: list of report names
    @param number_of_words_per_document: list of the number of relevant words per document
    @return: list of TF-IDF document vectors
    """
    length = len(report_names)  # number of documents
    idf_dict = compute_idf(document_word_vectors)  # obtain the corpus IDF dictionary
    list_tf_idf = []  # list of TF-IDF vectors, one per document
    for i in range(length):
        # loop through documents
        tf_dict = compute_tf(document_word_vectors[i], number_of_words_per_document[i])
        list_tf_idf.append(compute_tf_idf(tf_dict, idf_dict))  # append the document's TF-IDF scores
    return list_tf_idf
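
# On the toy two-document corpus used above, the first document's entry in list_tf_idf
# would be {'flow': ~0.226, 'pump': 0.0}: the IDF dictionary is computed once over the
# corpus and then combined with each document's TF scores.
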
def filter_tf_idf_words(report_names, lemmatized_document_words, final, mycursor, cnx):
    """
    Store the lemmatized representative words for each document before and after
    filtering by TF-IDF score. This is used to print the representative words in
    the NLP section (Tables 3 and 4).
    @param report_names: list of report names
    @param lemmatized_document_words: dictionary of lists of lemmatized words
    @param final: pandas DataFrame of TF-IDF word vectors indexed by report name
    @param mycursor: MySQL cursor used to execute the queries
    @param cnx: MySQL connection used to commit the inserts
    """
    create_table_command = "CREATE TABLE wordrep300 (MeasID varchar(255), Word varchar(255), NoApp int, " \
                           "TFIDF float(8), count int)"
    try:
        print("Table wordrep300 is being created")
        mycursor.execute(create_table_command)
    except mysql.connector.errors.ProgrammingError:
        print("Table wordrep300 already created. Data will be replaced")
        delete_table('wordrep300', cnx)
        mycursor.execute(create_table_command)
    create_table_command = "CREATE TABLE wordrep400 (MeasID varchar(255), Word varchar(255), NoApp int, " \
                           "TFIDF float(8), count int)"
    try:
        print("Table wordrep400 is being created")
        mycursor.execute(create_table_command)
    except mysql.connector.errors.ProgrammingError:
        print("Table wordrep400 already created. Data will be replaced")
        delete_table('wordrep400', cnx)
        mycursor.execute(create_table_command)
    id2name = id_to_name(mycursor)
    name2id = {v: k for k, v in id2name.items()}
    for idx, report_name in enumerate(report_names):
        full_word_vector = []  # new list of words with any tf-idf score
        filtered_word_vector = []  # new list of words with tf-idf > 0.0019
        # create a new list of words for each document that keeps only words with tf-idf > 0.0019
        for lemmatized_word in lemmatized_document_words[report_name]:
            if final[lemmatized_word].iloc[idx] > 0.0019:
                filtered_word_vector.append(lemmatized_word)
            full_word_vector.append(lemmatized_word)
        full_word_vector = np.asarray(full_word_vector)  # transform list to array
        filtered_word_vector = np.asarray(filtered_word_vector)  # transform list to array
        # keep only the 15 most common words with tf-idf > 0.0019
        filtered_word_vector_15 = Counter(filtered_word_vector).most_common(15)
        # keep only the 15 most common words regardless of tf-idf score
        full_word_vector_15 = Counter(full_word_vector).most_common(15)
        report_id = name2id[report_name[0]]
        add_to_sql_command = "INSERT INTO wordrep300 (MeasID, Word, NoApp, TFIDF, count) VALUES " \
                             "(%s, %s, %s, %s, %s)"
        word_counter = 0
        for word, count in filtered_word_vector_15:
            word_vector = [report_id, str(word), count, float(final.loc[report_name[0]][word]), word_counter]
            mycursor.execute(add_to_sql_command, word_vector)
            cnx.commit()
            word_counter += 1
        add_to_sql_command = "INSERT INTO wordrep400 (MeasID, Word, NoApp, TFIDF, count) VALUES " \
                             "(%s, %s, %s, %s, %s)"
        word_counter = 0
        for word, count in full_word_vector_15:
            word_vector = [report_id, str(word), count, float(final.loc[report_name[0]][word]), word_counter]
            mycursor.execute(add_to_sql_command, word_vector)
            cnx.commit()
            word_counter += 1
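
# Each inserted row holds (MeasID, Word, NoApp, TFIDF, count); for instance a row such as
# ('3', 'pump', 12, 0.0041, 0) would mean that 'pump' is the most common surviving word of
# report 3, appearing 12 times with a TF-IDF score of 0.0041 (values here are hypothetical).
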
def save_to_sql(final2, mycursor, report_names, cnx):
    """
    Create an SQL table of the TF-IDF scores: one column per word in the corpus word set
    (prefixed with '_') plus a measID column containing the name of the report.
    """
    number_of_columns = len(final2.columns)  # one SQL column per word in the corpus word set
    addf = "CREATE TABLE tfidfpd2 (measID VARCHAR(255)"
    for ii in range(number_of_columns):
        addf = addf + ",_" + final2.columns[ii] + " Float(15,14)"
    addf = addf + ")"
    try:
        print("Table tfidfpd2 is being created")
        mycursor.execute(addf)
    except mysql.connector.errors.ProgrammingError:
        print("Table tfidfpd2 already created. Data will be replaced")
        delete_table('tfidfpd2', cnx)
        mycursor.execute(addf)
    # build the INSERT statement with one placeholder per column
    addf2 = "INSERT INTO tfidfpd2 (measID"
    for ii in range(number_of_columns):
        addf2 = addf2 + ",_" + final2.columns[ii]
    addf2 = addf2 + ") VALUES (%s"
    for ii in range(number_of_columns):
        addf2 = addf2 + ",%s"
    addf2 = addf2 + ")"
    # insert the TF-IDF scores of each report into the newly created table
    col = final2.columns
    length = len(report_names)
    for ii in range(length):
        listToAdd = [final2.index[ii]]
        for el in col:
            listToAdd.append(float(final2[el].iloc[ii]))
        mycursor.execute(addf2, listToAdd)
        cnx.commit()
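
# For a hypothetical corpus whose word set is just {'flow', 'pump'}, the generated
# statements would look like:
#   CREATE TABLE tfidfpd2 (measID VARCHAR(255),_flow Float(15,14),_pump Float(15,14))
#   INSERT INTO tfidfpd2 (measID,_flow,_pump) VALUES (%s,%s,%s)
# with one row of TF-IDF scores inserted per report.
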
def main():
    lemmatizer = WordNetLemmatizer()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sql_password",
        type=str,
    )
    parser.add_argument(
        "--report_folder_location",
        type=Path,
        default='C:/Users/oncescu/OneDrive - Nexus365/Reports',
    )
    parser.add_argument(
        "--database_name",
        type=str,
        default='final',
    )
    args = parser.parse_args()
    cnx = mysql.connector.connect(user='root', password=args.sql_password,
                                  host='127.0.0.1',
                                  database=args.database_name)
    mycursor = cnx.cursor()
    # select the report names from the correlation table, which holds the report number,
    # title and the name of the associated data file
    sql = "select nameRep from corr"
    mycursor.execute(sql)
    report_names = list(mycursor.fetchall())
    number_of_words_per_document, _, word_set, lemmatized_document_words = \
        create_corpus(report_names, lemmatizer, args.report_folder_location)
    document_word_vectors = create_document_wordsets(report_names, word_set, lemmatized_document_words)
    list_tf_idf = create_tf_idf_document_vectors(document_word_vectors, report_names, number_of_words_per_document)
    indexname = []
    for report_name in report_names:
        indexname.append(report_name[0])
    # create the data frame of TF-IDF vectors indexed by report name
    final = pd.DataFrame(list_tf_idf, index=indexname)
    save_to_sql(final, mycursor, report_names, cnx)
    filter_tf_idf_words(report_names, lemmatized_document_words, final, mycursor, cnx)


if __name__ == '__main__':
    main()