Autofill.py
# -*- coding: utf-8 -*-
"""
@authors: Alaa Farouk - Mariam Makram
"""
import re
ngramsNum = 3       # default n-gram order (not used; n is derived from the query length)
ngrams_list = {}    # n-gram -> occurrence count
probabilities = {}  # n-gram -> relative frequency among all generated n-grams
count = 0           # total number of n-grams in the corpus (set by generateNGrams)
nPredictions = 5    # maximum number of suggestions to print
options = []        # suggested next words collected by getPredictions
def prepareData():
    with open("dataset.txt", "r", encoding="UTF-8") as file:
        dataset = file.read()
    return dataset
# preparing data for generating ngrams
def tokenizeText(text):
    text = text.lower()
    # keep only whitespace, English letters, digits and Arabic letters
    text = re.sub(r'[^\sa-zA-Z0-9ء-ي]', '', text)
    return text.split()
def calculateProb(sentence, counter):
    # update this n-gram's count; the last update for a given n-gram leaves its final
    # probability: its occurrences divided by the total number of n-grams (counter)
    if sentence not in ngrams_list:
        ngrams_list[sentence] = 1
    else:
        ngrams_list[sentence] += 1
    probabilities[sentence] = ngrams_list[sentence] / counter
def generateNGrams(words_list, n):
    # slide a window of n words over the corpus; count holds the total number of n-grams
    global count
    count = len(words_list) - n + 1
    for num in range(0, count):
        sentence = ' '.join(words_list[num:num + n])
        calculateProb(sentence, count)
def splitSequence(seq):
    return seq.split(" ")
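# For a query of k words, every stored (k + 1)-gram whose first k words equal the query
# is a candidate; its last word is offered as a suggestion, ranked by n-gram probability.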
def getPredictions(sequence):
    predicted = []
    nPred = nPredictions
    inputSequence = splitSequence(sequence)
    for sentence in probabilities.keys():
        # keep only the stored n-grams that start with exactly the words of the query
        if sequence in sentence:
            outputSequence = splitSequence(sentence)
            cont = False
            for i in range(0, len(inputSequence)):
                if outputSequence[i] != inputSequence[i]:
                    cont = True
                    break
            if cont:
                continue
            predicted.append((sentence, probabilities[sentence]))
    # rank the matching n-grams by probability, highest first
    predicted.sort(key=lambda x: x[1], reverse=True)
    noPrediction = False
    if len(predicted) == 0:
        print("No predicted words")
        noPrediction = True
    else:
        if len(predicted) < nPredictions:
            nPred = len(predicted)
        for i in range(0, nPred):
            # the suggestion is the word that follows the query inside the n-gram
            outputSequence = predicted[i][0].split(" ")
            print(outputSequence[len(inputSequence)])
            options.append(outputSequence[len(inputSequence)])
    return options, noPrediction, nPred
if __name__ == "__main__":
    dataset = prepareData()
    words = tokenizeText(dataset)
    seq = input("Enter search words: ")
    generateNGrams(words, len(splitSequence(seq)) + 1)
    getPredictions(seq.lower())
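# Example run (illustrative; the suggestions depend entirely on dataset.txt):
#   Enter search words: <some phrase>
#   ...up to nPredictions candidate next words are printed, most probable first.
# getPredictions also returns (options, noPrediction, nPred), so the suggested words
# can be reused programmatically instead of only being printed.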