-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSummarize.py
70 lines (55 loc) · 2.13 KB
/
Summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# get word frequency table from list of words
def getWordFreqTable(words):
    """Build a frequency table of stemmed, non-stop-word tokens.

    Args:
        words: Iterable of string tokens (``summarize`` passes the output of
            ``word_tokenize``).

    Returns:
        dict mapping each stemmed token to the number of times it occurred.
        English stop words and tokens without any letter or digit are left
        out.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    table = dict()
    for word in words:
        # Punctuation-only tokens ("." or ",") are not words; counting them
        # would give them some of the highest frequencies in the table.
        if not any(ch.isalnum() for ch in word):
            continue
        # Test the lowercased surface form *before* stemming: the stop-word
        # list holds unstemmed lowercase words, so stems such as "thi"
        # (from "this") or "wa" (from "was") would otherwise slip through.
        if word.lower() in stop_words:
            continue
        stem = stemmer.stem(word)
        # Keep the post-stemming check too, so anything that was filtered
        # before this change is still filtered.
        if stem in stop_words:
            continue
        table[stem] = table.get(stem, 0) + 1
    return table
# get sentence score table
def getSentenceScoreTable(sentences, wordFreqTable, character_depth=10):
    """Score each sentence by the frequencies of the table words it contains.

    A sentence's score is the sum of the ``wordFreqTable`` values of every
    table key that occurs as a substring of the lowercased sentence,
    floor-divided by the number of tokens in the sentence.

    Args:
        sentences: Iterable of sentence strings.
        wordFreqTable: dict mapping word -> frequency.
        character_depth: Number of leading characters of a sentence used as
            its key in the returned table.

    Returns:
        dict mapping sentence prefix (``sentence[:character_depth]``) to an
        integer score. Sentences without any token are not entered.
    """
    table = dict()
    for sentence in sentences:
        num_words = len(word_tokenize(sentence))
        if num_words == 0:
            # Nothing to score, and the normalisation below would divide
            # by zero.
            continue
        key = sentence[:character_depth]
        # Lowercase once per sentence instead of once per table word.
        lowered = sentence.lower()
        matched = sum(
            freq for word, freq in wordFreqTable.items() if word in lowered
        )
        # ``get(key, 0)`` gives a sentence that contains no table word a
        # score of 0 instead of raising KeyError at the normalisation step.
        # Sentences sharing the same prefix still accumulate into one entry,
        # exactly as before, so existing keys and scores are unchanged.
        table[key] = (table.get(key, 0) + matched) // num_words
    return table
# get average sentence score
def getAverageScore(sentenceScoreTable):
    """Return the mean of the sentence scores, truncated to an int.

    Args:
        sentenceScoreTable: dict mapping sentence key -> numeric score.

    Returns:
        ``int(sum of scores / number of scores)``, or 0 when the table is
        empty (previously this raised ZeroDivisionError).
    """
    if not sentenceScoreTable:
        # No sentences were scored; there is no meaningful average.
        return 0
    total = sum(sentenceScoreTable.values())
    return int(total / len(sentenceScoreTable))
# get summary from sentences
def getSummary(sentences, sentenceScoreTable, threshold, character_depth=10):
    """Concatenate the sentences whose score is strictly above ``threshold``.

    Args:
        sentences: Iterable of sentence strings, in document order.
        sentenceScoreTable: dict mapping sentence prefix -> score.
        threshold: Score a sentence must exceed to be included.
        character_depth: Number of leading characters of a sentence used to
            look it up in ``sentenceScoreTable``.

    Returns:
        The selected sentences in their original order, each followed by a
        single space (so a non-empty summary ends with a space); an empty
        string when no sentence qualifies.
    """
    selected = []
    for sentence in sentences:
        key = sentence[:character_depth]
        if key in sentenceScoreTable and sentenceScoreTable[key] > threshold:
            # The trailing space after every sentence is kept so the output
            # stays identical to what callers already receive.
            selected.append(sentence + " ")
    return "".join(selected)
# summarize text
def summarize(text, thresholdScale=1.1):
    """Produce an extractive summary of ``text``.

    The text is split into words and sentences, each sentence is scored from
    the word-frequency table, and the sentences scoring strictly above
    ``average score * thresholdScale`` are joined into the summary.

    Args:
        text: The text to summarize.
        thresholdScale: Multiplier applied to the average sentence score to
            obtain the inclusion threshold.

    Returns:
        The summary string, or an empty string when ``text`` is empty or
        whitespace-only or no sentence could be scored.
    """
    # Empty input has no sentences; without this guard the average-score
    # step would divide by zero.
    if not text or not text.strip():
        return ""
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    wordFreqTable = getWordFreqTable(words)
    sentenceScores = getSentenceScoreTable(sentences, wordFreqTable)
    if not sentenceScores:
        # Nothing was scored, so nothing can exceed a threshold.
        return ""
    threshold = getAverageScore(sentenceScores) * thresholdScale
    return getSummary(sentences, sentenceScores, threshold)