-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspacetest.py
73 lines (55 loc) · 2.33 KB
/
spacetest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from scrapers import *
def findfrequency(document, stopwords):
    """Count how often each meaningful word occurs in *document*.

    Tokens are normalised to stripped lowercase before counting so that the
    resulting keys line up with the lowercase lookups done when sentences are
    scored, and so that capitalised stop words ("The") are filtered too.

    Args:
        document: Iterable of token-like objects exposing a ``text`` string
            attribute (for example a spaCy ``Doc``).
        stopwords: Iterable of words to ignore; compared case-insensitively.

    Returns:
        A ``dict`` mapping each lowercase word to its occurrence count.
        Stop words, whitespace-only tokens and tokens made up solely of
        punctuation characters are excluded.
    """
    # Build the lookup set once: membership tests on a list are linear.
    stopword_set = {stopword.lower() for stopword in stopwords}

    word_frequencies = {}
    for word in document:
        token = word.text.strip().lower()
        if token in stopword_set:
            continue
        # all() is True for the empty string, so this also drops tokens that
        # were pure whitespace before stripping, alongside "...", "--", etc.
        if all(char in punctuation for char in token):
            continue
        word_frequencies[token] = word_frequencies.get(token, 0) + 1
    return word_frequencies
def sentencescoring(sentence_list, word_frequencies, max_words=30):
    """Score each sentence by summing the frequencies of the words it contains.

    Args:
        sentence_list: Iterable of sentence-like objects. Each must be
            hashable, iterable over tokens exposing ``text``, and expose its
            own ``text`` string (for example spaCy ``Span`` objects).
        word_frequencies: Mapping from lowercase word to its weight.
        max_words: Sentences whose text splits on single spaces into this
            many pieces or more are ignored. Defaults to 30.

    Returns:
        A ``dict`` mapping each qualifying sentence to its accumulated score.
        Sentences with no word present in *word_frequencies* get no entry.
    """
    sentence_scores = {}
    for sent in sentence_list:
        # The length check depends only on the sentence, so do it once per
        # sentence instead of once per matching word.
        if len(sent.text.split(' ')) >= max_words:
            continue
        for word in sent:
            weight = word_frequencies.get(word.text.lower())
            if weight is None:
                continue
            sentence_scores[sent] = sentence_scores.get(sent, 0) + weight
    return sentence_scores
def summary(url, num_sentences=7):
    """Build an extractive summary of the article found at *url*.

    The article text is tokenised with spaCy, word frequencies are computed
    and normalised by the most frequent word, every sentence is scored by the
    summed weights of its words, and the highest-scoring sentences are joined
    into one string.

    Args:
        url: Passed unchanged to ``article_scraper`` (provided by the
            ``scrapers`` wildcard import); presumably the address of the
            article to summarise -- verify against that helper.
        num_sentences: Maximum number of sentences to keep. Defaults to 7.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. An empty string is returned when the article yields
        no countable words.
    """
    article_text = article_scraper(url)

    # NOTE(review): the model is loaded on every call; consider caching it if
    # this function is invoked repeatedly.
    nlp = spacy.load("en_core_web_sm")
    docx = nlp(article_text)

    # STOP_WORDS is passed as-is; copying it into a list would only slow down
    # the membership tests.
    word_frequencies = findfrequency(docx, STOP_WORDS)
    if not word_frequencies:
        # Nothing to rank; max() below would raise ValueError on an empty table.
        return ''

    # Normalise every count by the most frequent word's count.
    max_f = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_f

    sentence_list = list(docx.sents)
    sentence_scores = sentencescoring(sentence_list, word_frequencies)

    # Iterating the dict yields the sentences; rank them by their score.
    top_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)

    # Convert the selected sentences to plain strings before joining.
    return ' '.join(sentence.text for sentence in top_sentences)