#!/usr/bin/env python
#
# Copyright 2009 Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# Modified in 2012 by Benjamin Fields (me@benfields.net)
#
# This is a simple Tf-idf library. The algorithm is described in
# http://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details:
#
# http://www.gnu.org/licenses/lgpl.txt

__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"

import math
import re
import codecs

from operator import itemgetter


class TfIdf:
"""Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
The library constructs an IDF corpus and stopword list either from
documents specified by the client, or by reading from input files. It
computes IDF for a specified term based on the corpus, or generates
keywords ordered by tf-idf for a specified document.
"""

  def __init__(self, corpus_filename=None, stopword_filename=None,
               DEFAULT_IDF=1.5):
    """Initialize the idf dictionary.

       If a corpus file is supplied, reads the idf dictionary from it, in the
       format of:
         # of total documents
         term: # of documents containing the term

       If a stopword file is specified, reads the stopword list from it, in
       the format of one stopword per line.

       The DEFAULT_IDF value is returned when a query term is not found in
       the idf corpus.
    """
    self.num_docs = 0
    self.term_num_docs = {}  # term: number of documents containing the term
    self.stopwords = set()
    self.idf_default = DEFAULT_IDF

    if corpus_filename:
      self.merge_corpus_document(corpus_filename)

    if stopword_filename:
      stopword_file = codecs.open(stopword_filename, "r", encoding='utf-8')
      self.stopwords = set([line.strip() for line in stopword_file])

  def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

       This implementation does not preserve case.  Clients may wish to
       override this behavior with their own tokenization.
    """
    return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())
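
  # Illustrative only (not part of the original library): on a sample
  # string, the tokenizer above behaves like this:
  #   >>> TfIdf().get_tokens("See <a href='x'>link</a>, Hello World!")
  #   ['see', "<a href='x'>link</a>", 'hello', 'world']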

  def merge_corpus_document(self, corpus_filename):
    """Slurp in a corpus document, adding it to the existing corpus model."""
    corpus_file = codecs.open(corpus_filename, "r", encoding='utf-8')

    # Load the number of documents.
    line = corpus_file.readline()
    self.num_docs += int(line.strip())

    # Read a "term: frequency" pair from each subsequent line in the file.
    for line in corpus_file:
      tokens = line.rsplit(":", 1)
      term = tokens[0].strip()
      try:
        frequency = int(tokens[1].strip())
      except IndexError:
        if not line.strip():
          # Skip blank lines.
          print("line is blank")
          continue
        raise
      if term in self.term_num_docs:
        self.term_num_docs[term] += frequency
      else:
        self.term_num_docs[term] = frequency
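
  # Illustrative corpus file (assumed contents, matching the format that
  # the __init__ docstring describes): the first line is the total document
  # count, then one "term: document frequency" pair per line.
  #   100
  #   foo: 8
  #   bar: 64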

  def add_input_document(self, doc):
    """Add terms in the specified document to the idf dictionary."""
    self.num_docs += 1
    words = set(self.get_tokens(doc))
    for word in words:
      if word in self.term_num_docs:
        self.term_num_docs[word] += 1
      else:
        self.term_num_docs[word] = 1

  def save_corpus_to_file(self, idf_filename, stopword_filename,
                          STOPWORD_PERCENTAGE_THRESHOLD=0.01):
    """Save the idf dictionary and stopword list to the specified files.

       A term is written to the stopword file when it appears in at least
       STOPWORD_PERCENTAGE_THRESHOLD of the corpus documents.
    """
    output_file = codecs.open(idf_filename, "w", encoding='utf-8')

    output_file.write(str(self.num_docs) + "\n")
    for term, num_docs in self.term_num_docs.items():
      output_file.write(term + ": " + str(num_docs) + "\n")

    sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                          reverse=True)
    stopword_file = codecs.open(stopword_filename, "w", encoding='utf-8')
    for term, num_docs in sorted_terms:
      if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
        break
      stopword_file.write(term + "\n")

  def get_num_docs(self):
    """Return the total number of documents in the IDF corpus."""
    return self.num_docs

  def get_idf(self, term):
    """Retrieve the IDF for the specified term.

       This is computed as the logarithm of (1 + number of documents in the
       corpus) divided by (1 + number of documents containing this term);
       the add-one smoothing keeps the ratio finite for an empty corpus.
    """
    if term in self.stopwords:
      return 0

    if term not in self.term_num_docs:
      return self.idf_default

    return math.log((1 + self.get_num_docs()) /
                    (1 + self.term_num_docs[term]))
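
  # Worked example with assumed numbers (not from the original file): in a
  # corpus of 100 documents where 9 contain "foo",
  #   idf("foo") = log((1 + 100) / (1 + 9)) = log(10.1) ~= 2.31
  # using the natural log, as math.log does by default.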

  def get_doc_keywords(self, curr_doc):
    """Retrieve terms and corresponding tf-idf for the specified document.

       The returned terms are ordered by decreasing tf-idf.
    """
    tfidf = {}
    tokens = self.get_tokens(curr_doc)
    tokens_set = set(tokens)
    for word in tokens_set:
      # tf: occurrences of the term, normalized by the document's length,
      # per the definition the class docstring cites.
      mytf = tokens.count(word) / len(tokens)
      myidf = self.get_idf(word)
      tfidf[word] = mytf * myidf

    return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
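

# A minimal usage sketch, not part of the original library; the documents
# below are made up for illustration.
if __name__ == "__main__":
  table = TfIdf()
  table.add_input_document("a simple document about cats and dogs")
  table.add_input_document("another document, this one about dogs alone")

  # IDF is low for common terms; unseen terms fall back to DEFAULT_IDF.
  print(table.get_idf("dogs"))     # in 2 of 2 docs: log(3/3) = 0.0
  print(table.get_idf("cats"))     # in 1 of 2 docs: log(3/2) ~= 0.405
  print(table.get_idf("ferrets"))  # unseen: DEFAULT_IDF = 1.5

  # Keywords for a new document, ordered by decreasing tf-idf.
  print(table.get_doc_keywords("cats chase dogs"))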