-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfilters.py
177 lines (144 loc) · 5.37 KB
/
filters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Copyright 2012-2013
Eindhoven University of Technology (Erik Kouters, Bogdan Vasilescu, Alexander Serebrenik)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import itertools
import re
import unicodedata
# Not part of standard library:
from dictUtils import MyDict
from unidecode import unidecode
def computeOracle(nameEmailData, aliasToIdName):
    '''
    Build the ground-truth match table for all pairs of (alias, email) records.

    nameEmailData -- container indexed by 0..n-1 whose items are
                     (name, email) pairs
    aliasToIdName -- mapping from an alias name to its identity name

    Returns a MyDict keyed by index pairs (i, j) with i <= j. The value is 1
    when both records belong to the same identity (or when i == j), else 0.
    A name missing from aliasToIdName makes the lookup fail.
    '''
    oracle = MyDict()
    indices = range(len(nameEmailData.keys()))

    # Distinct records match only when their aliases share an identity name.
    for left, right in itertools.combinations(indices, 2):
        left_name, _ = nameEmailData[left]
        right_name, _ = nameEmailData[right]
        same_identity = aliasToIdName[left_name] == aliasToIdName[right_name]
        oracle[(left, right)] = 1 if same_identity else 0

    # Reflexive matches are always true: every record matches itself.
    for idx in indices:
        oracle[(idx, idx)] = 1

    return oracle
def unspam(email):
    '''
    Normalize an obfuscated or mistyped email address.

    The rewrite rules below are applied in order, then the result is
    stripped of surrounding whitespace and lowercased.

    Rules that synthesize an '@' or a '.' (" at ", " dot ", ".gmail.com", ...)
    only run when the incoming address has no '@' at all, so a valid address
    such as user@vip.163.com is never rewritten into user@vip@163.com.
    Rules whose pattern itself contains '@' run for every address: under a
    "no '@' present" guard they could never match.

    NOTE(review): the exact extent of the original no-'@' guard could not be
    established from the source; confirm this split against the upstream file.

    email -- the raw address string
    Returns the normalized address (possibly empty).
    '''
    rewrites = (
        # gerard dot b at bbox dot fr
        (" dot ", "."),
        (" DOT ", "."),
        (" @ ", "@"),
        (" at ", "@"),
        (" AT ", "@"),
        (" -at- ", "@"),
        (" -dot- ", "."),
        (" gmail com", "@gmail.com"),
        (".gmail.com", "@gmail.com"),
        ("%40", "@"),  # PHP
        (" cpan org", "@cpan.org"),
        (" onirica com", "@onirica.com"),
        (".wipro.com", "@wipro.com"),
        (".163.com", "@163.com"),
        (" thurman org uk", "@thurman.org.uk"),
        (".gnome.org", "@gnome.org"),
        (" zip com au", "@zip.com.au"),
        (" gmail.com", "@gmail.com"),
        ("set EMAIL_ADDRESS environment variable", ""),
        ("tec.ifilp.org/info?gcompris", ""),
        ("Stefan Walter", ""),
        ("@redhat,^B", "@redhat.com"),
        ("@redhat.ocm", "@redhat.com"),
    )

    # Decided once, up front: rules that introduce an '@' must not change
    # how the remaining rules are selected.
    had_at = email.find('@') != -1

    for pattern, replacement in rewrites:
        if had_at and '@' not in pattern:
            continue
        email = email.replace(pattern, replacement)

    return email.strip().lower()
def strip_accents(s):
    '''
    Return s without its combining accent marks.

    The string is decomposed (NFD) so that accents become separate
    characters, and every character of Unicode category 'Mn' is dropped.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
# Punctuation characters that normalize() deletes from its input.
# The backslash is written explicitly; it is followed by a plain '/'.
ignorechars = ".?,;:'\"!\\/-_#~`&%$@*-+()_="
def normalize(s):
    '''
    Reduce s to a list of lowercase tokens without accents or punctuation.

    Steps: convert to unicode, drop combining accents, lowercase, pass the
    text through unidecode, delete every character listed in ignorechars,
    lowercase again, trim and split on whitespace.

    The translate(None, ...) call uses the Python 2 byte-string signature,
    so unidecode is assumed to return a byte string here.
    '''
    text = unicode(s)
    unaccented = strip_accents(text)
    transliterated = unidecode(unaccented.lower())
    without_punctuation = transliterated.translate(None, ignorechars)
    # Lowercased a second time because unidecode's output can contain capitals.
    return without_punctuation.lower().strip().split()
def simpleNormalize(s):
    '''
    Return s lowercased, without accents, passed through unidecode and trimmed.

    Unlike normalize(), punctuation is kept, the result is a single string,
    and s is not converted to unicode first (presumably callers already pass
    unicode; verify against callers).
    '''
    unaccented = strip_accents(s)
    transliterated = unidecode(unaccented.lower())
    # Lowercased a second time because unidecode's output can contain capitals.
    return transliterated.lower().strip()
def slugify(value):
    '''
    Decode a UTF-8 byte string and reduce it to lowercase word characters.

    The input is decoded as UTF-8 (a leading BOM is dropped by the
    utf_8_sig codec), NFKD-normalized and stripped of combining accents.
    Every character that is not a word character, whitespace or a hyphen is
    then removed, and the result is trimmed and lowercased.

    Whitespace between words is kept as is: despite the function name, no
    hyphens are inserted. No re.UNICODE flag is passed, so under Python 2
    the \\w class covers ASCII word characters only.

    value -- UTF-8 encoded byte string
    Returns the cleaned unicode string.
    Raises the underlying error (e.g. UnicodeDecodeError for invalid UTF-8,
    TypeError for input that is already unicode) after printing the repr of
    the offending value.
    '''
    try:
        value = unicodedata.normalize('NFKD', unicode(value, encoding='utf_8_sig'))
        value = strip_accents(value)
        value = unicode(re.sub(r'[^\w\s-]', '', value).strip().lower())
        # NOTE(review): as written this replaces a space with a space (no-op);
        # possibly meant to collapse runs of spaces - confirm before changing.
        value = value.replace(' ', ' ')
        value = value.strip()
    except Exception:
        # repr() is safe to print whatever the value contains. Re-raise so the
        # caller sees the real error instead of the interpreter being exited.
        print(repr(value))
        raise
    return value
def LevenshteinDistance(s, t):
    '''
    Return the Levenshtein (edit) distance between sequences s and t.

    Counts the minimum number of single-element deletions, insertions and
    substitutions needed to turn s into t. Only two rows of the dynamic
    programming table are kept at any time.
    '''
    rows = len(s)
    cols = len(t)

    # previous[i] is the distance between the first i elements of s and the
    # prefix of t processed so far (initially the empty prefix).
    previous = list(range(rows + 1))

    for col in range(1, cols + 1):
        # Turning an empty prefix of s into t[:col] takes col insertions.
        current = [col]
        for row in range(1, rows + 1):
            if s[row - 1] == t[col - 1]:
                # Same element: no operation required.
                cost = previous[row - 1]
            else:
                cost = 1 + min(
                    current[row - 1],   # deletion
                    previous[row],      # insertion
                    previous[row - 1],  # substitution
                )
            current.append(cost)
        previous = current

    return previous[rows]
def similar(str1, str2, t):
    '''
    Tell whether str1 and str2 are at least t similar.

    Similarity is 1 - distance / length, where distance is the Levenshtein
    distance between the two strings and length is that of the longer one.

    str1, str2 -- the strings to compare
    t          -- similarity threshold (1.0 means identical)

    Returns True when the similarity is >= t, otherwise False. This is a
    best-effort check: two empty strings, or any input that cannot be
    compared (e.g. None), yield False instead of raising.
    '''
    try:
        longest = max(len(str1), len(str2))
        if longest == 0:
            # Nothing to compare; previously this surfaced as a division by zero.
            return False
        distance = LevenshteinDistance(str1, str2)
        similarity = 1.0 - float(distance) / float(longest)
        return bool(similarity >= t)
    except Exception:
        # Keep the best-effort contract, but let KeyboardInterrupt and
        # SystemExit propagate (a bare except would swallow them too).
        return False
def qpdecode(text):
    '''
    Decode an encoded mail header value into human-readable text.

    Header values such as "=?utf-8?q?J=C3=B6rg?=" (quoted-printable) or
    "=?utf-8?b?SsO2cmc=?=" (base64) are decoded with the standard library
    email.header helpers. Plain text passes through unchanged.

    text -- the raw header value
    Returns the decoded text as a unicode string, or None when the value
    cannot be decoded.
    '''
    # email.header is the module name available on both Python 2.5+ and
    # Python 3; the capitalized email.Header alias exists on Python 2 only.
    from email.header import decode_header, make_header

    try:
        text_type = unicode
    except NameError:
        # Python 3: str is already the unicode text type.
        text_type = str

    try:
        parts = decode_header(text)
        return text_type(make_header(parts))
    except Exception:
        # Best effort: undecodable input is reported as None. Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit propagate.
        return None