-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalignment.py
110 lines (82 loc) · 3.21 KB
/
alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# MIT License
#
# Copyright 2020-2021 New York University Abu Dhabi
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import sys
from collections import deque
import rapidfuzz.distance.Levenshtein as editdistance
def _print_table(tbl, m, n):
for i in range(0, m + 1):
for j in range(0, n + 1):
sys.stdout.write("%s/%s" % tbl[(i, j)])
sys.stdout.write('\t')
sys.stdout.write('\n')
def _edit_distance(tokens1, tokens2, weight_fns):
tbl = {}
tbl[(0, 0)] = (0, 'n')
m = len(tokens1)
n = len(tokens2)
for i in range(0, m):
tbl[(i + 1, 0)] = (i + 1, 'd')
for j in range(0, n):
tbl[(0, j + 1)] = (j + 1, 'i')
if m == 0 or n == 0:
return tbl
for i in range(0, m):
for j in range(0, n):
if (tokens1[i] == tokens2[j]):
edit_cost = tbl[(i + 1, j + 1)] = (tbl[(i, j)][0], 'n')
else:
edit_cost = (tbl[(i, j)][0] + weight_fns['s'](tokens1[i], tokens2[j]), 's')
insert_cost = (tbl[(i, j + 1)][0] + weight_fns['d'](tokens1[i]), 'd')
delete_cost = (tbl[(i + 1, j)][0] + weight_fns['i'](tokens2[j]), 'i')
tbl[(i + 1, j + 1)] = min([insert_cost, delete_cost, edit_cost], key = lambda t: t[0])
return tbl
def _gen_alignments(tokens1, tokens2):
weight_fns = {
's': lambda x, y: editdistance.distance(x, y) * 2 / max(len(x), len (y)),
'd': lambda x: 1,
'i': lambda x: 1
}
dist_table = _edit_distance(tokens1, tokens2, weight_fns)
m = len(tokens1)
n = len(tokens2)
alignments = deque()
i = m
j = n
while i != 0 or j != 0:
op = dist_table[(i, j)][1]
cost = dist_table[(i, j)][0]
if op == 'n' or op == 's':
alignments.appendleft((i, j, op, cost))
i -= 1
j -= 1
elif op == 'i':
alignments.appendleft((None, j, 'i', cost))
j -= 1
elif op == 'd':
alignments.appendleft((i, None, 'd', cost))
i -= 1
return alignments
def align_words(s1, s2):
s1_tokens = s1.split()
s2_tokens = s2.split()
alignments = _gen_alignments(s1_tokens, s2_tokens)
return list(alignments)