# fol_parser.py
import json
from os.path import join as joinpath
import os
import re
import nltk
from copy import deepcopy
import numpy as np
from nltk.tree import Tree
from itertools import product, combinations, permutations
from typing import Optional, Union, List, Dict, Callable
from utils import all_exists, has_same_obj_in_list, DataPreparer
import random
from Levenshtein import distance as edit_dist
# TODO inefficient
op_ls = ['⊕', '∨', '∧', '→', '↔', '∀', '∃', '¬', '(', ')', ',']
sym_reg = re.compile(r'[^⊕∨∧→↔∀∃¬(),]+')
cfg_template = """
S -> F | Q F
Q -> QUANT VAR | QUANT VAR Q
F -> '¬' '(' F ')' | '(' F ')' | F OP F | L
OP -> '⊕' | '∨' | '∧' | '→' | '↔'
L -> '¬' PRED '(' TERMS ')' | PRED '(' TERMS ')'
TERMS -> TERM | TERM ',' TERMS
TERM -> CONST | VAR
QUANT -> '∀' | '∃'
"""
# used in perturbation
last_nt_nodes = set(['PRED', 'OP', 'CONST', 'VAR', 'QUANT'])
# used in node insertion
insertable_nt_nodes = set(['Q', 'S', 'TERMS', 'F'])
# used in node deletion
deleteable_nt_nodes = set(['Q', 'TERMS', 'F', 'L'])
def parse_text_FOL_to_tree(rule_str):
    """
    Parse a text FOL rule into an nltk.tree.Tree
    Returns: the parse tree, or None if the parse fails
    """
    rule_str = reorder_quantifiers(rule_str)
    r, parsed_fol_str = msplit(rule_str)
    cfg_str = make_cfg_str(r)
    grammar = nltk.CFG.fromstring(cfg_str)
    parser = nltk.ChartParser(grammar)
    tree = parser.parse_one(r)
    return tree
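# Usage sketch (illustrative formula, assumed for this example rather than taken from the repo's data):
#   tree = parse_text_FOL_to_tree('∀x (Dog(x) → Animal(x))')
#   if tree is not None:
#       isFOL, lvars, consts, preds = symbol_resolution(tree)   # symbol_resolution is defined below
#       rule = Rule(isFOL, lvars, consts, preds, tree)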
def reorder_quantifiers(rule_str):
    matches = re.findall(r'[∃∀]\w', rule_str)
    for match in matches[::-1]:
        rule_str = '%s ' % match + rule_str.replace(match, '', 1)
    return rule_str
def msplit(s):
    for op in op_ls:
        s = s.replace(op, ' %s ' % op)
    r = [e.strip() for e in s.split()]
    # remove ' from the string if it contains any: it causes errors in nltk CFG parsing
    r = [e.replace('\'', '') for e in r]
    r = [e for e in r if e != '']
    # deal with symbols containing spaces, e.g. "dc universe", and turn them into "DcUniverse"
    res = []
    cur_str_ls = []
    for e in r:
        if (len(e) > 1) and sym_reg.match(e):
            cur_str_ls.append(e[0].upper() + e[1:])
        else:
            if len(cur_str_ls) > 0:
                res.extend([''.join(cur_str_ls), e])
            else:
                res.extend([e])
            cur_str_ls = []
    if len(cur_str_ls) > 0:
        res.append(''.join(cur_str_ls))
    # re-generate the FOL string
    make_str_ls = []
    for ind, e in enumerate(r):
        if re.match(r'[⊕∨∧→↔]', e):
            make_str_ls.append(' %s ' % e)
        elif re.match(r',', e):
            make_str_ls.append('%s ' % e)
        # a logical variable
        elif (len(e) == 1) and re.match(r'\w', e):
            if ((ind - 1) >= 0) and ((r[ind - 1] == '∃') or (r[ind - 1] == '∀')):
                make_str_ls.append('%s ' % e)
            else:
                make_str_ls.append(e)
        else:
            make_str_ls.append(e)
    return res, ''.join(make_str_ls)
def make_cfg_str(token_ls):
    """
    NOTE: since nltk CFGs do not support regex terminals such as \\w+, we cannot separately recognize VAR, PRED,
    and CONST. Instead, we first allow VAR, PRED, and CONST to match any symbol found in the FOL; once the tree
    is parsed, we go back and figure out the exact type of each symbol
    """
    sym_ls = list(set([e for e in token_ls if sym_reg.match(e)]))
    sym_str = ' | '.join(["'%s'" % s for s in sym_ls])
    cfg_str = cfg_template + 'VAR -> %s\nPRED -> %s\nCONST -> %s' % (sym_str, sym_str, sym_str)
    return cfg_str
def symbol_resolution(tree):
    lvars, consts, preds = set(), set(), set()
    if tree[0].label() == 'Q':
        isFOL = True
        main_tree = tree[1]
        for sym, tag in tree[0].pos():
            if tag == 'VAR':
                lvars.add(sym)
    else:
        isFOL = False
        main_tree = tree[0]
    preorder_resolution(main_tree, lvars, consts, preds)
    return isFOL, lvars, consts, preds
def preorder_resolution(tree, lvars, consts, preds):
    # reached terminal nodes
    if isinstance(tree, str):
        return
    if tree.label() == 'PRED':
        preds.add(tree[0])
        return
    if tree.label() == 'TERM':
        sym = tree[0][0]
        if sym in lvars:
            tree[0].set_label('VAR')
        else:
            tree[0].set_label('CONST')
            consts.add(sym)
        return
    for child in tree:
        preorder_resolution(child, lvars, consts, preds)
class Rule:
    def __init__(self, isFOL, lvars, consts, preds, tree):
        self.isFOL = isFOL
        self.lvars = lvars
        self.consts = consts
        self.preds = preds
        self.tree = tree
    def rule_str(self):
        # TODO inefficient
        _, rule_str = msplit(''.join(self.tree.leaves()))
        return rule_str
    def _get_nodes(
        self,
        root,
        nodes,
        allowed_labels=None,
        not_allowed_nodes=None,
        not_allowed_nodes_in_subtree=False
    ):
        all_child_allowed = True
        if isinstance(root, str):
            return all_child_allowed
        # post-order check of the children
        for child in root:
            child_allowed = self._get_nodes(
                child,
                nodes,
                allowed_labels,
                not_allowed_nodes,
                not_allowed_nodes_in_subtree
            )
            all_child_allowed = all_child_allowed and child_allowed
        # if we are not filtering by subtree nodes, ignore the post-order check result and treat all
        # children as allowed
        if not not_allowed_nodes_in_subtree:
            all_child_allowed = True
        root_is_allowed = (not_allowed_nodes is None) or all(e is not root for e in not_allowed_nodes)
        tree_is_allowed = all_child_allowed and root_is_allowed
        if (
            ((allowed_labels is None) or (root.label() in allowed_labels))
            and tree_is_allowed
        ):
            nodes.append(root)
        return tree_is_allowed
    def get_nodes(
        self,
        root,
        allowed_labels=None,
        not_allowed_nodes=None,
        not_allowed_nodes_in_subtree=False
    ):
        """
        Collect nodes from a tree
        Args:
            root: the root of the (sub)tree to search
            allowed_labels: None or a set of strs; only nodes with an allowed label are included
            not_allowed_nodes: None or a list of node objects; only node objects not in this list are included
            not_allowed_nodes_in_subtree: set to True to also filter out nodes whose subtree contains a node in
                not_allowed_nodes; this is used when finding deletable nodes (we want nodes whose entire subtree
                has not been perturbed before)
        """
        if not_allowed_nodes_in_subtree:
            assert all_exists(not_allowed_nodes), 'must specify not_allowed_nodes'
        nodes = []
        self._get_nodes(
            root,
            nodes,
            allowed_labels,
            not_allowed_nodes,
            not_allowed_nodes_in_subtree
        )
        return nodes
    def random_node_by_label(
        self,
        root,
        allowed_labels=None,
        not_allowed_nodes=None,
        not_allowed_nodes_in_subtree=False
    ):
        nodes = self.get_nodes(root, allowed_labels, not_allowed_nodes, not_allowed_nodes_in_subtree)
        choice = nodes[int(np.random.randint(len(nodes)))] if len(nodes) > 0 else None
        return choice
    def get_all_fopf(self, root, res):
        if isinstance(root, str):
            return
        if root.label() == 'F':
            if len(root) == 3 and all(not isinstance(child, str) for child in root):
                # this is an F -> F OP F subtree
                res.append(root)
        for child in root:
            self.get_all_fopf(child, res)
    def order_of(self, node):
        cnt, flag = self._inorder_order(self.tree, node, 0)
        assert flag
        return cnt
    def parent_of(self, root, node):
        if isinstance(root, str):
            return None
        for child in root:
            if child is node:
                return root
            parent = self.parent_of(child, node)
            if all_exists(parent):
                return parent
        return None
    def _inorder_order(self, root, node, order_cnt):
        if root is node:
            return order_cnt + 1, True
        if isinstance(root, str):
            return order_cnt + (root == node[0]), False
        cnt = order_cnt
        flag = False
        for n in root:
            cnt, flag = self._inorder_order(n, node, cnt)
            if flag:
                break
        return cnt, flag
class Sample:
    def __init__(self, nl, rule: Rule):
        self.nl = nl
        self.rule = rule
        self.perturbed_rule = None
        word_reg = re.compile(r'^\w+$')
        nl_words = [e.strip().lower() for e in self.nl.split()]
        nl_words = [e for e in nl_words if word_reg.match(e) is not None]
        self.symbols = list(self.rule.lvars) + list(self.rule.consts) + list(self.rule.preds) + nl_words
        assert len(self.symbols) > 1, 'to avoid an infinite loop; maybe improve later'
    @classmethod
    def get_changeable_nodes(cls, rule: Rule, occupied_nodes):
        return rule.get_nodes(
            rule.tree,
            allowed_labels=last_nt_nodes,
            not_allowed_nodes=occupied_nodes,
            not_allowed_nodes_in_subtree=False
        )
    @classmethod
    def get_insertable_nodes(cls, rule: Rule):
        return rule.get_nodes(
            rule.tree,
            allowed_labels=insertable_nt_nodes,
            not_allowed_nodes=None,
            not_allowed_nodes_in_subtree=False
        )
    @classmethod
    def get_deletable_nodes(cls, rule: Rule, occupied_nodes):
        # first, we find nodes that can be deleted entirely, which means their subtrees cannot be occupied
        nodes = rule.get_nodes(
            rule.tree,
            allowed_labels=deleteable_nt_nodes,
            not_allowed_nodes=occupied_nodes,
            not_allowed_nodes_in_subtree=False
        )
        deletable_nodes = []
        for node in nodes:
            label = node.label()
            if label == 'Q':
                # a Q node whose direct QUANT and VAR children are not occupied is deletable
                if has_same_obj_in_list(node[0], occupied_nodes) or has_same_obj_in_list(node[1], occupied_nodes):
                    continue
            elif label == 'TERMS':
                # only a TERMS node with a TERMS parent is deletable
                parent = rule.parent_of(rule.tree, node)
                if parent.label() != 'TERMS':
                    continue
                # only a TERMS node whose direct TERM child and that child's VAR/CONST child are unoccupied
                # is deletable
                if has_same_obj_in_list(node[0], occupied_nodes) or has_same_obj_in_list(node[0][0], occupied_nodes):
                    continue
            elif label == 'F':
                # only an F node whose parent is an F OP F subtree is deletable
                parent = rule.parent_of(rule.tree, node)
                if len(parent) != 3:
                    continue
                if isinstance(parent[0], str):
                    continue
                if isinstance(parent[2], str):
                    continue
                if not (
                    parent[0].label() == 'F'
                    and parent[1].label() == 'OP'
                    and parent[2].label() == 'F'
                ):
                    continue
                # only an F node whose entire subtree is unoccupied is deletable; we check this by testing
                # whether this F node appears in the unoccupied_nodes collected with
                # not_allowed_nodes_in_subtree=True
                unoccupied_nodes = rule.get_nodes(
                    node,
                    allowed_labels=deleteable_nt_nodes,
                    not_allowed_nodes=occupied_nodes,
                    not_allowed_nodes_in_subtree=True
                )
                if not has_same_obj_in_list(node, unoccupied_nodes):
                    continue
            else:
                continue
            deletable_nodes.append(node)
        # secondly, we find nodes with a deletable negation; here it is fine if the subtree is occupied
        nodes_with_deletable_negation = []
        for node in nodes:
            label = node.label()
            # only an F/L node that starts with a negation has a deletable negation
            if (label == 'F') or (label == 'L'):
                if isinstance(node[0], str) and (node[0] == '¬'):
                    nodes_with_deletable_negation.append(node)
        # combine the possible deletions and assign a mode tag
        deletable_nodes = \
            [(e, 'delete_node') for e in deletable_nodes] \
            + [(e, 'delete_negation') for e in nodes_with_deletable_negation]
        return deletable_nodes
class VecRuleEvaluator:
    dummy_input_str: str = '#DUMMY'
    dummy_distance: int = 10000
    @classmethod
    def default_input_similarity(cls, e1: str, e2: str):
        if e1.startswith(cls.dummy_input_str) or e2.startswith(cls.dummy_input_str):
            return cls.dummy_distance
        return edit_dist(e1, e2)
    @classmethod
    def enumerate_bindings_with_greedy_match(cls, ls1: List[str], ls2: List[str], top_n: int):
        """
        Given two lists of strings ls1 and ls2, yield index bindings of ls2 strings that match the strings in
        ls1. We use a greedy match and yield bindings from the best to the worst, until the enumeration is
        exhausted or the top_n bound is hit
        """
        used_inds = []
        def _enum_bindings(ind1: int):
            if ind1 == len(ls1):
                yield deepcopy(used_inds)
                return
            e1 = ls1[ind1]
            match_ls = [
                (ind, cls.default_input_similarity(e1, e2))
                for ind, e2 in enumerate(ls2) if ind not in used_inds
            ]
            match_ls.sort(key=lambda x: x[1])
            for ind, dist in match_ls:
                used_inds.append(ind)
                for inds in _enum_bindings(ind1 + 1):
                    yield inds
                used_inds.pop()
        for cnt, ind_ls in enumerate(_enum_bindings(0)):
            yield ind_ls
            if cnt + 1 == top_n:
                break
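    # Illustrative sketch with assumed strings (not from the repo): for ls1 = ['Dog(x)'] and
    # ls2 = ['Dig(x)', 'Cat(x)'], the edit distances are 1 and 3, so the generator yields the binding [0]
    # first and then [1]; with top_n = 1 only the best binding is produced.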
    @classmethod
    def find_inputs(cls, root, input_set=None):
        if isinstance(root, str):
            return
        label = root.label()
        if label == 'L':
            literal_str = ''.join(root.leaves())
            literal_str = literal_str[1:] if literal_str[0] == '¬' else literal_str
            if input_set is None:
                input_set = set()
            input_set.add(literal_str)
        else:
            for child in root:
                cls.find_inputs(child, input_set)
    @classmethod
    def gen_input_vecs(cls, num_inputs):
        return np.array(list(product([False, True], repeat=num_inputs)))
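    # For example (illustration only), gen_input_vecs(2) enumerates the four rows of a two-input truth
    # table as a boolean numpy array: [[False, False], [False, True], [True, False], [True, True]].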
    @classmethod
    def from_nltk_tree(cls, root, name2ind_dict, input_vecs):
        assert not isinstance(root, str), 'something wrong with the rule or the algo; you should not parse a leaf'
        label = root.label()
        if label == 'S':
            return cls.from_nltk_tree(root[-1], name2ind_dict, input_vecs)
        elif label == 'F':
            # the case F -> L
            if (len(root) == 1) and (root[0].label() == 'L'):
                return cls.from_nltk_tree(root[0], name2ind_dict, input_vecs)
            # the case F -> '¬' '(' F ')' | '(' F ')'
            elif root[-2].label() == 'F':
                isnegated_rule = isinstance(root[0], str) and (root[0] == '¬')
                res = cls.from_nltk_tree(root[-2], name2ind_dict, input_vecs)
                if isnegated_rule:
                    res = ~res
                return res
            # the case F -> F OP F
            elif root[-2].label() == 'OP':
                p, q = cls.from_nltk_tree(root[0], name2ind_dict, input_vecs), \
                    cls.from_nltk_tree(root[-1], name2ind_dict, input_vecs)
                op = root[1][0]
                if op == '⊕':
                    return np.logical_xor(p, q)
                elif op == '∨':
                    return np.logical_or(p, q)
                elif op == '∧':
                    return np.logical_and(p, q)
                elif op == '→':
                    return np.logical_or(~p, q)
                elif op == '↔':
                    return np.logical_or(np.logical_and(p, q), np.logical_and(~p, ~q))
                else:
                    raise ValueError
        elif label == 'L':
            isnegated_literal = isinstance(root[0], str) and (root[0] == '¬')
            literal_str = ''.join(root.leaves())
            # remove the possible negation at the beginning
            literal_str = literal_str[1:] if isnegated_literal else literal_str
            vec = input_vecs[:, name2ind_dict[literal_str]]
            if isnegated_literal:
                vec = ~vec
            return vec
        else:
            raise ValueError
    @classmethod
    def find_best_LE_score(
        cls,
        true_root,
        pred_root,
        soft_binding: bool,
        greedy_match: bool,
        top_n: int,
        verbose: bool = False
    ):
        """
        Given the groundtruth and the predicted nltk FOL trees, compute truth-table agreement over the
        literal bindings and return the best score together with its binding
        """
        # first we find the "inputs" of each tree, i.e. the set of unique literals in the FOL
        true_inputs, pred_inputs = set(), set()
        VecRuleEvaluator.find_inputs(true_root, true_inputs)
        VecRuleEvaluator.find_inputs(pred_root, pred_inputs)
        true_inputs, pred_inputs = list(true_inputs), list(pred_inputs)
        n_true_inputs, n_pred_inputs = len(true_inputs), len(pred_inputs)
        min_n, max_n = sorted([n_true_inputs, n_pred_inputs])
        best_score, best_binded_pred_inputs = 0., None
        # once we have found the inputs, we handle the case where the numbers of inputs in the two trees differ:
        # either we do soft binding by adding dummy inputs to the shorter list, or we simply return 0
        if n_true_inputs != n_pred_inputs:
            if soft_binding:
                # extend the shorter input list to the max number of inputs by adding dummy input names
                ls_to_extend = true_inputs if n_true_inputs < max_n else pred_inputs
                ls_to_extend.extend([f'{cls.dummy_input_str}_{ind}' for ind in range(max_n - min_n)])
            else:
                return best_score, true_inputs, best_binded_pred_inputs
        # at this point, we have two lists of inputs of the same length and we will find the input binding
        # that yields the best score
        input_vecs = VecRuleEvaluator.gen_input_vecs(len(true_inputs))
        true_name2ind_dict = dict((e, ind) for ind, e in enumerate(true_inputs))
        true_res_vec = VecRuleEvaluator.from_nltk_tree(true_root, true_name2ind_dict, input_vecs)
        ind_binding_enumerator = \
            cls.enumerate_bindings_with_greedy_match(true_inputs, pred_inputs, top_n) if greedy_match \
            else permutations(list(range(max_n)))
        for cnt_ind, binded_pred_inputs_inds in enumerate(ind_binding_enumerator):
            binded_pred_inputs = [pred_inputs[ind] for ind in binded_pred_inputs_inds]
            pred_name2ind_dict = dict((e, ind) for ind, e in enumerate(binded_pred_inputs))
            pred_res_vec = VecRuleEvaluator.from_nltk_tree(pred_root, pred_name2ind_dict, input_vecs)
            score = (pred_res_vec == true_res_vec).mean(dtype=np.float32).item()
            if verbose:
                print('{0}\n{1}\n{2}\n---\n'.format(
                    score, true_inputs, binded_pred_inputs)
                )
            if score > best_score:
                best_score = score
                best_binded_pred_inputs = binded_pred_inputs
            if cnt_ind + 1 >= top_n:
                break
        return best_score, true_inputs, best_binded_pred_inputs
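# A minimal end-to-end sketch added for illustration: it parses two assumed example formulas (not taken
# from the original repo or its data) and scores the second against the first. It only runs when this
# module is executed directly.
if __name__ == '__main__':
    true_tree = parse_text_FOL_to_tree('∀x (Dog(x) → Animal(x))')
    pred_tree = parse_text_FOL_to_tree('∀x (Dog(x) ∧ Animal(x))')
    if (true_tree is not None) and (pred_tree is not None):
        score, true_inputs, binded_pred_inputs = VecRuleEvaluator.find_best_LE_score(
            true_tree, pred_tree, soft_binding=True, greedy_match=True, top_n=10
        )
        # the two formulas share literals but differ in the connective, so the score is below 1.0
        print('LE score: %.3f' % score)
        print('binding: %s -> %s' % (true_inputs, binded_pred_inputs))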