-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiltering.py
101 lines (76 loc) · 2.71 KB
/
filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import functools
import re
# One CSV row made of exactly three double-quoted fields: "s","p","o".
pattern_csv = re.compile(r'"(.*?)","(.*?)","(.*?)"')

# A statement whose object is a URI: <s> <p> <o> .
# The terminating dot is escaped so only a literal "." closes the statement
# (an unescaped "." would accept any character in that position).
pattern_ttl_uri = re.compile(r"<(.*?)> <(.*?)> <(.*?)> \.")

# A statement whose object is a quoted literal: <s> <p> "text"suffix .
# Group 3 is the text between the quotes; group 4 is whatever sits between the
# closing quote and the final " ." (e.g. a language tag or datatype suffix).
pattern_ttl_literals = re.compile(r'<(.*?)> <(.*?)> "(.*?)"(.*?) \.')

# (full namespace URI, short prefix) pairs used to shorten and re-expand terms.
# Order matters: the functions that consume this table apply it front to back.
replaces = [
    ("http://dbpedia.org/ontology/", "dbo:"),
    ("http://dbpedia.org/resource/", "dbr:"),
    ("http://dbpedia.org/property/", "dbp:"),
    ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf:"),
]
def replace_predicate(line: str):
    """Shorten every known namespace URI in ``line`` to its prefix.

    Each (namespace, prefix) pair of the module-level ``replaces`` table is
    applied in order, substituting all occurrences.

    Args:
        line: Text that may contain full namespace URIs.

    Returns:
        The text with the known namespaces replaced by their prefixes.
    """
    # The table does not have to be exhaustive: shortening only serves to
    # speed up later regex matching and to reduce memory use.
    for namespace, prefix in replaces:
        line = line.replace(namespace, prefix)
    return line
def replace_predicate_inverse(fact: list):
    """Expand short prefixes back to full namespace URIs, in place.

    Only the first three items of ``fact`` are touched. For every entry of the
    module-level ``replaces`` table, at most the first occurrence of the prefix
    inside each item is expanded.

    Args:
        fact: Mutable sequence whose first three items are strings; it is
            modified in place and nothing is returned.
    """
    for position in (0, 1, 2):
        term = fact[position]
        for namespace, prefix in replaces:
            # count=1: expand only the first occurrence of this prefix.
            term = term.replace(prefix, namespace, 1)
        fact[position] = term
def replace_prefix_in_terms(term: str):
    """Expand the first known prefix found in ``term`` and wrap it as a URI.

    The module-level ``replaces`` table is scanned in order. For the first
    prefix that occurs anywhere in ``term`` (substring test), all occurrences
    of that prefix are replaced by the full namespace and the result is
    returned enclosed in angle brackets.

    Args:
        term: A possibly prefixed term such as ``dbo:birthPlace``.

    Returns:
        ``<expanded term>`` when a known prefix was found, otherwise ``term``
        unchanged (and unwrapped).
    """
    for namespace, prefix in replaces:
        if prefix not in term:
            continue
        # Only the first matching table entry is applied; later ones are skipped.
        return "<" + term.replace(prefix, namespace) + ">"
    return term
def extract_triple(buf: str):
    """Parse one three-field, double-quoted CSV row into a list.

    Args:
        buf: Text expected to contain exactly one ``"s","p","o"`` row.

    Returns:
        ``[s, p, o]`` when ``pattern_csv`` matches exactly once in ``buf``;
        ``None`` when it matches zero or several times.
    """
    # Regex matching is the slow part of this function.
    matches = pattern_csv.findall(buf)
    if len(matches) != 1:
        return None
    return list(matches[0])
def wrap_fact(fact: list, is_uri: bool):
    """Add serialization delimiters to a triple, in place.

    The subject and predicate (items 0 and 1) are always enclosed in angle
    brackets. The object (item 2) is enclosed in angle brackets when
    ``is_uri`` is true and in double quotes otherwise. Items past index 2 are
    left untouched.

    Args:
        fact: Mutable sequence whose first three items are strings; it is
            modified in place and nothing is returned.
        is_uri: Whether the object is a URI (True) or a literal (False).
    """
    fact[0] = "<" + fact[0] + ">"
    fact[1] = "<" + fact[1] + ">"
    if is_uri:
        fact[2] = "<" + fact[2] + ">"
    else:
        fact[2] = '"' + fact[2] + '"'
def extract_triple_from_ttl(buf: str):
    """Parse one triple statement from ``buf`` and wrap its terms.

    ``pattern_ttl_uri`` is tried first; if it does not match exactly once,
    ``pattern_ttl_literals`` is tried. The captured groups are turned into a
    list and passed through ``wrap_fact``.

    Args:
        buf: Text expected to contain a single ``<s> <p> <o> .`` or
            ``<s> <p> "literal"... .`` statement.

    Returns:
        The wrapped fact as a list, or ``None`` when neither pattern matches
        exactly once. For a literal match the list has a fourth, unwrapped
        item: the text captured after the closing quote.
    """
    # (pattern, object-is-URI) pairs, tried in this order.
    attempts = ((pattern_ttl_uri, True), (pattern_ttl_literals, False))
    for pattern, is_uri in attempts:
        matches = pattern.findall(buf)
        if len(matches) == 1:
            fact = list(matches[0])
            wrap_fact(fact, is_uri)
            return fact
    return None
def extract_triple_from_json(result, indices: list):
    """Build a wrapped fact from one JSON result row.

    Args:
        result: Mapping from variable name to a dict with ``"type"`` and
            ``"value"`` keys (presumably one SPARQL JSON binding; confirm
            against the caller).
        indices: The keys of ``result`` holding subject, predicate and object,
            at positions 0, 1 and 2.

    Returns:
        A ``(fact, is_uri)`` tuple: the three values after ``wrap_fact``, and
        whether the object's ``"type"`` equals ``"uri"``.
    """
    # TODO: revisit when facts are fetched by entity. The function may need
    # another parameter, e.g. to set indices[pos] = "callret-" + str(pos).
    object_binding = result[indices[2]]
    is_uri = object_binding["type"] == "uri"
    fact = []
    for position in range(3):
        fact.append(result[indices[position]]["value"])
    wrap_fact(fact, is_uri)
    return fact, is_uri
def get_ttl_str(fact: list):
    """Serialize a wrapped fact as one newline-terminated statement line.

    Args:
        fact: Sequence whose first three items are the already wrapped
            subject, predicate and object; further items are ignored.

    Returns:
        ``"<s> <p> <o> .\\n"``, where the object has been passed through
        ``repr`` (with the surrounding quotes removed) so that characters
        such as newlines and backslashes appear as escape sequences.
    """
    subject, predicate = fact[0], fact[1]
    # repr() adds one quote character on each side; slice them off again.
    escaped_object = repr(fact[2])[1:-1]
    return " ".join((subject, predicate, escaped_object)) + " .\n"
def write_dict(dic_e: dict, fn: str):
    """Append the entries of ``dic_e`` to a UTF-8 text file.

    One line is written per entry, in the form ``key<TAB>str(value)``.

    Args:
        dic_e: Dictionary with string keys; values are converted with ``str``.
        fn: Path of the file, opened in append mode.
    """
    with open(fn, mode="a", encoding="utf-8") as out:
        out.writelines(
            key + "\t" + str(value) + "\n" for key, value in dic_e.items()
        )
@functools.lru_cache(maxsize=None)
def _load_bad_predicates():
    """Load the predicate blacklist once and return it as a frozenset.

    Only the first line of ``meaningless_predicates.txt`` (resolved against the
    current working directory) is read; it is split on commas. The result is
    cached, so the file is opened a single time per process. A failed read
    raises and is not cached.
    """
    with open("meaningless_predicates.txt", "r", encoding="utf-8") as f:
        first_line = f.readline()
    return frozenset(first_line.strip("\n").split(","))


def filter_fact(fact: list):
    """Tell whether a fact should be kept.

    Args:
        fact: Sequence whose item 1 is the predicate and item 2 the object,
            both strings.

    Returns:
        ``True`` when the predicate is not listed in
        ``meaningless_predicates.txt`` and the object contains neither
        ``http://dbpedia.org/class/yago`` nor ``http://www.wikidata.org/entity``;
        ``False`` otherwise.

    Raises:
        OSError: If the blacklist file cannot be opened on first use.
    """
    # The blacklist is cached by the helper: re-reading the file for every
    # single fact made filtering dominated by file I/O.
    return (
        fact[1] not in _load_bad_predicates()
        and "http://dbpedia.org/class/yago" not in fact[2]
        and "http://www.wikidata.org/entity" not in fact[2]
    )