forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmerge_graphs.py
executable file
·117 lines (99 loc) · 4.61 KB
/
merge_graphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
'''merge_graphs.py: merge two or more KGs that are in the KG2 JSON-lines format

   Usage: merge_graphs.py [--test]
                          [--kgFileOrphanEdges <kgFileOrphanEdges>]
                          [--outputNodesFile <outputNodesFile>]
                          [--outputEdgesFile <outputEdgesFile>]
                          --kgNodesFiles <kgNodesFile1> ... <kgNodesFileN>
                          --kgEdgesFiles <kgEdgesFile1> ... <kgEdgesFileN>
'''
__author__ = 'Stephen Ramsey'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
import argparse
import kg2_util
import json
import sys
def make_arg_parser():
    """Build the command-line parser for merge_graphs.py.

    The input options ``--kgNodesFiles`` and ``--kgEdgesFiles`` are marked
    required: the script iterates over both lists unconditionally, so leaving
    either one out used to surface as a ``TypeError`` on ``None`` instead of a
    usage message.

    Returns:
        argparse.ArgumentParser: parser whose namespace exposes ``test``,
        ``kgFileOrphanEdges``, ``outputNodesFile``, ``outputEdgesFile``,
        ``kgNodesFiles`` and ``kgEdgesFiles``.
    """
    arg_parser = argparse.ArgumentParser(
        description='merge_graphs.py: merge two or more JSON KG files')
    arg_parser.add_argument('--test', dest='test', action="store_true",
                            default=False,
                            help='test-mode flag forwarded to the kg2_util '
                                 'output helpers')
    arg_parser.add_argument('--kgFileOrphanEdges', type=str, nargs='?',
                            default=None,
                            help='output file for edges whose subject or '
                                 'object is not among the merged nodes')
    arg_parser.add_argument('--outputNodesFile', type=str, nargs='?',
                            default=None,
                            help='output file for the merged nodes')
    arg_parser.add_argument('--outputEdgesFile', type=str, nargs='?',
                            default=None,
                            help='output file for the merged edges')
    arg_parser.add_argument('--kgNodesFiles', type=str, nargs='+',
                            required=True,
                            help='one or more input node files')
    arg_parser.add_argument('--kgEdgesFiles', type=str, nargs='+',
                            required=True,
                            help='one or more input edge files')
    return arg_parser
if __name__ == '__main__':
    # Script entry point.  Three phases:
    #   1. read every nodes file and merge the records by node id;
    #   2. write the merged nodes, keeping only the set of node ids;
    #   3. stream every edges file, writing each edge either to the edges
    #      output (both endpoints known, edge id not seen before) or to the
    #      orphan-edges output (an endpoint is missing from the merged nodes).
    args = make_arg_parser().parse_args()
    kg_nodes_file_names = args.kgNodesFiles
    kg_edges_file_names = args.kgEdgesFiles
    test_mode = args.test
    output_nodes_file_name = args.outputNodesFile
    output_edges_file_name = args.outputEdgesFile
    orphan_edges_file_name = args.kgFileOrphanEdges

    # The kg2_util "info" objects are kept whole because the close_* calls at
    # the end need them; element 0 of each is the object that is written to.
    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
    nodes_output = nodes_info[0]
    edges_output = edges_info[0]

    orphan_info = kg2_util.create_single_jsonlines(test_mode)
    orphan_output = orphan_info[0]

    # ---- phase 1: merge nodes by id -------------------------------------
    nodes = dict()
    for kg_nodes_file_name in kg_nodes_file_names:
        kg2_util.log_message("reading nodes from file",
                             ontology_name=kg_nodes_file_name,
                             output_stream=sys.stderr)
        num_nodes_added = 0
        kg_nodes_read_jsonlines_info = kg2_util.start_read_jsonlines(kg_nodes_file_name)
        kg_nodes = kg_nodes_read_jsonlines_info[0]
        for node in kg_nodes:
            node_id = node['id']
            if node_id not in nodes:
                nodes[node_id] = node
                num_nodes_added += 1
            else:
                # Same id seen before: combine the two records (the merge
                # policy itself lives in kg2_util.merge_two_dicts).
                nodes[node_id] = kg2_util.merge_two_dicts(nodes[node_id], node)
        kg2_util.log_message("number of nodes added: " + str(num_nodes_added),
                             ontology_name=kg_nodes_file_name,
                             output_stream=sys.stderr)
        kg2_util.end_read_jsonlines(kg_nodes_read_jsonlines_info)

    # ---- phase 2: write merged nodes, keep only their ids ---------------
    for node in nodes.values():
        nodes_output.write(node)

    # Copy the ids into a real set before dropping the dict.  `nodes.keys()`
    # is only a view that references the dict, so `del nodes` alone would keep
    # every node record alive for the whole edge pass.
    node_ids = set(nodes)
    del nodes

    # ---- phase 3: route edges -------------------------------------------
    edge_keys = set()  # ids of edges already written, across all edge files
    for kg_edges_file_name in kg_edges_file_names:
        kg2_util.log_message("reading edges from file",
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
        # Per-file counters, mirroring num_nodes_added in the node loop.
        num_edges_added = 0
        num_orphan_edges = 0
        edges_read_jsonlines_info = kg2_util.start_read_jsonlines(kg_edges_file_name)
        kg_edges = edges_read_jsonlines_info[0]
        for rel_dict in kg_edges:
            subject_curie = rel_dict['subject']
            object_curie = rel_dict['object']
            if subject_curie in node_ids and object_curie in node_ids:
                edge_key = rel_dict["id"]
                if edge_key not in edge_keys:
                    edge_keys.add(edge_key)
                    edges_output.write(rel_dict)
                    # Count only edges that are actually written; duplicates
                    # of an already-written id are skipped and not counted.
                    num_edges_added += 1
            else:
                orphan_output.write(rel_dict)
                num_orphan_edges += 1
        kg2_util.end_read_jsonlines(edges_read_jsonlines_info)
        kg2_util.log_message("number of edges added: " + str(num_edges_added),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
        kg2_util.log_message("number of orphan edges: " + str(num_orphan_edges),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)

    # Hand the writer objects and destination file names back to kg2_util.
    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
    kg2_util.close_single_jsonlines(orphan_info, orphan_edges_file_name)