forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreport_stats_on_kg_jsonl.py
executable file
·297 lines (242 loc) · 12 KB
/
report_stats_on_kg_jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#!/usr/bin/env python3
'''Provides a JSON overview report of a knowledge graph in Biolink format that is stored as JSON Lines nodes and edges files.
Usage: report_stats_on_kg_jsonl.py [--useSimplifiedPredicates] <inputNodesFile.jsonl> <inputEdgesFile.jsonl> <outputFile.json>
'''
__author__ = 'Stephen Ramsey'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Erica Wood', 'Veronica Flores']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
import argparse
import collections
import datetime
import gzip
import json
import kg2_util
import sys
import jsonlines
def make_arg_parser():
    """
    Build the command-line argument parser for this script.

    The parser accepts three positional string arguments (``inputNodesFile``,
    ``inputEdgesFile``, ``outputFile``) and one optional flag,
    ``--useSimplifiedPredicates``, which is stored on the parsed namespace as
    ``use_simplified_predicates`` and defaults to False.

    :return: the configured ``argparse.ArgumentParser``
    """
    # The description previously advertised the unrelated "build-kg2" script;
    # it now describes what this script actually does.
    arg_parser = argparse.ArgumentParser(
        description='report_stats_on_kg_jsonl.py: provides a JSON overview report of a '
                    'knowledge graph stored as JSON Lines nodes and edges files')
    arg_parser.add_argument('inputNodesFile', type=str)
    arg_parser.add_argument('inputEdgesFile', type=str)
    arg_parser.add_argument('outputFile', type=str)
    arg_parser.add_argument('--useSimplifiedPredicates',
                            dest='use_simplified_predicates',
                            action='store_true',
                            default=False)
    return arg_parser
def get_prefix_from_curie_id(curie_id: str):
    """
    Return the part of a CURIE that comes before its first colon.

    :param curie_id: This parameter is the node id that we need to get the prefix from
    """
    assert ':' in curie_id
    # partition() splits on the first colon only, which is all we need here
    prefix, _, _ = curie_id.partition(':')
    return prefix
def get_edge_stats(edges_file_name: str, use_simplified_predicates=None):
    """
    Make a single pass over the edges file and tally summary statistics.

    :param edges_file_name: This parameter refers to the edges file name that we can get all of the edges from
    :param use_simplified_predicates: When true, the predicate CURIE and label of each edge are read
        from the 'predicate' and 'predicate_label' keys; otherwise they are read from
        'source_predicate' and 'relation_label'. When left as None, the value is taken from the
        module-level ``args`` namespace (the parsed command-line arguments).
    :return: a tuple ``(edges_report, nodes_on_edges)``; ``edges_report`` is a dictionary of the
        tallies and ``nodes_on_edges`` is the set of every subject and object id seen on an edge
    """
    if use_simplified_predicates is None:
        # Fall back to the command-line flag parsed in the __main__ block
        use_simplified_predicates = args.use_simplified_predicates

    # Pick which edge keys we want to access now, especially with questions of simplified predicates
    source_key = 'primary_knowledge_source'
    excluded_key = 'domain_range_exclusion'
    relation_label_key = 'relation_label'
    subject_key = 'subject'
    object_key = 'object'
    if use_simplified_predicates:
        predicate_curie_key = 'predicate'
        label_key = 'predicate_label'
    else:
        predicate_curie_key = 'source_predicate'
        label_key = 'relation_label'

    # Initialize our output data
    edge_count = 0
    edge_sources = collections.Counter()
    edges_by_predicate_curie = collections.Counter()
    edges_by_predicate_type = collections.Counter()
    edges_by_predicate_curie_prefix = collections.Counter()
    unique_relation_curies = set()
    prefix_pairs_dict_for_xrefs = collections.Counter()
    prefix_pairs_dict_for_equivs = collections.Counter()
    excluded_edges = collections.Counter()
    nodes_on_edges = set()

    # Initialize edges reader
    edges_read_jsonlines_info = kg2_util.start_read_jsonlines(edges_file_name)
    try:
        edges = edges_read_jsonlines_info[0]

        # We only have one pass through all of the edges, so we have to get all of the data we want in that one pass
        for edge in edges:
            # Formerly under _number_of_edges
            edge_count += 1

            # Gather all of the data we need from each edge at the start so it can be easily applied to multiple metrics
            source = edge[source_key]
            excluded = edge[excluded_key]
            relation_label = edge[relation_label_key]
            subject_curie = edge[subject_key]
            subject_prefix = get_prefix_from_curie_id(subject_curie)
            object_curie = edge[object_key]
            object_prefix = get_prefix_from_curie_id(object_curie)
            predicate_curie = edge[predicate_curie_key]
            predicate_curie_prefix = get_prefix_from_curie_id(predicate_curie)
            label = edge[label_key]

            # Formerly count_edges_by_source()
            edge_sources[source] += 1

            # Formerly count_edges_by_predicate_curie()
            edges_by_predicate_curie[predicate_curie] += 1

            # Formerly count_edges_by_predicate_type()
            edges_by_predicate_type[label] += 1

            # Formerly count_edges_by_predicate_curie_prefix()
            edges_by_predicate_curie_prefix[predicate_curie_prefix] += 1

            # Formerly part of count_predicates_by_predicate_curie_prefix()
            # The rest must be done after all edges have been processed
            unique_relation_curies.add(predicate_curie)

            # Formerly count_types_of_pairs_of_curies_for_xrefs()
            if relation_label in ('xref', 'close_match'):
                prefix_pairs_dict_for_xrefs[subject_prefix + '---' + object_prefix] += 1

            # Formerly count_types_of_pairs_of_curies_for_equivs()
            if relation_label == kg2_util.EDGE_LABEL_OWL_SAME_AS:
                prefix_pairs_dict_for_equivs[subject_prefix + '---' + object_prefix] += 1

            # Formerly get_excluded_edges()
            if excluded:
                excluded_edges[source] += 1

            # Formerly part of count_orphan_nodes(); needs to process nodes for second part
            nodes_on_edges.add(subject_curie)
            nodes_on_edges.add(object_curie)
    finally:
        # Close our reader whether we finished or an edge raised part-way through
        kg2_util.end_read_jsonlines(edges_read_jsonlines_info)

    # Formerly part of count_predicates_by_predicate_curie_prefix():
    # number of distinct predicate CURIEs per CURIE prefix
    predicate_by_predicate_curie_prefix = dict(
        collections.Counter(get_prefix_from_curie_id(curie) for curie in unique_relation_curies))

    # Save the data in dictionary form (plain dicts, so the report has the same shape as before)
    edges_report = {'_number_of_edges': edge_count,
                    'number_of_edges_by_predicate_curie': dict(edges_by_predicate_curie),
                    'number_of_edges_by_predicate_type': dict(edges_by_predicate_type),
                    'number_of_edges_by_predicate_curie_prefixes': dict(edges_by_predicate_curie_prefix),
                    'number_of_predicates_by_predicate_curie_prefixes': predicate_by_predicate_curie_prefix,
                    'number_of_edges_by_source': dict(edge_sources),
                    'types_of_pairs_of_curies_for_xrefs': dict(prefix_pairs_dict_for_xrefs),
                    'types_of_pairs_of_curies_for_equivs': dict(prefix_pairs_dict_for_equivs),
                    'number_of_excluded_edges': dict(excluded_edges)}

    # Return the dictionary report and the set of all nodes on edges
    return edges_report, nodes_on_edges
def get_node_stats(nodes_file_name: str, nodes_on_edges: set, use_simplified_predicates=None):
    """
    Make a single pass over the nodes file and tally summary statistics.

    :param nodes_file_name: This parameter refers to the nodes file name that we can get all of the nodes from
    :param nodes_on_edges: This parameter provides a set containing all of the node ids that are on edges
    :param use_simplified_predicates: When true, every entry of a node's 'provided_by' list is counted in
        'number_of_nodes_by_source'; otherwise only the first entry is counted. When left as None, the
        value is taken from the module-level ``args`` namespace (the parsed command-line arguments).
    :return: a dictionary of the tallies, plus the build version and build time taken from the build node
    """
    if use_simplified_predicates is None:
        # Fall back to the command-line flag parsed in the __main__ block
        use_simplified_predicates = args.use_simplified_predicates

    # Pick which node keys we want to access now
    category_label_key = 'category_label'
    id_key = 'id'
    source_key = 'provided_by'
    name_key = 'name'
    category_key = 'category'
    deprecated_key = 'deprecated'

    source_node_category = kg2_util.convert_biolink_category_to_curie(kg2_util.SOURCE_NODE_CATEGORY)
    # Id of the node that carries the build version and build time; computed once, not per node
    build_node_id = kg2_util.CURIE_PREFIX_RTX + ':' + 'KG2'

    # Initialize our output data
    node_count = 0
    build_info = dict()
    nodes_by_curie_prefix = collections.Counter()
    nodes_by_curie_prefix_given_no_category = collections.Counter()
    nodes_by_category = collections.Counter()
    nodes_by_source = collections.Counter()
    nodes_by_source_and_category = collections.defaultdict(collections.Counter)
    sources = list()
    deprecated_nodes = collections.Counter()
    orphan_nodes = collections.Counter()

    # Initialize nodes reader
    nodes_read_jsonlines_info = kg2_util.start_read_jsonlines(nodes_file_name)
    try:
        nodes = nodes_read_jsonlines_info[0]

        # We only have one pass through all of the nodes, so we have to get all of the data we want in that one pass
        for node in nodes:
            # Formerly under _number_of_nodes (the build node is included in this count)
            node_count += 1

            node_id = node[id_key]

            # Formerly _build_version and _build_time
            # The build node is excluded from every tally below, so it is recognised before
            # the other per-node fields are read and does not need to carry them.
            if node_id == build_node_id:
                build_info = node
                continue

            # Gather all of the data we need from each node at the start so it can be easily applied to multiple metrics
            category_label = node[category_label_key]
            curie_prefix = get_prefix_from_curie_id(node_id)
            provided_by = node[source_key]
            # The first 'provided_by' entry is the source used for all per-source tallies
            source = provided_by[0]
            name = node[name_key]
            category = node[category_key]
            deprecated = node[deprecated_key]

            # Formerly count_nodes_by_curie_prefix()
            nodes_by_curie_prefix[curie_prefix] += 1

            # Formerly count_nodes_by_curie_prefix_given_no_category()
            if category_label is None or category_label == 'unknown category':
                nodes_by_curie_prefix_given_no_category[curie_prefix] += 1

            # Formerly count_nodes_by_category()
            nodes_by_category[category_label] += 1

            # Formerly count_nodes_by_source()
            for multi_source in provided_by:
                nodes_by_source[multi_source] += 1
                # Without simplified predicates only the first source is counted
                if not use_simplified_predicates:
                    break

            # Formerly count_number_of_nodes_by_source_and_category()
            nodes_by_source_and_category[source][category_label] += 1

            # Formerly get_sources()
            if category == source_node_category:
                sources.append(name)

            # Formerly get_deprecated_nodes()
            if deprecated:
                deprecated_nodes[source] += 1

            # Formerly part of count_orphan_nodes()
            if node_id not in nodes_on_edges:
                orphan_nodes[source] += 1
    finally:
        # Close our reader whether we finished or a node raised part-way through
        kg2_util.end_read_jsonlines(nodes_read_jsonlines_info)

    if not build_info:
        print("WARNING: 'build' property is missing from the input JSON.", file=sys.stderr)

    # Save the data in dictionary form (plain dicts, so the report has the same shape as before)
    nodes_report = {'_number_of_nodes': node_count,
                    '_build_version': build_info.get('name', ""),
                    '_build_time': build_info.get('update_date', ""),
                    'number_of_nodes_by_curie_prefix': dict(nodes_by_curie_prefix),
                    'number_of_nodes_without_category_by_curie_prefix': dict(nodes_by_curie_prefix_given_no_category),
                    'number_of_nodes_by_category_label': dict(nodes_by_category),
                    'number_of_nodes_by_source': dict(nodes_by_source),
                    'number_of_nodes_by_source_and_category': {src: dict(counts)
                                                               for src, counts in nodes_by_source_and_category.items()},
                    'sources': sources,
                    'number_of_deprecated_nodes': dict(deprecated_nodes),
                    'number_of_orphan_nodes': dict(orphan_nodes)}

    # Return the dictionary report
    return nodes_report
if __name__ == '__main__':
    # get_edge_stats() and get_node_stats() read this module-level ``args``, so the name must stay
    args = make_arg_parser().parse_args()

    # Take the report timestamp before either input file is processed
    report_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Edges go first: the node pass needs the set of node ids that appear on edges
    edges_report, nodes_on_edges = get_edge_stats(args.inputEdgesFile)
    nodes_report = get_node_stats(args.inputNodesFile, nodes_on_edges)

    # Merge the timestamp, the edge report and the node report (in that order) into one dictionary
    stats = {'_report_datetime': report_datetime, **edges_report, **nodes_report}

    # Save our output dictionary to the output file
    kg2_util.save_json(stats, args.outputFile, True)