#!/usr/bin/env python3
'''Builds the RTX "KG2" second-generation knowledge graph from various OWL input files.
Usage: multi_ont_to_kg_jsonl.py <categoriesFile.yaml> <curiesToURILALFile>
<ontLoadInventoryFile.yaml> <outputNodesFile> <outputEdgesFile>
'''
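# A hedged example invocation, mirroring the usage string above; these file
# names are illustrative placeholders, not the actual KG2 build paths:
#
#   python3 multi_ont_to_kg_jsonl.py curies-to-categories.yaml \
#       curies-to-uri-map.yaml ont-load-inventory.yaml \
#       kg2-ont-nodes.jsonl kg2-ont-edges.jsonl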
__author__ = 'Stephen Ramsey'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Erica Wood']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
import argparse
import kg2_util
import ontobio
import os.path
import re
import sys
import urllib.parse
import urllib.request
from typing import Dict
import datetime
# -------------- define globals here ---------------
REGEX_ENSEMBL = re.compile('ENS[A-Z]{0,3}([PG])[0-9]{11}')
REGEX_YEAR = re.compile('([12][90][0-9]{2})')
REGEX_YEAR_MONTH_DAY = re.compile('([12][90][0-9]{2})_([0-9]{1,2})_([0-9]{1,2})')
REGEX_MONTH_YEAR = re.compile('([0-9]{1,2})_[12][90][0-9]{2}')
REGEX_YEAR_MONTH = re.compile('[12][90][0-9]{2}_([0-9]{1,2})')
REGEX_PUBLICATIONS = re.compile(r'((?:(?:PMID)|(?:ISBN)):\d+)')
REGEX_XREF_END_DESCRIP = re.compile(r'.*\[([^\]]+)\]$')
REGEX_OBSOLETE = re.compile(r"^obsolete|\(obsolete|obsolete$", re.IGNORECASE)
IRI_OBO_XREF = kg2_util.IRI_OBO_FORMAT_XREF
CURIE_OBO_XREF = kg2_util.CURIE_ID_OBO_FORMAT_XREF
OWL_BASE_CLASS = kg2_util.CURIE_ID_OWL_THING
OWL_NOTHING = kg2_util.CURIE_ID_OWL_NOTHING
NOCODE = 'NOCODE'
MYSTERIOUS_BASE_NODE_ID_TO_FILTER = '_:genid'
ENSEMBL_LETTER_TO_CATEGORY = {'P': 'protein',
'G': 'gene',
'T': 'transcript'}
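# Illustrative examples of what some of the globals above are intended to
# match; the specific identifiers are assumptions chosen for illustration:
#   REGEX_ENSEMBL matches IDs such as 'ENSP00000354587' or 'ENSMUSG00000064370'
#       (capture group 1 is 'P' or 'G', keying into ENSEMBL_LETTER_TO_CATEGORY)
#   REGEX_PUBLICATIONS extracts CURIEs such as 'PMID:12345' or 'ISBN:0123456789'
#       from free-text descriptions
#   MYSTERIOUS_BASE_NODE_ID_TO_FILTER screens out OWL blank nodes whose IDs
#       start with '_:genid'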
# -------------- subroutines with side-effects go here ------------------
def date():
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def delete_ontobio_cachier_caches():
# This is causing issues in the current build because these files don't exist.
# Temporarily commenting out to avoid error.
# kg2_util.purge("~/.cachier", ".ontobio*")
# kg2_util.purge("~/.cachier", ".prefixcommons*")
pass
def load_ont_file_return_ontology_and_metadata(file_name: str,
download_url: str = None,
ontology_title: str = None,
save_pickle: bool = False):
ontology = kg2_util.make_ontology_from_local_file(file_name, save_pickle=save_pickle)
file_last_modified_timestamp = kg2_util.format_timestamp(kg2_util.get_file_last_modified_timestamp(file_name))
print("file: " + file_name + "; last modified: " + file_last_modified_timestamp)
ont_version = ontology.meta.get('version', None)
bpv = ontology.meta.get('basicPropertyValues', None)
title = ontology_title
description = None
umls_sver = None
umls_release = None
source_file_date = None
if bpv is not None:
for bpv_dict in bpv:
pred = bpv_dict['pred']
value = bpv_dict['val']
if 'description' in pred:
description = value
elif 'title' in pred:
if title is None:
title = value
elif pred == kg2_util.BASE_URL_UMLS + 'sver':
ont_version = value
umls_sver = value
elif pred == kg2_util.BASE_URL_OWL + 'versionInfo':
umls_release = value
elif pred.endswith('source_file_date'):
source_file_date = value
if ont_version is None:
ont_version = 'downloaded:' + file_last_modified_timestamp
ontology_id = None
if download_url is not None:
ontology_id = download_url
else:
ontology_id = ontology.id
# print(ontology_id)
if not kg2_util.is_a_valid_http_url(ontology_id):
ontology_id = os.path.basename(file_name)
metadata_dict = {'id': ontology_id,
'handle': ontology.handle,
'file': file_name,
'file last modified timestamp': file_last_modified_timestamp,
'version': ont_version,
'title': title,
'description': description,
'umls-sver': umls_sver,
'umls-release': umls_release,
'source-file-date': source_file_date}
# print(metadata_dict)
return [ontology, metadata_dict]
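# A sketch of the metadata_dict returned above; the keys are exactly those
# assigned in the function, while the example values are assumptions chosen
# for illustration:
#   {'id': 'http://purl.obolibrary.org/obo/go.owl',
#    'handle': 'go',
#    'file': 'go.owl',
#    'file last modified timestamp': '2023-01-01 00:00:00',
#    'version': 'releases/2023-01-01',
#    'title': 'Gene Ontology',
#    'description': 'An ontology covering biological processes, ...',
#    'umls-sver': None,
#    'umls-release': None,
#    'source-file-date': None}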
def make_kg2(curies_to_categories: dict,
uri_to_curie_shortener: callable,
curie_to_uri_expander: callable,
ont_urls_and_files: tuple,
nodes_output,
edges_output,
umls_cui_tsv_file: str,
test_mode: bool = False,
save_pickle: bool = False):
ont_file_information_dict_list = []
# for each OWL file (or URL for an OWL file) described in the YAML config file...
for ont_source_info_dict in ont_urls_and_files:
if ont_source_info_dict['download']:
# get the OWL file onto the local file system and get a full path to it
print(ont_source_info_dict["url"])
local_file_name = kg2_util.download_file_if_not_exist_locally(ont_source_info_dict['url'],
ont_source_info_dict['file'])
else:
local_file_name = ont_source_info_dict['file']
assert os.path.exists(ont_source_info_dict['file']), local_file_name
# load the OWL file data into an ontobio.ontol.Ontology data structure and information dictionary
[ont, metadata_dict] = load_ont_file_return_ontology_and_metadata(local_file_name,
ont_source_info_dict['url'],
ont_source_info_dict['title'],
save_pickle)
metadata_dict['ontology'] = ont
ont_file_information_dict_list.append(metadata_dict)
kg2_util.log_message('Calling get_inverse_rels')
biolink_inverses = get_inverse_rels(ont_file_information_dict_list[0]['ontology'], ont_file_information_dict_list[0], uri_to_curie_shortener)
kg2_util.log_message('Calling make_nodes_dict_from_ontologies_list')
nodes_dict = make_nodes_dict_from_ontologies_list(ont_file_information_dict_list,
curies_to_categories,
uri_to_curie_shortener,
curie_to_uri_expander)
kg2_util.log_message('Calling make_map_of_node_ontology_ids_to_curie_ids')
map_of_node_ontology_ids_to_curie_ids = make_map_of_node_ontology_ids_to_curie_ids(nodes_dict)
kg2_util.log_message('Calling get_rels_dict')
# get a dictionary of all relationships including xrefs as relationships
all_rels_dict = get_rels_dict(nodes_dict,
ont_file_information_dict_list,
uri_to_curie_shortener,
curie_to_uri_expander,
map_of_node_ontology_ids_to_curie_ids)
## This is not necessarily the most efficient place to do #321, but it will have to work for now
for edge in all_rels_dict.values():
edges_output.write(edge)
for edge in biolink_inverses:
edges_output.write(edge)
for node_dict in list(nodes_dict.values()):
del node_dict['xrefs']
del node_dict['ontology node ids']
nodes_output.write(node_dict)
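# make_kg2 only requires that nodes_output and edges_output expose a
# write(dict) method; a minimal stand-in (an assumption for illustration, not
# the real kg2_util writer) could look like:
#
#   import json
#
#   class _JsonLinesWriter:
#       def __init__(self, file_handle):
#           self.file_handle = file_handle
#
#       def write(self, obj: dict):
#           self.file_handle.write(json.dumps(obj) + '\n')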
def get_biolink_category_for_node(ontology_node_id: str,
node_curie_id: str,
ontology: ontobio.ontol.Ontology,
curies_to_categories: dict,
uri_to_curie_shortener: callable,
ontology_node_ids_previously_seen: set,
get_node_id_of_node_with_category: bool,
biolink_category_depths: dict):
if node_curie_id is None:
kg2_util.log_message("Ontology node " + ontology_node_id + " has node_curie_id of None",
ontology_name=ontology.id,
output_stream=sys.stderr)
return [None, None]
# if we have already looked for a category for this node, return None
if ontology_node_id in ontology_node_ids_previously_seen:
return [None, None]
ontology_node_ids_previously_seen.add(ontology_node_id)
curie_prefix = get_prefix_from_curie_id(node_curie_id)
if curie_prefix is None:
kg2_util.log_message("Unable to get prefix from node CURIE id",
ontology_name=ontology.id,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
return [None, None]
# Inelegant hack to ensure that TUI: nodes get mapped to "semantic type" while still enabling us
# to use get_biolink_category_for_node to determine the specific semantic type of a CUI based on its
# TUI record. Need to think about a more elegant way to do this. [SAR]
if curie_prefix == kg2_util.CURIE_PREFIX_UMLS_STY and node_curie_id.split(':')[1].startswith('T') and ontology.id == kg2_util.BASE_URL_UMLS_STY:
return [kg2_util.BIOLINK_CATEGORY_NAMED_THING, None]
if get_node_id_of_node_with_category:
ret_ontology_node_id_of_node_with_category = ontology_node_id
else:
ret_ontology_node_id_of_node_with_category = None
curies_to_categories_terms = curies_to_categories['term-mappings']
curies_to_categories_prefixes = curies_to_categories['prefix-mappings']
# check if the term directly maps
ret_category = curies_to_categories_terms.get(node_curie_id, None)
if ret_category is None:
ret_category = curies_to_categories_prefixes.get(curie_prefix, None)
if ret_category is None:
# need to walk the ontology hierarchy until we encounter a parent term with a defined biolink category
parent_nodes_list = list(ontology.parents(ontology_node_id, ['subClassOf']))
parent_nodes_same_prefix = set()
parent_nodes_different_prefix = set()
parent_nodes_ont_to_curie = dict()
for parent_ontology_node_id in parent_nodes_list:
parent_node_curie_id = get_node_curie_id_from_ontology_node_id(parent_ontology_node_id,
ontology,
uri_to_curie_shortener,
curie_to_uri_expander)
parent_node_curie_prefix = get_prefix_from_curie_id(parent_node_curie_id)
if parent_node_curie_prefix is None:
kg2_util.log_message("Unable to get prefix from node CURIE id",
ontology_name=ontology.id,
node_curie_id=parent_node_curie_id,
output_stream=sys.stderr)
continue
if parent_node_curie_prefix == curie_prefix:
parent_nodes_same_prefix.add(parent_ontology_node_id)
else:
parent_nodes_different_prefix.add(parent_ontology_node_id)
parent_nodes_ont_to_curie[parent_ontology_node_id] = parent_node_curie_id
candidate_categories = set()
for parent_ontology_node_id in list(parent_nodes_same_prefix) + list(parent_nodes_different_prefix):
parent_node_curie_id = parent_nodes_ont_to_curie[parent_ontology_node_id]
try:
[candidate_category,
ontology_node_id_of_node_with_category] = get_biolink_category_for_node(parent_ontology_node_id,
parent_node_curie_id,
ontology,
curies_to_categories,
uri_to_curie_shortener,
ontology_node_ids_previously_seen,
get_node_id_of_node_with_category,
biolink_category_depths)
if get_node_id_of_node_with_category and ontology_node_id_of_node_with_category is not None:
ret_ontology_node_id_of_node_with_category = ontology_node_id_of_node_with_category
except RecursionError:
kg2_util.log_message(message="recursion error: " + ontology_node_id,
ontology_name=ontology.id,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
assert False
if candidate_category is not None:
candidate_categories.add(candidate_category)
if len(candidate_categories) == 1:
ret_category = next(iter(candidate_categories))
elif len(candidate_categories) > 1:
candidate_category_depths = {category: biolink_category_depths.get(kg2_util.CURIE_PREFIX_BIOLINK + ':' +
kg2_util.convert_snake_case_to_camel_case(category.replace(' ', '_'),
uppercase_first_letter=True), None) for
category in sorted(candidate_categories)}
keys_remove = {k for k, v in candidate_category_depths.items() if v is None}
for k in keys_remove:
kg2_util.log_message(message="unexpected None category depth for category " + k,
ontology_name=ontology.id,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
del candidate_category_depths[k]
# candidate_category_depths = {k: v for k, v in candidate_category_depths.items() if v is not None}
if len(candidate_category_depths) > 0:
ret_category = max(candidate_category_depths, key=candidate_category_depths.get)
else:
assert ret_category is None
if ret_category is None:
if node_curie_id.startswith(kg2_util.CURIE_PREFIX_ENSEMBL + ':'):
curie_suffix = node_curie_id.replace(kg2_util.CURIE_PREFIX_ENSEMBL + ':', '')
ensembl_match = REGEX_ENSEMBL.match(curie_suffix)
if ensembl_match is not None:
ensembl_match_letter = ensembl_match[1]
ret_category = ENSEMBL_LETTER_TO_CATEGORY.get(ensembl_match_letter, None)
if ret_category is None:
kg2_util.log_message(message="unrecognized Ensembl ID: " + curie_suffix,
ontology_name=ontology.id,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
return [ret_category, ret_ontology_node_id_of_node_with_category]
# --------------- subroutines that have no side effects except logging/printing ----------
def make_rel_key(subject_id: str,
predicate_name: str,
object_id: str,
ontology_id: str = None):
key = subject_id + ';' + predicate_name + ';' + object_id
if ontology_id is not None:
key += ';' + str(ontology_id)
return key
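# Illustrative example of the key format (the CURIEs are placeholders):
#   make_rel_key('CHEBI:36080', 'subClassOf', 'CHEBI:23367', 'OBO:chebi.owl')
#   -> 'CHEBI:36080;subClassOf;CHEBI:23367;OBO:chebi.owl'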
def parse_umls_sver_date(umls_sver: str, sourcename: str):
if umls_sver.startswith(sourcename + '_'):
umls_sver = umls_sver.split(sourcename + '_')[1]
umls_sver_match = REGEX_YEAR.match(umls_sver)
updated_date = None
if umls_sver_match is not None:
updated_date = umls_sver_match[0]
else:
umls_sver_match = REGEX_YEAR_MONTH_DAY.match(umls_sver)
if umls_sver_match is not None:
updated_date = umls_sver_match[0] + '-' + ('%0.2d' % int(umls_sver_match[1])) + '-' + ('%0.2d' % int(umls_sver_match[2]))
else:
umls_sver_match = REGEX_MONTH_YEAR.match(umls_sver)
if umls_sver_match is not None:
updated_date = umls_sver_match[1] + '-' + ('%0.2d' % int(umls_sver_match[0]))
else:
umls_sver_match = REGEX_YEAR_MONTH.match(umls_sver)
if umls_sver_match is not None:
updated_date = umls_sver_match[0] + ('%0.2d' % int(umls_sver_match[1]))
return updated_date
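# Illustrative example (the sver string is an assumption): for
#   parse_umls_sver_date('HPO_2020_10_12', 'HPO')
# the sourcename prefix is stripped and the year-only pattern matches first,
# so '2020' is returned.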
# ===========================================
# These next functions (until make_nodes_dict_from_ontologies_list)
# are for addressing issue #762 regarding duplicate TUIs
def generate_biolink_category_tree(biolink_ontology: ontobio.ontol.Ontology,
curies_to_categories: dict):
# Adapts biolink parent-child relationships into a tree format
# Format is {parent1: [child1, child2], parent2: [child3, child4]}
# Such that every parent is also a child of another with the exception
# of "named thing" and no child is a child of more than one parent
biolink_category_tree = kg2_util.get_biolink_category_tree(biolink_ontology)
# Takes the TUI mappings from the curies-to-categories.yaml file
# and stores them for the later functions to use
mappings_to_categories = dict()
terms = curies_to_categories['term-mappings']
for term in terms:
if term.startswith(kg2_util.CURIE_PREFIX_UMLS_STY):
mappings_to_categories[term.split(':')[1]] = terms[term]
return [biolink_category_tree, mappings_to_categories]
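# Sketch of the two structures returned above (the specific category names and
# TUI-to-category mappings are assumptions chosen for illustration):
#   biolink_category_tree ~ {'named thing': ['biological entity', ...],
#                            'biological entity': ['gene', 'protein', ...], ...}
#   mappings_to_categories ~ {'T028': 'gene', 'T047': 'disease', ...}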
def get_shorter_list_first(list1: list, list2: list):
# Returns the compared lists in the form [short_list, long_list]
# for use in comparing two lists of biolink category hierarchies
len1 = len(list1)
len2 = len(list2)
if len1 > len2:
return [list2, list1]
return [list1, list2]
def compare_two_lists_in_reverse(list1: list, list2: list):
    # Returns the most specific category that is present in both lists.
    # The most specific category of each list is at index [0].
    # So, by comparing them in reverse, once a discrepancy is hit,
    # the function returns the last element on which the two lists still agreed.
[shortlist, longlist] = get_shorter_list_first(list1, list2)
for short_item in reversed(shortlist):
if short_item not in longlist:
return shortlist[shortlist.index(short_item) + 1]
return shortlist[0]
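# Illustrative example (the category hierarchies are assumed):
#   list1 = ['gene', 'biological entity', 'named thing']
#   list2 = ['protein', 'biological entity', 'named thing']
#   compare_two_lists_in_reverse(list1, list2) -> 'biological entity'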
def get_path(tree_dict: dict,
base_node: str,
goal_node: str,
return_list: list):
# Iterates through the biolink category tree until it gets to
# the goal category in one of the subclasses. Once it does that,
# it adds the superclass to the return list (which contains the
# category hierarchy for the goal category), and if the superclass
    # isn't "named thing" (the base category), it recursively calls itself again
# with the superclass as the goal category this time, continuing
# to add onto the return list until it hits the base category
for superclass, subclasses in tree_dict.items():
if goal_node in subclasses:
return_list.append(superclass)
if superclass != base_node:
get_path(tree_dict, base_node, superclass, return_list)
def split_into_chunks(fulllist: list):
# Takes a list of TUI categories and splits it into a list
    # of pairs of TUI categories so that the multiple TUI categories
    # can be handled in pairs
# Ex. [molecular entity, chemical substance, chemical substance] ->
# [[molecular entity, chemical substance], [chemical substance]]
returnlist = []
addtofullarray = True
for element in fulllist:
if addtofullarray:
returnlist.append([element])
addtofullarray = False
else:
returnlist[-1].append(element)
addtofullarray = True
return returnlist
def find_common_ancestor(tui_categories: list, biolink_category_tree: dict):
# Iterates through chunked TUI category list in pairs, such that
# the most specific common ancestor between the two categories is found
# and stored in the category list, which then gets re-chunked.
# This continues until there is only one category - the most specific common
    # ancestor - left in the tui_categories list.
tui_split = split_into_chunks(tui_categories)
while len(tui_split) > 0:
if len(tui_split[0]) == 1:
break
for pair in tui_split:
if len(pair) < 2:
tui_split[tui_split.index(pair)] = pair[0]
else:
path_list1 = []
path_list1.append(pair[0])
get_path(biolink_category_tree, "named thing", pair[0], path_list1)
path_list2 = []
path_list2.append(pair[1])
get_path(biolink_category_tree, "named thing", pair[1], path_list2)
# print(f"list1: {path_list1!s} list2: {path_list2!s}", file=sys.stderr)
tui_split[tui_split.index(pair)] = compare_two_lists_in_reverse(path_list1,
path_list2)
tui_split = split_into_chunks(tui_split)
return tui_split[0][0]
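# Illustrative walk-through (the tree and category names are assumed):
#   tree = {'named thing': ['biological entity'],
#           'biological entity': ['gene', 'protein']}
#   find_common_ancestor(['gene', 'protein'], tree) -> 'biological entity'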
def get_category_for_multiple_tui(biolink_category_tree: dict,
tui_group: list,
mappings_to_categories: dict):
# Takes the list of multiple TUIs and uses the mappings to categories
# dictionary to create a list of categories that can be used to find
    # the most specific common ancestor between them
tui_categories = []
for tui in tui_group:
tui_category_snakecase = mappings_to_categories[tui]
tui_categories.append(tui_category_snakecase)
return find_common_ancestor(tui_categories, biolink_category_tree)
def make_nodes_dict_from_ontologies_list(ontology_info_list: list,
curies_to_categories: dict,
uri_to_curie_shortener: callable,
curie_to_uri_expander: callable) -> Dict[str, dict]:
ret_dict = dict()
omim_to_hgnc_symbol = dict()
ontologies_iris_to_curies = dict()
tuis_not_in_mappings_but_in_kg2 = set()
biolink_categories_ontology_depths = None
first_ontology = ontology_info_list[0]['ontology']
print(f"Ont: {first_ontology}\nBase URL: {kg2_util.BASE_URL_BIOLINK_ONTOLOGY} {first_ontology.id}")
assert first_ontology.id == kg2_util.BASE_URL_BIOLINK_ONTOLOGY, "biolink needs to be first in ont-load-inventory.yaml"
[biolink_category_tree, mappings_to_categories] = generate_biolink_category_tree(first_ontology, curies_to_categories)
biolink_categories_ontology_depths = kg2_util.get_biolink_categories_ontology_depths(first_ontology)
convert_bpv_pred_to_curie_func = make_convert_bpv_predicate_to_curie(uri_to_curie_shortener,
curie_to_uri_expander)
def biolink_depth_getter(category: str):
return biolink_categories_ontology_depths.get(category, None)
for ontology_info_dict in ontology_info_list:
ontology = ontology_info_dict['ontology']
iri_of_ontology = ontology_info_dict['id']
assert iri_of_ontology is not None
ontology_curie_id = uri_to_curie_shortener(iri_of_ontology)
if ontology_curie_id is None or len(ontology_curie_id) == 0:
ontology_curie_id = iri_of_ontology
print(f"processing ontology: {ontology_curie_id} start {datetime.datetime.now()}", file=sys.stderr)
umls_sver = ontology_info_dict.get('umls-sver', None)
ont_version = ontology_info_dict.get('version', None)
ontology_name = ontology_info_dict['title']
if ont_version is not None:
ontology_name = ontology_name + " version " + ont_version
updated_date = None
if umls_sver is not None:
# if you can, parse sver string into a date string
updated_date = parse_umls_sver_date(umls_sver, ontology_curie_id.split(':')[1])
if updated_date is None:
updated_date = ontology_info_dict.get('source-file-date', None)
if updated_date is None:
umls_release = ontology_info_dict.get('umls-release', None)
if umls_release is not None:
updated_date = re.sub(r'\D', '', umls_release)
if updated_date is None:
updated_date = ontology_info_dict['file last modified timestamp']
ontology_node = kg2_util.make_node(ontology_curie_id,
iri_of_ontology,
ontology_name,
kg2_util.SOURCE_NODE_CATEGORY,
updated_date,
ontology_curie_id)
ontology_node['description'] = ontology_info_dict['description']
ontology_node['ontology node ids'] = [iri_of_ontology]
ontology_node['xrefs'] = []
ret_dict[ontology_curie_id] = ontology_node
ontologies_iris_to_curies[iri_of_ontology] = ontology_curie_id
for ontology_node_id in ontology.nodes():
onto_node_dict = ontology.node(ontology_node_id)
assert onto_node_dict is not None
if ontology_node_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
continue
if ontology_node_id == OWL_NOTHING:
continue
if ontology_node_id.endswith(NOCODE):
continue
node_curie_id = get_node_curie_id_from_ontology_node_id(ontology_node_id,
ontology,
uri_to_curie_shortener,
curie_to_uri_expander)
if node_curie_id is None:
kg2_util.log_message(message="Unable to obtain a CURIE for ontology node ID: " + ontology_node_id,
ontology_name=iri_of_ontology,
output_stream=sys.stderr)
continue
iri = onto_node_dict.get('id', None)
if iri is None:
iri = ontology_node_id
if not kg2_util.is_a_valid_http_url(iri):
iri = curie_to_uri_expander(iri)
iri = curie_to_uri_expander(node_curie_id)
if iri is None:
kg2_util.log_message(message="Cannot obtain IRI for CURIE",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
continue
assert kg2_util.is_a_valid_http_url(iri), iri
node_name = onto_node_dict.get('label', None)
node_full_name = None
assert node_curie_id is not None
if node_curie_id in ret_dict:
prev_provided_by = ret_dict[node_curie_id].get('provided_by')
if prev_provided_by is not None and node_curie_id == prev_provided_by:
continue # issue 984
curie_prefix = get_prefix_from_curie_id(node_curie_id)
if curie_prefix == kg2_util.CURIE_PREFIX_UMLS_STY and node_curie_id.split(':')[1].startswith('T') and ontology.id != kg2_util.BASE_URL_UMLS_STY:
# this is a UMLS semantic type TUI node from a non-STY UMLS source, ignore it
continue
# to address issue #1361
if curie_prefix == kg2_util.CURIE_PREFIX_ENSEMBL and REGEX_ENSEMBL.match(node_curie_id.replace(curie_prefix + ':', '')) is None:
node_curie_id = node_curie_id.replace(kg2_util.CURIE_PREFIX_ENSEMBL, kg2_util.CURIE_PREFIX_ENSEMBL_GENOMES)
iri = curie_to_uri_expander(node_curie_id)
kg2_util.log_message(message="Switching Ensembl: prefix to EnsemblGenomes:",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
[node_category_label, node_with_category] = get_biolink_category_for_node(ontology_node_id,
node_curie_id,
ontology,
curies_to_categories,
uri_to_curie_shortener,
set(),
True,
biolink_categories_ontology_depths)
node_deprecated = False
node_description = None
node_creation_date = None
node_update_date = None
node_replaced_by_curie = None
node_full_name = None
node_publications = set()
node_synonyms = set()
node_xrefs = set()
node_tui = None
node_has_cui = False
node_tui_category_label = None
node_gene_symbol = None
node_meta = onto_node_dict.get('meta', None)
if node_meta is not None:
node_deprecated = node_meta.get('deprecated', False)
if node_meta.get('deprecated', False):
kg2_util.log_message(message="Node has obsolete meta; setting deprecated=True",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
node_definition = node_meta.get('definition', None)
if node_definition is not None:
node_description = node_definition['val']
node_definition_xrefs = node_definition.get('xrefs', None)
if node_definition_xrefs is not None:
assert type(node_definition_xrefs) == list
for xref in node_definition_xrefs:
xref_pub = xref_as_a_publication(xref)
if xref_pub is not None:
node_publications.add(xref_pub)
node_synonyms_list = node_meta.get('synonyms', None)
if node_synonyms_list is not None:
for syn_dict in node_synonyms_list:
syn_pred = syn_dict['pred']
if syn_pred == 'hasExactSynonym':
node_synonyms.add(syn_dict['val'])
syn_xrefs = syn_dict['xrefs']
if len(syn_xrefs) > 0:
for syn_xref in syn_xrefs:
syn_xref_pub = xref_as_a_publication(syn_xref)
if syn_xref_pub is not None:
node_publications.add(syn_xref_pub)
node_xrefs_list = node_meta.get('xrefs', None)
if node_xrefs_list is not None:
for xref_dict in node_xrefs_list:
xref_curie = xref_dict['val']
if xref_curie.startswith('UMLS:C'):
xref_curie = kg2_util.CURIE_PREFIX_UMLS + ':' + xref_curie.split('UMLS:')[1]
node_xrefs.add(xref_curie)
basic_property_values = node_meta.get('basicPropertyValues', None)
if basic_property_values is not None:
node_tui_list = []
for basic_property_value_dict in basic_property_values:
bpv_pred = basic_property_value_dict['pred']
bpv_pred_curie = convert_bpv_pred_to_curie_func(bpv_pred)
if bpv_pred_curie is None:
bpv_pred_curie = bpv_pred
bpv_val = basic_property_value_dict['val']
if bpv_pred_curie in {kg2_util.CURIE_ID_OIO_CREATION_DATE,
kg2_util.CURIE_ID_DCTERMS_ISSUED,
kg2_util.CURIE_ID_HGNC_DATE_CREATED}:
node_creation_date = bpv_val
elif bpv_pred_curie == kg2_util.CURIE_ID_HGNC_DATE_LAST_MODIFIED:
node_update_date = bpv_val
elif bpv_pred_curie == kg2_util.CURIE_ID_IAO_TERM_REPLACED_BY:
if not node_deprecated:
node_deprecated = True
kg2_util.log_message(message="Node has IAO:0100001 attribute but not owl:deprecated; setting deprecated=True",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
node_replaced_by_uri = bpv_val
node_replaced_by_curie = uri_to_curie_shortener(node_replaced_by_uri)
elif bpv_pred_curie == kg2_util.CURIE_ID_UMLS_HAS_TUI: # STY_BASE_IRI:
node_tui_list.append(bpv_val)
elif bpv_pred_curie == kg2_util.CURIE_ID_SKOS_PREF_LABEL:
if not node_curie_id.startswith(kg2_util.CURIE_PREFIX_HGNC + ':'):
node_name = bpv_val
else:
node_full_name = bpv_val
if node_name is None:
node_name = node_full_name
elif bpv_pred_curie == kg2_util.CURIE_ID_SKOS_ALT_LABEL:
if node_curie_id.startswith(kg2_util.CURIE_PREFIX_HGNC + ':') and bpv_val.endswith(' gene'):
node_gene_symbol = bpv_val.replace(' gene', '')
node_synonyms.add(bpv_val)
elif bpv_pred_curie == kg2_util.CURIE_ID_SKOS_DEFINITION:
node_description = kg2_util.strip_html(bpv_val)
elif bpv_pred_curie == kg2_util.CURIE_ID_HGNC_GENE_SYMBOL:
node_gene_symbol = bpv_val
node_synonyms.add(node_gene_symbol)
if len(node_tui_list) == 1:
node_tui = node_tui_list[0]
node_tui_curie = kg2_util.CURIE_PREFIX_UMLS_STY + ':' + node_tui
node_tui_uri = curie_to_uri_expander(node_tui_curie)
assert node_tui_curie is not None
[node_tui_category_label,
_] = get_biolink_category_for_node(node_tui_uri,
node_tui_curie,
ontology,
curies_to_categories,
uri_to_curie_shortener,
set(),
True,
biolink_categories_ontology_depths)
node_comments = node_meta.get('comments', None)
if node_comments is not None:
comments_str = 'COMMENTS: ' + (' // '.join(node_comments))
if node_description is not None:
node_description += ' // ' + comments_str
else:
node_description = comments_str
node_type = onto_node_dict.get('type', None)
if node_type is not None and node_type == 'PROPERTY':
node_category_label = kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY
if node_category_label is None or node_category_label == 'named thing':
# This is a fix for #891. It was supposed to be addressed on line 756 ("if node_category_label is None:")
# and 757 ("node_category_label = node_tui_category_label"), but due to the assignment of the label
# 'named thing', that condition was never triggered. Instead, that is now handled here.
if node_tui is not None:
if node_tui in mappings_to_categories:
node_category_label = mappings_to_categories[node_tui]
else:
kg2_util.log_message(message="Node with ontology_node_id " + ontology_node_id + " does not have a category and has tui " + node_tui,
output_stream=sys.stderr)
tuis_not_in_mappings_but_in_kg2.add(node_tui)
if node_category_label is None:
node_category_label = 'named thing'
if node_has_cui:
assert node_tui is not None or len(node_tui_list) > 0
if node_tui_category_label is None:
node_tui_category_label = 'named thing'
if node_tui is not None:
kg2_util.log_message(message='Node ' + ontology_node_id + ' has CUI whose TUI cannot be mapped to category: ' + node_tui,
ontology_name=iri_of_ontology,
output_stream=sys.stderr)
else:
try:
# POSSIBLY SHOULD REMOVE "or node_category_label == 'named thing'"
if node_category_label is None or node_category_label == 'named thing' or node_curie_id.split(":")[0] == kg2_util.CURIE_PREFIX_UMLS:
node_tui_category_label = get_category_for_multiple_tui(biolink_category_tree, node_tui_list, mappings_to_categories)
node_category_label = node_tui_category_label
except KeyError:
kg2_util.log_message(message='Node ' + node_curie_id + ' has CUI with multiple associated TUIs: ' + ', '.join(node_tui_list) +
' and could not be mapped',
ontology_name=iri_of_ontology,
output_stream=sys.stderr)
else:
if node_category_label is None:
node_category_label = node_tui_category_label # override the node category_label if we have a TUI
node_tui_category_curie = kg2_util.convert_biolink_category_to_curie(node_tui_category_label)
ontology_curie_id = ontologies_iris_to_curies[iri_of_ontology]
source_ontology_information = ret_dict.get(ontology_curie_id, None)
if source_ontology_information is None:
kg2_util.log_message(message="ontology IRI has no information dictionary available",
ontology_name=iri_of_ontology,
output_stream=sys.stderr)
assert False
source_ontology_update_date = source_ontology_information['update_date']
if node_update_date is None:
node_update_date = source_ontology_update_date
if node_description is not None:
node_description_xrefs_match = REGEX_XREF_END_DESCRIP.match(node_description)
if node_description_xrefs_match is not None:
node_description_xrefs_str = node_description_xrefs_match[1]
node_description_xrefs_list = node_description_xrefs_str.split(',')
for node_description_xref_str in node_description_xrefs_list:
node_description_xref_str = node_description_xref_str.strip()
if ':' in node_description_xref_str:
node_xrefs.add(node_description_xref_str)
node_description_pubs = REGEX_PUBLICATIONS.findall(node_description)
for pub_curie in node_description_pubs:
node_publications.add(pub_curie)
# deal with node names that are ALLCAPS
if node_name is not None and node_name.isupper():
node_name = kg2_util.allcaps_to_only_first_letter_capitalized(node_name)
if node_name is not None:
if node_name.lower().startswith('obsolete:') or \
(node_curie_id.startswith(kg2_util.CURIE_PREFIX_GO + ':') and node_name.lower().startswith('obsolete ')):
node_deprecated = True
kg2_util.log_message(message="Node has obsolete name but not owl:deprecated; setting deprecated=True",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
if REGEX_OBSOLETE.match(node_name) is not None:
node_deprecated = True
kg2_util.log_message(message="Node has obsolete regex in name but not owl:deprecated; setting deprecated=True",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
if node_description is not None:
if node_description.lower().startswith('obsolete:') or node_description.lower().startswith('obsolete.'):
node_deprecated = True
kg2_util.log_message(message="Node has obsolete description but not owl:deprecated; setting deprecated=True",
ontology_name=iri_of_ontology,
node_curie_id=node_curie_id,
output_stream=sys.stderr)
provided_by = ontology_curie_id
if node_name is None:
if node_gene_symbol is not None:
node_name = node_gene_symbol
node_dict = kg2_util.make_node(node_curie_id,
iri,
node_name,
node_category_label,
node_update_date,
provided_by)
if node_gene_symbol is not None:
node_dict['name'] = node_gene_symbol
node_dict['full_name'] = node_full_name
node_dict['description'] = node_description
node_dict['creation_date'] = node_creation_date # slot name is not biolink standard
node_dict['deprecated'] = node_deprecated # slot name is not biolink standard
node_dict['replaced_by'] = node_replaced_by_curie # slot name is not biolink standard
node_dict['ontology node ids'] = [ontology_node_id] # slot name is not biolink standard
node_dict['xrefs'] = sorted(list(node_xrefs)) # slot name is not biolink standard
node_dict['synonym'] = sorted(list(node_synonyms)) # slot name is not biolink standard
node_dict['publications'] = sorted(list(node_publications))
if node_curie_id in ret_dict:
if node_curie_id != provided_by:
node_dict = kg2_util.merge_two_dicts(ret_dict[node_curie_id],
node_dict,
biolink_depth_getter)
ret_dict[node_curie_id] = node_dict
else:
ret_dict[node_curie_id] = node_dict # issue 984
else:
ret_dict[node_curie_id] = node_dict
return ret_dict
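# Hedged sketch of one entry in the dictionary returned above, keyed by CURIE.
# The base slots come from kg2_util.make_node() and the extra slots are
# assigned in this function; the example values are assumptions:
#   ret_dict['GO:0008150'] ~ {'id': 'GO:0008150',
#                             'name': 'biological_process',
#                             'update_date': '2023-01-01',
#                             'provided_by': ...,
#                             'full_name': None,
#                             'description': ...,
#                             'creation_date': None,
#                             'deprecated': False,
#                             'replaced_by': None,
#                             'ontology node ids': [...],
#                             'xrefs': [...],
#                             'synonym': [...],
#                             'publications': [...]}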
def get_rels_dict(nodes: dict,
ont_file_information_dict_list: list,
uri_to_curie_shortener: callable,
curie_to_uri_expander: callable,
map_of_node_ontology_ids_to_curie_ids: dict):
rels_dict = dict()
for ont_file_information_dict in ont_file_information_dict_list:
ontology = ont_file_information_dict['ontology']
ontology_id = ont_file_information_dict['id']
ont_graph = ontology.get_graph()
ontology_curie_id = map_of_node_ontology_ids_to_curie_ids[ontology_id]
for (object_id, subject_id, predicate_dict) in ont_graph.edges(data=True):
assert type(predicate_dict) == dict
ontology_node = nodes.get(ontology_curie_id, None)
if ontology_node is not None:
ontology_update_date = ontology_node['update_date']
if subject_id == OWL_BASE_CLASS or object_id == OWL_BASE_CLASS:
continue
if subject_id == OWL_NOTHING or object_id == OWL_NOTHING:
continue
if subject_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER) or \
object_id.startswith(MYSTERIOUS_BASE_NODE_ID_TO_FILTER):
continue
if subject_id.endswith(NOCODE) or object_id.endswith(NOCODE):
continue
# subject_id and object_id are IDs from the original ontology objects; these may not
# always be the node curie IDs (e.g., for SNOMED terms). Need to map them
subject_curie_id = map_of_node_ontology_ids_to_curie_ids.get(subject_id, None)
if subject_curie_id is None:
kg2_util.log_message(message="subject node ontology ID has no curie ID in the map",
ontology_name=ontology.id,
node_curie_id=subject_id,
output_stream=sys.stderr)
continue
object_curie_id = map_of_node_ontology_ids_to_curie_ids.get(object_id, None)
if object_curie_id is None:
kg2_util.log_message(message="object node ontology ID has no curie ID in the map",
ontology_name=ontology.id,
node_curie_id=object_id,
output_stream=sys.stderr)
continue
predicate_label = None
edge_pred_string = predicate_dict['pred']
if subject_curie_id.startswith(kg2_util.CURIE_PREFIX_UMLS_STY) and \
object_curie_id.startswith(kg2_util.CURIE_PREFIX_UMLS_STY) and edge_pred_string == 'subClassOf':
continue
if edge_pred_string == "type" and ontology_curie_id.startswith(kg2_util.CURIE_PREFIX_BIOLINK_SOURCE + ':'):
continue
if not kg2_util.is_a_valid_http_url(edge_pred_string):
# edge_pred_string is not a URI; this is the most common case
if ':' not in edge_pred_string:
# edge_pred_string is not a CURIE; this is the most common subcase
if edge_pred_string in kg2_util.RDFS_EDGE_NAMES_SET:
predicate_curie = kg2_util.CURIE_PREFIX_RDFS + ':' + edge_pred_string
elif edge_pred_string in kg2_util.OWL_EDGE_NAMES_SET:
predicate_curie = kg2_util.CURIE_PREFIX_OWL + ':' + edge_pred_string
elif edge_pred_string in kg2_util.MONDO_EDGE_NAMES_SET:
predicate_curie = kg2_util.CURIE_PREFIX_MONDO + ':' + edge_pred_string
elif edge_pred_string in kg2_util.RDF_EDGE_NAMES_SET:
predicate_curie = kg2_util.CURIE_PREFIX_RDF + ':' + edge_pred_string
else:
assert False, "Cannot map predicate name: " + edge_pred_string + " to a predicate CURIE, in ontology: " + ontology.id
predicate_label = kg2_util.convert_camel_case_to_snake_case(edge_pred_string)
else:
# edge_pred_string is a CURIE
predicate_curie = edge_pred_string
predicate_node = nodes.get(predicate_curie, None)
if predicate_node is not None:
predicate_label = predicate_node['name']
else:
# predicate has no node object defined; just pull the label out of the CURIE
if edge_pred_string.startswith(kg2_util.CURIE_PREFIX_OBO + ':'):
test_curie = edge_pred_string.replace(kg2_util.CURIE_PREFIX_OBO + ':', '').replace('_', ':')
predicate_node = nodes.get(test_curie, None)
if predicate_node is None:
predicate_label = edge_pred_string.split(':')[1].split('#')[-1]
else:
predicate_curie = test_curie
else:
predicate_label = edge_pred_string
predicate_iri = curie_to_uri_expander(predicate_curie)
predicate_curie_new = uri_to_curie_shortener(predicate_iri)
if predicate_curie_new is not None:
predicate_curie = predicate_curie_new
else:
predicate_iri = edge_pred_string
predicate_curie = uri_to_curie_shortener(predicate_iri)
if predicate_curie is None:
kg2_util.log_message(message="predicate IRI has no CURIE: " + predicate_iri,
ontology_name=ontology.id,
output_stream=sys.stderr)
continue
if subject_curie_id == object_curie_id and predicate_label == 'xref':
continue
if predicate_curie == kg2_util.CURIE_ID_UMLS_HAS_STY:
subject_node = nodes[subject_curie_id]
object_node = nodes[object_curie_id]
subject_description = subject_node['description']
if subject_description is None:
subject_description = ''
subject_node['description'] = '; '.join(list(filter(None, [subject_description,
'UMLS Semantic Type: ' + object_node['id']])))
continue