forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_kg2_pmids.py
60 lines (47 loc) · 1.89 KB
/
extract_kg2_pmids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
''' extract_kg2_pmids.py: stores all PMIDs listed in KG2 in
a JSON file to be used by pubmed_xml_to_kg_json.py
Usage: extract_kg2_pmids.py <inputFile.json> <outputFile.json>
'''
import datetime
import kg2_util
import json
import argparse
__author__ = 'Erica Wood'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Erica Wood']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
def get_args(argv=None):
    """Parse the command-line arguments of this script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When None (the default), argparse reads
            ``sys.argv[1:]``, which is what happens when the script is run
            from a shell.

    Returns:
        An ``argparse.Namespace`` with two string attributes:
        ``inputFile`` (path of the full KG2 JSON file) and ``outputFile``
        (path of the JSON file that will receive the list of PMIDs).
    """
    # Adjacent literals are concatenated at compile time; unlike a
    # backslash continuation *inside* a string, this keeps source
    # indentation out of the help text.
    arg_parser = argparse.ArgumentParser(
        description=('extract_kg2_pmids.py: '
                     'stores all unique PMIDs listed in '
                     'KG2 in a JSON file to be used by '
                     'pubmed_xml_to_kg_json.py'))
    input_description = "The Full KG2 JSON File"
    arg_parser.add_argument('inputFile', type=str, help=input_description)
    output_description = "A JSON File that Will Store a List of Unique PMIDs"
    arg_parser.add_argument('outputFile', type=str, help=output_description)
    return arg_parser.parse_args(argv)
def collect_pmids(kg2_data):
    """Return the unique PMID identifiers referenced by a KG2 graph.

    Publications are gathered from each node's "publications" list and
    from each edge's "publications_info" keys and "publications" list.
    Only identifiers starting with "PMID" are kept. The result is in
    first-seen order (nodes first, then edges), without duplicates.

    Args:
        kg2_data: The parsed KG2 JSON object, a dict with "nodes" and
            "edges" lists whose items are dicts.

    Returns:
        A list of unique identifier strings that start with "PMID".

    Raises:
        KeyError: If ``kg2_data`` has no "nodes" or no "edges" entry.
    """
    # A dict is used as an insertion-ordered set so that the output order
    # is deterministic from run to run (a plain set would not be).
    pmids = {}

    def _remember(identifiers):
        # A missing or null field arrives here as None; treat it as empty.
        for identifier in identifiers or ():
            if identifier.startswith("PMID"):
                pmids[identifier] = None

    for node in kg2_data["nodes"]:
        _remember(node.get("publications"))
    for edge in kg2_data["edges"]:
        # Iterating a dict yields its keys, which are the publication IDs.
        _remember(edge.get("publications_info"))
        _remember(edge.get("publications"))
    return list(pmids)


if __name__ == '__main__':
    args = get_args()
    # The context manager closes the input even if json.load raises.
    with open(args.inputFile, encoding='utf-8') as input_file:
        kg2_data = json.load(input_file)
    with open(args.outputFile, 'w') as output_file:
        json.dump(collect_pmids(kg2_data), output_file)