forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathchembl_mysql_to_kg_jsonl.py
executable file
·496 lines (451 loc) · 21.1 KB
/
chembl_mysql_to_kg_jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
#!/usr/bin/env python3
'''chembl_mysql_to_kg_json.py: Extracts a KG in JSON format from the ChEMBL mysql database
Usage: chembl_mysql_to_kg_json.py [--test] <mysql_conf> <chembl_mysql_dbname> <outputNodesFile.json> <outputEdgesFile.json>
'''
__author__ = 'Stephen Ramsey'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
import argparse
import kg2_util
import pymysql
import datetime
CHEMBL_CURIE_BASE_COMPOUND = kg2_util.CURIE_PREFIX_CHEMBL_COMPOUND
CHEMBL_CURIE_BASE_TARGET = kg2_util.CURIE_PREFIX_CHEMBL_TARGET
CHEMBL_CURIE_BASE_MECHANISM = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM
CHEMBL_KB_CURIE_ID = kg2_util.CURIE_PREFIX_IDENTIFIERS_ORG_REGISTRY + \
':' + 'chembl.compound'
CHEMBL_KB_URL = kg2_util.BASE_URL_IDENTIFIERS_ORG_REGISTRY + 'chembl.compound'
CHEMBL_BASE_IRI_COMPOUND = kg2_util.BASE_URL_CHEMBL_COMPOUND
CHEMBL_BASE_IRI_TARGET = kg2_util.BASE_URL_CHEMBL_TARGET
CHEMBL_BASE_IRI_PREDICATE = kg2_util.BASE_URL_CHEMBL_MECHANISM
ROW_LIMIT_TEST_MODE = 10000
TARGET_TYPE_TO_CATEGORY = {
'CELL-LINE': kg2_util.BIOLINK_CATEGORY_CELL_LINE,
'CHIMERIC PROTEIN': kg2_util.BIOLINK_CATEGORY_PROTEIN,
'LIPID': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY,
'MACROMOLECULE': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY,
'METAL': kg2_util.BIOLINK_CATEGORY_CHEMICAL_ENTITY,
'NUCLEIC-ACID': kg2_util.BIOLINK_CATEGORY_NUCLEIC_ACID_ENTITY,
'OLIGOSACCHARIDE': kg2_util.BIOLINK_CATEGORY_SMALL_MOLECULE,
'ORGANISM': kg2_util.BIOLINK_CATEGORY_ORGANISM_TAXON,
'PHENOTYPE': kg2_util.BIOLINK_CATEGORY_PHENOTYPIC_FEATURE,
'PROTEIN COMPLEX': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY,
'PROTEIN COMPLEX GROUP': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY,
'PROTEIN FAMILY': kg2_util.BIOLINK_CATEGORY_GENE_FAMILY,
'PROTEIN NUCLEIC-ACID COMPLEX': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY,
'PROTEIN-PROTEIN INTERACTION': kg2_util.BIOLINK_CATEGORY_PROTEIN,
'SELECTIVITY GROUP': kg2_util.BIOLINK_CATEGORY_GENE_FAMILY,
'SINGLE PROTEIN': kg2_util.BIOLINK_CATEGORY_PROTEIN,
'SMALL MOLECULE': kg2_util.BIOLINK_CATEGORY_SMALL_MOLECULE,
'SUBCELLULAR': kg2_util.BIOLINK_CATEGORY_CELLULAR_COMPONENT,
'TISSUE': kg2_util.BIOLINK_CATEGORY_ANATOMICAL_ENTITY,
'UNKNOWN': kg2_util.BIOLINK_CATEGORY_MOLECULAR_ENTITY
}
def get_args():
arg_parser = argparse.ArgumentParser(
description='chembl_mysql_to_kg_json.py: Extracts a KG in JSON ' +
'format from the ChEMBL mysql database')
arg_parser.add_argument('--test', dest='test', action="store_true",
default=False)
arg_parser.add_argument('mysqlConfigFile', type=str)
arg_parser.add_argument('mysqlDBName', type=str)
arg_parser.add_argument('outputNodesFile', type=str)
arg_parser.add_argument('outputEdgesFile', type=str)
return arg_parser.parse_args()
def date():
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def make_edge(subject_id: str,
object_id: str,
predicate_label: str,
update_date: str = None,
publications: list = None):
relation_curie = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + \
predicate_label
edge = kg2_util.make_edge(subject_id,
object_id,
relation_curie,
predicate_label,
CHEMBL_KB_CURIE_ID,
update_date)
edge['publications'] = [] if publications is None else publications
edge['publications_info'] = {}
return edge
def make_node(id: str,
iri: str,
name: str,
category_label: str,
description: str,
synonym: list,
publications: list,
update_date: str,
canonical_smiles: str = None):
node_dict = kg2_util.make_node(id,
iri,
name,
category_label,
update_date,
CHEMBL_KB_CURIE_ID)
node_dict['description'] = description
node_dict['synonym'] = sorted(synonym)
node_dict['publications'] = sorted(publications)
node_dict['has_biological_sequence'] = canonical_smiles
return node_dict
if __name__ == '__main__':
print("Start time: ", date())
args = get_args()
mysql_config_file = args.mysqlConfigFile
mysql_db_name = args.mysqlDBName
output_nodes_file_name = args.outputNodesFile
output_edges_file_name = args.outputEdgesFile
test_mode = args.test
connection = pymysql.connect(read_default_file=mysql_config_file, db=mysql_db_name)
nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
nodes_output = nodes_info[0]
edges_output = edges_info[0]
str_sql_row_limit_test_mode = ' limit ' + str(ROW_LIMIT_TEST_MODE)
sql = "select name, DATE_FORMAT(creation_date, '%Y-%m-%d') from version"
with connection.cursor() as cursor:
cursor.execute(sql)
version_results = cursor.fetchone()
version = version_results[0].replace('ChEMBL_', '')
update_date = version_results[1]
# create node objects for ChEMBL compounds
sql = '''select distinct
molecule_dictionary.chembl_id,
molecule_dictionary.pref_name,
molecule_dictionary.molecule_type,
molecule_dictionary.max_phase,
molecule_dictionary.availability_type,
compound_structures.standard_inchi,
compound_structures.standard_inchi_key,
compound_structures.canonical_smiles,
compound_properties.full_mwt,
molecule_dictionary.molregno
from (molecule_dictionary
left join compound_structures on molecule_dictionary.molregno = compound_structures.molregno)
left join compound_properties on molecule_dictionary.molregno = compound_properties.molregno'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
molecule_sql_results = cursor.fetchall()
row_ctr = 0
for (chembl_id,
pref_name,
molecule_type,
max_phase_int,
availability_type,
standard_inchi,
standard_inchi_key,
canonical_smiles,
full_mwt,
molregno) in molecule_sql_results:
row_ctr += 1
if row_ctr % 100000 == 0:
print("have processed " + str(row_ctr) + " compounds")
synonyms = []
if standard_inchi is not None:
synonyms.append(standard_inchi)
if standard_inchi_key is not None:
synonyms.append(standard_inchi_key)
if canonical_smiles is not None:
synonyms.append(canonical_smiles)
sequence = canonical_smiles
curie_id = 'CHEMBL.COMPOUND:' + chembl_id
category_label = kg2_util.BIOLINK_CATEGORY_SMALL_MOLECULE
# query to get all synonyms and publications associated with the ChEMBL molecule
sql_synonyms = ('select distinct compound_name, src_short_name, src_compound_id, pubmed_id '
'from (compound_records natural join source) '
'left join docs on compound_records.doc_id = docs.doc_id '
'where molregno =')
sql_synonyms += str(molregno)
publications = []
publications_set = set()
with connection.cursor() as cursor:
cursor.execute(sql_synonyms)
synonym_results = cursor.fetchall()
synonym_set = set()
for (compound_name,
src_short_name,
src_compound_id,
pubmed_id) in synonym_results:
if pref_name is None and compound_name is not None:
pref_name = compound_name
if compound_name is not None:
synonym_set.add(compound_name)
if pubmed_id is not None:
publications_set.add(kg2_util.CURIE_PREFIX_PMID + ':' + str(pubmed_id))
if src_compound_id is not None and src_short_name is not None and src_short_name != "LITERATURE":
synonym_set.add(src_short_name + ':' + src_compound_id)
compound_synonyms = list(synonym_set)
publications += list(publications_set)
synonyms += compound_synonyms
synonyms = synonyms
if pref_name is not None:
description = pref_name
else:
description = ''
if full_mwt is not None:
description += '; FULL_MW:' + str(full_mwt)
if max_phase_int is not None:
description += '; MAX_FDA_APPROVAL_PHASE: ' + str(max_phase_int)
id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
iri = CHEMBL_BASE_IRI_COMPOUND + chembl_id
node_dict = make_node(id,
iri,
pref_name,
category_label,
description,
synonyms,
publications,
update_date,
canonical_smiles)
nodes_output.write(node_dict)
# create node objects for ChEMBL targets
sql = '''select distinct
target_dictionary.chembl_id,
target_dictionary.tax_id,
target_dictionary.pref_name,
target_type.target_type from
target_dictionary natural join target_type'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (chembl_id,
tax_id,
pref_name,
target_type) in results:
curie_id = 'CHEMBL.TARGET:' + chembl_id
category_label = 'drug_target'
description = pref_name
if target_type is not None:
description += '; TARGET_TYPE: ' + target_type
category_label = TARGET_TYPE_TO_CATEGORY.get(target_type, None)
if category_label is None:
continue
node_dict = make_node(curie_id,
CHEMBL_BASE_IRI_TARGET + chembl_id,
pref_name,
category_label,
description,
[],
[],
update_date)
nodes_output.write(node_dict)
# create node objects for "mechanism_of_action" types
sql = 'select distinct mechanism_of_action from drug_mechanism'
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (mechanism_of_action,) in results:
if mechanism_of_action is not None:
node_label = mechanism_of_action.lower().replace(' ', '_')
node_curie_id = CHEMBL_CURIE_BASE_MECHANISM + ':' + node_label
category_label = kg2_util.BIOLINK_CATEGORY_NAMED_THING
node_dict = make_node(node_curie_id,
CHEMBL_BASE_IRI_PREDICATE + node_label,
mechanism_of_action,
category_label,
None,
[],
[],
update_date)
nodes_output.write(node_dict)
# get action_type nodes and their subclass_of relationships
sql = 'select action_type, description, parent_type from action_type'
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (action_type, description, parent_type) in results:
name = action_type.lower()
predicate_label = name.replace(' ', '_')
curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + predicate_label
node_dict = make_node(curie_id,
CHEMBL_BASE_IRI_PREDICATE + predicate_label,
name,
kg2_util.BIOLINK_CATEGORY_NAMED_THING,
description,
[],
[],
update_date)
nodes_output.write(node_dict)
parent_label = parent_type.lower().replace(' ', '_')
parent_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + parent_label
new_edge = kg2_util.make_edge_biolink(curie_id,
parent_curie_id,
kg2_util.EDGE_LABEL_BIOLINK_SUBCLASS_OF,
CHEMBL_KB_CURIE_ID,
update_date)
edges_output.write(new_edge)
# get target-to-target subset_of relationships
sql = '''select distinct
t1.chembl_id,
target_relations.relationship,
t2.chembl_id
from
(target_dictionary as t1 inner join
target_relations on t1.tid = target_relations.tid) inner join
target_dictionary as t2 on t2.tid = target_relations.related_tid'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (t1_chembl_id,
relationship,
t2_chembl_id) in results:
subject_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_TARGET + ':' + t1_chembl_id
object_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_TARGET + ':' + t2_chembl_id
predicate_label = relationship.lower().replace(' ', '_')
edges_output.write(make_edge(subject_curie_id,
object_curie_id,
predicate_label,
update_date))
# get ChEMBL target-to-protein and target-to-RNA relationships
sql = '''select distinct
target_dictionary.chembl_id,
target_components.homologue,
component_sequences.component_type,
component_sequences.accession,
component_sequences.db_source,
component_sequences.db_version
from
(target_dictionary right join
target_components on target_dictionary.tid = target_components.tid)
left join component_sequences on target_components.component_id = component_sequences.component_id
where component_sequences.accession is not NULL'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (chembl_id,
homologue,
component_type,
accession,
db_source,
db_version) in results:
subject_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_TARGET + ':' + chembl_id
if component_type == 'PROTEIN':
object_curie_id = kg2_util.CURIE_PREFIX_UNIPROT + ':' + accession
elif component_type == 'RNA':
object_curie_id = kg2_util.CURIE_PREFIX_ENSEMBL + ':' + accession
edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
object_curie_id,
kg2_util.EDGE_LABEL_BIOLINK_PART_OF,
CHEMBL_KB_CURIE_ID,
update_date))
# get drug-to-target edges and additional information about drugs (direct_interaction, has_role, etc.)
sql = '''select distinct
molecule_dictionary.chembl_id,
drug_mechanism.mechanism_of_action,
drug_mechanism.direct_interaction,
mechanism_refs.ref_url,
action_type.action_type,
target_dictionary.chembl_id
from (((molecule_dictionary
natural join drug_mechanism)
inner join target_dictionary on drug_mechanism.tid = target_dictionary.tid)
natural join action_type
left join mechanism_refs on drug_mechanism.mec_id = mechanism_refs.mec_id)'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (molec_chembl_id,
mechanism_of_action,
direct_interaction,
ref_url,
action_type,
target_chembl_id) in results:
subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + molec_chembl_id
object_curie_id = CHEMBL_CURIE_BASE_TARGET + ':' + target_chembl_id
predicate_label = action_type.lower().replace(' ', '_')
if ref_url is not None:
publications = [ref_url]
else:
publications = None
edges_output.write(make_edge(subject_curie_id,
object_curie_id,
predicate_label,
update_date,
publications))
if direct_interaction is not None and direct_interaction == 1:
edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
object_curie_id,
kg2_util.EDGE_LABEL_BIOLINK_PHYSICALLY_INTERACTS_WITH,
CHEMBL_KB_CURIE_ID,
update_date))
if mechanism_of_action is not None:
mech_label = mechanism_of_action.lower().replace(' ', '_')
mech_curie_id = CHEMBL_CURIE_BASE_MECHANISM + ':' + mech_label
edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
mech_curie_id,
kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO,
CHEMBL_KB_CURIE_ID,
update_date))
# get molecule-to-disease indications
sql = '''select md.chembl_id, di.mesh_id from molecule_dictionary as md inner join drug_indication as di on md.molregno = di.molregno'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (chembl_id, mesh_id) in results:
subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
object_curie_id = kg2_util.CURIE_PREFIX_MESH + ':' + mesh_id
predicate_label = kg2_util.EDGE_LABEL_BIOLINK_TREATS
edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
object_curie_id,
predicate_label,
CHEMBL_KB_CURIE_ID,
update_date))
# get metabolism information
sql = '''select m1.chembl_id as drug_id,
m2.chembl_id as compound_id,
m3.chembl_id as metabolite_id
from metabolism as met
inner join compound_records as c1
on met.drug_record_id = c1.record_id
inner join molecule_dictionary as m1
on c1.molregno = m1.molregno
inner join compound_records as c2
on met.substrate_record_id = c2.record_id
inner join molecule_dictionary as m2
on c2.molregno = m2.molregno
inner join compound_records as c3
on met.metabolite_record_id = c3.record_id
inner join molecule_dictionary as m3
on c3.molregno = m3.molregno'''
if test_mode:
sql += str_sql_row_limit_test_mode
with connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
for (drug_id, compound_id, metabolite_id) in results:
subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + compound_id
object_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + metabolite_id
predicate_label = kg2_util.EDGE_LABEL_BIOLINK_HAS_METABOLITE
edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
object_curie_id,
predicate_label,
CHEMBL_KB_CURIE_ID,
update_date))
nodes_output.write(kg2_util.make_node(CHEMBL_KB_CURIE_ID,
CHEMBL_KB_URL,
'ChEMBL v' + version,
kg2_util.SOURCE_NODE_CATEGORY,
update_date,
CHEMBL_KB_CURIE_ID))
kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
print("Finish time: ", date())