From a65b301b428ac5b0235708daa11870ea3481fe4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Fri, 12 Aug 2022 00:10:13 +0300 Subject: [PATCH 01/32] Add files via upload --- pypath/inputs/chembl.py | 278 +++++++++++++++++++++++++ pypath/inputs/drugbank.py | 418 ++++++++++++++++++++++++++++++++++++++ pypath/inputs/hpo.py | 166 +++++++++++++++ 3 files changed, 862 insertions(+) create mode 100644 pypath/inputs/chembl.py create mode 100644 pypath/inputs/drugbank.py create mode 100644 pypath/inputs/hpo.py diff --git a/pypath/inputs/chembl.py b/pypath/inputs/chembl.py new file mode 100644 index 000000000..aecdc4eda --- /dev/null +++ b/pypath/inputs/chembl.py @@ -0,0 +1,278 @@ +from typing import List + +import json +import collections + +import pypath.share.curl as curl +import pypath.resources.urls as urls + +def chembl_targets() -> List[tuple] : + """ + Retrieves targets data from ChEMBL. + + Returns: + namedtuple. + """ + + fields_target = ('accession','target_chembl_id') + + Target = collections.namedtuple('Target', fields_target,defaults = ("None",) * len(fields_target)) + + trgtlst = [] + + flag = 0 + + while True: + + if flag == 0: + + url = urls.urls['chembl']['url'] + urls.urls['chembl']['target'] + c = curl.Curl(url, large=True, silent=False) + flag = 1 + + else: + + if lst['page_meta']['next']: + + url = urls.urls['chembl']['url'] + lst['page_meta']['next'] + c = curl.Curl(url, large=True, silent=False) + + else: + + break + + fileObject = open(c.fileobj.name) + lst = json.loads(fileObject.read()) + + for trgt_attr in lst['targets']: + + if trgt_attr['target_components']: + + trgtlst.append( + Target( + accession = trgt_attr['target_components'][0]['accession'], + target_chembl_id = trgt_attr['target_chembl_id'], + ) + ) + + else: + + trgtlst.append( + Target( + target_chembl_id = trgt_attr['target_chembl_id'], + ) + ) + + return trgtlst + +def chembl_assays() -> List[tuple] : + """ + 
Retrieves assays data from ChEMBL. + + Returns: + namedtuple. + """ + + fields_assay = ('assay_chembl_id','assay_organism','assay_type','confidence_score','target_chembl_id') + + Assay = collections.namedtuple('Assay', fields_assay,defaults = ("None",) * len(fields_assay)) + + assylst = [] + + flag = 0 + + while True: + + if flag == 0: + + url = urls.urls['chembl']['url'] + urls.urls['chembl']['assay'] + c = curl.Curl(url, large=True, silent=False) + flag = 1 + + else: + + if lst['page_meta']['next']: + + url = urls.urls['chembl']['url'] + lst['page_meta']['next'] + c = curl.Curl(url, large=True, silent=False) + + else: + + break + + fileObject = open(c.fileobj.name) + lst = json.loads(fileObject.read()) + + for assy_attr in lst['assays']: + + assylst.append( + Assay( + assay_chembl_id = assy_attr['assay_chembl_id'], + assay_organism = assy_attr['assay_organism'], + assay_type = assy_attr['assay_type'], + confidence_score = assy_attr['confidence_score'], + target_chembl_id = assy_attr['target_chembl_id'], + ) + ) + + return assylst + +def chembl_molecules() -> List[tuple] : + """ + Retrieves molecules data from ChEMBL. + + Returns: + namedtuple. 
+ """ + + fields_molecule = ('alogp','conanicle_smiles','chirality','full_mwt','heavy_atoms','standard_inchi_key','molecular_species', + 'molecul_type','molecule_chembl_id','parent_chembl_id','prodrug','standard_inchi', 'xrefs') + + Molecule = collections.namedtuple('Molecule', fields_molecule,defaults = ("None",) * len(fields_molecule)) + + mlcllst = [] + + flag = 0 + + while True: + + if flag == 0: + + url = urls.urls['chembl']['url'] + urls.urls['chembl']['molecule'] + c = curl.Curl(url, large=True, silent=False) + flag = 1 + + else: + + if lst['page_meta']['next']: + + url = urls.urls['chembl']['url'] + lst['page_meta']['next'] + c = curl.Curl(url, large=True, silent=False) + + else: + + break + + fileObject = open(c.fileobj.name) + lst = json.loads(fileObject.read()) + + for mlcl_attr in lst['molecules']: + + xrefs = [] + mlcllst.append( + Molecule( + chirality = mlcl_attr['chirality'], + molecul_type = mlcl_attr['molecule_type'], + prodrug = mlcl_attr['prodrug'], + ) + ) + + if mlcl_attr['molecule_hierarchy'] != None: + mlcllst[-1] = mlcllst[-1]._replace( + molecule_chembl_id = mlcl_attr['molecule_hierarchy']['molecule_chembl_id'], + parent_chembl_id = mlcl_attr['molecule_hierarchy']['parent_chembl_id'], + ) + + if mlcl_attr['molecule_properties'] != None: + mlcllst[-1] = mlcllst[-1]._replace( + alogp = mlcl_attr['molecule_properties']['alogp'], + full_mwt = mlcl_attr['molecule_properties']['full_mwt'], + heavy_atoms = mlcl_attr['molecule_properties']['heavy_atoms'], + molecular_species = mlcl_attr['molecule_properties']['molecular_species'], + ) + + if mlcl_attr['molecule_structures'] != None: + mlcllst[-1] = mlcllst[-1]._replace( + conanicle_smiles = mlcl_attr['molecule_structures']['canonical_smiles'], + standard_inchi_key = mlcl_attr['molecule_structures']['standard_inchi_key'], + standard_inchi = mlcl_attr['molecule_structures']['standard_inchi'], + ) + + if mlcl_attr['cross_references'] != None: + + for rec in mlcl_attr['cross_references']: + + 
xrefs.append({'xref_id' : rec['xref_id'], 'xref_src': rec['xref_src']}) + + mlcllst[-1] = mlcllst[-1]._replace( + xrefs = xrefs + ) + + + return mlcllst + +def chembl_activities( + pchembl_value_none: bool = False, + standard_relation: bool = '=', + ) -> List[tuple] : + """ + Retrieves activities data from ChEMBL. + + Args: + pchembl_value_none (bool): Whether the pchembl value should be none or not. + standard_relation (str): Which standard relation in needed. + + Returns: + namedtuple. + standard_flag and standard_units attributes are not included in the returned namedtuple. + Only records returned are the ones where data_validity_comment is none. + """ + + fields_activity = ('assay_chembl_id','data_validity_comment','molecule_chembl_id','pchembl_value', + 'standard_relation','standard_value','target_chembl_id') + + Activity = collections.namedtuple('Activity', fields_activity,defaults = ("None",) * len(fields_activity)) + + actvtylst = [] + + flag = 0 + + while True: + + if flag == 0: + + if pchembl_value_none == True: + + url = urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=true' + + else: + + url = urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=false' + + url = url + '&standard_relation__exact='+standard_relation + c = curl.Curl(url, large=True, silent=False) + flag = 1 + + else: + + if lst['page_meta']['next']: + + url = urls.urls['chembl']['url'] + lst['page_meta']['next'] + c = curl.Curl(url, large=True, silent=False) + + else: + + break + + fileObject = open(c.fileobj.name) + lst = json.loads(fileObject.read()) + + + for actvty_attr in lst['activities']: + + if actvty_attr['data_validity_comment'] == None: + + actvtylst.append( + Activity( + assay_chembl_id = actvty_attr['assay_chembl_id'], + data_validity_comment = actvty_attr['data_validity_comment'], + molecule_chembl_id = actvty_attr['molecule_chembl_id'], + pchembl_value = actvty_attr['pchembl_value'], + standard_relation = 
actvty_attr['standard_relation'], + standard_value = actvty_attr['standard_value'], + target_chembl_id = actvty_attr['target_chembl_id'], + ) + ) + + + return actvtylst \ No newline at end of file diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py new file mode 100644 index 000000000..9f4ed3ae2 --- /dev/null +++ b/pypath/inputs/drugbank.py @@ -0,0 +1,418 @@ +from typing import List + +import os +import csv +import collections +import base64 + +import pypath.resources.urls as urls +import pypath.share.curl as curl +import pypath.share.session as session +import pypath.share.settings as settings + +_logger = session.Logger(name = 'drugbank') +_log = _logger._log + +def add_prot_id( + user: str, + passwd: str, + pharma_active: bool = False, + ) -> List[tuple] : + """ + Retrieves protein identifiers from Drugbank. + + Args: + user (str): E-mail address for login to DrugBank. + passwd (str): Password for login to DrugBank. + pharma_active (bool): Wheter to include pharmacologically active identifiers. + + Returns: + namedtuple. 
+ """ + + credentials = {'user': user, 'passwd': passwd} + + auth_str = base64.b64encode( + ('%s:%s' % (credentials['user'], credentials['passwd'])).encode() + ).decode() + + decoded = 'Basic %s' % auth_str + + req_hdrs = ['Authorization: %s' % decoded] + req_hdrs.extend([settings.get('user_agent')]) + + fields = ('DrugBank_ID','Target_UniProt_ID','Transporter_UniProt_ID','Enzym_UniProt_ID','Carrier_UniProt_ID') + + ProteinIdentifiers = collections.namedtuple('ProteinIndetifiers', fields,defaults = ("",) * len(fields)) + + url = urls.urls['drugbank']['drug_enzym_identifiers'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False, + ) + + os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") + zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") + enzym = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) + + if pharma_active: + + active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) + + for rec in active: + + enzym.append(rec) + + result = [] + + result.append( + ProteinIdentifiers( + DrugBank_ID = "", + ) + ) + + for enzym_attr in enzym: + + DrugBank_IDs = [i for i in enzym_attr['Drug IDs'].replace(" ","").split(';')] + + for id in DrugBank_IDs: + + index = 0 + flag = 0 + + for res_attr in result: + + if id == res_attr.DrugBank_ID: + + flag = 1 + + if res_attr.Enzym_UniProt_ID == "": + + result[index] = result[index]._replace( + Enzym_UniProt_ID = enzym_attr['UniProt ID'],) + + else: + + result[index] = result[index]._replace( + Enzym_UniProt_ID = result[index].Enzym_UniProt_ID + ";" + enzym_attr['UniProt ID'],) + + break + + index += 1 + + if flag == 0: + + result.append( + ProteinIdentifiers( + DrugBank_ID = id, + Enzym_UniProt_ID = enzym_attr['UniProt ID'], + ) + ) + + del result[0] + + url = urls.urls['drugbank']['drug_carrier_identifiers'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False, + ) + + 
os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") + zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") + carrier = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) + + if pharma_active: + + active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) + + for rec in active: + + carrier.append(rec) + + for carrier_attr in carrier: + + DrugBank_IDs = [i for i in carrier_attr['Drug IDs'].replace(" ","").split(';')] + + for id in DrugBank_IDs: + + index = 0 + flag = 0 + + for res_attr in result: + + if id == res_attr.DrugBank_ID: + + flag = 1 + + if res_attr.Carrier_UniProt_ID == "": + + result[index] = result[index]._replace( + Carrier_UniProt_ID = carrier_attr['UniProt ID'],) + + else: + + result[index] = result[index]._replace( + Carrier_UniProt_ID = result[index].Carrier_UniProt_ID + ";" + carrier_attr['UniProt ID'],) + + break + + index += 1 + + if flag == 0: + + result.append( + ProteinIdentifiers( + DrugBank_ID = id, + Carrier_UniProt_ID = carrier_attr['UniProt ID'], + ) + ) + + + url = urls.urls['drugbank']['drug_transporter_identifiers'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False, + ) + + os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") + zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") + transporter = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) + + if pharma_active: + + active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) + + for rec in active: + + transporter.append(rec) + + for transporter_attr in transporter: + + DrugBank_IDs = [i for i in transporter_attr['Drug IDs'].replace(" ","").split(';')] + + for id in DrugBank_IDs: + + index = 0 + flag = 0 + + for res_attr in result: + + if id == res_attr.DrugBank_ID: + + flag = 1 + + if res_attr.Transporter_UniProt_ID == "": + + result[index] = result[index]._replace( + Transporter_UniProt_ID = transporter_attr['UniProt ID'],) 
+ + else: + + result[index] = result[index]._replace( + Transporter_UniProt_ID = result[index].Transporter_UniProt_ID + ";" + transporter_attr['UniProt ID'],) + + break + + index += 1 + + if flag == 0: + + result.append( + ProteinIdentifiers( + DrugBank_ID = id, + Transporter_UniProt_ID = transporter_attr['UniProt ID'], + ) + ) + + url = urls.urls['drugbank']['drug_target_identifiers'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False, + ) + + os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") + zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") + target = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) + + if pharma_active: + + active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) + + for rec in active: + + target.append(rec) + + for target_attr in target: + + DrugBank_IDs = [i for i in target_attr['Drug IDs'].replace(" ","").split(';')] + + for id in DrugBank_IDs: + + index = 0 + flag = 0 + + for res_attr in result: + + if id == res_attr.DrugBank_ID: + + flag = 1 + + if res_attr.Target_UniProt_ID == "": + + result[index] = result[index]._replace( + Target_UniProt_ID = target_attr['UniProt ID'],) + + else: + + result[index] = result[index]._replace( + Target_UniProt_ID = result[index].Target_UniProt_ID + ";" + target_attr['UniProt ID'],) + + break + + index += 1 + + if flag == 0: + + result.append( + ProteinIdentifiers( + DrugBank_ID = id, + Target_UniProt_ID = target_attr['UniProt ID'], + ) + ) + + return result + +def drug_bank( + user: str, + passwd: str, + addprotid: bool = True, + pharma_active: bool = False, + ) -> List[tuple] : + """ + Retrieves structures, external links and protein identifiers from Drugbank. + + Args: + user (str): E-mail address for login to DrugBank. + passwd (str): Password for login to DrugBank. + addprotid (bool): Wheter to include protein identifiers from DrugBank. 
+ pharma_active (bool): Wheter to include pharmacologically active identifiers. + + Returns: + namedtuple. + """ + + fields = ('DrugBank_ID','Name','CAS_Number','Drug_Groups','InChIKey','InChI','SMILES','Formula', + 'KEGG_Compound_ID','KEGG_Drug_ID','PubChem_Compound_ID','PubChem_Substance_ID','ChEBI_ID', + 'ChEMBL_ID','Drug_Type','PharmGKB_ID','HET_ID','Target_UniProt_ID','Transporter_UniProt_ID', + 'Enzym_UniProt_ID','Carrier_UniProt_ID') + + credentials = {'user': user, 'passwd': passwd} + + auth_str = base64.b64encode( + ('%s:%s' % (credentials['user'], credentials['passwd'])).encode() + ).decode() + + decoded = 'Basic %s' % auth_str + + req_hdrs = ['Authorization: %s' % decoded] + req_hdrs.extend([settings.get('user_agent')]) + + url = urls.urls['drugbank']['all_structures'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False + ) + + os.rename(c.fileobj.name, c.fileobj.name + ".zip") + zipfile = curl.FileOpener(c.fileobj.name + ".zip") + structure_links = list(csv.DictReader(zipfile.result["structure links.csv"], delimiter = ',')) + + url = urls.urls['drugbank']['all_drug'] + c = curl.Curl( + url, + large = True, + silent = False, + req_headers = req_hdrs, + cache = False + ) + + os.rename(c.fileobj.name, c.fileobj.name + ".zip") + zipfile = curl.FileOpener(c.fileobj.name + ".zip") + drug_links = list(csv.DictReader(zipfile.result["drug links.csv"], delimiter = ',')) + + if addprotid: + + Combine = collections.namedtuple('Combine', fields,defaults = ("",) * len(fields)) + + else: + Combine = collections.namedtuple('Combine', fields[:17],defaults = ("",) * len(fields[:17])) + + result = [] + + for struct_attr in structure_links: + + for drug_attr in drug_links: + + if struct_attr['DrugBank ID'] == drug_attr['DrugBank ID']: + + result.append( + Combine( + DrugBank_ID = struct_attr['DrugBank ID'], + Name = struct_attr['Name'], + CAS_Number = struct_attr['CAS Number'], + Drug_Groups = struct_attr['Drug Groups'], + 
InChIKey = struct_attr['InChIKey'], + InChI = struct_attr['InChI'], + SMILES = struct_attr['SMILES'], + Formula = struct_attr['Formula'], + KEGG_Compound_ID = struct_attr['KEGG Compound ID'], + KEGG_Drug_ID = struct_attr['KEGG Drug ID'], + PubChem_Compound_ID = struct_attr['PubChem Compound ID'], + PubChem_Substance_ID = struct_attr['PubChem Substance ID'], + ChEBI_ID = struct_attr['ChEBI ID'], + ChEMBL_ID = struct_attr['ChEMBL ID'], + Drug_Type = drug_attr['Drug Type'], + PharmGKB_ID = drug_attr['PharmGKB ID'], + HET_ID = drug_attr['HET ID'], + ) + ) + + if addprotid: + + identifiers_list = add_prot_id(user, passwd, pharma_active) + index = 0 + + for res_attr in result: + + for iden_attr in identifiers_list: + + if res_attr.DrugBank_ID == iden_attr.DrugBank_ID: + + result[index] = result[index]._replace( + Target_UniProt_ID = iden_attr.Target_UniProt_ID, + Transporter_UniProt_ID = iden_attr.Transporter_UniProt_ID, + Enzym_UniProt_ID = iden_attr.Enzym_UniProt_ID, + Carrier_UniProt_ID = iden_attr.Carrier_UniProt_ID, + ) + + break + + index += 1 + + + return result \ No newline at end of file diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py new file mode 100644 index 000000000..b314a63fa --- /dev/null +++ b/pypath/inputs/hpo.py @@ -0,0 +1,166 @@ +from typing import List, Dict + +import csv +import collections + +import pypath.utils.mapping as map +import pypath.share.curl as curl +import pypath.resources.urls as urls +import pypath.formats.obo as obo + +def hpo_gene_annotations() -> Dict[str, list]: + """ + Retrieves Gene-HPO relationships from HPO. + + Returns: + namedtuple. 
+ """ + + url = urls.urls['hpo']['gene'] + c = curl.Curl(url, large = True, silent = False) + + gene = list(csv.DictReader(c.result, delimiter = ',')) + + fields = ('entrez_gene_id','entrez_gene_symbol','HPO_Term_ID') + + HPOGeneAnnotations = collections.namedtuple('HPOGeneAnnotations', fields,defaults = ("",) * len(fields)) + + annotations = collections.defaultdict(list) + + for rec in gene: + + values = rec.values() + values = list(values)[0].replace('\t',',').split(',') + id = map.map_name(values[1], 'genesymbol', 'uniprot') + id = list(id) + + if id: + + annotations[id[0]].append( + HPOGeneAnnotations( + entrez_gene_id = values[0], + entrez_gene_symbol = values[1], + HPO_Term_ID = values[2], + ) + ) + + return annotations + +def hpo_disease_annotations() -> List[tuple] : + """ + Retrieves Disease-HPO relationships from HPO. + + Returns: + namedtuple. + """ + + url = urls.urls['hpo']['disease'] + c = curl.Curl(url, large = True, silent = False) + + disease = list(csv.DictReader(c.result, delimiter = '\t')) + + fields = ('DatabaseID', 'DiseaseName', 'Qualifier', 'HPO_ID', 'Reference', 'Evidence', 'Aspect') + + HPODiseaseAnnotations = collections.namedtuple('HPODiseaseAnnotations', fields,defaults = ("",) * len(fields)) + + result = [] + + for i in range(4,len(disease)): + + values = disease[i].values() + values = list(values) + + result.append( + HPODiseaseAnnotations( + DatabaseID = values[0], + DiseaseName = values[1][0], + Qualifier = values[1][1], + HPO_ID = values[1][2], + Reference = values[1][3], + Evidence = values[1][4], + Aspect = values[1][9], + ) + ) + + + return result + +def hpo_ontology() -> List[tuple] : + """ + Retrieves ontology from HPO. + + Returns: + namedtuple. 
+ """ + + url = urls.urls['hpo']['ontology'] + reader = obo.Obo(url) + hpo_ontology = [i for i in reader] + + + fields = ('hpo_id','term_name','synonyms','xrefs','is_a') + + Ontology = collections.namedtuple('Ontology', fields,defaults = ("",) * len(fields)) + + + result = [] + + for rec in hpo_ontology: + + syn_lst = [] + xref_lst = [] + isa_lst = [] + + if rec[2][1]: + + name = rec[2][0] + " " + rec[2][1] + + else: + + name = rec[2][0] + + result.append( + Ontology( + hpo_id = rec[1][0], + term_name = name, + ) + ) + + if rec[5].get('synonym'): + + synonym = list(rec[5].get('synonym')) + + for i in synonym: + + syn = i[0] + " " + i[1] + syn_lst.append(syn) + + result[-1] = result[-1]._replace( + synonyms = syn_lst + ) + + if rec[5].get('xref'): + + xref = list(rec[5].get('xref')) + + for i in xref: + + xref_lst.append(i[0]) + + result[-1] = result[-1]._replace( + xrefs = xref_lst + ) + + if rec[5].get('is_a'): + + is_a = list(rec[5].get('is_a')) + + for i in is_a: + + isa_lst.append(i[0] + " : " + i[2]) + + result[-1] = result[-1]._replace( + is_a = isa_lst + ) + + return result From 933a78e2a140cea6327a359a0f978c5797f6e56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Fri, 12 Aug 2022 00:13:37 +0300 Subject: [PATCH 02/32] Update urls.py --- pypath/resources/urls.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pypath/resources/urls.py b/pypath/resources/urls.py index e1fecfdd4..71de13390 100644 --- a/pypath/resources/urls.py +++ b/pypath/resources/urls.py @@ -1537,6 +1537,29 @@ 'interactions': 'https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/drug.target.interaction.tsv.gz', 'SMILES_InChI' : 'https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/structures.smiles.tsv', }, + 'drugbank': { + 'label': 'DrugBank database', + 'all_structures': 'https://go.drugbank.com/releases/5-1-9/downloads/all-structure-links', + 'all_drug': 
'https://go.drugbank.com/releases/5-1-9/downloads/all-drug-links', + 'drug_target_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/target-all-polypeptide-ids', + 'drug_enzym_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/enzyme-all-polypeptide-ids', + 'drug_carrier_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/carrier-all-polypeptide-ids', + 'drug_transporter_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/transporter-all-polypeptide-ids', + }, + 'chembl': { + 'label': 'ChEMBL database', + 'url': 'https://www.ebi.ac.uk', + 'target': '/chembl/api/data/target.json?limit=1000', + 'assay' : '/chembl/api/data/assay.json?limit=1000', + 'activity' : '/chembl/api/data/activity.json?limit=1000', + 'molecule' : '/chembl/api/data/molecule.json?limit=1000', + }, + 'hpo': { + 'label': 'HPO database', + 'ontology': 'https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo', + 'disease' : 'http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa', + 'gene' : 'http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt', + }, } From 0542e76cc27858c4228caca1221b37fa26d9bcc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Fri, 12 Aug 2022 00:13:38 +0300 Subject: [PATCH 03/32] Add files via upload --- pypath/data/licenses/hpo.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 pypath/data/licenses/hpo.json diff --git a/pypath/data/licenses/hpo.json b/pypath/data/licenses/hpo.json new file mode 100644 index 000000000..538c5ab81 --- /dev/null +++ b/pypath/data/licenses/hpo.json @@ -0,0 +1,7 @@ +{ + "name": "HPO", + "full_name": "HPO License", + "url": "https://hpo.jax.org/app/license", + "purpose": "academic", + "sharing": "alike" +} \ No newline at end of file From f111b607beddefb78491a8cc535f037fc600afa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= 
<99681146+tnnrklc@users.noreply.github.com> Date: Fri, 12 Aug 2022 00:17:25 +0300 Subject: [PATCH 04/32] Update annot.py --- pypath/core/annot.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pypath/core/annot.py b/pypath/core/annot.py index 2e7fb4f67..034476831 100644 --- a/pypath/core/annot.py +++ b/pypath/core/annot.py @@ -6906,3 +6906,27 @@ def get_db( ) return globals()['db'] + +class HPO(AnnotationBase): + + _eq_fields = () + + def __init__(self, **kwargs): + """ + HPO Gene Annotations from the HPO database. + """ + + kwargs.pop('ncbi_tax_id', None) + + AnnotationBase.__init__( + self, + name = 'HPO', + ncbi_tax_id = constants.NOT_ORGANISM_SPECIFIC, + input_method = 'hpo.hpo_gene_annotations', + **kwargs + ) + + def _process_method(self): + # already the appropriate format, no processing needed + self.annot = self.data + delattr(self, 'data') From 60db316403c1ca8a16ec41a96eb6e1f4063eb730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Fri, 12 Aug 2022 00:20:32 +0300 Subject: [PATCH 05/32] Update resources.json --- pypath/resources/data/resources.json | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pypath/resources/data/resources.json b/pypath/resources/data/resources.json index e13e4809d..71ff23dae 100644 --- a/pypath/resources/data/resources.json +++ b/pypath/resources/data/resources.json @@ -4616,5 +4616,39 @@ "https://academic.oup.com/nar/article/49/D1/D1160/5957163?login=false" ] } + }, + "DrugBank": { + "license": "CC BY-NC 4.0", + "urls": { + "webpages": ["https://go.drugbank.com/"], + "articles": [ + "https://pubmed.ncbi.nlm.nih.gov/29126136/", + "https://pubmed.ncbi.nlm.nih.gov/24203711/", + "https://pubmed.ncbi.nlm.nih.gov/21059682/", + "https://pubmed.ncbi.nlm.nih.gov/18048412/", + "https://pubmed.ncbi.nlm.nih.gov/16381955/" + ] + } + }, + "ChEMBL": { + "license": "CC BY-SA 3.0", + "urls": { + "webpages": 
["https://www.ebi.ac.uk/chembl/"], + "articles": [ + "http://europepmc.org/article/PMC/5210557", + "http://europepmc.org/article/PMC/4489243", + "http://europepmc.org/article/MED/24413672" + ] + } + }, + "HPO": { + "full_name": "The Human Phenotype Ontology", + "license": "HPO", + "urls": { + "webpages": ["https://hpo.jax.org/app/"], + "articles": [ + "https://pubmed.ncbi.nlm.nih.gov/33264411/" + ] + } } } From a0f8dd7503e1b255ac423ed837c092a2213c4186 Mon Sep 17 00:00:00 2001 From: deeenes Date: Fri, 12 Aug 2022 18:41:40 +0200 Subject: [PATCH 06/32] PRs from Tennur: removed trailing spaces, added headers --- pypath/inputs/chembl.py | 88 +++++++++++++++++++++++------------- pypath/inputs/drugbank.py | 67 ++++++++++++++++++--------- pypath/inputs/drugcentral.py | 65 ++++++++++++++++++-------- pypath/inputs/hpo.py | 55 ++++++++++++++++------ 4 files changed, 187 insertions(+), 88 deletions(-) diff --git a/pypath/inputs/chembl.py b/pypath/inputs/chembl.py index aecdc4eda..d93818aa7 100644 --- a/pypath/inputs/chembl.py +++ b/pypath/inputs/chembl.py @@ -1,4 +1,27 @@ -from typing import List +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# +# This file is part of the `pypath` python module +# +# Copyright +# 2014-2022 +# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University +# +# Authors: Dénes Türei (turei.denes@gmail.com) +# Nicolàs Palacio +# Sebastian Lobentanzer +# Erva Ulusoy +# Olga Ivanova +# Ahmet Rifaioglu +# Tennur Kılıç +# +# Distributed under the GPLv3 License. +# See accompanying file LICENSE.txt or copy at +# http://www.gnu.org/licenses/gpl-3.0.html +# +# Website: http://pypath.omnipathdb.org/ +# import json import collections @@ -6,11 +29,12 @@ import pypath.share.curl as curl import pypath.resources.urls as urls -def chembl_targets() -> List[tuple] : + +def chembl_targets() -> list[tuple]: """ Retrieves targets data from ChEMBL. - - Returns: + + Returns: namedtuple. 
""" @@ -40,10 +64,10 @@ def chembl_targets() -> List[tuple] : else: break - + fileObject = open(c.fileobj.name) lst = json.loads(fileObject.read()) - + for trgt_attr in lst['targets']: if trgt_attr['target_components']: @@ -56,7 +80,7 @@ def chembl_targets() -> List[tuple] : ) else: - + trgtlst.append( Target( target_chembl_id = trgt_attr['target_chembl_id'], @@ -68,8 +92,8 @@ def chembl_targets() -> List[tuple] : def chembl_assays() -> List[tuple] : """ Retrieves assays data from ChEMBL. - - Returns: + + Returns: namedtuple. """ @@ -99,12 +123,12 @@ def chembl_assays() -> List[tuple] : else: break - + fileObject = open(c.fileobj.name) lst = json.loads(fileObject.read()) - + for assy_attr in lst['assays']: - + assylst.append( Assay( assay_chembl_id = assy_attr['assay_chembl_id'], @@ -120,8 +144,8 @@ def chembl_assays() -> List[tuple] : def chembl_molecules() -> List[tuple] : """ Retrieves molecules data from ChEMBL. - - Returns: + + Returns: namedtuple. """ @@ -152,10 +176,10 @@ def chembl_molecules() -> List[tuple] : else: break - + fileObject = open(c.fileobj.name) lst = json.loads(fileObject.read()) - + for mlcl_attr in lst['molecules']: xrefs = [] @@ -166,7 +190,7 @@ def chembl_molecules() -> List[tuple] : prodrug = mlcl_attr['prodrug'], ) ) - + if mlcl_attr['molecule_hierarchy'] != None: mlcllst[-1] = mlcllst[-1]._replace( molecule_chembl_id = mlcl_attr['molecule_hierarchy']['molecule_chembl_id'], @@ -179,26 +203,26 @@ def chembl_molecules() -> List[tuple] : full_mwt = mlcl_attr['molecule_properties']['full_mwt'], heavy_atoms = mlcl_attr['molecule_properties']['heavy_atoms'], molecular_species = mlcl_attr['molecule_properties']['molecular_species'], - ) - + ) + if mlcl_attr['molecule_structures'] != None: mlcllst[-1] = mlcllst[-1]._replace( conanicle_smiles = mlcl_attr['molecule_structures']['canonical_smiles'], standard_inchi_key = mlcl_attr['molecule_structures']['standard_inchi_key'], standard_inchi = mlcl_attr['molecule_structures']['standard_inchi'], ) - 
+ if mlcl_attr['cross_references'] != None: for rec in mlcl_attr['cross_references']: - + xrefs.append({'xref_id' : rec['xref_id'], 'xref_src': rec['xref_src']}) mlcllst[-1] = mlcllst[-1]._replace( xrefs = xrefs ) - + return mlcllst def chembl_activities( @@ -212,7 +236,7 @@ def chembl_activities( pchembl_value_none (bool): Whether the pchembl value should be none or not. standard_relation (str): Which standard relation in needed. - Returns: + Returns: namedtuple. standard_flag and standard_units attributes are not included in the returned namedtuple. Only records returned are the ones where data_validity_comment is none. @@ -232,11 +256,11 @@ def chembl_activities( if flag == 0: if pchembl_value_none == True: - + url = urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=true' - + else: - + url = urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=false' url = url + '&standard_relation__exact='+standard_relation @@ -253,15 +277,15 @@ def chembl_activities( else: break - + fileObject = open(c.fileobj.name) lst = json.loads(fileObject.read()) - + for actvty_attr in lst['activities']: if actvty_attr['data_validity_comment'] == None: - + actvtylst.append( Activity( assay_chembl_id = actvty_attr['assay_chembl_id'], @@ -272,7 +296,7 @@ def chembl_activities( standard_value = actvty_attr['standard_value'], target_chembl_id = actvty_attr['target_chembl_id'], ) - ) + ) + - - return actvtylst \ No newline at end of file + return actvtylst diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 9f4ed3ae2..d55a2c146 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -1,3 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# +# This file is part of the `pypath` python module +# +# Copyright +# 2014-2022 +# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University +# +# Authors: Dénes Türei (turei.denes@gmail.com) +# Nicolàs Palacio +# Sebastian Lobentanzer +# 
Erva Ulusoy +# Olga Ivanova +# Ahmet Rifaioglu +# Tennur Kılıç +# +# Distributed under the GPLv3 License. +# See accompanying file LICENSE.txt or copy at +# http://www.gnu.org/licenses/gpl-3.0.html +# +# Website: http://pypath.omnipathdb.org/ +# + from typing import List import os @@ -14,8 +39,8 @@ _log = _logger._log def add_prot_id( - user: str, - passwd: str, + user: str, + passwd: str, pharma_active: bool = False, ) -> List[tuple] : """ @@ -52,7 +77,7 @@ def add_prot_id( silent = False, req_headers = req_hdrs, cache = False, - ) + ) os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") @@ -65,7 +90,7 @@ def add_prot_id( for rec in active: enzym.append(rec) - + result = [] result.append( @@ -121,7 +146,7 @@ def add_prot_id( silent = False, req_headers = req_hdrs, cache = False, - ) + ) os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") @@ -181,7 +206,7 @@ def add_prot_id( silent = False, req_headers = req_hdrs, cache = False, - ) + ) os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") @@ -240,7 +265,7 @@ def add_prot_id( silent = False, req_headers = req_hdrs, cache = False, - ) + ) os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") @@ -282,7 +307,7 @@ def add_prot_id( break index += 1 - + if flag == 0: result.append( @@ -295,9 +320,9 @@ def add_prot_id( return result def drug_bank( - user: str, - passwd: str, - addprotid: bool = True, + user: str, + passwd: str, + addprotid: bool = True, pharma_active: bool = False, ) -> List[tuple] : """ @@ -336,7 +361,7 @@ def drug_bank( silent = False, req_headers = req_hdrs, cache = False - ) + ) os.rename(c.fileobj.name, c.fileobj.name + ".zip") zipfile = curl.FileOpener(c.fileobj.name + ".zip") @@ -349,7 +374,7 @@ def drug_bank( silent = False, req_headers = req_hdrs, cache = False - ) + ) 
os.rename(c.fileobj.name, c.fileobj.name + ".zip") zipfile = curl.FileOpener(c.fileobj.name + ".zip") @@ -363,7 +388,7 @@ def drug_bank( Combine = collections.namedtuple('Combine', fields[:17],defaults = ("",) * len(fields[:17])) result = [] - + for struct_attr in structure_links: for drug_attr in drug_links: @@ -391,28 +416,28 @@ def drug_bank( HET_ID = drug_attr['HET ID'], ) ) - + if addprotid: - + identifiers_list = add_prot_id(user, passwd, pharma_active) index = 0 - + for res_attr in result: for iden_attr in identifiers_list: if res_attr.DrugBank_ID == iden_attr.DrugBank_ID: - + result[index] = result[index]._replace( Target_UniProt_ID = iden_attr.Target_UniProt_ID, Transporter_UniProt_ID = iden_attr.Transporter_UniProt_ID, Enzym_UniProt_ID = iden_attr.Enzym_UniProt_ID, - Carrier_UniProt_ID = iden_attr.Carrier_UniProt_ID, + Carrier_UniProt_ID = iden_attr.Carrier_UniProt_ID, ) - + break index += 1 - return result \ No newline at end of file + return result diff --git a/pypath/inputs/drugcentral.py b/pypath/inputs/drugcentral.py index cc1801e0f..edeff5436 100644 --- a/pypath/inputs/drugcentral.py +++ b/pypath/inputs/drugcentral.py @@ -1,3 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# +# This file is part of the `pypath` python module +# +# Copyright +# 2014-2022 +# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University +# +# Authors: Dénes Türei (turei.denes@gmail.com) +# Nicolàs Palacio +# Sebastian Lobentanzer +# Erva Ulusoy +# Olga Ivanova +# Ahmet Rifaioglu +# Tennur Kılıç +# +# Distributed under the GPLv3 License. 
+# See accompanying file LICENSE.txt or copy at +# http://www.gnu.org/licenses/gpl-3.0.html +# +# Website: http://pypath.omnipathdb.org/ +# + from typing import List import csv @@ -7,9 +32,9 @@ import pypath.resources.urls as urls def drug_central( - organism: str = "Homo sapiens", - SMILES: bool = False, - InChI: bool = False, + organism: str = "Homo sapiens", + SMILES: bool = False, + InChI: bool = False, CAS_RN: bool = False, ) -> List[tuple]: """ @@ -24,7 +49,7 @@ def drug_central( Returns: namedtuple. """ - + fields = ('DRUG_NAME','TARGET_NAME','TARGET_CLASS', 'TARGET_ACCESSION','GENE','ACT_VALUE','ACT_TYPE', 'ACTION_TYPE','TDL','ORGANISM','SMILES','InChI', @@ -35,7 +60,7 @@ def drug_central( interactions = list(csv.DictReader(c.result, delimiter = '\t')) temp_inter = [] - + for rec in interactions: if rec not in temp_inter: @@ -51,9 +76,9 @@ def drug_central( url = urls.urls['drugcentral']['SMILES_InChI'] c = curl.Curl(url, large = True, silent = False) structures = list(csv.DictReader(c.result, delimiter = '\t')) - + temp_struct = [] - + for rec in structures: if rec not in temp_struct: @@ -111,10 +136,10 @@ def drug_central( ACT_TYPE = inter_attr['ACT_TYPE'], ACTION_TYPE = inter_attr['ACTION_TYPE'], TDL = inter_attr['TDL'], - ORGANISM = inter_attr['ORGANISM'], + ORGANISM = inter_attr['ORGANISM'], ) ) - + for struct_attr in structures: if inter_attr['STRUCT_ID'] == struct_attr['ID']: @@ -125,7 +150,7 @@ def drug_central( SMILES = struct_attr['SMILES'], InChI = struct_attr['InChI'], InChIKey = struct_attr['InChIKey'], - CAS_RN = struct_attr['CAS_RN'], + CAS_RN = struct_attr['CAS_RN'], ) elif SMILES == True and InChI == True and CAS_RN == False: @@ -133,20 +158,20 @@ def drug_central( result[-1] = result[-1]._replace( SMILES = struct_attr['SMILES'], InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], + InChIKey = struct_attr['InChIKey'], ) elif SMILES == True and InChI == False and CAS_RN == True: result[-1] = result[-1]._replace( SMILES = 
struct_attr['SMILES'], - CAS_RN = struct_attr['CAS_RN'], + CAS_RN = struct_attr['CAS_RN'], ) elif SMILES == True and InChI == False and CAS_RN == False: result[-1] = result[-1]._replace( - SMILES = struct_attr['SMILES'], + SMILES = struct_attr['SMILES'], ) elif SMILES == False and InChI == True and CAS_RN == True: @@ -154,22 +179,22 @@ def drug_central( result[-1] = result[-1]._replace( InChI = struct_attr['InChI'], InChIKey = struct_attr['InChIKey'], - CAS_RN = struct_attr['CAS_RN'], + CAS_RN = struct_attr['CAS_RN'], ) elif SMILES == False and InChI == False and CAS_RN == True: result[-1] = result[-1]._replace( - CAS_RN = struct_attr['CAS_RN'], + CAS_RN = struct_attr['CAS_RN'], ) elif SMILES == False and InChI == True and CAS_RN == False: result[-1] = result[-1]._replace( InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], + InChIKey = struct_attr['InChIKey'], ) - + else: DrugTargetInteractions = collections.namedtuple('DrugTargetInteractions', fields[0:10]) @@ -177,7 +202,7 @@ def drug_central( for inter_attr in interactions: if organism == inter_attr['ORGANISM']: - + result.append( DrugTargetInteractions( DRUG_NAME = inter_attr['DRUG_NAME'], @@ -189,8 +214,8 @@ def drug_central( ACT_TYPE = inter_attr['ACT_TYPE'], ACTION_TYPE = inter_attr['ACTION_TYPE'], TDL = inter_attr['TDL'], - ORGANISM = inter_attr['ORGANISM'], + ORGANISM = inter_attr['ORGANISM'], ) ) - return result \ No newline at end of file + return result diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py index b314a63fa..4a7e8f413 100644 --- a/pypath/inputs/hpo.py +++ b/pypath/inputs/hpo.py @@ -1,3 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# +# This file is part of the `pypath` python module +# +# Copyright +# 2014-2022 +# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University +# +# Authors: Dénes Türei (turei.denes@gmail.com) +# Nicolàs Palacio +# Sebastian Lobentanzer +# Erva Ulusoy +# Olga Ivanova +# Ahmet Rifaioglu +# Tennur Kılıç +# +# Distributed under 
the GPLv3 License. +# See accompanying file LICENSE.txt or copy at +# http://www.gnu.org/licenses/gpl-3.0.html +# +# Website: http://pypath.omnipathdb.org/ +# + from typing import List, Dict import csv @@ -11,8 +36,8 @@ def hpo_gene_annotations() -> Dict[str, list]: """ Retrieves Gene-HPO relationships from HPO. - - Returns: + + Returns: namedtuple. """ @@ -33,9 +58,9 @@ def hpo_gene_annotations() -> Dict[str, list]: values = list(values)[0].replace('\t',',').split(',') id = map.map_name(values[1], 'genesymbol', 'uniprot') id = list(id) - + if id: - + annotations[id[0]].append( HPOGeneAnnotations( entrez_gene_id = values[0], @@ -49,8 +74,8 @@ def hpo_gene_annotations() -> Dict[str, list]: def hpo_disease_annotations() -> List[tuple] : """ Retrieves Disease-HPO relationships from HPO. - - Returns: + + Returns: namedtuple. """ @@ -64,7 +89,7 @@ def hpo_disease_annotations() -> List[tuple] : HPODiseaseAnnotations = collections.namedtuple('HPODiseaseAnnotations', fields,defaults = ("",) * len(fields)) result = [] - + for i in range(4,len(disease)): values = disease[i].values() @@ -88,15 +113,15 @@ def hpo_disease_annotations() -> List[tuple] : def hpo_ontology() -> List[tuple] : """ Retrieves ontology from HPO. - - Returns: + + Returns: namedtuple. 
""" url = urls.urls['hpo']['ontology'] reader = obo.Obo(url) hpo_ontology = [i for i in reader] - + fields = ('hpo_id','term_name','synonyms','xrefs','is_a') @@ -134,7 +159,7 @@ def hpo_ontology() -> List[tuple] : syn = i[0] + " " + i[1] syn_lst.append(syn) - + result[-1] = result[-1]._replace( synonyms = syn_lst ) @@ -146,7 +171,7 @@ def hpo_ontology() -> List[tuple] : for i in xref: xref_lst.append(i[0]) - + result[-1] = result[-1]._replace( xrefs = xref_lst ) @@ -156,11 +181,11 @@ def hpo_ontology() -> List[tuple] : is_a = list(rec[5].get('is_a')) for i in is_a: - + isa_lst.append(i[0] + " : " + i[2]) - + result[-1] = result[-1]._replace( is_a = isa_lst ) - + return result From 2eccd6d679fafb35ab0bd34d2c858257e174897d Mon Sep 17 00:00:00 2001 From: deeenes Date: Fri, 12 Aug 2022 19:51:24 +0200 Subject: [PATCH 07/32] refactored new module `inputs.chembl` --- pypath/inputs/chembl.py | 396 ++++++++++++++++++++++------------------ 1 file changed, 216 insertions(+), 180 deletions(-) diff --git a/pypath/inputs/chembl.py b/pypath/inputs/chembl.py index d93818aa7..0bfbae8f4 100644 --- a/pypath/inputs/chembl.py +++ b/pypath/inputs/chembl.py @@ -23,6 +23,8 @@ # Website: http://pypath.omnipathdb.org/ # +from typing import Literal + import json import collections @@ -35,268 +37,302 @@ def chembl_targets() -> list[tuple]: Retrieves targets data from ChEMBL. Returns: - namedtuple. + List of drug target records as named tuples. 
""" - fields_target = ('accession','target_chembl_id') - - Target = collections.namedtuple('Target', fields_target,defaults = ("None",) * len(fields_target)) + fields_target = ( + 'accession', + 'target_chembl_id', + ) - trgtlst = [] + ChemblTarget = collections.namedtuple( + 'ChemblTarget', + fields_target, + defaults = (None,) * len(fields_target), + ) - flag = 0 + tgt_lst = [] + page_dct = {} while True: - if flag == 0: - - url = urls.urls['chembl']['url'] + urls.urls['chembl']['target'] - c = curl.Curl(url, large=True, silent=False) - flag = 1 - - else: - - if lst['page_meta']['next']: - - url = urls.urls['chembl']['url'] + lst['page_meta']['next'] - c = curl.Curl(url, large=True, silent=False) + if not page_dct: - else: + url = ( + f"{urls.urls['chembl']['url']}" + f"{urls.urls['chembl']['target']}" + ) - break + elif page_dct['page_meta']['next']: - fileObject = open(c.fileobj.name) - lst = json.loads(fileObject.read()) + url = ( + f"{urls.urls['chembl']['url']}" + f"{page_dct['page_meta']['next']}" + ) - for trgt_attr in lst['targets']: + else: - if trgt_attr['target_components']: + break - trgtlst.append( - Target( - accession = trgt_attr['target_components'][0]['accession'], - target_chembl_id = trgt_attr['target_chembl_id'], - ) - ) + c = curl.Curl(url, large=True, silent=False) + fileobj = open(c.fileobj.name) + page_dct = json.loads(fileobj.read()) - else: + tgt_lst.extend( + ChemblTarget( + accession = ( + tgt['target_components'][0]['accession'] + if 'target_components' in tgt else + None + ), + target_chembl_id = tgt['target_chembl_id'], + ) + for tgt in page_dct['targets'] + ) - trgtlst.append( - Target( - target_chembl_id = trgt_attr['target_chembl_id'], - ) - ) + return tgt_lst - return trgtlst -def chembl_assays() -> List[tuple] : +def chembl_assays() -> list[tuple] : """ Retrieves assays data from ChEMBL. Returns: - namedtuple. + List of assay records as named tuples. 
""" - fields_assay = ('assay_chembl_id','assay_organism','assay_type','confidence_score','target_chembl_id') + fields_assay = ( + 'assay_chembl_id', + 'assay_organism', + 'assay_type', + 'confidence_score', + 'target_chembl_id', + ) - Assay = collections.namedtuple('Assay', fields_assay,defaults = ("None",) * len(fields_assay)) + ChemblAssay = collections.namedtuple( + 'ChemblAssay', + fields_assay, + defaults = (None,) * len(fields_assay), + ) - assylst = [] - - flag = 0 + assay_lst = [] + page_dct = {} while True: - if flag == 0: + if not page_dct: - url = urls.urls['chembl']['url'] + urls.urls['chembl']['assay'] - c = curl.Curl(url, large=True, silent=False) - flag = 1 + url = ( + f"{urls.urls['chembl']['url']}" + f"{urls.urls['chembl']['assay']}" + ) - else: + elif page_dct['page_meta']['next']: - if lst['page_meta']['next']: + url = ( + f"{urls.urls['chembl']['url']}" + f"{page_dct['page_meta']['next']}" + ) - url = urls.urls['chembl']['url'] + lst['page_meta']['next'] - c = curl.Curl(url, large=True, silent=False) - - else: + else: - break + break - fileObject = open(c.fileobj.name) - lst = json.loads(fileObject.read()) + c = curl.Curl(url, large=True, silent=False) + fileobj = open(c.fileobj.name) + page_dct = json.loads(fileobj.read()) - for assy_attr in lst['assays']: + assay_lst.extend( + ChemblAssay( + assay_chembl_id = assy_attr['assay_chembl_id'], + assay_organism = assy_attr['assay_organism'], + assay_type = assy_attr['assay_type'], + confidence_score = assy_attr['confidence_score'], + target_chembl_id = assy_attr['target_chembl_id'], + ) + for assy_attr in page_dct['assays'] + ) - assylst.append( - Assay( - assay_chembl_id = assy_attr['assay_chembl_id'], - assay_organism = assy_attr['assay_organism'], - assay_type = assy_attr['assay_type'], - confidence_score = assy_attr['confidence_score'], - target_chembl_id = assy_attr['target_chembl_id'], - ) - ) + return assay_lst - return assylst -def chembl_molecules() -> List[tuple] : +def chembl_molecules() 
-> list[tuple]: """ Retrieves molecules data from ChEMBL. Returns: - namedtuple. + Molecule records as named tuples. """ - fields_molecule = ('alogp','conanicle_smiles','chirality','full_mwt','heavy_atoms','standard_inchi_key','molecular_species', - 'molecul_type','molecule_chembl_id','parent_chembl_id','prodrug','standard_inchi', 'xrefs') + def _get(mol, key0, key1): - Molecule = collections.namedtuple('Molecule', fields_molecule,defaults = ("None",) * len(fields_molecule)) + return mol.get(f'molecule_{key0}', {}).get(key1, None) - mlcllst = [] - flag = 0 + fields_molecule = ( + 'alogp', + 'canonical_smiles', + 'chirality', + 'full_mwt', + 'heavy_atoms', + 'std_inchi_key', + 'species', + 'type', + 'chembl', + 'parent_chembl', + 'prodrug', + 'std_inchi', + 'xrefs', + ) + + ChemblMolecule = collections.namedtuple( + 'ChemblMolecule', + fields_molecule, + defaults = (None,) * len(fields_molecule), + ) + + mol_lst = [] + page_dct = {} while True: - if flag == 0: + if not page_dct: url = urls.urls['chembl']['url'] + urls.urls['chembl']['molecule'] c = curl.Curl(url, large=True, silent=False) - flag = 1 - - else: - - if lst['page_meta']['next']: - - url = urls.urls['chembl']['url'] + lst['page_meta']['next'] - c = curl.Curl(url, large=True, silent=False) - - else: - - break - fileObject = open(c.fileobj.name) - lst = json.loads(fileObject.read()) + elif page_dct['page_meta']['next']: - for mlcl_attr in lst['molecules']: + url = ( + f"{urls.urls['chembl']['url']}" + f"{lst['page_meta']['next']}" + ) - xrefs = [] - mlcllst.append( - Molecule( - chirality = mlcl_attr['chirality'], - molecul_type = mlcl_attr['molecule_type'], - prodrug = mlcl_attr['prodrug'], - ) - ) - - if mlcl_attr['molecule_hierarchy'] != None: - mlcllst[-1] = mlcllst[-1]._replace( - molecule_chembl_id = mlcl_attr['molecule_hierarchy']['molecule_chembl_id'], - parent_chembl_id = mlcl_attr['molecule_hierarchy']['parent_chembl_id'], - ) - - if mlcl_attr['molecule_properties'] != None: - mlcllst[-1] = 
mlcllst[-1]._replace( - alogp = mlcl_attr['molecule_properties']['alogp'], - full_mwt = mlcl_attr['molecule_properties']['full_mwt'], - heavy_atoms = mlcl_attr['molecule_properties']['heavy_atoms'], - molecular_species = mlcl_attr['molecule_properties']['molecular_species'], - ) - - if mlcl_attr['molecule_structures'] != None: - mlcllst[-1] = mlcllst[-1]._replace( - conanicle_smiles = mlcl_attr['molecule_structures']['canonical_smiles'], - standard_inchi_key = mlcl_attr['molecule_structures']['standard_inchi_key'], - standard_inchi = mlcl_attr['molecule_structures']['standard_inchi'], - ) - - if mlcl_attr['cross_references'] != None: - - for rec in mlcl_attr['cross_references']: - - xrefs.append({'xref_id' : rec['xref_id'], 'xref_src': rec['xref_src']}) + else: - mlcllst[-1] = mlcllst[-1]._replace( - xrefs = xrefs + break + + c = curl.Curl(url, large=True, silent=False) + fileobj = open(c.fileobj.name) + page_dct = json.loads(fileobj.read()) + + mol_lst.extend( + ChemblMolecule( + chirality = mol['chirality'], + type = mol['molecule_type'], + prodrug = mol['prodrug'], + + chembl = _get(mol, 'hierarchy', 'molecule_chembl_id'), + parent_chembl = _get(mol, 'hierarchy', 'parent_chembl_id'), + + alogp = _get(mol, 'properties', 'alogp'), + full_mwt = _get(mol, 'properties', 'full_mwt'), + heavy_atoms = _get(mol, 'properties', 'heavy_atoms'), + species = _get(mol, 'properties', 'molecular_species'), + + canonical_smiles = _get(mol, 'structures', 'canonical_smiles'), + std_inchi_key = _get(mol, 'structures', 'standard_inchi_key'), + std_inchi = _get(mol, 'structures', 'standard_inchi'), + + xrefs = ( + [ + { + 'xref_id': rec['xref_id'], + 'xref_src': rec['xref_src'], + } + for rec in mol['cross_references'] + ] + if mol['cross_references'] else + None ) + ) + for mol in page_dct['molecules'] + ) + return mol_lst - return mlcllst def chembl_activities( pchembl_value_none: bool = False, - standard_relation: bool = '=', - ) -> List[tuple] : + #TODO: are these below all the 
allowed values? + standard_relation: Literal['=', '>', '<', '>=', '<='], + ) -> list[tuple] : """ Retrieves activities data from ChEMBL. Args: - pchembl_value_none (bool): Whether the pchembl value should be none or not. - standard_relation (str): Which standard relation in needed. + pchembl_value_none: + # TODO: it is allowed to be None or must be None? + Whether the pchembl value should be none or not. + standard_relation: + Which standard relation in needed. Returns: - namedtuple. - standard_flag and standard_units attributes are not included in the returned namedtuple. - Only records returned are the ones where data_validity_comment is none. + List of activity records as named tuples. `standard_flag` and + `standard_units` attributes are not included in the returned records. + # TODO: then why the data_validity_comment is part of the records? + Only records without `data_validity_comment` are returned. """ - fields_activity = ('assay_chembl_id','data_validity_comment','molecule_chembl_id','pchembl_value', - 'standard_relation','standard_value','target_chembl_id') - - Activity = collections.namedtuple('Activity', fields_activity,defaults = ("None",) * len(fields_activity)) - - actvtylst = [] - - flag = 0 + fields_activity = ( + 'assay_chembl', + 'data_validity_comment', + 'chembl', + 'pchembl', + 'standard_relation', + 'standard_value', + 'target_chembl', + ) + + ChemblActivity = collections.namedtuple( + 'ChemblActivity', + fields_activity, + defaults = (None,) * len(fields_activity), + ) + + activity_lst = [] + page_dct = {} while True: - if flag == 0: + if not page_lst: - if pchembl_value_none == True: - url = urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=true' + url = ( + f"{urls.urls['chembl']['url']}" + f"{urls.urls['chembl']['activity']}" + f"&pchembl_value__isnull={str(pchembl_value_none).lower()}" + f"&standard_relation__exact={standard_relation}" + ) - else: + elif page_dct['page_meta']['next']: - url = 
urls.urls['chembl']['url'] + urls.urls['chembl']['activity']+'&pchembl_value__isnull=false' - - url = url + '&standard_relation__exact='+standard_relation - c = curl.Curl(url, large=True, silent=False) - flag = 1 + url = ( + f"{urls.urls['chembl']['url']}" + f"{lst['page_meta']['next']}" + ) else: - if lst['page_meta']['next']: - - url = urls.urls['chembl']['url'] + lst['page_meta']['next'] - c = curl.Curl(url, large=True, silent=False) - - else: - - break - - fileObject = open(c.fileobj.name) - lst = json.loads(fileObject.read()) - - - for actvty_attr in lst['activities']: + break - if actvty_attr['data_validity_comment'] == None: + c = curl.Curl(url, large=True, silent=False) + fileobj = open(c.fileobj.name) + page_dct = json.loads(fileobj.read()) - actvtylst.append( - Activity( - assay_chembl_id = actvty_attr['assay_chembl_id'], - data_validity_comment = actvty_attr['data_validity_comment'], - molecule_chembl_id = actvty_attr['molecule_chembl_id'], - pchembl_value = actvty_attr['pchembl_value'], - standard_relation = actvty_attr['standard_relation'], - standard_value = actvty_attr['standard_value'], - target_chembl_id = actvty_attr['target_chembl_id'], - ) - ) + activity_lst.extend( + ChemblActivity( + assay_chembl = act['assay_chembl_id'], + data_validity_comment = act['data_validity_comment'], + chembl = act['molecule_chembl_id'], + pchembl = act['pchembl_value'], + standard_relation = act['standard_relation'], + standard_value = act['standard_value'], + target_chembl = act['target_chembl_id'], + ) + for act in page_dct['activities'] + if act['data_validity_comment'] is None + ) - return actvtylst + return activity_lst From cb1b2d9411af82ba6ba2f53afb49aba5e4a43984 Mon Sep 17 00:00:00 2001 From: deeenes Date: Wed, 17 Aug 2022 16:01:47 +0200 Subject: [PATCH 08/32] drugbank & drugcentral: trivial refactoring --- pypath/inputs/drugbank.py | 50 ++++++++++++++++++++++-------------- pypath/inputs/drugcentral.py | 35 +++++++++++++------------ 2 files changed, 50 
insertions(+), 35 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index d55a2c146..852db9804 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,8 +23,6 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import List - import os import csv import collections @@ -38,37 +36,51 @@ _logger = session.Logger(name = 'drugbank') _log = _logger._log -def add_prot_id( + +def drugbank_proteins( user: str, passwd: str, pharma_active: bool = False, - ) -> List[tuple] : + ) -> list[tuple] : """ Retrieves protein identifiers from Drugbank. Args: - user (str): E-mail address for login to DrugBank. - passwd (str): Password for login to DrugBank. - pharma_active (bool): Wheter to include pharmacologically active identifiers. + user: + E-mail address with registered DrugBank account. + passwd: + Password for the DrugBank account. + pharma_active: + Wheter to include only pharmacologically active identifiers. Returns: - namedtuple. + List of protein records as named tuples. 
""" credentials = {'user': user, 'passwd': passwd} auth_str = base64.b64encode( - ('%s:%s' % (credentials['user'], credentials['passwd'])).encode() - ).decode() - - decoded = 'Basic %s' % auth_str - - req_hdrs = ['Authorization: %s' % decoded] - req_hdrs.extend([settings.get('user_agent')]) + f"{credentials['user']}:{credentials['passwd']}".encode() + ) - fields = ('DrugBank_ID','Target_UniProt_ID','Transporter_UniProt_ID','Enzym_UniProt_ID','Carrier_UniProt_ID') + req_hdrs = [ + f'Authorization: Basic {auth.decode()}', + settings.get('user_agent'), + ] + + fields = ( + 'DrugBank_ID', + 'Target_UniProt_ID', + 'Transporter_UniProt_ID', + 'Enzym_UniProt_ID', + 'Carrier_UniProt_ID', + ) - ProteinIdentifiers = collections.namedtuple('ProteinIndetifiers', fields,defaults = ("",) * len(fields)) + DrugbankProtein = collections.namedtuple( + 'DrugbankProtein', + fields, + defaults = (None,) * len(fields), + ) url = urls.urls['drugbank']['drug_enzym_identifiers'] c = curl.Curl( @@ -319,12 +331,13 @@ def add_prot_id( return result + def drug_bank( user: str, passwd: str, addprotid: bool = True, pharma_active: bool = False, - ) -> List[tuple] : + ) -> list[tuple] : """ Retrieves structures, external links and protein identifiers from Drugbank. @@ -439,5 +452,4 @@ def drug_bank( index += 1 - return result diff --git a/pypath/inputs/drugcentral.py b/pypath/inputs/drugcentral.py index edeff5436..8e2c49414 100644 --- a/pypath/inputs/drugcentral.py +++ b/pypath/inputs/drugcentral.py @@ -23,20 +23,19 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import List - import csv import collections import pypath.share.curl as curl import pypath.resources.urls as urls +import pypath.share.common as common def drug_central( organism: str = "Homo sapiens", SMILES: bool = False, InChI: bool = False, CAS_RN: bool = False, - ) -> List[tuple]: + ) -> list[tuple]: """ Retrieves drug-target interactions datasets from Drug Central. @@ -50,24 +49,28 @@ def drug_central( namedtuple. 
""" - fields = ('DRUG_NAME','TARGET_NAME','TARGET_CLASS', - 'TARGET_ACCESSION','GENE','ACT_VALUE','ACT_TYPE', - 'ACTION_TYPE','TDL','ORGANISM','SMILES','InChI', - 'InChIKey','CAS_RN',) + fields = ( + 'DRUG_NAME', + 'TARGET_NAME', + 'TARGET_CLASS', + 'TARGET_ACCESSION', + 'GENE', + 'ACT_VALUE', + 'ACT_TYPE', + 'ACTION_TYPE', + 'TDL', + 'ORGANISM', + 'SMILES', + 'InChI', + 'InChIKey', + 'CAS_RN', + ) url = urls.urls['drugcentral']['interactions'] c = curl.Curl(url, large = True, silent = False) interactions = list(csv.DictReader(c.result, delimiter = '\t')) - temp_inter = [] - - for rec in interactions: - - if rec not in temp_inter: - - temp_inter.append(rec) - - interactions = temp_inter + interactions = common.unique_list(interactions) result = [] From 56f2ee1731576bc14a585a7a810f5ded3d0b2183 Mon Sep 17 00:00:00 2001 From: deeenes Date: Fri, 19 Aug 2022 21:18:44 +0200 Subject: [PATCH 09/32] urls: long lines --- pypath/resources/urls.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/pypath/resources/urls.py b/pypath/resources/urls.py index 71de13390..a23d01148 100644 --- a/pypath/resources/urls.py +++ b/pypath/resources/urls.py @@ -1528,23 +1528,32 @@ }, 'interpro': { 'label': 'Protein families, domains and functional sites', - 'entries': 'https://ftp.ebi.ac.uk/pub/databases/interpro/interpro.xml.gz', - 'annotations': 'https://www.ebi.ac.uk/interpro/api/entry/InterPro/protein/' - '%s/taxonomy/uniprot/%u?page_size=%u', + 'entries': 'https://ftp.ebi.ac.uk/pub/databases/' + 'interpro/interpro.xml.gz', + 'annotations': 'https://www.ebi.ac.uk/interpro/api/entry/InterPro/' + 'protein/%s/taxonomy/uniprot/%u?page_size=%u', }, 'drugcentral': { 'label': 'Drug-target interactions', - 'interactions': 'https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/drug.target.interaction.tsv.gz', - 'SMILES_InChI' : 'https://unmtid-shinyapps.net/download/DrugCentral/2021_09_01/structures.smiles.tsv', + 'interactions': 
'https://unmtid-shinyapps.net/download/DrugCentral' + '/2021_09_01/drug.target.interaction.tsv.gz', + 'SMILES_InChI' : 'https://unmtid-shinyapps.net/download/DrugCentral' + '/2021_09_01/structures.smiles.tsv', }, 'drugbank': { 'label': 'DrugBank database', - 'all_structures': 'https://go.drugbank.com/releases/5-1-9/downloads/all-structure-links', - 'all_drug': 'https://go.drugbank.com/releases/5-1-9/downloads/all-drug-links', - 'drug_target_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/target-all-polypeptide-ids', - 'drug_enzym_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/enzyme-all-polypeptide-ids', - 'drug_carrier_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/carrier-all-polypeptide-ids', - 'drug_transporter_identifiers' : 'https://go.drugbank.com/releases/5-1-9/downloads/transporter-all-polypeptide-ids', + 'all_structures': 'https://go.drugbank.com/releases/5-1-9/' + 'downloads/all-structure-links', + 'all_drug': 'https://go.drugbank.com/releases/5-1-9/downloads/' + 'all-drug-links', + 'drug_target_identifiers' : 'https://go.drugbank.com/releases/' + '5-1-9/downloads/target-all-polypeptide-ids', + 'drug_enzym_identifiers' : 'https://go.drugbank.com/releases/5-1-9/' + 'downloads/enzyme-all-polypeptide-ids', + 'drug_carrier_identifiers' : 'https://go.drugbank.com/releases/5-1-9/' + 'downloads/carrier-all-polypeptide-ids', + 'drug_transporter_identifiers' : 'https://go.drugbank.com/releases/' + '5-1-9/downloads/transporter-all-polypeptide-ids', }, 'chembl': { 'label': 'ChEMBL database', @@ -1556,9 +1565,11 @@ }, 'hpo': { 'label': 'HPO database', - 'ontology': 'https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo', + 'ontology': 'https://raw.githubusercontent.com/obophenotype/' + 'human-phenotype-ontology/master/hp.obo', 'disease' : 'http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa', - 'gene' : 'http://purl.obolibrary.org/obo/hp/hpoa/genes_to_phenotype.txt', + 'gene' : 
'http://purl.obolibrary.org/obo/hp/hpoa/' + 'genes_to_phenotype.txt', }, } From 90e37df6e0692da3fcf71ae2237eebcca1680ea8 Mon Sep 17 00:00:00 2001 From: deeenes Date: Fri, 19 Aug 2022 22:35:20 +0200 Subject: [PATCH 10/32] `inputs.drugbank` - refactoring 1. --- pypath/inputs/drugbank.py | 328 +++++++------------------------------- pypath/resources/urls.py | 2 +- 2 files changed, 63 insertions(+), 267 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 852db9804..217c368dc 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,7 +23,6 @@ # Website: http://pypath.omnipathdb.org/ # -import os import csv import collections import base64 @@ -37,7 +36,27 @@ _log = _logger._log -def drugbank_proteins( +def _drugbank_download(user: str, passwd: str, *args, **kwargs): + + defaults = { + 'large': True, + 'silent': False, + 'compr': 'zip', + } + + defaults.update(kwargs) + + auth_str = base64.b64encode(f"{user}:{passwd}".encode()) + + defaults['req_headers'] = [ + f'Authorization: Basic {auth.decode()}', + settings.get('user_agent'), + ] + + return curl.Curl(*args, **defaults) + + +def drugbank_raw_interactions( user: str, passwd: str, pharma_active: bool = False, @@ -51,29 +70,18 @@ def drugbank_proteins( passwd: Password for the DrugBank account. pharma_active: - Wheter to include only pharmacologically active identifiers. + Only pharmacologically active relations. Returns: - List of protein records as named tuples. + List of drug-protein relations. 
""" - credentials = {'user': user, 'passwd': passwd} - - auth_str = base64.b64encode( - f"{credentials['user']}:{credentials['passwd']}".encode() - ) - - req_hdrs = [ - f'Authorization: Basic {auth.decode()}', - settings.get('user_agent'), - ] + csv_name = 'pharmacologically_active.csv' if pharma_active else 'all.csv' fields = ( - 'DrugBank_ID', - 'Target_UniProt_ID', - 'Transporter_UniProt_ID', - 'Enzym_UniProt_ID', - 'Carrier_UniProt_ID', + 'drugbank_id', + 'uniprot_id', + 'relation', ) DrugbankProtein = collections.namedtuple( @@ -82,257 +90,37 @@ def drugbank_proteins( defaults = (None,) * len(fields), ) - url = urls.urls['drugbank']['drug_enzym_identifiers'] - c = curl.Curl( - url, - large = True, - silent = False, - req_headers = req_hdrs, - cache = False, - ) - - os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") - zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") - enzym = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) - - if pharma_active: - - active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) - - for rec in active: - - enzym.append(rec) - result = [] - result.append( - ProteinIdentifiers( - DrugBank_ID = "", - ) - ) - - for enzym_attr in enzym: - - DrugBank_IDs = [i for i in enzym_attr['Drug IDs'].replace(" ","").split(';')] - - for id in DrugBank_IDs: - - index = 0 - flag = 0 - - for res_attr in result: - - if id == res_attr.DrugBank_ID: - - flag = 1 - - if res_attr.Enzym_UniProt_ID == "": - - result[index] = result[index]._replace( - Enzym_UniProt_ID = enzym_attr['UniProt ID'],) - - else: - - result[index] = result[index]._replace( - Enzym_UniProt_ID = result[index].Enzym_UniProt_ID + ";" + enzym_attr['UniProt ID'],) - - break - - index += 1 - - if flag == 0: - - result.append( - ProteinIdentifiers( - DrugBank_ID = id, - Enzym_UniProt_ID = enzym_attr['UniProt ID'], - ) - ) - - del result[0] - - url = urls.urls['drugbank']['drug_carrier_identifiers'] - c = curl.Curl( - url, - large = 
True, - silent = False, - req_headers = req_hdrs, - cache = False, - ) - - os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") - zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") - carrier = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) - - if pharma_active: - - active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) - - for rec in active: - - carrier.append(rec) - - for carrier_attr in carrier: - - DrugBank_IDs = [i for i in carrier_attr['Drug IDs'].replace(" ","").split(';')] - - for id in DrugBank_IDs: - - index = 0 - flag = 0 - - for res_attr in result: - - if id == res_attr.DrugBank_ID: - - flag = 1 - - if res_attr.Carrier_UniProt_ID == "": - - result[index] = result[index]._replace( - Carrier_UniProt_ID = carrier_attr['UniProt ID'],) + for rel in ('carrier', 'enzyme', 'target', 'transporter'): - else: + url = urls.urls['drugbank'][f'drug_{rel}_identifiers'] - result[index] = result[index]._replace( - Carrier_UniProt_ID = result[index].Carrier_UniProt_ID + ";" + carrier_attr['UniProt ID'],) - - break - - index += 1 - - if flag == 0: - - result.append( - ProteinIdentifiers( - DrugBank_ID = id, - Carrier_UniProt_ID = carrier_attr['UniProt ID'], - ) - ) - - - url = urls.urls['drugbank']['drug_transporter_identifiers'] - c = curl.Curl( - url, - large = True, - silent = False, - req_headers = req_hdrs, - cache = False, - ) - - os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") - zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") - transporter = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) - - if pharma_active: - - active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) - - for rec in active: - - transporter.append(rec) - - for transporter_attr in transporter: - - DrugBank_IDs = [i for i in transporter_attr['Drug IDs'].replace(" ","").split(';')] - - for id in DrugBank_IDs: - - index = 0 - flag = 0 - - for res_attr in result: - - if id 
== res_attr.DrugBank_ID: - - flag = 1 - - if res_attr.Transporter_UniProt_ID == "": - - result[index] = result[index]._replace( - Transporter_UniProt_ID = transporter_attr['UniProt ID'],) - - else: - - result[index] = result[index]._replace( - Transporter_UniProt_ID = result[index].Transporter_UniProt_ID + ";" + transporter_attr['UniProt ID'],) - - break - - index += 1 - - if flag == 0: - - result.append( - ProteinIdentifiers( - DrugBank_ID = id, - Transporter_UniProt_ID = transporter_attr['UniProt ID'], - ) - ) - - url = urls.urls['drugbank']['drug_target_identifiers'] - c = curl.Curl( - url, - large = True, - silent = False, - req_headers = req_hdrs, - cache = False, - ) - - os.rename(c.fileobj.name, c.fileobj.name + ".csv.zip") - zipfile = curl.FileOpener(c.fileobj.name + ".csv.zip") - target = list(csv.DictReader(zipfile.result["all.csv"], delimiter = ',')) - - if pharma_active: - - active = list(csv.DictReader(zipfile.result["pharmacologically_active.csv"], delimiter = ',')) - - for rec in active: - - target.append(rec) - - for target_attr in target: - - DrugBank_IDs = [i for i in target_attr['Drug IDs'].replace(" ","").split(';')] - - for id in DrugBank_IDs: - - index = 0 - flag = 0 - - for res_attr in result: - - if id == res_attr.DrugBank_ID: - - flag = 1 - - if res_attr.Target_UniProt_ID == "": - - result[index] = result[index]._replace( - Target_UniProt_ID = target_attr['UniProt ID'],) - - else: - - result[index] = result[index]._replace( - Target_UniProt_ID = result[index].Target_UniProt_ID + ";" + target_attr['UniProt ID'],) + c = _drugbank_download( + user = user, + passwd = passwd, + files_needed = (csv_name,), + ) - break + _ = next(c.result[csv_name]) - index += 1 + for l in c.result[csv_name]: - if flag == 0: + drugs, uniprot = l.strip().split(',') - result.append( - ProteinIdentifiers( - DrugBank_ID = id, - Target_UniProt_ID = target_attr['UniProt ID'], - ) - ) + result.extend( + DrugbankProtein( + drugbank_id = drug, + uniprot_id = uniprot, + 
relation = rel, + ) + for drug in drugs + ) return result -def drug_bank( +def drugbank( user: str, passwd: str, addprotid: bool = True, @@ -373,12 +161,16 @@ def drug_bank( large = True, silent = False, req_headers = req_hdrs, - cache = False + compr = 'zip', + files_needed = ('structure links.csv',), ) - os.rename(c.fileobj.name, c.fileobj.name + ".zip") - zipfile = curl.FileOpener(c.fileobj.name + ".zip") - structure_links = list(csv.DictReader(zipfile.result["structure links.csv"], delimiter = ',')) + structure_links = list( + csv.DictReader( + c.result['structure links.csv'], + delimiter = ',', + ) + ) url = urls.urls['drugbank']['all_drug'] c = curl.Curl( @@ -386,12 +178,16 @@ def drug_bank( large = True, silent = False, req_headers = req_hdrs, - cache = False + compr = 'zip', + files_needed = ('drug links.csv',), ) - os.rename(c.fileobj.name, c.fileobj.name + ".zip") - zipfile = curl.FileOpener(c.fileobj.name + ".zip") - drug_links = list(csv.DictReader(zipfile.result["drug links.csv"], delimiter = ',')) + drug_links = list( + csv.DictReader( + c.result['drug links.csv'], + delimiter = ',', + ) + ) if addprotid: diff --git a/pypath/resources/urls.py b/pypath/resources/urls.py index a23d01148..4245fbc55 100644 --- a/pypath/resources/urls.py +++ b/pypath/resources/urls.py @@ -1548,7 +1548,7 @@ 'all-drug-links', 'drug_target_identifiers' : 'https://go.drugbank.com/releases/' '5-1-9/downloads/target-all-polypeptide-ids', - 'drug_enzym_identifiers' : 'https://go.drugbank.com/releases/5-1-9/' + 'drug_enzyme_identifiers' : 'https://go.drugbank.com/releases/5-1-9/' 'downloads/enzyme-all-polypeptide-ids', 'drug_carrier_identifiers' : 'https://go.drugbank.com/releases/5-1-9/' 'downloads/carrier-all-polypeptide-ids', From 6bab27d889df86c07ce4bd89034ed26195c1c33e Mon Sep 17 00:00:00 2001 From: deeenes Date: Fri, 19 Aug 2022 23:17:35 +0200 Subject: [PATCH 11/32] `inputs.drugbank` - refactoring 2. 
--- pypath/inputs/drugbank.py | 185 +++++++++++++++----------------------- 1 file changed, 73 insertions(+), 112 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 217c368dc..e7e9150b0 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -84,8 +84,8 @@ def drugbank_raw_interactions( 'relation', ) - DrugbankProtein = collections.namedtuple( - 'DrugbankProtein', + DrugbankRawInteraction = collections.namedtuple( + 'DrugbankRawInteraction', fields, defaults = (None,) * len(fields), ) @@ -97,6 +97,7 @@ def drugbank_raw_interactions( url = urls.urls['drugbank'][f'drug_{rel}_identifiers'] c = _drugbank_download( + url = url, user = user, passwd = passwd, files_needed = (csv_name,), @@ -109,7 +110,7 @@ def drugbank_raw_interactions( drugs, uniprot = l.strip().split(',') result.extend( - DrugbankProtein( + DrugbankRawInteraction( drugbank_id = drug, uniprot_id = uniprot, relation = rel, @@ -120,132 +121,92 @@ def drugbank_raw_interactions( return result -def drugbank( - user: str, - passwd: str, - addprotid: bool = True, - pharma_active: bool = False, - ) -> list[tuple] : +def drugbank_drugs(user: str, passwd: str) -> list[tuple] : """ - Retrieves structures, external links and protein identifiers from Drugbank. + Retrieves drug identifiers from Drugbank. + + Each drug is annotated by its various database cross-references. Args: - user (str): E-mail address for login to DrugBank. - passwd (str): Password for login to DrugBank. - addprotid (bool): Wheter to include protein identifiers from DrugBank. - pharma_active (bool): Wheter to include pharmacologically active identifiers. + user: + E-mail address with registered DrugBank account. + passwd: + Password for the DrugBank account. Returns: - namedtuple. + List of named tuples, each field corresponding to various identifiers. 
""" - fields = ('DrugBank_ID','Name','CAS_Number','Drug_Groups','InChIKey','InChI','SMILES','Formula', - 'KEGG_Compound_ID','KEGG_Drug_ID','PubChem_Compound_ID','PubChem_Substance_ID','ChEBI_ID', - 'ChEMBL_ID','Drug_Type','PharmGKB_ID','HET_ID','Target_UniProt_ID','Transporter_UniProt_ID', - 'Enzym_UniProt_ID','Carrier_UniProt_ID') - - credentials = {'user': user, 'passwd': passwd} + fields = ( + 'drugbank', + 'name', + 'type', + 'groups', + 'cas', + 'inchikey', + 'inchi', + 'smiles', + 'formula', + 'kegg_compound', + 'kegg_drug', + 'pubchem_cid', + 'pubchem_sid', + 'chebi', + 'chembl', + 'pharmgkb', + 'het', + ) - auth_str = base64.b64encode( - ('%s:%s' % (credentials['user'], credentials['passwd'])).encode() - ).decode() + raw = {} - decoded = 'Basic %s' % auth_str + for table in ('drug', 'structure'): - req_hdrs = ['Authorization: %s' % decoded] - req_hdrs.extend([settings.get('user_agent')]) + csv = f'{table} links.csv' - url = urls.urls['drugbank']['all_structures'] - c = curl.Curl( - url, - large = True, - silent = False, - req_headers = req_hdrs, - compr = 'zip', - files_needed = ('structure links.csv',), - ) - - structure_links = list( - csv.DictReader( - c.result['structure links.csv'], - delimiter = ',', + c = _drugbank_download( + url = urls.urls['drugbank'][f'all_{table}s'], + user = user, + passwd = passwd, + files_needed = (csv,), ) - ) - url = urls.urls['drugbank']['all_drug'] - c = curl.Curl( - url, - large = True, - silent = False, - req_headers = req_hdrs, - compr = 'zip', - files_needed = ('drug links.csv',), - ) - - drug_links = list( - csv.DictReader( - c.result['drug links.csv'], - delimiter = ',', + raw[table] = dict( + (rec['DrugBank ID'], rec) + for rec in csv.DictReader(c.result[csv], delimiter = ',') ) - ) - - if addprotid: - Combine = collections.namedtuple('Combine', fields,defaults = ("",) * len(fields)) - - else: - Combine = collections.namedtuple('Combine', fields[:17],defaults = ("",) * len(fields[:17])) + DrugbankDrug = 
collections.namedtuple( + 'DrugbankDrug', + fields, + defaults = (None,) * len(fields), + ) result = [] - for struct_attr in structure_links: - - for drug_attr in drug_links: - - if struct_attr['DrugBank ID'] == drug_attr['DrugBank ID']: - - result.append( - Combine( - DrugBank_ID = struct_attr['DrugBank ID'], - Name = struct_attr['Name'], - CAS_Number = struct_attr['CAS Number'], - Drug_Groups = struct_attr['Drug Groups'], - InChIKey = struct_attr['InChIKey'], - InChI = struct_attr['InChI'], - SMILES = struct_attr['SMILES'], - Formula = struct_attr['Formula'], - KEGG_Compound_ID = struct_attr['KEGG Compound ID'], - KEGG_Drug_ID = struct_attr['KEGG Drug ID'], - PubChem_Compound_ID = struct_attr['PubChem Compound ID'], - PubChem_Substance_ID = struct_attr['PubChem Substance ID'], - ChEBI_ID = struct_attr['ChEBI ID'], - ChEMBL_ID = struct_attr['ChEMBL ID'], - Drug_Type = drug_attr['Drug Type'], - PharmGKB_ID = drug_attr['PharmGKB ID'], - HET_ID = drug_attr['HET ID'], - ) - ) - - if addprotid: - - identifiers_list = add_prot_id(user, passwd, pharma_active) - index = 0 - - for res_attr in result: - - for iden_attr in identifiers_list: - - if res_attr.DrugBank_ID == iden_attr.DrugBank_ID: - - result[index] = result[index]._replace( - Target_UniProt_ID = iden_attr.Target_UniProt_ID, - Transporter_UniProt_ID = iden_attr.Transporter_UniProt_ID, - Enzym_UniProt_ID = iden_attr.Enzym_UniProt_ID, - Carrier_UniProt_ID = iden_attr.Carrier_UniProt_ID, - ) - - break - - index += 1 + for dbid, struct in raw['structure'].items(): + + drug = raw['drug'].get(dbid, {}) + + result.append( + DrugbankDrug( + drugbank = dbid, + name = struct['Name'], + type = drug.get('Drug Type', None), + groups = struct['Drug Groups'], + cas = struct['CAS Number'], + inchikey = struct['InChIKey'], + inchi = struct['InChI'], + smiles = struct['SMILES'], + formula = struct['Formula'], + kegg_compound = struct['KEGG Compound ID'], + kegg_drug = struct['KEGG Drug ID'], + pubchem_cid = struct['PubChem 
Compound ID'], + pubchem_sid = struct['PubChem Substance ID'], + chebi = struct['ChEBI ID'], + chembl = struct['ChEMBL ID'], + pharmgkb = drug.get('PharmGKB ID', None) + het = drug.get('HET ID', None), + ) + ) return result From f0de019d0bde1357a4e8b2576015d7a8af4e6537 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 00:56:01 +0200 Subject: [PATCH 12/32] `drugbank_interactions` --- pypath/inputs/drugbank.py | 67 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index e7e9150b0..c24b5dd34 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,6 +23,7 @@ # Website: http://pypath.omnipathdb.org/ # +import re import csv import collections import base64 @@ -32,7 +33,7 @@ import pypath.share.session as session import pypath.share.settings as settings -_logger = session.Logger(name = 'drugbank') +_logger = session.Logger(name = 'drugbank_input') _log = _logger._log @@ -121,6 +122,70 @@ def drugbank_raw_interactions( return result +def drugbank_interactions( + user: str, + passwd: str, + pharma_active: bool = False, + ) -> list[tuple] : + """ + Drug-protein and protein-drug interactions from Drugbank. + + Args: + user: + E-mail address with registered DrugBank account. + passwd: + Password for the DrugBank account. + pharma_active: + Only pharmacologically active interactions. + + Returns: + List of drug-protein and protein-drug interactions. 
+ """ + + raw = drugbank_raw_interactions( + user = user, + passwd = passwd, + harma_active = pharma_active, + ) + + drugs = dict( + (d.drugbank, d) + for d in drugbank_drugs(user = user, passwd = passwd) + ) + + DrugbankInteraction = collections.namedtuple( + 'DrugbankInteraction', + ( + 'source', + 'target', + 'source_entity_type', + 'target_entity_type', + 'interaction_type', + ) + ) + + result = [] + + for r in raw: + + drug = drugs.get(r.drugbank_id, None) + + # TODO: later engage the mapping module here + if drug and drug.pubchem_cid: + + src_tgt = reversed if r.relation == 'target' else lambda x: x + + result.append( + DrugbankInteraction( + *src_tgt(r.uniprot_id, drug.pubchem_cid), + *src_tgt('protein', 'drug'), + interaction_type = r.relation, + ) + ) + + return result + + def drugbank_drugs(user: str, passwd: str) -> list[tuple] : """ Retrieves drug identifiers from Drugbank. From 05b98e13a04d470297439998c3710896af7cc179 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 00:56:21 +0200 Subject: [PATCH 13/32] `drugbank_annotations` --- pypath/inputs/drugbank.py | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index c24b5dd34..6ba9df7fd 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -275,3 +275,47 @@ def drugbank_drugs(user: str, passwd: str) -> list[tuple] : ) return result + + +def drugbank_annotations(user: str, passwd: str): + """ + Drug annotations from Drugbank. + + The annotations are restricted to the drug molecule type and drug status. + + Args: + user: + E-mail address with registered DrugBank account. + passwd: + Password for the DrugBank account. + pharma_active: + Only pharmacologically active interactions. + + Returns: + List of drug annotations. 
+ """ + + drugs = drugbank_drugs(user = user, passwd = passwd) + + DrugbankAnnotation = collections.namedtuple( + 'DrugbankAnnotation', + ( + 'type', + 'status', + ) + ) + + result = collections.defaultdict(set) + + for d in drugs: + + if d.pubchem_cid: + + result[d.pubchem_cid].add( + DrugbankAnnotation( + type = d.type, + status = re.sub(',\s*', ';', d.groups), + ) + ) + + return dict(result) From e58b2ecc67689fb4c5535d3a9242e79bd8f9a1c3 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 01:55:53 +0200 Subject: [PATCH 14/32] new module `input.credentials`: obtain credentials from arguments, settings or file --- pypath/inputs/credentials.py | 120 +++++++++++++++++++++++++++++++++++ pypath/inputs/drugbank.py | 23 ++++++- 2 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 pypath/inputs/credentials.py diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py new file mode 100644 index 000000000..16388343a --- /dev/null +++ b/pypath/inputs/credentials.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# +# This file is part of the `pypath` python module +# +# Copyright +# 2014-2022 +# EMBL, EMBL-EBI, Uniklinik RWTH Aachen, Heidelberg University +# +# Authors: Dénes Türei (turei.denes@gmail.com) +# Nicolàs Palacio +# Sebastian Lobentanzer +# Erva Ulusoy +# Olga Ivanova +# Ahmet Rifaioglu +# +# Distributed under the GPLv3 License. +# See accompanying file LICENSE.txt or copy at +# http://www.gnu.org/licenses/gpl-3.0.html +# +# Website: http://pypath.omnipathdb.org/ +# + +from typing import Optional + +import os + +import pypath.share.settings as settings +import pypath.share.settings as session + +_logger = session.Logger(name = 'credentials') +_log = _logger._log + + +def credentials( + *args: tuple[str, str], + resource: Optional[str] = None, + from_file: Optional[str] = None, + **kwargs: dict[str, str], + ) -> dict: + """ + Credentials required for restricted access resources. 
+ + Args: + args: + Two strings: a user name and password. If only one provided, it + is assumed to be a user name; if more provided, apart from the + first two, the rest will be ignored. + resource: + Name of the resource. If the key `{resource}_credentials` + exists in the module settings, its value will be returned as + credentials. + from_file: + Path to a file or name of a file that is located in the module's + default secrets directory. + kwargs: + Custom key-value pairs, will be returned unchanged. This is the + way to explicitly provide user and password, and any further + fields. + + Returns: + A dictionary with the credentials. Raises RuntimeError if credentials + not provided by any of the available ways. + """ + + fields = ('user', 'passwd') + kwargs.update(dict(zip(fields, args))) + + if all(f in kwargs for f in fields): + + credentials = kwargs + + else: + + credentials = settings.get(f'{resource.lower()}_credentials') + + if not credentials: + + secrets_fname = from_file + + if not os.path.exists(secrets_fname): + + secrets_fname = os.path.join( + settings.get('secrets_dir'), + secrets_fname, + ) + + if os.path.exists(secrets_fname): + + _log( + f'Reading credentials for `{resource}` ' + f'from file `{secrets_fname}`.' 
+ ) + + with open(secrets_fname, 'r') as fp: + + lines = fp.read().strip().split(os.linesep) + + keys, values = tuple(zip(*( + ([None] + l.split(':', maxsplit = 1))[-2:] + for l in lines + ))) + + keys = keys if all(keys) else fields + credentials = dict(zip(keys, values)) + credentials.update(kwargs) + + else: + + _log(f'`{resource}` credentials provided by `settings`.') + + if not credentials: + + msg = f'Failed to obtain credentials for resource `{resource}`' + _log(msg) + + raise RuntimeError(msg) + + return credentials diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 6ba9df7fd..d6ba5d9a6 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,6 +23,8 @@ # Website: http://pypath.omnipathdb.org/ # +from typing import Optional + import re import csv import collections @@ -37,7 +39,16 @@ _log = _logger._log -def _drugbank_download(user: str, passwd: str, *args, **kwargs): +def _drugbank_credentials( + user: Optional[str] = None, + passwd: Optional[str] = None, + ) -> tuple[str, str]: + """ + + """ + + +def _drugbank_download(user: str, passwd: str, *args, **kwargs) -> curl.Curl: defaults = { 'large': True, @@ -186,7 +197,7 @@ def drugbank_interactions( return result -def drugbank_drugs(user: str, passwd: str) -> list[tuple] : +def drugbank_drugs(user: str, passwd: str) -> list[tuple]: """ Retrieves drug identifiers from Drugbank. @@ -277,7 +288,7 @@ def drugbank_drugs(user: str, passwd: str) -> list[tuple] : return result -def drugbank_annotations(user: str, passwd: str): +def drugbank_annotations(user: str, passwd: str) -> dict[str, set[tuple]]: """ Drug annotations from Drugbank. 
@@ -319,3 +330,9 @@ def drugbank_annotations(user: str, passwd: str): ) return dict(result) + + +def drugbank_mapping(user: str, passwd: str, ) -> dict[str, set[str]]: + + + From de70e70f1d1d41dcdc2b85f6d69305534630a181 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 02:01:46 +0200 Subject: [PATCH 15/32] `inputs.cosmic` uses `inputs.credentials` --- pypath/inputs/cosmic.py | 64 ++++++++++++----------------------------- 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/pypath/inputs/cosmic.py b/pypath/inputs/cosmic.py index d654463e7..461579a97 100644 --- a/pypath/inputs/cosmic.py +++ b/pypath/inputs/cosmic.py @@ -33,6 +33,7 @@ import pypath.share.session as session_mod import pypath.share.settings as settings import pypath.utils.mapping as mapping +import pypath.inputs.credentials as credentials _logger = session_mod.Logger(name = 'cosmic_input') _log = _logger._log @@ -49,54 +50,27 @@ def cancer_gene_census_annotations( Returns dict of annotations. """ - if not user or not passwd: + try: - credentials = settings.get('cosmic_credentials') - - if not credentials: - - if not os.path.exists(credentials_fname): - - credentials_fname = os.path.join( - settings.get('secrets_dir'), - credentials_fname, - ) - - if os.path.exists(credentials_fname): - - _log( - 'Reading COSMIC credentials ' - 'from file `%s`.' % credentials_fname - ) - - with open(credentials_fname, 'r') as fp: - - credentials = dict( - zip( - ('user', 'passwd'), - fp.read().split('\n')[:2], - ) - ) - - else: - _log('COSMIC credentials provided by `settings`.') - - if not credentials or {'user', 'passwd'} - set(credentials.keys()): - - _log( - 'No credentials available for the COSMIC website. ' - 'Either set the `cosmic_credentials` key in the `settings` ' - 'module (e.g. `{\'user\': \'myuser\', ' - '\'passwd\': \'mypassword\'}`), or pass them directly to the ' - '`pypath.inputs.cosmic.cancer_gene_census_annotations` ' - 'method.' 
- ) + cosmic_cred = credentials.credentials( + user = user, + passwd = passwd, + resource = 'COSMIC', + from_file = credentials_fname, + ) - return {} + except RuntimeError: - else: + _log( + 'No credentials available for the COSMIC website. ' + 'Either set the `cosmic_credentials` key in the `settings` ' + 'module (e.g. `{\'user\': \'myuser\', ' + '\'passwd\': \'mypassword\'}`), or pass them directly to the ' + '`pypath.inputs.cosmic.cancer_gene_census_annotations` ' + 'method.' + ) - credentials = {'user': user, 'passwd': passwd} + return {} CancerGeneCensusAnnotation = collections.namedtuple( 'CancerGeneCensusAnnotation', @@ -128,7 +102,7 @@ def multi_field(content): url = urls.urls['cgc']['url_new'] auth_str = base64.b64encode( - ('%s:%s\n' % (credentials['user'], credentials['passwd'])).encode() + ('%s:%s\n' % (cosmic_cred['user'], cosmic_cred['passwd'])).encode() ) req_hdrs = ['Authorization: Basic %s' % auth_str.decode()] From 13b0503bd04e1245efe9e9ca028dd0560f55b5c9 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 02:04:44 +0200 Subject: [PATCH 16/32] `credentials` removes `None` values from arguments --- pypath/inputs/credentials.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py index 16388343a..c6451cfcf 100644 --- a/pypath/inputs/credentials.py +++ b/pypath/inputs/credentials.py @@ -66,6 +66,7 @@ def credentials( fields = ('user', 'passwd') kwargs.update(dict(zip(fields, args))) + kwargs = dict(it for kwargs.items() if it[1] is not None) if all(f in kwargs for f in fields): From f0b28572b53dbbe07c1cad808a6548ed76c349f6 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 02:43:19 +0200 Subject: [PATCH 17/32] `credentials`: settings key is used also as default file name --- pypath/inputs/credentials.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py index c6451cfcf..e8f8a90ff 100644 --- 
a/pypath/inputs/credentials.py +++ b/pypath/inputs/credentials.py @@ -74,11 +74,12 @@ def credentials( else: - credentials = settings.get(f'{resource.lower()}_credentials') + settings_key = f'{resource.lower()}_credentials' + credentials = settings.get(settings_key) if not credentials: - secrets_fname = from_file + secrets_fname = from_file or settings_key if not os.path.exists(secrets_fname): From 68b3f218b03c1b5624c13783365d323e8b11aeee Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 02:43:55 +0200 Subject: [PATCH 18/32] `inputs.drugbank` uses `credentials` + `drugbank_mapping` --- pypath/inputs/drugbank.py | 133 ++++++++++++++++++++++++++++++++++---- 1 file changed, 119 insertions(+), 14 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index d6ba5d9a6..fe5afeab4 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -34,6 +34,7 @@ import pypath.share.curl as curl import pypath.share.session as session import pypath.share.settings as settings +import pypath.inputs.credentials as credentials _logger = session.Logger(name = 'drugbank_input') _log = _logger._log @@ -42,13 +43,38 @@ def _drugbank_credentials( user: Optional[str] = None, passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, ) -> tuple[str, str]: - """ - """ + return credentials.credentials( + user = user, + passwd = passwd, + resource = 'DrugBank', + from_file = credentials_fname, + ) + + +def _drugbank_download( + *args, + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, + **kwargs + ) -> Optional[curl.Curl]: + + try: + + cred = _drugbank_credentials( + user = user, + passwd = passwd, + credentials_fname = credentials_fname, + ) + + except RuntimeError: + _log('No credentials available for the DrugBank website.') -def _drugbank_download(user: str, passwd: str, *args, **kwargs) -> curl.Curl: + return None defaults = { 'large': True, @@ -58,7 +84,7 @@ def 
_drugbank_download(user: str, passwd: str, *args, **kwargs) -> curl.Curl: defaults.update(kwargs) - auth_str = base64.b64encode(f"{user}:{passwd}".encode()) + auth_str = base64.b64encode(f"{cred['user']}:{cred['passwd']}".encode()) defaults['req_headers'] = [ f'Authorization: Basic {auth.decode()}', @@ -69,8 +95,9 @@ def _drugbank_download(user: str, passwd: str, *args, **kwargs) -> curl.Curl: def drugbank_raw_interactions( - user: str, - passwd: str, + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, pharma_active: bool = False, ) -> list[tuple] : """ @@ -112,9 +139,12 @@ def drugbank_raw_interactions( url = url, user = user, passwd = passwd, + credentials_fname = credentials_fname, files_needed = (csv_name,), ) + if not c: continue + _ = next(c.result[csv_name]) for l in c.result[csv_name]: @@ -134,8 +164,9 @@ def drugbank_raw_interactions( def drugbank_interactions( - user: str, - passwd: str, + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, pharma_active: bool = False, ) -> list[tuple] : """ @@ -157,6 +188,7 @@ def drugbank_interactions( user = user, passwd = passwd, harma_active = pharma_active, + credentials_fname = credentials_fname, ) drugs = dict( @@ -197,7 +229,11 @@ def drugbank_interactions( return result -def drugbank_drugs(user: str, passwd: str) -> list[tuple]: +def drugbank_drugs( + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, + ) -> list[tuple]: """ Retrieves drug identifiers from Drugbank. 
@@ -243,9 +279,12 @@ def drugbank_drugs(user: str, passwd: str) -> list[tuple]: url = urls.urls['drugbank'][f'all_{table}s'], user = user, passwd = passwd, + credentials_fname = credentials_fname, files_needed = (csv,), ) + if not c: continue + raw[table] = dict( (rec['DrugBank ID'], rec) for rec in csv.DictReader(c.result[csv], delimiter = ',') @@ -288,7 +327,11 @@ def drugbank_drugs(user: str, passwd: str) -> list[tuple]: return result -def drugbank_annotations(user: str, passwd: str) -> dict[str, set[tuple]]: +def drugbank_annotations( + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, + ) -> dict[str, set[tuple]]: """ Drug annotations from Drugbank. @@ -299,14 +342,16 @@ def drugbank_annotations(user: str, passwd: str) -> dict[str, set[tuple]]: E-mail address with registered DrugBank account. passwd: Password for the DrugBank account. - pharma_active: - Only pharmacologically active interactions. Returns: List of drug annotations. """ - drugs = drugbank_drugs(user = user, passwd = passwd) + drugs = drugbank_drugs( + user = user, + passwd = passwd + credentials_fname = credentials_fname, + ) DrugbankAnnotation = collections.namedtuple( 'DrugbankAnnotation', @@ -332,7 +377,67 @@ def drugbank_annotations(user: str, passwd: str) -> dict[str, set[tuple]]: return dict(result) -def drugbank_mapping(user: str, passwd: str, ) -> dict[str, set[str]]: +def drugbank_mapping( + id_type: str, + target_id_type: str, + user: Optional[str] = None, + passwd: Optional[str] = None, + credentials_fname: Optional[str] = None, + ) -> dict[str, set[str]]: + """ + Identifier translation table from DrugBank. + + Available ID types: drugbank, name, type, groups, cas, inchikey, + inchi, smiles, formula, kegg_compound, kegg_drug, pubchem_cid, + pubchem_sid, chebi, chembl, pharmgkb, het. + + Args: + id_type: + The identifier type to be used as keys. + target_id_type: + The identifier type that will be collected into the values. 
+ user: + E-mail address with registered DrugBank account. + passwd: + Password for the DrugBank account. + credentials_fname: + File name or path to a file with DrugBank login credentials. + + Returns: + An identifier translation table. + """ + + synonyms = { + 'pubchem_compound': 'pubchem_cid', + 'pubchem_substance': 'pubchem_sid', + } + + def id_type_proc(_id_type): + _id_type = re.sub('[^cs]id$', '', _id_type.lower()).replace(' ', '_') + return synonyms.get(_id_type, _id_type) + + + drugs = drugbank_drugs( + user = user, + passwd = passwd + credentials_fname = credentials_fname, + ) + + result = collections.defaultdict(set) + + id_type = id_type_proc(id_type) + target_id_type = id_type_proc(id_type) + + for d in drugs: + + the_id = getattr(d, id_type) + target_id = getattr(d, target_id_type) + + if the_id and target_id: + + result[the_id].add(target_id) + + return dict(result) From e1429174553f7723f6910f2840d3b46449e2e960 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 19:09:50 +0200 Subject: [PATCH 19/32] `drugcentral_drugs`: retrieves drug data --- pypath/inputs/drugcentral.py | 99 +++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 30 deletions(-) diff --git a/pypath/inputs/drugcentral.py b/pypath/inputs/drugcentral.py index 8e2c49414..158d5829c 100644 --- a/pypath/inputs/drugcentral.py +++ b/pypath/inputs/drugcentral.py @@ -27,58 +27,97 @@ import collections import pypath.share.curl as curl +import pypath.share.session as session import pypath.resources.urls as urls -import pypath.share.common as common +import pypath.utils.taxonomy as taxonomy + +_logger = session.Logger(name = 'drugcentral_input') +_log = _logger._log + + + +def drugcentral_drugs() -> list[tuple]: + """ + Drug names and structures from Drug Central. + + Returns: + List of drugs, each represented by a named tuple. 
+ """ + + DrugcentralDrug = collections.namedtuple( + 'DrugcentralDrug', + ( + 'drugcentral', + 'inn', + 'cas', + 'smiles', + 'inchikey', + 'inchi', + ) + ) + + url = urls.urls['drugcentral']['SMILES_InChI'] + c = curl.Curl(url, large = True, silent = False) + drugs = list(csv.DictReader(c.result, delimiter = '\t')) + + result = [ + DrugcentralDrug( + drugcentral = drug['ID'], + inn = drug['INN'], + cas = drug['CAS_RN'], + smiles = drug['SMILES'], + inchikey = drug['InChIKey'], + inchi = drug['InChI'], + ) + for drug in drugs + ] + + return result + def drug_central( - organism: str = "Homo sapiens", - SMILES: bool = False, - InChI: bool = False, - CAS_RN: bool = False, + organism: Union[str, int] = 'Homo sapiens', ) -> list[tuple]: """ - Retrieves drug-target interactions datasets from Drug Central. + Retrieves drug-target interactions from Drug Central. Args: - organism (str): Which organism to use for processing. - SMILES (bool): Whether to include SMILES structures from Drug Central. - InChI (bool): Whether to include InChI formats and InChI keys from Drug Central. - CAS_RN (bool): Whether to include CAS Registry Number from Drug Central. + organism: + Organism name or NCBI Taxonomy ID. Returns: - namedtuple. + List of drug-target relationships, represented as named tuples. 
""" fields = ( - 'DRUG_NAME', - 'TARGET_NAME', - 'TARGET_CLASS', - 'TARGET_ACCESSION', - 'GENE', - 'ACT_VALUE', - 'ACT_TYPE', - 'ACTION_TYPE', - 'TDL', - 'ORGANISM', - 'SMILES', - 'InChI', - 'InChIKey', - 'CAS_RN', + 'drug', + 'target', + 'target_class', + 'target_accession', + 'gene', + 'act_value', + 'act_type', + 'action_type', + 'tdl', + 'organism', ) url = urls.urls['drugcentral']['interactions'] c = curl.Curl(url, large = True, silent = False) interactions = list(csv.DictReader(c.result, delimiter = '\t')) - interactions = common.unique_list(interactions) + organism_latin = taxonomy.ensure_latin_name(organism) + + if not organism_latin: + + msg = f'Could not find latin name for organism: `{organism}`.' + _log(msg) + + raise ValueError(msg) result = [] - if SMILES == True or InChI == True or CAS_RN == True: - url = urls.urls['drugcentral']['SMILES_InChI'] - c = curl.Curl(url, large = True, silent = False) - structures = list(csv.DictReader(c.result, delimiter = '\t')) temp_struct = [] From 2ea48de1ea5cc34416e19477e656374d6d8717af Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 19:51:32 +0200 Subject: [PATCH 20/32] refactored `inputs.drugcentral` - drug data, interactions, id translation --- pypath/inputs/drugcentral.py | 224 ++++++++++++----------------------- 1 file changed, 74 insertions(+), 150 deletions(-) diff --git a/pypath/inputs/drugcentral.py b/pypath/inputs/drugcentral.py index 158d5829c..7e6a43fd3 100644 --- a/pypath/inputs/drugcentral.py +++ b/pypath/inputs/drugcentral.py @@ -23,11 +23,14 @@ # Website: http://pypath.omnipathdb.org/ # +from typing import Optional, Union + import csv import collections import pypath.share.curl as curl import pypath.share.session as session +import pypath.share.common as common import pypath.resources.urls as urls import pypath.utils.taxonomy as taxonomy @@ -35,7 +38,6 @@ _log = _logger._log - def drugcentral_drugs() -> list[tuple]: """ Drug names and structures from Drug Central. 
@@ -75,31 +77,40 @@ def drugcentral_drugs() -> list[tuple]: return result -def drug_central( - organism: Union[str, int] = 'Homo sapiens', +def drugcentral_interactions( + organism: Optional[Union[str, int]] = None, + comments: bool = False, ) -> list[tuple]: """ Retrieves drug-target interactions from Drug Central. Args: organism: - Organism name or NCBI Taxonomy ID. + Organism name or NCBI Taxonomy ID. If not provided, + all organisms will be retained. + comments: + Include comments in the result. Returns: List of drug-target relationships, represented as named tuples. """ - fields = ( - 'drug', - 'target', - 'target_class', - 'target_accession', - 'gene', - 'act_value', - 'act_type', - 'action_type', - 'tdl', - 'organism', + DrugcentralInteraction = collections.namedtuple( + 'DrugcentralInteraction', + ( + 'drug', + 'drug_name', + 'uniprot', + 'target_type', + 'canonical', + 'act_value', + 'act_type', + 'relation', + 'effect', + 'tdl', + 'organism', + 'comment', + ), ) url = urls.urls['drugcentral']['interactions'] @@ -108,156 +119,69 @@ def drug_central( organism_latin = taxonomy.ensure_latin_name(organism) - if not organism_latin: + if organism and not organism_latin: msg = f'Could not find latin name for organism: `{organism}`.' 
_log(msg) - raise ValueError(msg) - - result = [] - - - - temp_struct = [] - - for rec in structures: - - if rec not in temp_struct: - - temp_struct.append(rec) - - structures = temp_struct - - if SMILES == True and InChI == True and CAS_RN == True: - - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields[0:], defaults = (None,) * len(fields)) - - elif SMILES == True and InChI == True and CAS_RN == False: - - fields = fields[0:13] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - elif SMILES == True and InChI == False and CAS_RN == True: - - fields = fields[0:11] + fields[13:] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - elif SMILES == True and InChI == False and CAS_RN == False: - - fields = fields[0:11] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - elif SMILES == False and InChI == True and CAS_RN == True: - - fields = fields[0:10] + fields[11:] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - elif SMILES == False and InChI == False and CAS_RN == True: - - fields = fields[13:] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - elif SMILES == False and InChI == True and CAS_RN == False: - - fields = fields[0:10] + fields[11:13] - InteractionsandStructures = collections.namedtuple('InteractionsandStructures', fields, defaults = (None,) * len(fields)) - - for inter_attr in interactions: - - if organism == inter_attr['ORGANISM']: - - result.append( - InteractionsandStructures( - DRUG_NAME = inter_attr['DRUG_NAME'], - TARGET_NAME = inter_attr['TARGET_NAME'], - TARGET_CLASS = inter_attr['TARGET_CLASS'], - TARGET_ACCESSION = 
inter_attr['ACCESSION'], - GENE = inter_attr['GENE'], - ACT_VALUE = inter_attr['ACT_VALUE'], - ACT_TYPE = inter_attr['ACT_TYPE'], - ACTION_TYPE = inter_attr['ACTION_TYPE'], - TDL = inter_attr['TDL'], - ORGANISM = inter_attr['ORGANISM'], - ) - ) - - for struct_attr in structures: - - if inter_attr['STRUCT_ID'] == struct_attr['ID']: - - if SMILES == True and InChI == True and CAS_RN == True: - - result[-1] = result[-1]._replace( - SMILES = struct_attr['SMILES'], - InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], - CAS_RN = struct_attr['CAS_RN'], - ) - - elif SMILES == True and InChI == True and CAS_RN == False: - - result[-1] = result[-1]._replace( - SMILES = struct_attr['SMILES'], - InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], - ) - - elif SMILES == True and InChI == False and CAS_RN == True: - - result[-1] = result[-1]._replace( - SMILES = struct_attr['SMILES'], - CAS_RN = struct_attr['CAS_RN'], - ) + drugs = dict( + (d.drugcentral, d) + for d in drugcentral_drugs() + ) - elif SMILES == True and InChI == False and CAS_RN == False: + result = [ + DrugcentralInteraction( + drug = drugs.get(i['STRUCT_ID'], None), + drug_name = i['DRUG_NAME'], + uniprot = uniprot, + target_type = i['TARGET_CLASS'], + canonical = i['MOA'] == '1', + act_value = common.try_float(i['ACT_VALUE']) or None, + act_type = i['ACT_TYPE'], + relation = i['RELATION'] or None, # what is relation?? 
+ effect = i['ACTION_TYPE'] or None, + tdl = i['TDL'], + organism = i['ORGANISM'], + comment = i['ACT_COMMENT'] if comments else None, + ) + for i in interactions + for uniprot in i['ACCESSION'].split('|') + if not organism_latin or i['ORGANISM'] == organism_latin + ] - result[-1] = result[-1]._replace( - SMILES = struct_attr['SMILES'], - ) + return result - elif SMILES == False and InChI == True and CAS_RN == True: - result[-1] = result[-1]._replace( - InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], - CAS_RN = struct_attr['CAS_RN'], - ) +def drugcentral_mapping( + id_type: str, + target_id_type: str, + ) -> dict[str, set[str]]: + """ + Identifier translation table from Drug Central. - elif SMILES == False and InChI == False and CAS_RN == True: + Available ID types: drugcentral, inn, cas, smiles, inchikey, inchi. - result[-1] = result[-1]._replace( - CAS_RN = struct_attr['CAS_RN'], - ) + Args: + id_type: + The identifier type to be used as keys. + target_id_type: + The identifier type that will be collected into the values. - elif SMILES == False and InChI == True and CAS_RN == False: + Returns: + An identifier translation table. 
+ """ - result[-1] = result[-1]._replace( - InChI = struct_attr['InChI'], - InChIKey = struct_attr['InChIKey'], - ) + drugs = drugcentral_drugs() - else: + result = collections.defaultdict(set) - DrugTargetInteractions = collections.namedtuple('DrugTargetInteractions', fields[0:10]) + for d in drugs: - for inter_attr in interactions: + the_id = getattr(d, id_type) + target_id = getattr(d, target_id_type) - if organism == inter_attr['ORGANISM']: + if the_id and target_id: - result.append( - DrugTargetInteractions( - DRUG_NAME = inter_attr['DRUG_NAME'], - TARGET_NAME = inter_attr['TARGET_NAME'], - TARGET_CLASS = inter_attr['TARGET_CLASS'], - TARGET_ACCESSION = inter_attr['ACCESSION'], - GENE = inter_attr['GENE'], - ACT_VALUE = inter_attr['ACT_VALUE'], - ACT_TYPE = inter_attr['ACT_TYPE'], - ACTION_TYPE = inter_attr['ACTION_TYPE'], - TDL = inter_attr['TDL'], - ORGANISM = inter_attr['ORGANISM'], - ) - ) + result[the_id].add(target_id) - return result + return dict(result) From 71a6f1ab94bfcbb3cdfa5f8e4f72f2ae4695c9b0 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 20:10:30 +0200 Subject: [PATCH 21/32] refactored `hpo_annotations` --- pypath/inputs/go.py | 2 +- pypath/inputs/hpo.py | 43 ++++++++++++++++--------------------------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/pypath/inputs/go.py b/pypath/inputs/go.py index 2fb82497c..25fa657a4 100644 --- a/pypath/inputs/go.py +++ b/pypath/inputs/go.py @@ -68,7 +68,7 @@ def go_annotations_uniprot(organism = 9606, swissprot = 'yes'): def go_annotations_goa( organism = 'human', evidence_codes=False): - + """ Downloads GO annotation from UniProt GOA. 
""" diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py index 4a7e8f413..94b69ed70 100644 --- a/pypath/inputs/hpo.py +++ b/pypath/inputs/hpo.py @@ -23,55 +23,43 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import List, Dict - import csv import collections -import pypath.utils.mapping as map +import pypath.utils.mapping as mapping import pypath.share.curl as curl import pypath.resources.urls as urls import pypath.formats.obo as obo -def hpo_gene_annotations() -> Dict[str, list]: + +def hpo_annotations() -> dict[str, set[str]]: """ - Retrieves Gene-HPO relationships from HPO. + Human Phenotype Ontology annotations. Returns: - namedtuple. + Dict of proteins as keys and sets of HPO terms as values. """ url = urls.urls['hpo']['gene'] c = curl.Curl(url, large = True, silent = False) + _ = next(c.result) - gene = list(csv.DictReader(c.result, delimiter = ',')) - - fields = ('entrez_gene_id','entrez_gene_symbol','HPO_Term_ID') + result = collections.defaultdict(set) - HPOGeneAnnotations = collections.namedtuple('HPOGeneAnnotations', fields,defaults = ("",) * len(fields)) + for r in c.result: - annotations = collections.defaultdict(list) + r = r.strip().split('\t') - for rec in gene: + uniprots = mapping.map_name(r[0], 'entrez', 'uniprot') - values = rec.values() - values = list(values)[0].replace('\t',',').split(',') - id = map.map_name(values[1], 'genesymbol', 'uniprot') - id = list(id) + for uniprot in uniprots: - if id: + result[uniprot].add(r[2]) - annotations[id[0]].append( - HPOGeneAnnotations( - entrez_gene_id = values[0], - entrez_gene_symbol = values[1], - HPO_Term_ID = values[2], - ) - ) + return result - return annotations -def hpo_disease_annotations() -> List[tuple] : +def hpo_disease_annotations() -> list[tuple] : """ Retrieves Disease-HPO relationships from HPO. 
@@ -110,7 +98,8 @@ def hpo_disease_annotations() -> List[tuple] : return result -def hpo_ontology() -> List[tuple] : + +def hpo_ontology() -> list[tuple] : """ Retrieves ontology from HPO. From 6a7462b026e157268648aa8b2712f444233122d9 Mon Sep 17 00:00:00 2001 From: deeenes Date: Sat, 20 Aug 2022 21:10:33 +0200 Subject: [PATCH 22/32] refactored `inputs.hpo` --- pypath/inputs/hpo.py | 165 +++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py index 94b69ed70..1e64feaea 100644 --- a/pypath/inputs/hpo.py +++ b/pypath/inputs/hpo.py @@ -23,7 +23,9 @@ # Website: http://pypath.omnipathdb.org/ # -import csv +from typing import Union + +import re import collections import pypath.utils.mapping as mapping @@ -59,122 +61,119 @@ def hpo_annotations() -> dict[str, set[str]]: return result -def hpo_disease_annotations() -> list[tuple] : +def hpo_terms() -> dict[str, str]: + """ + Human Phenotype Ontology accession to term mapping. """ - Retrieves Disease-HPO relationships from HPO. + + return hpo_ontology()['terms'] + + +def hpo_diseases() -> dict[str, set[tuple]]: + """ + HPO term-disease relationships from Human Phenotype Ontology. Returns: - namedtuple. + A set of disease records for each HPO term. 
""" url = urls.urls['hpo']['disease'] c = curl.Curl(url, large = True, silent = False) - disease = list(csv.DictReader(c.result, delimiter = '\t')) + HpoDisease = collections.namedtuple( + 'HpoDisease', + ( + 'omim', + 'name', + 'pmid', + 'evidence', + 'onset', + 'frequency', + 'sex', + 'modifier', + 'aspect', + ), + ) - fields = ('DatabaseID', 'DiseaseName', 'Qualifier', 'HPO_ID', 'Reference', 'Evidence', 'Aspect') + result = collections.defaultdict(set) - HPODiseaseAnnotations = collections.namedtuple('HPODiseaseAnnotations', fields,defaults = ("",) * len(fields)) + for r in c.result: - result = [] + if r[0] == '#': continue - for i in range(4,len(disease)): + r = r.split('\t') - values = disease[i].values() - values = list(values) + pmid = re.sub('^PMID:', '', r[4]) if r[4][:4] == 'PMID' else None - result.append( - HPODiseaseAnnotations( - DatabaseID = values[0], - DiseaseName = values[1][0], - Qualifier = values[1][1], - HPO_ID = values[1][2], - Reference = values[1][3], - Evidence = values[1][4], - Aspect = values[1][9], - ) + result[r[3]].add( + HpoDisease( + omim = r[0], + name = r[1], + pmid = pmid, + evidence = r[5] or None, + onset = r[6] or None, + frequency = r[7] or None, + sex = r[8] or None, + modifier = r[9] or None, + aspect = r[10], ) + ) - - return result + return dict(result) -def hpo_ontology() -> list[tuple] : +def hpo_ontology() -> dict[str, dict[str, Union[str, set[str]]]]: """ - Retrieves ontology from HPO. + Ontology data from HPO. Returns: - namedtuple. + Five dictionaries with term names, term definitions, parents in the + ontology tree, term synonyms and cross references to other databases. + The dicts "terms" and "defs" are one-to-one, while "parents", + "synonyms" and "xrefs" are one-to-many mappings, the keys are always + HPO terms. 
""" url = urls.urls['hpo']['ontology'] reader = obo.Obo(url) - hpo_ontology = [i for i in reader] - - - fields = ('hpo_id','term_name','synonyms','xrefs','is_a') - Ontology = collections.namedtuple('Ontology', fields,defaults = ("",) * len(fields)) + result = { + 'terms': {}, + 'defs': {}, + 'parents': collections.defaultdict(set), + 'synonyms': collections.defaultdict(set), + 'xrefs': collections.defaultdict(set), + } + for r in reader: - result = [] + if r.stanza != 'Term': continue - for rec in hpo_ontology: + term = r.id.value - syn_lst = [] - xref_lst = [] - isa_lst = [] + name = (r.name.value, r.name.modifiers) + name = ' '.join(n for n in name if n) + result['terms'][term] = name - if rec[2][1]: + result['defs'][term] = r.definition.value if r.definition else None - name = rec[2][0] + " " + rec[2][1] + for key, obokey in ( + ('parents', 'is_a'), + ('synonyms', 'synonym'), + ('xrefs', 'xref'), + ): - else: - - name = rec[2][0] - - result.append( - Ontology( - hpo_id = rec[1][0], - term_name = name, + proc = ( + lambda x: tuple(x.split(':')) + if key == 'xrefs' else + lambda x: x ) - ) - - if rec[5].get('synonym'): - synonym = list(rec[5].get('synonym')) - - for i in synonym: - - syn = i[0] + " " + i[1] - syn_lst.append(syn) - - result[-1] = result[-1]._replace( - synonyms = syn_lst - ) - - if rec[5].get('xref'): - - xref = list(rec[5].get('xref')) - - for i in xref: - - xref_lst.append(i[0]) - - result[-1] = result[-1]._replace( - xrefs = xref_lst - ) - - if rec[5].get('is_a'): - - is_a = list(rec[5].get('is_a')) - - for i in is_a: - - isa_lst.append(i[0] + " : " + i[2]) - - result[-1] = result[-1]._replace( - is_a = isa_lst + result[key][term].update( + { + proc(x.value) + for x in r.attrs.get(obokey, ()) + } ) - return result + return {k, dict(v) for k, v in result.items()} From a5d98d2db55faf047af624200ed19ad7d6b3748c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Mon, 22 Aug 2022 
09:52:03 +0300 Subject: [PATCH 23/32] Update urls.py --- pypath/resources/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypath/resources/urls.py b/pypath/resources/urls.py index 4245fbc55..e39e2d371 100644 --- a/pypath/resources/urls.py +++ b/pypath/resources/urls.py @@ -1544,7 +1544,7 @@ 'label': 'DrugBank database', 'all_structures': 'https://go.drugbank.com/releases/5-1-9/' 'downloads/all-structure-links', - 'all_drug': 'https://go.drugbank.com/releases/5-1-9/downloads/' + 'all_drugs': 'https://go.drugbank.com/releases/5-1-9/downloads/' 'all-drug-links', 'drug_target_identifiers' : 'https://go.drugbank.com/releases/' '5-1-9/downloads/target-all-polypeptide-ids', From 7b66002c190efc8b9532a3ebdef95d1ce08c8f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Mon, 22 Aug 2022 10:02:37 +0300 Subject: [PATCH 24/32] Update credentials.py --- pypath/inputs/credentials.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py index e8f8a90ff..c11151e2d 100644 --- a/pypath/inputs/credentials.py +++ b/pypath/inputs/credentials.py @@ -27,7 +27,7 @@ import os import pypath.share.settings as settings -import pypath.share.settings as session +import pypath.share.session as session _logger = session.Logger(name = 'credentials') _log = _logger._log From 75e6f9ad32a97a03559149738a19c64850bb321d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Mon, 22 Aug 2022 17:44:05 +0300 Subject: [PATCH 25/32] Update drugbank.py --- pypath/inputs/drugbank.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index fe5afeab4..3bea4cde5 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -87,7 +87,7 @@ def _drugbank_download( auth_str = 
base64.b64encode(f"{cred['user']}:{cred['passwd']}".encode()) defaults['req_headers'] = [ - f'Authorization: Basic {auth.decode()}', + f'Authorization: Basic {auth_str.decode()}', settings.get('user_agent'), ] @@ -149,11 +149,13 @@ def drugbank_raw_interactions( for l in c.result[csv_name]: - drugs, uniprot = l.strip().split(',') + drugs, uniprot = l.strip().split(',')[-1], l.strip().split(',')[5] + + drugs = drugs.strip().split(';') result.extend( DrugbankRawInteraction( - drugbank_id = drug, + drugbank_id = drug.strip(), uniprot_id = uniprot, relation = rel, ) @@ -187,7 +189,7 @@ def drugbank_interactions( raw = drugbank_raw_interactions( user = user, passwd = passwd, - harma_active = pharma_active, + pharma_active = pharma_active, credentials_fname = credentials_fname, ) @@ -349,7 +351,7 @@ def drugbank_annotations( drugs = drugbank_drugs( user = user, - passwd = passwd + passwd = passwd, credentials_fname = credentials_fname, ) @@ -422,7 +424,7 @@ def id_type_proc(_id_type): drugs = drugbank_drugs( user = user, - passwd = passwd + passwd = passwd, credentials_fname = credentials_fname, ) From aebcb69b733a7e6f6c27444dadd327b3db4d24e3 Mon Sep 17 00:00:00 2001 From: Erva Ulusoy <95041228+ervau@users.noreply.github.com> Date: Sat, 27 Aug 2022 01:19:55 +0300 Subject: [PATCH 26/32] refactored `inputs.drugbank` and `inputs.credentials` --- pypath/inputs/credentials.py | 8 ++++---- pypath/inputs/drugbank.py | 34 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py index c11151e2d..aff3c26f1 100644 --- a/pypath/inputs/credentials.py +++ b/pypath/inputs/credentials.py @@ -22,7 +22,7 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import Optional +from typing import Optional, Tuple, Dict import os @@ -34,10 +34,10 @@ def credentials( - *args: tuple[str, str], + *args: Tuple[str, str], resource: Optional[str] = None, from_file: Optional[str] = None, - 
**kwargs: dict[str, str], + **kwargs: Dict[str, str], ) -> dict: """ Credentials required for restricted access resources. @@ -66,7 +66,7 @@ def credentials( fields = ('user', 'passwd') kwargs.update(dict(zip(fields, args))) - kwargs = dict(it for kwargs.items() if it[1] is not None) + kwargs = dict(it for it in kwargs.items() if it[1] is not None) if all(f in kwargs for f in fields): diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 3bea4cde5..69df0036a 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,7 +23,7 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import Optional +from typing import Optional, Tuple, List, Set, Dict import re import csv @@ -44,7 +44,7 @@ def _drugbank_credentials( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> tuple[str, str]: + ) -> Tuple[str, str]: return credentials.credentials( user = user, @@ -99,7 +99,7 @@ def drugbank_raw_interactions( passwd: Optional[str] = None, credentials_fname: Optional[str] = None, pharma_active: bool = False, - ) -> list[tuple] : + ) -> List[tuple] : """ Retrieves protein identifiers from Drugbank. @@ -170,7 +170,7 @@ def drugbank_interactions( passwd: Optional[str] = None, credentials_fname: Optional[str] = None, pharma_active: bool = False, - ) -> list[tuple] : + ) -> List[tuple] : """ Drug-protein and protein-drug interactions from Drugbank. @@ -222,8 +222,8 @@ def drugbank_interactions( result.append( DrugbankInteraction( - *src_tgt(r.uniprot_id, drug.pubchem_cid), - *src_tgt('protein', 'drug'), + *src_tgt((r.uniprot_id, drug.pubchem_cid)), + *src_tgt(('protein', 'drug')), interaction_type = r.relation, ) ) @@ -235,7 +235,7 @@ def drugbank_drugs( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> list[tuple]: + ) -> List[tuple]: """ Retrieves drug identifiers from Drugbank. 
@@ -275,21 +275,21 @@ def drugbank_drugs( for table in ('drug', 'structure'): - csv = f'{table} links.csv' + csv_ = f'{table} links.csv' c = _drugbank_download( url = urls.urls['drugbank'][f'all_{table}s'], user = user, passwd = passwd, credentials_fname = credentials_fname, - files_needed = (csv,), + files_needed = (csv_,), ) if not c: continue raw[table] = dict( (rec['DrugBank ID'], rec) - for rec in csv.DictReader(c.result[csv], delimiter = ',') + for rec in csv.DictReader(c.result[csv_], delimiter = ',') ) DrugbankDrug = collections.namedtuple( @@ -321,7 +321,7 @@ def drugbank_drugs( pubchem_sid = struct['PubChem Substance ID'], chebi = struct['ChEBI ID'], chembl = struct['ChEMBL ID'], - pharmgkb = drug.get('PharmGKB ID', None) + pharmgkb = drug.get('PharmGKB ID', None), het = drug.get('HET ID', None), ) ) @@ -333,7 +333,7 @@ def drugbank_annotations( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> dict[str, set[tuple]]: + ) -> Dict[str, Set[tuple]]: """ Drug annotations from Drugbank. @@ -385,13 +385,13 @@ def drugbank_mapping( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> dict[str, set[str]]: + ) -> Dict[str, Set[str]]: """ Identifier translation table from DrugBank. Available ID types: drugbank, name, type, groups, cas, inchikey, - inchi, smiles, formula, kegg_compound, kegg_drug, pubchem_cid, - pubchem_sid, chebi, chembl, pharmgkb, het. + inchi, smiles, formula, kegg_compound, kegg_drug, pubchem_compound, + pubchem_substance, chebi, chembl, pharmgkb, het. 
Args: id_type: @@ -431,7 +431,7 @@ def id_type_proc(_id_type): result = collections.defaultdict(set) id_type = id_type_proc(id_type) - target_id_type = id_type_proc(id_type) + target_id_type = id_type_proc(target_id_type) for d in drugs: @@ -442,4 +442,4 @@ def id_type_proc(_id_type): result[the_id].add(target_id) - return dict(result) + return dict(result) \ No newline at end of file From 475a62cb047ad4ed06969e58a9fa09b98dbf8bac Mon Sep 17 00:00:00 2001 From: Erva Ulusoy <95041228+ervau@users.noreply.github.com> Date: Sat, 27 Aug 2022 09:46:07 +0300 Subject: [PATCH 27/32] refactored typings back to "drugbank: trivial refactoring" --- pypath/inputs/drugbank.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pypath/inputs/drugbank.py b/pypath/inputs/drugbank.py index 69df0036a..2f0c91570 100644 --- a/pypath/inputs/drugbank.py +++ b/pypath/inputs/drugbank.py @@ -23,7 +23,7 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import Optional, Tuple, List, Set, Dict +from typing import Optional import re import csv @@ -44,7 +44,7 @@ def _drugbank_credentials( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> Tuple[str, str]: + ) -> tuple[str, str]: return credentials.credentials( user = user, @@ -99,7 +99,7 @@ def drugbank_raw_interactions( passwd: Optional[str] = None, credentials_fname: Optional[str] = None, pharma_active: bool = False, - ) -> List[tuple] : + ) -> list[tuple] : """ Retrieves protein identifiers from Drugbank. @@ -170,7 +170,7 @@ def drugbank_interactions( passwd: Optional[str] = None, credentials_fname: Optional[str] = None, pharma_active: bool = False, - ) -> List[tuple] : + ) -> list[tuple] : """ Drug-protein and protein-drug interactions from Drugbank. 
@@ -235,7 +235,7 @@ def drugbank_drugs( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> List[tuple]: + ) -> list[tuple]: """ Retrieves drug identifiers from Drugbank. @@ -333,7 +333,7 @@ def drugbank_annotations( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> Dict[str, Set[tuple]]: + ) -> dict[str, set[tuple]]: """ Drug annotations from Drugbank. @@ -385,7 +385,7 @@ def drugbank_mapping( user: Optional[str] = None, passwd: Optional[str] = None, credentials_fname: Optional[str] = None, - ) -> Dict[str, Set[str]]: + ) -> dict[str, set[str]]: """ Identifier translation table from DrugBank. @@ -442,4 +442,4 @@ def id_type_proc(_id_type): result[the_id].add(target_id) - return dict(result) \ No newline at end of file + return dict(result) From 6ca6d40f3b8481709e32f8adad015be6e78668a1 Mon Sep 17 00:00:00 2001 From: Erva Ulusoy <95041228+ervau@users.noreply.github.com> Date: Sat, 27 Aug 2022 09:48:50 +0300 Subject: [PATCH 28/32] refactored typing back to original in `inputs.credentials` --- pypath/inputs/credentials.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypath/inputs/credentials.py b/pypath/inputs/credentials.py index aff3c26f1..1d9ac4560 100644 --- a/pypath/inputs/credentials.py +++ b/pypath/inputs/credentials.py @@ -22,7 +22,7 @@ # Website: http://pypath.omnipathdb.org/ # -from typing import Optional, Tuple, Dict +from typing import Optional import os @@ -34,10 +34,10 @@ def credentials( - *args: Tuple[str, str], + *args: tuple[str, str], resource: Optional[str] = None, from_file: Optional[str] = None, - **kwargs: Dict[str, str], + **kwargs: dict[str, str], ) -> dict: """ Credentials required for restricted access resources. 
From 87c768948d35b4a96b432df18b7bac896c569761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Sat, 27 Aug 2022 18:12:05 +0300 Subject: [PATCH 29/32] Update chembl.py --- pypath/inputs/chembl.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pypath/inputs/chembl.py b/pypath/inputs/chembl.py index 0bfbae8f4..9eb79f15c 100644 --- a/pypath/inputs/chembl.py +++ b/pypath/inputs/chembl.py @@ -82,7 +82,7 @@ def chembl_targets() -> list[tuple]: ChemblTarget( accession = ( tgt['target_components'][0]['accession'] - if 'target_components' in tgt else + if tgt['target_components'] else None ), target_chembl_id = tgt['target_chembl_id'], @@ -165,8 +165,16 @@ def chembl_molecules() -> list[tuple]: """ def _get(mol, key0, key1): - - return mol.get(f'molecule_{key0}', {}).get(key1, None) + + molecule_properties = mol.get(f'molecule_{key0}', {}) + + if molecule_properties: + + return molecule_properties.get(key1, None) + + else: + + return None fields_molecule = ( @@ -205,7 +213,7 @@ def _get(mol, key0, key1): url = ( f"{urls.urls['chembl']['url']}" - f"{lst['page_meta']['next']}" + f"{page_dct['page_meta']['next']}" ) else: @@ -253,9 +261,9 @@ def _get(mol, key0, key1): def chembl_activities( - pchembl_value_none: bool = False, #TODO: are these below all the allowed values? standard_relation: Literal['=', '>', '<', '>=', '<='], + pchembl_value_none: bool = False, ) -> list[tuple] : """ Retrieves activities data from ChEMBL. 
@@ -295,7 +303,7 @@ def chembl_activities( while True: - if not page_lst: + if not page_dct: url = ( @@ -309,7 +317,7 @@ def chembl_activities( url = ( f"{urls.urls['chembl']['url']}" - f"{lst['page_meta']['next']}" + f"{page_dct['page_meta']['next']}" ) else: From fd24c5501a384774e0cbe8d7014c7dfb0496964b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Wed, 31 Aug 2022 13:04:37 +0300 Subject: [PATCH 30/32] Update hpo.py --- pypath/inputs/hpo.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py index 1e64feaea..e6c062eb0 100644 --- a/pypath/inputs/hpo.py +++ b/pypath/inputs/hpo.py @@ -34,17 +34,21 @@ import pypath.formats.obo as obo -def hpo_annotations() -> dict[str, set[str]]: +def hpo_annotations() -> dict[str, set[tuple]]: """ Human Phenotype Ontology annotations. Returns: - Dict of proteins as keys and sets of HPO terms as values. + Dict of proteins as keys and sets of HPO annotations as values. 
""" url = urls.urls['hpo']['gene'] c = curl.Curl(url, large = True, silent = False) _ = next(c.result) + + fields = ('entrez_gene_id','entrez_gene_symbol','hpo_id') + + HPOAnnotations = collections.namedtuple('HPOAnnotations', fields,defaults = ("",) * len(fields)) result = collections.defaultdict(set) @@ -56,7 +60,13 @@ def hpo_annotations() -> dict[str, set[str]]: for uniprot in uniprots: - result[uniprot].add(r[2]) + result[uniprot].add( + HPOAnnotations( + entrez_gene_id = r[0], + entrez_gene_symbol = r[1], + hpo_id = r[2], + ) + ) return result From 042fa92392738c245572e7d4fe760fdfe1e3195d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Wed, 31 Aug 2022 13:06:03 +0300 Subject: [PATCH 31/32] Update annot.py --- pypath/core/annot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypath/core/annot.py b/pypath/core/annot.py index 034476831..8bfa606cf 100644 --- a/pypath/core/annot.py +++ b/pypath/core/annot.py @@ -6922,7 +6922,7 @@ def __init__(self, **kwargs): self, name = 'HPO', ncbi_tax_id = constants.NOT_ORGANISM_SPECIFIC, - input_method = 'hpo.hpo_gene_annotations', + input_method = 'hpo.hpo_annotations', **kwargs ) From deb7d434c44a2325444d2e8752dd7923d895a3da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tennur=20K=C4=B1l=C4=B1=C3=A7?= <99681146+tnnrklc@users.noreply.github.com> Date: Sat, 10 Sep 2022 18:45:23 +0300 Subject: [PATCH 32/32] Update hpo.py --- pypath/inputs/hpo.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pypath/inputs/hpo.py b/pypath/inputs/hpo.py index e6c062eb0..f3d0406c3 100644 --- a/pypath/inputs/hpo.py +++ b/pypath/inputs/hpo.py @@ -96,6 +96,7 @@ def hpo_diseases() -> dict[str, set[tuple]]: 'omim', 'name', 'pmid', + 'qualifier', 'evidence', 'onset', 'frequency', @@ -120,6 +121,7 @@ def hpo_diseases() -> dict[str, set[tuple]]: omim = r[0], name = r[1], pmid = pmid, + qualifier = r[2] or None, evidence = 
r[5] or None, onset = r[6] or None, frequency = r[7] or None, @@ -179,11 +181,14 @@ def hpo_ontology() -> dict[str, dict[str, Union[str, set[str]]]]: lambda x: x ) - result[key][term].update( - { - proc(x.value) - for x in r.attrs.get(obokey, ()) - } - ) - - return {k, dict(v) for k, v in result.items()} + for x in r.attrs.get(obokey, ()): + y = proc(x.value) + result[key][term].update( + { + y(x.value) + if type(y) != tuple else + y + } + ) + + return {k: dict(v) for k, v in result.items()}