-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathorphanet_db.py
161 lines (135 loc) · 5 KB
/
orphanet_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import sqlite3
import xml.etree.ElementTree as ET
class orphanet_db:
conn = sqlite3.connect("orphanet.db", check_same_thread=False)
cursor = conn.cursor()
def create_tables(self) -> None:
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS frequency
(frequency_id INTEGER PRIMARY KEY, frequency_name TEXT)
"""
)
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS disorders
(disorder_id INTEGER PRIMARY KEY, orpha_code TEXT, disorder_name TEXT)
"""
)
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS hpo_terms
(hpo_id TEXT PRIMARY KEY, hpo_term TEXT)
"""
)
self.cursor.execute(
"""
CREATE TABLE IF NOT EXISTS disorder_associations (
assoc_id INTEGER PRIMARY KEY,
disorder_id INTEGER,
hpo_id TEXT,
frequency_id INTEGER,
FOREIGN KEY(disorder_id) REFERENCES disorders(disorder_id),
FOREIGN KEY(hpo_id) REFERENCES hpo_terms(hpo_id),
FOREIGN KEY(frequency_id) REFERENCES frequency(frequency_id)
)
"""
)
self.conn.commit()
def create_indices(self) -> None:
self.cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_disorder_associations_hpo_id
ON disorder_associations (hpo_id)
"""
)
self.conn.commit()
def insert_data(self) -> None:
# Insert data into the frequency table
frequency_names = [
"Excluded (0%)",
"Very rare (<4-1%)",
"Occasional (29-5%)",
"Frequent (79-30%)",
"Very frequent (99-80%)",
"Obligate (100%)",
]
for i, name in enumerate(frequency_names):
self.cursor.execute(
"INSERT OR IGNORE INTO frequency VALUES (?, ?)", (i + 1, name)
)
xml_data = open("en_product4.xml", "r").read()
root = ET.fromstring(xml_data)
for disorder in root.findall(".//Disorder"):
disorder_id = disorder.attrib.get("id")
orpha_code = disorder.find("OrphaCode").text
disorder_name = disorder.find("Name").text
# Insert data into the disorders table
self.cursor.execute(
"INSERT OR IGNORE INTO disorders VALUES (?, ?, ?)",
(disorder_id, orpha_code, disorder_name),
)
for association in disorder.findall(".//HPODisorderAssociation"):
assoc_id = association.attrib.get("id")
hpo_id = association.find("HPO/HPOId").text
hpo_term = association.find("HPO/HPOTerm").text
frequency_name = association.find("HPOFrequency/Name").text
frequency_id = frequency_names.index(frequency_name) + 1
# Insert data into the hpo_terms table
self.cursor.execute(
"INSERT OR IGNORE INTO hpo_terms VALUES (?, ?)", (hpo_id, hpo_term)
)
# Insert data into the disorder_associations table
self.cursor.execute(
"INSERT OR IGNORE INTO disorder_associations VALUES (?, ?, ?, ?)",
(assoc_id, disorder_id, hpo_id, frequency_id),
)
# Save (commit) the changes
self.conn.commit()
def create_orphanet_db(self) -> None:
try:
self.create_tables()
self.create_indices()
except sqlite3.Error as e:
print(" ".join(e.args))
return
self.insert_data()
def get_disorders(self, hpo_ids: frozenset[str]) -> list:
if (
not hpo_ids
or len(hpo_ids) == 0
or (len(hpo_ids) == 1 and list(hpo_ids)[0] == "")
):
return []
# Get all disorders with the given hpo_ids
insert = f"{','.join('?' * len(hpo_ids))}"
query = f"""
SELECT orpha_code, disorder_name, hpo_id, hpo_term, frequency_id
FROM (
SELECT *
FROM disorder_associations
WHERE hpo_id in ({insert})
AND frequency_id != 1
) AS filtered_associations
JOIN disorders USING (disorder_id)
JOIN hpo_terms USING (hpo_id)
JOIN frequency USING (frequency_id)
ORDER BY disorder_id, hpo_id
"""
self.cursor.execute(query, tuple(hpo_ids))
return self.cursor.fetchall()
if __name__ == "__main__":
o = orphanet_db()
o.create_orphanet_db()
disorders = o.get_disorders(
frozenset(
[
"HP:0000716", # Depression
"HP:0000952", # Jaundice
"HP:0001369", # Arthritis
"HP:0002240", # Hepatomegaly
"HP:0012115", # Hepatitis
]
)
)
pass