# process_input_data.py
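"""
Utilities for processing PUMA input files (pathways, metabolite masses, and observed masses)
into the model configuration needed to run PUMA.
"""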
import os
import numpy as np
import json
import pandas as pd
def build_model_config(eta, tau, pathway_dict, compounds, data_observed_weights,
a_init,
landa, gamma, mu, observed_weight_vector,
serialize=False, json_filename=None):
"""
    Builds a model configuration and saves it to a json file if requested
    Parameters
    ----------
    eta: 2D array, mapping from pathways to compounds
    tau: 2D array, mapping from compounds to masses
    pathway_dict: dictionary of 'pathway' and 'compounds'
    compounds: sorted list of compounds in the model
    data_observed_weights: sorted list of all observed masses
    a_init: 1D array, initial activity for all pathways
    landa, gamma, mu: model hyperparameters (landa: probability of a pathway being active)
    observed_weight_vector: binary 1D array indicating which masses are observed
    serialize: if true, write the model configuration to json_filename
Returns
-------
model_config : dictionary
dictionary containing all data and initialization needed to run PUMA
"""
model_config = {
"landa" : landa,
"gamma" : gamma,
"mu": mu,
"a_init" : a_init.tolist(),
"eta": eta.tolist(),
"tau": tau.tolist(),
"pathway_dict": pathway_dict,
"compounds": list(compounds),
"data_observed_weights": data_observed_weights,
"observed_weight_vector": observed_weight_vector.tolist()
}
if serialize:
with open(json_filename, 'w') as outfile:
json.dump(model_config, outfile)
return model_config
def load_json():
"""
    Loads the serialized model configuration json file (modelConfig.json) from the PUMA_OUTPUT_DATA directory
Returns
-------
model_config:
data structure holding all info required to run PUMA
"""
json_filename = "modelConfig.json"
if 'PUMA_OUTPUT_DATA' in os.environ:
outdata_dir = os.environ['PUMA_OUTPUT_DATA']
json_file = os.path.join(outdata_dir, json_filename)
        if not os.path.isfile(json_file):
            print("PUMA Error: unknown json file")
            exit()
    else:
        print("PUMA Error: cannot find output dir")
        exit()
with open(json_file) as data_file:
model_config = json.load(data_file)
model_config["a_init"] = np.asarray(model_config["a_init"])
model_config["tau"] = np.asarray(model_config["tau"])
model_config["eta"] = np.asarray(model_config["eta"])
model_config["observed_weight_vector"] = np.asarray(model_config["observed_weight_vector"])
return model_config
def initialize_pathway_activity(eta):
"""
    Initializes pathway activities; by default, and as the model requires, all pathways are initialized to active (1)
Parameters
----------
eta: 2D array, mapping from pathway to compounds
Returns
-------
a_init : 1D array
initialization of activity for all pathways
"""
a_init = np.ones(eta.shape[0])
return a_init
def create_compound_to_masses_tau_matrix(met_dict, compounds):
"""
Create an f by k matrix mapping of compounds to masses
Parameters
----------
    met_dict: dictionary mapping each compound ID to a one-element list containing its mass
    compounds: sorted list of compound IDs
Returns
-------
tau : 2D array
Binary mapping of compounds to Masses. Compounds are sorted alphabetically. Masses are sorted in ascending
order
mass_values: list
Sorted list (ascending order) of all masses in the met_dict
"""
assert(len(compounds) == len(met_dict))
mass_values = set([x[0] for x in met_dict.values()])
mass_values = sorted(mass_values)
f = len(compounds)
k = len(mass_values)
tau = np.zeros((f, k))
    for compound in compounds:
        # each compound has exactly one mass; set tau to 1 at that (compound, mass) entry
        the_mass = met_dict[compound][0]
        k_index = mass_values.index(the_mass)
        tau[compounds.index(compound)][k_index] = 1
return tau, mass_values
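
# Illustrative example (a sketch, not executed); compound IDs and masses are hypothetical:
#
#   met_dict  = {"C00022": [88.0160], "C00036": [132.0059]}
#   compounds = ["C00022", "C00036"]
#   tau, mass_values = create_compound_to_masses_tau_matrix(met_dict, compounds)
#   # mass_values -> [88.0160, 132.0059]
#   # tau         -> [[1., 0.],
#   #                 [0., 1.]]
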
def create_observed_weight_vector(data_observed_weights, mass_values, ppm=15):
    """
    Create a 1 by k vector indicating whether each model mass was observed.
    Parameters
    ----------
    data_observed_weights:
        input data of all observed masses
    mass_values:
        sorted list (ascending order) of all masses in the model
    ppm:
        mass tolerance, in parts per million, used to match observed masses to model masses
    Returns
    -------
    observed_weight_vector: 1D array, 1xk
        binary vector indicating whether a particular mass is observed in data_observed_weights
    """
# the observed weight vector should be of the same width as the tau matrix k entries.
# The observed weight vector is organized from low mass to high, with 1 entries for observed weight
k = len(mass_values)
observed_weight_vector = np.zeros((1, k))
keep_observed_masses = []
n_observed_mets_in_model = 0
for x in mass_values:
for w in data_observed_weights:
            # an observed mass may map to one or more compounds
            # if the model mass x falls within +/- ppm tolerance of the observed mass w, map w to the metabolite
            tolerance = w * ppm / 1000000.
            found = (w - tolerance) <= x <= (w + tolerance)
if found:
keep_observed_masses.append(x)
observed_weight_vector[0][mass_values.index(x)] = 1
n_observed_mets_in_model+=1
return observed_weight_vector[0]
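
# Illustrative example (a sketch, not executed); masses are hypothetical:
#
#   mass_values           = [88.0160, 132.0059]
#   data_observed_weights = [88.0162]   # within 15 ppm of 88.0160
#   create_observed_weight_vector(data_observed_weights, mass_values)
#   # -> array([1., 0.])
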
def create_pathways_metabolite_eta_matrix(pathway_dict, compounds):
"""
Create eta: a map from pathways to metabolites
Parameters
----------
    pathway_dict:
        dictionary of {'pathway': ['p1', 'p2', ...], 'compounds': ['C00001 C00002 C00003', 'C000x C000x']}
compounds:
sorted unique list of compounds from the pathways that have mass measurements
Returns
-------
    eta: 2D array, p by f
        binary mapping of pathways to compounds
"""
p = len(pathway_dict['pathway'])
f = len(compounds)
eta = np.zeros((p, f))
for i in range(p):
        mets_in_pathway = pathway_dict['compounds'][i].split()
for m in mets_in_pathway:
eta[i][compounds.index(m)] = 1
return eta
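
# Illustrative example (a sketch, not executed); pathway names and compound IDs are hypothetical:
#
#   pathway_dict = {'pathway': ['p1', 'p2'],
#                   'compounds': ['C00022 C00036', 'C00036']}
#   compounds    = ['C00022', 'C00036']
#   create_pathways_metabolite_eta_matrix(pathway_dict, compounds)
#   # -> array([[1., 1.],
#   #           [0., 1.]])
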
def process_input_files(pathway_file, met_file, obs_file, serialize=False, json_filename = None):
"""
Process input files to create data structures to run PUMA
:param pathway_file:
# Format: each line has pathway name, and metabolites in the pathway
# first row is a header: pathway, compounds
# cge00010,C00036 C15973 C00111 C00103 C00022 C00118
# every compound to be used in PUMA should belong to 1 or more pathways based on reading this file.
:param met_file:
# Format: each line has a compound ID followed by mass as specified by network model
# C00015,404.0022
# or 'C00015', 404.0022 # does not work yet
# first row is a header: compound, mass
:param obs_file:
# merged_observations_file
# if multiple sample files, this is the merged observations; there is a likely compound identified by other tools,
# and a confidence score.
# row zero is a header
# Column 0 will be the mass; last column will be the most likely match found using other tools (but not used
# when processing the input files as we do here. here we only care about the precursor mass )
# Format: Precursor Mass Retention Time Observation Confidence Average Observation Score Compound
# 176.0565 12.68 0.75 0.75 C20844
:param serialize:
#if true create model_config
:param json_filename:
Returns
-------
    eta: 2D numpy array
        Binary mapping of pathways to compounds. Compounds are sorted alphabetically.
    tau : 2D array
        Binary mapping of compounds to masses. Compounds are sorted alphabetically. Masses are sorted in ascending order.
pathway_dict: dict with 'pathway' and 'compounds' keys
values are lists of pathways, and corresponding lists of compounds
compounds: list
sorted listing of compounds
data_observed_weights: list
sorted listing of all observed masses
observed_weight_vector: 1D array
Binary vector indicating if a particular mass is observed
"""
# process pathway_file
    pathway_df = pd.read_csv(pathway_file)
pathway_dict = pathway_df.to_dict('list')
compounds_listing = list(pathway_dict['compounds'])
#find unique compound listing
compounds_list = sorted(set([x for item in compounds_listing for x in item.split()]))
# process met_file
    met_df = pd.read_csv(met_file)
    met_dict = met_df.set_index('compound').T.to_dict('list')
    mets_with_masses = set(met_dict.keys())
# Check consistency of pathway and met files:
compounds_listing_in_pathways = set(compounds_list)
# Are there compounds without masses? In compounds_listing_in_pathways but not in mets_with_masses
compounds_with_no_masses = compounds_listing_in_pathways - mets_with_masses
    if len(compounds_with_no_masses) > 0:
# print ("PUMA Warning 1: Compound in pathway does not have a matching mass. Please specify a mass for each "
# "compound"
# "For now %d compounds will be removed:" % len (compounds_with_no_masses))
# print('compounds_with_no_masses:', list(compounds_with_no_masses))
pathway_dict, compounds_listing = \
return_sanitized_pathways_and_compounds_listing(pathway_dict, compounds_with_no_masses)
compounds_list = sorted(set([x for item in compounds_listing for x in item.split()]))
# are there still any issues that cannot be resolved?
if(len(set(compounds_list) - mets_with_masses)>0):
print("PUMA ERROR 6: still have a problem removing compounds that have no masses")
exit()
# Are there mass compounds listed in the masses file without a mention in the pathways file?
compounds_not_in_pathways = mets_with_masses-set(compounds_list)
    if len(compounds_not_in_pathways) > 0:
# print("PUMA Warning 2: There exists mass compounds in %s,\n but compounds not in pathways."
# "Please remove them from mass listing,"
# "or they will be ignored" % met_file)
# print("%d compounds are not in pathways; however the masses are given" %len(compounds_not_in_pathways))
#delete them from met dictionary
for k in compounds_not_in_pathways:
del met_dict[k]
# Create mapping from pathways to metabolites
eta = create_pathways_metabolite_eta_matrix(pathway_dict, compounds_list)
# create mapping from metabolites to masses
tau, unique_masses = create_compound_to_masses_tau_matrix(met_dict, compounds_list)
print("pathways_in_model:", pathway_dict['pathway'])
print("number_pathways_in_model:", len(pathway_dict['pathway']))
print("metabolites_in_model:", compounds_list)
print("number_metabolites_in_model:", len(compounds_list))
    # run sanity check over matrices
sanity_check(eta, tau)
# create observed_weight_vector
    obs_df = pd.read_csv(obs_file)
    data_observed_weights = obs_df['mz'].values.tolist()
    data_observed_weights = sorted(data_observed_weights)
# Note: we no longer need tau to pass to this function
observed_weight_vector = create_observed_weight_vector(data_observed_weights, unique_masses)
n_observed_masses = len([observed_mass for observed_mass in observed_weight_vector if observed_mass])
return eta, tau, pathway_dict, compounds_list, data_observed_weights, observed_weight_vector
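
# Minimal usage sketch (assumes the three CSV files described in the docstring above exist;
# the file names here match the defaults used in build_config below):
#
#   eta, tau, pathway_dict, compounds, data_observed_weights, observed_weight_vector = \
#       process_input_files("pathways.csv", "masses.csv", "observed_data.csv")
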
def return_sanitized_pathways_and_compounds_listing(pathway_dict, compounds_with_no_masses):
"""
    Removes compounds without masses from each pathway, then removes pathways which no longer
    have any metabolites with corresponding masses
    Parameters
    ----------
    pathway_dict:
        dictionary of {'pathway': ['p1', 'p2', ...], 'compounds': ['C00001 C00002 C00003', 'C000x C000x']}
    compounds_with_no_masses:
        set of compounds that appear in pathways but have no mass in the masses file
    Returns
    -------
    sanitized pathway_dict, compounds_listing
"""
compounds_listing = []
for item in pathway_dict['compounds']:
        kept_compounds = set(item.split()) - compounds_with_no_masses
        compounds_listing.append(' '.join(str(e) for e in kept_compounds))
pathway_dict['compounds'] = compounds_listing
sanitized_compounds_listing = []
to_remove_indices = []
for idx, each in enumerate(compounds_listing):
if each == "":
# print("pathway %d has no metabolites with corresponding mass, so it will be removed" % idx, '\n')
to_remove_indices.append(idx)
else:
sanitized_compounds_listing.append(each)
compounds_listing = sanitized_compounds_listing
pathway_dict['compounds'] = compounds_listing
    sanitized_pathway_list = [pathway for index, pathway in enumerate(pathway_dict["pathway"])
                              if index not in to_remove_indices]
    pathway_dict["pathway"] = sanitized_pathway_list
return pathway_dict, compounds_listing
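
# Illustrative example (a sketch, not executed); 'C99999' stands in for a compound with no mass:
#
#   pathway_dict = {'pathway': ['p1', 'p2'], 'compounds': ['C00022', 'C99999']}
#   return_sanitized_pathways_and_compounds_listing(pathway_dict, {'C99999'})
#   # -> ({'pathway': ['p1'], 'compounds': ['C00022']}, ['C00022'])
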
def sanity_check(eta, tau):
    """
    Sanity checks: every pathway must generate at least one metabolite, every metabolite must
    belong to at least one pathway, and every metabolite must map to exactly one mass.
    """
    # sanity check of the pathway-metabolite matrix (eta, p pathways by f metabolites)
    pm_metabolite_sum = np.sum(eta, axis=0)  # one entry per metabolite (column)
    pm_pathway_sum = np.sum(eta, axis=1)     # one entry per pathway (row)
    if np.min(pm_pathway_sum) <= 0:
        raise Exception('A row of the pathway-metabolite matrix sums to zero. ' +
                        'It means that some pathway doesn\'t generate any metabolites.')
    if np.min(pm_metabolite_sum) <= 0:
        raise Exception('A column of the pathway-metabolite matrix sums to zero. ' +
                        'It means that some metabolite is not generated by any pathway.')
    # sanity check of the metabolite-mass matrix (tau, f metabolites by k masses)
    mw_mass_sum = np.sum(tau, axis=0)        # one entry per mass (column)
    mw_metabolite_sum = np.sum(tau, axis=1)  # one entry per metabolite (row)
    if np.any(mw_metabolite_sum != 1):
        raise Exception('A row of the metabolite-mass matrix does not sum to 1. ' +
                        'It means that some metabolite has no mass or more than one mass value.')
    if np.min(mw_mass_sum) <= 0:
        raise Exception('A column of the metabolite-mass matrix sums to zero. ' +
                        'It means that some mass does not come from any metabolite.')
def build_config(serialize=False, landa=0.5, gamma=0.9, mu=0.5,
pathway_filename = "pathways.csv", met_filename = "masses.csv",
observations_filename = "observed_data.csv", json_filename = "modelConfig.json"):
"""
Builds configuration for running PUMA using input files
:param serialize:
if true, write out a json file with model configuration
:param landa:
probability of a pathway being active. Currently same value for all pathways
:param gamma:
:param mu:
:param pathway_filename:
:param met_filename:
:param observations_filename:
:param json_filename:
    Returns
--------
model_config:
data structure holding all info required to run PUMA
"""
# setup path for each input file
if 'PUMA_INPUT_DATA' in os.environ:
indata_dir= os.environ['PUMA_INPUT_DATA']
pathway_file = os.path.join(indata_dir, pathway_filename)
met_file = os.path.join(indata_dir, met_filename)
obs_file = os.path.join(indata_dir, observations_filename)
# check that all input files exist:
if not (os.path.isfile(pathway_file)):
print ("PUMA Error 1: unknown pathways file")
if not (os.path.isfile(met_file)):
print ("PUMA Error 2: unknown mets file")
if not (os.path.isfile(obs_file)):
print ("PUMA Error 3: unknown observation file")
if not ((os.path.isfile(pathway_file)) and (os.path.isfile(met_file)) and os.path.isfile(obs_file)):
exit()
    else:
        print("PUMA Error 4: cannot find input data dir in environment")
        exit()
    # setup path for the output json file (only needed when serializing)
    json_file = None
    if serialize:
        if 'PUMA_OUTPUT_DATA' not in os.environ:
            print("PUMA Error: cannot find output data dir in environment")
            exit()
        outdata_dir = os.environ['PUMA_OUTPUT_DATA']
        if not os.path.exists(outdata_dir):  # check that the directory to write the json file exists
            print("PUMA Error 5: unknown output directory")
            exit()
        json_file = os.path.join(outdata_dir, json_filename)
# process the input files
eta, tau, pathway_dict, compounds, data_observed_weights, observed_weight_vector = process_input_files(pathway_file,
met_file,
obs_file)
# initialize pathway activity
a_init = initialize_pathway_activity(eta)
# build a model configuration
model_config = build_model_config(eta, tau, pathway_dict, compounds, data_observed_weights,
a_init,
landa, gamma, mu, observed_weight_vector,
serialize, json_filename=json_file)
return model_config
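
# Typical usage sketch (assumes PUMA_INPUT_DATA, and PUMA_OUTPUT_DATA when serialize=True,
# are set in the environment and that the default CSV files exist):
#
#   model_config = build_config(serialize=True)
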
def main():
    # load a previously serialized model configuration (written by build_config with serialize=True)
    model_config = load_json()
    return model_config


if __name__ == "__main__":
    main()