-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathPeakAssignator_v2.py
341 lines (297 loc) · 14.2 KB
/
PeakAssignator_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Module metadata variables
__author__ = "Jose Rodriguez"
__credits__ = ["Jose Rodriguez", "Andrea Laguillo Gómez", "Jesus Vazquez"]
__license__ = "Creative Commons Attribution-NonCommercial-NoDerivs 4.0 Unported License https://creativecommons.org/licenses/by-nc-nd/4.0/"
__version__ = "0.3.0"
__maintainer__ = "Jose Rodriguez"
__email__ = "jmrodriguezc@cnic.es;andrea.laguillo@cnic.es"
__status__ = "Development"
# import modules
import os
import sys
import argparse
import configparser
import logging
import re
import pandas as pd
import concurrent.futures
from itertools import repeat
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
# TODO: allow user to set output column names in the INI
#feather_data = pd.read_feather(r"C:\Users\Andrea\Desktop\data.ftr")
#apex_list = _extract_ApexList(r"C:\Users\Andrea\Desktop\SHIFTS-4\testing\bin002_w7_SL4500Target_ApexList_mass.txt")
###################
# Local functions #
###################
def concatInfiles(peakpickingfile):
'''
Concat input files...
adding Experiment column (dirname of input file), and adding a FWHM columns by Experiment
'''
def _extract_FWHM(file):
with open(file) as f:
data = f.read()
m = re.findall(r'FWHM:\s*([^\n]*)', data)
if m and len(m)>0:
return m[0]
else:
sys.exit("ERROR! FWHM is not defined for {}".format(file))
# read input file
# use high precision with the floats
df = pd.read_csv(peakpickingfile, sep="\t", float_precision='high')
# add folder name into column
foldername = os.path.dirname(peakpickingfile)
df['Experiment'] = foldername
# add filename column
df['Filename'] = os.path.basename(peakpickingfile)
# add fwhm column
#fwhm_file = "{}/{}".format(foldername, fwhm_fname)
#fwhm = _extract_FWHM(fwhm_file)
#fwhm = _extract_FWHM(fwhm_fname)
#df['FWHM'] = float(fwhm)
# assign type to categorical columns
df['Experiment'] = df['Experiment'].astype('category')
df['Filename'] = df['Filename'].astype('category')
#df['Label'] = df['Label'].astype('category')
#df['IsotpicJump'] = df['IsotpicJump'].astype('category')
return df
def closest_peak(apex_list, delta_MH):
'''
Assign a delta_MH value to the closest apex in a list
'''
peak = min(apex_list, key = lambda x : abs(x - delta_MH))
return peak
def find_orphans(ppm_max, theo_mass, peak, delta_MH, peak_label, orphan_label):
'''
Identify orphans and peaks
'''
# window = float(nsigma) * fwhm / 2
distance = abs(peak - delta_MH)
distance_ppm = (distance / (theo_mass + peak)) * 1e6
#max_distance = abs(float(nsigma) * fwhm / 2)
if distance_ppm <= ppm_max:
ID = peak_label
else:
ID = orphan_label
# TODO: return ppm
return ID, distance_ppm
# def get_deltamod(col_CalDeltaMH, col_Peak, orphan_label, col_ClosestPeak):
# '''
# Identify orphans and peaks
# '''
# if col_Peak == orphan_label:
# deltamod = col_CalDeltaMH
# else:
# deltamod = col_ClosestPeak
# return deltamod
def bin_operations(df, apex_list, ppm_max, peak_label, orphan_label,
col_ClosestPeak, col_CalDeltaMH, col_Peak, col_DM, col_TheoMass, col_ppm):
'''
Main function that handles the operations by BIN
'''
# get the BIN value from the input tuple df=(bin,df)
(bin_value, df) = float(df[0]), df[1]
# assign to peaks
df[col_ClosestPeak] = df.apply(lambda x: closest_peak(apex_list, x[col_CalDeltaMH]), axis = 1)
# identify orphans
df[col_Peak] = df.apply(lambda x: find_orphans(ppm_max, x[col_TheoMass], x[col_ClosestPeak], x[col_CalDeltaMH], peak_label, orphan_label)[0], axis = 1)
df[col_ppm] = df.apply(lambda x: find_orphans(ppm_max, x[col_TheoMass], x[col_ClosestPeak], x[col_CalDeltaMH], peak_label, orphan_label)[1], axis = 1)
df[col_Peak] = df[col_Peak].astype('category')
# calculate FDR
# create deltamass column # TODO: Recom
df[col_DM] = df.apply(lambda x: x[col_CalDeltaMH] if (x[col_Peak]==orphan_label) else x[col_ClosestPeak], axis = 1)
#df[col_DM] = df.apply(lambda x: get_deltamod(x[col_CalDeltaMH], x[col_Peak], orphan_label, x[col_ClosestPeak]), axis = 1)
# def peak_FDR():
# for each peak sort by xcorr (comet) # should we separate recom peaks?
# for each peak separate targets and decoys
# make new fdr column rank_D/rank/T
# def local_FDR():
# for each bin sort by xcorr (comet)
# for each bin separate targets and decoys
# make new fdr_column rank_D/rank_T
#def recom_FDR():
# for each recom peak sort by xcorr (recom)
# TO CHECK, print by BIN
# outfile = os.path.join("D:/tmp/kk/", bin+"_kk.tsv")
# df.to_csv(outfile, sep="\t", index=False)
return df
def format_seq(seqdm, dm, decimal_places):
'''
Make column with sequence and deltamass.
'''
#df.apply(lambda x: x[seqdmcolumn].split('[')[0] + '[' + str(round(x[col_DM], decimal_places)) + ']' + x[seqdmcolumn].split(']')[1], axis = 1)
if '[' in str(seqdm):
formatseq = str(seqdm).split('[')[0] + '[' + str(round(float(dm), decimal_places)) + ']' + str(seqdm).split(']')[1]
elif '_' in str(seqdm):
formatseq = str(seqdm).split('_')[0] + '_' + str(round(float(dm), decimal_places))
else:
sys.exit("Unrecognized sequence format in '" + str(config._sections['PeakAssignator'][seqdm]) + "' column!")
return formatseq
#################
# Main function #
#################
def main(args):
'''
Main function
'''
# Variables
ppm_max = abs(float(config._sections['PeakAssignator']['ppm_max']))
col_TheoMass = config._sections['PeakAssignator']['theomh_column']
col_CalDeltaMH = config._sections['PeakAssignator']['caldeltamh_column']
col_ClosestPeak = config._sections['PeakAssignator']['closestpeak_column']
col_Peak = config._sections['PeakAssignator']['peak_column']
col_DM = config._sections['PeakAssignator']['deltamass_column']
col_ppm = config._sections['PeakAssignator']['ppm_column']
peak_label = config._sections['PeakAssignator']['peak_label']
orphan_label = config._sections['PeakAssignator']['orphan_label']
seqdmcolumn = config._sections['PeakAssignator']['mod_peptide_column']
assignseqcolumn = config._sections['PeakAssignator']['assign_seq_column']
decimal_places = int(config._sections['General']['decimal_places'])
# logging.info("get the list of files with the inputs")
# with open(args.infile) as f:
# infiles = f.readlines()
# # you may also want to remove whitespace characters like `\n` at the end of each line
# infiles = [x.strip() for x in infiles]
# logging.debug(infiles)
# read apex list
def _extract_ApexList(file):
with open(file) as f:
data = f.read().split('\n')
data = [x for x in data if x.strip()]
data = np.array(data, dtype=np.float64)
return data
#foldername = os.path.dirname(args.appfile)
#apex_file = "{}/{}".format(foldername, args.appfile)
#apex_list = _extract_ApexList(apex_file)
apex_list = _extract_ApexList(args.appfile)
#logging.info("concat input files...")
#logging.info("adding Experiment column (dirname of input file),")
#logging.info("and adding a FWHM column by Experiment")
# with concurrent.futures.ProcessPoolExecutor(max_workers=args.n_workers) as executor:
# df = executor.map(concatInfiles, infiles)
# df = pd.concat(df)
# df.reset_index(drop=True, inplace=True)
if args.infile.strip().lower().endswith('.feather'):
logging.info("Reading input feather file...")
try:
df = pd.read_feather(args.infile)
except Exception as e:
logging.error("ERROR: Reading the input feather file")
sys.exit(1)
else:
logging.info("Reading input feather file...")
try:
df = pd.read_csv(args.infile, sep="\t", float_precision='high', low_memory=False)
except Exception as e:
logging.error("ERROR: Reading the input tabular file")
sys.exit(1)
logging.info("Create a column with the bin")
df['bin'] = df[col_CalDeltaMH].astype(str).str.extract(r'^([^\.]*)')
logging.info("Parallel the operations by bin")
with concurrent.futures.ProcessPoolExecutor(max_workers=args.n_workers) as executor:
df = executor.map(bin_operations, list(df.groupby("bin")), repeat(apex_list),
repeat(ppm_max),
repeat(peak_label),
repeat( orphan_label),
repeat(col_ClosestPeak),
repeat(col_CalDeltaMH),
repeat(col_Peak),
repeat(col_DM),
repeat(col_TheoMass),
repeat(col_ppm))
df = pd.concat(df)
#logging.info("calculate gobal FDR")
#df = get_global_FDR(df, args.xcorr)
#logging.info("sort by DeltaMax cal")
#df.sort_values(by=[col_CalDeltaMH], inplace=True)
df.reset_index(drop=True, inplace=True)
# d_h = df.head()
# d_t = df.tail()
# d_h.to_csv("kk_head.tsv", sep="\t")
# d_t.to_csv("kk_tail.tsv", sep="\t")
# Make assignseqcolumn # TODO: make new seqcolumnXXXXX_xxxx
# flag = 0
# if recomseqdmcolumn == "" or recomseqdmcolumn not in df:
# df["RECOMfiltered_type"] = "COMET"
# flag = 1
# recomseqdmcolumn = "placeholder"
# df[recomseqdmcolumn] = "placeholder"
if assignseqcolumn not in df:
df.insert(df.columns.get_loc(seqdmcolumn)+2, assignseqcolumn, np.nan)
df[assignseqcolumn] = df.apply(lambda x: format_seq(x[seqdmcolumn], x[col_DM], decimal_places), axis = 1)
#df[assignseqcolumn] = df.apply(lambda x: x[seqdmcolumn].split('[')[0] + '[' + str(round(x[col_DM], decimal_places)) + ']' + x[seqdmcolumn].split(']')[1], axis = 1)
# if flag == 1:
# df = df.drop(columns=["RECOMfiltered_type",recomseqdmcolumn])
logging.info("Writing output files...")
# https://towardsdatascience.com/the-best-format-to-save-pandas-data-414dca023e0d
# begin:printHDF5
# Note: Explote the Memory!!!
# assign NumExpr for the tables module
# tables.parameters.MAX_NUMEXPR_THREADS = args.n_workers
# df.to_hdf('data.h5', key='df', mode='w')
# end:printHDF5
if args.infile.strip().lower().endswith('.feather'):
outfile = args.infile[:-8] + '_PeakAssignation.feather'
df.to_feather(outfile)
else:
outfile = args.infile[:-8] + '_PeakAssignation.tsv'
df.to_csv(outfile, index=False, sep='\t', encoding='utf-8')
logging.info("Peak assignation finished.")
if __name__ == '__main__':
# multiprocessing.freeze_support()
# parse arguments
parser = argparse.ArgumentParser(
description='Peak Assignator',
epilog='''
Example:
python PeakAssignator.py
''')
defaultconfig = os.path.join(os.path.dirname(__file__), "config/SHIFTS.ini")
parser.add_argument('-i', '--infile', required=True, help='Input file with the peak picking')
parser.add_argument('-a', '--appfile', required=True, help='File with the apex list of Mass')
#parser.add_argument('-f', '--fwhm_filename', default='MAD_and_FWHM_calculations.txt', help='File name with the FWHM value (default: %(default)s)')
parser.add_argument('-c', '--config', default=defaultconfig, help='Path to custom config.ini file')
#parser.add_argument('-mn', '--mindelta', help='Minimum Delta Mass (default: %(default)s)')
#parser.add_argument('-mx', '--maxdelta', help='Maximum Delta Mass (default: %(default)s)')
parser.add_argument('-p', '--ppm', help='Maximum ppm difference for peak assignation')
parser.add_argument('-w', '--n_workers', type=int, default=4, help='Number of threads/n_workers (default: %(default)s)')
parser.add_argument('-v', dest='verbose', action='store_true', help="Increase output verbosity")
args = parser.parse_args()
# parse config
config = configparser.ConfigParser(inline_comment_prefixes='#')
config.read(args.config)
#if args.mindelta is not None:
#config.set('PeakAssignator', 'mindelta', str(args.mindelta))
#config.set('Logging', 'create_ini', '1')
#if args.maxdelta is not None:
#config.set('PeakAssignator', 'maxdelta', str(args.maxdelta))
#config.set('Logging', 'create_ini', '1')
if args.ppm is not None:
config.set('PeakAssignator', 'ppm_max', str(args.ppm))
config.set('Logging', 'create_ini', '1')
# if something is changed, write a copy of ini
if config.getint('Logging', 'create_ini') == 1:
with open(os.path.dirname(args.infile) + '/SHIFTS.ini', 'w') as newconfig:
config.write(newconfig)
# logging debug level. By default, info level
log_file = outfile = args.infile[:-8] + 'PeakAssignation_log.txt'
log_file_debug = outfile = args.infile[:-8] + 'PeakAssignation_log_debug.txt'
if args.verbose:
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
handlers=[logging.FileHandler(log_file_debug),
logging.StreamHandler()])
else:
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %I:%M:%S %p',
handlers=[logging.FileHandler(log_file),
logging.StreamHandler()])
# start main function
logging.info('start script: '+"{0}".format(" ".join([x for x in sys.argv])))
main(args)
logging.info('end script')