-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtelME.py
executable file
·800 lines (636 loc) · 27.6 KB
/
telME.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
#!/usr/bin/env python
__version__ = '0.1.0'
import os
import sys
import csv
import time
import string
import subprocess as subP
from itertools import product
from copy import deepcopy
from multiprocessing import Pool, cpu_count, Value, Manager
from math import log, cosh, exp
from optparse import OptionParser, OptionGroup
# dependencies
from numpy import std, mean
_time_init = time.time()
"""
generate telomere counts from bam files
by
A) aligning TTAGGGx6 or CCCTAAx6
B) counting TTAGGG (or CCCTAA the reverse-complement)
and correcting that for total read-depth and on-target rate
"""
comp = string.maketrans("ACGTNacgtn-", "TGCANtgcan-")
def revcomp(s):
return s[::-1].translate(comp)
class MATCHBED:
# given a bed file look for match
# with a target region
def __init__ ( self, bedFile, bam_header) :
self.bed = bedFile
self.bin = 10**5
self.db = dict()
self.bam_header = bam_header
self.is_target = self.bed_is_target
try:
# if bedfile is not a file
self.parseBed()
self.validate_contigs()
except IOError:
self.is_target = self.wg_is_target
def validate_contigs( self ):
header = set(self.bam_header).discard("*")
if len(set(self.db.keys()) - set(self.bam_header)) == 0:
reporting.reportInfo("contigs check passed")
return True
# if not; not good
print "ERROR: bam file headers and bed file contigs do not match"
print "CONTIGS in bedfile", set(self.db.keys())
print "CONTIGS in bamfile", set(self.bam_header)
print "unmatched in bamfile", set(self.db.keys()) - set(self.bam_header)
print "ERROR: Unable to continue, fix the file headers."
sys.exit()
def parseBed ( self ) :
with open(self.bed) as inf:
for line in inf:
line = line.strip().split("\t")
try:
C, S, E = line[:3]
S = int(S)
E = int(E)
mS = S / self.bin
mE = E / self.bin
try:
self.db[C].setdefault(mS, list()).append((S, E))
except KeyError:
self.db[C] = {mS : [(S, E)]}
if mS != mE :
self.db[C].setdefault(mE, list()).append((S, E))
except ValueError:
print "WARNING: improper bed line", line
def wg_is_target( self, C, S, E ):
return False
def bed_is_target ( self, C, S, E ):
if C in self.db:
mS = S / self.bin
for start, end in self.db[C].get(mS, list()):
if start > E:
return False
elif start <= S < end or start < E <= end or S <= start <= end <= E:
return True
mE = E / self.bin
if mS != mE :
for start, end in self.db[C].get(mE, list()):
if start > E:
return False
elif start <= S < end or start < E <= end or S <= start <= end <= E:
return True
return False
class REPORTING:
def __init__( self, silence ):
self.silence = silence
self.start_main = time.time()
try:
rows, columns = os.popen('stty size', 'r').read().split()
except ValueError:
columns = "80"
self.column = int(columns)
def gethms( self, S):
days, remain = divmod( S , (3600*24) )
hours, remain = divmod( remain , 3600 )
minutes, seconds = divmod( remain , 60 )
return ':'.join(['%d%s'%(p,q) for p,q in zip((days, hours, minutes, seconds), ('d', 'h', 'm', 's')) if p > 0 ])
def report_progress( self ):
R = reads_done.value
E = (time.time() - self.start_main)
self.synopsis('Processed %s reads in %s [%.0f reads/sec]'%(self.splitthousands(str(R)), self.gethms(E), (R/E)))
time.sleep(1)
return
def synopsis( self, text ):
if not self.silence:
return
sys.stderr.write('\r%s' % ''.join([text, ' '*(self.column-len(text))]))
sys.stderr.flush()
return
def splitthousands( self, s, sep=','):
if len(s) <= 3:
return s
return self.splitthousands(s[:-3], sep) + sep + s[-3:]
def reportInfo( self, s):
print >> sys.stdout, "INFO:", s
def reportError(self, s):
print >> sys.stdout, "ERROR:", s
class CALCTEL:
def __init__(self, opts):
self.MT = opts.mintel
self.CNT = opts.count / 6.
self.PHRED = opts.phred
self.QUAL = opts.qual
if self.MT.isdigit():
self.MT = int(self.MT)
self.mintel = self.mintel_abs
else:
self.MT = float(self.MT) / 6.
self.mintel = self.mintel_frac
if not self.QUAL:
self.get_read_qual = bool
else:
self.get_read_qual = self.getReadQual
if opts.dups:
self.isDup = self.isDups
self.corr_dups = self.corrDups
else:
self.isDup = self.noDups
self.corr_dups = self.corrNoDups
if opts.CC == '1':
def correctionC(F_OFFT):
# use an exponential decay correction
return 10**(-1*F_OFFT**2)
elif opts.CC == '2':
def correctionC(F_OFFT):
# use a exponential correction
return (1 - F_OFFT)**-1
elif opts.CC == '3':
def correctionC(F_OFFT):
# use an hyperbolic cosine correction
return cosh(F_OFFT*5)
elif opts.CC == '5':
def correctionC(F_OFFT):
# use a fifth order polynomial
return 1-(10*F_OFFT**3-15*F_OFFT**4+6*F_OFFT**5)
elif opts.CC == '6':
def correctionC(F_OFFT):
# use a third order polynomial
return 1-(3*F_OFFT**2-2*F_OFFT**3)
elif opts.CC == '7':
def correctionC(F_OFFT):
# exponential decay correction
return (1.0-F_OFFT)**3
elif opts.CC == '8':
def correctionC(F_OFFT):
# exponential decay correction
return exp(-F_OFFT*3)
else:
# this is the default and CC=4
def correctionC(F_OFFT):
return 10**(F_OFFT**2)**4
self.telcount = self.telcountA
self.correctionC = correctionC
if opts.type == 'adv':
# change any two bases at a time in the first three positions
self.Freps = set(["".join(p) for p in product(('T',), 'ACGT', 'ACGT',('GGG',))] + ["".join(p) for p in product('ACGT',('T',), 'ACGT', ('GGG',))] + ["".join(p) for p in product('ACGT', 'ACGT', ('TGGG',))])
self.Rreps = [revcomp(p) for p in self.Freps]
elif opts.type == 'complex':
# change any one base at a time in the first three positions
self.Freps = set(["".join(p) for p in product('ACGT', ('TAGGG',))] + ["".join(p) for p in product(('T',), 'ACGT', ('AGGG',))] + ["".join(p) for p in product(('TT',),'ACGT', ('GGG',))])
self.Rreps = [revcomp(p) for p in self.Freps]
elif opts.type == 'simple':
# only second position variablity
self.Freps = ['TCAGGG', 'TGAGGG', 'TTAGGG', 'TAAGGG']
self.Rreps = [revcomp(p) for p in self.Freps]
elif opts.type == 'basic':
# only canonical variant sequence
self.telcount = self.telcountB
self.Freps = 'TTAGGG'
self.Rreps = 'CCCTAA'
def isDups( self, s ):
return 'd' in s
def noDups( self, s ):
# nothing is a duplicate
return 0
def corrDups( self, Vt, Vl, Vm, D ):
# correct for percent duplication
P_DUPS = 1. - D
return Vt*P_DUPS, Vl*P_DUPS, Vm*P_DUPS
def corrNoDups( self, Vt, Vl, Vm, D ):
# correct for percent duplication
return Vt, Vl, Vm
def getReadQual( self, qual ):
# given a list of qual values
# determine whether its pass or fail (30*6)
return bool(sum([ord(p)-self.PHRED for p in qual])/(len(qual)*self.QUAL))
def lookupRG( self, read, RG='null' ):
for p in read[10:]:
if p.startswith('RG:Z:'):
return p.split('RG:Z:')[1]
return RG
def mintel_frac(self, read, repeat):
# calculate relative fraction repeat based and Read-Lenght
if read.count(repeat) >= len(read) * self.MT:
return True
return False
def mintel_abs(self, read, repeat):
# compare absolute repeat counts in read
if read.count(repeat) >= self.MT:
return True
return False
def count_tel(self, cnt, read):
if cnt >= len(read) * self.CNT:
return True
return False
def telcountA(self, Seq, Qual ):
# count anything but basic
# check if repeat
# minimum exact canonical-repeats
# then, use a read-length correction
for reps, R in zip((self.Freps, self.Rreps), ('TTAGGG', 'CCCTAA')):
if self.mintel(Seq, R):
telrepeats = dict((p, Seq.count(p)) for p in reps)
cnt = sum(telrepeats.values())
if self.count_tel(cnt, Seq):
return 1., cnt, [p for p,q in telrepeats.items() for i in xrange(q)]
return 0., 0., list()
def telcountB(self, Seq, Qual ):
# count only basic
# check if repeat
# use a read-length correction
for R in (self.Freps, self.Rreps):
cnt = Seq.count(R)
if self.count_tel(cnt, Seq):
return 1., cnt, [R] * cnt
return 0., 0., list()
def CORR(self, TELR, EXPF, ALLT, ONT, F_OFFT, CHRM, DUPS):
CC = self.correctionC(F_OFFT)
Vt = 10**6 * (TELR / (ALLT-ONT)) / CC
# each repeat is 6-bases long, so you could multiply by 6
Vl = 10**6 * (EXPF / (ALLT-ONT)) / CC
try:
Vm = 10**6 * (TELR / CHRM / ALLT)
except ZeroDivisionError:
Vm = 0
return self.corr_dups(Vt, Vl, Vm, DUPS)
# get working
parser = OptionParser(usage="%prog -f [-b -t]", version=__version__ )
groups = OptionGroup(parser, "User input options")
groups.add_option("-f", dest="file",
help="path/to/infile.bam or comma-separated list of files to be merged and counted (only single-threaded supported for merged) [REQUIRED]")
groups.add_option("-b", dest="bedfile",
help="/path/to/targets.bed or whole genome 'wg' [%default]", default='wg')
groups.add_option('-t', dest='threads', type='int',
help="use multiprocessing based on read groups and each chr use 1 for non-mp, 0 for debug mp everthing else is multi-threaded [%default]",
default=8)
groups.add_option("-o", dest="outdir",
help="path/to/outdir/ [%default]",
default=os.getcwd())
parser.add_option_group(groups)
groups = OptionGroup(parser, "Advanced calibration options 'NOT recommended to alter unless you are recalibrating'")
groups.add_option("--count", dest="count",
metavar=' ', type='float',
help="number of repeats to count as a fraction of read-length [%default]",
default=0.8)
groups.add_option("--mintel", dest="mintel",
metavar=' ', type='str',
help="minimum fraction (ie:0.3) or count (ie:2) of canonical TTAGGG repeats in read [%default]",
default='2')
groups.add_option('--CC', dest='CC',
metavar=' ',
help="correction coefficient formula to use (1, 2, 3, 4, 5, 6, 7, 8) [%default]",
default='6')
groups.add_option('--qual', dest='qual',
metavar=' ', type='int',
help="the avg base quality score cutoff for good reads [%default]",
default=20)
groups.add_option('--type', dest='type', choices=('basic', 'simple', 'complex', 'adv'),
metavar=' ', type='choice',
help='counting approach; basic=TTAGGG, simple=TNAGGG, complex="TTNGGG,TNAGGG,NTAGGG" derivates total of 10", adv="TNNGGG,NTNGGG,NNAGGG" derivates total of 37" [%default]',
default='complex')
groups.add_option('--phred', dest='phred',
metavar=' ', type='int',
help="the quality score offset [%default]",
default=33)
groups.add_option('--dups', dest='dups',
metavar=' ', action='store_true',
help="correct for PCR/optical duplicates [%default]",
default=True)
parser.add_option_group(groups)
groups = OptionGroup(parser, "Runtime options")
groups.add_option('--verbose', dest='verbose',
metavar=' ', action='store_true',
help="print counts [%default]",
default=False)
groups.add_option('--samtools', dest='samtools',
metavar=' ',
help="/path/to/samtools [%default]",
default='samtools')
groups.add_option('--prefix', dest='prefix',
metavar=' ',
help="prefix for output name [%default]",
default='')
parser.add_option_group(groups)
(option, arg)= parser.parse_args()
reporting = REPORTING(option.verbose)
reporting.reportInfo("python PID %s"%os.getpid())
need_SP_merge = False
if ',' in option.file:
option.threads = 1
need_SP_merge = True
elif not option.file.startswith('ftp') and not os.path.isfile(option.file):
reporting.reportError("need to provide an input.bam file")
reporting.reportError("%s is not a valid file"%option.file)
sys.exit(1)
if option.bedfile == None or option.bedfile.upper() == 'WG':
option.bedfile = 'wg'
reporting.reportInfo("running whole genome calculation")
elif not os.path.isfile(option.bedfile):
reporting.reportError("bed file %s not located" %option.bedfile)
sys.exit(1)
if not option.file.startswith('ftp'):
# check .bam.bai file
# rename .bai file
if not os.path.isfile(option.file+'.bai'):
if os.path.isfile(option.file.replace('.bam', '.bai')):
os.rename(option.file.replace('.bam', '.bai'),
option.file+'.bai')
else:
subP.call([option.samtools, 'index', option.file],
stdout=subP.PIPE, bufsize=0)
# collect contig names from bam file
Chr_list = sorted([p.strip().split('\t')[0] for p in subP.Popen([option.samtools, 'idxstats', option.file.split(',')[0]], stdout=subP.PIPE, bufsize=0).stdout])
reporting.reportInfo("Bamfile contig count is %s"%len(Chr_list))
# These are the types of information that we are collecting
keepCountsDB = {'allR' : 1.0, 'ontR' : 0.0, 'telR' : 0.0,
'chrM' : 0.0, 'telU' : 0.0, 'telC': 0.0,
'dups' : 0.0, 'freq' : dict()}
# 1 'allR' : cnt of all reads in the read group
# 2 'ontR' : cnt of on target reads in the read group
# 3 'telR' : cnt of telomeric reads in the read group
# 4 'chrM' : cnt of chrM reads in the read group
# 5 'telU' : cnt of telomeric repeats from the telR unmapped reads in the read group
# 6 'telC' : cnt of telomeric repeat from the telR reads in the read group
# 'dups' : cnt of duplicate reads in the read group
# 'freq' : cnt of individual telomeric hexamers in the read group
def counter_SP( merge ):
global tabixtools
if merge:
flist = option.file.split(',')
# read samtools view in tandem for all the files
reporting.reportInfo("reading from %s files in tandem" % len(flist))
stmt = list()
for each in flist:
stmt.extend([option.samtools, 'view', '-X', each, ';'])
bamF = subP.Popen(" ".join(stmt).rstrip(";"), shell=True, stdout=subP.PIPE, bufsize=0)
else:
reporting.synopsis("starting %s" %sname[:40])
bamF = subP.Popen([option.samtools, 'view', '-X', option.file],
stdout=subP.PIPE, bufsize=0)
RGdata = dict()
for cnt, read in enumerate(bamF.stdout, 1):
read = read.strip().split("\t")
Qual = read[10]
if calctel.get_read_qual(Qual):
Seq = read[9].upper()
Chr = read[2]
RG = calctel.lookupRG(read)
# add to total read-group reads
try:
RGdata[RG]['allR'] += 1.
except KeyError:
RGdata[RG] = deepcopy(keepCountsDB)
RGdata[RG]['dups'] += calctel.isDup(read[1])
if Chr == '*':
# unmapped
tc, freq, telrepeats = calctel.telcount(Seq, Qual)
RGdata[RG]['telR'] += tc
RGdata[RG]['telU'] += tc
RGdata[RG]['telC'] += freq
for trep in telrepeats:
RGdata[RG]['freq'][trep] = RGdata[RG]['freq'].get(trep, 0) + 1
else:
# mapped
Spos = int(read[3])
Epos = Spos + len(Seq)
if Chr in ('chrM', 'chrMT', 'MT', 'M'):
RGdata[RG]['chrM'] += 1.
elif tabixtools.is_target(Chr, Spos, Epos):
# add to total on-target-reads
RGdata[RG]['ontR'] += 1.
else:
tc, freq, telrepeats = calctel.telcount(Seq, Qual)
RGdata[RG]['telR'] += tc
RGdata[RG]['telC'] += freq
for trep in telrepeats:
RGdata[RG]['freq'][trep] = RGdata[RG]['freq'].get(trep, 0) + 1
reporting.synopsis('Processed %s reads in %s'%(reporting.splitthousands(str(cnt)), reporting.gethms(time.time()-_time_init)))
reads_done.value = cnt
bamF.terminate()
return RGdata
def counter_MP( Chr ):
global tabixtools
# collect read-count, on-target-reads, telomeric reads, chrM reads, telomeric reads in unmapped, tel_repeat_cnt
reporting.synopsis("starting %s with %s" %(sname[:25], Chr))
cnt = 0
RGdata = dict()
if Chr in ('chrM', 'chrMT', 'MT', 'M'):
# chrM reads
bamF = subP.Popen([option.samtools, 'view', '-X',
option.file, Chr],
stdout=subP.PIPE, bufsize=0)
for cnt, read in enumerate(bamF.stdout, 1):
read = read.strip().split("\t")
Qual = read[10]
if calctel.get_read_qual(Qual):
RG = calctel.lookupRG(read)
Spos = int(read[3])
Epos = Spos + len(Qual)
# add to total read-group reads
try:
RGdata[RG]['allR'] += 1.
except KeyError:
RGdata[RG] = deepcopy(keepCountsDB)
RGdata[RG]['dups'] += calctel.isDup(read[1])
if tabixtools.is_target(Chr, Spos, Epos):
# add to total on-target-reads
RGdata[RG]['ontR'] += 1.
RGdata[RG]['chrM'] += 1.
elif Chr == '*':
# unmapped
bamF = subP.Popen([option.samtools, 'view', '-X',
'-f4', option.file],
stdout=subP.PIPE, bufsize=0)
for cnt, read in enumerate(bamF.stdout, 1):
read = read.strip().split("\t")
Qual = read[10]
if calctel.get_read_qual(Qual):
Seq = read[9].upper()
RG = calctel.lookupRG(read)
# add to total read-group reads
try:
RGdata[RG]['allR'] += 1.
except KeyError:
RGdata[RG] = deepcopy(keepCountsDB)
RGdata[RG]['dups'] += calctel.isDup(read[1])
# check if telomeric repeat
tc, freq, telrepeats = calctel.telcount(Seq, Qual)
RGdata[RG]['telR'] += tc
RGdata[RG]['telU'] += tc
RGdata[RG]['telC'] += freq
for trep in telrepeats:
RGdata[RG]['freq'][trep] = RGdata[RG]['freq'].get(trep, 0) + 1
else:
# mapped
bamF = subP.Popen([option.samtools, 'view', '-X', option.file, Chr],
stdout=subP.PIPE, bufsize=0)
for cnt, read in enumerate(bamF.stdout, 1):
read = read.strip().split("\t")
Qual = read[10]
if calctel.get_read_qual(Qual):
Seq = read[9].upper()
Chr = read[2]
Spos = int(read[3])
Epos = Spos + len(Seq)
RG = calctel.lookupRG(read)
# add to total read-group reads
try:
RGdata[RG]['allR'] += 1.
except KeyError:
RGdata[RG] = deepcopy(keepCountsDB)
RGdata[RG]['dups'] += calctel.isDup(read[1])
if tabixtools.is_target(Chr, Spos, Epos):
# add to total on-target-reads
RGdata[RG]['ontR'] += 1.
else:
tc, freq, telrepeats = calctel.telcount(Seq, Qual)
RGdata[RG]['telR'] += tc
RGdata[RG]['telC'] += freq
for trep in telrepeats:
RGdata[RG]['freq'][trep] = RGdata[RG]['freq'].get(trep, 0) + 1
bamF.terminate()
# add to total read-group reads
reads_done.value += cnt
reporting.report_progress()
return RGdata
def merge_RGdata(s):
global header
# first check for data type, should be a dict
if type(s) is not dict:
# collect all the outputs from the multiprocesses
# they are tuples of dicts of dicts
TelRepeats = dict()
RGdata = dict()
for each_dict in s:
# there are multiple RG tags with dict of values
for k,v in each_dict.iteritems():
TR = v.pop('freq') # the telrepeat dict
try:
for i in v.keys():
RGdata[k][i] += v[i]
for l,m in TR.items():
TelRepeats[k][l] = TelRepeats[k].get(l, 0) + m
except KeyError:
RGdata[k] = v
TelRepeats[k] = TR
else:
RGdata = s
TelRepeats = dict((p, RGdata[p].pop('freq')) for p in RGdata)
# for each read-group in the bam file generate the fraction
outlist = list()
for_ont = list() # calc mean based on off-target tel read cnt
for_chrm = list() # calc mean based on chrM ratio
for_ontC = list() # calc mean based on off-target telomere repeat cnt
TelRepeats_all = dict()
dup_rates = list() # calc mean duplication rate
ont_rates = list() # calc mean on-target rate
for RG in RGdata:
ALLT = RGdata[RG]['allR']
ONT = RGdata[RG]['ontR']
TELR = RGdata[RG]['telR']
CHRM = RGdata[RG]['chrM']
TELUN = RGdata[RG]['telU']
EXPF = RGdata[RG]['telC']
DUPS = RGdata[RG]['dups'] / ALLT
# ASSUME: the better the capture efficiency
# the worse the telomeric fraction
# i.e. if your on-target is 0 you should assume that
# the telomeric count represents WGS so take it as is.
F_OFFT = ONT / ALLT
Vt, Vl, Vm = calctel.CORR(TELR, EXPF, ALLT, ONT, F_OFFT, CHRM, DUPS)
if option.verbose:
print RG, RGdata[RG]
print Vt, Vl, Vm
for_ont.append(Vt)
for_ontC.append(Vl)
for_chrm.append(Vm)
dup_rates.append(DUPS)
ont_rates.append(F_OFFT)
TelRepeats_RG = dict()
# combine reverse-complement sequences
for T in TelRepeats[RG].keys():
tcnt = TelRepeats[RG][T]
# look for rev-comp sequences
if T.startswith('CCC'):
T = revcomp(T)
TelRepeats_all[T] = TelRepeats_all.get(T, 0) + tcnt
TelRepeats_RG[T] = TelRepeats_RG.get(T, 0) + tcnt
TRcnt = float(sum(TelRepeats_RG.values()))
TR = sorted(TelRepeats_RG.items(), key=lambda x: x[1], reverse=True)
TRstr = "\t".join(["%s:%.4f" %(p, q/TRcnt) for p,q in TR])
outlist.append('\t'.join([RG, '%i'%ALLT, '%i'%ONT, '%i'%TELR, '%i'%CHRM, '%i'%TELUN, '%i'%EXPF, str(Vt), str(Vm), str(Vl), '%.2f'%DUPS, TRstr ]) + '\n')
TRcnt = float(sum(TelRepeats_all.values()))
TR = sorted(TelRepeats_all.items(), key=lambda x: x[1], reverse=True)
TRstr = "\t".join(["%s:%.4f" %(p, q/TRcnt) for p,q in TR if q/TRcnt > 0.001])
outlist.append('\n' + '\t'.join([sname, 'MEAN+-SD',
'RTPMO-w/offt',
str(mean(for_ont)),
str(std(for_ont)),
'RTPKM-w/chrM',
str(mean(for_chrm)),
str(std(for_chrm)),
'RTPMO-w/offtCNT',
str(mean(for_ontC)),
str(std(for_ontC)),
'ONTARGET-RATE',
str(mean(ont_rates)),
str(std(ont_rates)),
'DUP-RATE',
str(mean(dup_rates)),
str(std(dup_rates)),
TRstr]) + '\n')
return outlist
option.outdir = os.path.realpath(option.outdir)
if not os.path.isdir(option.outdir):
os.makedirs(option.outdir)
# start processing
calctel = CALCTEL(option)
tabixtools = MATCHBED(option.bedfile, Chr_list)
sname = os.path.basename(option.file).split(',')[0].split('.bam')[0]
reads_done = Value('i', 0)
if option.threads == 1:
reporting.reportInfo("Running single-threaded")
D = counter_SP(need_SP_merge)
else:
print "INFO: Running multi-threaded",
# start with unmapped, since it may take the most
# then, add the rest of the contigs
if option.threads == 0:
print "debug-mode"
# debug mp mode
D = [counter_MP(p) for p in Chr_list]
else:
print "with %s" %option.threads
jobP = Pool(option.threads)
jobR = jobP.map_async(counter_MP, Chr_list)
jobP.close()
jobP.join()
D = jobR.get()
output_data = merge_RGdata(D)
outname = "" if not len(option.prefix) else "%s_"%option.prefix
outname += '_'.join([sname, "telcount_by", str(option.count), option.type, str(option.qual), option.CC, str(option.mintel)])
outname += "" if not option.dups else "_dups"
outname += '.tsv'
with open(os.path.join(option.outdir, outname), "w") as outf:
outf.write('\t'.join(['SAMPLE', sname]) + '\n')
outf.write('\t'.join(['TARGET', option.bedfile]) + '\n')
cmd = " ".join(["%s=%s"%p for p in sorted(option.__dict__.items())])
outf.write('\t'.join(['CMD', sys.argv[0] + " %s"%cmd]) + '\n')
outf.write('\t'.join(['RG', 'ALL', 'ONTARGET', 'TELOMERE',
'CHRM', 'TEL-UNMAPPED', 'CNT_TEL_LEN',
'RTPMO-w/offt',
'RTPKM-w/chrM',
'RTPMO-w/offtCNT',
'FRACTION_DUPS',
'TELOMERE_SEQ_FOUND']) + '\n')
outf.writelines(output_data)
CNT = reads_done.value
TIME = time.time()-_time_init
reporting.reportInfo("Process completed: %s reads in %s [%s r/s]" %(reporting.splitthousands(str(CNT)), reporting.gethms(TIME), reporting.splitthousands(str(int(float(CNT)/TIME)))))