This repository has been archived by the owner on Nov 17, 2021. It is now read-only.
forked from StanfordBioinformatics/Scoring
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalc_pbc.py
executable file
·72 lines (67 loc) · 2.16 KB
/
calc_pbc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
# Calculates the "PCR bottlenecking coefficient" (PBC), a measure of
# library complexity.
#
# Let m_distinct be the number of genomic locations to which at least one
# read maps; let m_1 be the number of locations to which exactly one read
# maps. PBC = m_1 / m_distinct.
#
# Note that this differs somewhat from the "nonredundant fraction" (NRF)
# metric described in Landt et al. (2012), which is defined as "the ratio
# between the number of positions in the genome that uniquely mappable
# reads map to and the total number of uniquely mappable reads".
#
# This code assumes the mappings file has already been filtered for only
# uniquely mappable reads.
#
# The input is a file containing mappings in either SAM or ELAND format.
# ELAND format is detected by the input filename ending in '_eland.txt'. If
# this suffix is not present, SAM format is assumed.
import sys
def main(read_file, output_file=None, name=None):
rf = open(read_file, 'r')
chrs = {}
# Number of genomic locations to which exactly one read maps
m_1 = 0
# Number of genome locations to which at least one read maps
m_distinct = 0
if not name:
name = read_file.split('/')[-1]
for line in rf:
if line.startswith('@'):
continue
fields = line.rstrip('\n').split('\t')
if read_file.endswith('_eland.txt'):
chr = fields[6]
start = int(fields[7])
else:
chr = fields[2]
start = int(fields[3])
if chr not in chrs:
chrs[chr] = {start:1}
m_1 += 1
m_distinct += 1
elif start not in chrs[chr]:
chrs[chr][start] = 1
m_1 += 1
m_distinct += 1
else:
chrs[chr][start] += 1
if chrs[chr][start] == 2:
m_1 -= 1
if not output_file:
print name, m_1, m_distinct, float(m_1) / float(m_distinct)
else:
of = open(output_file, 'a')
of.write('\t'.join([name, str(m_1), str(m_distinct), str(float(m_1) / float(m_distinct)),]) + '\n')
of.close()
if __name__ == '__main__':
if len(sys.argv) == 2:
main(sys.argv[1])
if len(sys.argv) == 3:
main(sys.argv[1], sys.argv[2])
if len(sys.argv) == 4:
main(sys.argv[1], sys.argv[2], sys.argv[3])
else:
print "Usage: calc_pbc.py <SAM or eland file> [output_stats_file] [name]"
raise SystemExit(1)