-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmerge_read_pairs.py
133 lines (105 loc) · 3.5 KB
/
merge_read_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Different strategies of Read [Pairs]
#
# 1. Read and mate mapped to same chromosome
# 2. Read and mate mapped to different chromosome
# 3. Read is mapped but mate is not mapped ( it could be either R1 or R2 )
# 4. Read is singleton ( It could be R1 or R2)
# Usage: python merge_read_pairs.py <input bam> < output sam>
import pysam,sys
read_length=80
insert_cutoff=2*read_length #Double the number of read length
# Merge the paired end reads. if there is overlap, ( logic to find overlap is if the insert length is less than the sum of read lengths of two reads), stitch them.
def mergeReadPairs(read,mate,insertlength):
insertlength=abs(insertlength) # To get the positive number for insert_length
if (insertlength>=insert_cutoff):
return read+mate
if (insertlength<insert_cutoff):
overlap=insert_cutoff-insertlength
pad_seq="N"*overlap
return read+mate[overlap:]+pad_seq
def handleSingletons(read):
pad_seq="N"*read_length
return read+pad_seq
def write_to_bam():
pass
filename = sys.argv[1]
outfile_name=sys.argv[2]
reads = pysam.Samfile(filename)
header=reads.header
outfile = pysam.AlignmentFile(outfile_name, "wh", header=header)
a = pysam.AlignedSegment()
def write_bam(query_name,query_seq,query_flag,query_ref_name,query_ref_start,query_mapping_qual,query_cigar):
a.query_name=query_name
a.query_sequence=query_seq
a.flag=query_flag
a.reference_id=query_ref_name
a.reference_start=query_ref_start
a.mapping_quality=query_mapping_qual
a.cigar=[(0,160)]
a.next_reference_id=0
a.next_reference_start=0
a.template_length=0
a.query_qualities="N"*160
outfile.write(a)
for read in reads:
if read.is_paired and read.is_read1:
#This will work if read 1 is mapped but read2 is unmapped.
if read.mate_is_unmapped:
write_bam(read.qname,handleSingletons(read.seq),read.flag,read.tid,read.reference_start,read.mapping_quality,read.cigartuples)
pos = reads.tell()
try:
mate = reads.mate(read)
except ValueError:
continue
finally:
reads.seek(pos)
#Strategy 1 when both the read and mate mapped to same CHR. Sometimes, insert length may be less than the read length, discard them
if (read.tid == mate.tid) and abs(read.tlen)>len(read.seq):
if read.is_reverse:
write_bam(read.qname,
mergeReadPairs(mate.seq,read.seq,read.tlen),
read.flag,read.tid,
mate.reference_start,
read.mapping_quality,
read.cigartuples)
elif mate.is_reverse:
write_bam(read.qname,
mergeReadPairs(read.seq,mate.seq,read.tlen),
read.flag,read.tid,
read.reference_start,
read.mapping_quality,
read.cigartuples)
#Strategy 2 when the mate mapped to different chromosome
elif read.tid != mate.tid:
write_bam(read.qname,
handleSingletons(read.seq),
read.flag,
read.tid,
read.reference_start,
read.mapping_quality,
read.cigartuples)
write_bam(mate.qname,
handleSingletons(mate.seq),
mate.flag,
mate.tid,
mate.reference_start,
mate.mapping_quality,
mate.cigartuples)
elif not read.is_paired:
write_bam(read.qname,
handleSingletons(read.seq),
read.flag,
read.tid,
read.reference_start,
read.mapping_quality,
read.cigartuples)
#This will work if read 2 is mapped but read 1 is unmapped.
if read.is_paired and read.is_read2 and read.mate_is_unmapped:
write_bam(read.qname,
handleSingletons(read.seq),
read.flag,
read.tid,
read.reference_start,
read.mapping_quality,
read.cigartuples)
outfile.close()