-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathInputToFastq_bed2.py
186 lines (150 loc) · 5.27 KB
/
InputToFastq_bed2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import sys, argparse
import pyBigWig
from xplib.Annotation import Bed
from xplib import DBI
import os, string
from multiprocessing import *
rev_table=string.maketrans('ACGTacgtN', 'TGCATGCAN')
def ParseArg():
p=argparse.ArgumentParser( description = "Generate input file in fastq format for the Epi-evolutionary model." )
p.add_argument("q_region",type=str, nargs='+', help="Files containing coordinates of query regions in bed format. Note that the order should be the same as species.")
p.add_argument("-s", "--species", nargs=2, type=str, default=["hg38","rheMac8"], help="A list of genome assembly. Note that the list should have the same order as the peak files")
p.add_argument("-f","--fasta_path", type=str, default="/home/jialu/evm/Sequence", help="Path to the genome sequence files (.fa files).")
p.add_argument("--histone", nargs="+",type=str, default=["H3K4me3"], help="Name of histone modifications. Note that the list should have the same order as peak files.")
p.add_argument("--bg", type=str, help="File name. The file contains paths to ChIP-Seq peak calling files (.bed files).")
# p.add_argument("-t","--bg_cutoff", type=int, default=4, help="Cutoff for coverage")
p.add_argument("--s_path",type=str, default="samtools", help="Path to samtools")
p.add_argument("-p","--p_num",type=int, default=5,help="Number of processes to be used.")
p.add_argument("-o","--output",type=str,help="Output file name. ")
if len(sys.argv)==1:
print >>sys.stderr, p.print_help()
sys.exit(0)
return p.parse_args()
def overlap(bed1,bed2):
'''
Compute the overlap between two regions.
If the two regions overlap, return the length of the overlap, otherwise return False.
'''
if (bed1.stop>bed2.start) and (bed1.start<bed2.stop):
return min(bed1.stop,bed2.stop)-max(bed1.start,bed2.start)+1
else:
return False
def ReadHistones(fhist_name):
'''
Read ChIP-Seq peak files.
'''
sp1=[]
sp2=[]
with open(fhist_name,"r") as fhist:
fhist.readline()
while True:
line=fhist.readline().strip()
if "@" in line:
break
line=line.split("\t")
sp1.append(DBI.init(line[0],"bed"))
while True:
line=fhist.readline().strip()
if line=="":
break
line=line.split("\t")
sp2.append(DBI.init(line[0],"bed"))
return sp1,sp2
def revcomp(seq):
return seq.translate(rev_table)[::-1]
def fetchSeq(qr,fasta,s_path):
''' Fetch sequence from the genome sequence file.'''
region = '%s:%d-%d'%(qr.chr,qr.start,qr.stop-1)
seq="".join(os.popen(s_path+" faidx "+fasta+" "+region).read().split('\n')[1:])
if qr.strand=="-":
seq = revcomp(seq)
return seq.upper()
def fetchHistModSeq(qr, dbi):
''' Convert histone modification measurement to sequences of 1's and 0's based on the peak calling results.'''
#[a,b)
pointer=qr.start
hseq=""
interval_list=sorted(list(dbi.query(qr)))
for line in interval_list:
if not overlap(line,qr):
continue
bstart=line.start
bend=line.stop
if bstart<pointer:
hseq+="1"*(min(bend,qr.stop)-pointer)
elif bstart>pointer:
hseq+="0"*(bstart-pointer)+"1"*(min(bend,qr.stop)-bstart)
else:
hseq+="1"*(min(bend,qr.stop)-bstart)
pointer=min(bend,qr.stop)
if pointer!=qr.stop:
hseq+="0"*(qr.stop-pointer)
if qr.strand=="-":
hseq=hseq[::-1]
return hseq
def Generate_output_str(bed_pair):
'''
Generate region pairs. Note that the program will discard regions with N's in their sequences.
'''
bed1=bed_pair[0]
bed2=bed_pair[1]
output_list=[]
for s,sp,qrs in zip(args.species, [sp1,sp2], [bed1,bed2]):
output_list.append("@"+s+"_"+"_".join(args.histone)+"_"+bed1.id)
seq=fetchSeq(qrs,args.fasta_path.rstrip('/')+'/'+s+'.fa',args.s_path)
if "N" in seq:
output_list=[]
return
output_list.append(seq)
for fsp_name in sp:
output_list.append("+")
hseq=fetchHistModSeq(qrs, fsp_name)
if len(hseq)!=len(seq):
print bed1.id
print len(seq)
print len(hseq)
print >>sys.stderr,"Wrong hseq length!"
exit(0)
output_list.append(str(hseq))
if len(output_list)!=0:
return "\n".join(output_list)
def Main():
global args
args=ParseArg()
print >>sys.stderr, "Indexing..."
global sp1, sp2
sp1, sp2=ReadHistones(args.bg)
print >>sys.stderr, "Done indexing"
if len(sp1)!=len(sp2) or len(sp1)!=len(args.histone):
print >>sys.stderr, "Check the number of histone modifications!"
exit(0)
fout=open(args.output,"w")
input_list=[]
with open(args.q_region[0],"r") as fbed1, open(args.q_region[1],"r") as fbed2:
while True:
line1=fbed1.readline().strip().split("\t")
line2=fbed2.readline().strip().split("\t")
if line1[0]=="":
break
if line2[0]=="":
break
bed1=Bed(line1)
bed2=Bed(line2)
if bed1.chr!="chrX" and bed1.chr!="chrY":
try:
int(bed1.chr.lstrip("chr"))
except:
continue
if bed2.chr!="chrX" and bed2.chr!="chrY":
try:
int(bed2.chr.lstrip("chr"))
except:
continue
input_list.append((bed1, bed2))
p=Pool(args.p_num)
out_queue=p.map(Generate_output_str,input_list)
with open(args.output,"w") as fout:
for item in out_queue:
if item:
print >>fout, item
Main()