-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalcifer_downstream_sequence_modules.py
295 lines (279 loc) · 15.7 KB
/
calcifer_downstream_sequence_modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python
import os
from pyfaidx import Faidx
# module for functions regarding retrieving the circRNA sequence #
# get dict with the annotation for all exons, cds and 3'utrs
def mirna_annotation(gtf_file):
cds_annotation = {}
three_utr_annotation = {}
exon_annotation = {}
exon_endings = {}
# transform generic identifier to chromosome names
with open(gtf_file, "r") as anno:
for line in anno:
if line[0] != "#":
line_content = line.split()
if line_content[0] == "NC_000023.11":
line_content[0] = "chrX"
elif line_content[0] == "NC_000024.10":
line_content[0] = "chrY"
elif line_content[0] == "NC_000022.11":
line_content[0] = "chr22"
elif line_content[0] == "NC_000021.9":
line_content[0] = "chr21"
elif line_content[0] == "NC_000020.11":
line_content[0] = "chr20"
elif line_content[0] == "NC_000019.10":
line_content[0] = "chr19"
elif line_content[0] == "NC_000018.10":
line_content[0] = "chr18"
elif line_content[0] == "NC_000017.11":
line_content[0] = "chr17"
elif line_content[0] == "NC_000016.10":
line_content[0] = "chr16"
elif line_content[0] == "NC_000015.10":
line_content[0] = "chr15"
elif line_content[0] == "NC_000014.9":
line_content[0] = "chr14"
elif line_content[0] == "NC_000013.11":
line_content[0] = "chr13"
elif line_content[0] == "NC_000012.12":
line_content[0] = "chr12"
elif line_content[0] == "NC_000011.10":
line_content[0] = "chr11"
elif line_content[0] == "NC_000010.11":
line_content[0] = "chr10"
elif line_content[0] == "NC_000009.12":
line_content[0] = "chr9"
elif line_content[0] == "NC_000008.11":
line_content[0] = "chr8"
elif line_content[0] == "NC_000007.14":
line_content[0] = "chr7"
elif line_content[0] == "NC_000006.12":
line_content[0] = "chr6"
elif line_content[0] == "NC_000005.10":
line_content[0] = "chr5"
elif line_content[0] == "NC_000004.12":
line_content[0] = "chr4"
elif line_content[0] == "NC_000003.12":
line_content[0] = "chr3"
elif line_content[0] == "NC_000002.12":
line_content[0] = "chr2"
elif line_content[0] == "NC_000001.11":
line_content[0] = "chr1"
elif "chr" not in line_content[0]:
line_content[0] = "chr" + line_content[0]
information_type = line_content[2]
if int(line_content[3]) > int(line_content[4]):
feature_start = line_content[4]
feature_end = line_content[3]
else:
feature_start = line_content[3]
feature_end = line_content[4]
key_pos = line_content[0] + ":" + str(feature_start) + "-" + str(feature_end)
# get all annotated cds
if information_type == "CDS":
cds_annotation[key_pos] = '"' + key_pos + '"'
# get all annotated 3'utr
elif information_type == "three_prime_utr":
three_utr_annotation[key_pos] = '"' + key_pos + '"'
# get position of all exons
# also get exon numbers for circRNA naming
elif information_type == "exon":
if line_content[0] + ":" + str(feature_start) in exon_annotation:
if line_content[0] + ":" + str(feature_end) not in exon_annotation[line_content[0] + ":" +
str(feature_start)][0]:
exon_annotation[line_content[0] + ":" + str(feature_start)][0].append(line_content[0] + ":"
+ str(feature_end))
else:
exon_annotation[line_content[0] + ":" + str(feature_start)] = [[line_content[0] + ":" +
str(feature_end)],
line_content[17]]
exon_endings[line_content[0] + ":" + str(feature_end)] = [str(feature_start), line_content[17]]
return cds_annotation, three_utr_annotation, exon_annotation, exon_endings
# get all exons which are included in all found circRNAs #
# create fasta-files with the linear sequence, psuedo_circular sequence #
# (+ 25 bp from end to start and start to end) and multi cycle sequence (4 * linear seq) #
# also get sequence around bsj (250 bp + 25 bp into circRNA) for RBP analysis #
def circ_exon_seq(working_dir, gene_fasta, exon_anno, exon_endings):
genes = Faidx(gene_fasta)
circ_file = working_dir + "all_circs/two_unique_filtered.txt"
circ_rnas = {}
# keys are start or end position, every start/end can have multiple circRNAs (isoforms) #
# read in circRNAs with identifier as key #
with open(circ_file, "r") as circs_in:
with open(working_dir + "all_circs/circ_bsj_seq.fasta", "w") as bsj_seq:
for line in circs_in:
line_content = line.split()
all_pos = line_content[0].split(":")
chromosome = all_pos[0]
strand = line_content[0].split(":")[2]
positions = all_pos[1].split("-")
positions[0] = str(int(positions[0]) + 1)
circ_rnas[line_content[0]] = [positions[0], positions[1], line_content[1:]]
# get sequence around bsj +/- 250bp
header = "\n>" + line_content[0]
up_bsj = chromosome + ":" + str(int(positions[0]) - 250) + "-" + str(int(positions[0]) + 24)
down_bsj = chromosome + ":" + str(int(positions[1]) - 24) + "-" + str(int(positions[1]) + 250)
if strand == "+":
fasta_seq_up = str(genes.fetch(chromosome, (int(positions[0]) - 250), (int(positions[0]) + 24)))
full_fasta_seq_up = ""
full_fasta_seq_up += ''.join(fasta_seq_up)
fasta_seq_down = str(genes.fetch(chromosome, (int(positions[1]) - 24), (int(positions[1]) + 250)))
full_fasta_seq_down = ""
full_fasta_seq_down += ''.join(fasta_seq_down)
# -i option does not work maybe implement something else!
elif strand == "-":
fasta_seq_up = str(
genes.fetch(chromosome, (int(positions[0]) - 250), (int(positions[0]) + 24)).reverse.complement)
full_fasta_seq_up = ""
full_fasta_seq_up += ''.join(fasta_seq_up)
fasta_seq_down = str(
genes.fetch(chromosome, (int(positions[1]) - 24), (int(positions[1]) + 250)).reverse.complement)
full_fasta_seq_down = ""
full_fasta_seq_down += ''.join(fasta_seq_down)
bsj_seq.write(header + ":side1\n")
bsj_seq.write(full_fasta_seq_up)
bsj_seq.write(header + ":side2\n")
bsj_seq.write(full_fasta_seq_down)
circ_rna_seq_pos = {}
# set nonsense value for iteration failures
end_point = -1
start_exon = -1
pos = -1
for key in circ_rnas.keys():
circ_rna_starts = []
circ_rna_exons = {}
start = int(circ_rnas[key][0])
end = int(circ_rnas[key][1])
chrom = key.split(":")[0]
for pos in range(start, end):
# check for partial sequence of exon at circRNA start
if len(circ_rna_exons) == 0:
ident_pos_end = chrom + ":" + str(pos)
if ident_pos_end in exon_endings:
real_exon_start = exon_endings[ident_pos_end][0]
exon_number = exon_endings[ident_pos_end][1]
if int(real_exon_start) < pos:
start_exon = start
end_point = pos
circ_rna_starts.append(int(start_exon))
# add exon number here (and also if in exon start)
circ_rna_exons[start_exon] = [int(end_point), chrom + ":" + str(start_exon) + "-" +
str(end_point) + ":" + exon_number]
end_point = -1
ident_pos = chrom + ":" + str(pos)
# check for full exons in circRNA
if ident_pos in exon_anno:
start_exon = pos
end_point = 0
exon_ending_list = exon_anno[ident_pos][0]
exon_number = exon_anno[ident_pos][1]
# only take largest version of a given exon (i.e. largest end point inside a circ)
for i in exon_ending_list:
end_pos = int(i.split(":")[1])
if end >= end_pos > end_point:
end_point = end_pos
if end_point > pos:
circ_rna_starts.append(int(pos))
circ_rna_exons[pos] = [int(end_point), chrom + ":" + str(pos) + "-" + str(end_point) + ":"
+ exon_number]
# add partial exon sequence at the end of circRNA
if end_point == 0:
circ_rna_starts.append(int(start_exon))
circ_rna_exons[start_exon] = [int(pos), chrom + ":" + str(start_exon) + "-" + str(pos) + ":" + exon_number]
if len(circ_rna_starts) == 0:
circ_rna_starts.append(int(start))
circ_rna_exons[start] = [int(end), chrom + ":" + str(start) + "-" + str(end) + ":NA"]
circ_rna_seq_pos[key] = []
sorted_exon_starts = sorted(circ_rna_starts)
prev_exon_end = 0
circ_rna_end = int(key.split(":")[1].split("-")[1])
for exon_start in sorted_exon_starts:
putative_end = circ_rna_exons[exon_start][0]
if exon_start > prev_exon_end:
prev_exon_end = circ_rna_exons[exon_start][0]
circ_rna_seq_pos[key].append(circ_rna_exons[exon_start][1])
# there is the rare case that an exon in front of the circRNA end overlaps with the real last exon
# e.g. the backspliced exon. In this case we only consider the exon which was also involved in the
# back-splicing
elif putative_end == circ_rna_end and prev_exon_end != circ_rna_end:
circ_rna_seq_pos[key] = circ_rna_seq_pos[key][:-1]
prev_exon_end = circ_rna_exons[exon_start][0]
circ_rna_seq_pos[key].append(circ_rna_exons[exon_start][1])
# get linear circRNA sequence and exon numbers between start and end of each circRNA
with open(working_dir + "all_circs/circ_naming.txt", "w") as naming:
with open(working_dir + "all_circs/linear_seq.fasta", "w") as circ_seq_out:
for key in circ_rna_seq_pos.keys():
if len(circ_rna_seq_pos[key]) > 0:
exon_number_list = []
fasta_header = ">" + key
full_fasta_seq = ""
strand = key.split(":")[2]
if strand == "+":
for exon_pos in circ_rna_seq_pos[key]:
chromosome = exon_pos.split(":")[0]
position = exon_pos.split(":")[1]
start_pos = int(position.split("-")[0])
end_pos = int(position.split("-")[1])
if len(exon_pos.split(":")[2]) == 0:
exon_number = "NA"
elif "NA" not in exon_pos.split(":")[2]:
exon_number = int(exon_pos.split(":")[2][1:-2])
else:
exon_number = "NA"
# double check sequence content in comparing the exon number with all already added exons
# the sequence is only added if exon number is greater then all others and not already
# included in the sequence
exon_number_list.append(exon_number)
fasta_seq = str(genes.fetch(chromosome, start_pos, end_pos))
full_fasta_seq += ''.join(fasta_seq)
elif strand == "-":
for exon_pos in circ_rna_seq_pos[key]:
chromosome = exon_pos.split(":")[0]
position = exon_pos.split(":")[1]
start_pos = int(position.split("-")[0])
end_pos = int(position.split("-")[1])
if len(exon_pos.split(":")[2]) == 0:
exon_number = "NA"
elif "NA" not in exon_pos.split(":")[2]:
exon_number = int(exon_pos.split(":")[2][1:-2])
else:
exon_number = "NA"
# double check sequence content in comparing the exon number with all already added exons
# the sequence is only added if exon number is smaller then all others and not already
# included in the sequence
exon_number_list.append(exon_number)
fasta_seq = str(genes.fetch(chromosome, start_pos, end_pos).reverse.complement)
full_fasta_seq = ''.join((fasta_seq, full_fasta_seq))
circ_seq_out.write("\n" + fasta_header + "\n")
circ_seq_out.write(full_fasta_seq.upper())
final_exon_number_string = "(" + ','.join(str(e) for e in exon_number_list) + ")"
naming.write("\n" + key + "\t" + final_exon_number_string)
os.system("sed -i \'1d\' " + working_dir + "all_circs/linear_seq.fasta")
os.system("sed -i \'1d\' " + working_dir + "all_circs/circ_naming.txt")
with open(working_dir + "all_circs/linear_seq.fasta", "r") as circ:
with open(working_dir + "all_circs/pseudo_circular_seq.fasta", "w") as seq_out:
line_count = 0
for line1 in circ:
header = line1.replace("\n", "")
line2 = next(circ)
circ_line = line2.replace("\n", "")
seq = circ_line[-25:] + circ_line + circ_line[:25]
if line_count != 0:
seq_out.write("\n" + header + "\n" + seq)
else:
seq_out.write(header + "\n" + seq)
line_count += 1
with open(working_dir + "all_circs/linear_seq.fasta", "r") as circ:
with open(working_dir + "all_circs/multi_cycle_seq.fasta", "w") as seq_out:
line_count = 0
for line1 in circ:
header = line1.replace("\n", "")
line2 = next(circ)
seq = 4 * line2.replace("\n", "")
if line_count != 0:
seq_out.write("\n" + header + "\n" + seq)
else:
seq_out.write(header + "\n" + seq)
line_count += 1