-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathMetrics.py
executable file
·265 lines (233 loc) · 11.4 KB
/
Metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/python3
# Copyright 2011-2014 Francisco Pina Martins <f.pinamartins@gmail.com>
# This file is part of 4Pipe4.
# 4Pipe4 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 4Pipe4 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with 4Pipe4. If not, see <http://www.gnu.org/licenses/>.
import re
import statistics

from pipeutils import FASTA_parser
def Read_metrics(fasta_file):
    '''Calculate length metrics from a fasta file.

    Returns a tuple (avg_len, max_len, median):
    avg_len -- average sequence length, as a string with two decimals
    max_len -- length of the longest sequence (int)
    median  -- median sequence length (middle element for an odd count,
               mean of the two middle elements for an even count)
    '''
    fasta_dict = FASTA_parser(fasta_file)
    lengths = [len(seq) for seq in fasta_dict.values()]
    max_len = max(lengths)
    avg_len = "%.2f" % (sum(lengths) / len(lengths))
    # statistics.median reproduces the previous hand-rolled computation
    # exactly: middle element when odd, average of the two middle when even.
    median = statistics.median(lengths)
    return (avg_len, max_len, median)
def Read_qual_metrics(qual_file):
    '''Return the average base quality from a fasta.qual file.

    Every non-header line (i.e. not starting with ">") holds whitespace
    separated integer quality scores; the result is the mean of all
    scores, as a string with two decimals.
    '''
    quals = []
    # Context manager guarantees the file is closed even on error
    # (the original leaked the handle if a line failed to parse).
    with open(qual_file, 'r') as qual:
        for line in qual:
            if not line.startswith(">"):
                quals.extend(map(int, line.strip().split()))
    return "%.2f" % (sum(quals) / len(quals))
def Dataset_gather(seqclean_report_file, origin_fasta_file, clean_fasta_file,
                   origin_qual_file, clean_qual_file):
    '''Gather cleaning-report text and read/quality metrics for a dataset.

    Extracts the summary section of the seqclean report (from the first
    "*****" line up to and including the " by 'UniVec'" line), then
    computes length metrics and average base qualities for both the
    original and the cleaned data.

    Returns (report_text_lines, origin_metrics, clean_metrics,
             origin_qual_avg, clean_qual_avg).
    '''
    text = []
    in_summary = False
    # 'with' replaces the original unclosed file handle.
    with open(seqclean_report_file, 'r') as clean_report:
        for line in clean_report:
            if line.startswith("*****"):
                in_summary = True
            if line.startswith(" by 'UniVec'"):
                text.append(line)
                break
            if in_summary:
                text.append(line)
    origin_metrics = Read_metrics(origin_fasta_file)
    clean_metrics = Read_metrics(clean_fasta_file)
    origin_qual_avg = Read_qual_metrics(origin_qual_file)
    clean_qual_avg = Read_qual_metrics(clean_qual_file)
    return (text, origin_metrics, clean_metrics, origin_qual_avg,
            clean_qual_avg)
def Contig_gather(mira_report):
    '''Gather contig statistics from the mira assembly info report.

    Values reported once per file (reads assembled, average total
    coverage, largest contig) are stored as scalars; the per-section
    statistics are appended to lists in the order their sections appear
    in the report (mira prints one block per contig subset).

    Returns (reads_assembled, total_cov, total_contigs, total_consensus,
             largest_contig, N50, N90, N95, max_coverage, avg_qual),
    all values as strings.
    '''
    total_contigs = []
    total_consensus = []
    N50 = []
    N90 = []
    N95 = []
    max_coverage = []
    avg_qual = []
    # 'with' closes the report deterministically; raw strings fix the
    # invalid-escape warnings the plain '\d' patterns produced.
    with open(mira_report, 'r') as report:
        for line in report:
            if line.startswith("Num. reads assembled:"):
                reads_assembled = re.search(r'\d*$', line).group()
            if line.startswith(" Avg. total coverage:"):
                total_cov = re.search(r'\d.*$', line).group()
            if line.startswith(" Number of contigs:"):
                total_contigs.append(re.search(r'\d.*$', line).group())
            if line.startswith(" Total consensus:"):
                total_consensus.append(re.search(r'\d*$', line).group())
            if line.startswith(" Largest contig:"):
                largest_contig = re.search(r'\d.*$', line).group()
            if line.startswith(" N50 contig size:"):
                N50.append(re.search(r'\d*$', line).group())
            if line.startswith(" N90 contig size:"):
                N90.append(re.search(r'\d*$', line).group())
            if line.startswith(" N95 contig size:"):
                N95.append(re.search(r'\d*$', line).group())
            if line.startswith(" Max coverage (total):"):
                max_coverage.append(re.search(r'\d*$', line).group())
            if line.startswith(" Average consensus quality:"):
                avg_qual.append(re.search(r'\d*$', line).group())
    return (reads_assembled, total_cov, total_contigs, total_consensus,
            largest_contig, N50, N90, N95, max_coverage, avg_qual)
def SNP_gather(snp_file, orf_file):
    '''Gather SNP statistics from the SNP and ORF fasta files.

    Each "#" character in a SNP-fasta title marks one SNP.  Returns
    (amount, num_contigs, avg_snps_per_contig, max_contig_size,
     min_contig_size, avg_contig_size, snps_in_orfs).
    '''
    snps = FASTA_parser(snp_file)
    amount = sum(title.count("#") for title in snps)
    num_contigs = len(snps)
    avg_snps_per_contig = "%.2f" % (amount / num_contigs)
    contig_sizes = list(map(len, snps.values()))
    max_contig_size = max(contig_sizes)
    min_contig_size = min(contig_sizes)
    avg_contig_size = "%.2f" % (sum(contig_sizes) / len(contig_sizes))
    # Count how many SNPs fall inside ORFs: rebuild each SNP's position
    # from the ORF title (the "[NNN" start plus every "NNN:" offset) and
    # key it as "<contig name>;<position>" so duplicates collapse via set().
    # Raw strings fix the invalid-escape warnings of the old '\d' patterns.
    orfs = FASTA_parser(orf_file)
    contig_names = []
    for title in orfs.keys():
        start = int(re.search(r'\[\d*', title).group()[1:])
        offsets = map(int, re.findall(r'\d+(?=:)', title))
        name = re.sub(r' .*$', '', title)
        for offset in offsets:
            if "REVERSE" in title:
                # Reverse-strand ORF: positions count back from start
                # (the "+ 2" correction matches the original formula).
                position = start - offset + 2
            else:
                position = start + offset
            contig_names.append(name + ';' + str(position))
    snps_in_orfs = len(set(contig_names))
    return (amount, num_contigs, avg_snps_per_contig, max_contig_size,
            min_contig_size, avg_contig_size, snps_in_orfs)
def _write_contig_stats(out, contig_info, column):
    '''Write one column of contig statistics (0: contigs >= 1000 bp, 1: all).'''
    out.write("<p>Total number of contigs formed: "
              + str(contig_info[2][column]) + "</p>\n")
    out.write("<p>Total sum of consensus bases: "
              + str(contig_info[3][column]) + "</p>\n")
    out.write("<p>N50: " + str(contig_info[5][column]) + "</p>\n")
    out.write("<p>N90: " + str(contig_info[6][column]) + "</p>\n")
    out.write("<p>N95: " + str(contig_info[7][column]) + "</p>\n")
    out.write("<p>Depth of maximum coverage zone: "
              + str(contig_info[8][column]) + "</p>\n")
    out.write("<p>Average quality of bases: "
              + str(contig_info[9][column]) + "</p>\n")
def Metrics_writer(dataset_info, contig_info, snp_info, metrics_file):
    '''Write all gathered metrics into an HTML report file.

    dataset_info -- tuple from Dataset_gather(), or the sentinel string
                    "solexa" when no read-cleaning metrics are available
    contig_info  -- tuple from Contig_gather()
    snp_info     -- tuple from SNP_gather()
    metrics_file -- path of the HTML file to (over)write
    '''
    # 'with' replaces the explicit open/close and no longer shadows the
    # metrics_file parameter with the file object.
    with open(metrics_file, 'w') as out:
        out.write('<HTML>\n <HEAD>\n <META HTTP-EQUIV="CONTENT-'
                  'TYPE" CONTENT="text/html; charset=utf-8">\n '
                  '<TITLE>4Pipe4 Metrics Report</TITLE> \n <STYLE>\n'
                  ' <!-- \n BODY,DIV,TABLE,THEAD,TBODY,TFOOT,TR,TH,'
                  'TD,P { font-family:"Arial"; font-size:small }\n'
                  ' -->\n </STYLE>\n </HEAD>\n<BODY>\n')
        out.write("<H1>4Pipe4 metrics report:</H1>\n")
        # Dataset metrics exist only for 454 runs; Run_as_solexa() passes
        # the sentinel "solexa" instead.  BUGFIX: the sentinel used to be
        # compared against the misspelling "solxa", so solexa runs wrongly
        # entered this section and crashed indexing into the string.
        if dataset_info != "solexa":
            out.write("<H2>Dataset metrics:</H2>\n")
            out.write("<p>Average read length (before cleaning): "
                      + str(dataset_info[1][0]) + "</p>\n")
            out.write("<p>Maximum read length (before cleaning): "
                      + str(dataset_info[1][1]) + "</p>\n")
            out.write("<p>Median of read length (before cleaning): "
                      + str(dataset_info[1][2]) + "</p>\n")
            out.write("<p>Average base quality (before cleaning): "
                      + str(dataset_info[3]) + "</p>\n")
            out.write("<H3>SeqClean report:</H3>")
            for report_line in dataset_info[0]:
                out.write("<p>" + report_line + "</p>")
            out.write("<p>Average read length (after cleaning): "
                      + str(dataset_info[2][0]) + "</p>\n")
            out.write("<p>Maximum read length (after cleaning): "
                      + str(dataset_info[2][1]) + "</p>\n")
            out.write("<p>Median of read length (after cleaning): "
                      + str(dataset_info[2][2]) + "</p>\n")
            out.write("<p>Average base quality (after cleaning): "
                      + str(dataset_info[4]) + "</p>\n")
        out.write("<H2>Contig metrics:</H2>\n")
        out.write("<p>Number of reads assembled: "
                  + str(contig_info[0]) + "</p>\n")
        out.write("<p>Total coverage: " + str(contig_info[1]) + "</p>\n")
        out.write("<p>Length of largest contig: "
                  + str(contig_info[4]) + "</p>\n")
        out.write("<H3>Contigs with a length of at least 1000 bp:</H3>\n")
        _write_contig_stats(out, contig_info, 0)
        out.write("<H3>All contigs:</H3>\n")
        _write_contig_stats(out, contig_info, 1)
        out.write("<H2>SNP metrics:</H2>\n")
        out.write("<p>Number of SNPs found: " + str(snp_info[0])
                  + "</p>\n")
        out.write("<p>Number of contigs with SNPs: " + str(snp_info[1])
                  + "</p>\n")
        out.write("<p>Average SNPs per contig with SNPs: "
                  + str(snp_info[2]) + "</p>\n")
        out.write("<p>Largest contig length with at least one SNP: "
                  + str(snp_info[3]) + "</p>\n")
        out.write("<p>Shortest contig length with at least one SNP: "
                  + str(snp_info[4]) + "</p>\n")
        out.write("<p>Average length of contigs containing SNPs: "
                  + str(snp_info[5]) + "</p>\n")
        out.write("<p>Number of SNPs in ORFs: " + str(snp_info[6])
                  + "</p>\n")
        out.write("</BODY>\n</HTML>\n")
def Run_module(seqclean_log_file, original_fasta_file, clean_fasta_file,
               original_fasta_qual_file, clean_fasta_qual_file,
               info_assembly_file, snps_fasta_file, bestorf_fasta_file,
               metrics_file):
    """Run the full metrics pipeline for 454 data and write the report.

    Gathers dataset, contig and SNP information (in that order) and hands
    everything to Metrics_writer().
    """
    Metrics_writer(
        Dataset_gather(seqclean_log_file, original_fasta_file,
                       clean_fasta_file, original_fasta_qual_file,
                       clean_fasta_qual_file),
        Contig_gather(info_assembly_file),
        SNP_gather(snps_fasta_file, bestorf_fasta_file),
        metrics_file)
def Run_as_solexa(info_assembly_file, snps_fasta_file, bestorf_fasta_file,
                  metrics_file):
    """Run the metrics pipeline for solexa data and write the report.

    Solexa runs have no read-cleaning step, so the sentinel string
    "solexa" stands in for the dataset information.
    """
    Metrics_writer("solexa",
                   Contig_gather(info_assembly_file),
                   SNP_gather(snps_fasta_file, bestorf_fasta_file),
                   metrics_file)
if __name__ == "__main__":
    # Usage: python3 Metrics.py (view Run_module() for a list of arguments)
    # The ten positional command-line arguments map 1:1, in order, onto
    # Run_module()'s parameters; a missing argument raises IndexError.
    from sys import argv
    Run_module(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7],
               argv[8], argv[9], argv[10])