@@ -117,23 +117,34 @@ def summary(self):
117
117
return r
118
118
119
119
120
- def _optimal_hit_for_each_query_nr (blast_output_path , max_evalue ):
120
+ def _optimal_hit_for_each_query_nr (blast_output_path , lineage_map , accession2taxid_dict , max_evalue ):
121
121
contigs_to_best_alignments = defaultdict (list )
122
122
accession_counts = defaultdict (lambda : 0 )
123
123
124
124
with open (blast_output_path ) as blastn_6_f :
125
125
# For each contig, get the alignments that have the best total score (may be multiple if there are ties).
126
+ # Prioritize the specificity of the hit.
127
+ specificity_to_best_alignments = defaultdict (dict )
126
128
for alignment in BlastnOutput6Reader (blastn_6_f ):
127
129
if alignment ["evalue" ] > max_evalue :
128
130
continue
129
131
query = alignment ["qseqid" ]
130
- best_alignments = contigs_to_best_alignments [query ]
131
132
132
- if len (best_alignments ) == 0 or best_alignments [0 ]["bitscore" ] < alignment ["bitscore" ]:
133
- contigs_to_best_alignments [query ] = [alignment ]
133
+ lineage_taxids = _get_lineage (alignment ["sseqid" ], lineage_map , accession2taxid_dict )
134
+ specificity = next ((level for level , taxid_at_level in enumerate (lineage_taxids ) if int (taxid_at_level ) > 0 ), float ("inf" ))
135
+
136
+ best_alignments = specificity_to_best_alignments [query ]
137
+
138
+ if (specificity not in best_alignments ) or best_alignments [specificity ][0 ]["bitscore" ] < alignment ["bitscore" ]:
139
+ specificity_to_best_alignments [query ][specificity ] = [alignment ]
134
140
# Add all ties to best_hits.
135
- elif len (best_alignments ) > 0 and best_alignments [0 ]["bitscore" ] == alignment ["bitscore" ]:
136
- contigs_to_best_alignments [query ].append (alignment )
141
+ elif len (best_alignments [specificity ]) > 0 and best_alignments [specificity ][0 ]["bitscore" ] == alignment ["bitscore" ]:
142
+ specificity_to_best_alignments [query ][specificity ].append (alignment )
143
+
144
+ # Choose the best alignments with the most specific taxid information.
145
+ for contig_id , specificity_alignment_dict in specificity_to_best_alignments .items ():
146
+ specific_best_alignments = next (specificity_alignment_dict [specificity ] for specificity in sorted (specificity_alignment_dict .keys ()))
147
+ contigs_to_best_alignments [contig_id ] = specific_best_alignments
137
148
138
149
# Create a map of accession to best alignment count.
139
150
for _contig_id , alignments in contigs_to_best_alignments .items ():
@@ -199,15 +210,15 @@ def _optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_di
199
210
# We prioritize the specificity of the hit; hits with species taxids are taken before hits without
200
211
# Specificity is just the index of the tuple returned by _get_lineage(); 0 for species, 1 for genus, etc.
201
212
lineage_taxids = _get_lineage (hit .sseqid , lineage_map , accession2taxid_dict )
202
- specificity = next (level for level , taxid_at_level in enumerate (lineage_taxids ) if int (taxid_at_level ) > 0 )
213
+ specificity = next (( level for level , taxid_at_level in enumerate (lineage_taxids ) if int (taxid_at_level ) > 0 ), float ( "inf" ) )
203
214
204
215
if (specificity not in best_hits ) or best_hits [specificity ][0 ].total_score < hit .total_score :
205
216
best_hits [specificity ] = [hit ]
206
217
# Add all ties to best_hits[specificity].
207
218
elif len (best_hits [specificity ]) > 0 and best_hits [specificity ][0 ].total_score == hit .total_score :
208
219
best_hits [specificity ].append (hit )
209
220
210
- specific_best_hits = next (hits for specificity , hits in best_hits .items () if len ( hits ) > 0 )
221
+ specific_best_hits = next (best_hits [ specificity ] for specificity in sorted ( best_hits .keys ()) )
211
222
contigs_to_blast_candidates [specific_best_hits [0 ].qseqid ] = specific_best_hits
212
223
213
224
# Create a map of accession to blast candidate count.
@@ -232,13 +243,17 @@ def _optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_di
232
243
233
244
def get_top_m8_nr(
    blast_output,
    lineage_map_path,
    accession2taxid_dict_path,
    blast_top_blastn_6_path,
    max_evalue=MAX_EVALUE_THRESHOLD,
):
    ''' Select the top alignment(s) for each contig in blast_output and write them to blast_top_blastn_6_path.

    Arguments:
        blast_output: path of the BLAST output file; forwarded unchanged to
            _optimal_hit_for_each_query_nr, which opens and parses it.
        lineage_map_path: path opened with open_file_db_by_extension(..., "lll");
            the resulting lineage map is used to rank hits by taxonomic specificity.
        accession2taxid_dict_path: path opened with open_file_db_by_extension(..., "L");
            the resulting mapping is passed along with the lineage map for lineage lookups.
        blast_top_blastn_6_path: output path; opened in "w" mode, so any existing
            file at that path is overwritten.
        max_evalue: alignments whose evalue exceeds this threshold are ignored.

    NOTE(review): the "lll" / "L" arguments look like value-format codes for the
    file-backed databases -- confirm against open_file_db_by_extension.
    '''
    # All three resources are held by a single `with`, so the output file and both
    # lookup databases are released together, even if hit selection raises.
    with open(blast_top_blastn_6_path, "w") as blast_top_blastn_6_f, \
         open_file_db_by_extension(lineage_map_path, "lll") as lineage_map, \
         open_file_db_by_extension(accession2taxid_dict_path, "L") as accession2taxid_dict:  # noqa
        BlastnOutput6Writer(blast_top_blastn_6_f).writerows(
            _optimal_hit_for_each_query_nr(blast_output, lineage_map, accession2taxid_dict, max_evalue)
        )
243
258
244
259
0 commit comments