1
1
from collections import defaultdict
2
2
3
+ import idseq_dag .util .lineage as lineage
4
+
5
+ from idseq_dag .util .dict import open_file_db_by_extension
3
6
from idseq_dag .util .m8 import NT_MIN_ALIGNMENT_LEN
4
7
from idseq_dag .util .parsing import BlastnOutput6Reader , BlastnOutput6Writer
5
8
from idseq_dag .util .parsing import BlastnOutput6NTReader
@@ -56,6 +59,14 @@ def _intersects(needle, haystack):
56
59
''' Return True iff needle intersects haystack. Ignore overlap < NT_MIN_OVERLAP_FRACTION. '''
57
60
return any (_hsp_overlap (needle , hay ) for hay in haystack )
58
61
62
+ def _get_lineage (accession_id , lineage_map , accession2taxid_dict , lineage_cache = {}):
63
+ if accession_id in lineage_cache :
64
+ return lineage_cache [accession_id ]
65
+ accession_taxid = accession2taxid_dict .get (
66
+ accession_id .split ("." )[0 ], "NA" )
67
+ result = lineage_map .get (accession_taxid , lineage .NULL_LINEAGE )
68
+ lineage_cache [accession_id ] = result
69
+ return result
59
70
60
71
class BlastCandidate :
61
72
'''Documented in function get_top_m8_nt() below.'''
@@ -64,6 +75,7 @@ def __init__(self, hsps):
64
75
self .hsps = hsps
65
76
self .optimal_cover = None
66
77
self .total_score = None
78
+
67
79
if hsps :
68
80
# All HSPs here must be for the same query and subject sequence.
69
81
h_0 = hsps [0 ]
@@ -170,26 +182,33 @@ def _filter_and_group_hits_by_query(blast_output_path, min_alignment_length, min
170
182
171
183
172
184
# An iterator that, for contig, yields to optimal hit for the contig in the nt blast_output.
173
- def _optimal_hit_for_each_query_nt (blast_output , min_alignment_length , min_pident , max_evalue , summary = True ):
185
+ def _optimal_hit_for_each_query_nt (blast_output , lineage_map , accession2taxid_dict ,
186
+ min_alignment_length , min_pident , max_evalue , summary = True ):
174
187
contigs_to_blast_candidates = {}
175
188
accession_counts = defaultdict (lambda : 0 )
176
189
177
190
# For each contig, get the collection of blast candidates that have the best total score (may be multiple if there are ties).
178
191
for query_hits in _filter_and_group_hits_by_query (blast_output , min_alignment_length , min_pident , max_evalue ):
179
- best_hits = []
192
+ best_hits = {}
180
193
for _subject , hit_fragments in query_hits .items ():
181
194
# For NT, we take a special approach where we try to find a subset of disjoint HSPs
182
195
# with maximum sum of bitscores.
183
196
hit = BlastCandidate (hit_fragments )
184
197
hit .optimize ()
185
198
186
- if len (best_hits ) == 0 or best_hits [0 ].total_score < hit .total_score :
187
- best_hits = [hit ]
188
- # Add all ties to best_hits.
189
- elif len (best_hits ) > 0 and best_hits [0 ].total_score == hit .total_score :
190
- best_hits .append (hit )
199
+ # We prioritize the specificity of the hit; hits with species taxids are taken before hits without
200
+ # Specificity is just the index of the tuple returned by _get_lineage(); 0 for species, 1 for genus, etc.
201
+ lineage_taxids = _get_lineage (hit .sseqid , lineage_map , accession2taxid_dict )
202
+ specificity = next (level for level , taxid_at_level in enumerate (lineage_taxids ) if int (taxid_at_level ) > 0 )
191
203
192
- contigs_to_blast_candidates [best_hits [0 ].qseqid ] = best_hits
204
+ if (specificity not in best_hits ) or best_hits [specificity ][0 ].total_score < hit .total_score :
205
+ best_hits [specificity ] = [hit ]
206
+ # Add all ties to best_hits[specificity].
207
+ elif len (best_hits [specificity ]) > 0 and best_hits [specificity ][0 ].total_score == hit .total_score :
208
+ best_hits [specificity ].append (hit )
209
+
210
+ specific_best_hits = next (hits for specificity , hits in best_hits .items () if len (hits ) > 0 )
211
+ contigs_to_blast_candidates [specific_best_hits [0 ].qseqid ] = specific_best_hits
193
212
194
213
# Create a map of accession to blast candidate count.
195
214
for _contig_id , blast_candidates in contigs_to_blast_candidates .items ():
@@ -225,6 +244,8 @@ def get_top_m8_nr(
225
244
226
245
def get_top_m8_nt (
227
246
blast_output ,
247
+ lineage_map_path ,
248
+ accession2taxid_dict_path ,
228
249
blast_top_blastn_6_path ,
229
250
min_alignment_length = NT_MIN_ALIGNMENT_LEN ,
230
251
min_pident = _NT_MIN_PIDENT ,
@@ -251,7 +272,9 @@ def get_top_m8_nt(
251
272
252
273
# Output the optimal hit for each query.
253
274
254
- with open (blast_top_blastn_6_path , "w" ) as blast_top_blastn_6_f :
275
+ with open (blast_top_blastn_6_path , "w" ) as blast_top_blastn_6_f , \
276
+ open_file_db_by_extension (lineage_map_path , "lll" ) as lineage_map , \
277
+ open_file_db_by_extension (accession2taxid_dict_path , "L" ) as accession2taxid_dict : # noqa
255
278
BlastnOutput6NTRerankedWriter (blast_top_blastn_6_f ).writerows (
256
- _optimal_hit_for_each_query_nt (blast_output , min_alignment_length , min_pident , max_evalue , False )
279
+ _optimal_hit_for_each_query_nt (blast_output , lineage_map , accession2taxid_dict , min_alignment_length , min_pident , max_evalue , False )
257
280
)
0 commit comments