Skip to content

Commit 362fdeb

Browse files
committed
Use most specific hit NT top hits
1 parent 7e6d8a4 commit 362fdeb

File tree

2 files changed

+41
-11
lines changed

2 files changed

+41
-11
lines changed

lib/idseq-dag/idseq_dag/util/top_hits.py

+33-10
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from collections import defaultdict
22

3+
import idseq_dag.util.lineage as lineage
4+
5+
from idseq_dag.util.dict import open_file_db_by_extension
36
from idseq_dag.util.m8 import NT_MIN_ALIGNMENT_LEN
47
from idseq_dag.util.parsing import BlastnOutput6Reader, BlastnOutput6Writer
58
from idseq_dag.util.parsing import BlastnOutput6NTReader
@@ -56,6 +59,14 @@ def _intersects(needle, haystack):
5659
''' Return True iff needle intersects haystack. Ignore overlap < NT_MIN_OVERLAP_FRACTION. '''
5760
return any(_hsp_overlap(needle, hay) for hay in haystack)
5861

62+
def _get_lineage(accession_id, lineage_map, accession2taxid_dict, lineage_cache={}):
63+
if accession_id in lineage_cache:
64+
return lineage_cache[accession_id]
65+
accession_taxid = accession2taxid_dict.get(
66+
accession_id.split(".")[0], "NA")
67+
result = lineage_map.get(accession_taxid, lineage.NULL_LINEAGE)
68+
lineage_cache[accession_id] = result
69+
return result
5970

6071
class BlastCandidate:
6172
'''Documented in function get_top_m8_nt() below.'''
@@ -64,6 +75,7 @@ def __init__(self, hsps):
6475
self.hsps = hsps
6576
self.optimal_cover = None
6677
self.total_score = None
78+
6779
if hsps:
6880
# All HSPs here must be for the same query and subject sequence.
6981
h_0 = hsps[0]
@@ -170,26 +182,33 @@ def _filter_and_group_hits_by_query(blast_output_path, min_alignment_length, min
170182

171183

172184
# An iterator that, for contig, yields to optimal hit for the contig in the nt blast_output.
173-
def _optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue, summary=True):
185+
def _optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_dict,
186+
min_alignment_length, min_pident, max_evalue, summary=True):
174187
contigs_to_blast_candidates = {}
175188
accession_counts = defaultdict(lambda: 0)
176189

177190
# For each contig, get the collection of blast candidates that have the best total score (may be multiple if there are ties).
178191
for query_hits in _filter_and_group_hits_by_query(blast_output, min_alignment_length, min_pident, max_evalue):
179-
best_hits = []
192+
best_hits = {}
180193
for _subject, hit_fragments in query_hits.items():
181194
# For NT, we take a special approach where we try to find a subset of disjoint HSPs
182195
# with maximum sum of bitscores.
183196
hit = BlastCandidate(hit_fragments)
184197
hit.optimize()
185198

186-
if len(best_hits) == 0 or best_hits[0].total_score < hit.total_score:
187-
best_hits = [hit]
188-
# Add all ties to best_hits.
189-
elif len(best_hits) > 0 and best_hits[0].total_score == hit.total_score:
190-
best_hits.append(hit)
199+
# We prioritize the specificity of the hit; hits with species taxids are taken before hits without
200+
# Specificity is just the index of the tuple returned by _get_lineage(); 0 for species, 1 for genus, etc.
201+
lineage_taxids = _get_lineage(hit.sseqid, lineage_map, accession2taxid_dict)
202+
specificity = next(level for level, taxid_at_level in enumerate(lineage_taxids) if int(taxid_at_level) > 0)
191203

192-
contigs_to_blast_candidates[best_hits[0].qseqid] = best_hits
204+
if (specificity not in best_hits) or best_hits[specificity][0].total_score < hit.total_score:
205+
best_hits[specificity] = [hit]
206+
# Add all ties to best_hits[specificity].
207+
elif len(best_hits[specificity]) > 0 and best_hits[specificity][0].total_score == hit.total_score:
208+
best_hits[specificity].append(hit)
209+
210+
specific_best_hits = next(hits for specificity, hits in best_hits.items() if len(hits) > 0)
211+
contigs_to_blast_candidates[specific_best_hits[0].qseqid] = specific_best_hits
193212

194213
# Create a map of accession to blast candidate count.
195214
for _contig_id, blast_candidates in contigs_to_blast_candidates.items():
@@ -225,6 +244,8 @@ def get_top_m8_nr(
225244

226245
def get_top_m8_nt(
227246
blast_output,
247+
lineage_map_path,
248+
accession2taxid_dict_path,
228249
blast_top_blastn_6_path,
229250
min_alignment_length=NT_MIN_ALIGNMENT_LEN,
230251
min_pident=_NT_MIN_PIDENT,
@@ -251,7 +272,9 @@ def get_top_m8_nt(
251272

252273
# Output the optimal hit for each query.
253274

254-
with open(blast_top_blastn_6_path, "w") as blast_top_blastn_6_f:
275+
with open(blast_top_blastn_6_path, "w") as blast_top_blastn_6_f, \
276+
open_file_db_by_extension(lineage_map_path, "lll") as lineage_map, \
277+
open_file_db_by_extension(accession2taxid_dict_path, "L") as accession2taxid_dict: # noqa
255278
BlastnOutput6NTRerankedWriter(blast_top_blastn_6_f).writerows(
256-
_optimal_hit_for_each_query_nt(blast_output, min_alignment_length, min_pident, max_evalue, False)
279+
_optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_dict, min_alignment_length, min_pident, max_evalue, False)
257280
)

workflows/long-read-mngs/run.wdl

+8-1
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,8 @@ task RunNRAlignment {
644644
task FindTopHitsNT {
645645
input {
646646
File nt_m8
647+
File lineage_db
648+
File accession2taxid_db
647649
String docker_image_id
648650
}
649651
@@ -652,7 +654,12 @@ task FindTopHitsNT {
652654
python3 <<CODE
653655
from idseq_dag.steps.blast_contigs import get_top_m8_nt
654656
655-
get_top_m8_nt("~{nt_m8}", "gsnap.blast.top.m8")
657+
get_top_m8_nt(
658+
"~{nt_m8}",
659+
"~{lineage_db}",
660+
"~{accession2taxid_db}",
661+
"gsnap.blast.top.m8",
662+
)
656663
CODE
657664
658665
python3 - << 'EOF'

0 commit comments

Comments
 (0)