Skip to content

Commit 593c7c7

Browse files
committed
Use most specific hit NR top hits
1 parent 362fdeb commit 593c7c7

File tree

2 files changed

+37
-11
lines changed

2 files changed

+37
-11
lines changed

lib/idseq-dag/idseq_dag/util/top_hits.py

+25-10
Original file line numberDiff line numberDiff line change
@@ -117,23 +117,34 @@ def summary(self):
117117
return r
118118

119119

120-
def _optimal_hit_for_each_query_nr(blast_output_path, max_evalue):
120+
def _optimal_hit_for_each_query_nr(blast_output_path, lineage_map, accession2taxid_dict, max_evalue):
121121
contigs_to_best_alignments = defaultdict(list)
122122
accession_counts = defaultdict(lambda: 0)
123123

124124
with open(blast_output_path) as blastn_6_f:
125125
# For each contig, get the alignments that have the best total score (may be multiple if there are ties).
126+
# Prioritize the specificity of the hit.
127+
specificity_to_best_alignments = defaultdict(dict)
126128
for alignment in BlastnOutput6Reader(blastn_6_f):
127129
if alignment["evalue"] > max_evalue:
128130
continue
129131
query = alignment["qseqid"]
130-
best_alignments = contigs_to_best_alignments[query]
131132

132-
if len(best_alignments) == 0 or best_alignments[0]["bitscore"] < alignment["bitscore"]:
133-
contigs_to_best_alignments[query] = [alignment]
133+
lineage_taxids = _get_lineage(alignment["sseqid"], lineage_map, accession2taxid_dict)
134+
specificity = next((level for level, taxid_at_level in enumerate(lineage_taxids) if int(taxid_at_level) > 0), float("inf"))
135+
136+
best_alignments = specificity_to_best_alignments[query]
137+
138+
if (specificity not in best_alignments) or best_alignments[specificity][0]["bitscore"] < alignment["bitscore"]:
139+
specificity_to_best_alignments[query][specificity] = [alignment]
134140
# Add all ties to best_hits.
135-
elif len(best_alignments) > 0 and best_alignments[0]["bitscore"] == alignment["bitscore"]:
136-
contigs_to_best_alignments[query].append(alignment)
141+
elif len(best_alignments[specificity]) > 0 and best_alignments[specificity][0]["bitscore"] == alignment["bitscore"]:
142+
specificity_to_best_alignments[query][specificity].append(alignment)
143+
144+
# Choose the best alignments with the most specific taxid information.
145+
for contig_id, specificity_alignment_dict in specificity_to_best_alignments.items():
146+
specific_best_alignments = next(specificity_alignment_dict[specificity] for specificity in sorted(specificity_alignment_dict.keys()))
147+
contigs_to_best_alignments[contig_id] = specific_best_alignments
137148

138149
# Create a map of accession to best alignment count.
139150
for _contig_id, alignments in contigs_to_best_alignments.items():
@@ -199,15 +210,15 @@ def _optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_di
199210
# We prioritize the specificity of the hit; hits with species taxids are taken before hits without
200211
# Specificity is just the index of the tuple returned by _get_lineage(); 0 for species, 1 for genus, etc.
201212
lineage_taxids = _get_lineage(hit.sseqid, lineage_map, accession2taxid_dict)
202-
specificity = next(level for level, taxid_at_level in enumerate(lineage_taxids) if int(taxid_at_level) > 0)
213+
specificity = next((level for level, taxid_at_level in enumerate(lineage_taxids) if int(taxid_at_level) > 0), float("inf"))
203214

204215
if (specificity not in best_hits) or best_hits[specificity][0].total_score < hit.total_score:
205216
best_hits[specificity] = [hit]
206217
# Add all ties to best_hits[specificity].
207218
elif len(best_hits[specificity]) > 0 and best_hits[specificity][0].total_score == hit.total_score:
208219
best_hits[specificity].append(hit)
209220

210-
specific_best_hits = next(hits for specificity, hits in best_hits.items() if len(hits) > 0)
221+
specific_best_hits = next(best_hits[specificity] for specificity in sorted(best_hits.keys()))
211222
contigs_to_blast_candidates[specific_best_hits[0].qseqid] = specific_best_hits
212223

213224
# Create a map of accession to blast candidate count.
@@ -232,13 +243,17 @@ def _optimal_hit_for_each_query_nt(blast_output, lineage_map, accession2taxid_di
232243

233244
def get_top_m8_nr(
234245
blast_output,
246+
lineage_map_path,
247+
accession2taxid_dict_path,
235248
blast_top_blastn_6_path,
236249
max_evalue=MAX_EVALUE_THRESHOLD,
237250
):
238251
''' Get top m8 file entry for each contig from blast_output and output to blast_top_m8 '''
239-
with open(blast_top_blastn_6_path, "w") as blast_top_blastn_6_f:
252+
with open(blast_top_blastn_6_path, "w") as blast_top_blastn_6_f, \
253+
open_file_db_by_extension(lineage_map_path, "lll") as lineage_map, \
254+
open_file_db_by_extension(accession2taxid_dict_path, "L") as accession2taxid_dict: # noqa
240255
BlastnOutput6Writer(blast_top_blastn_6_f).writerows(
241-
_optimal_hit_for_each_query_nr(blast_output, max_evalue)
256+
_optimal_hit_for_each_query_nr(blast_output, lineage_map, accession2taxid_dict, max_evalue)
242257
)
243258

244259

workflows/long-read-mngs/run.wdl

+12-1
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,8 @@ task FindTopHitsNT {
687687
task FindTopHitsNR {
688688
input {
689689
File nr_m8
690+
File lineage_db
691+
File accession2taxid_db
690692
String docker_image_id
691693
}
692694
@@ -695,7 +697,12 @@ task FindTopHitsNR {
695697
python3 <<CODE
696698
from idseq_dag.steps.blast_contigs import get_top_m8_nr
697699
698-
get_top_m8_nr("~{nr_m8}", "rapsearch2.blast.top.m8")
700+
get_top_m8_nr(
701+
"~{nr_m8}",
702+
"~{lineage_db}",
703+
"~{accession2taxid_db}",
704+
"rapsearch2.blast.top.m8"
705+
)
699706
CODE
700707
701708
python3 - << 'EOF'
@@ -1406,12 +1413,16 @@ workflow czid_long_read_mngs {
14061413
call FindTopHitsNT {
14071414
input:
14081415
nt_m8 = RunNTAlignment.nt_m8,
1416+
lineage_db = lineage_db,
1417+
accession2taxid_db = accession2taxid_db,
14091418
docker_image_id = docker_image_id,
14101419
}
14111420
14121421
call FindTopHitsNR {
14131422
input:
14141423
nr_m8 = RunNRAlignment.nr_m8,
1424+
lineage_db = lineage_db,
1425+
accession2taxid_db = accession2taxid_db,
14151426
docker_image_id = docker_image_id,
14161427
}
14171428

0 commit comments

Comments
 (0)