Skip to content

Commit eb75a2b

Browse files
Commit message: Add new required inputs to PipelineStepBlastContigs
Parent commit: 593c7c7 · This commit: eb75a2b

File tree

3 files changed: 21 additions and 13 deletions.

lib/idseq-dag/idseq_dag/steps/blast_contigs.py

+12-10
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,15 @@ def run(self):
9494
command.write_text_to_file('[]', contig_summary_json)
9595
return # return in the middle of the function
9696

97+
lineage_db = s3.fetch_reference(
98+
self.additional_files["lineage_db"],
99+
self.ref_dir_local,
100+
allow_s3mi=False) # Too small to waste s3mi
101+
102+
accession2taxid_dict = s3.fetch_reference(self.additional_files["accession2taxid"], self.ref_dir_local)
103+
97104
(read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
98-
PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
105+
PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8, lineage_db, accession2taxid_dict)
99106
read2contig = {}
100107
generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path=duplicate_cluster_sizes_path)
101108

@@ -106,11 +113,6 @@ def run(self):
106113
refined_hit_summary, refined_m8)
107114

108115
# Generating taxon counts based on updated results
109-
lineage_db = s3.fetch_reference(
110-
self.additional_files["lineage_db"],
111-
self.ref_dir_local,
112-
allow_s3mi=False) # Too small to waste s3mi
113-
114116
deuterostome_db = None
115117
if self.additional_files.get("deuterostome_db"):
116118
deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
@@ -264,7 +266,7 @@ def update_read_dict(read2contig, blast_top_blastn_6_path, read_dict, accession_
264266
return (consolidated_dict, read2blastm8, contig2lineage, added_reads)
265267

266268
@staticmethod
267-
def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8):
269+
def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8, lineage_db, accession2taxid_dict):
268270
blast_type = 'nucl'
269271
blast_command = 'blastn'
270272
command.execute(
@@ -308,10 +310,10 @@ def run_blast_nt(blast_index_path, blast_m8, assembled_contig, reference_fasta,
308310
)
309311
)
310312
# further processing of getting the top m8 entry for each contig.
311-
get_top_m8_nt(blast_m8, blast_top_m8)
313+
get_top_m8_nt(blast_m8, lineage_db, accession2taxid_dict, blast_top_m8)
312314

313315
@staticmethod
314-
def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8):
316+
def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta, blast_top_m8, lineage_db, accession2taxid_dict):
315317
blast_type = 'prot'
316318
blast_command = 'blastx'
317319
command.execute(
@@ -349,4 +351,4 @@ def run_blast_nr(blast_index_path, blast_m8, assembled_contig, reference_fasta,
349351
)
350352
)
351353
# further processing of getting the top m8 entry for each contig.
352-
get_top_m8_nr(blast_m8, blast_top_m8)
354+
get_top_m8_nr(blast_m8, lineage_db, accession2taxid_dict, blast_top_m8)

workflows/short-read-mngs/postprocess.wdl

+7-2
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ task BlastContigs_refined_gsnap_out {
157157
File assembly_nt_refseq_fasta
158158
File duplicate_cluster_sizes_tsv
159159
File lineage_db
160+
File accession2taxid
160161
File taxon_blacklist
161162
File deuterostome_db
162163
Boolean use_deuterostome_filter
@@ -171,7 +172,7 @@ task BlastContigs_refined_gsnap_out {
171172
--input-files '[["~{gsnap_out_gsnap_m8}", "~{gsnap_out_gsnap_deduped_m8}", "~{gsnap_out_gsnap_hitsummary_tab}", "~{gsnap_out_gsnap_counts_with_dcr_json}"], ["~{assembly_contigs_fasta}", "~{assembly_scaffolds_fasta}", "~{assembly_read_contig_sam}", "~{assembly_contig_stats_json}"], ["~{assembly_nt_refseq_fasta}"], ["~{duplicate_cluster_sizes_tsv}"]]' \
172173
--output-files '["assembly/gsnap.blast.m8", "assembly/gsnap.reassigned.m8", "assembly/gsnap.hitsummary2.tab", "assembly/refined_gsnap_counts_with_dcr.json", "assembly/gsnap_contig_summary.json", "assembly/gsnap.blast.top.m8"]' \
173174
--output-dir-s3 '~{s3_wd_uri}' \
174-
--additional-files '{"lineage_db": "~{lineage_db}", "taxon_blacklist": "~{taxon_blacklist}", "deuterostome_db": "~{if use_deuterostome_filter then '~{deuterostome_db}' else ''}"}' \
175+
--additional-files '{"lineage_db": "~{lineage_db}", "accession2taxid": "~{accession2taxid}", "taxon_blacklist": "~{taxon_blacklist}", "deuterostome_db": "~{if use_deuterostome_filter then '~{deuterostome_db}' else ''}"}' \
175176
--additional-attributes '{"db_type": "nt", "use_taxon_whitelist": ~{use_taxon_whitelist}}'
176177
>>>
177178
output {
@@ -204,6 +205,7 @@ task BlastContigs_refined_rapsearch2_out {
204205
File assembly_nr_refseq_fasta
205206
File duplicate_cluster_sizes_tsv
206207
File lineage_db
208+
File accession2taxid
207209
File taxon_blacklist
208210
Boolean use_taxon_whitelist
209211
}
@@ -216,7 +218,7 @@ task BlastContigs_refined_rapsearch2_out {
216218
--input-files '[["~{rapsearch2_out_rapsearch2_m8}", "~{rapsearch2_out_rapsearch2_deduped_m8}", "~{rapsearch2_out_rapsearch2_hitsummary_tab}", "~{rapsearch2_out_rapsearch2_counts_with_dcr_json}"], ["~{assembly_contigs_fasta}", "~{assembly_scaffolds_fasta}", "~{assembly_read_contig_sam}", "~{assembly_contig_stats_json}"], ["~{assembly_nr_refseq_fasta}"], ["~{duplicate_cluster_sizes_tsv}"]]' \
217219
--output-files '["assembly/rapsearch2.blast.m8", "assembly/rapsearch2.reassigned.m8", "assembly/rapsearch2.hitsummary2.tab", "assembly/refined_rapsearch2_counts_with_dcr.json", "assembly/rapsearch2_contig_summary.json", "assembly/rapsearch2.blast.top.m8"]' \
218220
--output-dir-s3 '~{s3_wd_uri}' \
219-
--additional-files '{"lineage_db": "~{lineage_db}", "taxon_blacklist": "~{taxon_blacklist}"}' \
221+
--additional-files '{"lineage_db": "~{lineage_db}", "accession2taxid": "~{accession2taxid}", "taxon_blacklist": "~{taxon_blacklist}"}' \
220222
--additional-attributes '{"db_type": "nr", "use_taxon_whitelist": ~{use_taxon_whitelist}}'
221223
>>>
222224
output {
@@ -489,6 +491,7 @@ workflow czid_postprocess {
489491
String nr_db = "s3://czid-public-references/ncbi-sources/2021-01-22/nr"
490492
File nr_loc_db = "s3://czid-public-references/alignment_data/2021-01-22/nr_loc.db"
491493
File lineage_db = "s3://czid-public-references/taxonomy/2021-01-22/taxid-lineages.db"
494+
File accession2taxid_db = "s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/accession2taxid.marisa"
492495
File taxon_blacklist = "s3://czid-public-references/taxonomy/2021-01-22/taxon_blacklist.txt"
493496
File deuterostome_db = "s3://czid-public-references/taxonomy/2021-01-22/deuterostome_taxids.txt"
494497
Boolean use_deuterostome_filter = true
@@ -556,6 +559,7 @@ workflow czid_postprocess {
556559
assembly_nt_refseq_fasta = DownloadAccessions_gsnap_accessions_out.assembly_nt_refseq_fasta,
557560
duplicate_cluster_sizes_tsv = duplicate_cluster_sizes_tsv,
558561
lineage_db = lineage_db,
562+
accession2taxid = accession2taxid_db,
559563
taxon_blacklist = taxon_blacklist,
560564
deuterostome_db = deuterostome_db,
561565
use_deuterostome_filter = use_deuterostome_filter,
@@ -577,6 +581,7 @@ workflow czid_postprocess {
577581
assembly_nr_refseq_fasta = DownloadAccessions_rapsearch2_accessions_out.assembly_nr_refseq_fasta,
578582
duplicate_cluster_sizes_tsv = duplicate_cluster_sizes_tsv,
579583
lineage_db = lineage_db,
584+
accession2taxid = accession2taxid_db,
580585
taxon_blacklist = taxon_blacklist,
581586
use_taxon_whitelist = use_taxon_whitelist
582587
}

workflows/short-read-mngs/test/local_test_viral.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ postprocess.nt_db: s3://czid-public-references/test/viral-alignment-indexes/vira
2626
postprocess.nt_loc_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt_loc.marisa
2727
postprocess.nr_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nr
2828
postprocess.nr_loc_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nr_loc.marisa
29+
postprocess.accession2taxid_db: s3://czid-public-references/mini-database/alignment_indexes/2020-08-20-viral/viral_accessions2taxid.marisa
2930
experimental.nt_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt
3031
experimental.nt_loc_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt_loc.marisa
31-
experimental.nt_info_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt_info.marisa
32+
experimental.nt_info_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt_info.marisa

Comments (0)