Skip to content

Commit

Permalink
Merge pull request #53 from aofarrel/mega-qc
Browse files Browse the repository at this point in the history
Output ludicrous amounts of QC metrics, soften QC limits, use integers for percentages instead of floats, update decontam
  • Loading branch information
aofarrel authored Dec 29, 2024
2 parents 3e0dd94 + 58e58fa commit 8bd57e1
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 125 deletions.
14 changes: 8 additions & 6 deletions checker.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,18 @@ workflow checker {
paired_fastq_sets = paired_fastq_sets
}

Array[File] TEST_bai = flatten(select_all([myco_raw_default.bais, TRUTH_bai, FALLBACK_bai]))
Array[File] TEST_diff = flatten(select_all([myco_raw_default.diffs, TRUTH_diff, FALLBACK_diff]))
File TEST_qc = myco_raw_default.qc_csv
Array[File] TEST_report = select_all(select_first([myco_raw_default.diff_reports, TRUTH_report, FALLBACK_report])) # awkward due to being Array[File?]
Array[File] TEST_vcf = flatten(select_all([myco_raw_default.vcfs, TRUTH_vcf, FALLBACK_vcf]))
Array[File] TEST_bai = flatten(select_all([myco_raw_default.tbd_bais, TRUTH_bai, FALLBACK_bai]))
Array[File] TEST_diff = flatten(select_all([myco_raw_default.tbd_diffs, TRUTH_diff, FALLBACK_diff]))
#File TEST_qc = myco_raw_default.qc_csv
Array[File] TEST_report = select_all(select_first([myco_raw_default.tbd_diff_reports, TRUTH_report, FALLBACK_report])) # awkward due to being Array[File?]
Array[File] TEST_vcf = flatten(select_all([myco_raw_default.tbd_vcfs, TRUTH_vcf, FALLBACK_vcf]))

call verify_array.arraycheck_classic as check_myco_raw_default {
input:
test = flatten([TEST_bai, TEST_diff, [TEST_qc], TEST_report, TEST_vcf]),
#test = flatten([TEST_bai, TEST_diff, [TEST_qc], TEST_report, TEST_vcf]),
test = flatten([TEST_bai, TEST_diff, TEST_report, TEST_vcf]),
truth = flatten(select_all([TRUTH_bai, TRUTH_diff, TRUTH_qc, TRUTH_report, TRUTH_vcf])),
#truth = flatten(select_all([TRUTH_bai, TRUTH_diff, TRUTH_qc, TRUTH_report, TRUTH_vcf])),
disk_size_override = checker_disk_size_override
}

Expand Down
152 changes: 68 additions & 84 deletions myco_raw.wdl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version 1.0

import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.15.0/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.16.3/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.15.0/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.24/tasks/processing_tasks.wdl" as sranwrp_processing
import "https://raw.githubusercontent.com/aofarrel/tree_nine/0.0.16/tree_nine.wdl" as build_treesWF
Expand All @@ -18,16 +18,14 @@ workflow myco {
Boolean guardrail_mode = true
Boolean low_resource_mode = false

Boolean clean_after_decontam = false
Int clean_average_q_score = 29
Boolean clean_before_decontam = true
Boolean covstatsQC_skip_entirely = false
Boolean decontam_use_CDC_varpipe_ref = false # changed in myco 6.3.0!!
Boolean covstatsQC_skip_entirely = true # changed in myco 6.4.0
Boolean decontam_use_CDC_varpipe_ref = false # changed in myco 6.3.0
File? mask_bedfile
Int QC_max_pct_low_coverage_sites = 20
Int QC_max_pct_unmapped = 2
Int QC_min_mean_coverage = 10
Int QC_min_q30 = 90
Int QC_max_pct_unmapped = 10 # changed in myco 6.4.0
Int QC_min_mean_coverage = 10 # CDC minimum: 50x
Int QC_min_q30 = 80 # CDC minimum: 85%
Boolean QC_soft_pct_mapped = false
Int QC_this_is_low_coverage = 10
Int quick_tasks_disk_size = 10
Expand Down Expand Up @@ -66,15 +64,13 @@ workflow myco {
scatter(paired_fastqs in paired_fastq_sets) {
call clckwrk_combonation.clean_and_decontam_and_check as decontam_each_sample {
input:
docker_image = if decontam_use_CDC_varpipe_ref then "ashedpotatoes/clockwork-plus:v0.12.5.2-CDC" else "ashedpotatoes/clockwork-plus:v0.12.5.1-CRyPTIC",
docker_image = if decontam_use_CDC_varpipe_ref then "ashedpotatoes/clockwork-plus:v0.12.5.3-CDC" else "ashedpotatoes/clockwork-plus:v0.12.5.3-CRyPTIC",
unsorted_sam = true,
force_rename_out = output_sample_name,
reads_files = paired_fastqs,
fastp_clean_avg_qual = clean_average_q_score,
fastp_clean_before_decontam = clean_before_decontam,
fastp_clean_after_decontam = clean_after_decontam,
QC_min_q30 = QC_min_q30 / 100.0,
preliminary_min_q30 = if guardrail_mode then 0.2 else 0.0000001,
preliminary_min_q30 = if guardrail_mode then 20 else 1,
subsample_cutoff = if guardrail_mode then 30000 else -1,
timeout_map_reads = if guardrail_mode then 300 else 0,
timeout_decontam = if guardrail_mode then 600 else 0,
Expand Down Expand Up @@ -299,14 +295,14 @@ workflow myco {

# did the decontamination step actually run? (note that defined() is not a robust check, but since this is the first task
# in the workflow this should be okay for now)
if(defined(decontam_each_sample.errorcode)) {
if(defined(decontam_each_sample.error_code)) {

# Sidenote: If you have a task taking in decontam_each_sample.errorcode, even after this defined check, it will fail
# Sidenote: If you have a task taking in decontam_each_sample.error_code, even after this defined check, it will fail
# command instantiation with "Cannot interpolate Array[String?] into a command string with attribute set
# [PlaceholderAttributeSet(None,None,None,Some( ))]".
# get the first (0th) value, eg only value since there's just one sample, and coerce it into type String
String coerced_decontam_errorcode = select_first([decontam_each_sample.errorcode[0], "WORKFLOW_ERROR_1_REPORT_TO_DEV"])
String coerced_decontam_errorcode = select_first([decontam_each_sample.error_code[0], "WORKFLOW_ERROR_1_REPORT_TO_DEV"])

# did the decontamination step return an error?
if(!(coerced_decontam_errorcode == pass)) {
Expand Down Expand Up @@ -381,60 +377,65 @@ workflow myco {
# [select_all(warning_decontam)]])) failed: No coercion defined from wom value(s) '
# [["EARLYQC_88.112_PCT_ABOVE_Q30_(MIN_0.9)", "EARLYQC_99.61_PCT_MAPPED_(MIN_99.995)"]]' of type 'Array[Array[String]]' to 'Array[String]'.
#Array[String] warnings = flatten(flatten([[select_all(tbprofilerFQ.warning_codes)], [select_all(warning_decontam)]]))
Float this_unmapped = decontam_each_sample.reads_unmapped[0]
Float this_kept = decontam_each_sample.reads_clck_kept[0]
Float porp_unmapped = this_unmapped/this_kept
Float pct_unmapped_decontam = if !clean_after_decontam then (porp_unmapped * 100) else -1.0

Map[String, String] metrics_to_values = {
"status": select_first([finalcode, "NA"]),
"n_reads_contam": decontam_each_sample.reads_is_contam[0], # decontamination
"n_reads_decon_reference": decontam_each_sample.reads_reference[0], # decontamination
"n_reads_decon_unmapped": decontam_each_sample.reads_unmapped[0], # decontamination
"n_reads_decon_kept": decontam_each_sample.reads_clck_kept[0], # decontamination
"pct_loss_decon": decontam_each_sample.pct_loss_decon[0], # decontamination
"pct_loss_cleaning": decontam_each_sample.pct_loss_cleaning[0], # decontamination
"pct_mapped_tbprof": select_first([tbprofilerFQ.pct_reads_mapped[0], "NA"]), # thiagen!TBProfiler
"pct_unmapped_covstats": select_first([percentUnmapped, "NA"]), # covstats
"pct_unmapped_decon": pct_unmapped_decontam, # decontamination
"pct_above_q30": decontam_each_sample.dcntmd_pct_above_q30[0], # fastp
"median_depth": select_first([tbprofilerFQ.median_depth[0], "NA"]), # thiagen!TBProfiler
"genome_pct_coverage": select_first([tbprofilerFQ.pct_genome_covered[0], "NA"]), # thiagen!TBProfiler
"mean_coverage": select_first([meanCoverage, "NA"]) # covstats
}
String sample_name_inputs_basename = sub(sub(sub(basename(paired_fastq_sets[0][0]), ".fastq", ""), ".gz", ""), ".fq", "")
String sample_name_maybe_varcalled = if length(final_bams) > 0 then sub(basename(final_bams[0]), "_to_H37Rv.bam", "") else sample_name_inputs_basename
String sample_name_maybe_manually_set = if defined(output_sample_name) then select_first([output_sample_name, "fallback"]) else sample_name_maybe_varcalled

call sranwrp_processing.map_to_tsv_or_csv as qc_summary {
input:
the_map = metrics_to_values,
column_names = if length(paired_fastq_sets) == 1 then [sample_name_maybe_manually_set] else ["sample"],
outfile = if length(paired_fastq_sets) == 1 then sample_name_maybe_manually_set+"_qc" else "combined_qc_report.txt"
}
output {
# status of sample -- only valid iff this ran on only one sample
String tbd_status_code = select_first([finalcode, pass])
String tbd_status = select_first([finalcode, pass])
String tbd_pipeline_run = date_pipeline_ran

# decon/fastp metadata pulled out directly
Float tbd_qc_q20_in = decontam_each_sample.q20_in[0]
Float tbd_qc_q30_in = decontam_each_sample.q30_in[0]
Int tbd_qc_reads_in = decontam_each_sample.reads_in[0]
Int tbd_qc_mean_r1_len_in = decontam_each_sample.mean_r1_len_in[0]
Int tbd_qc_mean_r2_len_in = decontam_each_sample.mean_r2_len_in[0]
Float tbd_qc_q20_postclean = decontam_each_sample.q20_postclean[0]
Float tbd_qc_q30_postclean = decontam_each_sample.q30_postclean[0]
Int tbd_qc_reads_postclean_per_fastp = decontam_each_sample.reads_postclean_per_fastp[0]
Int tbd_qc_mean_r1_len_postclean = decontam_each_sample.mean_r1_len_postclean[0]
Int tbd_qc_mean_r2_len_postclean = decontam_each_sample.mean_r2_len_postclean[0]
Float tbd_qc_pct_loss_cleaning_per_fastp = decontam_each_sample.pct_loss_cleaning_per_fastp[0]
Int tbd_qc_reads_postclean_per_decon = decontam_each_sample.reads_postclean_per_decon[0]
Int tbd_qc_reads_postdecon_per_decon = decontam_each_sample.reads_postdecon_per_decon[0]
Int tbd_qc_reads_TB = decontam_each_sample.reads_TB[0]
Int tbd_qc_reads_NTM = decontam_each_sample.reads_NTM[0]
Int tbd_qc_reads_human = decontam_each_sample.reads_human[0]
Int tbd_qc_reads_contam = decontam_each_sample.reads_contam[0]
Float tbd_qc_pct_reads_TB_predecon = decontam_each_sample.pct_reads_TB_predecon[0]
Float tbd_qc_pct_reads_NTM = decontam_each_sample.pct_reads_NTM[0]
Float tbd_qc_pct_reads_human = decontam_each_sample.pct_reads_human[0]
Float tbd_qc_pct_reads_TB_postdecon = decontam_each_sample.pct_reads_TB_postdecon[0]
Float tbd_qc_pct_loss_decon_per_decon = decontam_each_sample.pct_loss_decon_per_decon[0]
Float tbd_qc_pct_loss_total = decontam_each_sample.pct_loss_total[0]
Float tbd_qc_pct_loss_decon_per_fastp = decontam_each_sample.pct_loss_decon_per_fastp[0]
Float tbd_qc_q20_postdecon = decontam_each_sample.q20_postdecon[0]
Float tbd_qc_q30_postdecon = decontam_each_sample.q30_postdecon[0]
Int tbd_qc_reads_postdecon_per_fastp = decontam_each_sample.reads_postdecon_per_fastp[0]
Int tbd_qc_mean_r1_len_postdecon = decontam_each_sample.mean_r1_len_postdecon[0]
Int tbd_qc_mean_r2_len_postdecon = decontam_each_sample.mean_r2_len_postdecon[0]
Float tbd_qc_duplication_rate = decontam_each_sample.duplication_rate[0]
Int tbd_qc_reads_adapter_trimmed = decontam_each_sample.reads_adapter_trimmed[0]

# theiagen!TBProfiler metadata pulled out directly
Float? tbd_qc_median_depth_per_tbprof = tbprofilerFQ.median_depth[0]
Float? tbd_qc_avg_depth_per_tbprof = tbprofilerFQ.avg_depth[0]
Float? tbd_qc_pct_mapped_per_tbprof = tbprofilerFQ.pct_reads_mapped[0]
Float? tbd_qc_pct_genome_covered = tbprofilerFQ.pct_genome_covered[0]
Int? tbd_n_dr_variants = tbprofilerFQ.n_dr_variants[0]
Int? tbd_n_other_variants = tbprofilerFQ.n_other_variants[0]
String? tbd_resistance = tbprofilerFQ.resistance[0]
String? tbd_strain_per_tbprof = tbprofilerFQ.strain[0]

# debug
Float tbd_pct_unmapped_decontamm = pct_unmapped_decontam
Float tbd_n_reads_decon_kept = this_kept
Float tbd_n_reads_decon_unmapped = this_unmapped
Float? tbd_pct_mapped_tbprof = tbprofilerFQ.pct_reads_mapped[0]
Float? tbd_pct_unmapped_covstats = percentUnmapped
Float? tbd_pct_loss_decon = decontam_each_sample.pct_loss_decon[0]
Float? tbd_pct_loss_cleaning = decontam_each_sample.pct_loss_cleaning[0]
# covstats
Float? tbd_qc_pct_unmapped_covstats = percentUnmapped

# raw files
# intermediate files
Array[File] tbd_bais = final_bais
Array[File] tbd_bams = final_bams
Array[File] tbd_diffs = real_diffs
Array[File] tbd_masks = real_masks # bedgraph
Array[File] tbd_vcfs = minos_vcfs

# metadata
# metadata files
Array[File?] tbd_decontam_reports = decontam_each_sample.counts_out_tsv
Array[File?] tbd_covstats_reports = covstats.covstatsOutfile
Array[File?] tbd_diff_reports = real_reports
Expand All @@ -444,22 +445,6 @@ workflow myco {
Array[File?] tbd_tbprof_fq_looker = tbprofilerFQ.tbprofiler_looker_csv
Array[File?] tbd_tbprof_fq_laboratorian = tbprofilerFQ.tbprofiler_laboratorian_report_csv
Array[File?] tbd_tbprof_fq_lims = tbprofilerFQ.tbprofiler_lims_report_csv

# these outputs only exist if there are multiple samples
File? tbprof_bam_all_depths = collate_bam_depth.outfile
File? tbprof_bam_all_strains = collate_bam_strains.outfile
File? tbprof_bam_all_resistances = collate_bam_resistance.outfile
File? tbprof_fq_all_depths = collate_fq_depth.outfile
File? tbprof_fq_all_strains = collate_fq_strains.outfile
File? tbprof_fq_all_resistances = collate_fq_resistance.outfile

# these outputs only exist if we ran on a single sample
String? tbd_tbprof_bam_this_depth = single_sample_tbprof_bam_depth
String? tbd_tbprof_bam_this_strain = single_sample_tbprof_bam_strain
String? tbd_tbprof_bam_this_resistance = single_sample_tbprof_bam_resistance
String? tbd_tbprof_fq_this_depth = single_sample_tbprof_fq_depth
String? tbd_tbprof_fq_this_strain = single_sample_tbprof_fq_strain
String? tbd_tbprof_fq_this_resistance = single_sample_tbprof_fq_resistance

# tree nine
File? tree_nwk = trees.tree_nwk
Expand All @@ -470,16 +455,15 @@ workflow myco {
Array[File]? distance_matrix = trees.max_distance_matrix

# useful debugging/run information (only valid iff this ran on only one sample)
File tbd_qc_csv = qc_summary.tsv_or_csv
#Array[String] pass_or_warnings = if (length(warnings) > 0) then warnings else ["PASS"]
String? tbd_debug_decontam_ERR = decontam_ERR
String? tbd_debug_earlyQC_ERR = earlyQC_ERR
String? tbd_debug_varcall_ERR = varcall_ERR
String? tbd_debug_covstats_ERR = covstats_ERR
String? tbd_debug_vcfdiff_ERR = vcfdiff_ERR
Array[String]? tbd_debug_vcfdiff_errorcode_if_covstats = vcfdiff_errorcode_if_covstats
Array[String]? tbd_debug_vcfdiff_errorcode_if_no_covstats = vcfdiff_errorcode_if_no_covstats
Array[String]? tbd_debug_vcfdiff_errorcode_array = vcfdiff_errorcode_array
#String? tbd_debug_decontam_ERR = decontam_ERR
#String? tbd_debug_earlyQC_ERR = earlyQC_ERR
#String? tbd_debug_varcall_ERR = varcall_ERR
#String? tbd_debug_covstats_ERR = covstats_ERR
#String? tbd_debug_vcfdiff_ERR = vcfdiff_ERR
#Array[String]? tbd_debug_vcfdiff_errorcode_if_covstats = vcfdiff_errorcode_if_covstats
#Array[String]? tbd_debug_vcfdiff_errorcode_if_no_covstats = vcfdiff_errorcode_if_no_covstats
#Array[String]? tbd_debug_vcfdiff_errorcode_array = vcfdiff_errorcode_array
String tbd_clockwork_docker = decontam_each_sample.docker_used[0]
}
}
Expand Down
2 changes: 1 addition & 1 deletion myco_simple.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
version 1.0
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/update-dockers-0.12.5.2/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.15.0/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/fastp-wdl/0.0.4/fastp_tasks.wdl" as fastp
import "https://raw.githubusercontent.com/aofarrel/vcf_to_diff_wdl/0.0.3/vcf_to_diff.wdl" as diff

Expand Down
4 changes: 2 additions & 2 deletions myco_sra.wdl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version 1.0

import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/update-dockers-0.12.5.2/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/update-dockers-0.12.5.2/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.15.0/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.15.0/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.24/tasks/pull_fastqs.wdl" as sranwrp_pull
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.24/tasks/processing_tasks.wdl" as sranwrp_processing
import "https://raw.githubusercontent.com/aofarrel/tree_nine/0.0.16/tree_nine.wdl" as build_treesWF
Expand Down
32 changes: 0 additions & 32 deletions wrapper_example.wdl

This file was deleted.

0 comments on commit 8bd57e1

Please sign in to comment.