Skip to content

Commit

Permalink
Genotype filtering training labels and cutoff optimization (#584)
Browse files Browse the repository at this point in the history
* Add GQ filter model training
* Remove plink and king from docker
* Peg R Matrix package to 1.6-5
  • Loading branch information
mwalker174 authored Apr 29, 2024
1 parent d9b5243 commit c0fa444
Show file tree
Hide file tree
Showing 23 changed files with 2,488 additions and 87 deletions.
23 changes: 5 additions & 18 deletions dockerfiles/sv-pipeline-virtual-env/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN apt-get -qqy update --fix-missing && \
# install conda packages
# NOTE: pin scipy=1.7.3 rather than scipy=1.8.0 because scipy=1.8.0 is incompatible with hail
ARG CONDA_PKGS="cython=0.29.28 numpy=1.22.3 pandas=1.4.2 scipy=1.7.3 scikit-learn=1.0.2 intervaltree=3.1.0 \
matplotlib=3.5.1 natsort=8.1.0 google-cloud-dataproc=4.0.2"
matplotlib=3.5.1 natsort=8.1.0 google-cloud-dataproc=4.0.2 seaborn=0.12.2"
RUN mamba install -qy --freeze-installed -n $CONDA_ENV_NAME -c conda-forge -c bioconda $CONDA_PKGS

# copy in HTSLIB install so that pysam uses same version as is available in pipeline
Expand Down Expand Up @@ -48,9 +48,9 @@ RUN export SETUPTOOLS_VERSION=$(python -c 'import setuptools; print(setuptools._
pip install setuptools==$SETUPTOOLS_VERSION

# pybedtools needs to be installed via pip because it doesn't like the updated python
# hail's latest version is only available via pip or local build
# hail's latest version is only available via pip or local build. Purge the pip cache first in case the
# cache carried over from the base image is out of date.
ARG PIP_PKGS="pybedtools==0.9.0 hail==0.2.93"
RUN pip3 --no-cache-dir install $PIP_PKGS
RUN pip3 cache purge && pip3 --no-cache-dir install $PIP_PKGS

# clean unneeded stuff
RUN conda clean -ay --force-pkgs-dirs
Expand Down Expand Up @@ -79,7 +79,7 @@ RUN export NEW_PACKAGES=$(diff_of_lists.sh "$RUN_DEPS" $APT_REQUIRED_PACKAGES) &

# install R packages
ARG R_PACKAGES="assertthat beeswarm BH BSDA caret cli crayon DAAG data.table devtools digest dplyr e1071 fansi fpc \
generics gert glue HardyWeinberg hash latticeExtra magrittr Matrix metap mnormt nlme nloptr nnet \
generics gert glue HardyWeinberg hash latticeExtra magrittr metap mnormt nlme nloptr nnet \
numDeriv perm pillar pkgconfig plogr plyr purrr pwr R6 RColorBrewer Rcpp reshape reshape2 rlang ROCR \
rpart stringi stringr survival tibble tidyr tidyselect utf8 vioplot withr zoo"
ARG BIOCONDUCTOR_PKGS="SNPRelate multtest"
Expand All @@ -89,6 +89,7 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED
apt-get -qqy install --no-install-recommends $BUILD_DEPS $(fix_spaces.sh $APT_REQUIRED_PACKAGES) && \
install_bioconductor_packages.R $BIOCONDUCTOR_PKGS && \
install_deprecated_R_package.sh "https://cran.r-project.org/src/contrib/Archive/MASS/MASS_7.3-58.tar.gz" && \
install_deprecated_R_package.sh "https://cran.r-project.org/src/contrib/Archive/Matrix/Matrix_1.6-5.tar.gz" && \
install_R_packages.R $R_PACKAGES && \
apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \
apt-get -qqy autoremove --purge && \
Expand All @@ -100,18 +101,4 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED
/usr/share/man/?? \
/usr/share/man/??_*

# Install plink2 & KING (for relatedness inference)
ARG KING_URL="https://www.kingrelatedness.com/executables/Linux-king215.tar.gz"
RUN mkdir -p /opt/bin && \
cd /opt/bin && \
wget -q $KING_URL && \
tar -xzf Linux-king215.tar.gz && \
rm -f Linux-king215.tar.gz

ARG PLINK2_URL="https://github.com/chrchang/plink-ng/releases/download/2019/plink2_linux_x86_64_20190107.zip"
RUN cd /opt/bin && \
wget -q $PLINK2_URL && \
unzip plink2_linux_x86_64_20190107.zip && \
rm -f plink2_linux_x86_64_20190107.zip

ENV PATH=/opt/bin:$PATH
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"FilterGenotypes.vcf": {{ test_batch.concordance_vcf | tojson }},
"FilterGenotypes.output_prefix": {{ test_batch.name | tojson }},
"FilterGenotypes.ploidy_table": {{ test_batch.ploidy_table | tojson }},
"FilterGenotypes.truth_json": {{ test_batch.recalibrate_gq_truth_json | tojson }},

"FilterGenotypes.primary_contigs_fai": {{ reference_resources.primary_contigs_fai | tojson }},
"FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }},
"FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }},
"FilterGenotypes.recalibrate_gq_args": [
"--keep-homvar false",
"--keep-homref true",
"--keep-multiallelic true",
"--skip-genotype-filtering true",
"--min-samples-to-estimate-allele-frequency -1"
],

"FilterGenotypes.ped_file": {{ test_batch.ped_file | tojson }},
"FilterGenotypes.site_level_comparison_datasets": [
{{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }},
{{ reference_resources.gnomad_v2_collins_site_level_benchmarking_dataset | tojson }},
{{ reference_resources.hgsv_byrska_bishop_site_level_benchmarking_dataset | tojson }},
{{ reference_resources.thousand_genomes_site_level_benchmarking_dataset | tojson }}
],
"FilterGenotypes.sample_level_comparison_datasets": [
{{ reference_resources.hgsv_byrska_bishop_sample_level_benchmarking_dataset | tojson }}
],
"FilterGenotypes.sample_renaming_tsv": {{ reference_resources.hgsv_byrska_bishop_sample_renaming_tsv | tojson }},
"FilterGenotypes.runtime_override_per_sample_benchmark_plot": {
"mem_gb": 30,
"disk_gb": 50
},

"FilterGenotypes.linux_docker": {{ dockers.linux_docker | tojson }},
"FilterGenotypes.gatk_docker": {{ dockers.gq_recalibrator_docker | tojson }},
"FilterGenotypes.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},
"FilterGenotypes.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"MakeGqRecalibratorTrainingSetFromPacBio.vcfs": [{{ test_batch.concordance_vcf | tojson }}],
"MakeGqRecalibratorTrainingSetFromPacBio.training_sample_ids": {{ test_batch.pacbio_samples_list | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.output_prefix": {{ test_batch.name | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.ploidy_table": {{ test_batch.ploidy_table | tojson }},

"MakeGqRecalibratorTrainingSetFromPacBio.pacbio_sample_ids": {{ test_batch.pacbio_samples | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.vapor_files": {{ test_batch.vapor_files | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.pbsv_vcfs": {{ test_batch.pacbio_pbsv_vcfs | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.pav_vcfs": {{ test_batch.pacbio_pav_vcfs | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.sniffles_vcfs": {{ test_batch.pacbio_sniffles_vcfs | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.reference_dict": {{ reference_resources.reference_dict | tojson }},

"MakeGqRecalibratorTrainingSetFromPacBio.sv_utils_docker": {{ dockers.sv_utils_docker | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.gatk_docker": {{ dockers.gatk_docker | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
"MakeGqRecalibratorTrainingSetFromPacBio.linux_docker": {{ dockers.linux_docker | tojson }}
}
4 changes: 2 additions & 2 deletions inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
"pangenie_docker": "us.gcr.io/broad-dsde-methods/vjalili/pangenie:vj-127571f",
"sv-base-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-virtual-env:5994670",
"cnmops-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/cnmops-virtual-env:5994670",
"sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-virtual-env:2024-01-24-v0.28.4-beta-9debd6d7",
"sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-virtual-env:mw-train-genotype-filtering-a9479501",
"samtools-cloud-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/samtools-cloud-virtual-env:5994670",
"sv-utils-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils-env:2023-02-01-v0.26.8-beta-9b25c72d",
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-01-24-v0.28.4-beta-9debd6d7",
"sv_utils_docker": "us.gcr.io/broad-dsde-methods/markw/sv-utils:mw-train-genotype-filtering-a9479501",
"gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a",
"str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6"
}
6 changes: 3 additions & 3 deletions inputs/values/hgdp.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@
"concordance_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.vcf.gz",
"concordance_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.vcf.gz.tbi",

"pacbio_sample_concordance_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.concordance.subset.vcf.gz",
"pacbio_sample_concordance_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.concordance.subset.vcf.gz.tbi",
"recalibrate_gq_truth_json": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.gq_training_labels.json",
"pacbio_sample_concordance_vcf": "gs://gatk-sv-hgdp/mw-train-genotype-filtering/hgdp.pacbio_samples.vcf.gz",
"pacbio_sample_concordance_vcf_index": "gs://gatk-sv-hgdp/mw-train-genotype-filtering/hgdp.pacbio_samples.vcf.gz.tbi",
"recalibrate_gq_truth_json": "gs://gatk-sv-hgdp/mw-train-genotype-filtering/hgdp.gq_training_labels.json",

"aou_recalibrated_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.aou_gq_recalibrated.vcf.gz",
"aou_recalibrated_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.aou_gq_recalibrated.vcf.gz.tbi",
Expand Down
Loading

0 comments on commit c0fa444

Please sign in to comment.