diff --git a/.github/workflows/github-actions.yml b/.github/workflows/github-actions.yml index 65ad7bbb..d74b0dfa 100644 --- a/.github/workflows/github-actions.yml +++ b/.github/workflows/github-actions.yml @@ -1,6 +1,6 @@ name: DeepRVAT run-name: DeepRVAT ๐Ÿงฌ๐Ÿงช๐Ÿ’ป๐Ÿง‘โ€๐Ÿ”ฌ -on: [ push ] +on: [push] jobs: DeepRVAT-Pipeline-Smoke-Tests: @@ -77,7 +77,6 @@ jobs: --snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs shell: micromamba-shell {0} - DeepRVAT-Preprocessing-Pipeline-Smoke-Tests: runs-on: ubuntu-latest steps: @@ -143,6 +142,10 @@ jobs: cache-environment: true cache-downloads: true + - name: Install biotoolbox + run: cpanm Bio::ToolBox@1.691 --force + shell: micromamba-shell {0} + - name: Install DeepRVAT run: pip install -e ${{ github.workspace }} shell: micromamba-shell {0} @@ -162,6 +165,20 @@ jobs: -O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \ && gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz + - name: Cache GTF file + id: cache-gtf + uses: actions/cache@v4 + with: + path: example/preprocess/workdir/reference + key: ${{ runner.os }}-reference-gtf + + - name: Download gtf data + if: steps.cache-gtf.outputs.cache-hit != 'true' + run: | + cd ${{ github.workspace }}/example/preprocess && \ + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz \ + -O workdir/reference/gencode.v44.annotation.gtf.gz + - name: Run preprocessing pipeline run: | python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \ @@ -169,12 +186,10 @@ jobs: --configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs shell: micromamba-shell {0} - DeepRVAT-Preprocessing-Pipeline-Tests-With-QC: runs-on: ubuntu-latest needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests steps: - - name: Check out repository code uses: actions/checkout@v4 - uses: mamba-org/setup-micromamba@v1.8.0 @@ -184,6 +199,10 @@ jobs: cache-environment: true cache-downloads: true + - name: Install biotoolbox + run: cpanm Bio::ToolBox@1.691 --force + shell: micromamba-shell {0} + - name: Install DeepRVAT run: pip install -e ${{ github.workspace }} shell: micromamba-shell {0} @@ -203,6 +222,20 @@ jobs: -O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \ && gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz + - name: Cache GTF file + id: cache-gtf + uses: actions/cache@v4 + with: + path: example/preprocess/workdir/reference + key: ${{ runner.os }}-reference-gtf + + - name: Download gtf data + if: steps.cache-gtf.outputs.cache-hit != 'true' + run: | + cd ${{ github.workspace }}/example/preprocess && \ + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz \ + -O workdir/reference/gencode.v44.annotation.gtf.gz + - name: Run preprocessing pipeline run: | python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \ diff --git a/deeprvat_preprocessing_env.yml b/deeprvat_preprocessing_env.yml index 771b3a29..09b80c06 100644 --- a/deeprvat_preprocessing_env.yml +++ b/deeprvat_preprocessing_env.yml @@ -14,3 +14,7 @@ dependencies: - snakemake=7.17.1 - bcftools=1.17 - samtools=1.17 + - bedtools=2.31.1 + - perl-app-cpanminus=1.7047 + - bedops=2.4.41 + - gcc=13.2.0 \ No newline at end of file diff --git a/docs/_static/preprocess_rulegraph_no_qc.svg b/docs/_static/preprocess_rulegraph_no_qc.svg index 6ddf980d..d41f4483 100644 --- a/docs/_static/preprocess_rulegraph_no_qc.svg +++ b/docs/_static/preprocess_rulegraph_no_qc.svg @@ -1,28 +1,28 @@ - - - + + snakemake_dag - + 0 - + all 1 - + combine_genotypes - + 1->0 @@ -30,7 +30,7 @@ 2 - + preprocess_no_qc @@ -42,17 +42,17 @@ 3 - + add_variant_ids - + 3->0 - + 3->2 @@ -60,7 +60,7 @@ 4 - + concatenate_variants @@ -69,62 +69,62 @@ - - -9 - + + +12 + create_parquet_variant_ids - - -4->9 + + +4->12 5 - -variants + +variants 5->4 - - + + 6 - -normalize + +normalize 6->5 - - + + - - -10 - + + +13 + sparsify - - -6->10 - - + + +6->13 + + 7 - + extract_samples - + 7->2 @@ -132,38 +132,80 @@ 7->6 - - + + 8 - -index_fasta + +index_fasta 8->6 - - + + - - -9->0 + + +9 + +expand_regions + + + +8->9 + + + + + +9->6 + + + + + +10 + +create_bed + + + +10->9 + + + + + +11 + +fiter_gtf + + + +11->10 + + + + + +12->0 - + -9->2 +12->2 - - -10->2 - - + + +13->2 + + diff --git a/docs/_static/preprocess_rulegraph_with_qc.svg b/docs/_static/preprocess_rulegraph_with_qc.svg index 167d7839..7de3c73f 100644 --- a/docs/_static/preprocess_rulegraph_with_qc.svg +++ b/docs/_static/preprocess_rulegraph_with_qc.svg @@ -1,271 +1,313 @@ - - - + + snakemake_dag - + 0 - -all + +all 1 - -combine_genotypes + +combine_genotypes - + 1->0 - - + + 2 - -preprocess_with_qc + +preprocess_with_qc 2->1 - - + + 3 - -add_variant_ids + +add_variant_ids - + 3->0 - - + + - + 3->2 - - + + 4 - -concatenate_variants + +concatenate_variants 4->3 - - + + - - -9 - -create_parquet_variant_ids + + +12 + +create_parquet_variant_ids - - -4->9 - - + + +4->12 + + 5 - + variants 5->4 - - + + 6 - -normalize + +normalize 6->5 - - - - - -10 - -sparsify - - - -6->10 - - - - - -11 - -qc_varmiss - - - -6->11 - - - - - -12 - -qc_hwe - - - -6->12 - - + + 13 - -qc_read_depth + +sparsify - + 6->13 - - + + 14 - -qc_allelic_imbalance + +qc_varmiss - + 6->14 - - + + + + + +15 + +qc_hwe + + + +6->15 + + 16 - -qc_indmiss + +qc_read_depth - + 6->16 - - + + + + + +17 + +qc_allelic_imbalance + + + +6->17 + + + + + +19 + +qc_indmiss + + + +6->19 + + 7 - -extract_samples + +extract_samples - + 7->2 - - + + - + 7->6 - - + + 8 - -index_fasta + +index_fasta - + 8->6 - - + + - - -9->0 - - + + +9 + +expand_regions - - -9->2 - - + + +8->9 + + - - -10->2 - - + + +9->6 + + - - -11->2 - - + + +10 + +create_bed + + + +10->9 + + + + + +11 + +fiter_gtf + + + +11->10 + + + + + +12->0 + + - + 12->2 - - + + - + 13->2 - - + + - + 14->2 - - - - - -15 - -process_individual_missingness + + - + 15->2 - - + + - - -16->15 - - + + +16->2 + + + + + +17->2 + + + + + +18 + +process_individual_missingness + + + +18->2 + + + + + +19->18 + + diff --git a/docs/preprocessing.md b/docs/preprocessing.md index b9564cf1..ba8f4497 100644 --- a/docs/preprocessing.md +++ b/docs/preprocessing.md @@ -72,16 +72,23 @@ sparse_dir_name : sparse # Expected to be found in working_dir/reference_dir reference_fasta_file : GRCh38.primary_assembly.genome.fa +gtf_file : gencode.v44.annotation.gtf.gz + +# Mac memory used by convert2bed +convert2bed_max_mem: 64G + +# Increase the BED entry by the same number base pairs in each direction +region_expand: 3000 # You can specify a different zcat cmd for example gzcat here, default zcat zcat_cmd: - ``` - + +``` The config above would use the following directory structure: ```shell parent_directory -`-- workdir +-- workdir |-- norm | |-- bcf | |-- sparse @@ -151,13 +158,19 @@ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38 gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz ``` -4. Run with the example config +4. Download the gtf file + +```shell +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz -P workdir/reference/GRCh38.primary_assembly.genome.fa.gz +``` + +5. Run with the example config ```shell snakemake -j 1 --snakefile ../../pipelines/preprocess_with_qc.snakefile --configfile ../../pipelines/config/deeprvat_preprocess_config.yaml ``` -5. Enjoy the preprocessed data ๐ŸŽ‰ +6. Enjoy the preprocessed data ๐ŸŽ‰ ```shell ls -l workdir/preprocesed @@ -195,13 +208,20 @@ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38 gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz ``` -4. Run with the example config +5. Download the gtf file + +```shell +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz -P workdir/reference/GRCh38.primary_assembly.genome.fa.gz +``` + + +6. Run with the example config ```shell snakemake -j 1 --snakefile ../../pipelines/preprocess_no_qc.snakefile --configfile ../../pipelines/config/deeprvat_preprocess_config.yaml ``` -5. Enjoy the preprocessed data ๐ŸŽ‰ +7. Enjoy the preprocessed data ๐ŸŽ‰ ```shell ls -l workdir/preprocesed diff --git a/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz b/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz index df2edae1..5294f3c6 100644 Binary files a/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz and b/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz differ diff --git a/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz.tbi b/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz.tbi new file mode 100644 index 00000000..63f5991c Binary files /dev/null and b/example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz.tbi differ diff --git a/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz b/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz index 6228dc90..3a70b5b1 100644 Binary files a/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz and b/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz differ diff --git a/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz.tbi b/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz.tbi new file mode 100644 index 00000000..85c1cb3e Binary files /dev/null and b/example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz.tbi differ diff --git a/example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz b/example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz new file mode 100644 index 00000000..f1796e6a Binary files /dev/null and b/example/preprocess/workdir/reference/gencode.v44.annotation.gtf.gz differ diff --git a/pipelines/config/deeprvat_preprocess_config.yaml b/pipelines/config/deeprvat_preprocess_config.yaml index 335dd6c9..881c53a4 100644 --- a/pipelines/config/deeprvat_preprocess_config.yaml +++ b/pipelines/config/deeprvat_preprocess_config.yaml @@ -24,6 +24,13 @@ sparse_dir_name : sparse # Expected to be found in working_dir/reference_dir reference_fasta_file : GRCh38.primary_assembly.genome.fa +gtf_file : gencode.v44.annotation.gtf.gz + +# Max memory used by convert2bed +convert2bed_max_mem: 64G + +# Increase the BED entry by the same number base pairs in each direction +region_expand: 3000 # You can specify a different zcat cmd for example gzcat here, default zcat zcat_cmd: \ No newline at end of file diff --git a/pipelines/preprocessing/preprocess.snakefile b/pipelines/preprocessing/preprocess.snakefile index 20eb0c89..865c3ebe 100644 --- a/pipelines/preprocessing/preprocess.snakefile +++ b/pipelines/preprocessing/preprocess.snakefile @@ -35,6 +35,13 @@ qc_allelic_imbalance_dir = qc_dir / "allelic_imbalance" qc_duplicate_vars_dir = qc_dir / "duplicate_vars" qc_filtered_samples_dir = qc_dir / "filtered_samples" +gtf_workdir = working_dir / "gtf" + +gtf_file = reference_dir / config["gtf_file"] +gtf_filtered_file = gtf_workdir / f"{gtf_file.stem}_filtered_genes.gtf" +bed_file = gtf_workdir / f"{gtf_file.stem}_filtered_genes.bed" +expanded_bed = gtf_workdir / f"{gtf_file.stem}_filtered_expanded_regions.bed" + vcf_stems, vcf_files, vcf_look_up = deeprvat_preprocess.parse_file_path_list(config["vcf_files_list"]) chromosomes = config["included_chromosomes"] @@ -56,12 +63,48 @@ rule normalize: samplefile=norm_dir / "samples_chr.csv", fasta=fasta_file, fastaindex=fasta_index_file, + expanded_bed=expanded_bed, params: vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem], output: bcf_file=bcf_dir / "{vcf_stem}.bcf", shell: - f"""{load_bcftools} bcftools view --samples-file {{input.samplefile}} --output-type u {{params.vcf_file}} | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}""" + f"""{load_bcftools} bcftools view -R "{{input.expanded_bed}}" "{{params.vcf_file}}" --output-type u \ + | bcftools view --samples-file {{input.samplefile}} --output-type u \ + | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u \ + | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}""" + + +rule fiter_gtf: + input: + gtf_file, + output: + gtf_filtered_file, + shell: + 'get_features.pl --in "{input}" --out "{output}" --include "gene_type=protein_coding" --feature "gene" --gtf' + + +rule create_bed: + input: + gtf_filtered_file, + output: + bed_file + params: + maxmem=config["convert2bed_max_mem"] + shell: + 'convert2bed --max-mem={params.maxmem} --input=gtf --output=bed < "{input}" > "{output}"' + + +rule expand_regions: + input: + bed=bed_file, + faidx=fasta_index_file, + params: + region_expand=config["region_expand"], + output: + expanded_bed + shell: + 'bedtools slop -i "{input.bed}" -g "{input.faidx}" -b {params.region_expand} > "{output}"' rule index_fasta: