Support WGS filtering in the preprocessing pipeline #46

Closed
wants to merge 50 commits into main from feature/wgs-preprocessing
Commits (50):
ce4e026
Create preprocess_wgs.snakefile
endast Dec 12, 2023
2b45939
update
endast Dec 13, 2023
ea9b22d
Working pipeline
endast Dec 13, 2023
178b46e
update snakemake
endast Dec 14, 2023
fd97555
add gtf filtering
endast Dec 19, 2023
fc599e7
Fix paths and max_mem
endast Dec 19, 2023
574a8e5
Cache GTF file in git hub action
endast Dec 19, 2023
fc0ecf3
Fix cache
endast Dec 19, 2023
6b3f027
fix smoke test
endast Dec 19, 2023
d11213d
Fix path
endast Dec 19, 2023
6962f6f
cache gtf in all smoke tests
endast Dec 19, 2023
01abf1b
Fix path for expanded bed
endast Dec 19, 2023
e875ce1
add requierments
endast Dec 19, 2023
4b6e0f3
install biotoolbox
endast Dec 19, 2023
fce06ff
add force
endast Dec 19, 2023
d052c1c
add bedtools
endast Dec 19, 2023
2de9f60
add bedops
endast Dec 20, 2023
a16491b
update example files
endast Dec 20, 2023
1e6b358
Merge branch 'feature/wgs-preprocessing' of https://github.com/PMBio/…
endast Dec 20, 2023
4be54df
add tbi index
endast Dec 20, 2023
7e36abd
update example
endast Dec 20, 2023
e7147d1
Update files
endast Dec 20, 2023
64d436b
update test
endast Dec 20, 2023
43b0917
Update test data
endast Dec 20, 2023
bf9a066
fix chr21
endast Dec 20, 2023
e84e339
update chr1
endast Dec 20, 2023
59e8884
update chr21
endast Dec 20, 2023
a1a7af3
Update dag viz
endast Dec 20, 2023
9806571
Delete preprocess_wgs.snakefile
endast Dec 20, 2023
e9fe508
Update preprocess.snakefile
endast Dec 20, 2023
3c2ca77
update docs
endast Dec 20, 2023
66c8afd
Update preprocessing.md
endast Dec 20, 2023
9b2371c
Merge branch 'main' into feature/wgs-preprocessing
endast Jan 2, 2024
2791489
Merge branch 'main' into feature/wgs-preprocessing
endast Jan 30, 2024
f8c24b0
fixup! Format Python code with psf/black pull_request
Jan 30, 2024
cb01f49
Update preprocessing.md
endast Jan 30, 2024
74fad88
Merge branch 'feature/wgs-preprocessing' of https://github.com/PMBio/…
endast Jan 30, 2024
bb32bae
Merge branch 'main' into feature/wgs-preprocessing
endast Feb 26, 2024
d146935
Fix ruff errors
endast Feb 26, 2024
00cd41a
Merge branch 'main' into feature/wgs-preprocessing
endast Mar 22, 2024
ce759e3
fix typo
endast Mar 22, 2024
9a45d90
Merge branch 'main' into feature/wgs-preprocessing
endast Apr 4, 2024
82f962e
Squashed commit of the following:
endast Apr 10, 2024
34f2c1f
Merge branch 'main' into feature/wgs-preprocessing
endast Apr 15, 2024
d88a78e
Squashed commit of the following:
endast Apr 15, 2024
a2350b3
Merge branch 'main' into feature/wgs-preprocessing
endast Apr 16, 2024
0235084
Update figures in doc
endast Apr 16, 2024
781d973
Remove white space
endast Apr 16, 2024
14f264f
update graphs
endast Apr 16, 2024
4d15082
update cache version
endast Apr 16, 2024
41 changes: 37 additions & 4 deletions .github/workflows/github-actions.yml
@@ -1,6 +1,6 @@
name: DeepRVAT
run-name: DeepRVAT 🧬🧪💻🧑‍🔬
on: [ push ]
on: [push]

jobs:
DeepRVAT-Pipeline-Smoke-Tests:
@@ -77,7 +77,6 @@ jobs:
--snakefile ${{ github.workspace }}/pipelines/seed_gene_discovery.snakefile --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Smoke-Tests:
runs-on: ubuntu-latest
steps:
@@ -143,6 +142,10 @@ jobs:
cache-environment: true
cache-downloads: true

- name: Install biotoolbox
run: cpanm Bio::ToolBox@1.691 --force
shell: micromamba-shell {0}

- name: Install DeepRVAT
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}
@@ -162,19 +165,31 @@ jobs:
-O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \
&& gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz

- name: Cache GTF file
id: cache-gtf
uses: actions/cache@v4
with:
path: example/preprocess/workdir/reference
key: ${{ runner.os }}-reference-gtf

- name: Download gtf data
if: steps.cache-gtf.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && \
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz \
-O workdir/reference/gencode.v44.annotation.gtf.gz

- name: Run preprocessing pipeline
run: |
python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \
--snakefile ${{ github.workspace }}/pipelines/preprocess_no_qc.snakefile \
--configfile ${{ github.workspace }}/pipelines/config/deeprvat_preprocess_config.yaml --show-failed-logs
shell: micromamba-shell {0}


DeepRVAT-Preprocessing-Pipeline-Tests-With-QC:
runs-on: ubuntu-latest
needs: DeepRVAT-Preprocessing-Pipeline-Smoke-Tests
steps:

- name: Check out repository code
uses: actions/checkout@v4
- uses: mamba-org/setup-micromamba@v1.8.0
@@ -184,6 +199,10 @@ jobs:
cache-environment: true
cache-downloads: true

- name: Install biotoolbox
run: cpanm Bio::ToolBox@1.691 --force
shell: micromamba-shell {0}

- name: Install DeepRVAT
run: pip install -e ${{ github.workspace }}
shell: micromamba-shell {0}
@@ -203,6 +222,20 @@ jobs:
-O workdir/reference/GRCh38.primary_assembly.genome.fa.gz \
&& gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz

- name: Cache GTF file
id: cache-gtf
uses: actions/cache@v4
with:
path: example/preprocess/workdir/reference
key: ${{ runner.os }}-reference-gtf

- name: Download gtf data
if: steps.cache-gtf.outputs.cache-hit != 'true'
run: |
cd ${{ github.workspace }}/example/preprocess && \
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz \
-O workdir/reference/gencode.v44.annotation.gtf.gz

- name: Run preprocessing pipeline
run: |
python -m snakemake -j 2 --directory ${{ github.workspace }}/example/preprocess \
4 changes: 4 additions & 0 deletions deeprvat_preprocessing_env.yml
@@ -14,3 +14,7 @@ dependencies:
- snakemake=7.17.1
- bcftools=1.17
- samtools=1.17
- bedtools=2.31.1
- perl-app-cpanminus=1.7047
- bedops=2.4.41
- gcc=13.2.0
152 changes: 97 additions & 55 deletions docs/_static/preprocess_rulegraph_no_qc.svg
346 changes: 194 additions & 152 deletions docs/_static/preprocess_rulegraph_with_qc.svg
34 changes: 27 additions & 7 deletions docs/preprocessing.md
@@ -72,16 +72,23 @@ sparse_dir_name : sparse

# Expected to be found in working_dir/reference_dir
reference_fasta_file : GRCh38.primary_assembly.genome.fa
gtf_file : gencode.v44.annotation.gtf.gz

# Max memory used by convert2bed
convert2bed_max_mem: 64G

# Expand each BED entry by the same number of base pairs in each direction
region_expand: 3000

# You can specify a different zcat cmd for example gzcat here, default zcat
zcat_cmd:
```


The config above would use the following directory structure:

```shell
parent_directory
`-- workdir
-- workdir
|-- norm
| |-- bcf
| |-- sparse
@@ -151,13 +158,19 @@ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38
gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz
```

4. Run with the example config
4. Download the gtf file

```shell
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz -P workdir/reference
```

5. Run with the example config

```shell
snakemake -j 1 --snakefile ../../pipelines/preprocess_with_qc.snakefile --configfile ../../pipelines/config/deeprvat_preprocess_config.yaml
```

5. Enjoy the preprocessed data 🎉
6. Enjoy the preprocessed data 🎉

```shell
ls -l workdir/preprocesed
@@ -195,13 +208,20 @@ wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/GRCh38
gzip -d workdir/reference/GRCh38.primary_assembly.genome.fa.gz
```

4. Run with the example config
5. Download the gtf file

```shell
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_44/gencode.v44.annotation.gtf.gz -P workdir/reference
```


6. Run with the example config

```shell
snakemake -j 1 --snakefile ../../pipelines/preprocess_no_qc.snakefile --configfile ../../pipelines/config/deeprvat_preprocess_config.yaml
```

5. Enjoy the preprocessed data 🎉
7. Enjoy the preprocessed data 🎉

```shell
ls -l workdir/preprocesed
Binary file modified example/preprocess/data/vcf/test_vcf_data_c21_b1.vcf.gz
Binary file not shown.
Binary file not shown.
Binary file modified example/preprocess/data/vcf/test_vcf_data_c22_b1.vcf.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
7 changes: 7 additions & 0 deletions pipelines/config/deeprvat_preprocess_config.yaml
@@ -24,6 +24,13 @@ sparse_dir_name : sparse

# Expected to be found in working_dir/reference_dir
reference_fasta_file : GRCh38.primary_assembly.genome.fa
gtf_file : gencode.v44.annotation.gtf.gz

# Max memory used by convert2bed
convert2bed_max_mem: 64G

# Expand each BED entry by the same number of base pairs in each direction
region_expand: 3000

# You can specify a different zcat cmd for example gzcat here, default zcat
zcat_cmd:
45 changes: 44 additions & 1 deletion pipelines/preprocessing/preprocess.snakefile
@@ -35,6 +35,13 @@ qc_allelic_imbalance_dir = qc_dir / "allelic_imbalance"
qc_duplicate_vars_dir = qc_dir / "duplicate_vars"
qc_filtered_samples_dir = qc_dir / "filtered_samples"

gtf_workdir = working_dir / "gtf"

gtf_file = reference_dir / config["gtf_file"]
gtf_filtered_file = gtf_workdir / f"{gtf_file.stem}_filtered_genes.gtf"
bed_file = gtf_workdir / f"{gtf_file.stem}_filtered_genes.bed"
expanded_bed = gtf_workdir / f"{gtf_file.stem}_filtered_expanded_regions.bed"
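The derived filenames above rely on `pathlib.Path.stem`, which strips only the final suffix — so for a `.gtf.gz` input the inner `.gtf` part survives in the generated names. A minimal sketch of what these expressions produce (the path is illustrative, not taken from the repo):

```python
from pathlib import Path

# Path.stem removes only the last suffix, so a double extension
# like .gtf.gz keeps its inner .gtf part in the stem.
gtf_file = Path("reference/gencode.v44.annotation.gtf.gz")

gtf_filtered_file = f"{gtf_file.stem}_filtered_genes.gtf"
expanded_bed = f"{gtf_file.stem}_filtered_expanded_regions.bed"

print(gtf_filtered_file)  # gencode.v44.annotation.gtf_filtered_genes.gtf
print(expanded_bed)       # gencode.v44.annotation.gtf_filtered_expanded_regions.bed
```

This is why the intermediate files keep `.gtf` embedded in their names rather than the bare `gencode.v44.annotation` prefix.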

vcf_stems, vcf_files, vcf_look_up = deeprvat_preprocess.parse_file_path_list(config["vcf_files_list"])
chromosomes = config["included_chromosomes"]

@@ -56,12 +63,48 @@ rule normalize:
samplefile=norm_dir / "samples_chr.csv",
fasta=fasta_file,
fastaindex=fasta_index_file,
expanded_bed=expanded_bed,
params:
vcf_file=lambda wildcards: vcf_look_up[wildcards.vcf_stem],
output:
bcf_file=bcf_dir / "{vcf_stem}.bcf",
shell:
f"""{load_bcftools} bcftools view --samples-file {{input.samplefile}} --output-type u {{params.vcf_file}} | bcftools view --include 'COUNT(GT="alt") > 0' --output-type u | bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}"""
f"""{load_bcftools} bcftools view -R "{{input.expanded_bed}}" "{{params.vcf_file}}" --output-type u \
| bcftools view --samples-file {{input.samplefile}} --output-type u \
| bcftools view --include 'COUNT(GT="alt") > 0' --output-type u \
| bcftools norm -m-both -f {{input.fasta}} --output-type b --output {{output.bcf_file}}"""

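The third stage of the pipe, `--include 'COUNT(GT="alt") > 0'`, drops variants that no remaining sample actually carries after subsetting. A rough Python sketch of that predicate, with genotypes simplified to `0/0`-style strings (the helper name and records are hypothetical):

```python
# Sketch of bcftools' COUNT(GT="alt") > 0 filter: keep a variant only
# if at least one sample genotype contains a non-reference allele.

def has_alt_carrier(genotypes):
    """Return True if any genotype carries an alternate allele."""
    for gt in genotypes:
        alleles = gt.replace("|", "/").split("/")
        # "0" is the reference allele, "." is a missing call
        if any(a not in ("0", ".") for a in alleles):
            return True
    return False

records = {
    "chr21:5030088": ["0/0", "0/1", "0/0"],  # kept: one het carrier
    "chr21:5030105": ["0/0", "0/0", "./."],  # dropped: no alt allele
}
kept = {pos: gts for pos, gts in records.items() if has_alt_carrier(gts)}
print(list(kept))  # ['chr21:5030088']
```

Running the region and sample subsetting first means this count is evaluated only over the samples and regions that survive, which is what makes the filter order in the rule matter.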

rule filter_gtf:
input:
gtf_file,
output:
gtf_filtered_file,
shell:
'get_features.pl --in "{input}" --out "{output}" --include "gene_type=protein_coding" --feature "gene" --gtf'

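The `get_features.pl` call above keeps only `gene` features annotated as `protein_coding`. A hedged, pure-Python approximation of that selection on raw GENCODE GTF lines (the function name and example lines are illustrative, not from the repo):

```python
# Approximation of get_features.pl --feature gene --include gene_type=protein_coding:
# keep tab-separated GTF lines whose 3rd field is "gene" and whose
# attribute column contains gene_type "protein_coding".

def filter_protein_coding_genes(gtf_lines):
    kept = []
    for line in gtf_lines:
        if line.startswith("#"):  # skip GTF header/comment lines
            continue
        fields = line.rstrip("\n").split("\t")
        feature, attributes = fields[2], fields[8]
        if feature == "gene" and 'gene_type "protein_coding"' in attributes:
            kept.append(line)
    return kept

example = [
    "##provider: GENCODE",
    'chr21\tHAVANA\tgene\t5011799\t5017145\t.\t+\t.\tgene_type "lncRNA";',
    'chr21\tHAVANA\tgene\t5116343\t5124642\t.\t+\t.\tgene_type "protein_coding";',
    'chr21\tHAVANA\texon\t5116343\t5117231\t.\t+\t.\tgene_type "protein_coding";',
]
print(len(filter_protein_coding_genes(example)))  # 1
```

Note that exon lines of protein-coding genes are also excluded: only the enclosing `gene` records are wanted, since the next rule converts them to whole-gene BED intervals.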

rule create_bed:
input:
gtf_filtered_file,
output:
bed_file
params:
maxmem=config["convert2bed_max_mem"]
shell:
'convert2bed --max-mem={params.maxmem} --input=gtf --output=bed < "{input}" > "{output}"'

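The coordinate change `convert2bed` performs here is small but easy to get wrong: GTF intervals are 1-based and end-inclusive, BED intervals are 0-based and end-exclusive, so only the start shifts by one. A minimal sketch (function name is illustrative):

```python
# GTF -> BED coordinate conversion in miniature: GTF is 1-based,
# end-inclusive; BED is 0-based, end-exclusive. The end value is
# numerically unchanged; only the start decrements.

def gtf_to_bed(chrom, start_1based, end_1based, name="."):
    return (chrom, start_1based - 1, end_1based, name)

# A 100 bp gene at GTF positions 1..100 becomes the BED interval [0, 100).
print(gtf_to_bed("chr21", 1, 100))  # ('chr21', 0, 100, '.')
```

Getting this off by one would silently shift every region handed to `bcftools view -R`, which is why delegating it to `convert2bed` rather than hand-rolling it is the safer choice.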

rule expand_regions:
input:
bed=bed_file,
faidx=fasta_index_file,
params:
region_expand=config["region_expand"],
output:
expanded_bed
shell:
'bedtools slop -i "{input.bed}" -g "{input.faidx}" -b {params.region_expand} > "{output}"'

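`bedtools slop -b` widens each interval by `region_expand` base pairs on both sides, using the FASTA index (`.fai`) to clamp results to chromosome bounds. A sketch of that behavior under the assumption that the index is reduced to a chromosome-to-length mapping (chromosome size below is illustrative):

```python
# Sketch of bedtools slop -b: expand each BED interval by `expand` bp
# in both directions, clamped to [0, chromosome length]. Chromosome
# lengths come from the FASTA .fai index in the real pipeline.

def slop(intervals, chrom_sizes, expand):
    out = []
    for chrom, start, end in intervals:
        length = chrom_sizes[chrom]
        out.append((chrom, max(0, start - expand), min(length, end + expand)))
    return out

chrom_sizes = {"chr21": 46_709_983}  # assumed length, stands in for the .fai
bed = [("chr21", 1000, 2000), ("chr21", 46_709_000, 46_709_983)]
print(slop(bed, chrom_sizes, 3000))
# [('chr21', 0, 5000), ('chr21', 46706000, 46709983)]
```

The clamping is the reason the rule needs `fasta_index_file` as an input: without the chromosome lengths, expansion near contig ends would produce invalid coordinates.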

rule index_fasta: