Merge pull request #15 from sigven/dev

Coding only
sigven · Sep 26, 2022 · 9f48483 · 9f48483
2 parents 67a4935 + 60681c7
commit 9f48483
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@ The germline variant annotator (*gvanno*) is a software package intended for ana
 *gvanno* accepts query files encoded in the VCF format, and can analyze both SNVs and short InDels. The workflow relies heavily upon [Ensembl’s Variant Effect Predictor (VEP)](http://www.ensembl.org/info/docs/tools/vep/index.html), and [vcfanno](https://github.com/brentp/vcfanno). It produces an annotated VCF file and a file of tab-separated values (.tsv), the latter listing all annotations pr. variant record. Note that if your input VCF contains data (genotypes) from multiple samples (i.e. a multisample VCF), the output TSV file will contain one line/record __per sample variant__.
 
 ### News
+* September 26th 2022 - **1.5.1 release**
+  * Added option `--vep_coding_only` - only report variants that fall into coding regions of transcripts (VEP option `--coding_only`)
 * September 24th 2022 - **1.5.0 release**
   * Data updates: ClinVar, GENCODE GWAS catalog, CancerMine, Open Targets Platform
   * Software updates: VEP 107
@@ -25,7 +27,7 @@ The germline variant annotator (*gvanno*) is a software package intended for ana
 * August 25th 2021 - **1.4.3 release**
 	* Data updates: ClinVar, GWAS catalog, CancerMine, UniProt, Open Targets Platform
 
-### Annotation resources (v1.5.0)
+### Annotation resources (v1.5.1)
 
 * [VEP](http://www.ensembl.org/info/docs/tools/vep/index.html) - Variant Effect Predictor v107 (GENCODE v41/v19 as the gene reference dataset)
 * [dBNSFP](https://sites.google.com/site/jpopgen/dbNSFP) - Database of non-synonymous functional predictions (v4.2, March 2021)
@@ -71,17 +73,17 @@ An installation of Python (version >=_3.6_) is required to run *gvanno*. Check t
 
 #### STEP 2: Download *gvanno* and data bundle
 
-1. [Download the latest version](https://github.com/sigven/gvanno/releases/tag/v1.5.0) (gvanno run script, v1.5.0)
+1. [Download the latest version](https://github.com/sigven/gvanno/releases/tag/v1.5.1) (gvanno run script, v1.5.1)
 2. Download (preferably using `wget`) and unpack the latest assembly-specific data bundle in the gvanno directory
    * [grch37 data bundle](http://insilico.hpc.uio.no/pcgr/gvanno/gvanno.databundle.grch37.20220921.tgz) (approx 20Gb)
    * [grch38 data bundle](http://insilico.hpc.uio.no/pcgr/gvanno/gvanno.databundle.grch38.20220921.tgz) (approx 28Gb)
    * Example commands:
 	* `wget http://insilico.hpc.uio.no/pcgr/gvanno/gvanno.databundle.grch37.20220921.tgz`
 	* `gzip -dc gvanno.databundle.grch37.YYYYMMDD.tgz | tar xvf -`
 
-    A _data/_ folder within the _gvanno-1.5.0_ software folder should now have been produced
-3. Pull the [gvanno Docker image (1.5.0)](https://hub.docker.com/r/sigven/gvanno/) from DockerHub (approx 2.2Gb):
-   * `docker pull sigven/gvanno:1.5.0` (gvanno annotation engine)
+    A _data/_ folder within the _gvanno-1.5.1_ software folder should now have been produced
+3. Pull the [gvanno Docker image (1.5.1)](https://hub.docker.com/r/sigven/gvanno/) from DockerHub (approx 2.2Gb):
+   * `docker pull sigven/gvanno:1.5.1` (gvanno annotation engine)
 
 #### STEP 3: Input preprocessing
 
@@ -110,7 +112,7 @@ Run the workflow with **gvanno.py**, which takes the following arguments and opt
 	--query_vcf QUERY_VCF
 				    VCF input file with germline query variants (SNVs/InDels).
 	--gvanno_dir GVANNO_DIR
-				    Directory that contains the gvanno data bundle, e.g. ~/gvanno-1.5.0
+				    Directory that contains the gvanno data bundle, e.g. ~/gvanno-1.5.1
 	--output_dir OUTPUT_DIR
 				    Output directory
 	--genome_assembly {grch37,grch38}
@@ -134,6 +136,8 @@ Run the workflow with **gvanno.py**, which takes the following arguments and opt
 				    Variant Effect Predictor (VEP) processing, default: canonical,appris,biotype,ccds,rank,tsl,length,mane
 	--vep_skip_intergenic
 				    Skip intergenic variants in Variant Effect Predictor (VEP) processing, default: False
+	--vep_coding_only
+	          Only report variants falling into coding regions of transcripts (VEP), default: False
 
 	Other optional arguments:
 	--force_overwrite     By default, the script will fail with an error if any output file already exists.
@@ -147,10 +151,10 @@ Run the workflow with **gvanno.py**, which takes the following arguments and opt
 
 The _examples_ folder contains an example VCF file. Analysis of the example VCF can be performed by the following command:
 
-	python ~/gvanno-1.5.0/gvanno.py
-	--query_vcf ~/gvanno-1.5.0/examples/example.grch37.vcf.gz
-	--gvanno_dir ~/gvanno-1.5.0
-	--output_dir ~/gvanno-1.5.0
+	python ~/gvanno-1.5.1/gvanno.py
+	--query_vcf ~/gvanno-1.5.1/examples/example.grch37.vcf.gz
+	--gvanno_dir ~/gvanno-1.5.1
+	--output_dir ~/gvanno-1.5.1
 	--sample_id example
 	--genome_assembly grch37
 	--container docker

diff --git a/gvanno.py b/gvanno.py
@@ -11,7 +11,7 @@
 import platform
 from argparse import RawTextHelpFormatter
 
-GVANNO_VERSION = '1.5.0'
+GVANNO_VERSION = '1.5.1'
 DB_VERSION = 'GVANNO_DB_VERSION = 20220921'
 VEP_VERSION = '107'
 GENCODE_VERSION = 'v41'
@@ -37,16 +37,17 @@ def __main__():
    optional.add_argument('--version', action='version', version='%(prog)s ' + str(GVANNO_VERSION))
    optional.add_argument('--no_vcf_validate', action = "store_true",help="Skip validation of input VCF with Ensembl's vcf-validator, default: %(default)s")
    optional.add_argument('--docker_uid', dest = 'docker_user_id', help = 'Docker user ID. default is the host system user ID. If you are experiencing permission errors, try setting this up to root (`--docker-uid root`)')
-   optional_vep.add_argument('--vep_regulatory', action='store_true', help = 'Enable Variant Effect Predictor (VEP) to look for overlap with regulatory regions (option --regulatory in VEP).')
-   optional_vep.add_argument('--vep_gencode_all', action='store_true', help = 'Consider all GENCODE transcripts with Variant Effect Predictor (VEP) (option --gencode_basic in VEP is used by default in gvanno).')
+   optional_vep.add_argument('--vep_regulatory', action='store_true', help = 'Enable VEP to look for overlap with regulatory regions (option --regulatory in VEP).')
+   optional_vep.add_argument('--vep_gencode_all', action='store_true', help = 'Consider all GENCODE transcripts with VEP (option --gencode_basic in VEP is used by default in gvanno).')
    optional_vep.add_argument('--vep_lof_prediction', action = "store_true", help = "Predict loss-of-function variants with Loftee plugin " + \
-      "in Variant Effect Predictor (VEP), default: %(default)s")
-   optional_vep.add_argument('--vep_n_forks', default = 4, help="Number of forks for Variant Effect Predictor (VEP) processing, default: %(default)s")
+      "in VEP, default: %(default)s")
+   optional_vep.add_argument('--vep_n_forks', default = 4, help="Number of forks for VEP processing, default: %(default)s")
    optional_vep.add_argument('--vep_buffer_size', default = 500, help="Variant buffer size (variants read into memory simultaneously) " + \
-      "for Variant Effect Predictor (VEP) processing\n- set lower to reduce memory usage, higher to increase speed, default: %(default)s")
+      "for VEP processing\n- set lower to reduce memory usage, higher to increase speed, default: %(default)s")
    optional_vep.add_argument('--vep_pick_order', default = "canonical,appris,biotype,ccds,rank,tsl,length,mane", help="Comma-separated string " + \
-      "of ordered transcript properties for primary variant pick in\nVariant Effect Predictor (VEP) processing, default: %(default)s")
-   optional_vep.add_argument('--vep_skip_intergenic', action = "store_true", help="Skip intergenic variants in Variant Effect Predictor (VEP) processing, default: %(default)s")
+      "of ordered transcript properties for primary variant pick in\nVEP processing, default: %(default)s")
+   optional_vep.add_argument('--vep_skip_intergenic', action = "store_true", help="Skip intergenic variants (VEP), default: %(default)s")
+   optional_vep.add_argument('--vep_coding_only', action = "store_true", help="Only return consequences that fall in the coding regions of transcripts (VEP), default: %(default)s")
    optional.add_argument('--vcfanno_n_processes', default = 4, help="Number of processes for vcfanno " + \
       "processing (see https://github.com/brentp/vcfanno#-p), default: %(default)s")
    required.add_argument('--query_vcf', help='VCF input file with germline query variants (SNVs/InDels).', required = True)
@@ -347,6 +348,8 @@ def run_gvanno(arg_dict, host_directories):
          gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"
       if arg_dict['vep_skip_intergenic'] == 1:
          vep_options = vep_options + " --no_intergenic"
+      if arg_dict['vep_coding_only'] == 1:
+         vep_options = vep_options + " --coding_only"
       if arg_dict['vep_regulatory'] == 1:
          vep_options = vep_options + " --regulatory"
       if arg_dict['vep_lof_prediction'] == 1:
@@ -362,13 +365,14 @@ def run_gvanno(arg_dict, host_directories):
       ## GVANNO|VEP - run consequence annotation with Variant Effect Predictor
       logger = getlogger('gvanno-vep')   
       print()
-      logger.info("STEP 1: Basic variant annotation with Variant Effect Predictor (" + str(VEP_VERSION) + ", GENCODE " + str(GENCODE_VERSION) + ", " + str(arg_dict['genome_assembly']) + ")")
-      logger.info("VEP configuration - one primary consequence block pr. alternative allele (--flack_pick_allele)")
+      logger.info("STEP 1: Basic variant annotation with Variant Effect Predictor (v" + str(VEP_VERSION) + ", GENCODE " + str(GENCODE_VERSION) + ", " + str(arg_dict['genome_assembly']) + ")")
+      logger.info("VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)")
       logger.info("VEP configuration - transcript pick order: " + str(arg_dict['vep_pick_order']))
       logger.info("VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options")
       logger.info("VEP configuration - GENCODE set: " + str(gencode_set_in_use))
       logger.info("VEP configuration - buffer size: " + str(arg_dict['vep_buffer_size']))
       logger.info("VEP configuration - skip intergenic: " + str(arg_dict['vep_skip_intergenic']))
+      logger.info("VEP configuration - coding only: " + str(arg_dict['vep_coding_only']))
       logger.info("VEP configuration - look for overlap with regulatory regions: " + str(arg_dict['vep_regulatory']))
       logger.info("VEP configuration - number of forks: " + str(arg_dict['vep_n_forks']))
       logger.info("VEP configuration - loss-of-function prediction: " + str(arg_dict['vep_lof_prediction']))

diff --git a/src/gvanno.tgz b/src/gvanno.tgz
diff --git a/src/gvanno/gvanno_summarise.py b/src/gvanno/gvanno_summarise.py
@@ -58,6 +58,7 @@ def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction = 0, r
    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
+   num_records_filtered = 0
 
    vcf_info_element_types = {}
    for e in vcf.header_iter():
@@ -77,10 +78,11 @@ def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction = 0, r
             current_chrom = str(rec.CHROM)
             num_chromosome_records_processed = 0
       if rec.INFO.get('CSQ') is None:
-         alt_allele = ','.join(rec.ALT)
-         pos = rec.start + 1
-         variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
-         logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped')
+         num_records_filtered = num_records_filtered + 1
+         #alt_allele = ','.join(rec.ALT)
+         #pos = rec.start + 1
+         #variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(rec.REF) + '>' + alt_allele
+         #logger.warning('Variant record ' + str(variant_id) + ' does not have CSQ tag from Variant Effect Predictor (--vep_skip_intergenic or --vep_coding_only turned ON?)  - variant will be skipped')
          continue
       num_chromosome_records_processed += 1
       gvanno_xref = annoutils.make_transcript_xref_map(rec, gvanno_xref_map, xref_tag = "GVANNO_XREF")
@@ -116,6 +118,7 @@ def extend_vcf_annotations(query_vcf, gvanno_db_directory, lof_prediction = 0, r
    w.close()
    logger.info('Completed summary of functional annotations for ' + str(num_chromosome_records_processed) + ' variants on chromosome ' + str(current_chrom))
    vcf.close()
+   logger.info("Number of variant calls filtered by VEP (No CSQ tag, '--vep_coding_only' / '--vep_skip_intergenic'): " + str(num_records_filtered))
 
    if os.path.exists(out_vcf):
       if os.path.getsize(out_vcf) > 0: