# config_ml-jg.yaml

# Configuration file for SCVUCv4

# Author: Teresita M. Porter
# Date: March 2, 2020

############################################################################
# General pipeline settings
############################################################################

# Specify the number of available cores, the snakefile, and this configuration file on the command line, e.g.:
# snakemake --cores 24 --snakefile snakefile --configfile config_ml-jg.yaml

############################################################################
# Identify raw read files
############################################################################

# Directory holding the raw reads (either all of them or those from a single run).
# The pipeline expects compressed FASTQ input only, e.g. *.fastq.gz
raw: 'data'

# Filename pattern for the raw reads, using the 'sample' and 'read' wildcards.
# The files are expected to be in a "data" folder.
# Example filename, followed by the pattern that matches it:
#   SITE-CONDITION-REPLICATE_S1_L001_R1_001.fastq.gz
#   {sample}_L001_R{read}_001.fastq.gz
raw_sample_read_wildcards: 'data/{sample}_L001_R{read}_001.fastq.gz'

# Per-sample forward (R1) and reverse (R2) read patterns for SEQPREP.
# The files are expected to be in a "data" folder.
# Example pattern:
#   {sample}_L001_R1_001.fastq.gz
raw_sample_forward_wildcard: 'data/{sample}_L001_R1_001.fastq.gz'
raw_sample_reverse_wildcard: 'data/{sample}_L001_R2_001.fastq.gz'

############################################################################
# Raw read pairing
############################################################################

SEQPREP:
    # Phred quality score cutoff
    q: 20
    # Minimum length of the overlap between forward and reverse reads
    o: 25

############################################################################
# Primer trimming
############################################################################

# CUTADAPT parameters for the COI-ml-jg amplicon.
# Forward primer: 'mlCOIintF' (Leray et al., 2013).
# Reverse primer: 'jgHCO2198' (Geller et al., 2013), given as its reverse complement.
# NOTE(review): g/m/q/mn presumably correspond to cutadapt's -g, -m, -q and
# --max-n options -- confirm against the snakefile.
CUTADAPT_FWD:
    g: 'GGWACWGGWTGAACWGTWTAYCCYCC'
    m: 150
    q: '20,20'
    mn: 3

# Reverse-primer trimming; the 'jgHCO2198' sequence is given as its reverse complement.
# NOTE(review): a/m/q/mn presumably correspond to cutadapt's -a, -m, -q and
# --max-n options -- confirm against the snakefile.
CUTADAPT_REV:
    a: 'TGRTTYTTYGGNCAYCCNGARGTNTA'
    m: 150
    q: '20,20'
    mn: 3

############################################################################
# Dereplication
############################################################################

# Directory name: keep it short and simple, with no spaces or unusual punctuation.
# The amplicon name is a good choice, e.g. "BR5".
dir: 'ml-jg'

############################################################################
# Denoising
############################################################################

# Minimum number of reads a cluster must contain to be retained.
# With a value of 3, all singletons and doubletons are removed.
VSEARCH_DENOISE:
    minsize: 3

############################################################################
# ESV x sample table
############################################################################

# VSEARCH parameters for building the ESV x sample table
VSEARCH_TABLE:
    # Number of cores to use.
    # Must not exceed the number of cores allotted to the whole pipeline
    # (the '--cores' value on the snakemake command line).
    t: 24


############################################################################
# Taxonomic assignment
############################################################################

# Taxonomic assignment uses the RDP classifier.
# Do not use the old RDP classifier v2.2 from conda; install the newer v2.12
# from SourceForge:
#   https://sourceforge.net/projects/rdp-classifier/
# COI Classifier v4, built from sequences mined from GenBank and BOLD and
# compatible with the RDP classifier, is available from GitHub:
#   https://github.com/Hajibabaei-Lab/SCVUC_COI_metabarcode_pipeline
RDP:
    # Path to the RDP classifier jar
    jar: '/path/to/rdp_classifier_2.12/dist/classifier.jar'
    # Path to the trained classifier's properties file
    t: '/path/to/CO1Classifier/v4/NCBI_BOLD_merged/mydata/mydata_trained/rRNAClassifier.properties'

############################################################################
# Filter out pseudogenes
############################################################################

# Target a single taxonomic group, e.g. Arthropoda
taxon: 'Arthropoda'

# Path to the HMM profile
# NOTE(review): this is a bare filename; presumably it is resolved relative to
# the directory snakemake is run from -- confirm.
hmm: 'bold.hmm'

# ORFfinder parameters: translate ESVs into all open reading frames
ORFFINDER:
    # Genetic code
    #   5 = invertebrate mitochondrial; see NCBI for additional genetic codes
    g: 5

    # ORF start codon to use
    #   0 = ATG only
    #   1 = ATG and alternative initiation codon (default)
    #   2 = any sense codon
    s: 2

    # Minimum ORF length (default 75, min 30)
    ml: 30

    # Ignore nested ORFs (true|false)
    # Kept as a quoted string so the value stays the literal text 'true'
    # rather than being parsed as a YAML boolean.
    n: 'true'

    # Strand to search (both|plus|minus)
    strand: 'plus'

############################################################################
# Reformat CSV
############################################################################

# Add the amplicon name to each Zotu id so the ids stay unique when data from many amplicons are combined
# The pattern prefixes the Zotu with the amplicon name
# Example: sed -e 's/^/amplicon_/g' infile > outfile
# Below, enter only the substitution pattern for sed to use (the part in single quotes above)

SED: 's/^/ml-jg_/g'