-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfig_ml-jg.yaml
148 lines (112 loc) · 4.97 KB
/
config_ml-jg.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Configuration file for SCVUCv4
# Author: Teresita M. Porter
# Date: March 2, 2020
############################################################################
# General pipeline settings
############################################################################
# Specify the number of available cores, the snakefile, and the configuration file on the command line, e.g.:
# snakemake --cores 24 --snakefile snakefile --configfile config.yaml
############################################################################
# Identify raw read files
############################################################################
# Folder holding the raw reads (every run, or just a single run).
# Input must be compressed FASTQ, e.g. fastq.gz
raw: 'data'
# Filename template carrying the 'sample' and 'read' wildcards.
# Raw files are expected inside the "data" folder.
# Example raw filename:
#   SITE-CONDITION-REPLICATE_S1_L001_R1_001.fastq.gz
# which corresponds to the template:
#   {sample}_L001_R{read}_001.fastq.gz
raw_sample_read_wildcards: 'data/{sample}_L001_R{read}_001.fastq.gz'
# Forward (R1) and reverse (R2) filename templates for SEQPREP.
# Raw files are expected inside the "data" folder.
# Example:
#   {sample}_L001_R1_001.fastq.gz
raw_sample_forward_wildcard: 'data/{sample}_L001_R1_001.fastq.gz'
raw_sample_reverse_wildcard: 'data/{sample}_L001_R2_001.fastq.gz'
############################################################################
# Raw read pairing
############################################################################
# NOTE: the keys below must stay indented under SEQPREP. At column 0 they
# become top-level keys, SEQPREP itself parses as null, and `q` clashes with
# the `q` keys of the CUTADAPT sections.
SEQPREP:
  # Phred score quality cutoff
  q: 20
  # Minimum overlap length between forward and reverse reads
  o: 25
############################################################################
# Primer trimming
############################################################################
# CUTADAPT parameters for the COI-ml-jg amplicon
# FWD primer is 'mlCOIintF' from Leray et al., 2013
# REV primer is 'jgHCO2198' from Geller et al., 2013 (reverse complement)
# Forward-primer trimming settings.
# NOTE: the keys below must stay indented under CUTADAPT_FWD; at column 0
# they would be duplicate top-level keys shared with other sections.
CUTADAPT_FWD:
  # Forward primer sequence (IUPAC ambiguity codes)
  # NOTE(review): presumably passed to cutadapt as -g -- confirm in snakefile
  g: "GGWACWGGWTGAACWGTWTAYCCYCC"
  # NOTE(review): presumably cutadapt -m (minimum read length kept) -- confirm in snakefile
  m: 150
  # NOTE(review): presumably cutadapt -q quality cutoffs; quoted so the
  # comma-separated pair stays a single string
  q: "20,20"
  # NOTE(review): presumably cutadapt --max-n (maximum N bases allowed) -- confirm in snakefile
  mn: 3
# Reverse-primer trimming settings.
# NOTE: the keys below must stay indented under CUTADAPT_REV; at column 0
# they would be duplicate top-level keys shared with other sections.
CUTADAPT_REV:
  # Reverse primer sequence, given as its reverse complement (IUPAC codes)
  # NOTE(review): presumably passed to cutadapt as -a -- confirm in snakefile
  a: "TGRTTYTTYGGNCAYCCNGARGTNTA"
  # NOTE(review): presumably cutadapt -m (minimum read length kept) -- confirm in snakefile
  m: 150
  # NOTE(review): presumably cutadapt -q quality cutoffs; quoted so the
  # comma-separated pair stays a single string
  q: "20,20"
  # NOTE(review): presumably cutadapt --max-n (maximum N bases allowed) -- confirm in snakefile
  mn: 3
############################################################################
# Dereplication
############################################################################
# Directory name: keep it short and simple, with no spaces or unusual
# punctuation. The amplicon name is a good choice, e.g. "BR5".
dir: 'ml-jg'
############################################################################
# Denoising
############################################################################
# Indicate minimum number of reads per cluster to retain
# Here, remove all singletons and doubletons, retain clusters with 3+ reads
# NOTE: `minsize` must stay indented under VSEARCH_DENOISE; at column 0 it
# becomes a top-level key and VSEARCH_DENOISE itself parses as null.
VSEARCH_DENOISE:
  # Minimum number of reads a cluster needs to be retained
  minsize: 3
############################################################################
# ESV x sample table
############################################################################
# VSEARCH params
# NOTE: `t` must stay indented under VSEARCH_TABLE; at column 0 it becomes a
# top-level key that clashes with the `t` key of the RDP section.
VSEARCH_TABLE:
  # Indicate number of cores to use
  # Do not exceed the number of cores allotted to run the whole pipeline ('--cores' above)
  t: 24
############################################################################
# Taxonomic assignment
############################################################################
# Uses the RDP classifier
# Do not use the old RDP classifier v2.2 from conda; install the newer v2.12 from SourceForge: https://sourceforge.net/projects/rdp-classifier/
# COI Classifier v4, built from sequences mined from GenBank and BOLD and compatible with the RDP classifier, is available from GitHub: https://github.com/Hajibabaei-Lab/SCVUC_COI_metabarcode_pipeline
# NOTE: the keys below must stay indented under RDP; at column 0 `t` clashes
# with the `t` key of the VSEARCH_TABLE section.
# Both values are "/path/to/..." placeholders and need to be replaced with
# the real locations on the local system.
RDP:
  # Path to the RDP classifier jar file
  jar: "/path/to/rdp_classifier_2.12/dist/classifier.jar"
  # Path to the rRNAClassifier.properties file of the trained COI classifier
  t: "/path/to/CO1Classifier/v4/NCBI_BOLD_merged/mydata/mydata_trained/rRNAClassifier.properties"
############################################################################
# Filter out pseudogenes
############################################################################
# Target a single taxonomic group for pseudogene filtering, e.g. Arthropoda
taxon: 'Arthropoda'
# Path to the HMM profile
# NOTE(review): a bare filename is presumably resolved relative to the
# directory snakemake is run from -- confirm
hmm: 'bold.hmm'
# Translate ESVs into all open reading frames
# ORFfinder params
# NOTE: the keys below must stay indented under ORFFINDER. At column 0 they
# become top-level keys, ORFFINDER itself parses as null, and `g` clashes
# with the primer key `g` of CUTADAPT_FWD.
ORFFINDER:
  # genetic code
  # 5 = invertebrate mitochondrial, see NCBI for additional genetic codes
  g: 5
  # ORF start codon to use
  # 0 = ATG only
  # 1 = ATG and alternative initiation codon (default)
  # 2 = any sense codon
  s: 2
  # minimum length (default 75, min 30)
  ml: 30
  # ignore nested ORFs (true|false)
  # quoted so the value stays the string 'true' instead of a YAML boolean
  n: 'true'
  # strand (both|plus|minus)
  strand: 'plus'
############################################################################
# Reformat CSV
############################################################################
# Prefix each Zotu id with the amplicon name so the ids stay unique when
# data from many amplicons are combined.
# Example: sed -e 's/^/amplicon_/g' infile > outfile
# Below, enter the substitution expression for sed to use (the part in single quotes above)
SED: 's/^/ml-jg_/g'