-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_batch_qiime
executable file
·408 lines (338 loc) · 13.3 KB
/
_batch_qiime
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/bin/sh
export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism/seq_prisms
export BATCH_QIIME_PRISM_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism
batch_qiime_version=$1
function read_answer_with_default() {
read answer
if [ -z "$answer" ]; then
answer=$@
fi
}
function get_run_opts() {
DRY_RUN=no
DEBUG=no
HPC_TYPE=slurm
FILES=""
OUT_ROOT=""
BLAST_DATABASE=""
TAX_DATA=""
SIMILARITY=""
SEQLENGTH_MIN=""
FILE_TYPE=""
PROCESSING_ROOT=/dataset/gseq_processing/scratch/batch_qiime
PROCESSING_ROOT=/dataset/gseq_processing/scratch/batch_qiime
echo "*** batch_qiime analysis version $batch_qiime_version ***
* note that you can paste into your terminal window by clicking your right mouse button
* you can select text in the terminal window using your left mouse button
* at any stage you can press CTRL-C to exit the dialogs
* if you would prefer to run a single batch command, use batch_qiime_prism.sh (-h for help) - e.g.
this allows you to run the analysis on any collection of input files
"
####### get and check DATASET
while [ 1 ] ; do
echo "
please give the name of the dataset you would like to process (e.g. PJ_MGS00100 ), or a folder containing your sequence data
"
read DATASET
if [ ! -z "$DATASET" ]; then
if [ -d $DATASET ]; then
break
fi
if [ ! -d /dataset/${DATASET}/scratch ]; then
echo "sorry can't find (e.g.) /dataset/${DATASET}/scratch (and $DATASET is not an accessible folder)"
else
break
fi
fi
done
if [ -d ${DATASET} ]; then
echo "will look for data in ${DATASET}"
else
echo "will look for data in /dataset/${DATASET}/scratch"
fi
####### get and check whether to run locally or on the cluster
echo "
should this run be queued on the compute cluster ? (y/n, default=y. If n, will be run locally)
"
read_answer_with_default y
if [ "$answer" != "n" ]; then
HPC_TYPE=slurm
else
HPC_TYPE=local
fi
# set up folder
if [ -d $DATASET ]; then
# try removing the first component of the path and then appending that to processing_root
OUTPUT_ROOT=`echo $DATASET | cut -d'/' -f3-`
OUTPUT_ROOT=$PROCESSING_ROOT/$OUTPUT_ROOT
else
OUTPUT_ROOT=$PROCESSING_ROOT/$DATASET
fi
while [ 1 ]; do
echo "please specify output base folder (or just press Enter/Return to use default , $OUTPUT_ROOT )"
read_answer_with_default $OUTPUT_ROOT
NEW_ROOT=$answer
if [ -d $NEW_ROOT ]; then
echo "warning - $NEW_ROOT already exists, use anyway ? (y/n, default=y)"
read_answer_with_default y
if [[ ( $answer == "y" ) || ( -z $answer ) ]]; then
OUTPUT_ROOT=$NEW_ROOT
break
fi
else
mkdir -p $NEW_ROOT
if [ -d $NEW_ROOT ]; then
OUTPUT_ROOT=$NEW_ROOT
break
fi
fi
done
echo "will use output root folder $OUTPUT_ROOT
"
####### get and check the analysis type
while [ 1 ] ; do
ANALYSIS=qiime
echo "
please give which analysis you want (all, clean, qiime, kmer_analysis) (or just press enter to run qiime)
(notes :
* entering qiime includes length filtering, clustering, taxonomy assignment and summarising)
* the clean step is currently run seperately , and is not done if you select all or qiime
)
"
read_answer_with_default qiime
ANALYSIS=$answer
if [[ ( "$ANALYSIS" != "all" ) && ( "$ANALYSIS" != "clean" ) && ( "$ANALYSIS" != "qiime" ) && ( "$ANALYSIS" != "kmer_analysis" ) ]]; then
echo "analysis must be one of all, clean, qiime, kmer_analysis"
else
break
fi
done
if [[ ( "$ANALYSIS" == "all" ) || ( "$ANALYSIS" == "qiime" ) ]]; then
ls $OUTPUT_ROOT/qiime_analysis/* > /dev/null 2>&1
if [ $? == 0 ] ; then
echo "found existing results under $OUTPUT_ROOT/qiime_analysis/* - are you sure you want to continue (e.g. complete an interrupted run) ? (y/n)"
read_answer_with_default n
if [[ ( $answer != "y" ) && ( $answer != "Y" ) ]]; then
echo "ok quitting - please either use a different output folder or clean up $OUTPUT_ROOT/qiime_analysis"
exit 1
fi
fi
fi
if [[ ( "$ANALYSIS" == "all" ) || ( "$ANALYSIS" == "kmer_analysis" ) ]]; then
ls $OUTPUT_ROOT/kmer_analysis/* > /dev/null 2>&1
if [ $? == 0 ] ; then
echo "found existing results under $OUTPUT_ROOT/kmer_analysis - are you sure you want to continue (e.g. complete an interrupted run) ? (y/n)"
read_answer_with_default n
if [[ ( $answer != "y" ) && ( $answer != "Y" ) ]]; then
echo "ok quitting - please use a different output folder"
exit 1
fi
fi
fi
echo "will use analysis=$ANALYSIS
"
####### get and check the blast data to use
if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
while [ 1 ] ; do
echo "
please give the full path to the blast database (or just press enter to use default, /dataset/datacache/archive/metagenomics/build/combined_092016_silva.fasta)
"
read_answer_with_default /dataset/datacache/archive/metagenomics/build/combined_092016_silva.fasta
BLAST_DATABASE=$answer
if [ -f ${BLAST_DATABASE}.nin ]; then
break
else
echo "could not find blast index file ${BLAST_DATABASE}.nin"
fi
done
echo "will use blast database $BLAST_DATABASE"
fi
####### get and check the tax data to use
if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
while [ 1 ] ; do
echo "
please give the full path to the taxonomy data (or just press enter to use default, /dataset/datacache/archive/metagenomics/build/combined_092016_silva.taxonomy)
"
read_answer_with_default /dataset/datacache/archive/metagenomics/build/combined_092016_silva.taxonomy
TAX_DATA=$answer
if [ -f $TAX_DATA ]; then
break
else
echo "could not find tax data $TAX_DATA"
fi
done
echo "will use tax data $TAX_DATA"
fi
####### get and check the similarity to use
if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
while [ 1 ] ; do
echo "
please specify the similarity to use for clustering (or just press enter to use default, 0.99)
"
read_answer_with_default 0.99
SIMILARITY=$answer
python -c "print float('$SIMILARITY')" >/dev/null 2>&1
if [ $? != 0 ]; then
echo "looks like similarity requested ( $SIMILARITY ) is not a number"
exit 1
else
break
fi
done
echo "will use similarity $SIMILARITY"
fi
####### get and check the minimum length to use
if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
while [ 1 ] ; do
SEQLENGTH_MIN=200
echo "
please specify the minimum seq length (or just press enter to use default, 200)
"
read_answer_with_default 200
SEQLENGTH_MIN=$answer
python -c "print float('$SEQLENGTH_MIN')" >/dev/null 2>&1
if [ $? != 0 ]; then
echo "looks like minimum seqlength requested ( $SEQLENGTH_MIN ) is not a number"
exit 1
else
break
fi
done
echo "will use minimum seqlength $SEQLENGTH_MIN"
fi
} # get_run_opts
function run_qiime_analysis() {
mkdir -p $OUTPUT_ROOT/qiime_analysis
if [ ! -d $OUTPUT_ROOT/qiime_analysis ]; then
echo "unable to create output folder $OUTPUT_ROOT/qiime_analysis - quitting"
exit 1
fi
if [ -d $DATASET ]; then
FILE_PATH=$DATASET
else
FILE_PATH=/dataset/$DATASET/scratch
fi
while [ 1 ]; do
if [ -f $OUTPUT_ROOT/input_file_list.txt ]; then
break
fi
echo "
finding files to process..."
find $FILE_PATH/ -name "*_R1_*fastq*.gz" -print > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
return_code=$?
if [ $return_code == 0 ]; then
num_files=`cat $OUTPUT_ROOT/input_file_list.txt | wc -l`
if [ $num_files == 0 ]; then
return_code=1
fi
fi
if [ $return_code != 0 ]; then
# try just listing all .gz files , attempting to filter just R1
ls $FILE_PATH/*.gz | egrep "(_R1_|_1\.f).*.gz$" > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
if [ $? != 0 ]; then
echo "unable to find any files like *.gz under $FILE_PATH/"
while [ 1 ]; do
echo "please enter path to to files (or CTRL-C and edit $OUTPUT_ROOT/input_file_list.txt so it contains the names of the files to process)"
read $FILE_PATH
echo "
finding files to process..."
find $FILE_PATH/ -name "*_R1_*fastq*.gz" -print > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
return_code=$?
if [ $return_code == 0 ]; then
num_files=`cat $OUTPUT_ROOT/input_file_list.txt | wc -l`
if [ $num_files == 0 ]; then
return_code=1
fi
fi
if [ $return_code == 0 ]; then
break
else
ls $FILE_PATH/*.gz | egrep "(_R1_|_1\.f).*.gz$" > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
if [ $? == 0 ]; then
break
fi
fi
done
fi
fi
done
# confirm file list
echo "will process the following files: . . . (press Enter/Return for a list - then press space bar to page through listing)"
read answer
more $OUTPUT_ROOT/input_file_list.txt
echo "OK ? (default = y)"
read_answer_with_default y
if [ $answer == "y" ]; then
break
else
echo "
ok quitting - you can edit the file $OUTPUT_ROOT/input_file_list.txt in order to customise the
files to process, or remove it so that the data folder is re-scanned for files to process, and try again"
exit 1
fi
# check we can parse sample names from filenames - if not prompt for a custom regular expression
echo "
checking we can parse sample ID from filenames. . . "
regexp='^[^_]+_(\S+)_'
while [ 1 ]; do
for file in `cat $OUTPUT_ROOT/input_file_list.txt`; do
$BATCH_QIIME_PRISM_BIN/add_sample_name.py -n -r "$regexp" $file
return_code=$?
if [ $return_code != 0 ]; then
echo "Could not parse sample name from $file using a \"$regexp\" - please enter a regular expression to use ( e.g. something like ^([^_]+)_ or ^([^_]+_L1)_ might work)"
read_answer_with_default $regexp
regexp=$answer
break
fi
done
if [ $return_code == 0 ]; then
echo "do those sample names look OK ? (y/n)"
read_answer_with_default y
if [ $answer != "y" ]; then
echo "please enter a suitable regular expression. Examples : ^[^_]+_(\S+)_ ^([^_]+)_ ^([^_]+_L1)_ "
read_answer_with_default $regexp
regexp=$answer
else
break
fi
fi
done
echo "
export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism/seq_prisms
export BATCH_QIIME_PRISM_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism
time $BATCH_QIIME_PRISM_BIN/batch_qiime_prism.sh -x \"$regexp\" -b $BLAST_DATABASE -t $TAX_DATA -m $SEQLENGTH_MIN -s $SIMILARITY -O $OUTPUT_ROOT/qiime_analysis \`cat $OUTPUT_ROOT/input_file_list.txt\`" > $OUTPUT_ROOT/run.src
echo "about to process data files listed in $OUTPUT_ROOT/input_file_list.txt, writing results to $OUTPUT_ROOT/qiime_analysis"
echo "using . . ."
cat $OUTPUT_ROOT/run.src
echo "press enter to continue (or CTRL-C to cancel)"
read junk
set -x
time $BATCH_QIIME_PRISM_BIN/batch_qiime_prism.sh -x "$regexp" -b $BLAST_DATABASE -t $TAX_DATA -m $SEQLENGTH_MIN -s $SIMILARITY -O $OUTPUT_ROOT/qiime_analysis `cat $OUTPUT_ROOT/input_file_list.txt`
set +x
}
function run_kmer_analysis() {
# set up the kmer anlaysis folder and large-ram config
mkdir -p $OUTPUT_ROOT/kmer_analysis
mkdir -p $OUTPUT_ROOT/etc
cp $BATCH_QIIME_PRISM_BIN/etc/64GBRAM_slurm_array_job $OUTPUT_ROOT/etc
rm -f $OUTPUT_ROOT/etc/tardis.toml # in case left over from previous kmer_prism run , which may modify this
echo "jobtemplatefile = \"$OUTPUT_ROOT/etc/64GBRAM_slurm_array_job\"" > $OUTPUT_ROOT/etc/tardis.toml
# run kmer analysis
if [ -f $OUTPUT_ROOT/kmer_analysis/kmer_entropy.k6.jpg ]; then
echo "skipping kmer analysis as landmark $OUTPUT_ROOT/kmer_analysis/kmer_entropy.k6.jpg exists"
else
echo "about to run a kmer analysis for files listed in $OUTPUT_ROOT/input_file_list.txt, writing results to $OUTPUT_ROOT/kmer_analysis"
echo "press enter to continue (or CTRL-C to cancel)"
read junk
set -x
$SEQ_PRISMS_BIN/kmer_prism.sh -s .05 -a fasta -O $OUTPUT_ROOT/kmer_analysis `cat $OUTPUT_ROOT/input_file_list.txt` 1>$OUTPUT_ROOT/kmer_analysis.stdout 2>$OUTPUT_ROOT/kmer_analysis.stderr
set +x
fi
}
get_run_opts
if [[ ( $ANALYSIS == "qiime" ) || ( $ANALYSIS == "all" ) ]]; then
run_qiime_analysis
fi
if [[ ( $ANALYSIS == "kmer_analysis" ) || ( $ANALYSIS == "all" ) ]]; then
run_kmer_analysis
fi