-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathalphaSat-HMMER.wdl
274 lines (220 loc) · 6.41 KB
/
alphaSat-HMMER.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
version 1.0
workflow alphaSat_HMMER_workflow {
input {
File input_fasta
File hmm_profile
File hmm_profile_SF
String assembly_id
}
call split_fasta {
input:
input_fasta = input_fasta
}
scatter (contig_fasta in split_fasta.split_fasta_out) {
call alphaSat_HMMER {
input:
input_fastas = [contig_fasta],
hmm_profile = hmm_profile,
hmm_profile_SF = hmm_profile_SF
}
}
call combine_beds as combine_hor_beds {
input:
beds = alphaSat_HMMER.as_hor_bed,
assembly_id = assembly_id,
tag = "alphasat_HOR"
}
call combine_beds as combine_hor_sf_beds {
input:
beds = alphaSat_HMMER.as_hor_sf_bed,
assembly_id = assembly_id,
tag = "alphasat_HOR_SF"
}
call combine_beds as combine_sf_beds {
input:
beds = alphaSat_HMMER.as_sf_bed,
assembly_id = assembly_id,
tag = "alphasat_SF"
}
call combine_beds as combine_strand_beds {
input:
beds = alphaSat_HMMER.as_strand_bed,
assembly_id = assembly_id,
tag = "alphasat_strand"
}
call summarize_alpha {
input:
as_hor_bed = combine_hor_beds.output_bed,
as_sf_bed = combine_sf_beds.output_bed,
assembly_id = assembly_id
}
output {
File as_hor_bed = combine_hor_beds.output_bed
File as_hor_sf_bed = combine_hor_sf_beds.output_bed
File as_sf_bed = combine_sf_beds.output_bed
File as_strand_bed = combine_strand_beds.output_bed
File as_summary_bed = summarize_alpha.as_summary_bed
}
meta {
author: "Julian Lucas"
email: "juklucas@ucsc.edu"
description: "Calls a modified version of [HumAS-HMMER](https://github.com/enigene/HumAS-HMMER). See [HumAS-HMMER_for_AnVIL](https://github.com/fedorrik/HumAS-HMMER_for_AnVIL) for modifications"
}
}
task split_fasta {
input {
File input_fasta
Int memSizeGB = 4
Int threadCount = 1
Int addldisk = 10
}
parameter_meta {
input_fasta: "Genomic assemblies. Files must be in fa or fa.gz format."
}
# Estimate disk size required
Int input_fasta_size = ceil(size(input_fasta, "GB"))
Int final_disk_dize = input_fasta_size * 4 + addldisk
command <<<
set -eux -o pipefail
## make output directory
mkdir split
## split fasta into contigs/scaffolds/chromosomes
faSplit byname ~{input_fasta} split/
>>>
output {
Array[File] split_fasta_out = glob("split/*.fa")
}
runtime {
memory: memSizeGB + " GB"
cpu: threadCount
disks: "local-disk " + final_disk_dize + " SSD"
docker: "quay.io/biocontainers/ucsc-fasplit@sha256:2ddc814ba3e8075a31e13ec1fc737c34c501bc0586546e2b0670ca71e25348c4"
preemptible: 1
}
}
task alphaSat_HMMER {
input {
Array[File] input_fastas
File hmm_profile
File hmm_profile_SF
Int memSizeGB = 4
Int threadCount = 8
Int addldisk = 10
Int preempts = 2
}
parameter_meta {
input_fastas: "genomic assemblies or long contigs. Files must be in fa or fa.gz format."
hmm_profile: "main hmm profile"
hmm_profile_SF: "hmm profile for creation of AS-SF bed file"
}
# Estimate disk size required
Int input_fasta_size = ceil(size(input_fastas, "GB"))
Int input_hmm_profile_size = ceil(size(hmm_profile, "GB"))
Int input_hmm_profilesf_size = ceil(size(hmm_profile_SF, "GB"))
Int final_disk_dize = input_fasta_size * 6 + input_hmm_profile_size + input_hmm_profilesf_size + addldisk
command <<<
set -eux -o pipefail
mkdir input_fasta_dir
cd input_fasta_dir
## Troubleshoot slow runtime
date
## script expects all input sequences to be in one directory
## files must be named *.fa
INPUT_FILES=(~{sep=" " input_fastas})
for INPUT_FILE in ${INPUT_FILES[@]};
do
## If gzipped, extract
if [[ $INPUT_FILE =~ \.gz$ ]]; then
gunzip -f $INPUT_FILE
INPUT_FILE="${INPUT_FILE%.gz}"
fi
## Copy file to input_fasta_dir. Change suffix to .fa if neccesary.
if [[ $INPUT_FILE =~ \.fa$ ]]; then
cp $INPUT_FILE .
elif [[ $INPUT_FILE =~ \.fasta$ ]]; then
BASENAME=$(basename "${INPUT_FILE}" .fasta)
cp $INPUT_FILE ./${BASENAME}.fa
else
echo "Files must be named with fa suffix"
exit 1
fi
done
## Return to execution directory
cd ..
## localize hmmertblout2bed script (needed for hmmer run calls)
ln -s /opt/HumAS-HMMER_for_AnVIL/hmmertblout2bed.awk .
## localize overlap filter script (needed for hmmer run calls)
ln -s /opt/HumAS-HMMER_for_AnVIL/overlap_filter.py .
## Run HumAS-HMMER, output: AS-HOR+SF, AS-HOR, AS-strand
hmmer-run.sh input_fasta_dir ~{hmm_profile} ~{threadCount}
## Run HumAS-HMMER, output: AS-SF
hmmer-run_SF.sh input_fasta_dir ~{hmm_profile_SF} ~{threadCount}
>>>
output {
File as_hor_sf_bed = glob("AS-HOR+SF-vs-*.bed")[0]
File as_strand_bed = glob("AS-strand-vs-*.bed")[0]
File as_hor_bed = glob("AS-HOR-vs-*.bed")[0]
File as_sf_bed = glob("AS-SF-vs-*.bed")[0]
}
runtime {
memory: memSizeGB + " GB"
cpu: threadCount
disks: "local-disk " + final_disk_dize + " SSD"
docker: "juklucas/alphasat_hmmer@sha256:7210a50bc6a99a8beea374f689753e2e6d16b02dc60b400b40694a9ca6ce2489"
preemptible: preempts
}
}
task combine_beds {
input {
Array[File] beds
String tag = "combined"
String assembly_id
Int memSizeGB = 8
Int threadCount = 1
Int diskSizeGB = 64
}
String out_bed_fn = "~{assembly_id}_~{tag}.bed"
command <<<
set -eux -o pipefail
## Combine scattered results into one file
cat ~{sep=" " beds} | sort -k 1,1 -k2,2n > ~{out_bed_fn}
>>>
output {
File output_bed = out_bed_fn
}
runtime {
memory: memSizeGB + " GB"
cpu: threadCount
disks: "local-disk " + diskSizeGB + " SSD"
docker: "juklucas/alphasat_summarize@sha256:bab2062491c68c0f4c793193c2d3db4d3a301ad041d5d2d863d7978e6fe6d687"
preemptible: 1
}
}
task summarize_alpha {
input {
File as_hor_bed
File as_sf_bed
String assembly_id
Int memSizeGB = 4
Int threadCount = 1
Int diskSizeGB = 32
}
String as_summary_bed_out = "~{assembly_id}_asat.bed"
command <<<
## Summarize the monomer-level annotation into regional annotations (HOR, dHOR, etc.)
/opt/scripts/create_asat_bed.sh \
~{as_hor_bed} \
~{as_sf_bed} \
~{as_summary_bed_out}
>>>
output {
File as_summary_bed = as_summary_bed_out
}
runtime {
memory: memSizeGB + " GB"
cpu: threadCount
disks: "local-disk " + diskSizeGB + " SSD"
docker: "juklucas/alphasat_summarize@sha256:bab2062491c68c0f4c793193c2d3db4d3a301ad041d5d2d863d7978e6fe6d687"
preemptible: 1
}
}