#!/usr/bin/env nextflow
// Files required by LDSC (European reference population)
plink = params.LDSC_files + "1000G_EUR_Phase3_plink/1000G.EUR.QC."
frq = params.LDSC_files + "1000G_Phase3_frq/1000G.EUR.QC."
weights = params.LDSC_files + "1000G_Phase3_weights_hm3_no_MHC/weights.hm3_noMHC."
hmsnps = params.LDSC_files + "hm_snp.txt"
// By default, the pipeline will run on all bed files in the directory where it is launched
// This can be changed using the --bed modifier (e.g. --bed /Users/home/myBedFolder/)
params.bed = "./"
bedfiles = Channel.fromPath(params.bed + "*.bed")
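// Illustrative bed file content (hypothetical regions; columns are tab-separated, no header):
// chr1    10000    20000
// chr2    30000    45000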
// Output directory. By default, LDSC output files will be written to a folder named "output"
// This can be changed using the --out modifier (e.g. --out LDSC_results)
params.out = "output"
// Phenotype file, comma separated with name of phenotype in first column and path in second column, no header!
// (e.g. scz,/path/to/LDSC/sumstats/scz.sumstats.gz)
// The location of the phenotype file can be modified using the --pheno modifier (e.g. --pheno myphenos.csv)
params.pheno = "pheno.csv"
pheno_file = file(params.pheno)
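// Illustrative pheno.csv content (hypothetical phenotype ids and paths):
// scz,/path/to/LDSC/sumstats/scz.sumstats.gz
// height,/path/to/LDSC/sumstats/height.sumstats.gz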
// Indicator for tissue association analysis (false by default)
params.tissue = false
// By default, the pipeline will use the Finucane et al. 2015 annotation
// The model from Gazal et al. 2018 can also be used.
params.model = "Finucane"
if (params.model == "Finucane"){
model_path = params.LDSC_files + "1000G_EUR_Phase3_baseline/baseline."
analysis_type = "Finucane"
} else if (params.model == "Gazal"){
model_path = params.LDSC_files + "baselineLD_v1.1/baselineLD."
analysis_type = "Gazal"
} else {
error("Please input a valid baseline model: Finucane or Gazal")
}
// If the tissue modifier is entered, the name of the model will be updated
if (params.tissue){
analysis_type = params.model + "_tissue"
}
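// e.g. running with --model Gazal --tissue sets analysis_type to "Gazal_tissue",
// so results are published under <out>/Gazal_tissue/
// Example invocation (illustrative values; the available options are listed in the usage message below):
// nextflow run LDSC_nf --bed /my/beds/ --pheno pheno.csv --model Gazal --tissue --out results --LDSC_files /path/to/LDSC_files/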
// Info displayed at runtime
log.info """\
===================================
Usage:
nextflow run LDSC_nf
Options:
model: --model Finucane (default) or Gazal
tissue: --tissue (to run tissue association analysis) (default off)
bed: --bed /mybeds/location/ (default, current directory) (bed files should be tab delimited with no header!)
output directory: --out output (default, output folder current directory)
phenotype file: --pheno pheno.csv (default name) (two columns: id in the first column (no underscore), sumstats path in the second) (No header)
LDSC path: --LDSC_files /path/to/LDSC
===================================
L D S C - P I P E L I N E
===================================
analysis_type : ${analysis_type}
tissue mode : ${params.tissue}
bed path : ${params.bed}
out : ${params.out}
phenotypes : ${params.pheno}
LDSC path : ${params.LDSC_files}
LDSC plink : ${plink}
LDSC frq : ${frq}
LDSC weights : ${weights}
hm_snps : ${hmsnps}
===================================
To download the latest version of the pipeline run:
nextflow pull jbryois/LDSC_nf
===================================
"""
/*
*
* CODE BELOW SHOULD NOT NEED TO BE MODIFIED TO RUN THE PIPELINE!
*
*/
// Creates a channel for the different phenotypes to be tested
// The phenotype name should not contain an underscore
Channel
.fromPath(pheno_file)
.splitCsv(header: false)
.set { ch_pheno }
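// Each row of pheno.csv is emitted as a two-element list,
// e.g. [scz, /path/to/LDSC/sumstats/scz.sumstats.gz] (hypothetical values, matching the example above)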
// Splits each input bed file per chromosome (1-22). Outputs the input name and the per-chromosome bed files
process Split_input_beds_per_chr {
input:
file bed from bedfiles
output:
set inputname, file('chr*') into InputBedsPerChr
script:
inputname = bed.toString() - ~/\.bed$/
"""
# Write each line of the input bed to a per-chromosome file named after its first column
awk '\$1 ~ /^chr[1-2]?[0-9]{1}\$/ {print \$0 >> \$1;close(\$1)}' < $bed
# Remove any non-autosomal files (chr23-chr29) the pattern above may have created
for i in {23..29}
do
if [ -f chr\$i ];
then
rm chr\$i
fi
done
"""
}
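// Example emission for a hypothetical input my_regions.bed: [my_regions, [chr1, chr2, ..., chr22]]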
// For each chromosome, outputs the sample name, the per-chromosome bed file, the matching annotation file, the plink files, and the HapMap SNP list (adapted from: https://github.com/nextflow-io/nextflow/issues/796)
InputBedsPerChr
.flatMap { item ->
inputname = item[0];
files = item[1];
files.collect { onefile -> \
return [ inputname, \
onefile, \
file(model_path + "${onefile.simpleName.replaceFirst(/chr/,'')}" + ".annot.gz"), \
file(plink + "${onefile.simpleName.replaceFirst(/chr/,'')}" + ".bim"), \
file(plink + "${onefile.simpleName.replaceFirst(/chr/,'')}" + ".bed"), \
file(plink + "${onefile.simpleName.replaceFirst(/chr/,'')}" + ".fam"), \
file(hmsnps)]}
}
.set { ch_chr }
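// Example emission for chromosome 1 of a hypothetical input my_regions.bed (with the default Finucane baseline):
// [my_regions, chr1, baseline.1.annot.gz, 1000G.EUR.QC.1.bim, 1000G.EUR.QC.1.bed, 1000G.EUR.QC.1.fam, hm_snp.txt]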
/* Gets the extended bed annotation for the input bed chromosome file (+-500bp), gets a bed file for the annotation file (same chr), gets the overlapping SNPs
 * between the input bed chromosome and the annotation, adds the overlapping SNPs to the annotation file, and computes LD scores for the new annotation
 */
process Get_LD_scores {
input:
set val(inputname), file(input_bed), file(baseline_annot), file(plink_bim), file(plink_bed), file(plink_fam), file('hm_snp.txt') from ch_chr
output:
set inputname, file('input*') into LDscores
script:
outname = baseline_annot.toString().replaceAll(/baseline(LD)?/,'input')
outname = outname.replaceAll(/\.annot\.gz/,'')
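// e.g. baseline.1.annot.gz gives outname "input.1", so LD scores are written as input.1.l2.ldscore.gz (plus the .l2.M and .l2.M_5_50 files)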
tissue = params.tissue
"""
# Convert the baseline annotation to bed format (chr, pos, pos, SNP id), skipping the header
zcat < $baseline_annot | awk -v OFS='\t' 'NR>1 {print "chr"\$1,\$2,\$2,\$3}' | gzip > $baseline_annot".bed"
# Get the SNPs of the baseline annotation that overlap the input bed regions
intersectBed -c -a $baseline_annot".bed" -b $input_bed > $input_bed".1000genomes.intersect"
awk '{if(\$5!=0) print \$4}' $input_bed".1000genomes.intersect" > $input_bed".1000genomes.intersect.snp"
if [ $tissue = "false" ]
then
# Extend the input bed regions by +/-500bp (clamping negative starts to 0) and get the overlapping SNPs
awk -v OFS='\t' '{print \$1,\$2-500,\$3+500,\$1":"\$2"-"\$3}' $input_bed > $input_bed".500bp.ext"
awk -v OFS='\t' '{ if (\$2<0) print \$1,0,\$3,\$4; else print \$0}' $input_bed".500bp.ext" > tmp
mv tmp $input_bed".500bp.ext"
intersectBed -c -a $baseline_annot".bed" -b $input_bed".500bp.ext" > $input_bed".1000genomes.500bp.ext.intersect"
awk '{if(\$5!=0) print \$4}' $input_bed".1000genomes.500bp.ext.intersect" > $input_bed".1000genomes.500bp.ext.intersect.snp"
# Add the input annotation and its 500bp-extended version as new columns of the baseline annotation
fast_match2_minimal.gz.pl $input_bed".1000genomes.intersect.snp" $inputname $baseline_annot > tmp
fast_match2.gz.pl $input_bed".1000genomes.500bp.ext.intersect.snp" $inputname"_500bp_ext" tmp | gzip > $outname".annot.gz"
rm tmp
rm *500bp.ext
else
# Tissue mode: add only the input annotation as a new column of the baseline annotation
fast_match2_minimal.gz.pl $input_bed".1000genomes.intersect.snp" $inputname $baseline_annot | gzip > $outname".annot.gz"
fi
# Compute LD scores for the new annotation, printing them only for the SNPs in hm_snp.txt
ldsc.py --l2 --bfile $plink_bim.baseName --ld-wind-cm 1 --print-snps hm_snp.txt --annot $outname".annot.gz" --out $outname
# Remove intermediate files
rm *.intersect
rm *.intersect.snp
rm $baseline_annot".bed"
"""
}
/* Prepares a channel combining each input bed, all of its LD score files, and the phenotypes to be tested
 */
LDscores
.groupTuple()
.map { mytuple ->
def key = mytuple[0]
def myfiles = mytuple[1].flatten()
return tuple(key.toString(), myfiles)
}
.combine(ch_pheno)
.set { LDscores_join }
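// Example emission (hypothetical): [my_regions, [input.1.annot.gz, input.1.l2.ldscore.gz, ..., input.22.l2.ldscore.gz], scz, /path/to/LDSC/sumstats/scz.sumstats.gz]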
/* Gathers all LD score files for each input bed together with each phenotype, and runs partitioned LD score regression for every input/phenotype pair
 */
process Get_pLDSC_Results {
publishDir "${params.out}/$analysis_type/Results_raw/", mode: 'copy', overwrite: true
input:
set inputname, path(inputLDscores), pheno, path(sumstats) from LDscores_join
output:
file('*results') into (Results_ch_h2,Results_ch_tissue)
"""
# Partitioned heritability: regress the sumstats on the baseline model plus the new input annotation (prefix input.)
ldsc.py --h2 $sumstats --ref-ld-chr $model_path,input. --w-ld-chr $weights --overlap-annot --frqfile-chr $frq --print-coefficients --out ${pheno}_${inputname}
"""
}
// For enrichment analysis - gathers the results files and computes the final results file
process Clean_results_h2_enrichment {
publishDir "${params.out}/$analysis_type/", mode: 'copy', overwrite: true
input:
path(results) from Results_ch_h2.collect()
output:
file('*.txt') into CleanResultsH2
when:
!params.tissue
"""
CleanH2Results.R
"""
}
// For tissue-specific analysis - gathers the results files, computes p-values, and produces the final results file
process Clean_results_tissue_association {
publishDir "${params.out}/$analysis_type/", mode: 'copy', overwrite: true
input:
path(results) from Results_ch_tissue.collect()
output:
file('*.txt') into CleanResultsZ
when:
params.tissue
"""
CleanTissueResults.R
"""
}