forked from tkrahn/extract23
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract23.sh
executable file
·178 lines (140 loc) · 6.6 KB
/
extract23.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/bin/bash
# extract23 Version 1.3
OPTIND=1 # Reset in case getopts has been used previously in the shell.
# Fetch parameters from commandline
while getopts ":vt:o:r:b:T:" opt; do
case "$opt" in
v) verbose=1
;;
b) BAMFILE_SORTED=$OPTARG
;;
r) REF=$OPTARG
;;
t) REF_23ANDME=$OPTARG
;;
o) OUTFILE=$OPTARG
;;
T) THREADS=$OPTARG
;;
*)
echo "======================================== EXTRACT23 ======================================="
echo "Usage: "
echo "extract23.sh -b <sorted_hg19_bamfile.bam> -r <hg19_ref.fasta> [-t <23andMe_V3_hg19_ref.tab.gz>] [-o output.txt] [-T threads ] [-v]"
echo " Parameters:"
echo " -b The whole genome hg19 referenced, sorted and indexed BAM file. REQUIRED!"
echo " -r The hg19 reference file (downloaded to your computer and indexed with samtools index <ref.fasta>. REQUIRED!"
echo " -t The bgziped and tabix -s1 -b2 -e2 indexed translation database. Defaults to 23andMe_V3_hg19_ref.tab.gz"
echo " -o Output file name without the .zip suffix. Defaults to 23andMe_V3_hg19.txt"
echo " -T Number of output compression threads to use"
echo " -v Verbose output."
exit 0
;;
esac
done
shift $((OPTIND-1))
if [ -z "${BAMFILE_SORTED}" ]; then
echo "BAM file required (option -b)"
exit 0
fi
if [ -z "${REF}" ]; then
echo "FASTA reference file required (option -r)"
exit 0
fi
if [ -z "${REF_23ANDME}" ]; then
echo "Using Pre-defined translation database 23andMe_V3_hg19_ref.tab.gz (option -t)"
REF_23ANDME="23andMe_V3_hg19_ref.tab.gz"
fi
if [ -z "${OUTFILE}" ]; then
echo "Using Pre-defined output file name 23andMe_V3_hg19.txt (option -o)"
OUTFILE="23andMe_V3_hg19.txt"
fi
if [ -z "${THREADS}" ]; then
echo "Using default number of threads"
THREADS="0"
fi
if [ -z "${verbose}" ]; then
verbose=0
fi
# Generate 23andMe mockup file
if [ ${verbose} -gt 0 ]; then
echo "Starting mpileup... Please be patient!"
fi
# The -l parameter requires mpileup to exactly pick the constellations at the SNP positions
# without having to screen through the whole genome base by base.
# samtools mpileup -R -B -q30 -Q30 -v -l ${REF_23ANDME} -f ${REF} ${BAMFILE_SORTED} > 23andMe_raw.vcf.gz
samtools view -H ${BAMFILE_SORTED} | /bin/sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/' | samtools reheader - ${BAMFILE_SORTED} > bam_tmp.bam
bcftools mpileup --ignore-RG --threads $THREADS -B -q30 -Q30 -T ${REF_23ANDME} -f ${REF} bam_tmp.bam -O z -x -o 23andMe_raw.vcf.gz
tabix -p vcf 23andMe_raw.vcf.gz
if [ ${verbose} -gt 0 ]; then
echo "Mpileup completed. Starting SNP calling..."
fi
# Now we call the SNPs from the raw mpileup data with the -m (mixed base) genotype caller
bcftools call --threads $THREADS -O z -V indels -m -P 0 23andMe_raw.vcf.gz > 23andMe_called.vcf.gz
tabix -p vcf 23andMe_called.vcf.gz
if [ ${verbose} -gt 0 ]; then
echo "SNP calling completed. Starting annotation..."
fi
# Here we annotate the SNP names (rs numbers) to each SNP position
bcftools annotate --threads $THREADS -O z -a ${REF_23ANDME} -c CHROM,POS,ID 23andMe_called.vcf.gz > 23andMe_annotated.vcf.gz
tabix -p vcf 23andMe_annotated.vcf.gz
if [ ${verbose} -gt 0 ]; then
echo "Annotation completed. Starting extraction from VCF ..."
fi
# Pick the data from the vcf and convert it into a tab delimited table.
# The loop brackets [] are required for the translated genotypes, even though we only have one sample.
# We have to reformat several things with the sed editor in a pipe to make the format compatible.
bcftools query -f '%ID\t%CHROM\t%POS[\t%TGT]\n' 23andMe_annotated.vcf.gz | \
sed 's/chr//' | \
sed 's/\tM\t/\tMT\t/g' | \
sed 's/\///' | \
sed 's/\.\.$/--/' | \
sed 's/TA$/AT/' | \
sed 's/TC$/CT/' | \
sed 's/TG$/GT/' | \
sed 's/GA$/AG/' | \
sed 's/GC$/CG/' | \
sed 's/CA$/AC/' > 23andMe_V3_hg19.tab
if [ ${verbose} -gt 0 ]; then
echo "Extraction from VCF completed. Sorting by chromosome and position ..."
fi
# Finally sort the table by chromosome and position
sort -t $'\t' -k2,3 -V 23andMe_V3_hg19.tab > 23andMe_V3_hg19_sorted.tab
# 23andMe header
# I have implemented exactly the original 23andMe header here to avoid compatibility issues.
# However I have no clue about copyright issues, so use at your own risk!
# Probably most tools will ignore the # comment lines, so I recommend to change the header and only use this original
# 23andMe header when compatibility issues occur.
echo '# This data file generated by 23andMe at: Thu Dec 29 11:59:59 2016' > ${OUTFILE}
echo '#' >> ${OUTFILE}
echo '# This file contains raw genotype data, including data that is not used in 23andMe reports.' >> ${OUTFILE}
echo '# This data has undergone a general quality review however only a subset of markers have been ' >> ${OUTFILE}
echo '# individually validated for accuracy. As such, this data is suitable only for research, ' >> ${OUTFILE}
echo '# educational, and informational use and not for medical or other use.' >> ${OUTFILE}
echo '# ' >> ${OUTFILE}
echo '# Below is a text version of your data. Fields are TAB-separated' >> ${OUTFILE}
echo '# Each line corresponds to a single SNP. For each SNP, we provide its identifier ' >> ${OUTFILE}
echo '# (an rsid or an internal id), its location on the reference human genome, and the ' >> ${OUTFILE}
echo '# genotype call oriented with respect to the plus strand on the human reference sequence.' >> ${OUTFILE}
echo '# We are using reference human assembly build 37 (also known as Annotation Release 104).' >> ${OUTFILE}
echo '# Note that it is possible that data downloaded at different times may be different due to ongoing ' >> ${OUTFILE}
echo '# improvements in our ability to call genotypes. More information about these changes can be found at:' >> ${OUTFILE}
echo '# https://www.23andme.com/you/download/revisions/' >> ${OUTFILE}
echo '# ' >> ${OUTFILE}
echo '# More information on reference human assembly build 37 (aka Annotation Release 104):' >> ${OUTFILE}
echo '# http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606' >> ${OUTFILE}
echo '#' >> ${OUTFILE}
echo '# rsid chromosome position genotype' >> ${OUTFILE}
# Append the genotype table to the header
cat 23andMe_V3_hg19_sorted.tab >> ${OUTFILE}
if [ ${verbose} -gt 0 ]; then
echo "${OUTFILE} was created. Compressing ..."
fi
# Zip the file
zip ${OUTFILE}.zip ${OUTFILE}
echo "extract23: Output file ${OUTFILE}.zip was created."
# Delete all no longer needed files
rm -f 23andMe_raw.vcf.*
rm -f 23andMe_called.vcf.*
rm -f 23andMe_annotated.vcf.*
rm -f 23andMe_V3_hg19*.tab
rm -f ${OUTFILE}