-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path03_sampleData2genotypes.pl
executable file
·271 lines (241 loc) · 9.19 KB
/
03_sampleData2genotypes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/perl
############################################################################################
# Copyright (C) Nicolas Thierry-Mieg, 2019-2025
#
# This file is part of grexome-TIMC-Secondary, written by Nicolas Thierry-Mieg
# (CNRS, France) Nicolas.Thierry-Mieg@univ-grenoble-alpes.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with this program.
# If not, see <https://www.gnu.org/licenses/>.
############################################################################################
# 24/03/2018
# NTM
# Read on stdin a (G)VCF file with one data column per sample.
# FORMAT MUST start with either GT:AF and have DP somewhere (for SNVs
# and short indels), or it must start with GT:GQ:FR:BPR and possibly have
# BP (for CNVs).
# Replace all the sample data columns by genotype columns
# HV, HET, OTHER, HR.
# Each column has eg:
# genotype1~sample1,sample2|genotype2~sample3|genotype3~sample4,sample5
# - HR has only one genotype: 0/0.
# - all genotypes in HET column will be REF/VAR, eg 0/1, 0/2...
# - all genotypes in HV column will by homovar, eg 1/1, 2/2 etc...
# - all other genotypes (ie VAR1/VAR2) will appear in OTHER column.
# For HV and HET genos at SNVs/short-indels, "sample" actually
# becomes: $sample[$dp:$af].
# Similarly for CNV calls "sample" becomes $sample[$GQ:$FR:$BP] (without :$BP if
# it doesn't exist).
#
# <NON_REF> is removed from ALTs if it was present, apart from that
# we don't touch the first 8 columns: we assume all ALTs are called
# in at least one genotype, and the REF+ALTs are already normalized.
# This is the case if the input GVCF went through filterBadCalls.pl .
#
# NOTE that FORMAT becomes "GENOS", so produced file is no longer
# a VCF stricto sensu (spec requires first FORMAT key is GT).
use strict;
use warnings;
use File::Basename qw(basename);
use POSIX qw(strftime);
# we use $0 in every stderr message but we really only want
# the program name, not the path
$0 = basename($0);
my $now = strftime("%F %T", localtime);
warn "I $now: $0 - starting to run\n";
# parse header, just copy it except for FORMAT descriptions,
# which we replace with our FORMAT descriptions, and
# the CHROM line, where we replace the sample data by genotype
# columns, but store the list of sample names in @samples
my @samples;
while(my $line = <STDIN>) {
if ($line =~ /^##/) {
# ignore previous FORMAT descriptions, print other header lines
($line =~ /^##FORMAT=/) || print $line;
}
elsif ($line =~ /^#CHROM/) {
chomp($line);
@samples = split(/\t/, $line);
(@samples >= 10) || die "E $0: no sample ids in CHROM line?\n$line\n";
# first 9 fields are just copied
my $lineToPrint = shift(@samples);
foreach my $i (2..9) {
$lineToPrint .= "\t".shift(@samples);
}
# now @samples has the sample ids, as we wanted.
# add the new column names
$lineToPrint .= "\tHV\tHET\tOTHER\tHR\n";
# Now print the command line, the new FORMAT descriptions, and
# then the new CHROM line
# add info with full command line run
my $com = "$0 ".join(" ",@ARGV);
chomp($com);
$com .= " < ".`readlink -f /proc/$$/fd/0` ;
chomp($com);
$com .= " > ".`readlink -f /proc/$$/fd/1` ;
chomp($com);
$com .= " 2> ".`readlink -f /proc/$$/fd/2` ;
chomp($com);
print "##sampleData2genotypes=<commandLine=\"$com\">\n";
print "##FORMAT=<ID=GENOS,Number=.,Type=String,Description=\"pipe-separated list of genotypes called at this position, with all corresponding sample identifiers for each genotype (with [DP:AF] for HET and HV SNVs/short-indels, and [GQ:FR:BP] for CNVs (no BP if unavailable)\">\n";
print $lineToPrint;
last;
}
else {
die "E $0: parsing header, found bad line:\n$line";
}
}
# now parse data lines
while(my $line = <STDIN>) {
chomp($line);
my @data = split(/\t/, $line);
(@data >= 10) || die "E $0: no sample data in line?\n$line\n";
# first 8 fields are copied, except we remove <NON_REF> from ALTs if it was present
$data[4] =~ s/,<NON_REF>//;
my $lineToPrint = shift(@data);
foreach my $i (1..7) {
$lineToPrint .= "\t".shift(@data);
}
# from FORMAT we need GT:AF+DP (assumption: filterBadCalls.pl already ran
# and created/updated AF and DP with correct values), or GT:GQ+FR (and BP
# if it exists) for CNVs.
my $format = shift(@data);
# GT and AF/GQ should always be first, check it and detect whether this is
# a SNV/short-indel (isCnv==0), or a CNV without or with BP (isCnv==1 or 2)
my $isCnv = 0;
# for SNVs we will need the index of DP in FORMAT
my $dpCol = 0;
if ($format eq "GT:GQ:FR:BPR:BP") {
$isCnv = 2;
}
elsif ($format eq "GT:GQ:FR:BPR") {
$isCnv = 1;
}
elsif ($format =~ /^GT:AF:/) {
my @format = split(/:/, $format);
foreach my $i (2..$#format) {
($format[$i] eq "DP") && ($dpCol = $i) && last;
}
($dpCol==0) && die "E $0: SNV line but DP not found in format:\n$line\n";
}
else {
die "E $0: FORMAT doesn't begin with 'GT:AF:' or 'GT:GQ:FR:BPR' in:\n$line\n";
}
# print new FORMAT
$lineToPrint .= "\tGENOS";
# sanity check
(@data == @samples) ||
die "E $0: we don't have same number of samples and data columns in:\n$line\n";
# construct comma-separated list of samples for each called genotype
my %geno2samples = ();
foreach my $i (0..$#data) {
# ignore NOCALLs
($data[$i] eq './.') && next;
my $geno;
# $callInfo: stays "" for HRs, else [$dp:$af] for SNVs, [$gq:$fr:$bp] or [$gq:$fr] for CNVs
my $callInfo = "";
if ($isCnv) {
# CNV
if ($data[$i] eq '0/0') {
$geno = '0/0';
}
else {
($data[$i] =~ m~^(\d+/\d+):([^:]+:[^:]+):~) ||
die "E $0: cannot grab GT:GQ:FR for sample $i in $data[$i] in line:\n$line\n";
($geno,$callInfo) = ($1,$2);
if ($isCnv == 2) {
# grab :BP == last field
($data[$i] =~ m~(:[^:]+)$~) ||
die "cannot grab BP in $data[$i]\n";
my $bp = $1;
# we can have several comma-separated BPs, but commas are used to separate
# samples -> substitute with _
$bp =~ s/,/_/g;
$callInfo .= $bp;
}
$callInfo = "[$callInfo]";
}
}
else {
# SNV/short-indel
if ($data[$i] =~ m~^0/0:~) {
$geno = '0/0';
}
else {
($data[$i] =~ m~^(\d+/\d+):([^:]+):~) ||
die "E $0: cannot grab genotype and AF for sample $i in $data[$i] in line:\n$line\n";
($geno,$callInfo) = ($1,$2);
my @thisData = split(/:/, $data[$i]);
my $dp = 0;
($dpCol) && ($thisData[$dpCol]) && ($thisData[$dpCol] ne '.') && ($dp = $thisData[$dpCol]);
($dp) || die "E $0: AF is $callInfo but couldn't find DP in $data[$i]\n$line\n";
# we want [$DP:$AF]
$callInfo = "[$dp:$callInfo]";
}
}
if (defined $geno2samples{$geno}) { $geno2samples{$geno} .= ","; }
else { $geno2samples{$geno} = ""; }
$geno2samples{$geno} .= $samples[$i].$callInfo;
}
# now print data for each genotype, removing data from %geno2samples as we go
# format is: genotype1~sample1,sample2|genotype2~sample3|genotype3~sample4,sample5
# HV
$lineToPrint .= "\t";
foreach my $geno (sort GENOSORT keys %geno2samples) {
($geno eq '0/0') && next; # skip HR
($geno =~ m~^(\d+)/\1$~) || next;
$lineToPrint .= "$geno~".$geno2samples{$geno}."|";
delete($geno2samples{$geno});
}
# remove last | if needed (not needed if there are no HVs)
$lineToPrint =~ s/\|$// ;
# HET == 0/*
$lineToPrint .= "\t";
foreach my $geno (sort GENOSORT keys %geno2samples) {
($geno eq '0/0') && next; # skip HR
($geno =~ m~^0/\d+$~) || next;
$lineToPrint .= "$geno~".$geno2samples{$geno}."|";
delete($geno2samples{$geno});
}
# remove last | if there was at least one HET geno
$lineToPrint =~ s/\|$// ;
# OTHER
$lineToPrint .= "\t";
foreach my $geno (sort GENOSORT keys %geno2samples) {
($geno eq '0/0') && next; # skip HR
($geno =~ m~^\d+/\d+$~) || die "E $0: cannot parse geno $geno in line:\n$line\n";
$lineToPrint .= "$geno~".$geno2samples{$geno}."|";
delete($geno2samples{$geno});
}
# remove last | if needed
$lineToPrint =~ s/\|$// ;
# HR: if no sample is HR, column is empty
$lineToPrint .= "\t";
if (defined $geno2samples{'0/0'}) {
$lineToPrint .= "0/0~".$geno2samples{'0/0'};
delete($geno2samples{'0/0'});
}
# sanity:
(%geno2samples) &&
die "E $0: after eating every geno, geno2samples not empty: %geno2samples, line:\n$line\n";
print $lineToPrint."\n";
}
$now = strftime("%F %T", localtime);
warn "I $now: $0 - ALL DONE, completed successfully!\n";
#######################################
# sort function for genotypes x/y (x and y are ints)
sub GENOSORT {
($a =~ m~^(\d+)/(\d+)$~) || die "E $0 in GENOSORT: cannot decompose a $a\n";
my ($a1,$a2) = ($1,$2);
($b =~ m~^(\d+)/(\d+)$~) || die "E $0 in GENOSORT: cannot decompose b $b\n";
my ($b1,$b2) = ($1,$2);
return(($a1 <=> $b1) || ($a2 <=> $b2));
}