-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathali-pfam-lowercase-rf-region.pl
91 lines (77 loc) · 3.2 KB
/
ali-pfam-lowercase-rf-region.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env perl
# EPN, Mon Jan 6 11:44:52 2020
# ali-pfam-lowercase-rf-region.pl
#
# Given a pfam formatted alignment with RF annotation, and a region of
# RF positions, make all nucleotides within those regions lowercase.
#
use warnings;
use strict;
use Getopt::Long;
my $usage;
$usage = "ali-pfam-lowercase-rf-region.pl\n\n";
$usage .= "Usage:\n\n";
$usage .= "ali-pfam-lowercase-rf-region.pl <pfam formatted alignment file> <RF start position> <RF end position>\n";
#$usage .= "\tOPTIONS:\n";
#$usage .= "\t\t-s: additionally convert gap RF columns to '.' in individual SS and SS_cons lines [default: do not]\n\n";
#$usage .= "\t\t-o: *only* convert gap RF columns to '.' in individual SS and SS_cons lines, do not lowercase RF gap columns in each sequence\n\n";
#my $opt_s = 0; # set to '1' if -s used
#my $opt_o = 0; # set to '1' if -o used
#&GetOptions( "s" => \$opt_s,
# "o" => \$opt_o);
if(scalar(@ARGV) != 3) { die $usage; }
#if($opt_s && $opt_o) { die "ERROR only one of -s and -o can be used"; }
my ($aln_file, $rf_start, $rf_end) = (@ARGV);
if($rf_start <= 0) { die "ERROR <RF start position> must be >= 1"; }
if($rf_start > $rf_end) { die "ERROR <RF start position> must be <= <RF end position>"; }
my %seen_H = (); # key is sequence name, used to check to make sure we are in Pfam format
open(IN, $aln_file) || die "ERROR unable to open $aln_file";
# first pass to determine rfgap columns
my $found_rf = 0;
my $i;
my @do_lowercase_A = (); # [0..alen], value is 1 if we should convert to lowercase because within rf_start..rf_end
my $line;
my $rfpos = 0;
while($line = <IN>) {
if($line =~ /^\#=GC\s+RF\s+(\S+)$/) {
if($found_rf) {
die "ERROR RF line found twice, did you convert to pfam format (e.g. esl-reformat pfam ...)?";
}
my $rf = $1;
my @rf_A = split("", $rf);
for($i = 0; $i < scalar(@rf_A); $i++) {
my $rf_is_gap = ($rf_A[$i] =~ m/[\.\-\_\~]/) ? 1 : 0; # RF gap characters are: '-', '.', '_' and '~'
if(! $rf_is_gap) { $rfpos++; }
$do_lowercase_A[$i] = (($rfpos >= $rf_start) && ($rfpos <= $rf_end)) ? 1 : 0;
}
$found_rf = 1;
}
}
my $rflen = $rfpos;
if(! $found_rf) { die "ERROR did not find RF annotation"; }
close(IN);
if($rflen < $rf_start) { die "ERROR <RF start position> input as $rf_start but RF length is only $rflen"; }
if($rflen < $rf_end) { die "ERROR <RF end position> input as $rf_end but RF length is only $rflen"; }
# second pass to lowercase RF gap columns in sequences and optionally convert RF gap columns in individual SS and SS_cons lines to '.'
open(IN, $aln_file) || die "ERROR unable to open $aln_file on second pass";
while($line = <IN>) {
chomp $line;
if($line =~ /^(\S+)(\s+)(\S+)$/) {
my ($seqname, $space, $seq) = ($1, $2, $3);
if((exists $seen_H{$seqname}) && ($seen_H{$seqname} == 1)) {
die "ERROR saw sequence $seqname twice, did you convert to pfam format (e.g. esl-reformat pfam ...)?";
}
$seen_H{$seqname} = 1;
my @seq_A = split("", $seq);
for($i = 0; $i < scalar(@seq_A); $i++) {
if($do_lowercase_A[$i]) {
$seq_A[$i] =~ tr/A-Z/a-z/;
}
}
printf("%s%s%s\n", $seqname, $space, join("", @seq_A));
}
else {
print $line . "\n";
}
}
close(IN);