-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathesl-alistat-cinfo-rfam-ss-seqcons.pl
84 lines (74 loc) · 2.85 KB
/
esl-alistat-cinfo-rfam-ss-seqcons.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env perl
# EPN, Tue Aug 29 19:01:47 2023
# esl-alistat-cinfo-rfam-ss-cons.pl
# Add the position conservation calculated in Bio-Easel's
# MSA.c:_c_pos_conservation (v0.16) to an esl-alistat
# --cinfo file. This is the conservation data that is
# colored on the Rfam (14.9) Secondary structure ->
# 'seqcons' page (rfam-family-pipeline/Rfam/Scripts/view/rfam_family_view.pl
# and rfam-family-pipeline/Rfam/Lib/Bio/Rfam/View/Plugin/SecondaryStructure.pm
#
use warnings;
use strict;
use Getopt::Long;
my $usage;
$usage = "perl esl-alistat-cinfo-rfam-ss-seqcons.pl [OPTIONS] <DNA or RNA alignment esl-alistat --cinfo output file>\n";
$usage .= "\tOPTIONS:\n";
$usage .= "\t--ignoregaps: report frequency of most common nt, ignoring gaps (default Rfam behavior: consider gaps)\n\n";
my $do_ignoregaps = 0;
&GetOptions( "ignoregaps" => \$do_ignoregaps);
if(scalar(@ARGV) != 1) { die $usage; }
my ($cinfo_file) = (@ARGV);
open(CINFO, $cinfo_file) || die "ERROR unable to open $cinfo_file";
## Per column residue counts:
## Alignment file: /Users/nawrockie/src/infernal-1.1.4/tutorial/Cobalamin.sto
## Alignment idx: 1
## Alignment name: Cobalamin
## Number of sequences: 431
## Ambiguities were averaged (e.g. 1 'N' = 0.25 'A', 0.25 'C', 0.25 'G' and 0.25 'U')
## Sequence weights from alignment were ignored (if they existed).
##
## alnpos A C G U
## ------- ------- ------- ------- -------
# 1 101.0 101.0 89.0 139.0
# 2 117.0 110.0 76.0 128.0
# The Rfam method counts gaps, so a gappy column cannot have high sequence conservation
# Here, gaps are not reported, but we can get number of sequences from line 4 so we
# don't need to know how many gaps.
my $nseq = undef;
while(my $line = <CINFO>) {
chomp $line;
if($line =~ /^\# Number of sequences\:\s+(\d+)/) {
$nseq = $1;
}
if(($line !~ m/^\#/) && ($line !~ m/^\/\/$/)) {
# data line
chomp $line;
my $orig_line = $line;
$line =~ s/^\s+//;
my @el_A = split(/\s+/, $line);
if(scalar(@el_A) != 5) { die "ERROR expected 5 tokens on data line but got a different number: $line\n"; }
my ($apos, $na, $nc, $ng, $nu) = (@el_A);
if(! defined $nseq) {
die "ERROR did not read number of sequences line before first data line";
}
#my $ngaps = ($nseq - ($na + $nc + $ng + $nu));
my $max = $na;
if($nc > $max) { $max = $nc; }
if($ng > $max) { $max = $ng; }
if($nu > $max) { $max = $nu; }
my $denom = ($do_ignoregaps) ? ($na+$nc+$ng+$nu) : $nseq;
my $seqcons = $max / $denom;
printf("%s %.3f\n", $orig_line, $seqcons);
}
elsif($line =~ m/alnpos/) {
print("# alnpos A C G U maxfreq\n");
}
elsif($line =~ m/^\# ---/) {
print("# ------- ------- ------- ------- ------- -------\n");
}
else {
print $line . "\n";
}
}
close(CINFO)