-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathalign_codon2aa.pl
executable file
·146 lines (106 loc) · 5.89 KB
/
align_codon2aa.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#! /usr/bin/perl
# Takes in two arguments: (1) aligned amino acid sequence; and (2) nucleotide sequence.
# Prints the codon-aligned nucleotide sequence to screen.
#########################################################################################
# EXAMPLE CALL:
#########################################################################################
# align_codon2aa.pl <aligned amino acid sequence> <nucleotide sequence>
# align_codon2aa.pl MSAARKRTL-L ATGTCGGCGGCTCGTAAGCGCACCTTGTTG
#########################################################################################
# Copyright (C) 2018 Chase W. Nelson
# Date created: July 27, 2018
# AUTHOR: Chase W. Nelson
# CONTACT1: cnelson@amnh.org
# AFFILIATION: Sackler Institute for Comparative Genomics, American Museum of Natural History, New York, NY 10024, USA
# ACKNOWLEDGMENTS: written by C.W.N. with support from a Gerstner Scholars Fellowship from
# the Gerstner Family Foundation at the American Museum of Natural History, New York.
use strict;
use warnings;
# This example should NOT work
#my $this_aln_aa_seq = 'MSAARKRTLLKVIILGDSGVGKTSLINQYVNNKFSNQHKATIGADFLTKEVVEDRLVTMQIWDTAGQERLQGLGVAFYRGADCCVLVYDVYVMKSFDNLDYWRDEFVIQAAPSDQEYFPFVVLGNKVDVDGGNSRVVSEKKAKAWCAAKGGIPYFESSAKEDFNVDAAFQCIAKNALKNETEEEIYLPDTIDVNASRPQKTSGCEC';
#my $this_nt_seq = 'ATGTCGGCGGCTCGTAAGCGCACCTTGTTGAAGGTCATCATCCTCGGCGATAGCGGGGTTGGAAAGACATCGCTGATAAACCAATATGTTAATAACAAGTTCAGCAATCAACACAAGGCCACTATTGGAGCTGATTTCCTAACTAAAGAAGTATAAGTTGAAGACAGACTTGTTACAATGCAGATTTGGGATACGGCTGGACAAGAACGGCTTCAGGGTCTCGGTGTTGCCTTCTACAGAGGTGCAGATTGTTGTGTTCTTGTTTACGATGTTTATGTCATGAAGTCATTCGATAACCTTGACTACTGGAGGGATGAGTTTGTGATCCAGGCAGCCCCATCCGATCAGGAGTACTTCCCCTTTGTAGTACTTGGAAATAAAGTGGACGTGGATGGTGGCAATAGTCGAGTGGTCTCCGAGAAGAAGGCCAAAGCATGGTGTGCAGCGAAAGGAGGCATCCCCTACTTTGAGTCATCAGCCAAGGAAGACTTCAACGTGGATGCTGCATTCCAGTGTATTGCCAAGAACGCATTGAAGAACGAGACGGAGGAGGAAATCTACCTGCCTGATACGATCGACGTGAACGCCAGCAGGCCACAGAAAACTTCCGGATGCGAGTGTTAA';
# This example SHOULD work
#my $this_aln_aa_seq = 'MSAARKRTL-L';
#my $this_nt_seq = 'ATGTCGGCGGCTCGTAAGCGCACCTTGTTG'
my $this_aln_aa_seq = $ARGV[0];
my $this_nt_seq = $ARGV[1];
print "\n\n##########################################################################################\n";
print "### Performing codon alignment...\n";
unless($this_nt_seq) {
die "\n## Must submit the following arguments (in order):\n".
"##\t(1) aligned amino acid sequence\n##\t(2) nucleotide sequence\n## TERMINATED.\n\n";
}
my $this_nt_seq_aligned = &align_codon2aa($this_aln_aa_seq, $this_nt_seq);
print "\n### CODON ALIGNMENT: $this_nt_seq_aligned\n";
exit;
#########################################################################################
#########################################################################################
#################################### ####################################
#################################### SUBROUTINES ####################################
#################################### ####################################
#########################################################################################
#########################################################################################
##########################################################################################
sub align_codon2aa {
my ($curr_aa_seq, $curr_nt_seq) = @_;
# RECORD positions of GAPS in the amino acid, and also check for correct lengths
# Store gap indices for each seq in an array of arrays
my @aa_gap_indices;
my $curr_aa_seq_length = length($curr_aa_seq);
my $ungapped_curr_aa_seq_length = $curr_aa_seq_length;
my $nt_seq_length = length($curr_nt_seq);
for(my $pos_index = 0; $pos_index < $curr_aa_seq_length; $pos_index++) {
my $curr_aa = substr($curr_aa_seq, $pos_index, 1);
if($curr_aa =~ /-/) { # It's a gap
$ungapped_curr_aa_seq_length -= 1;
push(@aa_gap_indices, $pos_index);
}
}
if(($ungapped_curr_aa_seq_length * 3) != $nt_seq_length) {
# If nucleotide sequence contains a STOP codon not represented in the amino acid
# sequence or vice versa, modify the nucleotide sequence
my $last_codon = substr($curr_nt_seq, -3);
my $last_aa = substr($curr_aa_seq, -1);
#print "\nlast_codon = $last_codon\n";
if(($last_codon eq 'TAA' || $last_codon eq 'TAG' || $last_codon eq 'TGA') && ($last_aa ne '*')) {
# Remove the STOP codon to nt seq
#warn "\n### STOP CODON REMOVED.\n";
print "\n### REMOVING STOP codon:\nBEFOR: $curr_nt_seq\n";
substr($curr_nt_seq, -3, 3, '');
print "AFTER: $curr_nt_seq\n";
} elsif(($last_codon ne 'TAA' && $last_codon ne 'TAG' && $last_codon ne 'TGA') && ($last_aa eq '*')) {
# Add the STOP codon to nt seq
#warn "\n### STOP CODON ADDED.\n";
print "\n### ADDING STOP codon:\nBEFOR: $curr_nt_seq\n";
$curr_nt_seq .= 'TAA';
print "AFTER: $curr_nt_seq\n";
}
$nt_seq_length = length($curr_nt_seq);
#print "\nungapped_curr_aa_seq_length X 3 = " . $ungapped_curr_aa_seq_length * 3 . "\n" .
# "nt_seq_length = $nt_seq_length\n";
# If that didn't fix it
if(($ungapped_curr_aa_seq_length * 3) != $nt_seq_length) {
return '<ERRONEOUS INPUT; SEQUENCE LENGTHS NOT CONSISTENT>'; # DONE
}
}
# INSERT the appropriate gaps into the nucleotide sequences
if(scalar(@aa_gap_indices) >= 1) {
foreach(@aa_gap_indices) {
#print "$_ and ";
substr($curr_nt_seq, ($_ * 3), 0, '---')
}
#print "that's it.\n";
#print "New nt sequence length of <THIS> is " . length($curr_nt_seq) . "\n";
}
# WRITE the new nucleotide sequence with amino acid alignment imposed
my $curr_length = length($curr_nt_seq);
if ($curr_length < 3) {
die "\n\n### WARNING: Must be at least 3 nucleotides.\n\n";
}
if(($curr_length % 3) > 0) {
warn "\n\n### WARNING: Length is not a multiple of 3 (complete codons).\n\n";
}
#print "len is ".length($curr_nt_seq)."\n$curr_nt_seq";
#print "\nAligned Nucleotide Sequence:\n$curr_seq\n\n";
return "$curr_nt_seq";
}