-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathalign_sequences.py
executable file
·104 lines (88 loc) · 3.33 KB
/
align_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
"""
usage:
align_sequences [options] seq1.fa seq2.fa
where the options are:
-h,--help : print usage and quit
-m,--match: score of a match in the alignment [2]
-x,--mismatch: penalty for a mismatch in the alignment [1]
-g,--gapopen: penalty for opening a new gap [4]
-e,--gapextend: penalty for extending a gap [1]
"""
from sys import argv, stderr
from getopt import getopt, GetoptError
# a simple function to read the name and sequence from a file
# The file is expected to have just one contig/sequence. This function
# checks the assumption and complains if it is not the case.
def read_single_contig_fasta(filename):
names = []
sequences = []
with open(filename, 'r') as f:
line = f.readline()
assert line.startswith(">")
names.append(line.strip().split("\t"))
sequence = ""
for line in f:
if line.startswith(">"):
sequences.append(sequence)
names.append(line.strip().split("\t"))
sequence = ""
else:
for x in line.strip():
if x not in ["A", "C", "G", "T"]:
print("Unknown nucleotide {}".format(x), file=stderr)
exit(3)
sequence += line.strip()
sequences.append(sequence)
assert len(names) == 1
assert len(sequences) == 1
return names[0], sequences[0]
def smith_waterman(seq1, seq2, match, mismatch, gapopen, gapextend):
max_score = 0
alnseq1 = ""
alnseq2 = ""
return max_score, alnseq1, alnseq2
def main(filename1, filename2, match, mismatch, gapopen, gapextend):
# read the name and sequence from the file
name1, seq1 = read_single_contig_fasta(filename1)
name2, seq2 = read_single_contig_fasta(filename2)
# this function takes as input two nucleotide sequences along with
# scores for an alignment match, mismatch, opening a new gap, and
# extending an existing gap. This should return the maximum alignment
# score as well as the alignment. For examples see the testdriver script
max_score, alnseq1, alnseq2 = smith_waterman(seq1, seq2,
match, mismatch, gapopen, gapextend)
print("Maximum alignment score: {}".format(max_score))
print("Sequence1 : {}".format(alnseq1))
print("Sequence2 : {}".format(alnseq2))
if __name__ == "__main__":
try:
opts, args = getopt(argv[1:],
"hm:x:g:e:",
["help", "match=", "mismatch=", "gapopen=", "gapextend="])
except GetoptError as err:
print(err)
print(__doc__, file=stderr)
exit(1)
match = 2
mismatch = 1
gapopen = 4
gapextend = 1
for o, a in opts:
if o in ("-h", "--help"):
print(__doc__, file=stderr)
exit()
elif o in ("-m", "--match"):
match = float(a)
elif o in ("-x", "--mismatch"):
mismatch = float(a)
elif o in ("-g", "--gapopen"):
gapopen = float(a)
elif o in ("-e", "--gapextend"):
gapextend = float(a)
else:
assert False, "unhandled option"
if len(args) != 2:
print(__doc__, file=stderr)
exit(2)
main(args[0], args[1], match, mismatch, gapopen, gapextend)