-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathA3MIO.py
170 lines (120 loc) · 5.36 KB
/
A3MIO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
a3m and a2m input for BioPython
This registers new iterators that can be used for reading sequences and MSAs with Bio.SeqIO and Bio.AlignIO.
To use, simply import this module in your project and then use e.g. Bio.AlignIO.parse using any of these new formats:
a3m Regular A3M format
a2m Regular A2M format
a3m-nogaps A3M format, ignoring all insert states
a2m-nogaps A2M format, ignoring all insert states
See the demo at the bottom of the file for details
"""
import Bio.SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser, FastaIterator
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import collections
import re
try:
import itertools
from itertools import izip as zip
except ImportError:
pass
def pad(x, l, pad_char="-"):
"""Pad x with pad_char characters until it has length l"""
return x + pad_char * (l - len(x))
def insert_gaps(sequences):
"""Return a iterator over sequences that corresponds to the input sequences with gaps added"""
re_match = re.compile(r'([A-Z0-9~-])')
# split input sequences into columns of single-character match (upper case and -) states and
# variable-length inserts (lower case)
inserts = [
re_match.split(seq)
for seq in sequences
]
# for each column, determine maximum length
insert_max_lengths = [
max(
len(inserts[i][j])
for i in range(len(inserts))
)
for j in range(len(inserts[0]))
]
return (
"".join(
pad(insert, insert_len)
for insert, insert_len in zip(seq, insert_max_lengths)
)
for seq in inserts
)
def SimpleA2MA3MParser(handle, format='a3m', remove_inserts=False):
"""Simple A3M/A2M parser that returns tuples of title and sequence"""
re_insert = re.compile(r'[a-z.]')
# piggyback on the fasta parser for splitting file into title and sequence
parsed = collections.OrderedDict(SimpleFastaParser(handle))
titles = parsed.keys()
sequences = parsed.values()
if format == 'a3m' and not remove_inserts:
sequences = insert_gaps(sequences)
elif format == 'a2m' and not remove_inserts:
sequences = (
seq.replace(".", "-")
for seq in sequences
)
elif remove_inserts:
sequences = (
re_insert.sub('', seq)
for seq in sequences
)
else:
raise ValueError("Unknown format: {0}".format(format))
return zip(titles, sequences)
def A2MA3MIterator(format='a3m', remove_inserts=False):
"""Create a SeqIO-style iterator from parameters
Arguments:
format -- What input format to parse ('a3m' or 'a2m') (default: 'a3m')
remove_inserts -- Whether inserts with respect to the query sequence should be removed (default: False)
"""
def inner_iterator(handle, alphabet=single_letter_alphabet, title2ids=None):
"""Generator function to iterate over a3m records (as SeqRecord objects).
Arguments:
handle -- input file
alphabet -- optional alphabet
title2ids -- A function that, when given the title of the FASTA file (without the beginning >),
will return the id, name and description (in that order) for the record as a tuple
of strings. If this is not given, then the entire title line will be used as the
description, and the first word as the id and name.
"""
if title2ids is None:
title2ids = lambda t: (t.split(None, 1)[0], t.split(None, 1)[0], t)
for title, seq in SimpleA2MA3MParser(handle, format, remove_inserts):
id, name, description = title2ids(title)
yield SeqRecord(Seq(seq, alphabet), id=id, name=name, description=description)
return inner_iterator
def monkey_patch(name, iterator):
"""Monkey patch the Bio.SeqIO iterator registry if no such iterator exists yet"""
if name not in Bio.SeqIO._FormatToIterator:
Bio.SeqIO._FormatToIterator[name] = iterator
monkey_patch('a3m', A2MA3MIterator('a3m', False))
monkey_patch('a3m-nogaps', A2MA3MIterator('a3m', True))
monkey_patch('a2m', A2MA3MIterator('a2m', False))
monkey_patch('a2m-nogaps', A2MA3MIterator('a2m', True))
if __name__ == '__main__':
def assert_equal(iter_a, iter_b, msg):
for rec_a, rec_b in zip(iter_a, iter_b):
# assert that string representations of SeqRecords match - this is
# quite hacky and should not be done :)
assert repr(rec_a) == repr(rec_b), msg + ": " + repr((rec_a, rec_b))
# Compare our parser against output of reformat.pl from the HH-suite
a2m = list(Bio.SeqIO.parse("data/test.a2m", "a2m"))
a3m = list(Bio.SeqIO.parse("data/test.a3m", "a3m"))
fas = list(Bio.SeqIO.parse("data/test.fasta", "fasta"))
assert_equal(a2m, a3m, "a2m vs a3m")
assert_equal(a3m, fas, "a3m vs fas")
assert_equal(a2m, fas, "a2m vs fas")
a2m_ng = list(Bio.SeqIO.parse("data/test.a2m", "a3m-nogaps"))
a3m_ng = list(Bio.SeqIO.parse("data/test.a3m", "a3m-nogaps"))
fas_ng = list(Bio.SeqIO.parse("data/test.fasta_nogaps", "fasta"))
assert_equal(a2m_ng, a3m_ng, "a2m_ng vs a3m_ng")
assert_equal(a3m_ng, fas_ng, "a3m_ng vs fas_ng")
assert_equal(a2m_ng, fas_ng, "a2m_ng vs fas_ng")