-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta.impala
194 lines (170 loc) · 8.22 KB
/
fasta.impala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
type char = u8;
type str = [char];
extern "C" {
fn strlen(&str) -> u64;
fn memcpy(&char, &char, u64) -> &str;
fn write(&str,u64) -> ();
fn println(&str) -> ();
}
struct nucleotide_info {
letter: char,
probability: float,
}
fn range(a: int, b: int, body: fn(int, fn())) -> () {
if a < b {
body(a);
range(a+1, b, body, return)
}
}
static MAXIMUM_LINE_WIDTH = 60;
// Repeatedly print string_To_Repeat until it has printed
// number_Of_Characters_To_Create. The output is also wrapped to
// MAXIMUM_LINE_WIDTH columns.
fn repeat_And_Wrap_String(string_To_Repeat: &str, number_Of_Characters_To_Create: int) -> () {
let string_To_Repeat_Length = strlen(string_To_Repeat) as int;
// Create an extended_String_To_Repeat which is a copy of string_To_Repeat
// but extended with another copy of the first MAXIMUM_LINE_WIDTH characters
// of string_To_Repeat appended to the end. Later on this allows us to
// generate a line of output just by doing simple memory copies using an
// appropriate offset into extended_String_To_Repeat.
let mut extended_String_To_Repeat = ~[string_To_Repeat_Length+MAXIMUM_LINE_WIDTH: char];
for column in range(0, string_To_Repeat_Length + MAXIMUM_LINE_WIDTH) {
extended_String_To_Repeat(column) = string_To_Repeat(column % string_To_Repeat_Length);
}
let mut offset = 0;
let mut line: [char * 61];
line(MAXIMUM_LINE_WIDTH) = '\n';
let mut current_Number_Of_Characters_To_Create = number_Of_Characters_To_Create;
while current_Number_Of_Characters_To_Create > 0 {
// Figure out the length of the line we need to write. If it's less than
// MAXIMUM_LINE_WIDTH then we also need to add a line feed in the right
// spot too.
let mut line_Length = MAXIMUM_LINE_WIDTH;
if current_Number_Of_Characters_To_Create < MAXIMUM_LINE_WIDTH {
line_Length = current_Number_Of_Characters_To_Create;
line(line_Length) = '\n';
}
memcpy(&line(0), &extended_String_To_Repeat(offset), line_Length as u64);
// Update the offset, reducing it by string_To_Repeat_Length if
// necessary.
offset += line_Length;
if offset>string_To_Repeat_Length {
offset -= string_To_Repeat_Length;
}
// Output the line to stdout and update the
// current_Number_Of_Characters_To_Create.
write(&line, (line_Length+1) as u64);
current_Number_Of_Characters_To_Create -= line_Length;
}
}
// Generate a floating point pseudorandom number from 0.0 to max using a linear
// congruential generator.
static IM = 139968u;
static IA = 3877u;
static IC = 29573u;
fn get_LCG_Pseudorandom_Number(max: float ) -> float {
static mut seed = 42u;
seed = (seed*IA + IC) % IM;
(max/(IM as float))*(seed as float)
}
// Print a pseudorandom DNA sequence that is number_Of_Characters_To_Create
// characters long and made up of the nucleotides specified in
// nucleotides_Information and occurring at the frequencies specified in
// nucleotides_Information. The output is also wrapped to MAXIMUM_LINE_WIDTH
// columns.
fn generate_And_Wrap_Pseudorandom_DNA_Sequence(
nucleotides_Information: &[nucleotide_info],
number_Of_Nucleotides: int,
number_Of_Characters_To_Create: int) -> ()
{
// Cumulate the probabilities. Note that the probability is being multiplied
// by IM because later on we'll also be calling the random number generator
// with a value that is multiplied by IM. Since the random number generator
// does a division by IM this allows the compiler to cancel out the
// multiplication and division by IM with each other without requiring any
// changes to the random number generator code whose code was explicitly
// defined in the rules.
let mut cumulative_Probabilities = ~[number_Of_Nucleotides: float];
let mut cumulative_Probability = 0.0f;
for i in range(0, number_Of_Nucleotides) {
cumulative_Probability += nucleotides_Information(i).probability;
let p = cumulative_Probability*(IM as float);
cumulative_Probabilities(i) = p;
}
let mut line: [char * 61];
line(MAXIMUM_LINE_WIDTH) = '\n';
let mut current_Number_Of_Characters_To_Create = number_Of_Characters_To_Create;
while current_Number_Of_Characters_To_Create > 0 {
// Figure out the length of the line we need to write. If it's less than
// MAXIMUM_LINE_WIDTH then we also need to add a line feed in the right
// spot too.
let mut line_Length = MAXIMUM_LINE_WIDTH;
if current_Number_Of_Characters_To_Create < MAXIMUM_LINE_WIDTH {
line_Length = current_Number_Of_Characters_To_Create;
line(line_Length) = '\n';
}
// Fill up the line with characters from nucleotides_Information[] that
// are selected by looking up a pseudorandom number.
for column in range(0, line_Length) {
let r = get_LCG_Pseudorandom_Number(IM as float);
// Count the number of nucleotides with a probability less than what
// was selected by the random number generator and then use that
// count as an index for the nucleotide to select. It's arguable
// whether this qualifies as a linear search but I guess you can say
// that you're doing a linear search for all the nucleotides with a
// probability less than what was selected by the random number
// generator and then just counting how many matches were found.
// With a small number of nucleotides this can be faster than doing
// a more normal linear search (although in some cases it may
// generate different results) and a couple of the other programs
// already do this as well so we will too.
let mut count = 0;
for i in range(0, number_Of_Nucleotides) {
if cumulative_Probabilities(i) <= r {
++count;
}
}
line(column) = nucleotides_Information(count).letter;
}
// Output the line to stdout and update the
// current_Number_Of_Characters_To_Create.
write(&line, (line_Length+1) as u64);
current_Number_Of_Characters_To_Create -= line_Length;
}
}
fn main(n: int) -> () {
println(">ONE Homo sapiens alu");
let mut homo_Sapiens_Alu =
"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC"
"AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG"
"TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG"
"AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA";
repeat_And_Wrap_String(&homo_Sapiens_Alu, 2*n);
println(">TWO IUB ambiguity codes");
let mut iub_Nucleotides_Information = [
nucleotide_info{letter: 'a', probability: 0.27f},
nucleotide_info{letter: 'c', probability: 0.12f},
nucleotide_info{letter: 'g', probability: 0.12f},
nucleotide_info{letter: 't', probability: 0.27f},
nucleotide_info{letter: 'B', probability: 0.02f},
nucleotide_info{letter: 'D', probability: 0.02f},
nucleotide_info{letter: 'H', probability: 0.02f},
nucleotide_info{letter: 'K', probability: 0.02f},
nucleotide_info{letter: 'M', probability: 0.02f},
nucleotide_info{letter: 'N', probability: 0.02f},
nucleotide_info{letter: 'R', probability: 0.02f},
nucleotide_info{letter: 'S', probability: 0.02f},
nucleotide_info{letter: 'V', probability: 0.02f},
nucleotide_info{letter: 'W', probability: 0.02f},
nucleotide_info{letter: 'Y', probability: 0.02f}
];
generate_And_Wrap_Pseudorandom_DNA_Sequence(&iub_Nucleotides_Information, 15, 3*n);
println(">THREE Homo sapiens frequency");
let mut homo_Sapien_Nucleotides_Information = [
nucleotide_info{letter: 'a', probability: 0.3029549426680f},
nucleotide_info{letter: 'c', probability: 0.1979883004921f},
nucleotide_info{letter: 'g', probability: 0.1975473066391f},
nucleotide_info{letter: 't', probability: 0.3015094502008f}
];
generate_And_Wrap_Pseudorandom_DNA_Sequence(&homo_Sapien_Nucleotides_Information, 4, 5*n);
}