fasta.impala

type char = u8;
type str = [char];

extern "C" {
    fn strlen(&str) -> u64;
    fn memcpy(&char, &char, u64) -> &str;
    fn write(&str,u64) -> ();
    fn println(&str) -> ();
}

struct nucleotide_info {
    letter: char,
    probability: float,
}

fn range(a: int, b: int, body: fn(int, fn())) -> () {
    if a < b {
        body(a);
        range(a+1, b, body, return)
    }
}

static MAXIMUM_LINE_WIDTH = 60;

// Repeatedly print string_To_Repeat until it has printed
// number_Of_Characters_To_Create. The output is also wrapped to
// MAXIMUM_LINE_WIDTH columns.
fn repeat_And_Wrap_String(string_To_Repeat: &str, number_Of_Characters_To_Create: int) -> () {
    let string_To_Repeat_Length = strlen(string_To_Repeat) as int;

    // Create an extended_String_To_Repeat which is a copy of string_To_Repeat
    // but extended with another copy of the first MAXIMUM_LINE_WIDTH characters
    // of string_To_Repeat appended to the end. Later on this allows us to
    // generate a line of output just by doing simple memory copies using an
    // appropriate offset into extended_String_To_Repeat.
    let mut extended_String_To_Repeat = ~[string_To_Repeat_Length+MAXIMUM_LINE_WIDTH: char];
    for column in range(0, string_To_Repeat_Length + MAXIMUM_LINE_WIDTH) {
        extended_String_To_Repeat(column) = string_To_Repeat(column % string_To_Repeat_Length);
    }

    let mut offset = 0;

    let mut line: [char * 61];
    line(MAXIMUM_LINE_WIDTH) = '\n';

    let mut current_Number_Of_Characters_To_Create = number_Of_Characters_To_Create;
    while current_Number_Of_Characters_To_Create > 0 {
        // Figure out the length of the line we need to write. If it's less than
        // MAXIMUM_LINE_WIDTH then we also need to add a line feed in the right
        // spot too.
        let mut line_Length = MAXIMUM_LINE_WIDTH;
        if current_Number_Of_Characters_To_Create < MAXIMUM_LINE_WIDTH {
            line_Length = current_Number_Of_Characters_To_Create;
            line(line_Length) = '\n';
        }

        memcpy(&line(0), &extended_String_To_Repeat(offset), line_Length as u64);

        // Update the offset, reducing it by string_To_Repeat_Length if
        // necessary.
        offset += line_Length;
        if offset>string_To_Repeat_Length {
            offset -= string_To_Repeat_Length;
        }

        // Output the line to stdout and update the
        // current_Number_Of_Characters_To_Create.
        write(&line, (line_Length+1) as u64);
        current_Number_Of_Characters_To_Create -= line_Length;
   }
}

// Generate a floating point pseudorandom number from 0.0 to max using a linear
// congruential generator.
static IM   = 139968u;
static IA   = 3877u;
static IC   = 29573u;

fn get_LCG_Pseudorandom_Number(max: float ) -> float {
   static mut seed = 42u;
   seed = (seed*IA + IC) % IM;
   (max/(IM as float))*(seed as float)
}

// Print a pseudorandom DNA sequence that is number_Of_Characters_To_Create
// characters long and made up of the nucleotides specified in
// nucleotides_Information and occurring at the frequencies specified in
// nucleotides_Information. The output is also wrapped to MAXIMUM_LINE_WIDTH
// columns.
fn generate_And_Wrap_Pseudorandom_DNA_Sequence(
    nucleotides_Information: &[nucleotide_info],
    number_Of_Nucleotides: int,
    number_Of_Characters_To_Create: int) -> () 
{
    // Cumulate the probabilities. Note that the probability is being multiplied
    // by IM because later on we'll also be calling the random number generator
    // with a value that is multiplied by IM. Since the random number generator
    // does a division by IM this allows the compiler to cancel out the
    // multiplication and division by IM with each other without requiring any
    // changes to the random number generator code whose code was explicitly
    // defined in the rules.
    let mut cumulative_Probabilities = ~[number_Of_Nucleotides: float];
    let mut cumulative_Probability = 0.0f;
    for i in range(0, number_Of_Nucleotides) {
        cumulative_Probability += nucleotides_Information(i).probability;
        let p = cumulative_Probability*(IM as float);
        cumulative_Probabilities(i) = p;
    }

    let mut line: [char * 61];
    line(MAXIMUM_LINE_WIDTH) = '\n';

    let mut current_Number_Of_Characters_To_Create = number_Of_Characters_To_Create;
    while current_Number_Of_Characters_To_Create > 0 {
        // Figure out the length of the line we need to write. If it's less than
        // MAXIMUM_LINE_WIDTH then we also need to add a line feed in the right
        // spot too.
        let mut line_Length = MAXIMUM_LINE_WIDTH;
        if current_Number_Of_Characters_To_Create < MAXIMUM_LINE_WIDTH {
            line_Length = current_Number_Of_Characters_To_Create;
            line(line_Length) = '\n';
        }

        // Fill up the line with characters from nucleotides_Information[] that
        // are selected by looking up a pseudorandom number.
        for column in range(0, line_Length) {
            let r = get_LCG_Pseudorandom_Number(IM as float);

            // Count the number of nucleotides with a probability less than what
            // was selected by the random number generator and then use that
            // count as an index for the nucleotide to select. It's arguable
            // whether this qualifies as a linear search but I guess you can say
            // that you're doing a linear search for all the nucleotides with a
            // probability less than what was selected by the random number
            // generator and then just counting how many matches were found.
            // With a small number of nucleotides this can be faster than doing
            // a more normal linear search (although in some cases it may
            // generate different results) and a couple of the other programs
            // already do this as well so we will too.
            let mut count = 0;
            for i in range(0, number_Of_Nucleotides) {
                if cumulative_Probabilities(i) <= r {
                    ++count;
                }
            }

            line(column) = nucleotides_Information(count).letter;
        }

        // Output the line to stdout and update the
        // current_Number_Of_Characters_To_Create.
        write(&line, (line_Length+1) as u64);
        current_Number_Of_Characters_To_Create -= line_Length;
    }
}

fn main(n: int) -> () {
    println(">ONE Homo sapiens alu");
    let mut homo_Sapiens_Alu =
        "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC"
        "AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG"
        "TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG"
        "AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA";
    repeat_And_Wrap_String(&homo_Sapiens_Alu, 2*n);
    println(">TWO IUB ambiguity codes");
    let mut iub_Nucleotides_Information = [
        nucleotide_info{letter: 'a', probability: 0.27f},
        nucleotide_info{letter: 'c', probability: 0.12f},
        nucleotide_info{letter: 'g', probability: 0.12f},
        nucleotide_info{letter: 't', probability: 0.27f},
        nucleotide_info{letter: 'B', probability: 0.02f},
        nucleotide_info{letter: 'D', probability: 0.02f},
        nucleotide_info{letter: 'H', probability: 0.02f},
        nucleotide_info{letter: 'K', probability: 0.02f},
        nucleotide_info{letter: 'M', probability: 0.02f},
        nucleotide_info{letter: 'N', probability: 0.02f},
        nucleotide_info{letter: 'R', probability: 0.02f},
        nucleotide_info{letter: 'S', probability: 0.02f},
        nucleotide_info{letter: 'V', probability: 0.02f},
        nucleotide_info{letter: 'W', probability: 0.02f},
        nucleotide_info{letter: 'Y', probability: 0.02f}
    ];

    generate_And_Wrap_Pseudorandom_DNA_Sequence(&iub_Nucleotides_Information, 15, 3*n);

    println(">THREE Homo sapiens frequency");
    let mut homo_Sapien_Nucleotides_Information = [
        nucleotide_info{letter: 'a', probability: 0.3029549426680f},
        nucleotide_info{letter: 'c', probability: 0.1979883004921f},
        nucleotide_info{letter: 'g', probability: 0.1975473066391f}, 
        nucleotide_info{letter: 't', probability: 0.3015094502008f}
    ];
    generate_And_Wrap_Pseudorandom_DNA_Sequence(&homo_Sapien_Nucleotides_Information, 4, 5*n);
}