Skip to content

Commit

Permalink
chunked sorting
Browse files Browse the repository at this point in the history
  • Loading branch information
lskatz committed Jan 17, 2025
1 parent 27c280d commit 7522617
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fasten"
version = "0.8.5"
version = "0.8.6"
authors = ["Lee Katz <gzu2@cdc.gov>"]
#license-file = "LICENSE"
license = "MIT"
Expand Down
42 changes: 37 additions & 5 deletions src/bin/fasten_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,23 @@
//! fasten_shuffle -d -1 sorted_1.fastq -2 sorted_2.fastq && \
//! gzip -v sorted_1.fastq sorted_2.fastq
//! ```
//!
//! Compare the compressed sizes of the unsorted and sorted files
//! from the previous example
//!
//! ```bash
//! ls -lh sorted_1.fastq.gz sorted_2.fastq.gz
//! ```
//!
//! ## Fast sorting of large files
//!
//! If you want reads sorted but do not need the _whole_ file sorted,
//! you can sort in chunks: each chunk of `--chunk-size` reads (or pairs)
//! is sorted and printed on its own. This is useful for streaming large files.
//!
//! ```bash
//! zcat large.fastq.gz | fasten_sort --paired-end --chunk-size 1000 | gzip -c > sorted.fastq.gz
//! ```
//!
//! # Usage
//!
//! ```text
Expand All @@ -46,6 +57,10 @@
//! are sorted by GC percentage. SEQ and ID are
//! alphabetically sorted.
//! -r, --reverse Reverse sort
//! -c, --chunk-size INT
//! If > 0, then chunks of reads or pairs will be sorted
//! instead of the whole set. This is useful for streaming
//! large files. Default: 0
//! ```
extern crate getopts;
Expand Down Expand Up @@ -146,6 +161,7 @@ fn main(){
opts.optopt("s","sort-by","Sort by either SEQ, MINIMIZER, GC, or ID. If GC, then the entries are sorted by GC percentage. SEQ and ID are alphabetically sorted.","STRING");
opts.optopt("k", "kmer-length", "Length of kmer if using minimizers", "STRING");
opts.optflag("r","reverse","Reverse sort");
opts.optopt("c", "chunk-size", "If > 0, then chunks of reads or pairs will be sorted instead of the whole set. This is useful for streaming large files. Default: 0", "INT");

let matches = fasten_base_options_matches("Sort reads. This can be useful for many things including checksums and reducing gzip file sizes. Remember to use --paired-end if applicable.", opts);

Expand Down Expand Up @@ -175,6 +191,7 @@ fn main(){
};

let k: usize = matches.opt_get_default("kmer-length", 21).expect("ERROR parsing --kmer-length");
let chunk_size: usize = matches.opt_get_default("chunk-size", 0).expect("ERROR parsing --chunk-size");

let mut buffer_iter = my_buffer.lines();

Expand Down Expand Up @@ -226,13 +243,28 @@ fn main(){

entries.push(entry);

// If we are chunking, then sort and print the chunk
if chunk_size > 0 && entries.len() == chunk_size {
let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
for entry in sorted_entries {
println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
if entry.pe {
println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
}
}
entries = vec![];
}

}

let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
for entry in sorted_entries {
println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
if entry.pe {
println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
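// If we aren't chunking, then sort and print everything at once.
// NOTE(review): when chunk_size > 0, chunks are flushed only when
// entries.len() == chunk_size, so a final partial chunk left in `entries`
// does not appear to be printed anywhere in this view -- confirm and flush
// the remainder here if needed.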
if chunk_size == 0 {
let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
for entry in sorted_entries {
println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
if entry.pe {
println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
}
}
}

Expand Down

0 comments on commit 7522617

Please sign in to comment.