chunked sorting

lskatz · Jan 17, 2025 · 7522617 · 7522617
1 parent 27c280d
commit 7522617
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 6 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name    = "fasten"
-version = "0.8.5"
+version = "0.8.6"
 authors = ["Lee Katz <gzu2@cdc.gov>"]
 #license-file  = "LICENSE"
 license       = "MIT"

diff --git a/src/bin/fasten_sort.rs b/src/bin/fasten_sort.rs
@@ -25,12 +25,23 @@
 //!   fasten_shuffle -d -1 sorted_1.fastq -2 sorted_2.fastq && \
 //!   gzip -v sorted_1.fastq sorted_2.fastq
 //! ```
+//! 
 //! Compare compression between unsorted and sorted
 //! from the previous example
+//! 
 //! ```bash
 //! ls -lh sorted_1.fastq.gz sorted_2.fastq.gz
 //! ```
 //! 
+//! ## Fast sorting of large files
+//! 
+//! If you want reads sorted but do not care if _everything_ is sorted,
+//! you can sort in chunks. This is useful for streaming large files.
+//! 
+//! ```bash
+//! zcat large.fastq.gz | fasten_sort --paired-end --chunk-size 1000 | gzip -c > sorted.fastq.gz
+//! ```
+//! 
 //! # Usage 
 //! 
 //! ```text
@@ -46,6 +57,10 @@
 //!                         are sorted by GC percentage. SEQ and ID are
 //!                         alphabetically sorted.
 //!     -r, --reverse       Reverse sort
+//!     -c, --chunk-size INT
+//!                         If > 0, then chunks of reads or pairs will be sorted
+//!                         instead of the whole set. This is useful for streaming
+//!                         large files. Default: 0
 //! ```
 
 extern crate getopts;
@@ -146,6 +161,7 @@ fn main(){
     opts.optopt("s","sort-by","Sort by either SEQ, MINIMIZER, GC, or ID. If GC, then the entries are sorted by GC percentage. SEQ and ID are alphabetically sorted.","STRING");
     opts.optopt("k", "kmer-length", "Length of kmer if using minimizers", "STRING");
     opts.optflag("r","reverse","Reverse sort");
+    opts.optopt("c", "chunk-size", "If > 0, then chunks of reads or pairs will be sorted instead of the whole set. This is useful for streaming large files. Default: 0", "INT");
 
     let matches = fasten_base_options_matches("Sort reads. This can be useful for many things including checksums and reducing gzip file sizes. Remember to use --paired-end if applicable.", opts);
 
@@ -175,6 +191,7 @@ fn main(){
     };
 
     let k: usize = matches.opt_get_default("kmer-length", 21).expect("ERROR parsing --kmer-length");
+    let chunk_size: usize = matches.opt_get_default("chunk-size", 0).expect("ERROR parsing --chunk-size");
 
     let mut buffer_iter = my_buffer.lines();
 
@@ -226,13 +243,28 @@ fn main(){
 
         entries.push(entry);
 
+        // If we are chunking, then sort and print the chunk
+        if chunk_size > 0 && entries.len() == chunk_size {
+            let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
+            for entry in sorted_entries {
+                println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
+                if entry.pe {
+                    println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
+                }
+            }
+            entries = vec![];
+        }
+
     }
 
-    let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
-    for entry in sorted_entries {
-        println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
-        if entry.pe {
-            println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
+    // If we aren't chunking then just print everything sorted
+    if chunk_size == 0 {
+        let sorted_entries:Vec<Seq> = sort_entries(entries, &which_field, reverse_sort);
+        for entry in sorted_entries {
+            println!("{}\n{}\n+\n{}", entry.id1, entry.seq1, entry.qual1);
+            if entry.pe {
+                println!("{}\n{}\n+\n{}", entry.id2, entry.seq2, entry.qual2);
+            }
         }
     }