Skip to content

Commit

Permalink
Merge pull request #63 from bacpop/filter-tweaks
Browse files Browse the repository at this point in the history
Filter tweaks
  • Loading branch information
johnlees authored Nov 23, 2023
2 parents 798c290 + 9e55002 commit df6b0a7
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 26 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.3.4"
version = "0.3.5"
authors = [
"John Lees <jlees@ebi.ac.uk>",
"Simon Harris <simon.harris@gatesfoundation.org>",
Expand Down
16 changes: 13 additions & 3 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,20 @@ use super::QualFilter;
pub const DEFAULT_KMER: usize = 17;
/// Default single strand (which is equivalent to !rc)
pub const DEFAULT_STRAND: bool = false;
/// Default behaviour when min-freq counting ambig sites
pub const DEFAULT_AMBIGMISSING: bool = false;
/// Default repeat masking behaviour
pub const DEFAULT_REPEATMASK: bool = false;
/// Default ambiguous masking behaviour
pub const DEFAULT_AMBIGMASK: bool = false;
/// Default gap ignoring behaviour (at constant sites)
pub const DEFAULT_CONSTGAPS: bool = false;
/// Default minimum k-mer count for FASTQ files
pub const DEFAULT_MINCOUNT: u16 = 10;
pub const DEFAULT_MINCOUNT: u16 = 5;
/// Default minimum base quality (PHRED score) for FASTQ files
pub const DEFAULT_MINQUAL: u8 = 20;
/// Default quality filtering criteria
pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Middle;
pub const DEFAULT_QUALFILTER: QualFilter = QualFilter::Strict;

#[doc(hidden)]
fn valid_kmer(s: &str) -> Result<usize, String> {
Expand Down Expand Up @@ -153,7 +155,7 @@ pub enum Commands {
min_qual: u8,

/// Quality filtering criteria (with reads)
#[arg(long, value_enum, default_value_t = QualFilter::Strict)]
#[arg(long, value_enum, default_value_t = DEFAULT_QUALFILTER)]
qual_filter: QualFilter,

/// Number of CPU threads
Expand All @@ -174,6 +176,10 @@ pub enum Commands {
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
#[arg(long, default_value_t = DEFAULT_AMBIGMISSING)]
filter_ambig_as_missing: bool,

/// Filter for constant middle base sites
#[arg(long, value_enum, default_value_t = FilterType::NoConst)]
filter: FilterType,
Expand Down Expand Up @@ -286,6 +292,10 @@ pub enum Commands {
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
#[arg(long, default_value_t = DEFAULT_AMBIGMISSING)]
filter_ambig_as_missing: bool,

/// Filter for constant middle base sites
#[arg(long, value_enum, default_value_t = FilterType::NoFilter)]
filter: FilterType,
Expand Down
22 changes: 19 additions & 3 deletions src/generic_modes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,20 @@ pub fn align<IntT: for<'a> UInt<'a>>(
mask_ambig: bool,
ignore_const_gaps: bool,
min_freq: f64,
filter_ambig_as_missing: bool,
) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");

// Apply filters
apply_filters(ska_array, min_freq, filter, mask_ambig, ignore_const_gaps);
apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
filter,
mask_ambig,
ignore_const_gaps,
);

// Write out to file/stdout
log::info!("Writing alignment");
Expand Down Expand Up @@ -103,15 +111,17 @@ pub fn merge<IntT: for<'a> UInt<'a>>(
pub fn apply_filters<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
min_freq: f64,
filter_ambig_as_missing: bool,
filter: &FilterType,
ambig_mask: bool,
ignore_const_gaps: bool,
) -> i32 {
let update_kmers = false;
let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize;
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
ska_array.filter(
filter_threshold,
filter_ambig_as_missing,
filter,
ambig_mask,
ignore_const_gaps,
Expand All @@ -134,9 +144,11 @@ pub fn distance<IntT: for<'a> UInt<'a>>(

let mask_ambig = false;
let ignore_const_gaps = false;
let filter_ambig_as_missing = false;
let constant = apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
&FilterType::NoConst,
mask_ambig,
ignore_const_gaps,
Expand All @@ -146,6 +158,7 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
&FilterType::NoAmbigOrConst,
mask_ambig,
ignore_const_gaps,
Expand All @@ -154,6 +167,7 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
&FilterType::NoFilter,
mask_ambig,
ignore_const_gaps,
Expand Down Expand Up @@ -211,6 +225,7 @@ pub fn weed<IntT: for<'a> UInt<'a>>(
weed_file: &Option<String>,
reverse: bool,
min_freq: f64,
filter_ambig_as_missing: bool,
filter: &FilterType,
ambig_mask: bool,
ignore_const_gaps: bool,
Expand Down Expand Up @@ -242,10 +257,11 @@ pub fn weed<IntT: for<'a> UInt<'a>>(

let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask || ignore_const_gaps {
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
let update_kmers = true;
ska_array.filter(
filter_threshold,
filter_ambig_as_missing,
filter,
ambig_mask,
ignore_const_gaps,
Expand Down
2 changes: 1 addition & 1 deletion src/io_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ pub fn load_array<IntT: for<'a> UInt<'a>>(
) -> Result<MergeSkaArray<IntT>, Box<dyn Error>> {
// Obtain a merged ska array
if input.len() == 1 {
log::info!("Single file as input, trying to load as skf");
log::info!("Single file as input, trying to load as skf {}-bits", IntT::n_bits());
MergeSkaArray::load(input[0].as_str())
} else {
log::info!("Multiple files as input, running ska build with default settings");
Expand Down
9 changes: 8 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -368,11 +368,12 @@
//!
//! // Apply filters
//! let min_count = 2;
//! let filter_ambig_as_missing = false;
//! let update_kmers = false;
//! let filter = FilterType::NoConst;
//! let ignore_const_gaps = false;
//! let ambig_mask = false;
//! ska_array.filter(min_count, &filter, ambig_mask, ignore_const_gaps, update_kmers);
//! ska_array.filter(min_count, filter_ambig_as_missing, &filter, ambig_mask, ignore_const_gaps, update_kmers);
//!
//! // Write out to stdout
//! let mut out_stream = set_ostream(&None);
Expand Down Expand Up @@ -514,6 +515,7 @@ pub fn main() {
input,
output,
min_freq,
filter_ambig_as_missing,
filter,
ambig_mask,
no_gap_only_sites,
Expand All @@ -530,6 +532,7 @@ pub fn main() {
*ambig_mask,
*no_gap_only_sites,
*min_freq,
*filter_ambig_as_missing,
);
} else if let Ok(mut ska_array) = load_array::<u128>(input, *threads) {
// In debug mode (cannot be set from CLI, give details)
Expand All @@ -541,6 +544,7 @@ pub fn main() {
*ambig_mask,
*no_gap_only_sites,
*min_freq,
*filter_ambig_as_missing,
);
} else {
panic!("Could not read input file(s): {input:?}");
Expand Down Expand Up @@ -658,6 +662,7 @@ pub fn main() {
output,
reverse,
min_freq,
filter_ambig_as_missing,
ambig_mask,
no_gap_only_sites,
filter,
Expand All @@ -669,6 +674,7 @@ pub fn main() {
weed_file,
*reverse,
*min_freq,
*filter_ambig_as_missing,
filter,
*ambig_mask,
*no_gap_only_sites,
Expand All @@ -684,6 +690,7 @@ pub fn main() {
weed_file,
*reverse,
*min_freq,
*filter_ambig_as_missing,
filter,
*ambig_mask,
*no_gap_only_sites,
Expand Down
76 changes: 59 additions & 17 deletions src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,12 @@ use crate::cli::FilterType;
///
/// // Remove constant sites and save
/// let min_count = 1; // no filtering by minor allele frequency
/// let filter_ambig_as_missing = false; // allow ambiguous bases when counting allele frequency
/// let filter = FilterType::NoAmbigOrConst; // remove sites with no minor allele
/// let mask_ambiguous = false; // leave ambiguous sites as IUPAC codes
/// let ignore_const_gaps = false; // keep sites with only '-' as variants
/// let update_counts = true; // keep counts updated, as saving
/// ska_array.filter(min_count, &filter, mask_ambiguous, ignore_const_gaps, update_counts);
/// ska_array.filter(min_count, filter_ambig_as_missing, &filter, mask_ambiguous, ignore_const_gaps, update_counts);
/// ska_array.save(&"no_const_sites.skf");
///
/// // Create an iterators
Expand Down Expand Up @@ -101,19 +102,31 @@ where
/// Update `variant_count` after changing `variants`.
///
/// Recalculates counts, and removes any totally empty rows.
fn update_counts(&mut self) {
///
/// # Arguments
///
/// - `filter_ambig_as_missing` -- any non-ACGTU base counts as missing.
fn update_counts(&mut self, filter_ambig_as_missing: bool) {
log::info!("Updating variant counts");
let mut new_counts = Vec::with_capacity(self.variant_count.len());
let mut new_sk = Vec::with_capacity(self.split_kmers.len());

let mut empty: usize = 0;
let mut new_variants = Array2::zeros((0, self.names.len()));
for (var_row, sk) in self.variants.outer_iter().zip(self.split_kmers.iter()) {
let count = var_row.iter().filter(|b| **b != b'-').count();
let count = var_row
.iter()
.filter(|b| **b != b'-' && (!filter_ambig_as_missing || !is_ambiguous(**b)))
.count();
if count > 0 {
new_counts.push(count);
new_sk.push(*sk);
new_variants.push_row(var_row).unwrap();
} else {
empty += 1;
}
}
log::info!("Removed {empty} empty variants");
self.split_kmers = new_sk;
self.variants = new_variants;
self.variant_count = new_counts;
Expand Down Expand Up @@ -224,7 +237,7 @@ where
}
self.variants = filtered_variants;
self.names = new_names;
self.update_counts();
self.update_counts(false);
}

/// Filters variants (middle bases) by frequency.
Expand All @@ -245,6 +258,7 @@ where
pub fn filter(
&mut self,
min_count: usize,
filter_ambig_as_missing: bool,
filter: &FilterType,
mask_ambig: bool,
ignore_const_gaps: bool,
Expand All @@ -259,6 +273,11 @@ where
let mut filtered_counts = Vec::new();
let mut filtered_kmers = Vec::new();
let mut removed = 0;

if filter_ambig_as_missing {
self.update_counts(true);
}

for count_it in self
.variant_count
.iter()
Expand Down Expand Up @@ -632,21 +651,39 @@ mod tests {
use ndarray::array;

fn setup_struct() -> MergeSkaArray<u64> {
let example_array = MergeSkaArray::<u64> {
// Populate with some initial data.
MergeSkaArray::<u64> {
k: 31,
rc: true,
split_kmers: vec![0, 1],
variants: array![[1, 2, 3], [4, 5, 6]],
variant_count: vec![3, 3],
ska_version: "NA".to_string(),
k_bits: 64,
names: vec![
"Sample1".to_string(),
"Sample2".to_string(),
"Sample3".to_string(),
],
};
example_array
split_kmers: vec![1, 2, 3],
variants: array![[b'A', b'G', b'Y'], [b'T', b'-', b'Y'], [b'N', b'Y', b'Y']],
variant_count: vec![3, 2, 3],
ska_version: "NA".to_string(),
k_bits: 64,
}
}

#[test]
fn test_update_counts() {
let mut merge_ska_array = setup_struct();

// Test with filter_ambig_as_missing = true
merge_ska_array.update_counts(true);

// Check if variant counts are updated correctly
assert_eq!(merge_ska_array.variant_count, vec![2, 1]); // Expected counts

// Check if variants are updated correctly
// Assuming `variants` should now contain only the non-empty rows
assert_eq!(merge_ska_array.variants, array![[b'A', b'G', b'Y'], [b'T', b'-', b'Y']]);

// Check if split_kmers are updated correctly
assert_eq!(merge_ska_array.split_kmers, vec![1, 2]); // Expected kmers
}

#[test]
Expand All @@ -656,13 +693,18 @@ mod tests {

// First iteration
let (kmer, vars) = iter.next().unwrap();
assert_eq!(kmer, 0);
assert_eq!(vars, vec![1, 2, 3]);
assert_eq!(kmer, 1);
assert_eq!(vars, vec![b'A', b'G', b'Y']);

// Second iteration
let (kmer, vars) = iter.next().unwrap();
assert_eq!(kmer, 1);
assert_eq!(vars, vec![4, 5, 6]);
assert_eq!(kmer, 2);
assert_eq!(vars, vec![b'T', b'-', b'Y']);

// Third iteration
let (kmer, vars) = iter.next().unwrap();
assert_eq!(kmer, 3);
assert_eq!(vars, vec![b'N', b'Y', b'Y']);

// No more items
assert!(iter.next().is_none());
Expand All @@ -676,7 +718,7 @@ mod tests {

// Check that the samples were deleted
assert_eq!(ska_array.names, vec!["Sample3"]);
assert_eq!(ska_array.variants, array![[3], [6]]);
assert_eq!(ska_array.variants, array![[b'Y'], [b'Y'], [b'Y']]);
}

#[test]
Expand Down
1 change: 1 addition & 0 deletions tests/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ fn filters() {
.arg(sandbox.file_string("merge_k9.skf", TestDir::Input))
.arg("--filter")
.arg("no-ambig")
.arg("--filter-ambig-as-missing")
.arg("-v")
.output()
.unwrap()
Expand Down

0 comments on commit df6b0a7

Please sign in to comment.