From 9411a42b89555433356edfdb95d641db4e68bc47 Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Wed, 12 Feb 2025 23:10:11 -0800 Subject: [PATCH 1/3] Add 'special' checks for DNA --- crates/codebook/src/dictionaries/mod.rs | 1 + crates/codebook/src/dictionaries/special.rs | 33 +++++++++++++++++++++ crates/codebook/src/lib.rs | 5 +++- 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 crates/codebook/src/dictionaries/special.rs diff --git a/crates/codebook/src/dictionaries/mod.rs b/crates/codebook/src/dictionaries/mod.rs index 97ebdfc..a347c17 100644 --- a/crates/codebook/src/dictionaries/mod.rs +++ b/crates/codebook/src/dictionaries/mod.rs @@ -1,3 +1,4 @@ pub mod dictionary; pub mod manager; pub mod repo; +pub mod special; diff --git a/crates/codebook/src/dictionaries/special.rs b/crates/codebook/src/dictionaries/special.rs new file mode 100644 index 0000000..3a75964 --- /dev/null +++ b/crates/codebook/src/dictionaries/special.rs @@ -0,0 +1,33 @@ +pub fn check_special(word: &str) -> bool { + is_dna_sequence(word) +} + +fn is_dna_sequence(s: &str) -> bool { + if s.len() < 4 { + return false; + } + for c in s.chars() { + match c { + 'A' | 'T' | 'C' | 'G' | 'a' | 't' | 'c' | 'g' => { + continue; + } + _ => return false, + } + } + true +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_dna_sequence() { + assert!(is_dna_sequence("ATCGATCG")); + assert!(is_dna_sequence("ATCG")); + assert!(is_dna_sequence("atcgatcg")); + assert!(!is_dna_sequence("xyzATCGAbc")); + assert!(!is_dna_sequence("Hello")); + assert!(!is_dna_sequence("ATC")); + } +} diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs index d9f713a..8d46404 100644 --- a/crates/codebook/src/lib.rs +++ b/crates/codebook/src/lib.rs @@ -7,7 +7,7 @@ mod splitter; use std::sync::Arc; use codebook_config::CodebookConfig; -use dictionaries::{dictionary, manager::DictionaryManager}; +use dictionaries::{dictionary, manager::DictionaryManager, special::check_special}; use 
dictionary::Dictionary; use parser::WordLocation; @@ -43,6 +43,9 @@ impl Codebook { if self.config.is_allowed_word(word) { return true; } + if check_special(word) { + return true; + } for dictionary in &dictionaries { if dictionary.check(word) { return true; From a0953d2a7eadfd86da1ff8a5b1378efaaf12de2d Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Sun, 16 Feb 2025 14:34:15 -0800 Subject: [PATCH 2/3] Add ignore_patterns --- Cargo.lock | 1 + codebook.toml | 3 + crates/codebook-config/Cargo.toml | 1 + crates/codebook-config/src/lib.rs | 110 +++++++++++++++++++- crates/codebook/src/dictionaries/mod.rs | 1 - crates/codebook/src/dictionaries/special.rs | 33 ------ crates/codebook/src/lib.rs | 5 +- examples/example.md | 3 + 8 files changed, 118 insertions(+), 39 deletions(-) delete mode 100644 crates/codebook/src/dictionaries/special.rs diff --git a/Cargo.lock b/Cargo.lock index fe1665d..97f838b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -554,6 +554,7 @@ dependencies = [ "anyhow", "glob", "log", + "regex", "serde", "tempfile", "toml", diff --git a/codebook.toml b/codebook.toml index 71bbbb7..fdb4da7 100644 --- a/codebook.toml +++ b/codebook.toml @@ -18,3 +18,6 @@ ignore_paths = [ "**/*.json", ".git/**/*", ] +ignore_patterns = [ + "^[ATCG]+$", +] diff --git a/crates/codebook-config/Cargo.toml b/crates/codebook-config/Cargo.toml index 253c806..b7bcf5a 100644 --- a/crates/codebook-config/Cargo.toml +++ b/crates/codebook-config/Cargo.toml @@ -9,6 +9,7 @@ toml = "0.8" anyhow = "1.0" glob = "0.3" log = "0.4.25" +regex = "1.11.1" [dev-dependencies] diff --git a/crates/codebook-config/src/lib.rs b/crates/codebook-config/src/lib.rs index 0affa92..15b40a9 100644 --- a/crates/codebook-config/src/lib.rs +++ b/crates/codebook-config/src/lib.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result}; use glob::Pattern; use log::debug; use log::info; +use regex::RegexSet; use serde::{Deserialize, Serialize}; use std::env; use std::fs; @@ -27,6 +28,10 @@ pub struct ConfigSettings { /// Glob 
patterns for paths to ignore #[serde(default)] pub ignore_paths: Vec<String>, + + /// Regex patterns for text to ignore + #[serde(default)] + pub ignore_patterns: Vec<String>, } impl Default for ConfigSettings { @@ -36,6 +41,7 @@ impl Default for ConfigSettings { words: Vec::new(), flag_words: Vec::new(), ignore_paths: Vec::new(), + ignore_patterns: Vec::new(), } } } @@ -58,6 +64,8 @@ impl<'de> Deserialize<'de> for ConfigSettings { flag_words: Vec<String>, #[serde(default)] ignore_paths: Vec<String>, + #[serde(default)] + ignore_patterns: Vec<String>, } let helper = Helper::deserialize(deserializer)?; @@ -65,7 +73,8 @@ impl<'de> Deserialize<'de> for ConfigSettings { dictionaries: to_lowercase_vec(helper.dictionaries), words: to_lowercase_vec(helper.words), flag_words: to_lowercase_vec(helper.flag_words), - ignore_paths: helper.ignore_paths, // Keep paths as-is + ignore_paths: helper.ignore_paths, + ignore_patterns: helper.ignore_patterns, }) } } @@ -73,6 +82,7 @@ impl<'de> Deserialize<'de> for ConfigSettings { #[derive(Debug)] pub struct CodebookConfig { settings: RwLock<ConfigSettings>, + regex_set: RwLock<Option<RegexSet>>, pub config_path: Option<PathBuf>, pub cache_dir: PathBuf, } @@ -81,6 +91,7 @@ impl Default for CodebookConfig { fn default() -> Self { Self { settings: RwLock::new(ConfigSettings::default()), + regex_set: RwLock::new(None), config_path: None, cache_dir: env::temp_dir().join(CACHE_DIR), } @@ -150,6 +161,7 @@ impl CodebookConfig { if new_settings != *settings { info!("Reloading config from file: {}", config_path.display()); *settings = new_settings; + *self.regex_set.write().unwrap() = None; return Ok(true); } Ok(false) @@ -289,6 +301,9 @@ impl CodebookConfig { /// Check if a word is in the custom allowlist pub fn is_allowed_word(&self, word: &str) -> bool { + if self.matches_ignore_pattern(word) { + return true; + } let word = word.to_ascii_lowercase(); self.settings .read() @@ -298,6 +313,26 @@ impl CodebookConfig { .any(|w| w == &word) } + /// Check if text matches any of the ignore patterns + fn 
matches_ignore_pattern(&self, word: &str) -> bool { + let patterns = &self.settings.read().unwrap().ignore_patterns; + if patterns.is_empty() { + return false; + } + + // Lazily initialize the RegexSet + let mut regex_set = self.regex_set.write().unwrap(); + if regex_set.is_none() { + *regex_set = Some(RegexSet::new(patterns).unwrap()); + } + + // Check if text matches any pattern + if let Some(set) = &*regex_set { + return set.is_match(word); + } + false + } + /// Check if a word should be flagged pub fn should_flag_word(&self, word: &str) -> bool { let word = word.to_ascii_lowercase(); @@ -338,6 +373,79 @@ mod tests { Ok(()) } + #[test] + fn test_ignore_patterns() -> Result<()> { + let temp_dir = TempDir::new()?; + let config_path = temp_dir.path().join("codebook.toml"); + let mut file = File::create(&config_path)?; + let a = r#" + ignore_patterns = [ + "^[ATCG]+$", + "\\d{3}-\\d{2}-\\d{4}" # Social Security Number format + ] + "#; + file.write_all(a.as_bytes())?; + + let config = CodebookConfig::load_from_file(&config_path)?; + assert!(config.matches_ignore_pattern("GTAC")); + assert!(config.matches_ignore_pattern("AATTCCGG")); + assert!(config.matches_ignore_pattern("123-45-6789")); + assert!(!config.matches_ignore_pattern("Hello")); + assert!(!config.matches_ignore_pattern("GTACZ")); // Invalid DNA sequence + + Ok(()) + } + #[test] + fn test_reload_ignore_patterns() -> Result<()> { + let temp_dir = TempDir::new()?; + let config_path = temp_dir.path().join("codebook.toml"); + + // Create initial config with DNA pattern + let mut file = File::create(&config_path)?; + write!( + file, + r#" + ignore_patterns = [ + "^[ATCG]+$" + ] + "# + )?; + + let config = CodebookConfig::load_from_file(&config_path)?; + assert!(config.matches_ignore_pattern("GTAC")); + assert!(!config.matches_ignore_pattern("123-45-6789")); + + // Update config with new pattern + let mut file = File::create(&config_path)?; + let a = r#" + ignore_patterns = [ + "^[ATCG]+$", + 
"\\d{3}-\\d{2}-\\d{4}" + ] + "#; + file.write_all(a.as_bytes())?; + + // Reload and verify both patterns work + config.reload()?; + assert!(config.matches_ignore_pattern("GTAC")); + assert!(config.matches_ignore_pattern("123-45-6789")); + + // Update config to remove all patterns + let mut file = File::create(&config_path)?; + write!( + file, + r#" + ignore_patterns = [] + "# + )?; + + // Reload and verify no patterns match + config.reload()?; + assert!(!config.matches_ignore_pattern("GTAC")); + assert!(!config.matches_ignore_pattern("123-45-6789")); + + Ok(()) + } #[test] fn test_config_recursive_search() -> Result<()> { let temp_dir = TempDir::new()?; diff --git a/crates/codebook/src/dictionaries/mod.rs b/crates/codebook/src/dictionaries/mod.rs index a347c17..97ebdfc 100644 --- a/crates/codebook/src/dictionaries/mod.rs +++ b/crates/codebook/src/dictionaries/mod.rs @@ -1,4 +1,3 @@ pub mod dictionary; pub mod manager; pub mod repo; -pub mod special; diff --git a/crates/codebook/src/dictionaries/special.rs b/crates/codebook/src/dictionaries/special.rs deleted file mode 100644 index 3a75964..0000000 --- a/crates/codebook/src/dictionaries/special.rs +++ /dev/null @@ -1,33 +0,0 @@ -pub fn check_special(word: &str) -> bool { - is_dna_sequence(word) -} - -fn is_dna_sequence(s: &str) -> bool { - if s.len() < 4 { - return false; - } - for c in s.chars() { - match c { - 'A' | 'T' | 'C' | 'G' | 'a' | 't' | 'c' | 'g' => { - continue; - } - _ => return false, - } - } - true -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_dna_sequence() { - assert!(is_dna_sequence("ATCGATCG")); - assert!(is_dna_sequence("ATCG")); - assert!(is_dna_sequence("atcgatcg")); - assert!(!is_dna_sequence("xyzATCGAbc")); - assert!(!is_dna_sequence("Hello")); - assert!(!is_dna_sequence("ATC")); - } -} diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs index 8d46404..d9f713a 100644 --- a/crates/codebook/src/lib.rs +++ b/crates/codebook/src/lib.rs @@ -7,7 +7,7 @@ mod 
splitter; use std::sync::Arc; use codebook_config::CodebookConfig; -use dictionaries::{dictionary, manager::DictionaryManager, special::check_special}; +use dictionaries::{dictionary, manager::DictionaryManager}; use dictionary::Dictionary; use parser::WordLocation; @@ -43,9 +43,6 @@ impl Codebook { if self.config.is_allowed_word(word) { return true; } - if check_special(word) { - return true; - } for dictionary in &dictionaries { if dictionary.check(word) { return true; diff --git a/examples/example.md b/examples/example.md index 2e6647d..8334527 100644 --- a/examples/example.md +++ b/examples/example.md @@ -1,2 +1,5 @@ I'm bvd at splellin Wolrd wolrd hello regulr + +Some DNA: +ATGCATCG From 61a63dc0c3548d4861e43ad4a155c493dd6473be Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Sun, 16 Feb 2025 14:36:43 -0800 Subject: [PATCH 3/3] Add readme for new ignore_patterns --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a8b1309..16ba1b6 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,8 @@ Codebook has an optional configuration file you can put in the root of your proj Here are the options: ```toml -# Currently unused. Will add more dictionaries soon. # Default: ["en_us"] +# "en_gb" also works. dictionaries = ["en_us"] # List of words to ignore. Case-insensitive. Codebook will add words here when you select "Add to dictionary". # Default: [] @@ -61,6 +61,12 @@ flag_words = ["todo", "fixme"] # List of path globs to ignore when spell checking. # Default: [] ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"] +# List of regex patterns to ignore when spell checking. Useful for domain-specific strings like DNA sequences. +# Default: [] +ignore_patterns = [ + "^[ATCG]+$", # DNA sequences + "\\d{3}-\\d{2}-\\d{4}" # Social Security Number format +] ``` ## Goals