From 9411a42b89555433356edfdb95d641db4e68bc47 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Wed, 12 Feb 2025 23:10:11 -0800
Subject: [PATCH 1/3] Add 'special' checks for DNA

---
 crates/codebook/src/dictionaries/mod.rs     |  1 +
 crates/codebook/src/dictionaries/special.rs | 33 +++++++++++++++++++++
 crates/codebook/src/lib.rs                  |  5 +++-
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 crates/codebook/src/dictionaries/special.rs

diff --git a/crates/codebook/src/dictionaries/mod.rs b/crates/codebook/src/dictionaries/mod.rs
index 97ebdfc..a347c17 100644
--- a/crates/codebook/src/dictionaries/mod.rs
+++ b/crates/codebook/src/dictionaries/mod.rs
@@ -1,3 +1,4 @@
 pub mod dictionary;
 pub mod manager;
 pub mod repo;
+pub mod special;
diff --git a/crates/codebook/src/dictionaries/special.rs b/crates/codebook/src/dictionaries/special.rs
new file mode 100644
index 0000000..3a75964
--- /dev/null
+++ b/crates/codebook/src/dictionaries/special.rs
@@ -0,0 +1,33 @@
+pub fn check_special(word: &str) -> bool {
+    is_dna_sequence(word)
+}
+
+fn is_dna_sequence(s: &str) -> bool {
+    if s.len() < 4 {
+        return false;
+    }
+    for c in s.chars() {
+        match c {
+            'A' | 'T' | 'C' | 'G' | 'a' | 't' | 'c' | 'g' => {
+                continue;
+            }
+            _ => return false,
+        }
+    }
+    true
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_dna_sequence() {
+        assert!(is_dna_sequence("ATCGATCG"));
+        assert!(is_dna_sequence("ATCG"));
+        assert!(is_dna_sequence("atcgatcg"));
+        assert!(!is_dna_sequence("xyzATCGAbc"));
+        assert!(!is_dna_sequence("Hello"));
+        assert!(!is_dna_sequence("ATC"));
+    }
+}
diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs
index d9f713a..8d46404 100644
--- a/crates/codebook/src/lib.rs
+++ b/crates/codebook/src/lib.rs
@@ -7,7 +7,7 @@ mod splitter;
 use std::sync::Arc;
 
 use codebook_config::CodebookConfig;
-use dictionaries::{dictionary, manager::DictionaryManager};
+use dictionaries::{dictionary, manager::DictionaryManager, special::check_special};
 use dictionary::Dictionary;
 use parser::WordLocation;
 
@@ -43,6 +43,9 @@ impl Codebook {
             if self.config.is_allowed_word(word) {
                 return true;
             }
+            if check_special(word) {
+                return true;
+            }
             for dictionary in &dictionaries {
                 if dictionary.check(word) {
                     return true;

From a0953d2a7eadfd86da1ff8a5b1378efaaf12de2d Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Sun, 16 Feb 2025 14:34:15 -0800
Subject: [PATCH 2/3] Add ignore_patterns

---
 Cargo.lock                                  |   1 +
 codebook.toml                               |   3 +
 crates/codebook-config/Cargo.toml           |   1 +
 crates/codebook-config/src/lib.rs           | 110 +++++++++++++++++++-
 crates/codebook/src/dictionaries/mod.rs     |   1 -
 crates/codebook/src/dictionaries/special.rs |  33 ------
 crates/codebook/src/lib.rs                  |   5 +-
 examples/example.md                         |   3 +
 8 files changed, 118 insertions(+), 39 deletions(-)
 delete mode 100644 crates/codebook/src/dictionaries/special.rs

diff --git a/Cargo.lock b/Cargo.lock
index fe1665d..97f838b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -554,6 +554,7 @@ dependencies = [
  "anyhow",
  "glob",
  "log",
+ "regex",
  "serde",
  "tempfile",
  "toml",
diff --git a/codebook.toml b/codebook.toml
index 71bbbb7..fdb4da7 100644
--- a/codebook.toml
+++ b/codebook.toml
@@ -18,3 +18,6 @@ ignore_paths = [
     "**/*.json",
     ".git/**/*",
 ]
+ignore_patterns = [
+    "^[ATCG]+$",
+]
diff --git a/crates/codebook-config/Cargo.toml b/crates/codebook-config/Cargo.toml
index 253c806..b7bcf5a 100644
--- a/crates/codebook-config/Cargo.toml
+++ b/crates/codebook-config/Cargo.toml
@@ -9,6 +9,7 @@ toml = "0.8"
 anyhow = "1.0"
 glob = "0.3"
 log = "0.4.25"
+regex = "1.11.1"
 
 
 [dev-dependencies]
diff --git a/crates/codebook-config/src/lib.rs b/crates/codebook-config/src/lib.rs
index 0affa92..15b40a9 100644
--- a/crates/codebook-config/src/lib.rs
+++ b/crates/codebook-config/src/lib.rs
@@ -2,6 +2,7 @@ use anyhow::{Context, Result};
 use glob::Pattern;
 use log::debug;
 use log::info;
+use regex::RegexSet;
 use serde::{Deserialize, Serialize};
 use std::env;
 use std::fs;
@@ -27,6 +28,10 @@ pub struct ConfigSettings {
     /// Glob patterns for paths to ignore
     #[serde(default)]
     pub ignore_paths: Vec<String>,
+
+    /// Regex patterns for text to ignore
+    #[serde(default)]
+    pub ignore_patterns: Vec<String>,
 }
 
 impl Default for ConfigSettings {
@@ -36,6 +41,7 @@ impl Default for ConfigSettings {
             words: Vec::new(),
             flag_words: Vec::new(),
             ignore_paths: Vec::new(),
+            ignore_patterns: Vec::new(),
         }
     }
 }
@@ -58,6 +64,8 @@ impl<'de> Deserialize<'de> for ConfigSettings {
             flag_words: Vec<String>,
             #[serde(default)]
             ignore_paths: Vec<String>,
+            #[serde(default)]
+            ignore_patterns: Vec<String>,
         }
 
         let helper = Helper::deserialize(deserializer)?;
@@ -65,7 +73,8 @@ impl<'de> Deserialize<'de> for ConfigSettings {
             dictionaries: to_lowercase_vec(helper.dictionaries),
             words: to_lowercase_vec(helper.words),
             flag_words: to_lowercase_vec(helper.flag_words),
-            ignore_paths: helper.ignore_paths, // Keep paths as-is
+            ignore_paths: helper.ignore_paths,
+            ignore_patterns: helper.ignore_patterns,
         })
     }
 }
@@ -73,6 +82,7 @@ impl<'de> Deserialize<'de> for ConfigSettings {
 #[derive(Debug)]
 pub struct CodebookConfig {
     settings: RwLock<ConfigSettings>,
+    regex_set: RwLock<Option<RegexSet>>,
     pub config_path: Option<PathBuf>,
     pub cache_dir: PathBuf,
 }
@@ -81,6 +91,7 @@ impl Default for CodebookConfig {
     fn default() -> Self {
         Self {
             settings: RwLock::new(ConfigSettings::default()),
+            regex_set: RwLock::new(None),
             config_path: None,
             cache_dir: env::temp_dir().join(CACHE_DIR),
         }
@@ -150,6 +161,7 @@ impl CodebookConfig {
         if new_settings != *settings {
             info!("Reloading config from file: {}", config_path.display());
             *settings = new_settings;
+            *self.regex_set.write().unwrap() = None;
             return Ok(true);
         }
         Ok(false)
@@ -289,6 +301,9 @@ impl CodebookConfig {
 
     /// Check if a word is in the custom allowlist
     pub fn is_allowed_word(&self, word: &str) -> bool {
+        if self.matches_ignore_pattern(word) {
+            return true;
+        }
         let word = word.to_ascii_lowercase();
         self.settings
             .read()
@@ -298,6 +313,26 @@ impl CodebookConfig {
             .any(|w| w == &word)
     }
 
+    /// Check if `word` matches any configured `ignore_patterns` regex.
+    ///
+    /// The compiled `RegexSet` is cached and built on first use. If any
+    /// pattern fails to compile, the failure is logged and nothing matches.
+    fn matches_ignore_pattern(&self, word: &str) -> bool {
+        let settings = self.settings.read().unwrap();
+        if settings.ignore_patterns.is_empty() {
+            return false;
+        }
+        let mut regex_set = self.regex_set.write().unwrap();
+        let set = regex_set.get_or_insert_with(|| {
+            // Patterns are user-supplied config; a bad one must not panic.
+            RegexSet::new(&settings.ignore_patterns).unwrap_or_else(|e| {
+                log::warn!("Invalid ignore_patterns, ignoring them: {e}");
+                RegexSet::empty()
+            })
+        });
+        set.is_match(word)
+    }
+
     /// Check if a word should be flagged
     pub fn should_flag_word(&self, word: &str) -> bool {
         let word = word.to_ascii_lowercase();
@@ -338,6 +373,79 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_ignore_patterns() -> Result<()> {
+        let temp_dir = TempDir::new()?;
+        let config_path = temp_dir.path().join("codebook.toml");
+        let mut file = File::create(&config_path)?;
+        let a = r#"
+        ignore_patterns = [
+            "^[ATCG]+$",
+            "\\d{3}-\\d{2}-\\d{4}"  # Social Security Number format
+        ]
+        "#;
+        file.write_all(a.as_bytes())?;
+
+        let config = CodebookConfig::load_from_file(&config_path)?;
+        assert!(config.matches_ignore_pattern("GTAC"));
+        assert!(config.matches_ignore_pattern("AATTCCGG"));
+        assert!(config.matches_ignore_pattern("123-45-6789"));
+        assert!(!config.matches_ignore_pattern("Hello"));
+        assert!(!config.matches_ignore_pattern("GTACZ")); // Invalid DNA sequence
+
+        Ok(())
+    }
+    #[test]
+    fn test_reload_ignore_patterns() -> Result<()> {
+        let temp_dir = TempDir::new()?;
+        let config_path = temp_dir.path().join("codebook.toml");
+
+        // Create initial config with DNA pattern
+        let mut file = File::create(&config_path)?;
+        write!(
+            file,
+            r#"
+            ignore_patterns = [
+                "^[ATCG]+$"
+            ]
+            "#
+        )?;
+
+        let config = CodebookConfig::load_from_file(&config_path)?;
+        assert!(config.matches_ignore_pattern("GTAC"));
+        assert!(!config.matches_ignore_pattern("123-45-6789"));
+
+        // Update config with new pattern
+        let mut file = File::create(&config_path)?;
+        let a = r#"
+        ignore_patterns = [
+            "^[ATCG]+$",
+            "\\d{3}-\\d{2}-\\d{4}"
+        ]
+        "#;
+        file.write_all(a.as_bytes())?;
+
+        // Reload and verify both patterns work
+        config.reload()?;
+        assert!(config.matches_ignore_pattern("GTAC"));
+        assert!(config.matches_ignore_pattern("123-45-6789"));
+
+        // Update config to remove all patterns
+        let mut file = File::create(&config_path)?;
+        write!(
+            file,
+            r#"
+            ignore_patterns = []
+            "#
+        )?;
+
+        // Reload and verify no patterns match
+        config.reload()?;
+        assert!(!config.matches_ignore_pattern("GTAC"));
+        assert!(!config.matches_ignore_pattern("123-45-6789"));
+
+        Ok(())
+    }
     #[test]
     fn test_config_recursive_search() -> Result<()> {
         let temp_dir = TempDir::new()?;
diff --git a/crates/codebook/src/dictionaries/mod.rs b/crates/codebook/src/dictionaries/mod.rs
index a347c17..97ebdfc 100644
--- a/crates/codebook/src/dictionaries/mod.rs
+++ b/crates/codebook/src/dictionaries/mod.rs
@@ -1,4 +1,3 @@
 pub mod dictionary;
 pub mod manager;
 pub mod repo;
-pub mod special;
diff --git a/crates/codebook/src/dictionaries/special.rs b/crates/codebook/src/dictionaries/special.rs
deleted file mode 100644
index 3a75964..0000000
--- a/crates/codebook/src/dictionaries/special.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-pub fn check_special(word: &str) -> bool {
-    is_dna_sequence(word)
-}
-
-fn is_dna_sequence(s: &str) -> bool {
-    if s.len() < 4 {
-        return false;
-    }
-    for c in s.chars() {
-        match c {
-            'A' | 'T' | 'C' | 'G' | 'a' | 't' | 'c' | 'g' => {
-                continue;
-            }
-            _ => return false,
-        }
-    }
-    true
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_dna_sequence() {
-        assert!(is_dna_sequence("ATCGATCG"));
-        assert!(is_dna_sequence("ATCG"));
-        assert!(is_dna_sequence("atcgatcg"));
-        assert!(!is_dna_sequence("xyzATCGAbc"));
-        assert!(!is_dna_sequence("Hello"));
-        assert!(!is_dna_sequence("ATC"));
-    }
-}
diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs
index 8d46404..d9f713a 100644
--- a/crates/codebook/src/lib.rs
+++ b/crates/codebook/src/lib.rs
@@ -7,7 +7,7 @@ mod splitter;
 use std::sync::Arc;
 
 use codebook_config::CodebookConfig;
-use dictionaries::{dictionary, manager::DictionaryManager, special::check_special};
+use dictionaries::{dictionary, manager::DictionaryManager};
 use dictionary::Dictionary;
 use parser::WordLocation;
 
@@ -43,9 +43,6 @@ impl Codebook {
             if self.config.is_allowed_word(word) {
                 return true;
             }
-            if check_special(word) {
-                return true;
-            }
             for dictionary in &dictionaries {
                 if dictionary.check(word) {
                     return true;
diff --git a/examples/example.md b/examples/example.md
index 2e6647d..8334527 100644
--- a/examples/example.md
+++ b/examples/example.md
@@ -1,2 +1,5 @@
 I'm bvd at splellin Wolrd wolrd
 hello regulr
+
+Some DNA:
+ATGCATCG

From 61a63dc0c3548d4861e43ad4a155c493dd6473be Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Sun, 16 Feb 2025 14:36:43 -0800
Subject: [PATCH 3/3] Add readme for new ignore_patterns

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a8b1309..16ba1b6 100644
--- a/README.md
+++ b/README.md
@@ -49,8 +49,8 @@ Codebook has an optional configuration file you can put in the root of your proj
 Here are the options:
 
 ```toml
-# Currently unused. Will add more dictionaries soon.
 # Default: ["en_us"]
+# "en_gb" also works.
 dictionaries = ["en_us"]
 # List of words to ignore. Case-insensitive. Codebook will add words here when you select "Add to dictionary".
 # Default: []
@@ -61,6 +61,12 @@ flag_words = ["todo", "fixme"]
 # List of path globs to ignore when spell checking.
 # Default: []
 ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"]
+# List of regex patterns to ignore when spell checking. Unlike `words`, patterns are case-sensitive (prefix with `(?i)` to ignore case). Useful for domain-specific strings like DNA sequences.
+# Default: []
+ignore_patterns = [
+    "^[ATCG]+$",  # DNA sequences
+    "\\d{3}-\\d{2}-\\d{4}"  # Social Security Number format
+]
 ```
 
 ## Goals