Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #5, add ignore_patterns to config. #7

Merged
merged 3 commits into from
Feb 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ Codebook has an optional configuration file you can put in the root of your proj
Here are the options:

```toml
# Currently unused. Will add more dictionaries soon.
# Default: ["en_us"]
# "en_gb" also works.
dictionaries = ["en_us"]
# List of words to ignore. Case-insensitive. Codebook will add words here when you select "Add to dictionary".
# Default: []
Expand All @@ -61,6 +61,12 @@ flag_words = ["todo", "fixme"]
# List of path globs to ignore when spell checking.
# Default: []
ignore_paths = ["target/**/*", "**/*.json", ".git/**/*"]
# List of regex patterns to ignore when spell checking. Useful for domain-specific strings like DNA sequences.
# Default: []
ignore_patterns = [
"^[ATCG]+$", # DNA sequences
"\\d{3}-\\d{2}-\\d{4}" # Social Security Number format
]
```

## Goals
Expand Down
3 changes: 3 additions & 0 deletions codebook.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ ignore_paths = [
"**/*.json",
".git/**/*",
]
ignore_patterns = [
"^[ATCG]+$",
]
1 change: 1 addition & 0 deletions crates/codebook-config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ toml = "0.8"
anyhow = "1.0"
glob = "0.3"
log = "0.4.25"
regex = "1.11.1"


[dev-dependencies]
Expand Down
110 changes: 109 additions & 1 deletion crates/codebook-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use anyhow::{Context, Result};
use glob::Pattern;
use log::debug;
use log::info;
use regex::RegexSet;
use serde::{Deserialize, Serialize};
use std::env;
use std::fs;
Expand All @@ -27,6 +28,10 @@ pub struct ConfigSettings {
/// Glob patterns for paths to ignore
#[serde(default)]
pub ignore_paths: Vec<String>,

/// Regex patterns for text to ignore
#[serde(default)]
pub ignore_patterns: Vec<String>,
}

impl Default for ConfigSettings {
Expand All @@ -36,6 +41,7 @@ impl Default for ConfigSettings {
words: Vec::new(),
flag_words: Vec::new(),
ignore_paths: Vec::new(),
ignore_patterns: Vec::new(),
}
}
}
Expand All @@ -58,21 +64,25 @@ impl<'de> Deserialize<'de> for ConfigSettings {
flag_words: Vec<String>,
#[serde(default)]
ignore_paths: Vec<String>,
#[serde(default)]
ignore_patterns: Vec<String>,
}

let helper = Helper::deserialize(deserializer)?;
Ok(ConfigSettings {
dictionaries: to_lowercase_vec(helper.dictionaries),
words: to_lowercase_vec(helper.words),
flag_words: to_lowercase_vec(helper.flag_words),
ignore_paths: helper.ignore_paths, // Keep paths as-is
ignore_paths: helper.ignore_paths,
ignore_patterns: helper.ignore_patterns,
})
}
}

/// Spell-check configuration plus a lazily compiled cache of its ignore patterns.
#[derive(Debug)]
pub struct CodebookConfig {
/// Current settings, kept behind a lock so a reload can replace them in place.
settings: RwLock<ConfigSettings>,
/// Compiled form of `ignore_patterns`. `None` until a pattern lookup first needs it,
/// and reset to `None` whenever a reload installs different settings.
regex_set: RwLock<Option<RegexSet>>,
/// Path of the config file, if any.
/// NOTE(review): presumably the file the settings were loaded from — confirm against the loader.
pub config_path: Option<PathBuf>,
/// Cache directory; the `Default` impl places it under the system temp directory.
pub cache_dir: PathBuf,
}
Expand All @@ -81,6 +91,7 @@ impl Default for CodebookConfig {
fn default() -> Self {
Self {
settings: RwLock::new(ConfigSettings::default()),
regex_set: RwLock::new(None),
config_path: None,
cache_dir: env::temp_dir().join(CACHE_DIR),
}
Expand Down Expand Up @@ -150,6 +161,7 @@ impl CodebookConfig {
if new_settings != *settings {
info!("Reloading config from file: {}", config_path.display());
*settings = new_settings;
*self.regex_set.write().unwrap() = None;
return Ok(true);
}
Ok(false)
Expand Down Expand Up @@ -289,6 +301,9 @@ impl CodebookConfig {

/// Check if a word is in the custom allowlist
pub fn is_allowed_word(&self, word: &str) -> bool {
if self.matches_ignore_pattern(word) {
return true;
}
let word = word.to_ascii_lowercase();
self.settings
.read()
Expand All @@ -298,6 +313,26 @@ impl CodebookConfig {
.any(|w| w == &word)
}

/// Check if text matches any of the ignore patterns
fn matches_ignore_pattern(&self, word: &str) -> bool {
let patterns = &self.settings.read().unwrap().ignore_patterns;
if patterns.is_empty() {
return false;
}

// Lazily initialize the RegexSet
let mut regex_set = self.regex_set.write().unwrap();
if regex_set.is_none() {
*regex_set = Some(RegexSet::new(patterns).unwrap());
}

// Check if text matches any pattern
if let Some(set) = &*regex_set {
return set.is_match(word);
}
false
}

/// Check if a word should be flagged
pub fn should_flag_word(&self, word: &str) -> bool {
let word = word.to_ascii_lowercase();
Expand Down Expand Up @@ -338,6 +373,79 @@ mod tests {
Ok(())
}

#[test]
fn test_ignore_patterns() -> Result<()> {
    // A config file that declares only ignore patterns.
    let workspace = TempDir::new()?;
    let config_path = workspace.path().join("codebook.toml");
    let toml_body = r#"
ignore_patterns = [
"^[ATCG]+$",
"\\d{3}-\\d{2}-\\d{4}" # Social Security Number format
]
"#;
    File::create(&config_path)?.write_all(toml_body.as_bytes())?;

    let config = CodebookConfig::load_from_file(&config_path)?;

    // Words covered by one of the patterns are ignored.
    for ignored in ["GTAC", "AATTCCGG", "123-45-6789"] {
        assert!(config.matches_ignore_pattern(ignored));
    }

    // Ordinary words and an invalid DNA sequence ("GTACZ") are not.
    for checked in ["Hello", "GTACZ"] {
        assert!(!config.matches_ignore_pattern(checked));
    }

    Ok(())
}
#[test]
fn test_reload_ignore_patterns() -> Result<()> {
    let workspace = TempDir::new()?;
    let config_path = workspace.path().join("codebook.toml");

    // Truncates the config file and writes `contents` to it.
    let rewrite_config = |contents: &str| -> Result<()> {
        File::create(&config_path)?.write_all(contents.as_bytes())?;
        Ok(())
    };

    // Stage 1: only the DNA pattern is configured.
    rewrite_config(
        r#"
ignore_patterns = [
"^[ATCG]+$"
]
"#,
    )?;
    let config = CodebookConfig::load_from_file(&config_path)?;
    assert!(config.matches_ignore_pattern("GTAC"));
    assert!(!config.matches_ignore_pattern("123-45-6789"));

    // Stage 2: a second pattern is added; after a reload both must match.
    rewrite_config(
        r#"
ignore_patterns = [
"^[ATCG]+$",
"\\d{3}-\\d{2}-\\d{4}"
]
"#,
    )?;
    config.reload()?;
    assert!(config.matches_ignore_pattern("GTAC"));
    assert!(config.matches_ignore_pattern("123-45-6789"));

    // Stage 3: every pattern is removed; after a reload nothing matches.
    rewrite_config(
        r#"
ignore_patterns = []
"#,
    )?;
    config.reload()?;
    assert!(!config.matches_ignore_pattern("GTAC"));
    assert!(!config.matches_ignore_pattern("123-45-6789"));

    Ok(())
}
#[test]
fn test_config_recursive_search() -> Result<()> {
let temp_dir = TempDir::new()?;
Expand Down
3 changes: 3 additions & 0 deletions examples/example.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
I'm bvd at splellin Wolrd wolrd
hello regulr

Some DNA:
ATGCATCG
Loading