Add struct ids to rust, add more wordlists

blopker · Jan 14, 2025 · 738dcf0 · 738dcf0
1 parent ffec971
commit 738dcf0
Show file tree

Hide file tree

Showing 19 changed files with 247 additions and 503 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,3 +1,6 @@
 [workspace]
-members = [ "codebook","codebook-lsp", "codebook-zed"]
+members = ["codebook", "codebook-lsp", "codebook-zed"]
 resolver = "2"
+
+[profile.test]
+env = { RUST_LOG = "debug" }
diff --git a/README.md b/README.md
@@ -25,6 +25,23 @@ Codebook is a spellchecker for code. It binds together the venerable Tree Sitter
 
 Codebook is being developed and not yet ready for public (or private, really) use. Hit the Star button to follow for updates though.
 
+## Goals
+
+Spellchecking is complicated, and opinions about how it should be done, especially with code, differ. This section is about the trade-offs that steer Codebook's decisions.
+
+### Privacy
+
+No remote calls for spellchecking or analytics. Once dictionaries are cached, Codebook needs to be usable offline. Codebook will never send the contents of files to a remote server.
+
+### Low noise/High signal
+
+Codebook should only highlight words that users have control over. For example, a misspelled word in an imported function should not be highlighted as the user can't do anything about it.
+
+### Efficient
+
+All features will be weighed against their impact on CPU and memory usage. Codebook should be fast enough, even on low-end hardware, to spellcheck on every keystroke.
+
+
 ## Features
 
 ### Code-aware spell checking

diff --git a/codebook-lsp/src/lsp.rs b/codebook-lsp/src/lsp.rs
@@ -5,21 +5,6 @@ use tower_lsp::{Client, LanguageServer};
 use codebook::CodeDictionary;
 use log::info;
 
-// #[derive(Clone, Debug)]
-// pub struct TextRange {
-//     pub start_line: u32,
-//     pub start_char: u32,
-//     pub end_line: u32,
-//     pub end_char: u32,
-// }
-
-// #[derive(Clone, Debug)]
-// pub struct SpellCheckResult {
-//     pub word: String,
-//     pub suggestions: Vec<String>,
-//     pub locations: Vec<TextRange>,
-// }
-
 #[derive(Debug)]
 pub struct Backend {
     pub client: Client,
@@ -108,8 +93,9 @@ impl Backend {
                     code_description: None,
                     source: Some("Codebook".to_string()),
                     message: format!(
-                        "Possible spelling error: '{}'. Suggestions: {:?}",
-                        res.word, res.suggestions
+                        "Possible spelling error: '{}'. Suggestions: {}",
+                        res.word,
+                        res.suggestions.join(", ")
                     ),
                     related_information: None,
                     tags: None,

diff --git a/codebook/src/downloader.rs b/codebook/src/downloader.rs
@@ -68,7 +68,7 @@ impl CacheMetadata {
     }
 }
 
-/// A downloader for dictionaries from a remote GitHub repository (by default
+/// A down-loader for dictionaries from a remote GitHub repository (by default
 /// https://github.com/blopker/dictionaries), storing them in a local cache
 /// and avoiding re-download if unchanged.
 ///

diff --git a/codebook/src/lib.rs b/codebook/src/lib.rs
@@ -1,11 +1,12 @@
 pub mod downloader;
 mod queries;
 mod splitter;
+use log::info;
 use lru::LruCache;
 
 use crate::queries::{
-    get_language_name_from_filename, get_language_setting, LanguageSetting, LanguageType,
-    COMMON_DICTIONARY,
+    get_common_dictionary, get_language_name_from_filename, get_language_setting, LanguageSetting,
+    LanguageType,
 };
 use std::{
     collections::{HashMap, HashSet},
@@ -54,7 +55,7 @@ impl CodeDictionary {
         let dict = spellbook::Dictionary::new(&aff, &dic)
             .map_err(|e| format!("Dictionary parse error: {}", e))?;
         let mut custom_dictionary: HashSet<String> = HashSet::new();
-        for word in COMMON_DICTIONARY.lines() {
+        for word in get_common_dictionary() {
             custom_dictionary.insert(word.to_string());
         }
         Ok(CodeDictionary {
@@ -85,10 +86,10 @@ impl CodeDictionary {
     }
 
     pub fn suggest(&self, word: &str) -> Vec<String> {
-        println!("Checking Cache: {:?}", word);
+        info!("Checking Cache: {:?}", word);
         // First try to get from cache with write lock since get() needs to modify LRU order
         if let Some(suggestions) = self.suggestion_cache.write().unwrap().get_mut(word) {
-            println!("Cache hit for {:?}", word);
+            info!("Cache hit for {:?}", word);
             return suggestions.clone();
         }
 
@@ -278,13 +279,13 @@ impl CodeDictionary {
                 let current_line = node_start.row as u32;
                 let current_column = node_start.column as u32;
                 let words = self.get_words_from_text(node_text);
-                println!("Found Capture:: {node_text:?}");
-                println!("Words:: {words:?}");
-                println!("Column: {current_column}");
-                println!("Line: {current_line}");
+                info!("Found Capture:: {node_text:?}");
+                info!("Words:: {words:?}");
+                info!("Column: {current_column}");
+                info!("Line: {current_line}");
                 for (word_text, (text_start_char, text_line)) in words {
                     let split = splitter::split_camel_case(&word_text);
-                    println!("Checking: {:?}", split);
+                    info!("Checking: {:?}", split);
                     for split_word in split {
                         if !self.check(&split_word.word) {
                             let offset = if text_line == 0 { current_column } else { 0 };
@@ -327,12 +328,11 @@ mod lib_tests {
     static EXTRA_WORDS: &'static [&'static str] = &["http", "https", "www", "viewport", "UTF"];
 
     fn get_processor() -> CodeDictionary {
-        let mut cdict =
-            CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
+        let dict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
         for word in EXTRA_WORDS {
-            cdict.add_to_dictionary(word);
+            dict.add_to_dictionary(word);
         }
-        cdict
+        dict
     }
 
     #[test]
@@ -347,7 +347,7 @@ mod lib_tests {
 
     #[test]
     fn test_get_words_from_text() {
-        let cdict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
+        let dict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
         let text = r#"
             HelloWorld calc_wrld
             I'm a contraction, don't ignore me
@@ -369,7 +369,7 @@ mod lib_tests {
             ("rd", (23, 3)),
             ("line", (26, 3)),
         ];
-        let words = cdict.get_words_from_text(text);
+        let words = dict.get_words_from_text(text);
         println!("{:?}", words);
         for (i, w) in expected.into_iter().enumerate() {
             assert_eq!(words[i], (w.0.to_string(), w.1));

diff --git a/codebook/src/queries.rs b/codebook/src/queries.rs
@@ -28,7 +28,7 @@ impl LanguageType {
     }
 }
 
-pub static COMMON_DICTIONARY: &str = include_str!("../../wordlists/common.txt");
+static COMMON_DICTIONARY: &str = include_str!("../../word_lists/combined.gen.txt");
 // Use https://intmainreturn0.com/ts-visualizer/ to help with writing grammar queries
 pub static LANGUAGE_SETTINGS: [LanguageSetting; 7] = [
     LanguageSetting {
@@ -41,6 +41,10 @@ pub static LANGUAGE_SETTINGS: [LanguageSetting; 7] = [
                     pattern: (identifier) @identifier)
                 (let_declaration
                     pattern: (identifier) @identifier)
+                (struct_item
+                    name: (type_identifier) @identifier)
+                (field_declaration
+                    name: (field_identifier) @identifier)
                 (line_comment) @comment
                 (string_content) @string
                 (char_literal) @string
@@ -178,3 +182,7 @@ pub fn get_language_name_from_filename(filename: &str) -> Option<LanguageType> {
     }
     None
 }
+
+/// Returns the entries of `COMMON_DICTIONARY`, one per line.
+///
+/// Lines that are blank, or whose first non-whitespace character is `#`, are
+/// skipped, so a `#`-prefixed comment line is never yielded as a word. An
+/// entry that merely contains a `#` after its first character is kept.
+/// Yielded entries are the original lines, not trimmed copies.
+pub fn get_common_dictionary() -> impl Iterator<Item = &'static str> {
+    COMMON_DICTIONARY.lines().filter(|line| {
+        let entry = line.trim_start();
+        !entry.is_empty() && !entry.starts_with('#')
+    })
+}
diff --git a/codebook/tests/test_rust.rs b/codebook/tests/test_rust.rs
@@ -1,9 +1,11 @@
 use codebook::{SpellCheckResult, TextRange};
 
 mod utils;
-// im a bd speler
+use utils::init_logging;
+
 #[test]
 fn test_rust_simple() {
+    init_logging();
     let processor = utils::get_processor();
     let sample_text = r#"
         fn calculat_user_age(bithDate: String) -> u32 {
@@ -25,6 +27,7 @@ fn test_rust_simple() {
 
 #[test]
 fn test_rust_comment_location() {
+    init_logging();
     let sample_rust = r#"
         // Comment with a typo: mment
         "#;
@@ -44,3 +47,56 @@ fn test_rust_comment_location() {
     assert_eq!(misspelled, expected);
     assert!(misspelled[0].locations.len() == 1);
 }
+
+#[test]
+fn test_rust_struct() {
+    init_logging();
+    // Fixture with three misspellings: one in the struct name, one in the
+    // doc comment and one in the field name.
+    let sample_rust = r#"
+        pub struct BadSpeler {
+            /// Terrible spelling: dwnloader
+            pub dataz: String,
+        }
+        "#;
+    let processor = utils::get_processor();
+    let misspelled = processor.spell_check(sample_rust, "rust").to_vec();
+    println!("Misspelled words: {misspelled:?}");
+
+    // Expected results; every range sits on a single line of the fixture.
+    let expected = [
+        SpellCheckResult::new(
+            String::from("Speler"),
+            vec!["Speer", "Speller", "Spewer", "Spengler", "Peeler"],
+            vec![TextRange {
+                start_line: 1,
+                end_line: 1,
+                start_char: 22,
+                end_char: 28,
+            }],
+        ),
+        SpellCheckResult::new(
+            String::from("dwnloader"),
+            vec!["loader"],
+            vec![TextRange {
+                start_line: 2,
+                end_line: 2,
+                start_char: 35,
+                end_char: 44,
+            }],
+        ),
+        SpellCheckResult::new(
+            String::from("dataz"),
+            vec!["data", "data z"],
+            vec![TextRange {
+                start_line: 3,
+                end_line: 3,
+                start_char: 16,
+                end_char: 21,
+            }],
+        ),
+    ];
+
+    // Results are looked up by word, so the order in which the checker
+    // reports them does not matter.
+    for want in &expected {
+        println!("Expecting {}", want.word);
+        let got = misspelled
+            .iter()
+            .find(|candidate| candidate.word == want.word)
+            .unwrap();
+        assert_eq!(got.word, want.word);
+        assert_eq!(got.suggestions, want.suggestions);
+        assert_eq!(got.locations, want.locations);
+    }
+}
diff --git a/codebook/tests/utils/mod.rs b/codebook/tests/utils/mod.rs
@@ -8,3 +8,7 @@ pub fn get_processor() -> CodeDictionary {
     }
     cdict
 }
+
+/// Sets up `env_logger` in test mode for the calling test.
+///
+/// The result of `try_init` is deliberately discarded, so any error it
+/// reports (presumably when a logger is already installed) is ignored
+/// rather than failing the test.
+pub fn init_logging() {
+    let mut builder = env_logger::builder();
+    builder.is_test(true);
+    let _ = builder.try_init();
+}
diff --git a/scripts/generate_combined_wordlist.ts b/scripts/generate_combined_wordlist.ts
@@ -0,0 +1,39 @@
+// Combines every word list in word_lists/ (skipping generated `.gen.txt`
+// files) into a single de-duplicated, sorted file: combined.gen.txt.
+// Input list format, one word per line:
+// ---
+// word1
+// word2
+// word3
+// ---
+
+import fs from "node:fs";
+import path from "node:path";
+
+const wordListsPath = path.join(__dirname, "..", "word_lists");
+const combinedPath = path.join(wordListsPath, "combined.gen.txt");
+const header = "# Generated by generate_combined_wordlist.ts. Do not edit.\n";
+
+const combined = new Set<string>();
+
+for (const file of fs.readdirSync(wordListsPath)) {
+  // Generated outputs (including a previous combined.gen.txt) are not inputs.
+  if (file.endsWith(".gen.txt")) {
+    continue;
+  }
+  const words = fs
+    .readFileSync(path.join(wordListsPath, file), "utf-8")
+    // Accept both LF and CRLF line endings so a stray "\r" never becomes part
+    // of a word, which would defeat de-duplication.
+    .split(/\r?\n/);
+  for (const word of words) {
+    // Entries of one character or fewer (including blank lines) are skipped.
+    if (word.length > 1) {
+      combined.add(word);
+    }
+  }
+}
+
+// Write the header and the sorted words in a single call so the output file
+// is never left containing only the header.
+fs.writeFileSync(
+  combinedPath,
+  header + Array.from(combined).toSorted().join("\n"),
+);
+console.log("Combined word list written to", combinedPath);
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,7 @@ pub fn get_processor() -> CodeDictionary { @@
         }
         cdict
     }
+    /// Sets up `env_logger` in test mode for the calling test.
+    ///
+    /// The result of `try_init` is deliberately discarded, so any error it
+    /// reports (presumably when a logger is already installed) is ignored
+    /// rather than failing the test.
+    pub fn init_logging() {
+        let mut builder = env_logger::builder();
+        builder.is_test(true);
+        let _ = builder.try_init();
+    }