Skip to content

Commit

Permalink
Add struct ids to rust, add more wordlists
Browse files Browse the repository at this point in the history
  • Loading branch information
blopker committed Jan 14, 2025
1 parent ffec971 commit 738dcf0
Show file tree
Hide file tree
Showing 19 changed files with 247 additions and 503 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[workspace]
members = [ "codebook","codebook-lsp", "codebook-zed"]
members = ["codebook", "codebook-lsp", "codebook-zed"]
resolver = "2"

[profile.test]
env = { RUST_LOG = "debug" }
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,23 @@ Codebook is a spellchecker for code. It binds together the venerable Tree Sitter

Codebook is still being developed and is not yet ready for public (or private, really) use. Hit the Star button to follow for updates, though.

## Goals

Spellchecking is complicated, and opinions about how it should be done, especially with code, differ. This section is about the trade-offs that steer Codebook's decisions.

### Privacy

No remote calls for spellchecking or analytics. Once dictionaries are cached, Codebook needs to be usable offline. Codebook will never send the contents of files to a remote server.

### Low noise/High signal

Codebook should only highlight words that users have control over. For example, a misspelled word in an imported function should not be highlighted as the user can't do anything about it.

### Efficient

All features will be weighed against their impact on CPU and memory usage. Codebook should be fast enough, even on low-end hardware, to spellcheck on every keystroke.


## Features

### Code-aware spell checking
Expand Down
20 changes: 3 additions & 17 deletions codebook-lsp/src/lsp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,6 @@ use tower_lsp::{Client, LanguageServer};
use codebook::CodeDictionary;
use log::info;

// #[derive(Clone, Debug)]
// pub struct TextRange {
// pub start_line: u32,
// pub start_char: u32,
// pub end_line: u32,
// pub end_char: u32,
// }

// #[derive(Clone, Debug)]
// pub struct SpellCheckResult {
// pub word: String,
// pub suggestions: Vec<String>,
// pub locations: Vec<TextRange>,
// }

#[derive(Debug)]
pub struct Backend {
pub client: Client,
Expand Down Expand Up @@ -108,8 +93,9 @@ impl Backend {
code_description: None,
source: Some("Codebook".to_string()),
message: format!(
"Possible spelling error: '{}'. Suggestions: {:?}",
res.word, res.suggestions
"Possible spelling error: '{}'. Suggestions: {}",
res.word,
res.suggestions.join(", ")
),
related_information: None,
tags: None,
Expand Down
2 changes: 1 addition & 1 deletion codebook/src/downloader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ impl CacheMetadata {
}
}

/// A downloader for dictionaries from a remote GitHub repository (by default
/// A down-loader for dictionaries from a remote GitHub repository (by default
/// https://github.com/blopker/dictionaries), storing them in a local cache
/// and avoiding re-download if unchanged.
///
Expand Down
32 changes: 16 additions & 16 deletions codebook/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
pub mod downloader;
mod queries;
mod splitter;
use log::info;
use lru::LruCache;

use crate::queries::{
get_language_name_from_filename, get_language_setting, LanguageSetting, LanguageType,
COMMON_DICTIONARY,
get_common_dictionary, get_language_name_from_filename, get_language_setting, LanguageSetting,
LanguageType,
};
use std::{
collections::{HashMap, HashSet},
Expand Down Expand Up @@ -54,7 +55,7 @@ impl CodeDictionary {
let dict = spellbook::Dictionary::new(&aff, &dic)
.map_err(|e| format!("Dictionary parse error: {}", e))?;
let mut custom_dictionary: HashSet<String> = HashSet::new();
for word in COMMON_DICTIONARY.lines() {
for word in get_common_dictionary() {
custom_dictionary.insert(word.to_string());
}
Ok(CodeDictionary {
Expand Down Expand Up @@ -85,10 +86,10 @@ impl CodeDictionary {
}

pub fn suggest(&self, word: &str) -> Vec<String> {
println!("Checking Cache: {:?}", word);
info!("Checking Cache: {:?}", word);
// First try to get from cache with write lock since get() needs to modify LRU order
if let Some(suggestions) = self.suggestion_cache.write().unwrap().get_mut(word) {
println!("Cache hit for {:?}", word);
info!("Cache hit for {:?}", word);
return suggestions.clone();
}

Expand Down Expand Up @@ -278,13 +279,13 @@ impl CodeDictionary {
let current_line = node_start.row as u32;
let current_column = node_start.column as u32;
let words = self.get_words_from_text(node_text);
println!("Found Capture:: {node_text:?}");
println!("Words:: {words:?}");
println!("Column: {current_column}");
println!("Line: {current_line}");
info!("Found Capture:: {node_text:?}");
info!("Words:: {words:?}");
info!("Column: {current_column}");
info!("Line: {current_line}");
for (word_text, (text_start_char, text_line)) in words {
let split = splitter::split_camel_case(&word_text);
println!("Checking: {:?}", split);
info!("Checking: {:?}", split);
for split_word in split {
if !self.check(&split_word.word) {
let offset = if text_line == 0 { current_column } else { 0 };
Expand Down Expand Up @@ -327,12 +328,11 @@ mod lib_tests {
static EXTRA_WORDS: &'static [&'static str] = &["http", "https", "www", "viewport", "UTF"];

fn get_processor() -> CodeDictionary {
let mut cdict =
CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
let dict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
for word in EXTRA_WORDS {
cdict.add_to_dictionary(word);
dict.add_to_dictionary(word);
}
cdict
dict
}

#[test]
Expand All @@ -347,7 +347,7 @@ mod lib_tests {

#[test]
fn test_get_words_from_text() {
let cdict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
let dict = CodeDictionary::new("./tests/en_index.aff", "./tests/en_index.dic").unwrap();
let text = r#"
HelloWorld calc_wrld
I'm a contraction, don't ignore me
Expand All @@ -369,7 +369,7 @@ mod lib_tests {
("rd", (23, 3)),
("line", (26, 3)),
];
let words = cdict.get_words_from_text(text);
let words = dict.get_words_from_text(text);
println!("{:?}", words);
for (i, w) in expected.into_iter().enumerate() {
assert_eq!(words[i], (w.0.to_string(), w.1));
Expand Down
10 changes: 9 additions & 1 deletion codebook/src/queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ impl LanguageType {
}
}

pub static COMMON_DICTIONARY: &str = include_str!("../../wordlists/common.txt");
static COMMON_DICTIONARY: &str = include_str!("../../word_lists/combined.gen.txt");
// Use https://intmainreturn0.com/ts-visualizer/ to help with writing grammar queries
pub static LANGUAGE_SETTINGS: [LanguageSetting; 7] = [
LanguageSetting {
Expand All @@ -41,6 +41,10 @@ pub static LANGUAGE_SETTINGS: [LanguageSetting; 7] = [
pattern: (identifier) @identifier)
(let_declaration
pattern: (identifier) @identifier)
(struct_item
name: (type_identifier) @identifier)
(field_declaration
name: (field_identifier) @identifier)
(line_comment) @comment
(string_content) @string
(char_literal) @string
Expand Down Expand Up @@ -178,3 +182,7 @@ pub fn get_language_name_from_filename(filename: &str) -> Option<LanguageType> {
}
None
}

/// Returns the entries of the bundled common word list, one per line.
///
/// Any line that contains a `#` anywhere is skipped, which removes the
/// generated-file header. Blank lines are passed through unchanged.
pub fn get_common_dictionary() -> impl Iterator<Item = &'static str> {
    COMMON_DICTIONARY
        .lines()
        .filter_map(|line| (!line.contains('#')).then_some(line))
}
58 changes: 57 additions & 1 deletion codebook/tests/test_rust.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use codebook::{SpellCheckResult, TextRange};

mod utils;
// im a bd speler
use utils::init_logging;

#[test]
fn test_rust_simple() {
init_logging();
let processor = utils::get_processor();
let sample_text = r#"
fn calculat_user_age(bithDate: String) -> u32 {
Expand All @@ -25,6 +27,7 @@ fn test_rust_simple() {

#[test]
fn test_rust_comment_location() {
init_logging();
let sample_rust = r#"
// Comment with a typo: mment
"#;
Expand All @@ -44,3 +47,56 @@ fn test_rust_comment_location() {
assert_eq!(misspelled, expected);
assert!(misspelled[0].locations.len() == 1);
}

/// Spell-checks a small Rust snippet containing a struct and verifies that the
/// misspellings in the struct name, the field's doc comment and the field name
/// are each reported with the expected suggestions and location.
#[test]
fn test_rust_struct() {
init_logging();
// Rust source under test. The `TextRange` values below index into this
// text, so its whitespace must not be altered.
// NOTE(review): the column offsets imply a specific indentation of the
// snippet — confirm against the checked-in file before editing.
let sample_rust = r#"
pub struct BadSpeler {
/// Terrible spelling: dwnloader
pub dataz: String,
}
"#;
let expected = vec![
// Struct name: only the `Speler` part of `BadSpeler` is expected to be
// flagged (presumably after camel-case splitting — verify in the library).
SpellCheckResult::new(
"Speler".to_string(),
vec!["Speer", "Speller", "Spewer", "Spengler", "Peeler"],
vec![TextRange {
start_char: 22,
end_char: 28,
start_line: 1,
end_line: 1,
}],
),
// Misspelled word inside the doc comment above the field.
SpellCheckResult::new(
"dwnloader".to_string(),
vec!["loader"],
vec![TextRange {
start_char: 35,
end_char: 44,
start_line: 2,
end_line: 2,
}],
),
// Misspelled field name.
SpellCheckResult::new(
"dataz".to_string(),
vec!["data", "data z"],
vec![TextRange {
start_char: 16,
end_char: 21,
start_line: 3,
end_line: 3,
}],
),
];
let processor = utils::get_processor();
let misspelled = processor.spell_check(sample_rust, "rust").to_vec();
println!("Misspelled words: {misspelled:?}");
// Only the expected words are looked up; any additional results in
// `misspelled` are not checked by this test.
for expect in expected.iter() {
println!("Expecting {}", expect.word);
// `unwrap` panics (failing the test) if the expected word was not reported.
let result = misspelled.iter().find(|r| r.word == expect.word).unwrap();
assert_eq!(result.word, expect.word);
assert_eq!(result.suggestions, expect.suggestions);
assert_eq!(result.locations, expect.locations);
}
}
4 changes: 4 additions & 0 deletions codebook/tests/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,7 @@ pub fn get_processor() -> CodeDictionary {
}
cdict
}

/// Sets up `env_logger` in test mode for the integration tests.
///
/// The `Result` of `try_init` is deliberately discarded, so an
/// initialisation error never fails the calling test.
pub fn init_logging() {
    let mut builder = env_logger::builder();
    builder.is_test(true);
    let _ = builder.try_init();
}
39 changes: 39 additions & 0 deletions scripts/generate_combined_wordlist.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// A script that reads every word list in word_lists (skipping generated files whose names contain .gen.) and combines them into a single file, de-duping the words.
// Output is written to combined.gen.txt
// input list format:
// ---
// word1
// word2
// word3
// ---

import fs from "node:fs";
import path from "node:path";

// Directory holding the hand-maintained word lists and the generated output.
const wordListsPath = path.join(__dirname, "..", "word_lists");
const wordLists = fs.readdirSync(wordListsPath);

// A Set de-duplicates words that appear in more than one list.
const combined = new Set<string>();

for (const file of wordLists) {
  // Skip generated files so a previous combined.gen.txt is never fed back in.
  if (file.endsWith(".gen.txt")) {
    continue;
  }
  const words = fs
    .readFileSync(path.join(wordListsPath, file), "utf-8")
    // Accept both LF and CRLF line endings so no word keeps a trailing "\r".
    .split(/\r?\n/);
  for (const word of words) {
    // Drops blank lines and single-character entries.
    if (word.length > 1) {
      combined.add(word);
    }
  }
}

const combinedPath = path.join(wordListsPath, "combined.gen.txt");
const header = "# Generated by generate_combined_wordlist.ts. Do not edit.\n";
// Header and words go out in a single write, so a failure between two
// separate writes cannot leave a header-only file behind.
fs.writeFileSync(
  combinedPath,
  header + Array.from(combined).toSorted().join("\n"),
);
console.log("Combined word list written to", combinedPath);
Loading

0 comments on commit 738dcf0

Please sign in to comment.