use anyhow::Result;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
/// A case-insensitive collection of words that should be accepted as
/// correctly spelled.
///
/// Words are kept in two separate sets so that only the words the user added
/// are ever written back to the workspace dictionary file. Every word is
/// stored lowercased.
#[derive(Debug, Clone)]
pub struct Dictionary {
    /// Lowercased words read from, or added to, the workspace dictionary
    /// file. This is the only set that is written to disk.
    user_words: HashSet<String>,
    /// Lowercased words from the bundled lists and from additional wordlist
    /// files. These are never written to disk.
    bundled_words: HashSet<String>,
    /// Location of the workspace dictionary file, or `None` for a purely
    /// in-memory dictionary.
    workspace_path: Option<PathBuf>,
}
impl Default for Dictionary {
fn default() -> Self {
Self::new()
}
}
impl Dictionary {
    /// Creates an empty dictionary with no words and no backing file.
    ///
    /// Words added to such a dictionary are kept in memory only, because
    /// there is no workspace path to write them to.
    #[must_use]
    pub fn new() -> Self {
        Self {
            user_words: HashSet::new(),
            bundled_words: HashSet::new(),
            workspace_path: None,
        }
    }

    /// Loads the user dictionary stored at
    /// `<workspace_root>/.languagecheck/dictionary.txt`.
    ///
    /// The file holds one word per line; it is parsed with the same rules as
    /// every other wordlist (trimmed, lowercased, blank lines and lines
    /// starting with `#` skipped). A missing file is not an error: the
    /// dictionary starts out empty and the file is created by the first
    /// successful [`add_word`](Self::add_word).
    ///
    /// # Errors
    ///
    /// Returns an error if the file exists but cannot be read.
    pub fn load(workspace_root: &Path) -> Result<Self> {
        let dict_path = workspace_root.join(".languagecheck").join("dictionary.txt");
        let mut dict = Self::new();
        if dict_path.exists() {
            let content = std::fs::read_to_string(&dict_path)?;
            // Share the parser with the other wordlist sources so the file
            // format cannot drift between them.
            parse_wordlist_into(&content, &mut dict.user_words);
        }
        // Remember the path even when the file is absent so later additions
        // know where to persist.
        dict.workspace_path = Some(dict_path);
        Ok(dict)
    }

    /// Adds every word list compiled into the binary (see [`bundled::ALL`])
    /// to the non-persisted word set.
    pub fn load_bundled(&mut self) {
        for words_str in bundled::ALL {
            parse_wordlist_into(words_str, &mut self.bundled_words);
        }
    }

    /// Loads an additional wordlist file into the non-persisted word set.
    ///
    /// A relative `path` is resolved against `base`. After symlinks are
    /// resolved, the file must be located inside one of these directories:
    ///
    /// * `base` (an empty `base` means the current directory),
    /// * the platform configuration directory reported by `dirs::config_dir`,
    /// * `.config` inside the home directory reported by `dirs::home_dir`.
    ///
    /// A directory that cannot be determined is ignored rather than treated
    /// as an empty path, because every path starts with the empty path and
    /// the check would otherwise accept any file.
    ///
    /// # Errors
    ///
    /// Returns an error if the path cannot be resolved, lies outside all of
    /// the allowed directories, or cannot be read.
    pub fn load_wordlist_file(&mut self, path: &Path, base: &Path) -> Result<()> {
        let requested = if path.is_absolute() {
            path.to_path_buf()
        } else {
            base.join(path)
        };
        let resolved = requested.canonicalize().map_err(|e| {
            anyhow::anyhow!("Cannot resolve wordlist path {}: {e}", requested.display())
        })?;

        // An empty base (e.g. the parent of a bare file name) means "the
        // current directory"; spell that out so it can be canonicalized.
        let workspace_root = if base.as_os_str().is_empty() {
            Path::new(".")
        } else {
            base
        };
        let allowed_roots = [
            Some(workspace_root.to_path_buf()),
            dirs::config_dir(),
            dirs::home_dir().map(|home| home.join(".config")),
        ];
        let permitted = allowed_roots.iter().flatten().any(|root| {
            // `resolved` is canonical, so compare it against the canonical
            // form of the root whenever that can be computed.
            let root = root.canonicalize().unwrap_or_else(|_| root.clone());
            // `Path::starts_with` is true for an empty prefix; never let an
            // empty root authorize a path.
            !root.as_os_str().is_empty() && resolved.starts_with(&root)
        });
        if !permitted {
            anyhow::bail!(
                "Wordlist path {} is outside the workspace and known config directories",
                resolved.display()
            );
        }

        let content = std::fs::read_to_string(&resolved)
            .map_err(|e| anyhow::anyhow!("Cannot read wordlist {}: {e}", resolved.display()))?;
        parse_wordlist_into(&content, &mut self.bundled_words);
        Ok(())
    }

    /// Adds `word` to the user dictionary and writes the dictionary file when
    /// the word was not already present.
    ///
    /// The word is trimmed and lowercased first, which is the form it would
    /// have after being read back from the file. Input that is empty after
    /// trimming is ignored. Note that a word beginning with `#` is written
    /// as-is and is therefore skipped as a comment the next time the file is
    /// loaded.
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary file cannot be written. The word
    /// remains in the in-memory dictionary in that case.
    pub fn add_word(&mut self, word: &str) -> Result<()> {
        let normalized = word.trim().to_lowercase();
        if normalized.is_empty() {
            return Ok(());
        }
        if self.user_words.insert(normalized) {
            self.persist()?;
        }
        Ok(())
    }

    /// Returns `true` if `word`, compared case-insensitively, is a user word
    /// or a bundled word.
    #[must_use]
    pub fn contains(&self, word: &str) -> bool {
        let lower = word.to_lowercase();
        self.user_words.contains(&lower) || self.bundled_words.contains(&lower)
    }

    /// Iterates over every known word exactly once, in unspecified order.
    ///
    /// User words come first; bundled words that are also user words are
    /// skipped so that no word is yielded twice.
    pub fn words(&self) -> impl Iterator<Item = &String> {
        let user_words = &self.user_words;
        user_words.iter().chain(
            self.bundled_words
                .iter()
                .filter(move |word| !user_words.contains(*word)),
        )
    }

    /// Returns the number of distinct words in the dictionary.
    ///
    /// A word present in both the user set and the bundled set is counted
    /// once.
    #[must_use]
    pub fn len(&self) -> usize {
        // Scan the user set: it only holds words from the workspace file,
        // while the bundled set also receives whole wordlist files.
        let user_only = self
            .user_words
            .iter()
            .filter(|word| !self.bundled_words.contains(*word))
            .count();
        self.bundled_words.len() + user_only
    }

    /// Returns `true` if the dictionary holds no words at all.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.user_words.is_empty() && self.bundled_words.is_empty()
    }

    /// Writes the user words to the workspace dictionary file, one per line
    /// in sorted order, creating the parent directory when needed.
    ///
    /// Does nothing for a dictionary without a workspace path. The file is
    /// rewritten from the in-memory set, so comment lines that were in the
    /// file are not preserved.
    fn persist(&self) -> Result<()> {
        let Some(path) = &self.workspace_path else {
            return Ok(());
        };
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent)?;
        }
        let mut words: Vec<&str> = self.user_words.iter().map(String::as_str).collect();
        // Sorted output keeps the file stable across runs.
        words.sort_unstable();
        let mut content = words.join("\n");
        content.push('\n');
        std::fs::write(path, content)?;
        Ok(())
    }
}
/// Parses `content` as a wordlist with one word per line and inserts the
/// words into `set`.
///
/// Each line is trimmed; blank lines and lines whose first non-blank
/// character is `#` are skipped. The remaining words are stored lowercased.
fn parse_wordlist_into(content: &str, set: &mut HashSet<String>) {
    set.extend(
        content
            .lines()
            .map(str::trim)
            .filter(|entry| !(entry.is_empty() || entry.starts_with('#')))
            .map(str::to_lowercase),
    );
}
/// Word lists embedded into the compiled crate with `include_str!`.
///
/// Each constant holds the full text of one file; the paths are resolved
/// relative to this source file at compile time. `Dictionary::load_bundled`
/// feeds every entry of [`ALL`] through the wordlist parser.
pub mod bundled {
/// Text of `../dictionaries/bundled/software-terms.txt`.
pub const SOFTWARE_TERMS: &str = include_str!("../dictionaries/bundled/software-terms.txt");
/// Text of `../dictionaries/bundled/typescript.txt`.
pub const TYPESCRIPT: &str = include_str!("../dictionaries/bundled/typescript.txt");
/// Text of `../dictionaries/bundled/companies.txt`.
pub const COMPANIES: &str = include_str!("../dictionaries/bundled/companies.txt");
/// Text of `../dictionaries/bundled/jargon.txt`.
pub const JARGON: &str = include_str!("../dictionaries/bundled/jargon.txt");
/// Every bundled word list, gathered so callers can load them in one loop.
pub const ALL: &[&str] = &[SOFTWARE_TERMS, TYPESCRIPT, COMPANIES, JARGON];
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory under the system temp dir that is removed when the
    /// value is dropped, so a failing assertion does not leave files behind.
    struct TempWorkspace {
        root: PathBuf,
    }

    impl TempWorkspace {
        /// Creates a fresh, empty directory. The name combines `label` with
        /// the process id so that concurrent runs of the suite do not share
        /// a directory.
        fn new(label: &str) -> Self {
            let root = std::env::temp_dir().join(format!(
                "lang_check_test_{label}_{}",
                std::process::id()
            ));
            // Start from a clean slate in case an earlier run was killed.
            let _ = std::fs::remove_dir_all(&root);
            std::fs::create_dir_all(&root).unwrap();
            Self { root }
        }

        fn path(&self) -> &Path {
            &self.root
        }
    }

    impl Drop for TempWorkspace {
        fn drop(&mut self) {
            let _ = std::fs::remove_dir_all(&self.root);
        }
    }

    #[test]
    fn new_dictionary_is_empty() {
        let dict = Dictionary::new();
        assert!(dict.is_empty());
        assert_eq!(dict.len(), 0);
        assert_eq!(dict.words().count(), 0);
        assert!(!dict.contains("anything"));
    }

    #[test]
    fn add_and_contains() {
        // No workspace path, so `add_word` only touches memory.
        let mut dict = Dictionary::new();
        dict.add_word("Hello").unwrap();
        assert!(dict.contains("hello"));
        assert!(dict.contains("Hello"));
        assert!(dict.contains("HELLO"));
        assert!(!dict.is_empty());
    }

    #[test]
    fn persistence_roundtrip() {
        let workspace = TempWorkspace::new("dict");
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.add_word("kubernetes").unwrap();
            dict.add_word("terraform").unwrap();
        }
        {
            let dict = Dictionary::load(workspace.path()).unwrap();
            assert!(dict.contains("kubernetes"));
            assert!(dict.contains("Kubernetes"));
            assert!(dict.contains("terraform"));
            assert!(!dict.contains("nonexistent"));
        }
    }

    #[test]
    fn skips_comments_and_blank_lines() {
        let workspace = TempWorkspace::new("dict_comments");
        let dict_dir = workspace.path().join(".languagecheck");
        std::fs::create_dir_all(&dict_dir).unwrap();
        std::fs::write(
            dict_dir.join("dictionary.txt"),
            "# This is a comment\n\nkubernetes\n  \n# Another comment\nterraform\n",
        )
        .unwrap();
        let dict = Dictionary::load(workspace.path()).unwrap();
        assert!(dict.contains("kubernetes"));
        assert!(dict.contains("terraform"));
        assert_eq!(dict.words().count(), 2);
    }

    #[test]
    fn add_duplicate_word_is_idempotent() {
        let mut dict = Dictionary::new();
        dict.add_word("test").unwrap();
        let initial_count = dict.words().count();
        assert_eq!(initial_count, 1);
        // Same word again, including a differently-cased spelling.
        dict.add_word("test").unwrap();
        dict.add_word("TEST").unwrap();
        assert_eq!(dict.words().count(), initial_count);
        assert_eq!(dict.len(), initial_count);
    }

    #[test]
    fn words_iterator() {
        let mut dict = Dictionary::new();
        dict.add_word("alpha").unwrap();
        dict.add_word("beta").unwrap();
        let mut words: Vec<&str> = dict.words().map(String::as_str).collect();
        words.sort_unstable();
        assert_eq!(words, ["alpha", "beta"]);
    }

    #[test]
    fn bundled_dictionaries_load() {
        let mut dict = Dictionary::new();
        dict.load_bundled();
        assert!(
            dict.len() > 5000,
            "Expected > 5000 bundled words, got {}",
            dict.len()
        );
        assert!(
            dict.contains("kubernetes"),
            "software-terms should include kubernetes"
        );
        assert!(
            dict.contains("webpack"),
            "software-terms should include webpack"
        );
        assert!(
            dict.contains("instanceof"),
            "typescript should include instanceof"
        );
        assert!(dict.contains("stdout"), "jargon should include stdout");
    }

    #[test]
    fn bundled_plus_user_words() {
        let mut dict = Dictionary::new();
        dict.load_bundled();
        let bundled_count = dict.len();
        dict.user_words.insert("myprojectword".to_string());
        assert_eq!(dict.len(), bundled_count + 1);
        assert!(dict.contains("myprojectword"));
        assert!(dict.contains("kubernetes"));
    }

    #[test]
    fn load_wordlist_file_works() {
        let workspace = TempWorkspace::new("wordlist");
        let wordlist = workspace.path().join("custom.txt");
        std::fs::write(&wordlist, "# My custom words\nfoobar\nbazqux\n").unwrap();
        let mut dict = Dictionary::new();
        dict.load_wordlist_file(&wordlist, workspace.path()).unwrap();
        assert!(dict.contains("foobar"));
        assert!(dict.contains("bazqux"));
        assert_eq!(dict.len(), 2);
    }

    #[test]
    fn persistence_excludes_bundled_words() {
        let workspace = TempWorkspace::new("dict_bundled_persist");
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.load_bundled();
            dict.add_word("myuserword").unwrap();
        }
        let dict_path = workspace
            .path()
            .join(".languagecheck")
            .join("dictionary.txt");
        let content = std::fs::read_to_string(&dict_path).unwrap();
        assert!(
            content.contains("myuserword"),
            "User word should be persisted"
        );
        assert!(
            !content.contains("kubernetes"),
            "Bundled words should NOT be persisted"
        );
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.load_bundled();
            assert!(dict.contains("myuserword"));
            assert!(dict.contains("kubernetes"));
        }
    }

    #[test]
    fn load_wordlist_file_relative_path() {
        let workspace = TempWorkspace::new("wordlist_rel");
        std::fs::write(workspace.path().join("terms.txt"), "myterm\n").unwrap();
        let mut dict = Dictionary::new();
        dict.load_wordlist_file(Path::new("terms.txt"), workspace.path())
            .unwrap();
        assert!(dict.contains("myterm"));
    }
}