Skip to main content

lang_check/
dictionary.rs

1use anyhow::Result;
2use std::collections::HashSet;
3use std::path::{Path, PathBuf};
4
/// Manages custom dictionaries for the language checker.
///
/// Any word found in the dictionary is excluded from spelling diagnostics.
/// Words are stored lowercased so lookups can be case-insensitive.
///
/// Two separate sets are kept so that persistence never leaks read-only data:
/// only `user_words` is ever written back to `dictionary.txt`.
#[derive(Debug, Clone)]
pub struct Dictionary {
    /// Words added by the user; the only set that is persisted to disk.
    user_words: HashSet<String>,
    /// Words from bundled or externally loaded wordlists; never persisted.
    bundled_words: HashSet<String>,
    /// Location of the user dictionary file, or `None` when the dictionary
    /// is purely in-memory and nothing should be written.
    workspace_path: Option<PathBuf>,
}
15
16impl Default for Dictionary {
17    fn default() -> Self {
18        Self::new()
19    }
20}
21
22impl Dictionary {
23    #[must_use]
24    pub fn new() -> Self {
25        Self {
26            user_words: HashSet::new(),
27            bundled_words: HashSet::new(),
28            workspace_path: None,
29        }
30    }
31
32    /// Load dictionaries from a workspace root.
33    /// Reads from .languagecheck/dictionary.txt (one word per line).
34    pub fn load(workspace_root: &Path) -> Result<Self> {
35        let mut dict = Self::new();
36        let dict_path = workspace_root.join(".languagecheck").join("dictionary.txt");
37        dict.workspace_path = Some(dict_path.clone());
38
39        if dict_path.exists() {
40            let content = std::fs::read_to_string(&dict_path)?;
41            for line in content.lines() {
42                let word = line.trim();
43                if !word.is_empty() && !word.starts_with('#') {
44                    dict.user_words.insert(word.to_lowercase());
45                }
46            }
47        }
48
49        Ok(dict)
50    }
51
52    /// Load the bundled dictionaries that ship with the extension.
53    /// These contain domain-specific technical terms from open-source wordlists.
54    /// Bundled words are kept separate and never persisted to the user's dictionary file.
55    pub fn load_bundled(&mut self) {
56        for words_str in bundled::ALL {
57            parse_wordlist_into(words_str, &mut self.bundled_words);
58        }
59    }
60
61    /// Load additional words from a file path. The file is expected to contain
62    /// one word per line; lines starting with `#` and blank lines are skipped.
63    ///
64    /// Paths are resolved relative to `base` if they are not absolute.
65    pub fn load_wordlist_file(&mut self, path: &Path, base: &Path) -> Result<()> {
66        let resolved = if path.is_absolute() {
67            path.to_path_buf()
68        } else {
69            base.join(path)
70        };
71
72        let resolved = resolved.canonicalize().map_err(|e| {
73            anyhow::anyhow!("Cannot resolve wordlist path {}: {e}", resolved.display())
74        })?;
75
76        // Security: refuse to read files outside the workspace or common config dirs.
77        // Canonicalize base too — on macOS /var → /private/var, on Windows UNC prefixes differ.
78        let canonical_base = base.canonicalize().unwrap_or_else(|_| base.to_path_buf());
79        if !resolved.starts_with(&canonical_base)
80            && !resolved.starts_with(dirs::config_dir().unwrap_or_default())
81            && !resolved.starts_with(dirs::home_dir().unwrap_or_default().join(".config"))
82        {
83            anyhow::bail!(
84                "Wordlist path {} is outside the workspace and known config directories",
85                resolved.display()
86            );
87        }
88
89        let content = std::fs::read_to_string(&resolved)
90            .map_err(|e| anyhow::anyhow!("Cannot read wordlist {}: {e}", resolved.display()))?;
91        parse_wordlist_into(&content, &mut self.bundled_words);
92        Ok(())
93    }
94
95    /// Add a word to the user dictionary and persist to disk.
96    pub fn add_word(&mut self, word: &str) -> Result<()> {
97        let lower = word.to_lowercase();
98        if self.user_words.insert(lower) {
99            self.persist()?;
100        }
101        Ok(())
102    }
103
104    /// Check if a word is in the dictionary (case-insensitive).
105    /// Checks both user words and bundled/external wordlists.
106    #[must_use]
107    pub fn contains(&self, word: &str) -> bool {
108        let lower = word.to_lowercase();
109        self.user_words.contains(&lower) || self.bundled_words.contains(&lower)
110    }
111
112    /// Return all words in the dictionary (user + bundled).
113    pub fn words(&self) -> impl Iterator<Item = &String> {
114        self.user_words.iter().chain(self.bundled_words.iter())
115    }
116
117    /// Return the total number of words loaded (user + bundled).
118    #[must_use]
119    pub fn len(&self) -> usize {
120        self.user_words.len() + self.bundled_words.len()
121    }
122
123    /// Whether the dictionary is empty.
124    #[must_use]
125    pub fn is_empty(&self) -> bool {
126        self.user_words.is_empty() && self.bundled_words.is_empty()
127    }
128
129    /// Persist only user-added words to the workspace dictionary file.
130    fn persist(&self) -> Result<()> {
131        let Some(path) = &self.workspace_path else {
132            return Ok(());
133        };
134
135        if let Some(parent) = path.parent() {
136            std::fs::create_dir_all(parent)?;
137        }
138
139        let mut words: Vec<&str> = self.user_words.iter().map(String::as_str).collect();
140        words.sort_unstable();
141        let content = words.join("\n");
142        std::fs::write(path, content + "\n")?;
143        Ok(())
144    }
145}
146
/// Parse a wordlist string (one word per line) into a set.
///
/// Each line is trimmed and lowercased before insertion. Blank lines and
/// lines starting with `#` are skipped. A leading UTF-8 byte-order mark is
/// removed first: it is not whitespace for `trim`, so it would otherwise be
/// glued onto the first word or hide a first-line `#` comment marker.
fn parse_wordlist_into(content: &str, set: &mut HashSet<String>) {
    let content = content.strip_prefix('\u{feff}').unwrap_or(content);

    set.extend(
        content
            .lines()
            .map(str::trim)
            .filter(|entry| !entry.is_empty() && !entry.starts_with('#'))
            .map(str::to_lowercase),
    );
}
156
157/// Bundled dictionary data embedded at compile time.
158/// See `dictionaries/THIRD_PARTY_NOTICES.md` for attribution and licensing.
159pub mod bundled {
160    /// Software development terms, tools, acronyms, and compound words.
161    /// Sources: cspell-dicts (software-terms, cpp). License: MIT.
162    pub const SOFTWARE_TERMS: &str = include_str!("../dictionaries/bundled/software-terms.txt");
163
164    /// TypeScript and JavaScript keywords, builtins, and API terms.
165    /// Source: cspell-dicts (typescript). License: MIT.
166    pub const TYPESCRIPT: &str = include_str!("../dictionaries/bundled/typescript.txt");
167
168    /// Well-known company and brand names.
169    /// Source: cspell-dicts (companies). License: MIT.
170    pub const COMPANIES: &str = include_str!("../dictionaries/bundled/companies.txt");
171
172    /// Computing jargon, hardware terms, and domain-specific vocabulary.
173    /// Sources: hunspell-jargon (MIT), `SpellCheckDic` (MIT),
174    ///          autoware-spell-check-dict (Apache-2.0).
175    pub const JARGON: &str = include_str!("../dictionaries/bundled/jargon.txt");
176
177    /// All bundled wordlists for convenient iteration.
178    pub const ALL: &[&str] = &[SOFTWARE_TERMS, TYPESCRIPT, COMPANIES, JARGON];
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory for a single test.
    ///
    /// The name includes the process id so concurrent test runs on the same
    /// machine do not trample each other, and the directory is removed on drop
    /// so a failing assertion does not leave files behind.
    struct TempWorkspace {
        root: PathBuf,
    }

    impl TempWorkspace {
        /// Create a fresh, empty directory named after `name`.
        fn new(name: &str) -> Self {
            let root = std::env::temp_dir().join(format!("{name}_{}", std::process::id()));
            // Best effort: clear leftovers from an earlier aborted run.
            let _ = std::fs::remove_dir_all(&root);
            std::fs::create_dir_all(&root).unwrap();
            Self { root }
        }

        /// Path of the scratch directory.
        fn path(&self) -> &Path {
            &self.root
        }
    }

    impl Drop for TempWorkspace {
        fn drop(&mut self) {
            let _ = std::fs::remove_dir_all(&self.root);
        }
    }

    #[test]
    fn new_dictionary_is_empty() {
        let dict = Dictionary::new();
        assert!(dict.is_empty());
        assert_eq!(dict.len(), 0);
        assert!(!dict.contains("anything"));
    }

    #[test]
    fn add_and_contains() {
        let mut dict = Dictionary::new();
        dict.user_words.insert("hello".to_string());
        assert!(dict.contains("hello"));
        assert!(dict.contains("Hello")); // case-insensitive
        assert!(dict.contains("HELLO"));
    }

    #[test]
    fn persistence_roundtrip() {
        let workspace = TempWorkspace::new("lang_check_test_dict");

        // Write
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.add_word("kubernetes").unwrap();
            dict.add_word("terraform").unwrap();
        }

        // Read back
        {
            let dict = Dictionary::load(workspace.path()).unwrap();
            assert!(dict.contains("kubernetes"));
            assert!(dict.contains("Kubernetes")); // case-insensitive
            assert!(dict.contains("terraform"));
            assert!(!dict.contains("nonexistent"));
        }
    }

    #[test]
    fn skips_comments_and_blank_lines() {
        let workspace = TempWorkspace::new("lang_check_test_dict_comments");
        let dict_dir = workspace.path().join(".languagecheck");
        std::fs::create_dir_all(&dict_dir).unwrap();
        std::fs::write(
            dict_dir.join("dictionary.txt"),
            "# This is a comment\n\nkubernetes\n  \n# Another comment\nterraform\n",
        )
        .unwrap();

        let dict = Dictionary::load(workspace.path()).unwrap();
        assert!(dict.contains("kubernetes"));
        assert!(dict.contains("terraform"));
        assert_eq!(dict.words().count(), 2);
    }

    #[test]
    fn add_duplicate_word_is_idempotent() {
        let mut dict = Dictionary::new();
        dict.user_words.insert("test".to_string());
        let initial_count = dict.words().count();
        dict.user_words.insert("test".to_string());
        assert_eq!(dict.words().count(), initial_count);
    }

    #[test]
    fn words_iterator() {
        let mut dict = Dictionary::new();
        dict.user_words.insert("alpha".to_string());
        dict.user_words.insert("beta".to_string());
        assert_eq!(dict.words().count(), 2);
    }

    #[test]
    fn bundled_dictionaries_load() {
        let mut dict = Dictionary::new();
        dict.load_bundled();

        // Should have thousands of words from bundled sources
        assert!(
            dict.len() > 5000,
            "Expected > 5000 bundled words, got {}",
            dict.len()
        );

        // Spot-check some well-known terms from each category
        assert!(
            dict.contains("kubernetes"),
            "software-terms should include kubernetes"
        );
        assert!(
            dict.contains("webpack"),
            "software-terms should include webpack"
        );
        assert!(
            dict.contains("instanceof"),
            "typescript should include instanceof"
        );
        assert!(dict.contains("stdout"), "jargon should include stdout");
    }

    #[test]
    fn bundled_plus_user_words() {
        let mut dict = Dictionary::new();
        dict.load_bundled();
        let bundled_count = dict.len();

        dict.user_words.insert("myprojectword".to_string());
        assert_eq!(dict.len(), bundled_count + 1);
        assert!(dict.contains("myprojectword"));
        // Bundled words still present
        assert!(dict.contains("kubernetes"));
    }

    #[test]
    fn load_wordlist_file_works() {
        let workspace = TempWorkspace::new("lang_check_test_wordlist");

        let wordlist = workspace.path().join("custom.txt");
        std::fs::write(&wordlist, "# My custom words\nfoobar\nbazqux\n").unwrap();

        let mut dict = Dictionary::new();
        dict.load_wordlist_file(&wordlist, workspace.path()).unwrap();

        assert!(dict.contains("foobar"));
        assert!(dict.contains("bazqux"));
        assert_eq!(dict.len(), 2);
    }

    #[test]
    fn persistence_excludes_bundled_words() {
        let workspace = TempWorkspace::new("lang_check_test_dict_bundled_persist");

        // Add a user word alongside bundled words
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.load_bundled();
            dict.add_word("myuserword").unwrap();
        }

        // Read the persisted file directly — should only contain user words
        let dict_path = workspace.path().join(".languagecheck").join("dictionary.txt");
        let content = std::fs::read_to_string(&dict_path).unwrap();
        assert!(
            content.contains("myuserword"),
            "User word should be persisted"
        );
        assert!(
            !content.contains("kubernetes"),
            "Bundled words should NOT be persisted"
        );

        // Reload and verify everything still works
        {
            let mut dict = Dictionary::load(workspace.path()).unwrap();
            dict.load_bundled();
            assert!(dict.contains("myuserword"));
            assert!(dict.contains("kubernetes"));
        }
    }

    #[test]
    fn load_wordlist_file_relative_path() {
        let workspace = TempWorkspace::new("lang_check_test_wordlist_rel");

        std::fs::write(workspace.path().join("terms.txt"), "myterm\n").unwrap();

        let mut dict = Dictionary::new();
        dict.load_wordlist_file(Path::new("terms.txt"), workspace.path())
            .unwrap();

        assert!(dict.contains("myterm"));
    }
}
374}