harper_core/spell/
mutable_dictionary.rs

1use super::{
2    FstDictionary, WordId,
3    rune::{self, AttributeList, parse_word_list},
4    word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use lazy_static::lazy_static;
9use std::borrow::Cow;
10use std::sync::Arc;
11
12use crate::{CharString, CharStringExt, DictWordMetadata};
13
14use super::FuzzyMatchResult;
15use super::dictionary::Dictionary;
16
17/// A basic dictionary that allows words to be added after instantiating.
18/// This is useful for user and file dictionaries that may change at runtime.
19///
20/// For immutable use-cases that require fuzzy lookups, such as the curated dictionary, prefer [`super::FstDictionary`],
21/// as it is much faster.
22///
23/// To combine the contents of multiple dictionaries, regardless of type, use
24/// [`super::MergedDictionary`].
25#[derive(Debug, Clone, Eq, PartialEq)]
26pub struct MutableDictionary {
27    /// All English words
28    word_map: WordMap,
29}
30
31/// The uncached function that is used to produce the original copy of the
32/// curated dictionary.
33fn uncached_inner_new() -> Arc<MutableDictionary> {
34    MutableDictionary::from_rune_files(
35        include_str!("../../dictionary.dict"),
36        include_str!("../../annotations.json"),
37    )
38    .map(Arc::new)
39    .unwrap_or_else(|e| panic!("Failed to load curated dictionary: {}", e))
40}
41
42lazy_static! {
43    static ref DICT: Arc<MutableDictionary> = uncached_inner_new();
44}
45
46impl MutableDictionary {
47    pub fn new() -> Self {
48        Self {
49            word_map: WordMap::default(),
50        }
51    }
52
53    pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
54        let word_list = parse_word_list(word_list)?;
55        let attr_list = AttributeList::parse(attr_list)?;
56
57        // There will be at _least_ this number of words
58        let mut word_map = WordMap::default();
59
60        attr_list.expand_annotated_words(word_list, &mut word_map);
61
62        Ok(Self { word_map })
63    }
64
65    /// Create a dictionary from the curated dictionary included
66    /// in the Harper binary.
67    /// Consider using [`super::FstDictionary::curated()`] instead, as it is more performant for spellchecking.
68    pub fn curated() -> Arc<Self> {
69        (*DICT).clone()
70    }
71
72    /// Appends words to the dictionary.
73    /// It is significantly faster to append many words with one call than many
74    /// distinct calls to this function.
75    pub fn extend_words(
76        &mut self,
77        words: impl IntoIterator<Item = (impl AsRef<[char]>, DictWordMetadata)>,
78    ) {
79        for (chars, metadata) in words.into_iter() {
80            self.word_map.insert(WordMapEntry {
81                metadata,
82                canonical_spelling: chars.as_ref().into(),
83            })
84        }
85    }
86
87    /// Append a single word to the dictionary.
88    ///
89    /// If you are appending many words, consider using [`Self::extend_words`]
90    /// instead.
91    pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: DictWordMetadata) {
92        self.extend_words(std::iter::once((word.as_ref(), metadata)))
93    }
94
95    /// Append a single string to the dictionary.
96    ///
97    /// If you are appending many words, consider using [`Self::extend_words`]
98    /// instead.
99    pub fn append_word_str(&mut self, word: &str, metadata: DictWordMetadata) {
100        self.append_word(word.chars().collect::<Vec<_>>(), metadata)
101    }
102}
103
104impl Default for MutableDictionary {
105    fn default() -> Self {
106        Self::new()
107    }
108}
109
110impl Dictionary for MutableDictionary {
111    fn get_word_metadata(&self, word: &[char]) -> Option<Cow<'_, DictWordMetadata>> {
112        self.word_map
113            .get_with_chars(word)
114            .map(|v| Cow::Borrowed(&v.metadata))
115    }
116
117    fn contains_word(&self, word: &[char]) -> bool {
118        self.word_map.contains_chars(word)
119    }
120
121    fn contains_word_str(&self, word: &str) -> bool {
122        let chars: CharString = word.chars().collect();
123        self.contains_word(&chars)
124    }
125
126    fn get_word_metadata_str(&self, word: &str) -> Option<Cow<'_, DictWordMetadata>> {
127        let chars: CharString = word.chars().collect();
128        self.get_word_metadata(&chars)
129    }
130
131    fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
132        self.word_map
133            .get_with_chars(word)
134            .map(|v| v.canonical_spelling.as_slice())
135    }
136
137    /// Suggest a correct spelling for a given misspelled word.
138    /// `Self::word` is assumed to be quite small (n < 100).
139    /// `max_distance` relates to an optimization that allows the search
140    /// algorithm to prune large portions of the search.
141    fn fuzzy_match(
142        &'_ self,
143        word: &[char],
144        max_distance: u8,
145        max_results: usize,
146    ) -> Vec<FuzzyMatchResult<'_>> {
147        let misspelled_charslice = word.normalized();
148        let misspelled_charslice_lower = misspelled_charslice.to_lower();
149
150        let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
151            1
152        } else {
153            misspelled_charslice.len() - max_distance as usize
154        };
155        let longest_word_len = misspelled_charslice.len() + max_distance as usize;
156
157        // Get candidate words
158        let words_to_search = self
159            .words_iter()
160            .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
161
162        // Pre-allocated vectors for the edit-distance calculation
163        // 53 is the length of the longest word.
164        let mut buf_a = Vec::with_capacity(53);
165        let mut buf_b = Vec::with_capacity(53);
166
167        // Sort by edit-distance
168        words_to_search
169            .filter_map(|word| {
170                let dist =
171                    edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
172                let lowercase_dist = edit_distance_min_alloc(
173                    &misspelled_charslice_lower,
174                    word,
175                    &mut buf_a,
176                    &mut buf_b,
177                );
178
179                let smaller_dist = dist.min(lowercase_dist);
180                if smaller_dist <= max_distance {
181                    Some((word, smaller_dist))
182                } else {
183                    None
184                }
185            })
186            .sorted_unstable_by_key(|a| a.1)
187            .take(max_results)
188            .map(|(word, edit_distance)| FuzzyMatchResult {
189                word,
190                edit_distance,
191                metadata: self.get_word_metadata(word).unwrap(),
192            })
193            .collect()
194    }
195
196    fn fuzzy_match_str(
197        &'_ self,
198        word: &str,
199        max_distance: u8,
200        max_results: usize,
201    ) -> Vec<FuzzyMatchResult<'_>> {
202        let word: Vec<_> = word.chars().collect();
203        self.fuzzy_match(&word, max_distance, max_results)
204    }
205
206    fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
207        Box::new(
208            self.word_map
209                .iter()
210                .map(|v| v.canonical_spelling.as_slice()),
211        )
212    }
213
214    fn word_count(&self) -> usize {
215        self.word_map.len()
216    }
217
218    fn contains_exact_word(&self, word: &[char]) -> bool {
219        let normalized = word.normalized();
220
221        if let Some(found) = self.word_map.get_with_chars(normalized.as_ref())
222            && found.canonical_spelling.as_ref() == normalized.as_ref()
223        {
224            return true;
225        }
226
227        false
228    }
229
230    fn contains_exact_word_str(&self, word: &str) -> bool {
231        let word: CharString = word.chars().collect();
232        self.contains_exact_word(word.as_ref())
233    }
234
235    fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
236        self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
237    }
238
239    fn find_words_with_prefix(&self, prefix: &[char]) -> Vec<Cow<'_, [char]>> {
240        let mut found = Vec::new();
241
242        for word in self.words_iter() {
243            if let Some(item_prefix) = word.get(0..prefix.len())
244                && item_prefix == prefix
245            {
246                found.push(Cow::Borrowed(word));
247            }
248        }
249
250        found
251    }
252
253    fn find_words_with_common_prefix(&self, word: &[char]) -> Vec<Cow<'_, [char]>> {
254        let mut found = Vec::new();
255
256        for item in self.words_iter() {
257            if let Some(item_prefix) = word.get(0..item.len())
258                && item_prefix == item
259            {
260                found.push(Cow::Borrowed(item));
261            }
262        }
263
264        found
265    }
266}
267
268impl From<MutableDictionary> for FstDictionary {
269    fn from(dict: MutableDictionary) -> Self {
270        let words = dict
271            .word_map
272            .into_iter()
273            .map(|entry| (entry.canonical_spelling, entry.metadata))
274            .collect();
275
276        FstDictionary::new(words)
277    }
278}
279
280#[cfg(test)]
281mod tests {
282    use std::borrow::Cow;
283
284    use hashbrown::HashSet;
285    use itertools::Itertools;
286
287    use crate::spell::{Dictionary, MutableDictionary};
288    use crate::{DictWordMetadata, char_string::char_string};
289
290    #[test]
291    fn curated_contains_no_duplicates() {
292        let dict = MutableDictionary::curated();
293        assert!(dict.words_iter().all_unique());
294    }
295
296    #[test]
297    fn curated_matches_capitalized() {
298        let dict = MutableDictionary::curated();
299        assert!(dict.contains_word_str("this"));
300        assert!(dict.contains_word_str("This"));
301    }
302
303    // "This" is a determiner when used similarly to "the"
304    // but when used alone it's a "demonstrative pronoun".
305    // Harper previously wrongly classified it as a noun.
306    #[test]
307    fn this_is_determiner() {
308        let dict = MutableDictionary::curated();
309        assert!(dict.get_word_metadata_str("this").unwrap().is_determiner());
310        assert!(dict.get_word_metadata_str("This").unwrap().is_determiner());
311    }
312
313    #[test]
314    fn several_is_quantifier() {
315        let dict = MutableDictionary::curated();
316        assert!(
317            dict.get_word_metadata_str("several")
318                .unwrap()
319                .is_quantifier()
320        );
321    }
322
323    #[test]
324    fn few_is_quantifier() {
325        let dict = MutableDictionary::curated();
326        assert!(dict.get_word_metadata_str("few").unwrap().is_quantifier());
327    }
328
329    #[test]
330    fn fewer_is_quantifier() {
331        let dict = MutableDictionary::curated();
332        assert!(dict.get_word_metadata_str("fewer").unwrap().is_quantifier());
333    }
334
335    #[test]
336    fn than_is_conjunction() {
337        let dict = MutableDictionary::curated();
338        assert!(dict.get_word_metadata_str("than").unwrap().is_conjunction());
339        assert!(dict.get_word_metadata_str("Than").unwrap().is_conjunction());
340    }
341
342    #[test]
343    fn herself_is_pronoun() {
344        let dict = MutableDictionary::curated();
345        assert!(dict.get_word_metadata_str("herself").unwrap().is_pronoun());
346        assert!(dict.get_word_metadata_str("Herself").unwrap().is_pronoun());
347    }
348
349    #[test]
350    fn discussion_171() {
351        let dict = MutableDictionary::curated();
352        assert!(dict.contains_word_str("natively"));
353    }
354
355    #[test]
356    fn im_is_common() {
357        let dict = MutableDictionary::curated();
358        assert!(dict.get_word_metadata_str("I'm").unwrap().common);
359    }
360
361    #[test]
362    fn fuzzy_result_sorted_by_edit_distance() {
363        let dict = MutableDictionary::curated();
364
365        let results = dict.fuzzy_match_str("hello", 3, 100);
366        let is_sorted_by_dist = results
367            .iter()
368            .map(|fm| fm.edit_distance)
369            .tuple_windows()
370            .all(|(a, b)| a <= b);
371
372        assert!(is_sorted_by_dist)
373    }
374
375    #[test]
376    fn there_is_not_a_pronoun() {
377        let dict = MutableDictionary::curated();
378
379        assert!(!dict.get_word_metadata_str("there").unwrap().is_nominal());
380        assert!(!dict.get_word_metadata_str("there").unwrap().is_pronoun());
381    }
382
383    #[test]
384    fn expanded_contains_giants() {
385        assert!(MutableDictionary::curated().contains_word_str("giants"));
386    }
387
388    #[test]
389    fn expanded_contains_deallocate() {
390        assert!(MutableDictionary::curated().contains_word_str("deallocate"));
391    }
392
393    #[test]
394    fn curated_contains_repo() {
395        let dict = MutableDictionary::curated();
396
397        assert!(dict.contains_word_str("repo"));
398        assert!(dict.contains_word_str("repos"));
399        assert!(dict.contains_word_str("repo's"));
400    }
401
402    #[test]
403    fn curated_contains_possessive_abandonment() {
404        assert!(
405            MutableDictionary::curated()
406                .get_word_metadata_str("abandonment's")
407                .unwrap()
408                .is_possessive_noun()
409        )
410    }
411
412    #[test]
413    fn has_is_not_a_nominal() {
414        let dict = MutableDictionary::curated();
415
416        let has = dict.get_word_metadata_str("has");
417        assert!(has.is_some());
418
419        assert!(!has.unwrap().is_nominal())
420    }
421
422    #[test]
423    fn is_is_linking_verb() {
424        let dict = MutableDictionary::curated();
425
426        let is = dict.get_word_metadata_str("is");
427
428        assert!(is.is_some());
429        assert!(is.unwrap().is_linking_verb());
430    }
431
432    #[test]
433    fn are_merged_attrs_same_as_spread_attrs() {
434        let curated_attr_list = include_str!("../../annotations.json");
435
436        let merged = MutableDictionary::from_rune_files("1\nblork/DGS", curated_attr_list).unwrap();
437        let spread =
438            MutableDictionary::from_rune_files("2\nblork/DG\nblork/S", curated_attr_list).unwrap();
439
440        assert_eq!(
441            merged.word_map.into_iter().collect::<HashSet<_>>(),
442            spread.word_map.into_iter().collect::<HashSet<_>>()
443        );
444    }
445
446    #[test]
447    fn apart_is_not_noun() {
448        let dict = MutableDictionary::curated();
449
450        assert!(!dict.get_word_metadata_str("apart").unwrap().is_noun());
451    }
452
453    #[test]
454    fn be_is_verb_lemma() {
455        let dict = MutableDictionary::curated();
456
457        let is = dict.get_word_metadata_str("be");
458
459        assert!(is.is_some());
460        assert!(is.unwrap().is_verb_lemma());
461    }
462
463    #[test]
464    fn gets_prefixes_as_expected() {
465        let mut dict = MutableDictionary::new();
466        dict.append_word_str("predict", DictWordMetadata::default());
467        dict.append_word_str("prelude", DictWordMetadata::default());
468        dict.append_word_str("preview", DictWordMetadata::default());
469        dict.append_word_str("dwight", DictWordMetadata::default());
470
471        let with_prefix = dict.find_words_with_prefix(char_string!("pre").as_slice());
472
473        assert_eq!(with_prefix.len(), 3);
474        assert!(with_prefix.contains(&Cow::Owned(char_string!("predict").into_vec())));
475        assert!(with_prefix.contains(&Cow::Owned(char_string!("prelude").into_vec())));
476        assert!(with_prefix.contains(&Cow::Owned(char_string!("preview").into_vec())));
477    }
478
479    #[test]
480    fn gets_common_prefixes_as_expected() {
481        let mut dict = MutableDictionary::new();
482        dict.append_word_str("pre", DictWordMetadata::default());
483        dict.append_word_str("prep", DictWordMetadata::default());
484        dict.append_word_str("dwight", DictWordMetadata::default());
485
486        let with_prefix =
487            dict.find_words_with_common_prefix(char_string!("preposition").as_slice());
488
489        assert_eq!(with_prefix.len(), 2);
490        assert!(with_prefix.contains(&Cow::Owned(char_string!("pre").into_vec())));
491        assert!(with_prefix.contains(&Cow::Owned(char_string!("prep").into_vec())));
492    }
493}