Skip to main content

harper_core/spell/
mutable_dictionary.rs

1use super::{
2    FstDictionary, WordId,
3    rune::{self, AttributeList, parse_word_list},
4    word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use std::sync::Arc;
9use std::{borrow::Cow, sync::LazyLock};
10
11use crate::{CharString, CharStringExt, DictWordMetadata};
12
13use super::FuzzyMatchResult;
14use super::dictionary::Dictionary;
15
16/// A basic dictionary that allows words to be added after instantiating.
17/// This is useful for user and file dictionaries that may change at runtime.
18///
19/// For immutable use-cases that require fuzzy lookups, such as the curated dictionary, prefer [`super::FstDictionary`],
20/// as it is much faster.
21///
22/// To combine the contents of multiple dictionaries, regardless of type, use
23/// [`super::MergedDictionary`].
24#[derive(Debug, Clone, Eq, PartialEq)]
25pub struct MutableDictionary {
26    /// All English words
27    word_map: WordMap,
28}
29
30/// The uncached function that is used to produce the original copy of the
31/// curated dictionary.
32fn uncached_inner_new() -> Arc<MutableDictionary> {
33    MutableDictionary::from_rune_files(
34        include_str!("../../dictionary.dict"),
35        include_str!("../../annotations.json"),
36    )
37    .map(Arc::new)
38    .unwrap_or_else(|e| panic!("Failed to load curated dictionary: {}", e))
39}
40
41static DICT: LazyLock<Arc<MutableDictionary>> = LazyLock::new(uncached_inner_new);
42
43impl MutableDictionary {
44    pub fn new() -> Self {
45        Self {
46            word_map: WordMap::default(),
47        }
48    }
49
50    pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
51        let word_list = parse_word_list(word_list)?;
52        let attr_list = AttributeList::parse(attr_list)?;
53
54        // There will be at _least_ this number of words
55        let mut word_map = WordMap::default();
56
57        attr_list.expand_annotated_words(word_list, &mut word_map);
58
59        Ok(Self { word_map })
60    }
61
62    /// Create a dictionary from the curated dictionary included
63    /// in the Harper binary.
64    /// Consider using [`super::FstDictionary::curated()`] instead, as it is more performant for spellchecking.
65    pub fn curated() -> Arc<Self> {
66        (*DICT).clone()
67    }
68
69    /// Appends words to the dictionary.
70    /// It is significantly faster to append many words with one call than many
71    /// distinct calls to this function.
72    pub fn extend_words(
73        &mut self,
74        words: impl IntoIterator<Item = (impl AsRef<[char]>, DictWordMetadata)>,
75    ) {
76        for (chars, metadata) in words.into_iter() {
77            self.word_map.insert(WordMapEntry {
78                metadata,
79                canonical_spelling: chars.as_ref().into(),
80            })
81        }
82    }
83
84    /// Append a single word to the dictionary.
85    ///
86    /// If you are appending many words, consider using [`Self::extend_words`]
87    /// instead.
88    pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: DictWordMetadata) {
89        self.extend_words(std::iter::once((word.as_ref(), metadata)))
90    }
91
92    /// Append a single string to the dictionary.
93    ///
94    /// If you are appending many words, consider using [`Self::extend_words`]
95    /// instead.
96    pub fn append_word_str(&mut self, word: &str, metadata: DictWordMetadata) {
97        self.append_word(word.chars().collect::<Vec<_>>(), metadata)
98    }
99}
100
101impl Default for MutableDictionary {
102    fn default() -> Self {
103        Self::new()
104    }
105}
106
107impl Dictionary for MutableDictionary {
108    fn get_word_metadata(&self, word: &[char]) -> Option<Cow<'_, DictWordMetadata>> {
109        self.word_map
110            .get_with_chars(word)
111            .map(|v| Cow::Borrowed(&v.metadata))
112    }
113
114    fn contains_word(&self, word: &[char]) -> bool {
115        self.word_map.contains_chars(word)
116    }
117
118    fn contains_word_str(&self, word: &str) -> bool {
119        let chars: CharString = word.chars().collect();
120        self.contains_word(&chars)
121    }
122
123    fn get_word_metadata_str(&self, word: &str) -> Option<Cow<'_, DictWordMetadata>> {
124        let chars: CharString = word.chars().collect();
125        self.get_word_metadata(&chars)
126    }
127
128    fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
129        self.word_map
130            .get_with_chars(word)
131            .map(|v| v.canonical_spelling.as_slice())
132    }
133
134    /// Suggest a correct spelling for a given misspelled word.
135    /// `Self::word` is assumed to be quite small (n < 100).
136    /// `max_distance` relates to an optimization that allows the search
137    /// algorithm to prune large portions of the search.
138    fn fuzzy_match(
139        &'_ self,
140        word: &[char],
141        max_distance: u8,
142        max_results: usize,
143    ) -> Vec<FuzzyMatchResult<'_>> {
144        let misspelled_charslice = word.normalized();
145        let misspelled_charslice_lower = misspelled_charslice.to_lower();
146
147        let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
148            1
149        } else {
150            misspelled_charslice.len() - max_distance as usize
151        };
152        let longest_word_len = misspelled_charslice.len() + max_distance as usize;
153
154        // Get candidate words
155        let words_to_search = self
156            .words_iter()
157            .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
158
159        // Pre-allocated vectors for the edit-distance calculation
160        // 53 is the length of the longest word.
161        let mut buf_a = Vec::with_capacity(53);
162        let mut buf_b = Vec::with_capacity(53);
163
164        // Sort by edit-distance
165        words_to_search
166            .filter_map(|word| {
167                let dist =
168                    edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
169                let lowercase_dist = edit_distance_min_alloc(
170                    &misspelled_charslice_lower,
171                    word,
172                    &mut buf_a,
173                    &mut buf_b,
174                );
175
176                let smaller_dist = dist.min(lowercase_dist);
177                if smaller_dist <= max_distance {
178                    Some((word, smaller_dist))
179                } else {
180                    None
181                }
182            })
183            .sorted_unstable_by_key(|a| a.1)
184            .take(max_results)
185            .map(|(word, edit_distance)| FuzzyMatchResult {
186                word,
187                edit_distance,
188                metadata: self.get_word_metadata(word).unwrap(),
189            })
190            .collect()
191    }
192
193    fn fuzzy_match_str(
194        &'_ self,
195        word: &str,
196        max_distance: u8,
197        max_results: usize,
198    ) -> Vec<FuzzyMatchResult<'_>> {
199        let word: Vec<_> = word.chars().collect();
200        self.fuzzy_match(&word, max_distance, max_results)
201    }
202
203    fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
204        Box::new(
205            self.word_map
206                .iter()
207                .map(|v| v.canonical_spelling.as_slice()),
208        )
209    }
210
211    fn word_count(&self) -> usize {
212        self.word_map.len()
213    }
214
215    fn contains_exact_word(&self, word: &[char]) -> bool {
216        let normalized = word.normalized();
217
218        if let Some(found) = self.word_map.get_with_chars(normalized.as_ref())
219            && found.canonical_spelling.as_ref() == normalized.as_ref()
220        {
221            return true;
222        }
223
224        false
225    }
226
227    fn contains_exact_word_str(&self, word: &str) -> bool {
228        let word: CharString = word.chars().collect();
229        self.contains_exact_word(word.as_ref())
230    }
231
232    fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
233        self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
234    }
235
236    fn find_words_with_prefix(&self, prefix: &[char]) -> Vec<Cow<'_, [char]>> {
237        let mut found = Vec::new();
238
239        for word in self.words_iter() {
240            if let Some(item_prefix) = word.get(0..prefix.len())
241                && item_prefix == prefix
242            {
243                found.push(Cow::Borrowed(word));
244            }
245        }
246
247        found
248    }
249
250    fn find_words_with_common_prefix(&self, word: &[char]) -> Vec<Cow<'_, [char]>> {
251        let mut found = Vec::new();
252
253        for item in self.words_iter() {
254            if let Some(item_prefix) = word.get(0..item.len())
255                && item_prefix == item
256            {
257                found.push(Cow::Borrowed(item));
258            }
259        }
260
261        found
262    }
263}
264
265impl From<MutableDictionary> for FstDictionary {
266    fn from(dict: MutableDictionary) -> Self {
267        let words = dict
268            .word_map
269            .into_iter()
270            .map(|entry| (entry.canonical_spelling, entry.metadata))
271            .collect();
272
273        FstDictionary::new(words)
274    }
275}
276
277#[cfg(test)]
278mod tests {
279    use std::borrow::Cow;
280
281    use hashbrown::HashSet;
282    use itertools::Itertools;
283
284    use crate::spell::{Dictionary, MutableDictionary};
285    use crate::{DictWordMetadata, char_string::char_string};
286
287    #[test]
288    fn curated_contains_no_duplicates() {
289        let dict = MutableDictionary::curated();
290        assert!(dict.words_iter().all_unique());
291    }
292
293    #[test]
294    fn curated_matches_capitalized() {
295        let dict = MutableDictionary::curated();
296        assert!(dict.contains_word_str("this"));
297        assert!(dict.contains_word_str("This"));
298    }
299
300    // "This" is a determiner when used similarly to "the"
301    // but when used alone it's a "demonstrative pronoun".
302    // Harper previously wrongly classified it as a noun.
303    #[test]
304    fn this_is_determiner() {
305        let dict = MutableDictionary::curated();
306        assert!(dict.get_word_metadata_str("this").unwrap().is_determiner());
307        assert!(dict.get_word_metadata_str("This").unwrap().is_determiner());
308    }
309
310    #[test]
311    fn several_is_quantifier() {
312        let dict = MutableDictionary::curated();
313        assert!(
314            dict.get_word_metadata_str("several")
315                .unwrap()
316                .is_quantifier()
317        );
318    }
319
320    #[test]
321    fn few_is_quantifier() {
322        let dict = MutableDictionary::curated();
323        assert!(dict.get_word_metadata_str("few").unwrap().is_quantifier());
324    }
325
326    #[test]
327    fn fewer_is_quantifier() {
328        let dict = MutableDictionary::curated();
329        assert!(dict.get_word_metadata_str("fewer").unwrap().is_quantifier());
330    }
331
332    #[test]
333    fn than_is_conjunction() {
334        let dict = MutableDictionary::curated();
335        assert!(dict.get_word_metadata_str("than").unwrap().is_conjunction());
336        assert!(dict.get_word_metadata_str("Than").unwrap().is_conjunction());
337    }
338
339    #[test]
340    fn herself_is_pronoun() {
341        let dict = MutableDictionary::curated();
342        assert!(dict.get_word_metadata_str("herself").unwrap().is_pronoun());
343        assert!(dict.get_word_metadata_str("Herself").unwrap().is_pronoun());
344    }
345
346    #[test]
347    fn discussion_171() {
348        let dict = MutableDictionary::curated();
349        assert!(dict.contains_word_str("natively"));
350    }
351
352    #[test]
353    fn im_is_common() {
354        let dict = MutableDictionary::curated();
355        assert!(dict.get_word_metadata_str("I'm").unwrap().common);
356    }
357
358    #[test]
359    fn fuzzy_result_sorted_by_edit_distance() {
360        let dict = MutableDictionary::curated();
361
362        let results = dict.fuzzy_match_str("hello", 3, 100);
363        let is_sorted_by_dist = results
364            .iter()
365            .map(|fm| fm.edit_distance)
366            .tuple_windows()
367            .all(|(a, b)| a <= b);
368
369        assert!(is_sorted_by_dist)
370    }
371
372    #[test]
373    fn there_is_not_a_pronoun() {
374        let dict = MutableDictionary::curated();
375
376        assert!(!dict.get_word_metadata_str("there").unwrap().is_nominal());
377        assert!(!dict.get_word_metadata_str("there").unwrap().is_pronoun());
378    }
379
380    #[test]
381    fn expanded_contains_giants() {
382        assert!(MutableDictionary::curated().contains_word_str("giants"));
383    }
384
385    #[test]
386    fn expanded_contains_deallocate() {
387        assert!(MutableDictionary::curated().contains_word_str("deallocate"));
388    }
389
390    #[test]
391    fn curated_contains_repo() {
392        let dict = MutableDictionary::curated();
393
394        assert!(dict.contains_word_str("repo"));
395        assert!(dict.contains_word_str("repos"));
396        assert!(dict.contains_word_str("repo's"));
397    }
398
399    #[test]
400    fn curated_contains_possessive_abandonment() {
401        assert!(
402            MutableDictionary::curated()
403                .get_word_metadata_str("abandonment's")
404                .unwrap()
405                .is_possessive_noun()
406        )
407    }
408
409    #[test]
410    fn has_is_not_a_nominal() {
411        let dict = MutableDictionary::curated();
412
413        let has = dict.get_word_metadata_str("has");
414        assert!(has.is_some());
415
416        assert!(!has.unwrap().is_nominal())
417    }
418
419    #[test]
420    fn is_is_linking_verb() {
421        let dict = MutableDictionary::curated();
422
423        let is = dict.get_word_metadata_str("is");
424
425        assert!(is.is_some());
426        assert!(is.unwrap().is_linking_verb());
427    }
428
429    #[test]
430    fn are_merged_attrs_same_as_spread_attrs() {
431        let curated_attr_list = include_str!("../../annotations.json");
432
433        let merged = MutableDictionary::from_rune_files("1\nblork/DGS", curated_attr_list).unwrap();
434        let spread =
435            MutableDictionary::from_rune_files("2\nblork/DG\nblork/S", curated_attr_list).unwrap();
436
437        assert_eq!(
438            merged.word_map.into_iter().collect::<HashSet<_>>(),
439            spread.word_map.into_iter().collect::<HashSet<_>>()
440        );
441    }
442
443    #[test]
444    fn apart_is_not_noun() {
445        let dict = MutableDictionary::curated();
446
447        assert!(!dict.get_word_metadata_str("apart").unwrap().is_noun());
448    }
449
450    #[test]
451    fn be_is_verb_lemma() {
452        let dict = MutableDictionary::curated();
453
454        let is = dict.get_word_metadata_str("be");
455
456        assert!(is.is_some());
457        assert!(is.unwrap().is_verb_lemma());
458    }
459
460    #[test]
461    fn gets_prefixes_as_expected() {
462        let mut dict = MutableDictionary::new();
463        dict.append_word_str("predict", DictWordMetadata::default());
464        dict.append_word_str("prelude", DictWordMetadata::default());
465        dict.append_word_str("preview", DictWordMetadata::default());
466        dict.append_word_str("dwight", DictWordMetadata::default());
467
468        let with_prefix = dict.find_words_with_prefix(char_string!("pre").as_slice());
469
470        assert_eq!(with_prefix.len(), 3);
471        assert!(with_prefix.contains(&Cow::Owned(char_string!("predict").into_vec())));
472        assert!(with_prefix.contains(&Cow::Owned(char_string!("prelude").into_vec())));
473        assert!(with_prefix.contains(&Cow::Owned(char_string!("preview").into_vec())));
474    }
475
476    #[test]
477    fn gets_common_prefixes_as_expected() {
478        let mut dict = MutableDictionary::new();
479        dict.append_word_str("pre", DictWordMetadata::default());
480        dict.append_word_str("prep", DictWordMetadata::default());
481        dict.append_word_str("dwight", DictWordMetadata::default());
482
483        let with_prefix =
484            dict.find_words_with_common_prefix(char_string!("preposition").as_slice());
485
486        assert_eq!(with_prefix.len(), 2);
487        assert!(with_prefix.contains(&Cow::Owned(char_string!("pre").into_vec())));
488        assert!(with_prefix.contains(&Cow::Owned(char_string!("prep").into_vec())));
489    }
490}