harper_core/spell/
mutable_dictionary.rs

1use super::{
2    FstDictionary, WordId,
3    rune::{self, AttributeList, parse_word_list},
4    word_map::{WordMap, WordMapEntry},
5};
6use crate::edit_distance::edit_distance_min_alloc;
7use itertools::Itertools;
8use lazy_static::lazy_static;
9use std::sync::Arc;
10
11use crate::{CharString, CharStringExt, WordMetadata};
12
13use super::FuzzyMatchResult;
14use super::dictionary::Dictionary;
15
/// A basic dictionary that allows words to be added after instantiating.
/// This is useful for user and file dictionaries that may change at runtime.
///
/// For immutable use-cases, such as the curated dictionary, prefer [`super::FstDictionary`],
/// as it is much faster.
///
/// To combine the contents of multiple dictionaries, regardless of type, use
/// [`super::MergedDictionary`].
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct MutableDictionary {
    /// Backing store for every word held by this dictionary. Each entry pairs
    /// a word's canonical spelling with its metadata.
    word_map: WordMap,
}
29
30/// The uncached function that is used to produce the original copy of the
31/// curated dictionary.
32fn uncached_inner_new() -> Arc<MutableDictionary> {
33    Arc::new(
34        MutableDictionary::from_rune_files(
35            include_str!("../../dictionary.dict"),
36            include_str!("../../affixes.json"),
37        )
38        .expect("Curated dictionary should be valid."),
39    )
40}
41
// Lazily-initialized shared copy of the curated dictionary. It is built by
// `uncached_inner_new` and handed out by `MutableDictionary::curated`, which
// clones the `Arc` rather than the dictionary itself.
lazy_static! {
    static ref DICT: Arc<MutableDictionary> = uncached_inner_new();
}
45
46impl MutableDictionary {
47    pub fn new() -> Self {
48        Self {
49            word_map: WordMap::default(),
50        }
51    }
52
53    pub fn from_rune_files(word_list: &str, attr_list: &str) -> Result<Self, rune::Error> {
54        let word_list = parse_word_list(word_list)?;
55        let attr_list = AttributeList::parse(attr_list)?;
56
57        // There will be at _least_ this number of words
58        let mut word_map = WordMap::default();
59
60        attr_list.expand_marked_words(word_list, &mut word_map);
61
62        Ok(Self { word_map })
63    }
64
65    /// Create a dictionary from the curated dictionary included
66    /// in the Harper binary.
67    /// Consider using [`super::FstDictionary::curated()`] instead, as it is more performant for spellchecking.
68    pub fn curated() -> Arc<Self> {
69        (*DICT).clone()
70    }
71
72    /// Appends words to the dictionary.
73    /// It is significantly faster to append many words with one call than many
74    /// distinct calls to this function.
75    pub fn extend_words(
76        &mut self,
77        words: impl IntoIterator<Item = (impl AsRef<[char]>, WordMetadata)>,
78    ) {
79        for (chars, metadata) in words.into_iter() {
80            self.word_map.insert(WordMapEntry {
81                metadata,
82                canonical_spelling: chars.as_ref().into(),
83            })
84        }
85    }
86
87    /// Append a single word to the dictionary.
88    ///
89    /// If you are appending many words, consider using [`Self::extend_words`]
90    /// instead.
91    pub fn append_word(&mut self, word: impl AsRef<[char]>, metadata: WordMetadata) {
92        self.extend_words(std::iter::once((word.as_ref(), metadata)))
93    }
94
95    /// Append a single string to the dictionary.
96    ///
97    /// If you are appending many words, consider using [`Self::extend_words`]
98    /// instead.
99    pub fn append_word_str(&mut self, word: &str, metadata: WordMetadata) {
100        self.append_word(word.chars().collect::<Vec<_>>(), metadata)
101    }
102}
103
104impl Default for MutableDictionary {
105    fn default() -> Self {
106        Self::new()
107    }
108}
109
110impl Dictionary for MutableDictionary {
111    fn get_word_metadata(&self, word: &[char]) -> Option<&WordMetadata> {
112        self.word_map.get_with_chars(word).map(|v| &v.metadata)
113    }
114
115    fn contains_word(&self, word: &[char]) -> bool {
116        self.word_map.contains_chars(word)
117    }
118
119    fn contains_word_str(&self, word: &str) -> bool {
120        let chars: CharString = word.chars().collect();
121        self.contains_word(&chars)
122    }
123
124    fn get_word_metadata_str(&self, word: &str) -> Option<&WordMetadata> {
125        let chars: CharString = word.chars().collect();
126        self.get_word_metadata(&chars)
127    }
128
129    fn get_correct_capitalization_of(&self, word: &[char]) -> Option<&'_ [char]> {
130        self.word_map
131            .get_with_chars(word)
132            .map(|v| v.canonical_spelling.as_slice())
133    }
134
135    /// Suggest a correct spelling for a given misspelled word.
136    /// `Self::word` is assumed to be quite small (n < 100).
137    /// `max_distance` relates to an optimization that allows the search
138    /// algorithm to prune large portions of the search.
139    fn fuzzy_match(
140        &self,
141        word: &[char],
142        max_distance: u8,
143        max_results: usize,
144    ) -> Vec<FuzzyMatchResult> {
145        let misspelled_charslice = word.normalized();
146        let misspelled_charslice_lower = misspelled_charslice.to_lower();
147
148        let shortest_word_len = if misspelled_charslice.len() <= max_distance as usize {
149            1
150        } else {
151            misspelled_charslice.len() - max_distance as usize
152        };
153        let longest_word_len = misspelled_charslice.len() + max_distance as usize;
154
155        // Get candidate words
156        let words_to_search = self
157            .words_iter()
158            .filter(|word| (shortest_word_len..=longest_word_len).contains(&word.len()));
159
160        // Pre-allocated vectors for the edit-distance calculation
161        // 53 is the length of the longest word.
162        let mut buf_a = Vec::with_capacity(53);
163        let mut buf_b = Vec::with_capacity(53);
164
165        // Sort by edit-distance
166        words_to_search
167            .filter_map(|word| {
168                let dist =
169                    edit_distance_min_alloc(&misspelled_charslice, word, &mut buf_a, &mut buf_b);
170                let lowercase_dist = edit_distance_min_alloc(
171                    &misspelled_charslice_lower,
172                    word,
173                    &mut buf_a,
174                    &mut buf_b,
175                );
176
177                let smaller_dist = dist.min(lowercase_dist);
178                if smaller_dist <= max_distance {
179                    Some((word, smaller_dist))
180                } else {
181                    None
182                }
183            })
184            .sorted_unstable_by_key(|a| a.1)
185            .take(max_results)
186            .map(|(word, edit_distance)| FuzzyMatchResult {
187                word,
188                edit_distance,
189                metadata: self.get_word_metadata(word).unwrap(),
190            })
191            .collect()
192    }
193
194    fn fuzzy_match_str(
195        &self,
196        word: &str,
197        max_distance: u8,
198        max_results: usize,
199    ) -> Vec<FuzzyMatchResult> {
200        let word: Vec<_> = word.chars().collect();
201        self.fuzzy_match(&word, max_distance, max_results)
202    }
203
204    fn words_iter(&self) -> Box<dyn Iterator<Item = &'_ [char]> + Send + '_> {
205        Box::new(
206            self.word_map
207                .iter()
208                .map(|v| v.canonical_spelling.as_slice()),
209        )
210    }
211
212    fn word_count(&self) -> usize {
213        self.word_map.len()
214    }
215
216    fn contains_exact_word(&self, word: &[char]) -> bool {
217        let normalized = word.normalized();
218
219        if let Some(found) = self.word_map.get_with_chars(normalized.as_ref()) {
220            if found.canonical_spelling.as_ref() == normalized.as_ref() {
221                return true;
222            }
223        }
224
225        false
226    }
227
228    fn contains_exact_word_str(&self, word: &str) -> bool {
229        let word: CharString = word.chars().collect();
230        self.contains_exact_word(word.as_ref())
231    }
232
233    fn get_word_from_id(&self, id: &WordId) -> Option<&[char]> {
234        self.word_map.get(id).map(|w| w.canonical_spelling.as_ref())
235    }
236}
237
238impl From<MutableDictionary> for FstDictionary {
239    fn from(dict: MutableDictionary) -> Self {
240        let words = dict
241            .word_map
242            .into_iter()
243            .map(|entry| (entry.canonical_spelling, entry.metadata))
244            .collect();
245
246        FstDictionary::new(words)
247    }
248}
249
#[cfg(test)]
mod tests {
    use hashbrown::HashSet;
    use itertools::Itertools;

    use crate::{Dictionary, MutableDictionary};

    /// No spelling may be yielded twice by the curated dictionary.
    #[test]
    fn curated_contains_no_duplicates() {
        let curated = MutableDictionary::curated();
        assert!(curated.words_iter().all_unique());
    }

    /// Lookups succeed for both the lowercase and the capitalized spelling.
    #[test]
    fn curated_matches_capitalized() {
        let curated = MutableDictionary::curated();
        for spelling in ["this", "This"] {
            assert!(curated.contains_word_str(spelling));
        }
    }

    // TODO "this" is a determiner when used similarly to "the"
    // TODO but when used alone it's a "demonstrative pronoun"
    // TODO Harper previously wrongly classified it as a noun
    // TODO .is_determiner() is not yet implemented
    // #[test]
    // fn this_is_determiner() {
    //     let dict = MutableDictionary::curated();
    //     assert!(dict.get_word_metadata_str("this").unwrap().is_determiner());
    //     assert!(dict.get_word_metadata_str("This").unwrap().is_determiner());
    // }

    #[test]
    fn than_is_conjunction() {
        let curated = MutableDictionary::curated();
        for spelling in ["than", "Than"] {
            let metadata = curated.get_word_metadata_str(spelling).unwrap();
            assert!(metadata.is_conjunction());
        }
    }

    #[test]
    fn herself_is_pronoun() {
        let curated = MutableDictionary::curated();
        for spelling in ["herself", "Herself"] {
            let metadata = curated.get_word_metadata_str(spelling).unwrap();
            assert!(metadata.is_pronoun());
        }
    }

    #[test]
    fn discussion_171() {
        assert!(MutableDictionary::curated().contains_word_str("natively"));
    }

    #[test]
    fn im_is_common() {
        let curated = MutableDictionary::curated();
        let metadata = curated.get_word_metadata_str("I'm").unwrap();
        assert!(metadata.common);
    }

    /// Fuzzy matches must come back in non-decreasing edit-distance order.
    #[test]
    fn fuzzy_result_sorted_by_edit_distance() {
        let curated = MutableDictionary::curated();

        let distances: Vec<_> = curated
            .fuzzy_match_str("hello", 3, 100)
            .iter()
            .map(|fm| fm.edit_distance)
            .collect();

        let is_sorted_by_dist = distances.windows(2).all(|pair| pair[0] <= pair[1]);

        assert!(is_sorted_by_dist)
    }

    #[test]
    fn there_is_not_a_pronoun() {
        let curated = MutableDictionary::curated();
        let there = curated.get_word_metadata_str("there").unwrap();

        assert!(!there.is_nominal());
        assert!(!there.is_pronoun());
    }

    #[test]
    fn expanded_contains_giants() {
        let curated = MutableDictionary::curated();
        assert!(curated.contains_word_str("giants"));
    }

    #[test]
    fn expanded_contains_deallocate() {
        let curated = MutableDictionary::curated();
        assert!(curated.contains_word_str("deallocate"));
    }

    #[test]
    fn curated_contains_repo() {
        let curated = MutableDictionary::curated();

        for spelling in ["repo", "repos", "repo's"] {
            assert!(curated.contains_word_str(spelling));
        }
    }

    #[test]
    fn curated_contains_possessive_abandonment() {
        let curated = MutableDictionary::curated();
        let metadata = curated.get_word_metadata_str("abandonment's").unwrap();

        assert!(metadata.is_possessive_noun())
    }

    #[test]
    fn has_is_not_a_nominal() {
        let curated = MutableDictionary::curated();

        let has = curated.get_word_metadata_str("has");
        assert!(has.is_some());

        let has = has.unwrap();
        assert!(!has.is_nominal())
    }

    #[test]
    fn is_is_linking_verb() {
        let curated = MutableDictionary::curated();

        let is = curated.get_word_metadata_str("is");
        assert!(is.is_some());

        let is = is.unwrap();
        assert!(is.is_linking_verb());
    }

    /// Listing a word once with merged attributes must expand to the same
    /// entries as listing it twice with the attributes spread out.
    #[test]
    fn are_merged_attrs_same_as_spread_attrs() {
        let curated_attr_list = include_str!("../../affixes.json");

        let entries_of = |word_list: &str| {
            MutableDictionary::from_rune_files(word_list, curated_attr_list)
                .unwrap()
                .word_map
                .into_iter()
                .collect::<HashSet<_>>()
        };

        let merged = entries_of("1\nblork/DGS");
        let spread = entries_of("2\nblork/DG\nblork/S");

        assert_eq!(merged, spread);
    }
}