Skip to main content

harper_core/spell/
mod.rs

1//! Contains the relevant code for performing dictionary lookups and spellchecking (i.e. fuzzy
2//! dictionary lookups).
3
4use itertools::Itertools;
5
6use crate::{CharString, CharStringExt, DictWordMetadata};
7
8pub use self::dictionary::Dictionary;
9pub use self::fst_dictionary::FstDictionary;
10pub use self::merged_dictionary::MergedDictionary;
11pub use self::mutable_dictionary::MutableDictionary;
12pub use self::trie_dictionary::TrieDictionary;
13pub use self::word_id::WordId;
14
15mod dictionary;
16mod fst_dictionary;
17mod merged_dictionary;
18mod mutable_dictionary;
19mod rune;
20mod trie_dictionary;
21mod word_id;
22mod word_map;
23
24#[derive(PartialEq, Debug, Hash, Eq)]
25pub struct FuzzyMatchResult<'a> {
26    pub word: &'a [char],
27    pub edit_distance: u8,
28    pub metadata: std::borrow::Cow<'a, DictWordMetadata>,
29}
30
31impl PartialOrd for FuzzyMatchResult<'_> {
32    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
33        self.edit_distance.partial_cmp(&other.edit_distance)
34    }
35}
36
37/// Returns whether the two words are the same, expect that one is written
38/// with 'ou' and the other with 'o'.
39///
40/// E.g. "color" and "colour"
41pub(crate) fn is_ou_misspelling(a: &[char], b: &[char]) -> bool {
42    if a.len().abs_diff(b.len()) != 1 {
43        return false;
44    }
45
46    let mut a_iter = a.iter();
47    let mut b_iter = b.iter();
48
49    loop {
50        match (
51            a_iter.next().map(char::to_ascii_lowercase),
52            b_iter.next().map(char::to_ascii_lowercase),
53        ) {
54            (Some('o'), Some('o')) => {
55                let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
56                let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
57                if a_next != b_next {
58                    if a_next == Some('u') {
59                        a_next = a_iter.next().map(char::to_ascii_lowercase);
60                    } else if b_next == Some('u') {
61                        b_next = b_iter.next().map(char::to_ascii_lowercase);
62                    }
63
64                    if a_next != b_next {
65                        return false;
66                    }
67                }
68            }
69            (Some(a_char), Some(b_char)) => {
70                if !a_char.eq_ignore_ascii_case(&b_char) {
71                    return false;
72                }
73            }
74            (None, None) => return true,
75            _ => return false,
76        }
77    }
78}
79
80/// Returns whether the two words are the same, expect for a single confusion of:
81///
82/// - `s` and `z`. E.g."realize" and "realise"
83/// - `s` and `c`. E.g. "defense" and "defence"
84/// - `k` and `c`. E.g. "skepticism" and "scepticism"
85pub(crate) fn is_cksz_misspelling(a: &[char], b: &[char]) -> bool {
86    if a.len() != b.len() {
87        return false;
88    }
89    if a.is_empty() {
90        return true;
91    }
92
93    // the first character must be the same
94    if !a[0].eq_ignore_ascii_case(&b[0]) {
95        return false;
96    }
97
98    let mut found = false;
99    for (a_char, b_char) in a.iter().copied().zip(b.iter().copied()) {
100        let a_char = a_char.to_ascii_lowercase();
101        let b_char = b_char.to_ascii_lowercase();
102
103        if a_char != b_char {
104            if (a_char == 's' && b_char == 'z')
105                || (a_char == 'z' && b_char == 's')
106                || (a_char == 's' && b_char == 'c')
107                || (a_char == 'c' && b_char == 's')
108                || (a_char == 'k' && b_char == 'c')
109                || (a_char == 'c' && b_char == 'k')
110            {
111                if found {
112                    return false;
113                }
114                found = true;
115            } else {
116                return false;
117            }
118        }
119    }
120
121    found
122}
123
124/// Returns whether the two words are the same, expect that one is written
125/// with '-er' and the other with '-re'.
126///
127/// E.g. "meter" and "metre"
128pub(crate) fn is_er_misspelling(a: &[char], b: &[char]) -> bool {
129    if a.len() != b.len() || a.len() <= 4 {
130        return false;
131    }
132
133    let len = a.len();
134    let a_suffix = [&a[len - 2], &a[len - 1]].map(char::to_ascii_lowercase);
135    let b_suffix = [&b[len - 2], &b[len - 1]].map(char::to_ascii_lowercase);
136
137    if a_suffix == ['r', 'e'] && b_suffix == ['e', 'r']
138        || a_suffix == ['e', 'r'] && b_suffix == ['r', 'e']
139    {
140        return a[0..len - 2]
141            .iter()
142            .copied()
143            .zip(b[0..len - 2].iter().copied())
144            .all(|(a_char, b_char)| a_char.eq_ignore_ascii_case(&b_char));
145    }
146
147    false
148}
149
150/// Returns whether the two words are the same, expect that one is written
151/// with 'll' and the other with 'l'.
152///
153/// E.g. "traveller" and "traveler"
154pub(crate) fn is_ll_misspelling(a: &[char], b: &[char]) -> bool {
155    if a.len().abs_diff(b.len()) != 1 {
156        return false;
157    }
158
159    let mut a_iter = a.iter();
160    let mut b_iter = b.iter();
161
162    loop {
163        match (
164            a_iter.next().map(char::to_ascii_lowercase),
165            b_iter.next().map(char::to_ascii_lowercase),
166        ) {
167            (Some('l'), Some('l')) => {
168                let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
169                let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
170                if a_next != b_next {
171                    if a_next == Some('l') {
172                        a_next = a_iter.next().map(char::to_ascii_lowercase);
173                    } else if b_next == Some('l') {
174                        b_next = b_iter.next().map(char::to_ascii_lowercase);
175                    }
176
177                    if a_next != b_next {
178                        return false;
179                    }
180                }
181            }
182            (Some(a_char), Some(b_char)) => {
183                if !a_char.eq_ignore_ascii_case(&b_char) {
184                    return false;
185                }
186            }
187            (None, None) => return true,
188            _ => return false,
189        }
190    }
191}
192
193pub fn is_th_h_missing(a: &[char], b: &[char]) -> bool {
194    a.iter().any(|c| c.eq_ignore_ascii_case(&'t'))
195        && b.iter()
196            .tuple_windows()
197            .any(|(a, b)| a.eq_ignore_ascii_case(&'t') && b.eq_ignore_ascii_case(&'h'))
198}
199
200/// Returns whether the two words are the same, except that one is written
201/// with 'ay' and the other with 'ey'.
202///
203/// E.g. "gray" and "grey"
204pub(crate) fn is_ay_ey_misspelling(a: &[char], b: &[char]) -> bool {
205    if a.len() != b.len() {
206        return false;
207    }
208
209    let mut found_ay_ey = false;
210    let mut a_iter = a.iter();
211    let mut b_iter = b.iter();
212
213    while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
214        if a_char.eq_ignore_ascii_case(&b_char) {
215            continue;
216        }
217
218        // Check for 'a'/'e' difference
219        if (a_char.eq_ignore_ascii_case(&'a') && b_char.eq_ignore_ascii_case(&'e'))
220            || (a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'a'))
221        {
222            // Check if next character is 'y' for both
223            if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
224                && a_next.eq_ignore_ascii_case(&'y')
225                && b_next.eq_ignore_ascii_case(&'y')
226            {
227                if found_ay_ey {
228                    return false; // More than one ay/ey difference
229                }
230                found_ay_ey = true;
231                continue;
232            }
233        }
234        return false; // Non-ay/ey difference found
235    }
236
237    if !found_ay_ey {
238        return false;
239    }
240    found_ay_ey
241}
242
243/// Returns whether the two words are the same, except that one is written
244/// with 'ei' and the other with 'ie'.
245///
246/// E.g. "recieved" instead of "received", "cheif" instead of "chief"
247pub(crate) fn is_ei_ie_misspelling(a: &[char], b: &[char]) -> bool {
248    if a.len() != b.len() {
249        return false;
250    }
251    let mut found_ei_ie = false;
252    let mut a_iter = a.iter();
253    let mut b_iter = b.iter();
254
255    while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
256        if a_char.eq_ignore_ascii_case(&b_char) {
257            continue;
258        }
259
260        // Check for 'e' vs 'i' in first position
261        if a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'i') {
262            if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next()) {
263                // Next chars must be 'i' and 'e' respectively
264                if a_next.eq_ignore_ascii_case(&'i') && b_next.eq_ignore_ascii_case(&'e') {
265                    if found_ei_ie {
266                        return false; // More than one ei/ie difference
267                    }
268                    found_ei_ie = true;
269                    continue;
270                }
271            }
272        }
273        // Check for 'i' vs 'e' in first position
274        else if a_char.eq_ignore_ascii_case(&'i')
275            && b_char.eq_ignore_ascii_case(&'e')
276            && let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
277        {
278            // Next chars must be 'e' and 'i' respectively
279            if a_next.eq_ignore_ascii_case(&'e') && b_next.eq_ignore_ascii_case(&'i') {
280                if found_ei_ie {
281                    return false; // More than one ei/ie difference
282                }
283                found_ei_ie = true;
284                continue;
285            }
286        }
287        return false;
288    }
289    found_ei_ie
290}
291
292/// Scores a possible spelling suggestion based on possible relevance to the user.
293///
294/// Lower = better.
295fn score_suggestion(misspelled_word: &[char], sug: &FuzzyMatchResult) -> i32 {
296    if misspelled_word.is_empty() || sug.word.is_empty() {
297        return i32::MAX;
298    }
299
300    let mut score = sug.edit_distance as i32 * 10;
301
302    // People are much less likely to mistype the first letter.
303    if misspelled_word
304        .first()
305        .unwrap()
306        .eq_ignore_ascii_case(sug.word.first().unwrap())
307    {
308        score -= 10;
309    }
310
311    // If the original word is plural, the correct one probably is too.
312    if *misspelled_word.last().unwrap() == 's' && *sug.word.last().unwrap() == 's' {
313        score -= 5;
314    }
315
316    // Promote suggestions that differ only by an apostrophe
317    let check_apostrophe_diff = |longer: &[char], shorter: &[char]| -> bool {
318        if let Some(pos) = longer.iter().position(|&c| c == '\'' || c == '’') {
319            longer.len() - 1 == shorter.len()
320                && longer.starts_with(&shorter[..pos])
321                && longer.ends_with(&shorter[pos..])
322        } else {
323            false
324        }
325    };
326
327    match (
328        misspelled_word.len() as i32 - sug.word.len() as i32,
329        sug.metadata.is_apostrophized(),
330    ) {
331        (1, _)
332            if (misspelled_word.contains(&'\'') || misspelled_word.contains(&'\u{2019}'))
333                && check_apostrophe_diff(misspelled_word, sug.word) =>
334        {
335            score -= 8
336        }
337        (-1, true) if check_apostrophe_diff(sug.word, misspelled_word) => score -= 8,
338        _ => {} // not a single-character apostrophe difference
339    }
340
341    // Boost common words.
342    if sug.metadata.common && sug.metadata.derived_from.is_none() {
343        score -= 4;
344    }
345
346    // For turning words into contractions.
347    if sug.word.iter().filter(|c| **c == '\'').count() == 1 {
348        score -= 5;
349    }
350
351    if is_th_h_missing(misspelled_word, sug.word) {
352        score -= 6;
353    }
354
355    if !misspelled_word.contains_vowel() && !sug.word.contains_vowel() {
356        score += 10;
357    }
358
359    // Detect dialect-specific variations
360    if sug.edit_distance == 1
361        && (is_cksz_misspelling(misspelled_word, sug.word)
362            || is_ou_misspelling(misspelled_word, sug.word)
363            || is_ll_misspelling(misspelled_word, sug.word)
364            || is_ay_ey_misspelling(misspelled_word, sug.word)
365            || is_th_h_missing(misspelled_word, sug.word))
366    {
367        score -= 6;
368    }
369
370    if sug.edit_distance <= 2 {
371        if is_ei_ie_misspelling(misspelled_word, sug.word) {
372            score -= 11;
373        }
374        if is_er_misspelling(misspelled_word, sug.word) {
375            score -= 15;
376        }
377    }
378
379    score
380}
381
382/// Order the suggestions to be shown to the user.
383fn order_suggestions<'b>(
384    misspelled_word: &[char],
385    mut matches: Vec<FuzzyMatchResult<'b>>,
386) -> Vec<&'b [char]> {
387    matches.sort_by_cached_key(|v| score_suggestion(misspelled_word, v));
388
389    matches.into_iter().map(|v| v.word).collect()
390}
391
392/// Get the closest matches in the provided [`Dictionary`] and rank them
393/// Implementation is left up to the underlying dictionary.
394pub fn suggest_correct_spelling<'a>(
395    misspelled_word: &[char],
396    result_limit: usize,
397    max_edit_dist: u8,
398    dictionary: &'a impl Dictionary,
399) -> Vec<&'a [char]> {
400    let matches: Vec<FuzzyMatchResult> = dictionary
401        .fuzzy_match(misspelled_word, max_edit_dist, result_limit)
402        .into_iter()
403        .collect();
404
405    order_suggestions(misspelled_word, matches)
406}
407
408/// Convenience function over [`suggest_correct_spelling`] that does conversions
409/// for you.
410pub fn suggest_correct_spelling_str(
411    misspelled_word: impl Into<String>,
412    result_limit: usize,
413    max_edit_dist: u8,
414    dictionary: &impl Dictionary,
415) -> Vec<String> {
416    let chars: CharString = misspelled_word.into().chars().collect();
417    suggest_correct_spelling(&chars, result_limit, max_edit_dist, dictionary)
418        .into_iter()
419        .map(|a| a.to_string())
420        .collect()
421}
422
423#[cfg(test)]
424mod tests {
425    use itertools::Itertools;
426
427    use crate::CharStringExt;
428
429    use super::{FstDictionary, suggest_correct_spelling_str};
430
431    const RESULT_LIMIT: usize = 200;
432    const MAX_EDIT_DIST: u8 = 2;
433
434    #[test]
435    fn normalizes_weve() {
436        let word = ['w', 'e', '’', 'v', 'e'];
437        let norm = word.normalized();
438
439        assert_eq!(norm.clone(), vec!['w', 'e', '\'', 'v', 'e'])
440    }
441
442    #[test]
443    fn punctation_no_duplicates() {
444        let results = suggest_correct_spelling_str(
445            "punctation",
446            RESULT_LIMIT,
447            MAX_EDIT_DIST,
448            &FstDictionary::curated(),
449        );
450
451        assert!(results.iter().all_unique())
452    }
453
454    #[test]
455    fn youre_contraction() {
456        assert_suggests_correction("youre", "you're");
457    }
458
459    #[test]
460    fn thats_contraction() {
461        assert_suggests_correction("thats", "that's");
462    }
463
464    #[test]
465    fn weve_contraction() {
466        assert_suggests_correction("weve", "we've");
467    }
468
469    #[test]
470    fn this_correction() {
471        assert_suggests_correction("ths", "this");
472    }
473
474    #[test]
475    fn issue_624_no_duplicates() {
476        let results = suggest_correct_spelling_str(
477            "Semantical",
478            RESULT_LIMIT,
479            MAX_EDIT_DIST,
480            &FstDictionary::curated(),
481        );
482
483        assert!(results.iter().all_unique())
484    }
485
486    #[test]
487    fn issue_182() {
488        assert_suggests_correction("Im", "I'm");
489    }
490
491    #[test]
492    fn fst_spellcheck_hvllo() {
493        let results = suggest_correct_spelling_str(
494            "hvllo",
495            RESULT_LIMIT,
496            MAX_EDIT_DIST,
497            &FstDictionary::curated(),
498        );
499
500        assert!(results.iter().take(3).contains(&"hello".to_string()));
501    }
502
503    /// Assert that the default suggestion settings result in a specific word
504    /// being in the top three results for a given misspelling.
505    #[track_caller]
506    fn assert_suggests_correction(misspelled_word: &str, correct: &str) {
507        let results = suggest_correct_spelling_str(
508            misspelled_word,
509            RESULT_LIMIT,
510            MAX_EDIT_DIST,
511            &FstDictionary::curated(),
512        );
513
514        dbg!(&results);
515
516        assert!(results.iter().take(3).contains(&correct.to_string()));
517    }
518
519    #[test]
520    fn spellcheck_hvllo() {
521        assert_suggests_correction("hvllo", "hello");
522    }
523
524    #[test]
525    fn spellcheck_aout() {
526        assert_suggests_correction("aout", "about");
527    }
528
529    #[test]
530    fn spellchecking_is_deterministic() {
531        let results1 = suggest_correct_spelling_str(
532            "hello",
533            RESULT_LIMIT,
534            MAX_EDIT_DIST,
535            &FstDictionary::curated(),
536        );
537        let results2 = suggest_correct_spelling_str(
538            "hello",
539            RESULT_LIMIT,
540            MAX_EDIT_DIST,
541            &FstDictionary::curated(),
542        );
543        let results3 = suggest_correct_spelling_str(
544            "hello",
545            RESULT_LIMIT,
546            MAX_EDIT_DIST,
547            &FstDictionary::curated(),
548        );
549
550        assert_eq!(results1, results2);
551        assert_eq!(results1, results3);
552    }
553
554    #[test]
555    fn adviced_correction() {
556        assert_suggests_correction("adviced", "advised");
557    }
558
559    #[test]
560    fn aknowledged_correction() {
561        assert_suggests_correction("aknowledged", "acknowledged");
562    }
563
564    #[test]
565    fn alcaholic_correction() {
566        assert_suggests_correction("alcaholic", "alcoholic");
567    }
568
569    #[test]
570    fn slaves_correction() {
571        assert_suggests_correction("Slaves", "Slavs");
572    }
573
574    #[test]
575    fn conciousness_correction() {
576        assert_suggests_correction("conciousness", "consciousness");
577    }
578
579    #[test]
580    fn v_apostrophe_s_suggests_vs() {
581        assert_suggests_correction("v's", "vs");
582    }
583
584    #[test]
585    fn v_apostrophe_typographical_s_suggests_vs() {
586        assert_suggests_correction("v’s", "vs");
587    }
588
589    #[test]
590    fn missing_apostrophe_childrens_suggests_childrens() {
591        assert_suggests_correction("childrens", "children's");
592    }
593}