harper_core/spell/
mod.rs

1use crate::{CharString, CharStringExt, WordMetadata};
2
3pub use self::dictionary::Dictionary;
4pub use self::fst_dictionary::FstDictionary;
5pub use self::merged_dictionary::MergedDictionary;
6pub use self::mutable_dictionary::MutableDictionary;
7pub use self::word_id::WordId;
8
9mod dictionary;
10mod fst_dictionary;
11mod merged_dictionary;
12mod mutable_dictionary;
13mod rune;
14mod word_id;
15mod word_map;
16
17#[derive(PartialEq, Debug, Hash, Eq)]
18pub struct FuzzyMatchResult<'a> {
19    pub word: &'a [char],
20    pub edit_distance: u8,
21    pub metadata: &'a WordMetadata,
22}
23
24impl PartialOrd for FuzzyMatchResult<'_> {
25    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
26        self.edit_distance.partial_cmp(&other.edit_distance)
27    }
28}
29
30/// Returns whether the two words are the same, expect that one is written
31/// with 'ou' and the other with 'o'.
32///
33/// E.g. "color" and "colour"
34pub(crate) fn is_ou_misspelling(a: &[char], b: &[char]) -> bool {
35    if a.len().abs_diff(b.len()) != 1 {
36        return false;
37    }
38
39    let mut a_iter = a.iter();
40    let mut b_iter = b.iter();
41
42    loop {
43        match (
44            a_iter.next().map(char::to_ascii_lowercase),
45            b_iter.next().map(char::to_ascii_lowercase),
46        ) {
47            (Some('o'), Some('o')) => {
48                let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
49                let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
50                if a_next != b_next {
51                    if a_next == Some('u') {
52                        a_next = a_iter.next().map(char::to_ascii_lowercase);
53                    } else if b_next == Some('u') {
54                        b_next = b_iter.next().map(char::to_ascii_lowercase);
55                    }
56
57                    if a_next != b_next {
58                        return false;
59                    }
60                }
61            }
62            (Some(a_char), Some(b_char)) => {
63                if !a_char.eq_ignore_ascii_case(&b_char) {
64                    return false;
65                }
66            }
67            (None, None) => return true,
68            _ => return false,
69        }
70    }
71}
72
73/// Returns whether the two words are the same, expect for a single confusion of:
74///
75/// - `s` and `z`. E.g."realize" and "realise"
76/// - `s` and `c`. E.g. "defense" and "defence"
77/// - `k` and `c`. E.g. "skepticism" and "scepticism"
78pub(crate) fn is_cksz_misspelling(a: &[char], b: &[char]) -> bool {
79    if a.len() != b.len() {
80        return false;
81    }
82    if a.is_empty() {
83        return true;
84    }
85
86    // the first character must be the same
87    if !a[0].eq_ignore_ascii_case(&b[0]) {
88        return false;
89    }
90
91    let mut found = false;
92    for (a_char, b_char) in a.iter().copied().zip(b.iter().copied()) {
93        let a_char = a_char.to_ascii_lowercase();
94        let b_char = b_char.to_ascii_lowercase();
95
96        if a_char != b_char {
97            if (a_char == 's' && b_char == 'z')
98                || (a_char == 'z' && b_char == 's')
99                || (a_char == 's' && b_char == 'c')
100                || (a_char == 'c' && b_char == 's')
101                || (a_char == 'k' && b_char == 'c')
102                || (a_char == 'c' && b_char == 'k')
103            {
104                if found {
105                    return false;
106                }
107                found = true;
108            } else {
109                return false;
110            }
111        }
112    }
113
114    found
115}
116
117/// Returns whether the two words are the same, expect that one is written
118/// with '-er' and the other with '-re'.
119///
120/// E.g. "meter" and "metre"
121pub(crate) fn is_er_misspelling(a: &[char], b: &[char]) -> bool {
122    if a.len() != b.len() || a.len() <= 4 {
123        return false;
124    }
125
126    let len = a.len();
127    let a_suffix = [&a[len - 2], &a[len - 1]].map(char::to_ascii_lowercase);
128    let b_suffix = [&b[len - 2], &b[len - 1]].map(char::to_ascii_lowercase);
129
130    if a_suffix == ['r', 'e'] && b_suffix == ['e', 'r']
131        || a_suffix == ['e', 'r'] && b_suffix == ['r', 'e']
132    {
133        return a[0..len - 2]
134            .iter()
135            .copied()
136            .zip(b[0..len - 2].iter().copied())
137            .all(|(a_char, b_char)| a_char.eq_ignore_ascii_case(&b_char));
138    }
139
140    false
141}
142
143/// Returns whether the two words are the same, expect that one is written
144/// with 'll' and the other with 'l'.
145///
146/// E.g. "traveller" and "traveler"
147pub(crate) fn is_ll_misspelling(a: &[char], b: &[char]) -> bool {
148    if a.len().abs_diff(b.len()) != 1 {
149        return false;
150    }
151
152    let mut a_iter = a.iter();
153    let mut b_iter = b.iter();
154
155    loop {
156        match (
157            a_iter.next().map(char::to_ascii_lowercase),
158            b_iter.next().map(char::to_ascii_lowercase),
159        ) {
160            (Some('l'), Some('l')) => {
161                let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
162                let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
163                if a_next != b_next {
164                    if a_next == Some('l') {
165                        a_next = a_iter.next().map(char::to_ascii_lowercase);
166                    } else if b_next == Some('l') {
167                        b_next = b_iter.next().map(char::to_ascii_lowercase);
168                    }
169
170                    if a_next != b_next {
171                        return false;
172                    }
173                }
174            }
175            (Some(a_char), Some(b_char)) => {
176                if !a_char.eq_ignore_ascii_case(&b_char) {
177                    return false;
178                }
179            }
180            (None, None) => return true,
181            _ => return false,
182        }
183    }
184}
185
186/// Scores a possible spelling suggestion based on possible relevance to the user.
187///
188/// Lower = better.
189fn score_suggestion(misspelled_word: &[char], sug: &FuzzyMatchResult) -> i32 {
190    if misspelled_word.is_empty() || sug.word.is_empty() {
191        return i32::MAX;
192    }
193
194    let mut score = sug.edit_distance as i32 * 10;
195
196    // People are much less likely to mistype the first letter.
197    if misspelled_word
198        .first()
199        .unwrap()
200        .eq_ignore_ascii_case(sug.word.first().unwrap())
201    {
202        score -= 10;
203    }
204
205    // If the original word is plural, the correct one probably is too.
206    if *misspelled_word.last().unwrap() == 's' && *sug.word.last().unwrap() == 's' {
207        score -= 5;
208    }
209
210    // Boost common words.
211    if sug.metadata.common {
212        score -= 5;
213    }
214
215    // For turning words into contractions.
216    if sug.word.iter().filter(|c| **c == '\'').count() == 1 {
217        score -= 5;
218    }
219
220    // Detect dialect-specific variations
221    if sug.edit_distance == 1
222        && (is_cksz_misspelling(misspelled_word, sug.word)
223            || is_ou_misspelling(misspelled_word, sug.word)
224            || is_ll_misspelling(misspelled_word, sug.word))
225    {
226        score -= 5;
227    }
228    if sug.edit_distance == 2 && is_er_misspelling(misspelled_word, sug.word) {
229        score -= 15;
230    }
231
232    score
233}
234
235/// Order the suggestions to be shown to the user.
236fn order_suggestions<'b>(
237    misspelled_word: &[char],
238    mut matches: Vec<FuzzyMatchResult<'b>>,
239) -> Vec<&'b [char]> {
240    matches.sort_by_key(|v| score_suggestion(misspelled_word, v));
241
242    matches.into_iter().map(|v| v.word).collect()
243}
244
245/// Get the closest matches in the provided [`Dictionary`] and rank them
246/// Implementation is left up to the underlying dictionary.
247pub fn suggest_correct_spelling<'a>(
248    misspelled_word: &[char],
249    result_limit: usize,
250    max_edit_dist: u8,
251    dictionary: &'a impl Dictionary,
252) -> Vec<&'a [char]> {
253    let matches: Vec<FuzzyMatchResult> = dictionary
254        .fuzzy_match(misspelled_word, max_edit_dist, result_limit)
255        .into_iter()
256        .collect();
257
258    order_suggestions(misspelled_word, matches)
259}
260
261/// Convenience function over [`suggest_correct_spelling`] that does conversions
262/// for you.
263pub fn suggest_correct_spelling_str(
264    misspelled_word: impl Into<String>,
265    result_limit: usize,
266    max_edit_dist: u8,
267    dictionary: &impl Dictionary,
268) -> Vec<String> {
269    let chars: CharString = misspelled_word.into().chars().collect();
270    suggest_correct_spelling(&chars, result_limit, max_edit_dist, dictionary)
271        .into_iter()
272        .map(|a| a.to_string())
273        .collect()
274}
275
276#[cfg(test)]
277mod tests {
278    use itertools::Itertools;
279
280    use crate::CharStringExt;
281
282    use super::{FstDictionary, suggest_correct_spelling_str};
283
284    const RESULT_LIMIT: usize = 100;
285    const MAX_EDIT_DIST: u8 = 2;
286
287    #[test]
288    fn normalizes_weve() {
289        let word = ['w', 'e', '’', 'v', 'e'];
290        let norm = word.normalized();
291
292        assert_eq!(norm.clone(), vec!['w', 'e', '\'', 'v', 'e'])
293    }
294
295    #[test]
296    fn punctation_no_duplicates() {
297        let results = suggest_correct_spelling_str(
298            "punctation",
299            RESULT_LIMIT,
300            MAX_EDIT_DIST,
301            &FstDictionary::curated(),
302        );
303
304        assert!(results.iter().all_unique())
305    }
306
307    #[test]
308    fn youre_contraction() {
309        assert_suggests_correction("youre", "you're");
310    }
311
312    #[test]
313    fn thats_contraction() {
314        assert_suggests_correction("thats", "that's");
315    }
316
317    #[test]
318    fn weve_contraction() {
319        assert_suggests_correction("weve", "we've");
320    }
321
322    #[test]
323    fn this_correction() {
324        assert_suggests_correction("ths", "this");
325    }
326
327    #[test]
328    fn issue_624_no_duplicates() {
329        let results = suggest_correct_spelling_str(
330            "Semantical",
331            RESULT_LIMIT,
332            MAX_EDIT_DIST,
333            &FstDictionary::curated(),
334        );
335
336        dbg!(&results);
337
338        assert!(results.iter().all_unique())
339    }
340
341    #[test]
342    fn issue_182() {
343        assert_suggests_correction("Im", "I'm");
344    }
345
346    #[test]
347    fn fst_spellcheck_hvllo() {
348        let results = suggest_correct_spelling_str(
349            "hvllo",
350            RESULT_LIMIT,
351            MAX_EDIT_DIST,
352            &FstDictionary::curated(),
353        );
354
355        dbg!(&results);
356
357        assert!(results.iter().take(3).contains(&"hello".to_string()));
358    }
359
360    /// Assert that the default suggestion settings result in a specific word
361    /// being in the top three results for a given misspelling.
362    #[track_caller]
363    fn assert_suggests_correction(misspelled_word: &str, correct: &str) {
364        let results = suggest_correct_spelling_str(
365            misspelled_word,
366            RESULT_LIMIT,
367            MAX_EDIT_DIST,
368            &FstDictionary::curated(),
369        );
370
371        dbg!(&results);
372
373        assert!(results.iter().take(3).contains(&correct.to_string()));
374    }
375
376    #[test]
377    fn spellcheck_hvllo() {
378        assert_suggests_correction("hvllo", "hello");
379    }
380
381    #[test]
382    fn spellcheck_aout() {
383        assert_suggests_correction("aout", "about");
384    }
385
386    #[test]
387    fn spellchecking_is_deterministic() {
388        let results1 = suggest_correct_spelling_str(
389            "hello",
390            RESULT_LIMIT,
391            MAX_EDIT_DIST,
392            &FstDictionary::curated(),
393        );
394        let results2 = suggest_correct_spelling_str(
395            "hello",
396            RESULT_LIMIT,
397            MAX_EDIT_DIST,
398            &FstDictionary::curated(),
399        );
400        let results3 = suggest_correct_spelling_str(
401            "hello",
402            RESULT_LIMIT,
403            MAX_EDIT_DIST,
404            &FstDictionary::curated(),
405        );
406
407        assert_eq!(results1, results2);
408        assert_eq!(results1, results3);
409    }
410
411    #[test]
412    fn adviced_correction() {
413        assert_suggests_correction("adviced", "advised");
414    }
415
416    #[test]
417    fn aknowledged_correction() {
418        assert_suggests_correction("aknowledged", "acknowledged");
419    }
420
421    #[test]
422    fn alcaholic_correction() {
423        assert_suggests_correction("alcaholic", "alcoholic");
424    }
425
426    #[test]
427    fn slaves_correction() {
428        assert_suggests_correction("Slaves", "Slavs");
429    }
430
431    #[test]
432    fn conciousness_correction() {
433        assert_suggests_correction("conciousness", "consciousness");
434    }
435}