1use itertools::Itertools;
5
6use crate::{CharString, CharStringExt, DictWordMetadata};
7
8pub use self::dictionary::Dictionary;
9pub use self::fst_dictionary::FstDictionary;
10pub use self::merged_dictionary::MergedDictionary;
11pub use self::mutable_dictionary::MutableDictionary;
12pub use self::trie_dictionary::TrieDictionary;
13pub use self::word_id::WordId;
14
15mod dictionary;
16mod fst_dictionary;
17mod merged_dictionary;
18mod mutable_dictionary;
19mod rune;
20mod trie_dictionary;
21mod word_id;
22mod word_map;
23
24#[derive(PartialEq, Debug, Hash, Eq)]
25pub struct FuzzyMatchResult<'a> {
26 pub word: &'a [char],
27 pub edit_distance: u8,
28 pub metadata: std::borrow::Cow<'a, DictWordMetadata>,
29}
30
31impl PartialOrd for FuzzyMatchResult<'_> {
32 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
33 self.edit_distance.partial_cmp(&other.edit_distance)
34 }
35}
36
37pub(crate) fn is_ou_misspelling(a: &[char], b: &[char]) -> bool {
42 if a.len().abs_diff(b.len()) != 1 {
43 return false;
44 }
45
46 let mut a_iter = a.iter();
47 let mut b_iter = b.iter();
48
49 loop {
50 match (
51 a_iter.next().map(char::to_ascii_lowercase),
52 b_iter.next().map(char::to_ascii_lowercase),
53 ) {
54 (Some('o'), Some('o')) => {
55 let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
56 let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
57 if a_next != b_next {
58 if a_next == Some('u') {
59 a_next = a_iter.next().map(char::to_ascii_lowercase);
60 } else if b_next == Some('u') {
61 b_next = b_iter.next().map(char::to_ascii_lowercase);
62 }
63
64 if a_next != b_next {
65 return false;
66 }
67 }
68 }
69 (Some(a_char), Some(b_char)) => {
70 if !a_char.eq_ignore_ascii_case(&b_char) {
71 return false;
72 }
73 }
74 (None, None) => return true,
75 _ => return false,
76 }
77 }
78}
79
80pub(crate) fn is_cksz_misspelling(a: &[char], b: &[char]) -> bool {
86 if a.len() != b.len() {
87 return false;
88 }
89 if a.is_empty() {
90 return true;
91 }
92
93 if !a[0].eq_ignore_ascii_case(&b[0]) {
95 return false;
96 }
97
98 let mut found = false;
99 for (a_char, b_char) in a.iter().copied().zip(b.iter().copied()) {
100 let a_char = a_char.to_ascii_lowercase();
101 let b_char = b_char.to_ascii_lowercase();
102
103 if a_char != b_char {
104 if (a_char == 's' && b_char == 'z')
105 || (a_char == 'z' && b_char == 's')
106 || (a_char == 's' && b_char == 'c')
107 || (a_char == 'c' && b_char == 's')
108 || (a_char == 'k' && b_char == 'c')
109 || (a_char == 'c' && b_char == 'k')
110 {
111 if found {
112 return false;
113 }
114 found = true;
115 } else {
116 return false;
117 }
118 }
119 }
120
121 found
122}
123
124pub(crate) fn is_er_misspelling(a: &[char], b: &[char]) -> bool {
129 if a.len() != b.len() || a.len() <= 4 {
130 return false;
131 }
132
133 let len = a.len();
134 let a_suffix = [&a[len - 2], &a[len - 1]].map(char::to_ascii_lowercase);
135 let b_suffix = [&b[len - 2], &b[len - 1]].map(char::to_ascii_lowercase);
136
137 if a_suffix == ['r', 'e'] && b_suffix == ['e', 'r']
138 || a_suffix == ['e', 'r'] && b_suffix == ['r', 'e']
139 {
140 return a[0..len - 2]
141 .iter()
142 .copied()
143 .zip(b[0..len - 2].iter().copied())
144 .all(|(a_char, b_char)| a_char.eq_ignore_ascii_case(&b_char));
145 }
146
147 false
148}
149
150pub(crate) fn is_ll_misspelling(a: &[char], b: &[char]) -> bool {
155 if a.len().abs_diff(b.len()) != 1 {
156 return false;
157 }
158
159 let mut a_iter = a.iter();
160 let mut b_iter = b.iter();
161
162 loop {
163 match (
164 a_iter.next().map(char::to_ascii_lowercase),
165 b_iter.next().map(char::to_ascii_lowercase),
166 ) {
167 (Some('l'), Some('l')) => {
168 let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
169 let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
170 if a_next != b_next {
171 if a_next == Some('l') {
172 a_next = a_iter.next().map(char::to_ascii_lowercase);
173 } else if b_next == Some('l') {
174 b_next = b_iter.next().map(char::to_ascii_lowercase);
175 }
176
177 if a_next != b_next {
178 return false;
179 }
180 }
181 }
182 (Some(a_char), Some(b_char)) => {
183 if !a_char.eq_ignore_ascii_case(&b_char) {
184 return false;
185 }
186 }
187 (None, None) => return true,
188 _ => return false,
189 }
190 }
191}
192
193pub fn is_th_h_missing(a: &[char], b: &[char]) -> bool {
194 a.iter().any(|c| c.eq_ignore_ascii_case(&'t'))
195 && b.iter()
196 .tuple_windows()
197 .any(|(a, b)| a.eq_ignore_ascii_case(&'t') && b.eq_ignore_ascii_case(&'h'))
198}
199
200pub(crate) fn is_ay_ey_misspelling(a: &[char], b: &[char]) -> bool {
205 if a.len() != b.len() {
206 return false;
207 }
208
209 let mut found_ay_ey = false;
210 let mut a_iter = a.iter();
211 let mut b_iter = b.iter();
212
213 while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
214 if a_char.eq_ignore_ascii_case(&b_char) {
215 continue;
216 }
217
218 if (a_char.eq_ignore_ascii_case(&'a') && b_char.eq_ignore_ascii_case(&'e'))
220 || (a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'a'))
221 {
222 if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
224 && a_next.eq_ignore_ascii_case(&'y')
225 && b_next.eq_ignore_ascii_case(&'y')
226 {
227 if found_ay_ey {
228 return false; }
230 found_ay_ey = true;
231 continue;
232 }
233 }
234 return false; }
236
237 if !found_ay_ey {
238 return false;
239 }
240 found_ay_ey
241}
242
243pub(crate) fn is_ei_ie_misspelling(a: &[char], b: &[char]) -> bool {
248 if a.len() != b.len() {
249 return false;
250 }
251 let mut found_ei_ie = false;
252 let mut a_iter = a.iter();
253 let mut b_iter = b.iter();
254
255 while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
256 if a_char.eq_ignore_ascii_case(&b_char) {
257 continue;
258 }
259
260 if a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'i') {
262 if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next()) {
263 if a_next.eq_ignore_ascii_case(&'i') && b_next.eq_ignore_ascii_case(&'e') {
265 if found_ei_ie {
266 return false; }
268 found_ei_ie = true;
269 continue;
270 }
271 }
272 }
273 else if a_char.eq_ignore_ascii_case(&'i')
275 && b_char.eq_ignore_ascii_case(&'e')
276 && let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
277 {
278 if a_next.eq_ignore_ascii_case(&'e') && b_next.eq_ignore_ascii_case(&'i') {
280 if found_ei_ie {
281 return false; }
283 found_ei_ie = true;
284 continue;
285 }
286 }
287 return false;
288 }
289 found_ei_ie
290}
291
292fn score_suggestion(misspelled_word: &[char], sug: &FuzzyMatchResult) -> i32 {
296 if misspelled_word.is_empty() || sug.word.is_empty() {
297 return i32::MAX;
298 }
299
300 let mut score = sug.edit_distance as i32 * 10;
301
302 if misspelled_word
304 .first()
305 .unwrap()
306 .eq_ignore_ascii_case(sug.word.first().unwrap())
307 {
308 score -= 10;
309 }
310
311 if *misspelled_word.last().unwrap() == 's' && *sug.word.last().unwrap() == 's' {
313 score -= 5;
314 }
315
316 let check_apostrophe_diff = |longer: &[char], shorter: &[char]| -> bool {
318 if let Some(pos) = longer.iter().position(|&c| c == '\'' || c == 'ā') {
319 longer.len() - 1 == shorter.len()
320 && longer.starts_with(&shorter[..pos])
321 && longer.ends_with(&shorter[pos..])
322 } else {
323 false
324 }
325 };
326
327 match (
328 misspelled_word.len() as i32 - sug.word.len() as i32,
329 sug.metadata.is_apostrophized(),
330 ) {
331 (1, _)
332 if (misspelled_word.contains(&'\'') || misspelled_word.contains(&'\u{2019}'))
333 && check_apostrophe_diff(misspelled_word, sug.word) =>
334 {
335 score -= 8
336 }
337 (-1, true) if check_apostrophe_diff(sug.word, misspelled_word) => score -= 8,
338 _ => {} }
340
341 if sug.metadata.common && sug.metadata.derived_from.is_none() {
343 score -= 4;
344 }
345
346 if sug.word.iter().filter(|c| **c == '\'').count() == 1 {
348 score -= 5;
349 }
350
351 if is_th_h_missing(misspelled_word, sug.word) {
352 score -= 6;
353 }
354
355 if !misspelled_word.contains_vowel() && !sug.word.contains_vowel() {
356 score += 10;
357 }
358
359 if sug.edit_distance == 1
361 && (is_cksz_misspelling(misspelled_word, sug.word)
362 || is_ou_misspelling(misspelled_word, sug.word)
363 || is_ll_misspelling(misspelled_word, sug.word)
364 || is_ay_ey_misspelling(misspelled_word, sug.word)
365 || is_th_h_missing(misspelled_word, sug.word))
366 {
367 score -= 6;
368 }
369
370 if sug.edit_distance <= 2 {
371 if is_ei_ie_misspelling(misspelled_word, sug.word) {
372 score -= 11;
373 }
374 if is_er_misspelling(misspelled_word, sug.word) {
375 score -= 15;
376 }
377 }
378
379 score
380}
381
382fn order_suggestions<'b>(
384 misspelled_word: &[char],
385 mut matches: Vec<FuzzyMatchResult<'b>>,
386) -> Vec<&'b [char]> {
387 matches.sort_by_cached_key(|v| score_suggestion(misspelled_word, v));
388
389 matches.into_iter().map(|v| v.word).collect()
390}
391
392pub fn suggest_correct_spelling<'a>(
395 misspelled_word: &[char],
396 result_limit: usize,
397 max_edit_dist: u8,
398 dictionary: &'a impl Dictionary,
399) -> Vec<&'a [char]> {
400 let matches: Vec<FuzzyMatchResult> = dictionary
401 .fuzzy_match(misspelled_word, max_edit_dist, result_limit)
402 .into_iter()
403 .collect();
404
405 order_suggestions(misspelled_word, matches)
406}
407
408pub fn suggest_correct_spelling_str(
411 misspelled_word: impl Into<String>,
412 result_limit: usize,
413 max_edit_dist: u8,
414 dictionary: &impl Dictionary,
415) -> Vec<String> {
416 let chars: CharString = misspelled_word.into().chars().collect();
417 suggest_correct_spelling(&chars, result_limit, max_edit_dist, dictionary)
418 .into_iter()
419 .map(|a| a.to_string())
420 .collect()
421}
422
423#[cfg(test)]
424mod tests {
425 use itertools::Itertools;
426
427 use crate::CharStringExt;
428
429 use super::{FstDictionary, suggest_correct_spelling_str};
430
431 const RESULT_LIMIT: usize = 200;
432 const MAX_EDIT_DIST: u8 = 2;
433
434 #[test]
435 fn normalizes_weve() {
436 let word = ['w', 'e', 'ā', 'v', 'e'];
437 let norm = word.normalized();
438
439 assert_eq!(norm.clone(), vec!['w', 'e', '\'', 'v', 'e'])
440 }
441
442 #[test]
443 fn punctation_no_duplicates() {
444 let results = suggest_correct_spelling_str(
445 "punctation",
446 RESULT_LIMIT,
447 MAX_EDIT_DIST,
448 &FstDictionary::curated(),
449 );
450
451 assert!(results.iter().all_unique())
452 }
453
454 #[test]
455 fn youre_contraction() {
456 assert_suggests_correction("youre", "you're");
457 }
458
459 #[test]
460 fn thats_contraction() {
461 assert_suggests_correction("thats", "that's");
462 }
463
464 #[test]
465 fn weve_contraction() {
466 assert_suggests_correction("weve", "we've");
467 }
468
469 #[test]
470 fn this_correction() {
471 assert_suggests_correction("ths", "this");
472 }
473
474 #[test]
475 fn issue_624_no_duplicates() {
476 let results = suggest_correct_spelling_str(
477 "Semantical",
478 RESULT_LIMIT,
479 MAX_EDIT_DIST,
480 &FstDictionary::curated(),
481 );
482
483 assert!(results.iter().all_unique())
484 }
485
486 #[test]
487 fn issue_182() {
488 assert_suggests_correction("Im", "I'm");
489 }
490
491 #[test]
492 fn fst_spellcheck_hvllo() {
493 let results = suggest_correct_spelling_str(
494 "hvllo",
495 RESULT_LIMIT,
496 MAX_EDIT_DIST,
497 &FstDictionary::curated(),
498 );
499
500 assert!(results.iter().take(3).contains(&"hello".to_string()));
501 }
502
503 #[track_caller]
506 fn assert_suggests_correction(misspelled_word: &str, correct: &str) {
507 let results = suggest_correct_spelling_str(
508 misspelled_word,
509 RESULT_LIMIT,
510 MAX_EDIT_DIST,
511 &FstDictionary::curated(),
512 );
513
514 dbg!(&results);
515
516 assert!(results.iter().take(3).contains(&correct.to_string()));
517 }
518
519 #[test]
520 fn spellcheck_hvllo() {
521 assert_suggests_correction("hvllo", "hello");
522 }
523
524 #[test]
525 fn spellcheck_aout() {
526 assert_suggests_correction("aout", "about");
527 }
528
529 #[test]
530 fn spellchecking_is_deterministic() {
531 let results1 = suggest_correct_spelling_str(
532 "hello",
533 RESULT_LIMIT,
534 MAX_EDIT_DIST,
535 &FstDictionary::curated(),
536 );
537 let results2 = suggest_correct_spelling_str(
538 "hello",
539 RESULT_LIMIT,
540 MAX_EDIT_DIST,
541 &FstDictionary::curated(),
542 );
543 let results3 = suggest_correct_spelling_str(
544 "hello",
545 RESULT_LIMIT,
546 MAX_EDIT_DIST,
547 &FstDictionary::curated(),
548 );
549
550 assert_eq!(results1, results2);
551 assert_eq!(results1, results3);
552 }
553
554 #[test]
555 fn adviced_correction() {
556 assert_suggests_correction("adviced", "advised");
557 }
558
559 #[test]
560 fn aknowledged_correction() {
561 assert_suggests_correction("aknowledged", "acknowledged");
562 }
563
564 #[test]
565 fn alcaholic_correction() {
566 assert_suggests_correction("alcaholic", "alcoholic");
567 }
568
569 #[test]
570 fn slaves_correction() {
571 assert_suggests_correction("Slaves", "Slavs");
572 }
573
574 #[test]
575 fn conciousness_correction() {
576 assert_suggests_correction("conciousness", "consciousness");
577 }
578
579 #[test]
580 fn v_apostrophe_s_suggests_vs() {
581 assert_suggests_correction("v's", "vs");
582 }
583
584 #[test]
585 fn v_apostrophe_typographical_s_suggests_vs() {
586 assert_suggests_correction("vās", "vs");
587 }
588
589 #[test]
590 fn missing_apostrophe_childrens_suggests_childrens() {
591 assert_suggests_correction("childrens", "children's");
592 }
593}