1use crate::{CharString, CharStringExt, WordMetadata};
2
3pub use self::dictionary::Dictionary;
4pub use self::fst_dictionary::FstDictionary;
5pub use self::merged_dictionary::MergedDictionary;
6pub use self::mutable_dictionary::MutableDictionary;
7pub use self::word_id::WordId;
8
9mod dictionary;
10mod fst_dictionary;
11mod merged_dictionary;
12mod mutable_dictionary;
13mod rune;
14mod word_id;
15mod word_map;
16
17#[derive(PartialEq, Debug, Hash, Eq)]
18pub struct FuzzyMatchResult<'a> {
19 pub word: &'a [char],
20 pub edit_distance: u8,
21 pub metadata: &'a WordMetadata,
22}
23
24impl PartialOrd for FuzzyMatchResult<'_> {
25 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
26 self.edit_distance.partial_cmp(&other.edit_distance)
27 }
28}
29
30pub(crate) fn is_ou_misspelling(a: &[char], b: &[char]) -> bool {
35 if a.len().abs_diff(b.len()) != 1 {
36 return false;
37 }
38
39 let mut a_iter = a.iter();
40 let mut b_iter = b.iter();
41
42 loop {
43 match (
44 a_iter.next().map(char::to_ascii_lowercase),
45 b_iter.next().map(char::to_ascii_lowercase),
46 ) {
47 (Some('o'), Some('o')) => {
48 let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
49 let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
50 if a_next != b_next {
51 if a_next == Some('u') {
52 a_next = a_iter.next().map(char::to_ascii_lowercase);
53 } else if b_next == Some('u') {
54 b_next = b_iter.next().map(char::to_ascii_lowercase);
55 }
56
57 if a_next != b_next {
58 return false;
59 }
60 }
61 }
62 (Some(a_char), Some(b_char)) => {
63 if !a_char.eq_ignore_ascii_case(&b_char) {
64 return false;
65 }
66 }
67 (None, None) => return true,
68 _ => return false,
69 }
70 }
71}
72
73pub(crate) fn is_cksz_misspelling(a: &[char], b: &[char]) -> bool {
79 if a.len() != b.len() {
80 return false;
81 }
82 if a.is_empty() {
83 return true;
84 }
85
86 if !a[0].eq_ignore_ascii_case(&b[0]) {
88 return false;
89 }
90
91 let mut found = false;
92 for (a_char, b_char) in a.iter().copied().zip(b.iter().copied()) {
93 let a_char = a_char.to_ascii_lowercase();
94 let b_char = b_char.to_ascii_lowercase();
95
96 if a_char != b_char {
97 if (a_char == 's' && b_char == 'z')
98 || (a_char == 'z' && b_char == 's')
99 || (a_char == 's' && b_char == 'c')
100 || (a_char == 'c' && b_char == 's')
101 || (a_char == 'k' && b_char == 'c')
102 || (a_char == 'c' && b_char == 'k')
103 {
104 if found {
105 return false;
106 }
107 found = true;
108 } else {
109 return false;
110 }
111 }
112 }
113
114 found
115}
116
117pub(crate) fn is_er_misspelling(a: &[char], b: &[char]) -> bool {
122 if a.len() != b.len() || a.len() <= 4 {
123 return false;
124 }
125
126 let len = a.len();
127 let a_suffix = [&a[len - 2], &a[len - 1]].map(char::to_ascii_lowercase);
128 let b_suffix = [&b[len - 2], &b[len - 1]].map(char::to_ascii_lowercase);
129
130 if a_suffix == ['r', 'e'] && b_suffix == ['e', 'r']
131 || a_suffix == ['e', 'r'] && b_suffix == ['r', 'e']
132 {
133 return a[0..len - 2]
134 .iter()
135 .copied()
136 .zip(b[0..len - 2].iter().copied())
137 .all(|(a_char, b_char)| a_char.eq_ignore_ascii_case(&b_char));
138 }
139
140 false
141}
142
143pub(crate) fn is_ll_misspelling(a: &[char], b: &[char]) -> bool {
148 if a.len().abs_diff(b.len()) != 1 {
149 return false;
150 }
151
152 let mut a_iter = a.iter();
153 let mut b_iter = b.iter();
154
155 loop {
156 match (
157 a_iter.next().map(char::to_ascii_lowercase),
158 b_iter.next().map(char::to_ascii_lowercase),
159 ) {
160 (Some('l'), Some('l')) => {
161 let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
162 let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
163 if a_next != b_next {
164 if a_next == Some('l') {
165 a_next = a_iter.next().map(char::to_ascii_lowercase);
166 } else if b_next == Some('l') {
167 b_next = b_iter.next().map(char::to_ascii_lowercase);
168 }
169
170 if a_next != b_next {
171 return false;
172 }
173 }
174 }
175 (Some(a_char), Some(b_char)) => {
176 if !a_char.eq_ignore_ascii_case(&b_char) {
177 return false;
178 }
179 }
180 (None, None) => return true,
181 _ => return false,
182 }
183 }
184}
185
186fn score_suggestion(misspelled_word: &[char], sug: &FuzzyMatchResult) -> i32 {
190 if misspelled_word.is_empty() || sug.word.is_empty() {
191 return i32::MAX;
192 }
193
194 let mut score = sug.edit_distance as i32 * 10;
195
196 if misspelled_word
198 .first()
199 .unwrap()
200 .eq_ignore_ascii_case(sug.word.first().unwrap())
201 {
202 score -= 10;
203 }
204
205 if *misspelled_word.last().unwrap() == 's' && *sug.word.last().unwrap() == 's' {
207 score -= 5;
208 }
209
210 if sug.metadata.common {
212 score -= 5;
213 }
214
215 if sug.word.iter().filter(|c| **c == '\'').count() == 1 {
217 score -= 5;
218 }
219
220 if sug.edit_distance == 1
222 && (is_cksz_misspelling(misspelled_word, sug.word)
223 || is_ou_misspelling(misspelled_word, sug.word)
224 || is_ll_misspelling(misspelled_word, sug.word))
225 {
226 score -= 5;
227 }
228 if sug.edit_distance == 2 && is_er_misspelling(misspelled_word, sug.word) {
229 score -= 15;
230 }
231
232 score
233}
234
235fn order_suggestions<'b>(
237 misspelled_word: &[char],
238 mut matches: Vec<FuzzyMatchResult<'b>>,
239) -> Vec<&'b [char]> {
240 matches.sort_by_key(|v| score_suggestion(misspelled_word, v));
241
242 matches.into_iter().map(|v| v.word).collect()
243}
244
245pub fn suggest_correct_spelling<'a>(
248 misspelled_word: &[char],
249 result_limit: usize,
250 max_edit_dist: u8,
251 dictionary: &'a impl Dictionary,
252) -> Vec<&'a [char]> {
253 let matches: Vec<FuzzyMatchResult> = dictionary
254 .fuzzy_match(misspelled_word, max_edit_dist, result_limit)
255 .into_iter()
256 .collect();
257
258 order_suggestions(misspelled_word, matches)
259}
260
261pub fn suggest_correct_spelling_str(
264 misspelled_word: impl Into<String>,
265 result_limit: usize,
266 max_edit_dist: u8,
267 dictionary: &impl Dictionary,
268) -> Vec<String> {
269 let chars: CharString = misspelled_word.into().chars().collect();
270 suggest_correct_spelling(&chars, result_limit, max_edit_dist, dictionary)
271 .into_iter()
272 .map(|a| a.to_string())
273 .collect()
274}
275
276#[cfg(test)]
277mod tests {
278 use itertools::Itertools;
279
280 use crate::CharStringExt;
281
282 use super::{FstDictionary, suggest_correct_spelling_str};
283
284 const RESULT_LIMIT: usize = 100;
285 const MAX_EDIT_DIST: u8 = 2;
286
287 #[test]
288 fn normalizes_weve() {
289 let word = ['w', 'e', '’', 'v', 'e'];
290 let norm = word.normalized();
291
292 assert_eq!(norm.clone(), vec!['w', 'e', '\'', 'v', 'e'])
293 }
294
295 #[test]
296 fn punctation_no_duplicates() {
297 let results = suggest_correct_spelling_str(
298 "punctation",
299 RESULT_LIMIT,
300 MAX_EDIT_DIST,
301 &FstDictionary::curated(),
302 );
303
304 assert!(results.iter().all_unique())
305 }
306
307 #[test]
308 fn youre_contraction() {
309 assert_suggests_correction("youre", "you're");
310 }
311
312 #[test]
313 fn thats_contraction() {
314 assert_suggests_correction("thats", "that's");
315 }
316
317 #[test]
318 fn weve_contraction() {
319 assert_suggests_correction("weve", "we've");
320 }
321
322 #[test]
323 fn this_correction() {
324 assert_suggests_correction("ths", "this");
325 }
326
327 #[test]
328 fn issue_624_no_duplicates() {
329 let results = suggest_correct_spelling_str(
330 "Semantical",
331 RESULT_LIMIT,
332 MAX_EDIT_DIST,
333 &FstDictionary::curated(),
334 );
335
336 dbg!(&results);
337
338 assert!(results.iter().all_unique())
339 }
340
341 #[test]
342 fn issue_182() {
343 assert_suggests_correction("Im", "I'm");
344 }
345
346 #[test]
347 fn fst_spellcheck_hvllo() {
348 let results = suggest_correct_spelling_str(
349 "hvllo",
350 RESULT_LIMIT,
351 MAX_EDIT_DIST,
352 &FstDictionary::curated(),
353 );
354
355 dbg!(&results);
356
357 assert!(results.iter().take(3).contains(&"hello".to_string()));
358 }
359
360 #[track_caller]
363 fn assert_suggests_correction(misspelled_word: &str, correct: &str) {
364 let results = suggest_correct_spelling_str(
365 misspelled_word,
366 RESULT_LIMIT,
367 MAX_EDIT_DIST,
368 &FstDictionary::curated(),
369 );
370
371 dbg!(&results);
372
373 assert!(results.iter().take(3).contains(&correct.to_string()));
374 }
375
376 #[test]
377 fn spellcheck_hvllo() {
378 assert_suggests_correction("hvllo", "hello");
379 }
380
381 #[test]
382 fn spellcheck_aout() {
383 assert_suggests_correction("aout", "about");
384 }
385
386 #[test]
387 fn spellchecking_is_deterministic() {
388 let results1 = suggest_correct_spelling_str(
389 "hello",
390 RESULT_LIMIT,
391 MAX_EDIT_DIST,
392 &FstDictionary::curated(),
393 );
394 let results2 = suggest_correct_spelling_str(
395 "hello",
396 RESULT_LIMIT,
397 MAX_EDIT_DIST,
398 &FstDictionary::curated(),
399 );
400 let results3 = suggest_correct_spelling_str(
401 "hello",
402 RESULT_LIMIT,
403 MAX_EDIT_DIST,
404 &FstDictionary::curated(),
405 );
406
407 assert_eq!(results1, results2);
408 assert_eq!(results1, results3);
409 }
410
411 #[test]
412 fn adviced_correction() {
413 assert_suggests_correction("adviced", "advised");
414 }
415
416 #[test]
417 fn aknowledged_correction() {
418 assert_suggests_correction("aknowledged", "acknowledged");
419 }
420
421 #[test]
422 fn alcaholic_correction() {
423 assert_suggests_correction("alcaholic", "alcoholic");
424 }
425
426 #[test]
427 fn slaves_correction() {
428 assert_suggests_correction("Slaves", "Slavs");
429 }
430
431 #[test]
432 fn conciousness_correction() {
433 assert_suggests_correction("conciousness", "consciousness");
434 }
435}