use itertools::Itertools;
use crate::{CharString, CharStringExt, DictWordMetadata};
pub use self::dictionary::Dictionary;
pub use self::fst_dictionary::FstDictionary;
pub use self::merged_dictionary::MergedDictionary;
pub use self::mutable_dictionary::MutableDictionary;
pub use self::trie_dictionary::TrieDictionary;
pub use self::word_id::WordId;
mod dictionary;
mod fst_dictionary;
mod merged_dictionary;
mod mutable_dictionary;
mod rune;
mod trie_dictionary;
mod word_id;
mod word_map;
#[derive(PartialEq, Debug, Hash, Eq)]
pub struct FuzzyMatchResult<'a> {
pub word: &'a [char],
pub edit_distance: u8,
pub metadata: std::borrow::Cow<'a, DictWordMetadata>,
}
impl PartialOrd for FuzzyMatchResult<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.edit_distance.partial_cmp(&other.edit_distance)
}
}
pub(crate) fn is_ou_misspelling(a: &[char], b: &[char]) -> bool {
if a.len().abs_diff(b.len()) != 1 {
return false;
}
let mut a_iter = a.iter();
let mut b_iter = b.iter();
loop {
match (
a_iter.next().map(char::to_ascii_lowercase),
b_iter.next().map(char::to_ascii_lowercase),
) {
(Some('o'), Some('o')) => {
let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
if a_next != b_next {
if a_next == Some('u') {
a_next = a_iter.next().map(char::to_ascii_lowercase);
} else if b_next == Some('u') {
b_next = b_iter.next().map(char::to_ascii_lowercase);
}
if a_next != b_next {
return false;
}
}
}
(Some(a_char), Some(b_char)) => {
if !a_char.eq_ignore_ascii_case(&b_char) {
return false;
}
}
(None, None) => return true,
_ => return false,
}
}
}
pub(crate) fn is_cksz_misspelling(a: &[char], b: &[char]) -> bool {
if a.len() != b.len() {
return false;
}
if a.is_empty() {
return true;
}
if !a[0].eq_ignore_ascii_case(&b[0]) {
return false;
}
let mut found = false;
for (a_char, b_char) in a.iter().copied().zip(b.iter().copied()) {
let a_char = a_char.to_ascii_lowercase();
let b_char = b_char.to_ascii_lowercase();
if a_char != b_char {
if (a_char == 's' && b_char == 'z')
|| (a_char == 'z' && b_char == 's')
|| (a_char == 's' && b_char == 'c')
|| (a_char == 'c' && b_char == 's')
|| (a_char == 'k' && b_char == 'c')
|| (a_char == 'c' && b_char == 'k')
{
if found {
return false;
}
found = true;
} else {
return false;
}
}
}
found
}
pub(crate) fn is_er_misspelling(a: &[char], b: &[char]) -> bool {
if a.len() != b.len() || a.len() <= 4 {
return false;
}
let len = a.len();
let a_suffix = [&a[len - 2], &a[len - 1]].map(char::to_ascii_lowercase);
let b_suffix = [&b[len - 2], &b[len - 1]].map(char::to_ascii_lowercase);
if a_suffix == ['r', 'e'] && b_suffix == ['e', 'r']
|| a_suffix == ['e', 'r'] && b_suffix == ['r', 'e']
{
return a[0..len - 2]
.iter()
.copied()
.zip(b[0..len - 2].iter().copied())
.all(|(a_char, b_char)| a_char.eq_ignore_ascii_case(&b_char));
}
false
}
pub(crate) fn is_ll_misspelling(a: &[char], b: &[char]) -> bool {
if a.len().abs_diff(b.len()) != 1 {
return false;
}
let mut a_iter = a.iter();
let mut b_iter = b.iter();
loop {
match (
a_iter.next().map(char::to_ascii_lowercase),
b_iter.next().map(char::to_ascii_lowercase),
) {
(Some('l'), Some('l')) => {
let mut a_next = a_iter.next().map(char::to_ascii_lowercase);
let mut b_next = b_iter.next().map(char::to_ascii_lowercase);
if a_next != b_next {
if a_next == Some('l') {
a_next = a_iter.next().map(char::to_ascii_lowercase);
} else if b_next == Some('l') {
b_next = b_iter.next().map(char::to_ascii_lowercase);
}
if a_next != b_next {
return false;
}
}
}
(Some(a_char), Some(b_char)) => {
if !a_char.eq_ignore_ascii_case(&b_char) {
return false;
}
}
(None, None) => return true,
_ => return false,
}
}
}
pub fn is_th_h_missing(a: &[char], b: &[char]) -> bool {
a.iter().any(|c| c.eq_ignore_ascii_case(&'t'))
&& b.iter()
.tuple_windows()
.any(|(a, b)| a.eq_ignore_ascii_case(&'t') && b.eq_ignore_ascii_case(&'h'))
}
pub(crate) fn is_ay_ey_misspelling(a: &[char], b: &[char]) -> bool {
if a.len() != b.len() {
return false;
}
let mut found_ay_ey = false;
let mut a_iter = a.iter();
let mut b_iter = b.iter();
while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
if a_char.eq_ignore_ascii_case(&b_char) {
continue;
}
if (a_char.eq_ignore_ascii_case(&'a') && b_char.eq_ignore_ascii_case(&'e'))
|| (a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'a'))
{
if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
&& a_next.eq_ignore_ascii_case(&'y')
&& b_next.eq_ignore_ascii_case(&'y')
{
if found_ay_ey {
return false; }
found_ay_ey = true;
continue;
}
}
return false; }
if !found_ay_ey {
return false;
}
found_ay_ey
}
pub(crate) fn is_ei_ie_misspelling(a: &[char], b: &[char]) -> bool {
if a.len() != b.len() {
return false;
}
let mut found_ei_ie = false;
let mut a_iter = a.iter();
let mut b_iter = b.iter();
while let (Some(&a_char), Some(&b_char)) = (a_iter.next(), b_iter.next()) {
if a_char.eq_ignore_ascii_case(&b_char) {
continue;
}
if a_char.eq_ignore_ascii_case(&'e') && b_char.eq_ignore_ascii_case(&'i') {
if let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next()) {
if a_next.eq_ignore_ascii_case(&'i') && b_next.eq_ignore_ascii_case(&'e') {
if found_ei_ie {
return false; }
found_ei_ie = true;
continue;
}
}
}
else if a_char.eq_ignore_ascii_case(&'i')
&& b_char.eq_ignore_ascii_case(&'e')
&& let (Some(&a_next), Some(&b_next)) = (a_iter.next(), b_iter.next())
{
if a_next.eq_ignore_ascii_case(&'e') && b_next.eq_ignore_ascii_case(&'i') {
if found_ei_ie {
return false; }
found_ei_ie = true;
continue;
}
}
return false;
}
found_ei_ie
}
fn score_suggestion(misspelled_word: &[char], sug: &FuzzyMatchResult) -> i32 {
if misspelled_word.is_empty() || sug.word.is_empty() {
return i32::MAX;
}
let mut score = sug.edit_distance as i32 * 10;
if misspelled_word
.first()
.unwrap()
.eq_ignore_ascii_case(sug.word.first().unwrap())
{
score -= 10;
}
if *misspelled_word.last().unwrap() == 's' && *sug.word.last().unwrap() == 's' {
score -= 5;
}
let check_apostrophe_diff = |longer: &[char], shorter: &[char]| -> bool {
if let Some(pos) = longer.iter().position(|&c| c == '\'' || c == '’') {
longer.len() - 1 == shorter.len()
&& longer.starts_with(&shorter[..pos])
&& longer.ends_with(&shorter[pos..])
} else {
false
}
};
match (
misspelled_word.len() as i32 - sug.word.len() as i32,
sug.metadata.is_apostrophized(),
) {
(1, _)
if (misspelled_word.contains(&'\'') || misspelled_word.contains(&'\u{2019}'))
&& check_apostrophe_diff(misspelled_word, sug.word) =>
{
score -= 8
}
(-1, true) if check_apostrophe_diff(sug.word, misspelled_word) => score -= 8,
_ => {} }
if sug.metadata.common && sug.metadata.derived_from.is_none() {
score -= 4;
}
if sug.word.iter().filter(|c| **c == '\'').count() == 1 {
score -= 5;
}
if is_th_h_missing(misspelled_word, sug.word) {
score -= 6;
}
if !misspelled_word.contains_vowel() && !sug.word.contains_vowel() {
score += 10;
}
if sug.edit_distance == 1
&& (is_cksz_misspelling(misspelled_word, sug.word)
|| is_ou_misspelling(misspelled_word, sug.word)
|| is_ll_misspelling(misspelled_word, sug.word)
|| is_ay_ey_misspelling(misspelled_word, sug.word)
|| is_th_h_missing(misspelled_word, sug.word))
{
score -= 6;
}
if sug.edit_distance <= 2 {
if is_ei_ie_misspelling(misspelled_word, sug.word) {
score -= 11;
}
if is_er_misspelling(misspelled_word, sug.word) {
score -= 15;
}
}
score
}
fn order_suggestions<'b>(
misspelled_word: &[char],
mut matches: Vec<FuzzyMatchResult<'b>>,
) -> Vec<&'b [char]> {
matches.sort_by_cached_key(|v| score_suggestion(misspelled_word, v));
matches.into_iter().map(|v| v.word).collect()
}
pub fn suggest_correct_spelling<'a>(
misspelled_word: &[char],
result_limit: usize,
max_edit_dist: u8,
dictionary: &'a impl Dictionary,
) -> Vec<&'a [char]> {
let matches: Vec<FuzzyMatchResult> = dictionary
.fuzzy_match(misspelled_word, max_edit_dist, result_limit)
.into_iter()
.collect();
order_suggestions(misspelled_word, matches)
}
pub fn suggest_correct_spelling_str(
misspelled_word: impl Into<String>,
result_limit: usize,
max_edit_dist: u8,
dictionary: &impl Dictionary,
) -> Vec<String> {
let chars: CharString = misspelled_word.into().chars().collect();
suggest_correct_spelling(&chars, result_limit, max_edit_dist, dictionary)
.into_iter()
.map(|a| a.to_string())
.collect()
}
#[cfg(test)]
mod tests {
use itertools::Itertools;
use crate::CharStringExt;
use super::{FstDictionary, suggest_correct_spelling_str};
const RESULT_LIMIT: usize = 200;
const MAX_EDIT_DIST: u8 = 2;
#[test]
fn normalizes_weve() {
let word = ['w', 'e', '’', 'v', 'e'];
let norm = word.normalized();
assert_eq!(norm.clone(), vec!['w', 'e', '\'', 'v', 'e'])
}
#[test]
fn punctation_no_duplicates() {
let results = suggest_correct_spelling_str(
"punctation",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
assert!(results.iter().all_unique())
}
#[test]
fn youre_contraction() {
assert_suggests_correction("youre", "you're");
}
#[test]
fn thats_contraction() {
assert_suggests_correction("thats", "that's");
}
#[test]
fn weve_contraction() {
assert_suggests_correction("weve", "we've");
}
#[test]
fn this_correction() {
assert_suggests_correction("ths", "this");
}
#[test]
fn issue_624_no_duplicates() {
let results = suggest_correct_spelling_str(
"Semantical",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
assert!(results.iter().all_unique())
}
#[test]
fn issue_182() {
assert_suggests_correction("Im", "I'm");
}
#[test]
fn fst_spellcheck_hvllo() {
let results = suggest_correct_spelling_str(
"hvllo",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
assert!(results.iter().take(3).contains(&"hello".to_string()));
}
#[track_caller]
fn assert_suggests_correction(misspelled_word: &str, correct: &str) {
let results = suggest_correct_spelling_str(
misspelled_word,
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
dbg!(&results);
assert!(results.iter().take(3).contains(&correct.to_string()));
}
#[test]
fn spellcheck_hvllo() {
assert_suggests_correction("hvllo", "hello");
}
#[test]
fn spellcheck_aout() {
assert_suggests_correction("aout", "about");
}
#[test]
fn spellchecking_is_deterministic() {
let results1 = suggest_correct_spelling_str(
"hello",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
let results2 = suggest_correct_spelling_str(
"hello",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
let results3 = suggest_correct_spelling_str(
"hello",
RESULT_LIMIT,
MAX_EDIT_DIST,
&FstDictionary::curated(),
);
assert_eq!(results1, results2);
assert_eq!(results1, results3);
}
#[test]
fn adviced_correction() {
assert_suggests_correction("adviced", "advised");
}
#[test]
fn aknowledged_correction() {
assert_suggests_correction("aknowledged", "acknowledged");
}
#[test]
fn alcaholic_correction() {
assert_suggests_correction("alcaholic", "alcoholic");
}
#[test]
fn slaves_correction() {
assert_suggests_correction("Slaves", "Slavs");
}
#[test]
fn conciousness_correction() {
assert_suggests_correction("conciousness", "consciousness");
}
#[test]
fn v_apostrophe_s_suggests_vs() {
assert_suggests_correction("v's", "vs");
}
#[test]
fn v_apostrophe_typographical_s_suggests_vs() {
assert_suggests_correction("v’s", "vs");
}
#[test]
fn missing_apostrophe_childrens_suggests_childrens() {
assert_suggests_correction("childrens", "children's");
}
}