#![allow(clippy::needless_range_loop)]
use std::collections::{HashMap, HashSet};
const MAX_EDIT_DISTANCE: usize = 2;
const MIN_WORD_LENGTH: usize = 4;
pub struct FuzzyEngine {
vocabulary: HashSet<String>,
word_freq: HashMap<String, usize>,
}
impl FuzzyEngine {
pub fn new() -> Self {
Self {
vocabulary: HashSet::new(),
word_freq: HashMap::new(),
}
}
pub fn add_to_vocabulary(&mut self, text: &str) {
for word in tokenize(text) {
if word.len() >= MIN_WORD_LENGTH {
self.vocabulary.insert(word.clone());
*self.word_freq.entry(word).or_insert(0) += 1;
}
}
}
pub fn correct_query(&self, query: &str) -> CorrectionResult {
let mut corrections = Vec::new();
let mut corrected_query = String::new();
let mut had_corrections = false;
for word in query.split_whitespace() {
let word_lower = word.to_lowercase();
if word_lower.len() < MIN_WORD_LENGTH || self.vocabulary.contains(&word_lower) {
if !corrected_query.is_empty() {
corrected_query.push(' ');
}
corrected_query.push_str(word);
continue;
}
if let Some(correction) = self.find_best_correction(&word_lower) {
corrections.push(Correction {
original: word.to_string(),
corrected: correction.clone(),
distance: levenshtein(&word_lower, &correction),
});
had_corrections = true;
if !corrected_query.is_empty() {
corrected_query.push(' ');
}
corrected_query.push_str(&correction);
} else {
if !corrected_query.is_empty() {
corrected_query.push(' ');
}
corrected_query.push_str(word);
}
}
CorrectionResult {
original_query: query.to_string(),
corrected_query: if had_corrections {
Some(corrected_query)
} else {
None
},
corrections,
suggestions: self.get_suggestions(query, 5),
}
}
fn find_best_correction(&self, word: &str) -> Option<String> {
let mut best: Option<(String, usize, usize)> = None;
for vocab_word in &self.vocabulary {
let distance = levenshtein(word, vocab_word);
if distance <= MAX_EDIT_DISTANCE {
let freq = *self.word_freq.get(vocab_word).unwrap_or(&0);
match &best {
None => {
best = Some((vocab_word.clone(), distance, freq));
}
Some((_, best_dist, best_freq)) => {
if distance < *best_dist || (distance == *best_dist && freq > *best_freq) {
best = Some((vocab_word.clone(), distance, freq));
}
}
}
}
}
best.map(|(word, _, _)| word)
}
fn get_suggestions(&self, query: &str, limit: usize) -> Vec<String> {
let query_lower = query.to_lowercase();
let mut suggestions: Vec<(String, usize)> = Vec::new();
for word in &self.vocabulary {
if word.starts_with(&query_lower) {
let freq = *self.word_freq.get(word).unwrap_or(&0);
suggestions.push((word.clone(), freq));
}
else if query_lower.len() >= MIN_WORD_LENGTH {
let distance = levenshtein(&query_lower, word);
if distance <= MAX_EDIT_DISTANCE {
let freq = *self.word_freq.get(word).unwrap_or(&0);
suggestions.push((word.clone(), freq));
}
}
}
suggestions.sort_by(|a, b| b.1.cmp(&a.1));
suggestions
.into_iter()
.take(limit)
.map(|(word, _)| word)
.collect()
}
pub fn vocabulary_size(&self) -> usize {
self.vocabulary.len()
}
}
impl Default for FuzzyEngine {
fn default() -> Self {
Self::new()
}
}
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorrectionResult {
pub original_query: String,
pub corrected_query: Option<String>,
pub corrections: Vec<Correction>,
pub suggestions: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Correction {
pub original: String,
pub corrected: String,
pub distance: usize,
}
pub fn levenshtein(a: &str, b: &str) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 {
return b_len;
}
if b_len == 0 {
return a_len;
}
let mut prev_row: Vec<usize> = (0..=b_len).collect();
let mut curr_row: Vec<usize> = vec![0; b_len + 1];
for i in 1..=a_len {
curr_row[0] = i;
for j in 1..=b_len {
let cost = if a_chars[i - 1] == b_chars[j - 1] {
0
} else {
1
};
curr_row[j] = (prev_row[j] + 1) .min(curr_row[j - 1] + 1) .min(prev_row[j - 1] + cost); }
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[b_len]
}
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 {
return b_len;
}
if b_len == 0 {
return a_len;
}
let mut matrix: Vec<Vec<usize>> = vec![vec![0; b_len + 1]; a_len + 1];
for i in 0..=a_len {
matrix[i][0] = i;
}
for j in 0..=b_len {
matrix[0][j] = j;
}
#[allow(clippy::needless_range_loop)]
for i in 1..=a_len {
for j in 1..=b_len {
let cost = if a_chars[i - 1] == b_chars[j - 1] {
0
} else {
1
};
matrix[i][j] = (matrix[i - 1][j] + 1) .min(matrix[i][j - 1] + 1) .min(matrix[i - 1][j - 1] + cost);
if i > 1
&& j > 1
&& a_chars[i - 1] == b_chars[j - 2]
&& a_chars[i - 2] == b_chars[j - 1]
{
matrix[i][j] = matrix[i][j].min(matrix[i - 2][j - 2] + cost);
}
}
}
matrix[a_len][b_len]
}
fn tokenize(text: &str) -> Vec<String> {
text.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.map(String::from)
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_levenshtein() {
assert_eq!(levenshtein("kitten", "sitting"), 3);
assert_eq!(levenshtein("hello", "hello"), 0);
assert_eq!(levenshtein("", "abc"), 3);
assert_eq!(levenshtein("abc", ""), 3);
}
#[test]
fn test_damerau_levenshtein() {
assert_eq!(damerau_levenshtein("ab", "ba"), 1); assert_eq!(damerau_levenshtein("hello", "hlelo"), 1); }
#[test]
fn test_fuzzy_engine() {
let mut engine = FuzzyEngine::new();
engine.add_to_vocabulary("authentication");
engine.add_to_vocabulary("authorization");
engine.add_to_vocabulary("automatic");
let result = engine.correct_query("authentcation"); assert!(result.corrected_query.is_some());
assert_eq!(result.corrected_query.unwrap(), "authentication");
}
#[test]
fn test_suggestions() {
let mut engine = FuzzyEngine::new();
engine.add_to_vocabulary("authentication");
engine.add_to_vocabulary("authorization");
engine.add_to_vocabulary("automatic");
let suggestions = engine.get_suggestions("auth", 5);
assert!(!suggestions.is_empty());
assert!(suggestions.iter().any(|s| s.starts_with("auth")));
}
}