use crate::search::simd;
use rapidfuzz::distance::{jaro_winkler, levenshtein};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum MatchAlgorithm {
Levenshtein,
#[default]
JaroWinkler,
}
#[derive(Debug, Clone)]
pub struct MatchConfig {
pub algorithm: MatchAlgorithm,
pub min_score: f64,
pub case_sensitive: bool,
}
impl Default for MatchConfig {
fn default() -> Self {
Self {
algorithm: MatchAlgorithm::JaroWinkler,
min_score: 0.6,
case_sensitive: false,
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct MatchResult {
pub score: f64,
pub entry_id: usize,
}
impl MatchResult {
#[must_use]
pub fn new(score: f64, entry_id: usize) -> Self {
Self { score, entry_id }
}
}
pub struct FuzzyMatcher {
config: MatchConfig,
}
impl FuzzyMatcher {
#[must_use]
pub fn new() -> Self {
Self {
config: MatchConfig::default(),
}
}
#[must_use]
pub fn with_config(config: MatchConfig) -> Self {
Self { config }
}
#[must_use]
pub fn score(&self, query: &str, target: &str) -> f64 {
let (query_normalized, target_normalized) = if self.config.case_sensitive {
(query.to_string(), target.to_string())
} else {
let query_norm = query.to_lowercase();
let target_norm = if target.is_ascii() {
simd::to_lowercase_ascii(target)
} else {
target.to_lowercase()
};
(query_norm, target_norm)
};
match self.config.algorithm {
MatchAlgorithm::Levenshtein => {
self.levenshtein_similarity(&query_normalized, &target_normalized)
}
MatchAlgorithm::JaroWinkler => {
jaro_winkler::similarity(query_normalized.chars(), target_normalized.chars())
}
}
}
#[allow(clippy::cast_precision_loss, clippy::unused_self)]
fn levenshtein_similarity(&self, query: &str, target: &str) -> f64 {
let distance = levenshtein::distance(query.chars(), target.chars());
let max_len = query.chars().count().max(target.chars().count()) as f64;
if max_len == 0.0 {
return 1.0; }
1.0 - (distance as f64 / max_len)
}
#[must_use]
pub fn match_one(&self, query: &str, target: &str) -> Option<f64> {
let score = self.score(query, target);
if score >= self.config.min_score {
Some(score)
} else {
None
}
}
pub fn match_many<'a, I>(&self, query: &str, targets: I) -> Vec<MatchResult>
where
I: Iterator<Item = (usize, &'a str)>,
{
let mut results: Vec<MatchResult> = targets
.filter_map(|(entry_id, target)| {
self.match_one(query, target)
.map(|score| MatchResult::new(score, entry_id))
})
.collect();
results.sort_by(|a, b| {
b.score
.partial_cmp(&a.score)
.unwrap_or(std::cmp::Ordering::Equal)
});
results
}
#[must_use]
pub fn config(&self) -> &MatchConfig {
&self.config
}
}
impl Default for FuzzyMatcher {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
#[test]
fn test_levenshtein_exact_match() {
let config = MatchConfig {
algorithm: MatchAlgorithm::Levenshtein,
min_score: 0.6,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
let score = matcher.score("hello", "hello");
assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
}
#[test]
fn test_levenshtein_similar() {
let config = MatchConfig {
algorithm: MatchAlgorithm::Levenshtein,
min_score: 0.6,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
let score = matcher.score("hello", "helo");
assert!(score > 0.7); }
#[test]
fn test_jaro_winkler_exact_match() {
let matcher = FuzzyMatcher::new(); let score = matcher.score("hello", "hello");
assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
}
#[test]
fn test_jaro_winkler_similar() {
let matcher = FuzzyMatcher::new();
let score = matcher.score("hello", "helo");
assert!(score > 0.8); }
#[test]
fn test_case_insensitive() {
let matcher = FuzzyMatcher::new();
let score = matcher.score("HELLO", "hello");
assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
}
#[test]
fn test_case_sensitive() {
let config = MatchConfig {
algorithm: MatchAlgorithm::JaroWinkler,
min_score: 0.6,
case_sensitive: true,
};
let matcher = FuzzyMatcher::with_config(config);
let score = matcher.score("HELLO", "hello");
assert!(score < 1.0);
}
#[test]
fn test_match_one_threshold() {
let config = MatchConfig {
algorithm: MatchAlgorithm::JaroWinkler,
min_score: 0.8,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
assert!(matcher.match_one("hello", "hello").is_some());
assert!(matcher.match_one("hello", "helo").is_some());
assert!(matcher.match_one("hello", "goodbye").is_none());
}
#[test]
fn test_match_many_sorted() {
let matcher = FuzzyMatcher::new();
let targets = vec![
(0, "goodbye"),
(1, "helo_world"),
(2, "hello_world"),
(3, "hello"),
];
let match_results = matcher.match_many("hello", targets.into_iter());
assert!(!match_results.is_empty());
assert_eq!(match_results[0].entry_id, 3); assert!(match_results[0].score >= match_results[1].score);
}
#[test]
fn test_match_many_filters_low_scores() {
let config = MatchConfig {
algorithm: MatchAlgorithm::JaroWinkler,
min_score: 0.9,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
let targets = vec![(0, "hello"), (1, "helo"), (2, "goodbye")];
let match_results = matcher.match_many("hello", targets.into_iter());
assert!(match_results.len() <= 2); assert!(match_results.iter().all(|m| m.score >= 0.9));
}
#[test]
fn test_empty_strings() {
let matcher = FuzzyMatcher::new();
let score = matcher.score("", "");
assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
}
#[test]
fn test_levenshtein_similarity_calculation() {
let config = MatchConfig {
algorithm: MatchAlgorithm::Levenshtein,
min_score: 0.6,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
let score = matcher.score("kitten", "sitting");
assert!((score - 0.571).abs() < 0.01);
}
#[test]
fn test_levenshtein_unicode_normalization() {
let config = MatchConfig {
algorithm: MatchAlgorithm::Levenshtein,
min_score: 0.0,
case_sensitive: false,
};
let matcher = FuzzyMatcher::with_config(config);
let score = matcher.score("café", "cafe");
assert!((score - 0.75).abs() < 0.01);
let score = matcher.score("hello世界", "hello");
assert!((score - 0.714).abs() < 0.02);
}
}