use liblevenshtein::prelude::{Dictionary, DictionaryNode};
use liblevenshtein::transducer::{Algorithm, Candidate, Transducer};
use crate::semiring::{EditOp, EditSequence, EditWeight, TropicalWeight};
#[derive(Debug, Clone)]
pub struct FuzzyConfig {
pub max_distance: usize,
pub algorithm: Algorithm,
pub include_exact: bool,
pub max_results: Option<usize>,
}
impl Default for FuzzyConfig {
fn default() -> Self {
Self {
max_distance: 2,
algorithm: Algorithm::Standard,
include_exact: true,
max_results: None,
}
}
}
impl FuzzyConfig {
pub fn new(max_distance: usize) -> Self {
Self {
max_distance,
..Default::default()
}
}
pub fn with_transpositions(mut self) -> Self {
self.algorithm = Algorithm::Transposition;
self
}
pub fn with_merge_split(mut self) -> Self {
self.algorithm = Algorithm::MergeAndSplit;
self
}
pub fn exclude_exact(mut self) -> Self {
self.include_exact = false;
self
}
pub fn with_max_results(mut self, limit: usize) -> Self {
self.max_results = Some(limit);
self
}
}
#[derive(Debug, Clone)]
pub struct FuzzyResult<T = String> {
pub term: T,
pub distance: usize,
}
impl<T> FuzzyResult<T> {
pub fn new(term: T, distance: usize) -> Self {
Self { term, distance }
}
pub fn map<U, F: FnOnce(T) -> U>(self, f: F) -> FuzzyResult<U> {
FuzzyResult {
term: f(self.term),
distance: self.distance,
}
}
}
pub fn fuzzy_lookup<D>(dictionary: &D, query: &str, config: FuzzyConfig) -> Vec<FuzzyResult<String>>
where
D: Dictionary + Clone + Send + Sync,
D::Node: Send + Sync,
<D::Node as DictionaryNode>::Unit: Into<char> + TryFrom<char> + Copy + Send + Sync,
{
let transducer = Transducer::new(dictionary.clone(), config.algorithm);
let candidates: Vec<Candidate> = transducer
.query_with_distance(query, config.max_distance)
.collect();
let mut results: Vec<FuzzyResult<String>> = candidates
.into_iter()
.filter(|c| config.include_exact || c.distance > 0)
.map(|c| FuzzyResult::new(c.term, c.distance))
.collect();
results.sort_by_key(|r| r.distance);
if let Some(limit) = config.max_results {
results.truncate(limit);
}
results
}
pub fn fuzzy_lookup_parallel<D, Q, I>(
dictionary: &D,
queries: I,
config: FuzzyConfig,
) -> Vec<Vec<FuzzyResult<String>>>
where
D: Dictionary + Clone + Send + Sync + 'static,
D::Node: Send + Sync,
<D::Node as DictionaryNode>::Unit: Into<char> + TryFrom<char> + Copy + Send + Sync,
Q: AsRef<str> + Send,
I: IntoIterator<Item = Q>,
{
let queries: Vec<_> = queries.into_iter().collect();
if queries.len() < 4 {
return queries
.iter()
.map(|q| fuzzy_lookup(dictionary, q.as_ref(), config.clone()))
.collect();
}
queries
.iter()
.map(|q| fuzzy_lookup(dictionary, q.as_ref(), config.clone()))
.collect()
}
pub fn fuzzy_lookup_with_edits<D>(
dictionary: &D,
query: &str,
config: FuzzyConfig,
) -> Vec<(String, EditWeight)>
where
D: Dictionary + Clone + Send + Sync,
D::Node: Send + Sync,
<D::Node as DictionaryNode>::Unit: Into<char> + TryFrom<char> + Copy + Send + Sync,
{
let basic_results = fuzzy_lookup(dictionary, query, config);
basic_results
.into_iter()
.map(|result| {
let edit_weight = compute_edit_weight(query, &result.term);
(result.term, edit_weight)
})
.collect()
}
fn compute_edit_weight(source: &str, target: &str) -> EditWeight {
let source_chars: Vec<char> = source.chars().collect();
let target_chars: Vec<char> = target.chars().collect();
let m = source_chars.len();
let n = target_chars.len();
let mut dp: Vec<Vec<(usize, Vec<EditOp>)>> = vec![vec![(0, Vec::new()); n + 1]; m + 1];
for i in 1..=m {
let mut ops = dp[i - 1][0].1.clone();
ops.push(EditOp::Delete(source_chars[i - 1]));
dp[i][0] = (i, ops);
}
for j in 1..=n {
let mut ops = dp[0][j - 1].1.clone();
ops.push(EditOp::Insert(target_chars[j - 1]));
dp[0][j] = (j, ops);
}
for i in 1..=m {
for j in 1..=n {
let s = source_chars[i - 1];
let t = target_chars[j - 1];
if s == t {
let mut ops = dp[i - 1][j - 1].1.clone();
ops.push(EditOp::Copy(s));
dp[i][j] = (dp[i - 1][j - 1].0, ops);
} else {
let sub_cost = dp[i - 1][j - 1].0 + 1;
let del_cost = dp[i - 1][j].0 + 1;
let ins_cost = dp[i][j - 1].0 + 1;
if sub_cost <= del_cost && sub_cost <= ins_cost {
let mut ops = dp[i - 1][j - 1].1.clone();
ops.push(EditOp::Substitute { from: s, to: t });
dp[i][j] = (sub_cost, ops);
} else if del_cost <= ins_cost {
let mut ops = dp[i - 1][j].1.clone();
ops.push(EditOp::Delete(s));
dp[i][j] = (del_cost, ops);
} else {
let mut ops = dp[i][j - 1].1.clone();
ops.push(EditOp::Insert(t));
dp[i][j] = (ins_cost, ops);
}
}
}
}
let (cost, operations) = &dp[m][n];
let sequence: EditSequence = operations.iter().cloned().collect();
EditWeight::new(sequence, *cost as f64)
}
#[derive(Debug, Clone)]
pub struct EditTracker {
operations: Vec<EditOp>,
cost: f64,
costs: EditCosts,
}
#[derive(Debug, Clone)]
pub struct EditCosts {
pub insert: f64,
pub delete: f64,
pub substitute: f64,
pub transpose: f64,
}
impl Default for EditCosts {
fn default() -> Self {
Self {
insert: 1.0,
delete: 1.0,
substitute: 1.0,
transpose: 1.0,
}
}
}
impl EditTracker {
pub fn new() -> Self {
Self {
operations: Vec::new(),
cost: 0.0,
costs: EditCosts::default(),
}
}
pub fn with_costs(costs: EditCosts) -> Self {
Self {
operations: Vec::new(),
cost: 0.0,
costs,
}
}
pub fn copy(&mut self, c: char) {
self.operations.push(EditOp::Copy(c));
}
pub fn insert(&mut self, c: char) {
self.operations.push(EditOp::Insert(c));
self.cost += self.costs.insert;
}
pub fn delete(&mut self, c: char) {
self.operations.push(EditOp::Delete(c));
self.cost += self.costs.delete;
}
pub fn substitute(&mut self, from: char, to: char) {
self.operations.push(EditOp::Substitute { from, to });
self.cost += self.costs.substitute;
}
pub fn transpose(&mut self, a: char, b: char) {
self.operations.push(EditOp::Transpose { a, b });
self.cost += self.costs.transpose;
}
pub fn cost(&self) -> f64 {
self.cost
}
pub fn operations(&self) -> &[EditOp] {
&self.operations
}
pub fn build(self) -> EditWeight {
let sequence: EditSequence = self.operations.into_iter().collect();
EditWeight::new(sequence, self.cost)
}
pub fn to_tropical(&self) -> TropicalWeight {
TropicalWeight::new(self.cost)
}
}
impl Default for EditTracker {
fn default() -> Self {
Self::new()
}
}
pub struct EditTrackerBuilder {
costs: EditCosts,
}
impl EditTrackerBuilder {
pub fn new() -> Self {
Self {
costs: EditCosts::default(),
}
}
pub fn insert_cost(mut self, cost: f64) -> Self {
self.costs.insert = cost;
self
}
pub fn delete_cost(mut self, cost: f64) -> Self {
self.costs.delete = cost;
self
}
pub fn substitute_cost(mut self, cost: f64) -> Self {
self.costs.substitute = cost;
self
}
pub fn transpose_cost(mut self, cost: f64) -> Self {
self.costs.transpose = cost;
self
}
pub fn build(self) -> EditTracker {
EditTracker::with_costs(self.costs)
}
}
impl Default for EditTrackerBuilder {
fn default() -> Self {
Self::new()
}
}
impl From<Candidate> for FuzzyResult<String> {
fn from(candidate: Candidate) -> Self {
FuzzyResult {
term: candidate.term,
distance: candidate.distance,
}
}
}
impl<T> From<&FuzzyResult<T>> for TropicalWeight {
fn from(result: &FuzzyResult<T>) -> Self {
TropicalWeight::new(result.distance as f64)
}
}
#[cfg(test)]
mod tests {
use super::*;
use libdictenstein::dynamic_dawg_char::DynamicDawgChar;
#[test]
fn test_fuzzy_lookup_basic() {
let dict = DynamicDawgChar::<()>::from_terms(vec!["hello", "help", "world"]);
let results = fuzzy_lookup(&dict, "helo", FuzzyConfig::new(2));
assert!(!results.is_empty());
let terms: Vec<_> = results.iter().map(|r| r.term.as_str()).collect();
assert!(terms.contains(&"hello") || terms.contains(&"help"));
}
#[test]
fn test_fuzzy_lookup_exact_match() {
let dict = DynamicDawgChar::<()>::from_terms(vec!["test", "testing"]);
let results = fuzzy_lookup(&dict, "test", FuzzyConfig::new(2));
assert!(results.iter().any(|r| r.term == "test" && r.distance == 0));
}
#[test]
fn test_fuzzy_lookup_exclude_exact() {
let dict = DynamicDawgChar::<()>::from_terms(vec!["test", "testing"]);
let config = FuzzyConfig::new(2).exclude_exact();
let results = fuzzy_lookup(&dict, "test", config);
assert!(!results.iter().any(|r| r.distance == 0));
}
#[test]
fn test_fuzzy_lookup_with_edits() {
let dict = DynamicDawgChar::<()>::from_terms(vec!["hello", "help"]);
let results = fuzzy_lookup_with_edits(&dict, "helo", FuzzyConfig::new(2));
assert!(!results.is_empty());
for (term, weight) in &results {
assert!(weight.cost() > 0.0 || term == "helo");
let description = weight.describe();
assert!(!description.is_empty());
}
}
#[test]
fn test_edit_tracker() {
let mut tracker = EditTracker::new();
tracker.copy('h');
tracker.copy('e');
tracker.insert('l'); tracker.copy('l');
tracker.copy('o');
assert_eq!(tracker.cost(), 1.0);
assert_eq!(tracker.operations().len(), 5);
let weight = tracker.build();
assert_eq!(weight.cost(), 1.0);
}
#[test]
fn test_edit_tracker_custom_costs() {
let tracker = EditTrackerBuilder::new()
.insert_cost(0.5)
.delete_cost(0.5)
.substitute_cost(1.0)
.transpose_cost(0.8)
.build();
assert_eq!(tracker.cost(), 0.0);
}
#[test]
fn test_compute_edit_weight() {
let weight = compute_edit_weight("hello", "helo");
assert_eq!(weight.cost() as usize, 1);
}
#[test]
fn test_compute_edit_weight_substitution() {
let weight = compute_edit_weight("cat", "bat");
assert_eq!(weight.cost() as usize, 1);
}
#[test]
fn test_compute_edit_weight_insertion() {
let weight = compute_edit_weight("helo", "hello");
assert_eq!(weight.cost() as usize, 1);
}
#[test]
fn test_fuzzy_config_transpositions() {
let config = FuzzyConfig::new(2).with_transpositions();
assert!(matches!(config.algorithm, Algorithm::Transposition));
}
#[test]
fn test_fuzzy_result_map() {
let result = FuzzyResult::new("hello".to_string(), 1);
let mapped = result.map(|s| s.len());
assert_eq!(mapped.term, 5);
assert_eq!(mapped.distance, 1);
}
}