fuzzy_parser/
distance.rs

1//! String distance/similarity calculation utilities
2//!
3//! This module provides wrappers around strsim algorithms for fuzzy matching.
4
5use strsim::{damerau_levenshtein, jaro_winkler, levenshtein};
6
7/// Algorithm for similarity calculation
8#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
9pub enum Algorithm {
10    /// Jaro-Winkler similarity (recommended for typos)
11    ///
12    /// Good for: transpositions, prefix matching
13    /// Returns: 0.0 to 1.0 (1.0 = identical)
14    #[default]
15    JaroWinkler,
16
17    /// Levenshtein distance (edit distance)
18    ///
19    /// Good for: insertions, deletions, substitutions
20    /// Normalized to 0.0 to 1.0 (1.0 = identical)
21    Levenshtein,
22
23    /// Damerau-Levenshtein distance
24    ///
25    /// Like Levenshtein but also handles transpositions
26    /// Normalized to 0.0 to 1.0 (1.0 = identical)
27    DamerauLevenshtein,
28}
29
30/// Calculate similarity between two strings
31///
32/// Returns a value between 0.0 (completely different) and 1.0 (identical).
33pub fn similarity(a: &str, b: &str, algo: Algorithm) -> f64 {
34    if a == b {
35        return 1.0;
36    }
37    if a.is_empty() || b.is_empty() {
38        return 0.0;
39    }
40
41    match algo {
42        Algorithm::JaroWinkler => jaro_winkler(a, b),
43        Algorithm::Levenshtein => {
44            let dist = levenshtein(a, b);
45            let max_len = a.len().max(b.len());
46            1.0 - (dist as f64 / max_len as f64)
47        }
48        Algorithm::DamerauLevenshtein => {
49            let dist = damerau_levenshtein(a, b);
50            let max_len = a.len().max(b.len());
51            1.0 - (dist as f64 / max_len as f64)
52        }
53    }
54}
55
56/// Match result with similarity score
57#[derive(Debug, Clone, PartialEq)]
58pub struct Match {
59    /// The matched candidate string
60    pub candidate: String,
61    /// Similarity score (0.0 to 1.0)
62    pub similarity: f64,
63}
64
65impl Match {
66    pub fn new(candidate: impl Into<String>, similarity: f64) -> Self {
67        Self {
68            candidate: candidate.into(),
69            similarity,
70        }
71    }
72}
73
74/// Find the closest match from a list of candidates
75///
76/// Returns `None` if no candidate meets the minimum similarity threshold.
77pub fn find_closest<'a>(
78    input: &str,
79    candidates: impl IntoIterator<Item = &'a str>,
80    min_similarity: f64,
81    algo: Algorithm,
82) -> Option<Match> {
83    candidates
84        .into_iter()
85        .map(|c| Match::new(c, similarity(input, c, algo)))
86        .filter(|m| m.similarity >= min_similarity)
87        .max_by(|a, b| a.similarity.partial_cmp(&b.similarity).unwrap())
88}
89
90/// Find all matches above the minimum similarity threshold
91///
92/// Returns matches sorted by similarity (highest first).
93pub fn find_all_matches<'a>(
94    input: &str,
95    candidates: impl IntoIterator<Item = &'a str>,
96    min_similarity: f64,
97    algo: Algorithm,
98) -> Vec<Match> {
99    let mut matches: Vec<_> = candidates
100        .into_iter()
101        .map(|c| Match::new(c, similarity(input, c, algo)))
102        .filter(|m| m.similarity >= min_similarity)
103        .collect();
104
105    matches.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap());
106    matches
107}
108
109#[cfg(test)]
110mod tests {
111    use super::*;
112
113    #[test]
114    fn test_identical_strings() {
115        assert_eq!(similarity("hello", "hello", Algorithm::JaroWinkler), 1.0);
116        assert_eq!(similarity("hello", "hello", Algorithm::Levenshtein), 1.0);
117        assert_eq!(
118            similarity("hello", "hello", Algorithm::DamerauLevenshtein),
119            1.0
120        );
121    }
122
123    #[test]
124    fn test_empty_strings() {
125        assert_eq!(similarity("", "", Algorithm::JaroWinkler), 1.0);
126        assert_eq!(similarity("hello", "", Algorithm::JaroWinkler), 0.0);
127        assert_eq!(similarity("", "hello", Algorithm::JaroWinkler), 0.0);
128    }
129
130    #[test]
131    fn test_typo_detection_jaro_winkler() {
132        // Typos should have high similarity with Jaro-Winkler
133        let sim = similarity("AddDeriv", "AddDerive", Algorithm::JaroWinkler);
134        assert!(sim > 0.9, "Expected > 0.9, got {}", sim);
135
136        let sim = similarity("RenamIdent", "RenameIdent", Algorithm::JaroWinkler);
137        assert!(sim > 0.9, "Expected > 0.9, got {}", sim);
138    }
139
140    #[test]
141    fn test_field_name_typo() {
142        let sim = similarity("target_name", "target", Algorithm::JaroWinkler);
143        assert!(sim > 0.7, "Expected > 0.7, got {}", sim);
144
145        let sim = similarity("struct_nam", "struct_name", Algorithm::JaroWinkler);
146        assert!(sim > 0.9, "Expected > 0.9, got {}", sim);
147    }
148
149    #[test]
150    fn test_find_closest() {
151        let candidates = ["AddDerive", "RemoveDerive", "AddField", "RemoveField"];
152        let result = find_closest(
153            "AddDeriv",
154            candidates.iter().copied(),
155            0.7,
156            Algorithm::JaroWinkler,
157        );
158
159        assert!(result.is_some());
160        let m = result.unwrap();
161        assert_eq!(m.candidate, "AddDerive");
162        assert!(m.similarity > 0.9);
163    }
164
165    #[test]
166    fn test_find_closest_no_match() {
167        let candidates = ["AddDerive", "RemoveDerive"];
168        let result = find_closest(
169            "CompletelyDifferent",
170            candidates.iter().copied(),
171            0.9, // High threshold
172            Algorithm::JaroWinkler,
173        );
174
175        assert!(result.is_none());
176    }
177
178    #[test]
179    fn test_find_all_matches() {
180        let candidates = ["target", "target_mod", "target_fn", "body"];
181        let matches = find_all_matches(
182            "target_name",
183            candidates.iter().copied(),
184            0.6,
185            Algorithm::JaroWinkler,
186        );
187
188        assert!(!matches.is_empty());
189        // Results should be sorted by similarity (highest first)
190        for i in 1..matches.len() {
191            assert!(matches[i - 1].similarity >= matches[i].similarity);
192        }
193    }
194
195    #[test]
196    fn test_transposition_damerau() {
197        // Damerau-Levenshtein handles transpositions better
198        let sim_dl = similarity("teh", "the", Algorithm::DamerauLevenshtein);
199        let sim_l = similarity("teh", "the", Algorithm::Levenshtein);
200        // DL should give same or higher similarity for transpositions
201        assert!(sim_dl >= sim_l);
202    }
203}