Skip to main content

sqry_core/search/
matcher.rs

1//! Fuzzy string matching for identifier search.
2//!
3//! This module provides fuzzy matching capabilities using various algorithms
4//! (Levenshtein distance, Jaro-Winkler) to find identifiers that approximately
5//! match a query string.
6
7use crate::search::simd;
8use rapidfuzz::distance::{jaro_winkler, levenshtein};
9
10/// Algorithm to use for fuzzy matching
11#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
12pub enum MatchAlgorithm {
13    /// Levenshtein edit distance (measures insertions, deletions, substitutions)
14    Levenshtein,
15    /// Jaro-Winkler similarity (good for typos near the beginning)
16    #[default]
17    JaroWinkler,
18}
19
20/// Configuration for fuzzy matching
21#[derive(Debug, Clone)]
22pub struct MatchConfig {
23    /// Algorithm to use for matching
24    pub algorithm: MatchAlgorithm,
25
26    /// Minimum similarity score (0.0-1.0) to consider a match
27    /// For Levenshtein: 1.0 - (distance / `max_len`)
28    /// For Jaro-Winkler: direct similarity score
29    pub min_score: f64,
30
31    /// Case-sensitive matching
32    pub case_sensitive: bool,
33}
34
35impl Default for MatchConfig {
36    fn default() -> Self {
37        Self {
38            algorithm: MatchAlgorithm::JaroWinkler,
39            min_score: 0.6,
40            case_sensitive: false,
41        }
42    }
43}
44
45/// Result of a fuzzy match
46#[derive(Debug, Clone, PartialEq)]
47pub struct MatchResult {
48    /// Similarity score (0.0-1.0, higher is better)
49    pub score: f64,
50    /// Index of the matched entry
51    pub entry_id: usize,
52}
53
54impl MatchResult {
55    /// Create a new match result
56    #[must_use]
57    pub fn new(score: f64, entry_id: usize) -> Self {
58        Self { score, entry_id }
59    }
60}
61
62/// Fuzzy matcher for identifier names
63pub struct FuzzyMatcher {
64    config: MatchConfig,
65}
66
67impl FuzzyMatcher {
68    /// Create a new fuzzy matcher with default configuration
69    #[must_use]
70    pub fn new() -> Self {
71        Self {
72            config: MatchConfig::default(),
73        }
74    }
75
76    /// Create a new fuzzy matcher with custom configuration
77    #[must_use]
78    pub fn with_config(config: MatchConfig) -> Self {
79        Self { config }
80    }
81
82    /// Calculate similarity score between query and target
83    ///
84    /// Returns a score between 0.0 and 1.0 (1.0 = perfect match)
85    ///
86    /// # Performance
87    /// Uses SIMD-accelerated lowercase conversion for ASCII-only targets (identifier names).
88    /// Query normalization uses stdlib to handle Unicode user input correctly.
89    #[must_use]
90    pub fn score(&self, query: &str, target: &str) -> f64 {
91        let (query_normalized, target_normalized) = if self.config.case_sensitive {
92            (query.to_string(), target.to_string())
93        } else {
94            // Query: user input, may contain Unicode, use stdlib
95            let query_norm = query.to_lowercase();
96
97            // Target: identifier name, typically ASCII, use SIMD
98            let target_norm = if target.is_ascii() {
99                simd::to_lowercase_ascii(target)
100            } else {
101                target.to_lowercase()
102            };
103
104            (query_norm, target_norm)
105        };
106
107        match self.config.algorithm {
108            MatchAlgorithm::Levenshtein => {
109                self.levenshtein_similarity(&query_normalized, &target_normalized)
110            }
111            MatchAlgorithm::JaroWinkler => {
112                jaro_winkler::similarity(query_normalized.chars(), target_normalized.chars())
113            }
114        }
115    }
116
117    /// Calculate Levenshtein similarity (normalized to 0.0-1.0)
118    #[allow(clippy::cast_precision_loss, clippy::unused_self)]
119    fn levenshtein_similarity(&self, query: &str, target: &str) -> f64 {
120        let distance = levenshtein::distance(query.chars(), target.chars());
121
122        // Use character count (not byte length) for proper Unicode normalization
123        let max_len = query.chars().count().max(target.chars().count()) as f64;
124
125        if max_len == 0.0 {
126            return 1.0; // Both strings empty
127        }
128
129        1.0 - (distance as f64 / max_len)
130    }
131
132    /// Match a query against a single target
133    ///
134    /// Returns Some(score) if the match exceeds `min_score` threshold, None otherwise
135    #[must_use]
136    pub fn match_one(&self, query: &str, target: &str) -> Option<f64> {
137        let score = self.score(query, target);
138
139        if score >= self.config.min_score {
140            Some(score)
141        } else {
142            None
143        }
144    }
145
146    /// Match a query against multiple targets with their entry IDs
147    ///
148    /// Returns a Vec of `MatchResult` sorted by score (descending)
149    ///
150    /// # Arguments
151    ///
152    /// * `query` - The query string to match
153    /// * `targets` - Iterator of (`entry_id`, `target_name`) tuples
154    ///
155    /// # Examples
156    ///
157    /// ```
158    /// use sqry_core::search::matcher::{FuzzyMatcher, MatchConfig};
159    ///
160    /// let matcher = FuzzyMatcher::new();
161    /// let targets = vec![
162    ///     (0, "hello_world"),
163    ///     (1, "helo_world"),
164    ///     (2, "goodbye"),
165    /// ];
166    ///
167    /// let matches = matcher.match_many("hello", targets.into_iter());
168    /// assert_eq!(matches[0].entry_id, 0); // "hello_world" is best match
169    /// assert!(matches[0].score > matches[1].score);
170    /// ```
171    pub fn match_many<'a, I>(&self, query: &str, targets: I) -> Vec<MatchResult>
172    where
173        I: Iterator<Item = (usize, &'a str)>,
174    {
175        let mut results: Vec<MatchResult> = targets
176            .filter_map(|(entry_id, target)| {
177                self.match_one(query, target)
178                    .map(|score| MatchResult::new(score, entry_id))
179            })
180            .collect();
181
182        // Sort by score descending
183        results.sort_by(|a, b| {
184            b.score
185                .partial_cmp(&a.score)
186                .unwrap_or(std::cmp::Ordering::Equal)
187        });
188
189        results
190    }
191
192    /// Get the current configuration
193    #[must_use]
194    pub fn config(&self) -> &MatchConfig {
195        &self.config
196    }
197}
198
199impl Default for FuzzyMatcher {
200    fn default() -> Self {
201        Self::new()
202    }
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208    use approx::assert_abs_diff_eq;
209
210    #[test]
211    fn test_levenshtein_exact_match() {
212        let config = MatchConfig {
213            algorithm: MatchAlgorithm::Levenshtein,
214            min_score: 0.6,
215            case_sensitive: false,
216        };
217        let matcher = FuzzyMatcher::with_config(config);
218
219        let score = matcher.score("hello", "hello");
220        assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
221    }
222
223    #[test]
224    fn test_levenshtein_similar() {
225        let config = MatchConfig {
226            algorithm: MatchAlgorithm::Levenshtein,
227            min_score: 0.6,
228            case_sensitive: false,
229        };
230        let matcher = FuzzyMatcher::with_config(config);
231
232        let score = matcher.score("hello", "helo");
233        assert!(score > 0.7); // Should be similar (1 deletion)
234    }
235
236    #[test]
237    fn test_jaro_winkler_exact_match() {
238        let matcher = FuzzyMatcher::new(); // Uses Jaro-Winkler by default
239        let score = matcher.score("hello", "hello");
240        assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
241    }
242
243    #[test]
244    fn test_jaro_winkler_similar() {
245        let matcher = FuzzyMatcher::new();
246        let score = matcher.score("hello", "helo");
247        assert!(score > 0.8); // Jaro-Winkler is good for typos
248    }
249
250    #[test]
251    fn test_case_insensitive() {
252        let matcher = FuzzyMatcher::new();
253        let score = matcher.score("HELLO", "hello");
254        assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
255    }
256
257    #[test]
258    fn test_case_sensitive() {
259        let config = MatchConfig {
260            algorithm: MatchAlgorithm::JaroWinkler,
261            min_score: 0.6,
262            case_sensitive: true,
263        };
264        let matcher = FuzzyMatcher::with_config(config);
265
266        let score = matcher.score("HELLO", "hello");
267        assert!(score < 1.0);
268    }
269
270    #[test]
271    fn test_match_one_threshold() {
272        let config = MatchConfig {
273            algorithm: MatchAlgorithm::JaroWinkler,
274            min_score: 0.8,
275            case_sensitive: false,
276        };
277        let matcher = FuzzyMatcher::with_config(config);
278
279        assert!(matcher.match_one("hello", "hello").is_some());
280        assert!(matcher.match_one("hello", "helo").is_some());
281        assert!(matcher.match_one("hello", "goodbye").is_none());
282    }
283
284    #[test]
285    fn test_match_many_sorted() {
286        let matcher = FuzzyMatcher::new();
287        let targets = vec![
288            (0, "goodbye"),
289            (1, "helo_world"),
290            (2, "hello_world"),
291            (3, "hello"),
292        ];
293
294        let match_results = matcher.match_many("hello", targets.into_iter());
295
296        // Should have matches, sorted by score
297        assert!(!match_results.is_empty());
298        assert_eq!(match_results[0].entry_id, 3); // "hello" is exact match
299        assert!(match_results[0].score >= match_results[1].score);
300    }
301
302    #[test]
303    fn test_match_many_filters_low_scores() {
304        let config = MatchConfig {
305            algorithm: MatchAlgorithm::JaroWinkler,
306            min_score: 0.9,
307            case_sensitive: false,
308        };
309        let matcher = FuzzyMatcher::with_config(config);
310
311        let targets = vec![(0, "hello"), (1, "helo"), (2, "goodbye")];
312
313        let match_results = matcher.match_many("hello", targets.into_iter());
314
315        // Should only get high-scoring matches
316        assert!(match_results.len() <= 2); // "hello" and maybe "helo"
317        assert!(match_results.iter().all(|m| m.score >= 0.9));
318    }
319
320    #[test]
321    fn test_empty_strings() {
322        let matcher = FuzzyMatcher::new();
323        let score = matcher.score("", "");
324        assert_abs_diff_eq!(score, 1.0, epsilon = 1e-10);
325    }
326
327    #[test]
328    fn test_levenshtein_similarity_calculation() {
329        let config = MatchConfig {
330            algorithm: MatchAlgorithm::Levenshtein,
331            min_score: 0.6,
332            case_sensitive: false,
333        };
334        let matcher = FuzzyMatcher::with_config(config);
335
336        // "kitten" -> "sitting" = 3 edits, max_len = 7 chars
337        // similarity = 1.0 - (3/7) ≈ 0.571
338        let score = matcher.score("kitten", "sitting");
339        assert!((score - 0.571).abs() < 0.01);
340    }
341
342    #[test]
343    fn test_levenshtein_unicode_normalization() {
344        let config = MatchConfig {
345            algorithm: MatchAlgorithm::Levenshtein,
346            min_score: 0.0,
347            case_sensitive: false,
348        };
349        let matcher = FuzzyMatcher::with_config(config);
350
351        // Unicode test: "café" (4 chars, 5 bytes) vs "cafe" (4 chars, 4 bytes)
352        // Distance = 1 (substitution: é -> e)
353        // max_len = 4 chars (not 5 bytes)
354        // similarity = 1.0 - (1/4) = 0.75
355        let score = matcher.score("café", "cafe");
356        assert!((score - 0.75).abs() < 0.01);
357
358        // Another test: "hello世界" (7 chars) vs "hello" (5 chars)
359        // Distance = 2 (delete 世, delete 界)
360        // max_len = 7 chars
361        // similarity = 1.0 - (2/7) ≈ 0.714
362        let score = matcher.score("hello世界", "hello");
363        assert!((score - 0.714).abs() < 0.02);
364    }
365}