Skip to main content

do_memory_core/search/
fuzzy.rs

1//! Fuzzy string matching for typo-tolerant search
2//!
3//! This module provides fuzzy matching capabilities using the Levenshtein distance
4//! algorithm. It allows finding text that is similar but not exactly matching,
5//! which is useful for handling typos, spelling variations, and approximate searches.
6
7use strsim::normalized_levenshtein;
8
9/// Perform fuzzy matching between a query and text
10///
11/// Returns the similarity score (0.0 to 1.0) if the text matches the query
12/// above the given threshold. Returns `None` if below threshold.
13///
14/// # Arguments
15///
16/// * `text` - The text to search in
17/// * `query` - The search query
18/// * `threshold` - Minimum similarity score (0.0 to 1.0)
19///
20/// # Returns
21///
22/// `Some(score)` if match quality >= threshold, `None` otherwise
23///
24/// # Examples
25///
26/// ```
27/// use do_memory_core::search::fuzzy::fuzzy_match;
28///
29/// // Exact match
30/// assert_eq!(fuzzy_match("database", "database", 0.8), Some(1.0));
31///
32/// // Close match (typo)
33/// let score = fuzzy_match("database", "databse", 0.8).unwrap();
34/// assert!(score > 0.8);
35///
36/// // Too different
37/// assert_eq!(fuzzy_match("database", "xyz", 0.8), None);
38/// ```
39#[must_use]
40pub fn fuzzy_match(text: &str, query: &str, threshold: f64) -> Option<f64> {
41    let text_lower = text.to_lowercase();
42    let query_lower = query.to_lowercase();
43
44    // Fast path: exact substring match gets perfect score
45    if text_lower.contains(&query_lower) {
46        return Some(1.0);
47    }
48
49    // Calculate similarity score
50    let score = normalized_levenshtein(&text_lower, &query_lower);
51
52    if score >= threshold {
53        Some(score)
54    } else {
55        None
56    }
57}
58
59/// Search for fuzzy matches within a text body
60///
61/// This function searches for the query within a larger text body,
62/// checking each word and word combination for fuzzy matches.
63///
64/// # Arguments
65///
66/// * `text` - The text to search in
67/// * `query` - The search query
68/// * `threshold` - Minimum similarity score (0.0 to 1.0)
69///
70/// # Returns
71///
72/// A vector of tuples containing (position, `similarity_score`) for each match
73///
74/// # Examples
75///
76/// ```
77/// use do_memory_core::search::fuzzy::fuzzy_search_in_text;
78///
79/// let text = "This is a database connection example";
80/// let matches = fuzzy_search_in_text(text, "databse", 0.8);
81///
82/// assert!(!matches.is_empty());
83/// assert!(matches[0].1 > 0.8); // similarity score
84/// ```
85#[must_use]
86pub fn fuzzy_search_in_text(text: &str, query: &str, threshold: f64) -> Vec<(usize, f64)> {
87    let mut matches = Vec::new();
88    let text_lower = text.to_lowercase();
89    let query_lower = query.to_lowercase();
90    let query_words: Vec<&str> = query_lower.split_whitespace().collect();
91    let text_words: Vec<&str> = text_lower.split_whitespace().collect();
92
93    // Fast path: check if query is a substring
94    if let Some(pos) = text_lower.find(&query_lower) {
95        matches.push((pos, 1.0));
96        return matches;
97    }
98
99    // Try single-word matches
100    for (word_idx, word) in text_words.iter().enumerate() {
101        if let Some(score) = fuzzy_match(word, &query_lower, threshold) {
102            // Calculate approximate position in original text
103            let position = text_words[..word_idx]
104                .iter()
105                .map(|w| w.len() + 1)
106                .sum::<usize>();
107            matches.push((position, score));
108        }
109    }
110
111    // Try multi-word matches (sliding window)
112    if query_words.len() > 1 {
113        for window_size in 2..=query_words.len().min(5) {
114            for window in text_words.windows(window_size) {
115                let window_text = window.join(" ");
116                if let Some(score) = fuzzy_match(&window_text, &query_lower, threshold) {
117                    // Calculate approximate position
118                    let word_idx = text_words.iter().position(|&w| w == window[0]).unwrap_or(0);
119                    let position = text_words[..word_idx]
120                        .iter()
121                        .map(|w| w.len() + 1)
122                        .sum::<usize>();
123                    matches.push((position, score));
124                }
125            }
126        }
127    }
128
129    // Sort by score (highest first), then by position (earliest first)
130    // Use unwrap_or(Ordering::Equal) to handle NaN safely (treat NaN as equal)
131    matches.sort_by(|a, b| {
132        b.1.partial_cmp(&a.1)
133            .unwrap_or(std::cmp::Ordering::Equal)
134            .then(a.0.cmp(&b.0))
135    });
136
137    // Deduplicate matches that are too close together
138    let mut deduped = Vec::new();
139    for (pos, score) in matches {
140        if deduped.is_empty()
141            || deduped
142                .iter()
143                .all(|(p, _)| (*p as i64 - pos as i64).abs() > 5)
144        {
145            deduped.push((pos, score));
146        }
147    }
148
149    deduped
150}
151
152/// Calculate the best fuzzy match score for a query across multiple text fields
153///
154/// This is a helper function for multi-field search that returns the highest
155/// similarity score found across all provided text fields.
156///
157/// # Arguments
158///
159/// * `texts` - Iterator of text strings to search in
160/// * `query` - The search query
161/// * `threshold` - Minimum similarity score (0.0 to 1.0)
162///
163/// # Returns
164///
165/// The best (highest) similarity score found, or `None` if no matches
166#[must_use]
167pub fn best_fuzzy_match<'a, I>(texts: I, query: &str, threshold: f64) -> Option<f64>
168where
169    I: IntoIterator<Item = &'a str>,
170{
171    texts
172        .into_iter()
173        .filter_map(|text| fuzzy_match(text, query, threshold))
174        .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
175}
176
177#[cfg(test)]
178mod tests {
179    use super::*;
180
181    #[test]
182    fn test_exact_match() {
183        assert_eq!(fuzzy_match("database", "database", 0.8), Some(1.0));
184        assert_eq!(fuzzy_match("hello world", "hello", 0.8), Some(1.0));
185    }
186
187    #[test]
188    fn test_fuzzy_match_typo() {
189        // Common typos should match
190        let score = fuzzy_match("database", "databse", 0.7).unwrap();
191        assert!(score > 0.7);
192
193        let score = fuzzy_match("connection", "conection", 0.7).unwrap();
194        assert!(score > 0.7);
195    }
196
197    #[test]
198    fn test_fuzzy_match_below_threshold() {
199        // Very different strings should not match
200        assert_eq!(fuzzy_match("database", "xyz", 0.8), None);
201        assert_eq!(fuzzy_match("hello", "goodbye", 0.8), None);
202    }
203
204    #[test]
205    fn test_case_insensitive() {
206        assert_eq!(fuzzy_match("Database", "DATABASE", 0.8), Some(1.0));
207        assert_eq!(fuzzy_match("Hello World", "hello world", 0.8), Some(1.0));
208    }
209
210    #[test]
211    fn test_fuzzy_search_in_text() {
212        let text = "This is a database connection example";
213        let matches = fuzzy_search_in_text(text, "databse", 0.7);
214
215        assert!(!matches.is_empty());
216        assert!(matches[0].1 > 0.7);
217    }
218
219    #[test]
220    fn test_fuzzy_search_exact_substring() {
221        let text = "This is a database connection example";
222        let matches = fuzzy_search_in_text(text, "database", 0.8);
223
224        assert_eq!(matches.len(), 1);
225        assert_eq!(matches[0].1, 1.0);
226    }
227
228    #[test]
229    fn test_fuzzy_search_multi_word() {
230        let text = "This is a database connection example";
231        let matches = fuzzy_search_in_text(text, "databse conection", 0.7);
232
233        assert!(!matches.is_empty());
234    }
235
236    #[test]
237    fn test_fuzzy_search_no_match() {
238        let text = "This is a database connection example";
239        let matches = fuzzy_search_in_text(text, "xyz", 0.8);
240
241        assert!(matches.is_empty());
242    }
243
244    #[test]
245    fn test_best_fuzzy_match() {
246        let texts = ["hello", "database", "connection"];
247        let score = best_fuzzy_match(texts.iter().copied(), "databse", 0.7).unwrap();
248
249        assert!(score > 0.7);
250    }
251
252    #[test]
253    fn test_best_fuzzy_match_no_match() {
254        let texts = ["hello", "world"];
255        let score = best_fuzzy_match(texts.iter().copied(), "xyz", 0.8);
256
257        assert_eq!(score, None);
258    }
259
260    #[test]
261    fn test_fuzzy_match_empty_strings() {
262        // Empty strings match perfectly
263        assert_eq!(fuzzy_match("", "", 0.8), Some(1.0));
264        // Text searching for empty query should return 1.0 (substring match)
265        assert_eq!(fuzzy_match("text", "", 0.8), Some(1.0));
266        // Empty text can't contain non-empty query
267        assert_eq!(fuzzy_match("", "text", 0.8), None);
268    }
269
270    #[test]
271    fn test_fuzzy_search_special_characters() {
272        let text = "error: database-connection failed!";
273        // The word "database-connection" should fuzzy match "databse"
274        // We need to adjust the threshold since the hyphenated word is longer
275        let matches = fuzzy_search_in_text(text, "database", 0.7);
276
277        // Should find exact substring match
278        assert!(!matches.is_empty());
279    }
280}