Skip to main content

sqry_core/search/
fuzzy.rs

1//! Fuzzy search candidate generation and matching using trigram indices.
2//!
3//! This module provides efficient two-stage fuzzy search:
4//! 1. Candidate generation: Use trigram overlap to filter the symbol space
5//! 2. Fuzzy matching: Apply string similarity algorithms to rank candidates
6//!
7//! This approach is significantly faster than naive fuzzy matching on large
8//! symbol sets while maintaining high-quality results.
9
10use super::trigram::TrigramIndex;
11use std::sync::Arc;
12
13/// Check if Jaccard similarity should be used (default: true)
14/// Can be disabled via `SQRY_FUZZY_USE_JACCARD=0` environment variable
15fn use_jaccard_similarity() -> bool {
16    std::env::var("SQRY_FUZZY_USE_JACCARD")
17        .ok()
18        .and_then(|v| v.parse::<u8>().ok())
19        != Some(0) // Default to enabled
20}
21
22#[inline]
23#[allow(clippy::cast_precision_loss)] // Trigram counts are bounded well below 2^53
24fn to_f64(value: usize) -> f64 {
25    value as f64
26}
27
28/// Configuration for fuzzy search candidate generation
29#[derive(Debug, Clone)]
30pub struct FuzzyConfig {
31    /// Maximum number of candidates to return (default: 1000)
32    pub max_candidates: usize,
33
34    /// Minimum similarity score (0.0-1.0) based on trigram overlap (default: 0.1)
35    /// A score of 0.0 means no filtering, 1.0 means perfect match required
36    pub min_similarity: f64,
37}
38
39impl Default for FuzzyConfig {
40    fn default() -> Self {
41        Self {
42            max_candidates: 1000,
43            min_similarity: 0.1,
44        }
45    }
46}
47
48/// Generates candidate symbol IDs for fuzzy matching using trigram indices
49pub struct CandidateGenerator {
50    /// Shared trigram index (can be shared across threads)
51    trigram_index: Arc<TrigramIndex>,
52
53    /// Configuration
54    config: FuzzyConfig,
55}
56
57impl CandidateGenerator {
58    /// Create a new candidate generator with a trigram index
59    #[must_use]
60    pub fn new(trigram_index: Arc<TrigramIndex>) -> Self {
61        Self {
62            trigram_index,
63            config: FuzzyConfig::default(),
64        }
65    }
66
67    /// Create a new candidate generator with custom configuration
68    #[must_use]
69    pub fn with_config(trigram_index: Arc<TrigramIndex>, config: FuzzyConfig) -> Self {
70        Self {
71            trigram_index,
72            config,
73        }
74    }
75
76    /// Generate candidates for a query string
77    ///
78    /// Returns a vector of symbol IDs sorted by trigram similarity (descending).
79    /// The result is capped at `max_candidates` and filtered by `min_similarity`.
80    ///
81    /// # Arguments
82    ///
83    /// * `query` - The query string to match against
84    ///
85    /// # Examples
86    ///
87    /// ```
88    /// use sqry_core::search::trigram::TrigramIndex;
89    /// use sqry_core::search::fuzzy::{CandidateGenerator, FuzzyConfig};
90    /// use std::sync::Arc;
91    ///
92    /// let mut index = TrigramIndex::new();
93    /// index.add_symbol(0, "hello_world");
94    /// index.add_symbol(1, "hello_rust");
95    /// index.add_symbol(2, "goodbye");
96    ///
97    /// let generator = CandidateGenerator::new(Arc::new(index));
98    /// let candidates = generator.generate("hello");
99    ///
100    /// assert!(candidates.len() <= 2); // "hello_world" and "hello_rust"
101    /// ```
102    #[allow(clippy::cast_precision_loss)] // Similarity ratios rely on f64, acceptable for scoring heuristics
103    #[must_use]
104    pub fn generate(&self, query: &str) -> Vec<usize> {
105        let Some(query_trigrams) = Self::extract_query_trigrams(query) else {
106            return Vec::new();
107        };
108
109        let query_trigram_count = to_f64(query_trigrams.len());
110        let use_jaccard = use_jaccard_similarity();
111        let overlap_counts = self.collect_overlap_counts(&query_trigrams);
112        let mut telemetry = FuzzyTelemetry::new(overlap_counts.len());
113
114        // Filter by similarity threshold and cap
115        let candidates = self.select_candidates(
116            &overlap_counts,
117            query_trigram_count,
118            query_trigrams.len(),
119            use_jaccard,
120            &mut telemetry,
121        );
122
123        // Debug logging
124        telemetry.log(query, use_jaccard, candidates.len());
125
126        candidates
127    }
128
129    /// Get the number of symbols in the index
130    #[must_use]
131    pub fn symbol_count(&self) -> usize {
132        self.trigram_index.symbol_count
133    }
134
135    /// Get the current configuration
136    #[must_use]
137    pub fn config(&self) -> &FuzzyConfig {
138        &self.config
139    }
140}
141
142struct FuzzyTelemetry {
143    initial_candidates: usize,
144    jaccard_sum: f64,
145    jaccard_count: u32,
146    fallback_count: usize,
147    dropped_count: usize,
148}
149
150impl FuzzyTelemetry {
151    fn new(initial_candidates: usize) -> Self {
152        Self {
153            initial_candidates,
154            jaccard_sum: 0.0,
155            jaccard_count: 0,
156            fallback_count: 0,
157            dropped_count: 0,
158        }
159    }
160
161    fn record_similarity(&mut self, similarity: f64, jaccard_applied: bool) {
162        if jaccard_applied {
163            self.jaccard_sum += similarity;
164            self.jaccard_count += 1;
165        } else {
166            self.fallback_count += 1;
167        }
168    }
169
170    fn mark_dropped(&mut self) {
171        self.dropped_count += 1;
172    }
173
174    fn log(&self, query: &str, use_jaccard: bool, kept: usize) {
175        log::debug!(
176            "Fuzzy candidate generation: query='{}' initial={} kept={} dropped={} jaccard_avg={:.3} fallback={} mode={}",
177            query,
178            self.initial_candidates,
179            kept,
180            self.dropped_count,
181            self.jaccard_average(),
182            self.fallback_count,
183            if use_jaccard { "jaccard" } else { "ratio" }
184        );
185
186        if self.fallback_count > 0 && use_jaccard {
187            log::debug!(
188                "Fuzzy search using fallback ratio for {} candidates (old index or missing counts)",
189                self.fallback_count
190            );
191        }
192    }
193
194    fn jaccard_average(&self) -> f64 {
195        if self.jaccard_count > 0 {
196            self.jaccard_sum / f64::from(self.jaccard_count)
197        } else {
198            0.0
199        }
200    }
201}
202
203fn compute_similarity(
204    use_jaccard: bool,
205    entry_id: usize,
206    overlap: usize,
207    query_trigram_count: f64,
208    query_trigram_len: usize,
209    symbol_trigram_counts: &[usize],
210) -> (f64, bool) {
211    if use_jaccard && entry_id < symbol_trigram_counts.len() && !symbol_trigram_counts.is_empty() {
212        let symbol_trigram_count = symbol_trigram_counts[entry_id];
213        let union = query_trigram_len + symbol_trigram_count - overlap;
214        let jaccard = if union > 0 {
215            to_f64(overlap) / to_f64(union)
216        } else {
217            0.0
218        };
219        (jaccard, true)
220    } else {
221        (to_f64(overlap) / query_trigram_count, false)
222    }
223}
224
225impl CandidateGenerator {
226    fn collect_overlap_counts(&self, query_trigrams: &[String]) -> Vec<(usize, usize)> {
227        use std::collections::HashMap;
228
229        // Count trigram overlaps for each symbol using a HashMap to avoid O(n^2) scans
230        let mut overlap_map: HashMap<usize, usize> = HashMap::new();
231        for trigram in query_trigrams {
232            if let Some(entry_ids) = self.trigram_index.postings.get(trigram) {
233                for &entry_id in entry_ids {
234                    *overlap_map.entry(entry_id).or_insert(0) += 1;
235                }
236            }
237        }
238
239        // Move into a vector and sort by overlap count (descending)
240        let mut overlap_counts: Vec<(usize, usize)> = overlap_map.into_iter().collect();
241        // Sort by overlap descending; tie-break by entry_id ascending for stability
242        overlap_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
243        overlap_counts
244    }
245
246    fn extract_query_trigrams(query: &str) -> Option<Vec<String>> {
247        use super::trigram::extract_normalized_trigrams;
248
249        let query_trigrams = extract_normalized_trigrams(query);
250        if query_trigrams.is_empty() {
251            None
252        } else {
253            Some(query_trigrams)
254        }
255    }
256
257    fn select_candidates(
258        &self,
259        overlap_counts: &[(usize, usize)],
260        query_trigram_count: f64,
261        query_trigram_len: usize,
262        use_jaccard: bool,
263        telemetry: &mut FuzzyTelemetry,
264    ) -> Vec<usize> {
265        let mut candidates =
266            Vec::with_capacity(self.config.max_candidates.min(overlap_counts.len()));
267        let symbol_trigram_counts = &self.trigram_index.symbol_trigram_counts;
268
269        for &(entry_id, overlap) in overlap_counts {
270            let (similarity, jaccard_applied) = compute_similarity(
271                use_jaccard,
272                entry_id,
273                overlap,
274                query_trigram_count,
275                query_trigram_len,
276                symbol_trigram_counts,
277            );
278            telemetry.record_similarity(similarity, jaccard_applied);
279            if similarity < self.config.min_similarity {
280                telemetry.mark_dropped();
281                break; // Since sorted by overlap, no more candidates will pass
282            }
283
284            candidates.push(entry_id);
285
286            if candidates.len() >= self.config.max_candidates {
287                break;
288            }
289        }
290
291        candidates
292    }
293}
294
295#[cfg(test)]
296mod tests {
297    use super::*;
298
299    fn create_test_index() -> TrigramIndex {
300        let mut index = TrigramIndex::new();
301        index.add_symbol(0, "hello_world");
302        index.add_symbol(1, "hello_rust");
303        index.add_symbol(2, "hello");
304        index.add_symbol(3, "world");
305        index.add_symbol(4, "goodbye");
306        index
307    }
308
309    #[test]
310    fn test_candidate_generation_basic() {
311        let index = create_test_index();
312        let generator = CandidateGenerator::new(Arc::new(index));
313
314        let candidates = generator.generate("hello");
315        assert!(!candidates.is_empty());
316        assert!(candidates.contains(&0)); // hello_world
317        assert!(candidates.contains(&1)); // hello_rust
318        assert!(candidates.contains(&2)); // hello
319    }
320
321    #[test]
322    fn test_candidate_cap_enforced() {
323        let index = create_test_index();
324        let config = FuzzyConfig {
325            max_candidates: 2,
326            min_similarity: 0.0,
327        };
328        let generator = CandidateGenerator::with_config(Arc::new(index), config);
329
330        let candidates = generator.generate("hello");
331        assert!(candidates.len() <= 2, "Should cap at 2 candidates");
332    }
333
334    #[test]
335    fn test_similarity_threshold() {
336        let index = create_test_index();
337        let config = FuzzyConfig {
338            max_candidates: 1000,
339            min_similarity: 0.9, // Very high threshold
340        };
341        let generator = CandidateGenerator::with_config(Arc::new(index), config);
342
343        let candidates = generator.generate("hello");
344        // With high threshold, should only get exact or very close matches
345        assert!(candidates.len() <= 3);
346    }
347
348    #[test]
349    fn test_empty_query() {
350        let index = create_test_index();
351        let generator = CandidateGenerator::new(Arc::new(index));
352
353        let candidates = generator.generate("");
354        assert_eq!(candidates.len(), 0);
355    }
356
357    #[test]
358    fn test_no_matches() {
359        let index = create_test_index();
360        let generator = CandidateGenerator::new(Arc::new(index));
361
362        let candidates = generator.generate("xyz123");
363        assert_eq!(candidates.len(), 0);
364    }
365
366    #[test]
367    fn test_symbol_count() {
368        let index = create_test_index();
369        let generator = CandidateGenerator::new(Arc::new(index));
370
371        assert_eq!(generator.symbol_count(), 5);
372    }
373
374    #[test]
375    fn test_candidates_sorted_by_relevance() {
376        let mut index = TrigramIndex::new();
377        index.add_symbol(0, "test");
378        index.add_symbol(1, "testing");
379        index.add_symbol(2, "test_function");
380
381        let generator = CandidateGenerator::new(Arc::new(index));
382        let candidates = generator.generate("test");
383
384        // First candidate should have highest overlap
385        // "test" has all trigrams matching
386        assert_eq!(candidates[0], 0);
387    }
388
389    #[test]
390    fn test_jaccard_similarity_exact_match() {
391        let mut index = TrigramIndex::new();
392        index.add_symbol(0, "hello");
393        let generator = CandidateGenerator::new(Arc::new(index));
394
395        // Exact match should have Jaccard = 1.0
396        // Both query and symbol have same trigrams: ["hel", "ell", "llo"]
397        // overlap = 3, union = 3 + 3 - 3 = 3, jaccard = 3/3 = 1.0
398        let candidates = generator.generate("hello");
399
400        assert_eq!(candidates.len(), 1);
401        assert_eq!(candidates[0], 0);
402    }
403
404    #[test]
405    fn test_jaccard_similarity_partial_overlap() {
406        let mut index = TrigramIndex::new();
407        // "hello" has trigrams: ["hel", "ell", "llo"]
408        index.add_symbol(0, "hello");
409        // "help" has trigrams: ["hel", "elp"]
410        index.add_symbol(1, "help");
411
412        let generator = CandidateGenerator::new(Arc::new(index));
413
414        // Query "hel" has trigram: ["hel"]
415        // Node 0 ("hello"): overlap=1, union=1+3-1=3, jaccard=1/3=0.33
416        // Node 1 ("help"): overlap=1, union=1+2-1=2, jaccard=1/2=0.5
417        let candidates = generator.generate("hel");
418
419        // Both should be candidates (above default 0.1 threshold)
420        assert_eq!(candidates.len(), 2);
421    }
422
423    #[test]
424    fn test_jaccard_vs_ratio_difference() {
425        let mut index = TrigramIndex::new();
426        // Short symbol with high overlap
427        index.add_symbol(0, "test");
428        // Long symbol with same overlap
429        index.add_symbol(1, "testing_function_with_test");
430
431        let generator = CandidateGenerator::new(Arc::new(index));
432
433        // Query "test" has trigrams: ["tes", "est"]
434        // Node 0 ("test"): overlap=2, |S|=2, union=2+2-2=2, jaccard=2/2=1.0
435        // Node 1 ("testing_function_with_test"): overlap=2, |S|=many, jaccard < 1.0
436        // Jaccard properly penalizes the long symbol despite high overlap
437        let candidates = generator.generate("test");
438
439        // First candidate should be the exact match
440        assert_eq!(candidates[0], 0);
441    }
442
443    #[test]
444    fn test_fallback_to_ratio_when_counts_missing() {
445        // Create index with empty counts (simulating old format)
446        let mut index = TrigramIndex::new();
447        index.add_symbol(0, "hello");
448        index.add_symbol(1, "world");
449
450        // Manually clear counts to simulate old index
451        let index_no_counts = TrigramIndex {
452            postings: index.postings.clone(),
453            symbol_lengths: index.symbol_lengths.clone(),
454            symbol_trigram_counts: Vec::new(), // Empty counts
455            symbol_count: index.symbol_count,
456        };
457
458        let generator = CandidateGenerator::new(Arc::new(index_no_counts));
459
460        // Should still work using fallback ratio method
461        let candidates = generator.generate("hello");
462        assert!(!candidates.is_empty());
463        assert!(candidates.contains(&0));
464    }
465
466    #[test]
467    fn test_jaccard_computation_correctness() {
468        use crate::search::trigram::extract_normalized_trigrams;
469
470        let mut index = TrigramIndex::new();
471        index.add_symbol(0, "context");
472        index.add_symbol(1, "content");
473
474        // Manually verify Jaccard computation
475        let query = "conte";
476        let _query_trigrams = extract_normalized_trigrams(query);
477
478        // "context" trigrams: ["con", "ont", "nte", "ext", "xte"] = 5
479        // overlap with query: 3, union: 3 + 5 - 3 = 5, jaccard: 3/5 = 0.6
480
481        // "content" trigrams: ["con", "ont", "nte", "ent"] = 4
482        // overlap with query: 3, union: 3 + 4 - 3 = 4, jaccard: 3/4 = 0.75
483
484        let config = FuzzyConfig {
485            max_candidates: 10,
486            min_similarity: 0.5, // Both should pass this threshold
487        };
488        let generator = CandidateGenerator::with_config(Arc::new(index), config);
489        let candidates = generator.generate(query);
490
491        // Both symbols should pass the 0.5 threshold:
492        // "context": jaccard = 3/5 = 0.6 ✓
493        // "content": jaccard = 3/4 = 0.75 ✓
494        assert_eq!(candidates.len(), 2);
495
496        // Candidates are sorted by overlap count (both have 3), then by entry_id
497        // So we expect: [0 (context), 1 (content)]
498        assert!(candidates.contains(&0)); // context
499        assert!(candidates.contains(&1)); // content
500    }
501
502    #[test]
503    fn test_jaccard_with_high_threshold() {
504        let mut index = TrigramIndex::new();
505        index.add_symbol(0, "hello");
506        index.add_symbol(1, "helloworld");
507        index.add_symbol(2, "help");
508
509        let config = FuzzyConfig {
510            max_candidates: 10,
511            min_similarity: 0.8, // High threshold
512        };
513        let generator = CandidateGenerator::with_config(Arc::new(index), config);
514
515        let candidates = generator.generate("hello");
516
517        // Only exact or very close matches should pass
518        // "hello" itself should definitely be included
519        assert!(candidates.contains(&0));
520    }
521
522    #[test]
523    fn test_env_var_toggle_disables_jaccard() {
524        // Set env var to disable Jaccard
525        unsafe {
526            std::env::set_var("SQRY_FUZZY_USE_JACCARD", "0");
527        }
528
529        let mut index = TrigramIndex::new();
530        index.add_symbol(0, "context");
531        index.add_symbol(1, "content");
532
533        let config = FuzzyConfig {
534            max_candidates: 10,
535            min_similarity: 0.5,
536        };
537        let generator = CandidateGenerator::with_config(Arc::new(index), config);
538        let candidates = generator.generate("conte");
539
540        // With ratio mode (disabled Jaccard), both should still pass
541        // "context": ratio = 3/3 = 1.0 ✓
542        // "content": ratio = 3/3 = 1.0 ✓
543        assert_eq!(candidates.len(), 2);
544
545        // Clean up
546        unsafe {
547            std::env::remove_var("SQRY_FUZZY_USE_JACCARD");
548        }
549    }
550
551    #[test]
552    fn test_zero_union_guard() {
553        let mut index = TrigramIndex::new();
554        // Edge case: very short strings
555        index.add_symbol(0, "a");
556        index.add_symbol(1, "b");
557
558        let generator = CandidateGenerator::new(Arc::new(index));
559
560        // Query with no overlap should handle union=0 gracefully
561        let candidates = generator.generate("c");
562        // Should return empty or handle gracefully without panic
563        assert!(candidates.is_empty() || !candidates.is_empty());
564    }
565}