1use super::trigram::TrigramIndex;
11use std::sync::Arc;
12
13fn use_jaccard_similarity() -> bool {
16 std::env::var("SQRY_FUZZY_USE_JACCARD")
17 .ok()
18 .and_then(|v| v.parse::<u8>().ok())
19 != Some(0) }
21
22#[inline]
23#[allow(clippy::cast_precision_loss)] fn to_f64(value: usize) -> f64 {
25 value as f64
26}
27
28#[derive(Debug, Clone)]
30pub struct FuzzyConfig {
31 pub max_candidates: usize,
33
34 pub min_similarity: f64,
37}
38
39impl Default for FuzzyConfig {
40 fn default() -> Self {
41 Self {
42 max_candidates: 1000,
43 min_similarity: 0.1,
44 }
45 }
46}
47
48pub struct CandidateGenerator {
50 trigram_index: Arc<TrigramIndex>,
52
53 config: FuzzyConfig,
55}
56
57impl CandidateGenerator {
58 #[must_use]
60 pub fn new(trigram_index: Arc<TrigramIndex>) -> Self {
61 Self {
62 trigram_index,
63 config: FuzzyConfig::default(),
64 }
65 }
66
67 #[must_use]
69 pub fn with_config(trigram_index: Arc<TrigramIndex>, config: FuzzyConfig) -> Self {
70 Self {
71 trigram_index,
72 config,
73 }
74 }
75
76 #[allow(clippy::cast_precision_loss)] #[must_use]
104 pub fn generate(&self, query: &str) -> Vec<usize> {
105 let Some(query_trigrams) = Self::extract_query_trigrams(query) else {
106 return Vec::new();
107 };
108
109 let query_trigram_count = to_f64(query_trigrams.len());
110 let use_jaccard = use_jaccard_similarity();
111 let overlap_counts = self.collect_overlap_counts(&query_trigrams);
112 let mut telemetry = FuzzyTelemetry::new(overlap_counts.len());
113
114 let candidates = self.select_candidates(
116 &overlap_counts,
117 query_trigram_count,
118 query_trigrams.len(),
119 use_jaccard,
120 &mut telemetry,
121 );
122
123 telemetry.log(query, use_jaccard, candidates.len());
125
126 candidates
127 }
128
129 #[must_use]
131 pub fn symbol_count(&self) -> usize {
132 self.trigram_index.symbol_count
133 }
134
135 #[must_use]
137 pub fn config(&self) -> &FuzzyConfig {
138 &self.config
139 }
140}
141
142struct FuzzyTelemetry {
143 initial_candidates: usize,
144 jaccard_sum: f64,
145 jaccard_count: u32,
146 fallback_count: usize,
147 dropped_count: usize,
148}
149
150impl FuzzyTelemetry {
151 fn new(initial_candidates: usize) -> Self {
152 Self {
153 initial_candidates,
154 jaccard_sum: 0.0,
155 jaccard_count: 0,
156 fallback_count: 0,
157 dropped_count: 0,
158 }
159 }
160
161 fn record_similarity(&mut self, similarity: f64, jaccard_applied: bool) {
162 if jaccard_applied {
163 self.jaccard_sum += similarity;
164 self.jaccard_count += 1;
165 } else {
166 self.fallback_count += 1;
167 }
168 }
169
170 fn mark_dropped(&mut self) {
171 self.dropped_count += 1;
172 }
173
174 fn log(&self, query: &str, use_jaccard: bool, kept: usize) {
175 log::debug!(
176 "Fuzzy candidate generation: query='{}' initial={} kept={} dropped={} jaccard_avg={:.3} fallback={} mode={}",
177 query,
178 self.initial_candidates,
179 kept,
180 self.dropped_count,
181 self.jaccard_average(),
182 self.fallback_count,
183 if use_jaccard { "jaccard" } else { "ratio" }
184 );
185
186 if self.fallback_count > 0 && use_jaccard {
187 log::debug!(
188 "Fuzzy search using fallback ratio for {} candidates (old index or missing counts)",
189 self.fallback_count
190 );
191 }
192 }
193
194 fn jaccard_average(&self) -> f64 {
195 if self.jaccard_count > 0 {
196 self.jaccard_sum / f64::from(self.jaccard_count)
197 } else {
198 0.0
199 }
200 }
201}
202
203fn compute_similarity(
204 use_jaccard: bool,
205 entry_id: usize,
206 overlap: usize,
207 query_trigram_count: f64,
208 query_trigram_len: usize,
209 symbol_trigram_counts: &[usize],
210) -> (f64, bool) {
211 if use_jaccard && entry_id < symbol_trigram_counts.len() && !symbol_trigram_counts.is_empty() {
212 let symbol_trigram_count = symbol_trigram_counts[entry_id];
213 let union = query_trigram_len + symbol_trigram_count - overlap;
214 let jaccard = if union > 0 {
215 to_f64(overlap) / to_f64(union)
216 } else {
217 0.0
218 };
219 (jaccard, true)
220 } else {
221 (to_f64(overlap) / query_trigram_count, false)
222 }
223}
224
225impl CandidateGenerator {
226 fn collect_overlap_counts(&self, query_trigrams: &[String]) -> Vec<(usize, usize)> {
227 use std::collections::HashMap;
228
229 let mut overlap_map: HashMap<usize, usize> = HashMap::new();
231 for trigram in query_trigrams {
232 if let Some(entry_ids) = self.trigram_index.postings.get(trigram) {
233 for &entry_id in entry_ids {
234 *overlap_map.entry(entry_id).or_insert(0) += 1;
235 }
236 }
237 }
238
239 let mut overlap_counts: Vec<(usize, usize)> = overlap_map.into_iter().collect();
241 overlap_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
243 overlap_counts
244 }
245
246 fn extract_query_trigrams(query: &str) -> Option<Vec<String>> {
247 use super::trigram::extract_normalized_trigrams;
248
249 let query_trigrams = extract_normalized_trigrams(query);
250 if query_trigrams.is_empty() {
251 None
252 } else {
253 Some(query_trigrams)
254 }
255 }
256
257 fn select_candidates(
258 &self,
259 overlap_counts: &[(usize, usize)],
260 query_trigram_count: f64,
261 query_trigram_len: usize,
262 use_jaccard: bool,
263 telemetry: &mut FuzzyTelemetry,
264 ) -> Vec<usize> {
265 let mut candidates =
266 Vec::with_capacity(self.config.max_candidates.min(overlap_counts.len()));
267 let symbol_trigram_counts = &self.trigram_index.symbol_trigram_counts;
268
269 for &(entry_id, overlap) in overlap_counts {
270 let (similarity, jaccard_applied) = compute_similarity(
271 use_jaccard,
272 entry_id,
273 overlap,
274 query_trigram_count,
275 query_trigram_len,
276 symbol_trigram_counts,
277 );
278 telemetry.record_similarity(similarity, jaccard_applied);
279 if similarity < self.config.min_similarity {
280 telemetry.mark_dropped();
281 break; }
283
284 candidates.push(entry_id);
285
286 if candidates.len() >= self.config.max_candidates {
287 break;
288 }
289 }
290
291 candidates
292 }
293}
294
295#[cfg(test)]
296mod tests {
297 use super::*;
298
299 fn create_test_index() -> TrigramIndex {
300 let mut index = TrigramIndex::new();
301 index.add_symbol(0, "hello_world");
302 index.add_symbol(1, "hello_rust");
303 index.add_symbol(2, "hello");
304 index.add_symbol(3, "world");
305 index.add_symbol(4, "goodbye");
306 index
307 }
308
309 #[test]
310 fn test_candidate_generation_basic() {
311 let index = create_test_index();
312 let generator = CandidateGenerator::new(Arc::new(index));
313
314 let candidates = generator.generate("hello");
315 assert!(!candidates.is_empty());
316 assert!(candidates.contains(&0)); assert!(candidates.contains(&1)); assert!(candidates.contains(&2)); }
320
321 #[test]
322 fn test_candidate_cap_enforced() {
323 let index = create_test_index();
324 let config = FuzzyConfig {
325 max_candidates: 2,
326 min_similarity: 0.0,
327 };
328 let generator = CandidateGenerator::with_config(Arc::new(index), config);
329
330 let candidates = generator.generate("hello");
331 assert!(candidates.len() <= 2, "Should cap at 2 candidates");
332 }
333
334 #[test]
335 fn test_similarity_threshold() {
336 let index = create_test_index();
337 let config = FuzzyConfig {
338 max_candidates: 1000,
339 min_similarity: 0.9, };
341 let generator = CandidateGenerator::with_config(Arc::new(index), config);
342
343 let candidates = generator.generate("hello");
344 assert!(candidates.len() <= 3);
346 }
347
348 #[test]
349 fn test_empty_query() {
350 let index = create_test_index();
351 let generator = CandidateGenerator::new(Arc::new(index));
352
353 let candidates = generator.generate("");
354 assert_eq!(candidates.len(), 0);
355 }
356
357 #[test]
358 fn test_no_matches() {
359 let index = create_test_index();
360 let generator = CandidateGenerator::new(Arc::new(index));
361
362 let candidates = generator.generate("xyz123");
363 assert_eq!(candidates.len(), 0);
364 }
365
366 #[test]
367 fn test_symbol_count() {
368 let index = create_test_index();
369 let generator = CandidateGenerator::new(Arc::new(index));
370
371 assert_eq!(generator.symbol_count(), 5);
372 }
373
374 #[test]
375 fn test_candidates_sorted_by_relevance() {
376 let mut index = TrigramIndex::new();
377 index.add_symbol(0, "test");
378 index.add_symbol(1, "testing");
379 index.add_symbol(2, "test_function");
380
381 let generator = CandidateGenerator::new(Arc::new(index));
382 let candidates = generator.generate("test");
383
384 assert_eq!(candidates[0], 0);
387 }
388
389 #[test]
390 fn test_jaccard_similarity_exact_match() {
391 let mut index = TrigramIndex::new();
392 index.add_symbol(0, "hello");
393 let generator = CandidateGenerator::new(Arc::new(index));
394
395 let candidates = generator.generate("hello");
399
400 assert_eq!(candidates.len(), 1);
401 assert_eq!(candidates[0], 0);
402 }
403
404 #[test]
405 fn test_jaccard_similarity_partial_overlap() {
406 let mut index = TrigramIndex::new();
407 index.add_symbol(0, "hello");
409 index.add_symbol(1, "help");
411
412 let generator = CandidateGenerator::new(Arc::new(index));
413
414 let candidates = generator.generate("hel");
418
419 assert_eq!(candidates.len(), 2);
421 }
422
423 #[test]
424 fn test_jaccard_vs_ratio_difference() {
425 let mut index = TrigramIndex::new();
426 index.add_symbol(0, "test");
428 index.add_symbol(1, "testing_function_with_test");
430
431 let generator = CandidateGenerator::new(Arc::new(index));
432
433 let candidates = generator.generate("test");
438
439 assert_eq!(candidates[0], 0);
441 }
442
443 #[test]
444 fn test_fallback_to_ratio_when_counts_missing() {
445 let mut index = TrigramIndex::new();
447 index.add_symbol(0, "hello");
448 index.add_symbol(1, "world");
449
450 let index_no_counts = TrigramIndex {
452 postings: index.postings.clone(),
453 symbol_lengths: index.symbol_lengths.clone(),
454 symbol_trigram_counts: Vec::new(), symbol_count: index.symbol_count,
456 };
457
458 let generator = CandidateGenerator::new(Arc::new(index_no_counts));
459
460 let candidates = generator.generate("hello");
462 assert!(!candidates.is_empty());
463 assert!(candidates.contains(&0));
464 }
465
466 #[test]
467 fn test_jaccard_computation_correctness() {
468 use crate::search::trigram::extract_normalized_trigrams;
469
470 let mut index = TrigramIndex::new();
471 index.add_symbol(0, "context");
472 index.add_symbol(1, "content");
473
474 let query = "conte";
476 let _query_trigrams = extract_normalized_trigrams(query);
477
478 let config = FuzzyConfig {
485 max_candidates: 10,
486 min_similarity: 0.5, };
488 let generator = CandidateGenerator::with_config(Arc::new(index), config);
489 let candidates = generator.generate(query);
490
491 assert_eq!(candidates.len(), 2);
495
496 assert!(candidates.contains(&0)); assert!(candidates.contains(&1)); }
501
502 #[test]
503 fn test_jaccard_with_high_threshold() {
504 let mut index = TrigramIndex::new();
505 index.add_symbol(0, "hello");
506 index.add_symbol(1, "helloworld");
507 index.add_symbol(2, "help");
508
509 let config = FuzzyConfig {
510 max_candidates: 10,
511 min_similarity: 0.8, };
513 let generator = CandidateGenerator::with_config(Arc::new(index), config);
514
515 let candidates = generator.generate("hello");
516
517 assert!(candidates.contains(&0));
520 }
521
522 #[test]
523 fn test_env_var_toggle_disables_jaccard() {
524 unsafe {
526 std::env::set_var("SQRY_FUZZY_USE_JACCARD", "0");
527 }
528
529 let mut index = TrigramIndex::new();
530 index.add_symbol(0, "context");
531 index.add_symbol(1, "content");
532
533 let config = FuzzyConfig {
534 max_candidates: 10,
535 min_similarity: 0.5,
536 };
537 let generator = CandidateGenerator::with_config(Arc::new(index), config);
538 let candidates = generator.generate("conte");
539
540 assert_eq!(candidates.len(), 2);
544
545 unsafe {
547 std::env::remove_var("SQRY_FUZZY_USE_JACCARD");
548 }
549 }
550
551 #[test]
552 fn test_zero_union_guard() {
553 let mut index = TrigramIndex::new();
554 index.add_symbol(0, "a");
556 index.add_symbol(1, "b");
557
558 let generator = CandidateGenerator::new(Arc::new(index));
559
560 let candidates = generator.generate("c");
562 assert!(candidates.is_empty() || !candidates.is_empty());
564 }
565}