use super::trigram::TrigramIndex;
use std::sync::Arc;
fn use_jaccard_similarity() -> bool {
std::env::var("SQRY_FUZZY_USE_JACCARD")
.ok()
.and_then(|v| v.parse::<u8>().ok())
!= Some(0) }
#[inline]
#[allow(clippy::cast_precision_loss)] fn to_f64(value: usize) -> f64 {
value as f64
}
#[derive(Debug, Clone)]
pub struct FuzzyConfig {
pub max_candidates: usize,
pub min_similarity: f64,
}
impl Default for FuzzyConfig {
fn default() -> Self {
Self {
max_candidates: 1000,
min_similarity: 0.1,
}
}
}
pub struct CandidateGenerator {
trigram_index: Arc<TrigramIndex>,
config: FuzzyConfig,
}
impl CandidateGenerator {
#[must_use]
pub fn new(trigram_index: Arc<TrigramIndex>) -> Self {
Self {
trigram_index,
config: FuzzyConfig::default(),
}
}
#[must_use]
pub fn with_config(trigram_index: Arc<TrigramIndex>, config: FuzzyConfig) -> Self {
Self {
trigram_index,
config,
}
}
#[allow(clippy::cast_precision_loss)] #[must_use]
pub fn generate(&self, query: &str) -> Vec<usize> {
let Some(query_trigrams) = Self::extract_query_trigrams(query) else {
return Vec::new();
};
let query_trigram_count = to_f64(query_trigrams.len());
let use_jaccard = use_jaccard_similarity();
let overlap_counts = self.collect_overlap_counts(&query_trigrams);
let mut telemetry = FuzzyTelemetry::new(overlap_counts.len());
let candidates = self.select_candidates(
&overlap_counts,
query_trigram_count,
query_trigrams.len(),
use_jaccard,
&mut telemetry,
);
telemetry.log(query, use_jaccard, candidates.len());
candidates
}
#[must_use]
pub fn symbol_count(&self) -> usize {
self.trigram_index.symbol_count
}
#[must_use]
pub fn config(&self) -> &FuzzyConfig {
&self.config
}
}
struct FuzzyTelemetry {
initial_candidates: usize,
jaccard_sum: f64,
jaccard_count: u32,
fallback_count: usize,
dropped_count: usize,
}
impl FuzzyTelemetry {
fn new(initial_candidates: usize) -> Self {
Self {
initial_candidates,
jaccard_sum: 0.0,
jaccard_count: 0,
fallback_count: 0,
dropped_count: 0,
}
}
fn record_similarity(&mut self, similarity: f64, jaccard_applied: bool) {
if jaccard_applied {
self.jaccard_sum += similarity;
self.jaccard_count += 1;
} else {
self.fallback_count += 1;
}
}
fn mark_dropped(&mut self) {
self.dropped_count += 1;
}
fn log(&self, query: &str, use_jaccard: bool, kept: usize) {
log::debug!(
"Fuzzy candidate generation: query='{}' initial={} kept={} dropped={} jaccard_avg={:.3} fallback={} mode={}",
query,
self.initial_candidates,
kept,
self.dropped_count,
self.jaccard_average(),
self.fallback_count,
if use_jaccard { "jaccard" } else { "ratio" }
);
if self.fallback_count > 0 && use_jaccard {
log::debug!(
"Fuzzy search using fallback ratio for {} candidates (old index or missing counts)",
self.fallback_count
);
}
}
fn jaccard_average(&self) -> f64 {
if self.jaccard_count > 0 {
self.jaccard_sum / f64::from(self.jaccard_count)
} else {
0.0
}
}
}
fn compute_similarity(
use_jaccard: bool,
entry_id: usize,
overlap: usize,
query_trigram_count: f64,
query_trigram_len: usize,
symbol_trigram_counts: &[usize],
) -> (f64, bool) {
if use_jaccard && entry_id < symbol_trigram_counts.len() && !symbol_trigram_counts.is_empty() {
let symbol_trigram_count = symbol_trigram_counts[entry_id];
let union = query_trigram_len + symbol_trigram_count - overlap;
let jaccard = if union > 0 {
to_f64(overlap) / to_f64(union)
} else {
0.0
};
(jaccard, true)
} else {
(to_f64(overlap) / query_trigram_count, false)
}
}
impl CandidateGenerator {
fn collect_overlap_counts(&self, query_trigrams: &[String]) -> Vec<(usize, usize)> {
use std::collections::HashMap;
let mut overlap_map: HashMap<usize, usize> = HashMap::new();
for trigram in query_trigrams {
if let Some(entry_ids) = self.trigram_index.postings.get(trigram) {
for &entry_id in entry_ids {
*overlap_map.entry(entry_id).or_insert(0) += 1;
}
}
}
let mut overlap_counts: Vec<(usize, usize)> = overlap_map.into_iter().collect();
overlap_counts.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
overlap_counts
}
fn extract_query_trigrams(query: &str) -> Option<Vec<String>> {
use super::trigram::extract_normalized_trigrams;
let query_trigrams = extract_normalized_trigrams(query);
if query_trigrams.is_empty() {
None
} else {
Some(query_trigrams)
}
}
fn select_candidates(
&self,
overlap_counts: &[(usize, usize)],
query_trigram_count: f64,
query_trigram_len: usize,
use_jaccard: bool,
telemetry: &mut FuzzyTelemetry,
) -> Vec<usize> {
let mut candidates =
Vec::with_capacity(self.config.max_candidates.min(overlap_counts.len()));
let symbol_trigram_counts = &self.trigram_index.symbol_trigram_counts;
for &(entry_id, overlap) in overlap_counts {
let (similarity, jaccard_applied) = compute_similarity(
use_jaccard,
entry_id,
overlap,
query_trigram_count,
query_trigram_len,
symbol_trigram_counts,
);
telemetry.record_similarity(similarity, jaccard_applied);
if similarity < self.config.min_similarity {
telemetry.mark_dropped();
break; }
candidates.push(entry_id);
if candidates.len() >= self.config.max_candidates {
break;
}
}
candidates
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_test_index() -> TrigramIndex {
let mut index = TrigramIndex::new();
index.add_symbol(0, "hello_world");
index.add_symbol(1, "hello_rust");
index.add_symbol(2, "hello");
index.add_symbol(3, "world");
index.add_symbol(4, "goodbye");
index
}
#[test]
fn test_candidate_generation_basic() {
let index = create_test_index();
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("hello");
assert!(!candidates.is_empty());
assert!(candidates.contains(&0)); assert!(candidates.contains(&1)); assert!(candidates.contains(&2)); }
#[test]
fn test_candidate_cap_enforced() {
let index = create_test_index();
let config = FuzzyConfig {
max_candidates: 2,
min_similarity: 0.0,
};
let generator = CandidateGenerator::with_config(Arc::new(index), config);
let candidates = generator.generate("hello");
assert!(candidates.len() <= 2, "Should cap at 2 candidates");
}
#[test]
fn test_similarity_threshold() {
let index = create_test_index();
let config = FuzzyConfig {
max_candidates: 1000,
min_similarity: 0.9, };
let generator = CandidateGenerator::with_config(Arc::new(index), config);
let candidates = generator.generate("hello");
assert!(candidates.len() <= 3);
}
#[test]
fn test_empty_query() {
let index = create_test_index();
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("");
assert_eq!(candidates.len(), 0);
}
#[test]
fn test_no_matches() {
let index = create_test_index();
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("xyz123");
assert_eq!(candidates.len(), 0);
}
#[test]
fn test_symbol_count() {
let index = create_test_index();
let generator = CandidateGenerator::new(Arc::new(index));
assert_eq!(generator.symbol_count(), 5);
}
#[test]
fn test_candidates_sorted_by_relevance() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "test");
index.add_symbol(1, "testing");
index.add_symbol(2, "test_function");
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("test");
assert_eq!(candidates[0], 0);
}
#[test]
fn test_jaccard_similarity_exact_match() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "hello");
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("hello");
assert_eq!(candidates.len(), 1);
assert_eq!(candidates[0], 0);
}
#[test]
fn test_jaccard_similarity_partial_overlap() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "hello");
index.add_symbol(1, "help");
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("hel");
assert_eq!(candidates.len(), 2);
}
#[test]
fn test_jaccard_vs_ratio_difference() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "test");
index.add_symbol(1, "testing_function_with_test");
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("test");
assert_eq!(candidates[0], 0);
}
#[test]
fn test_fallback_to_ratio_when_counts_missing() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "hello");
index.add_symbol(1, "world");
let index_no_counts = TrigramIndex {
postings: index.postings.clone(),
symbol_lengths: index.symbol_lengths.clone(),
symbol_trigram_counts: Vec::new(), symbol_count: index.symbol_count,
};
let generator = CandidateGenerator::new(Arc::new(index_no_counts));
let candidates = generator.generate("hello");
assert!(!candidates.is_empty());
assert!(candidates.contains(&0));
}
#[test]
fn test_jaccard_computation_correctness() {
use crate::search::trigram::extract_normalized_trigrams;
let mut index = TrigramIndex::new();
index.add_symbol(0, "context");
index.add_symbol(1, "content");
let query = "conte";
let _query_trigrams = extract_normalized_trigrams(query);
let config = FuzzyConfig {
max_candidates: 10,
min_similarity: 0.5, };
let generator = CandidateGenerator::with_config(Arc::new(index), config);
let candidates = generator.generate(query);
assert_eq!(candidates.len(), 2);
assert!(candidates.contains(&0)); assert!(candidates.contains(&1)); }
#[test]
fn test_jaccard_with_high_threshold() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "hello");
index.add_symbol(1, "helloworld");
index.add_symbol(2, "help");
let config = FuzzyConfig {
max_candidates: 10,
min_similarity: 0.8, };
let generator = CandidateGenerator::with_config(Arc::new(index), config);
let candidates = generator.generate("hello");
assert!(candidates.contains(&0));
}
#[test]
fn test_env_var_toggle_disables_jaccard() {
unsafe {
std::env::set_var("SQRY_FUZZY_USE_JACCARD", "0");
}
let mut index = TrigramIndex::new();
index.add_symbol(0, "context");
index.add_symbol(1, "content");
let config = FuzzyConfig {
max_candidates: 10,
min_similarity: 0.5,
};
let generator = CandidateGenerator::with_config(Arc::new(index), config);
let candidates = generator.generate("conte");
assert_eq!(candidates.len(), 2);
unsafe {
std::env::remove_var("SQRY_FUZZY_USE_JACCARD");
}
}
#[test]
fn test_zero_union_guard() {
let mut index = TrigramIndex::new();
index.add_symbol(0, "a");
index.add_symbol(1, "b");
let generator = CandidateGenerator::new(Arc::new(index));
let candidates = generator.generate("c");
assert!(candidates.is_empty() || !candidates.is_empty());
}
}