lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Fuzzy query: match terms within edit distance (Levenshtein).
//!
//! Uses the `levenshtein_automata` crate to build a DFA that accepts
//! all terms within the specified edit distance. Term enumeration is
//! done via FST/DFA intersection (`reader.automaton_search`) which
//! prunes non-matching subtrees.
//!
//! Uses **per-segment rewrite**: term enumeration happens inside
//! `scorer_supplier(reader)` for each segment, not globally at `bind()`.
//! Matched terms are unioned via [`ConstantScoreMultiTermSupplier`]
//! (shared `FilterScorer` + `BufferedUnionScorer`), matching Lucene's
//! `MultiTermQueryConstantScoreBlendedWrapper`. Every matching doc
//! receives a constant score of 1.0.
//!
//! See [[fix-wildcard-fuzzy-quadratic-dedup]],
//! [[optimization-wildcard-fuzzy-queries]], and
//! [[optimization-multiterm-constant-score-rewrite]].

use crate::core::{Result, ScoreMode};
use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder};

use crate::query::multi_term::ConstantScoreMultiTermSupplier;
use crate::query::{BoundQuery, Query, ScorerSupplier};
use crate::search::searcher::Searcher;
use crate::segment::reader::SegmentReader;

/// Fuzzy query: find terms within Levenshtein edit distance.
pub struct FuzzyQuery {
    pub field: String,
    pub value: String,
    pub fuzziness: u32,
}

impl Query for FuzzyQuery {
    fn bind(&self, _searcher: &Searcher, _score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
        // Compile the Levenshtein DFA once. No segment iteration —
        // term enumeration happens per-segment in scorer_supplier(reader).
        let max_dist = self.fuzziness.min(2) as u8;
        let builder = LevenshteinAutomatonBuilder::new(max_dist, true);
        let dfa = builder.build_dfa(&self.value);

        Ok(Box::new(BoundFuzzyQuery {
            field: self.field.clone(),
            dfa,
        }))
    }
}

/// Bound fuzzy query — defers term enumeration to per-segment
/// `scorer_supplier(reader)` calls. Matches Lucene's MultiTermQuery model.
struct BoundFuzzyQuery {
    field: String,
    dfa: DFA,
}

impl BoundQuery for BoundFuzzyQuery {
    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
        let field_id = match reader
            .header()
            .fields
            .iter()
            .find(|f| f.field_name == self.field)
            .map(|f| f.field_id)
        {
            Some(id) => id,
            None => return Ok(None),
        };

        // Enumerate THIS segment's matching terms via FST/DFA
        // intersection (prunes non-matching subtrees).
        let terms: Vec<(String, u32)> = reader.automaton_search(field_id, &self.dfa);
        if terms.is_empty() {
            return Ok(None);
        }

        Ok(Some(Box::new(ConstantScoreMultiTermSupplier::new(
            reader, field_id, terms,
        ))))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analysis::Token;
    use crate::core::{FieldId, SegmentId};
    use crate::mapping::{FieldType, Mapping};
    use crate::segment::builder::SegmentBuilder;
    use crate::segment::reader::SegmentReader;

    #[test]
    fn fuzzy_query_basic() {
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        for tag in &["technology", "technlogy", "techology", "science", "tennis"] {
            builder.add_document(
                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
                b"{}",
            );
        }
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);

        let results = searcher
            .search_query(
                &FuzzyQuery {
                    field: "tag".into(),
                    value: "technology".into(),
                    fuzziness: 1,
                },
                10,
                0,
            )
            .unwrap();
        assert_eq!(results.total_hits.value, 3); // technology (0), technlogy (1), techology (1)
    }

    #[test]
    fn fuzzy_query_fuzziness_2() {
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        for tag in &["apple", "aple", "apples", "apply", "orange"] {
            builder.add_document(
                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
                b"{}",
            );
        }
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);

        let results = searcher
            .search_query(
                &FuzzyQuery {
                    field: "tag".into(),
                    value: "apple".into(),
                    fuzziness: 2,
                },
                10,
                0,
            )
            .unwrap();
        // apple(0), aple(1), apples(1), apply(2)
        assert_eq!(results.total_hits.value, 4);
    }

    #[test]
    fn fuzzy_constant_score_all_ones() {
        // Multi-term fuzzy match: every hit should score exactly 1.0,
        // matching ES's CONSTANT_SCORE_BLENDED_REWRITE behavior.
        // See [[optimization-multiterm-constant-score-rewrite]].
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        for tag in &["cat", "bat", "hat", "dog"] {
            builder.add_document(
                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
                b"{}",
            );
        }
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);

        let results = searcher
            .search_query(
                &FuzzyQuery {
                    field: "tag".into(),
                    value: "cat".into(),
                    fuzziness: 1,
                },
                10,
                0,
            )
            .unwrap();
        // cat(0), bat(1), hat(1) — 3 matches within edit distance 1
        assert_eq!(results.total_hits.value, 3);
        for hit in &results.hits {
            assert_eq!(
                hit.score, 1.0,
                "fuzzy hit should have constant score 1.0, got {}",
                hit.score
            );
        }
    }

    #[test]
    fn fuzzy_no_matches() {
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        builder.add_document(
            &[(FieldId::new(0), vec![Token::new("hello", 0, 5, 0)])],
            b"{}",
        );
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);

        let results = searcher
            .search_query(
                &FuzzyQuery {
                    field: "tag".into(),
                    value: "xyz".into(),
                    fuzziness: 1,
                },
                10,
                0,
            )
            .unwrap();
        assert_eq!(results.total_hits.value, 0);
    }

    #[test]
    fn fuzzy_first_char_substitution() {
        // Test that fuzzy matches work even when the first character differs
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        for tag in &["cat", "bat", "hat", "dog"] {
            builder.add_document(
                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
                b"{}",
            );
        }
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);

        let results = searcher
            .search_query(
                &FuzzyQuery {
                    field: "tag".into(),
                    value: "cat".into(),
                    fuzziness: 1,
                },
                10,
                0,
            )
            .unwrap();
        // cat(0), bat(1), hat(1) — first-char substitution
        assert_eq!(results.total_hits.value, 3);
    }
}