lucisearch 0.8.1

Embeddable, in-process search engine — the SQLite/DuckDB of search
Documentation
//! Regexp query: match terms against a regular expression.
//!
//! Compiles the pattern to a `RegexAutomaton` (DFA implementing
//! `fst::Automaton`) and uses `reader.automaton_search` for FST/DFA
//! intersection — the FST walk prunes non-matching subtrees via
//! `can_match`, replacing the previous "decode every term, then
//! `regex.is_match`" full-scan loop with O(matching subtrees) work.
//! This is the same primitive that `levenshtein_automata::DFA` uses for fuzzy matching.
//!
//! Uses **per-segment rewrite**: term enumeration happens inside
//! `scorer_supplier(reader)` for each segment, not globally at `bind()`.
//! Matched terms are unioned via [`ConstantScoreMultiTermSupplier`]
//! (shared `FilterScorer` + `BufferedUnionScorer`), matching Lucene's
//! `MultiTermQueryConstantScoreBlendedWrapper`. Every matching doc
//! receives a constant score of 1.0.
//!
//! See [[fix-wildcard-fuzzy-quadratic-dedup]],
//! [[optimization-multiterm-constant-score-rewrite]], and
//! [[optimization-regexp-wildcard-fst-automaton]].

use crate::core::{Result, ScoreMode};

use crate::query::multi_term::ConstantScoreMultiTermSupplier;
use crate::query::regex_automaton::RegexAutomaton;
use crate::query::{BoundQuery, Query, ScorerSupplier};
use crate::search::searcher::Searcher;
use crate::segment::reader::SegmentReader;

/// Regexp query on a field's term dictionary.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so queries can be logged,
/// duplicated and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RegexpQuery {
    /// Name of the field to search; it is compared against each segment's
    /// field names when a scorer supplier is requested.
    pub field: String,
    /// Regular expression source. A leading `^` anchor and a trailing `$`
    /// anchor are removed when the query is bound.
    pub pattern: String,
}

impl Query for RegexpQuery {
    /// Compiles the pattern into a [`RegexAutomaton`] and wraps it, together
    /// with the field name, in a `BoundRegexpQuery`.
    ///
    /// The searcher and score mode are not consulted here.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `RegexAutomaton::new`.
    fn bind(&self, _searcher: &Searcher, _score_mode: ScoreMode) -> Result<Box<dyn BoundQuery>> {
        // Explicit ^...$ anchors in the user pattern are dropped first: the
        // RegexAutomaton is anchored implicitly via its DFA configuration.
        let automaton = RegexAutomaton::new(strip_anchors(&self.pattern))?;
        let bound = BoundRegexpQuery {
            field: self.field.clone(),
            automaton,
        };
        Ok(Box::new(bound))
    }
}

/// Removes one leading `^` and one trailing `$` anchor from `pattern`.
///
/// The caller anchors the compiled automaton implicitly, so explicit anchors
/// are redundant there. A trailing `$` that is escaped (preceded by an odd
/// number of backslashes, as in `price\$`) is a literal dollar sign rather
/// than an anchor and is kept: stripping it would leave a dangling `\` and
/// change what the pattern means.
///
/// Returns a sub-slice of `pattern`; nothing is allocated.
fn strip_anchors(pattern: &str) -> &str {
    let body = pattern.strip_prefix('^').unwrap_or(pattern);
    match body.strip_suffix('$') {
        // An even count of backslashes before `$` means they escape each
        // other, so the `$` itself is a real anchor. `\` is ASCII, so a
        // byte-wise scan is safe on UTF-8 input.
        Some(head) if head.bytes().rev().take_while(|&b| b == b'\\').count() % 2 == 0 => head,
        _ => body,
    }
}

/// Bound regexp query — defers term enumeration to per-segment
/// `scorer_supplier(reader)` calls via `automaton_search`.
struct BoundRegexpQuery {
    /// Name of the target field; resolved to a field id against each
    /// segment's header in `scorer_supplier`.
    field: String,
    /// Compiled automaton handed to `reader.automaton_search` for every
    /// segment.
    automaton: RegexAutomaton,
}

impl BoundQuery for BoundRegexpQuery {
    /// Builds the scorer supplier for one segment.
    ///
    /// Returns `Ok(None)` when the segment does not declare the queried field
    /// or when no term in the field matches the automaton. Otherwise the
    /// matched terms are handed to a [`ConstantScoreMultiTermSupplier`].
    fn scorer_supplier(&self, reader: &SegmentReader) -> Result<Option<Box<dyn ScorerSupplier>>> {
        // Resolve the field name to this segment's field id.
        let resolved = reader.header().fields.iter().find_map(|f| {
            if f.field_name == self.field {
                Some(f.field_id)
            } else {
                None
            }
        });
        let field_id = match resolved {
            Some(id) => id,
            None => return Ok(None),
        };

        // Intersect the term FST with the DFA so the walk can skip subtrees
        // the automaton cannot match instead of visiting every term.
        let matched: Vec<(String, u32)> = reader.automaton_search(field_id, &self.automaton);

        if matched.is_empty() {
            Ok(None)
        } else {
            let supplier = ConstantScoreMultiTermSupplier::new(reader, field_id, matched);
            Ok(Some(Box::new(supplier)))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::analysis::Token;
    use crate::core::{FieldId, SegmentId};
    use crate::mapping::{FieldType, Mapping};
    use crate::segment::builder::SegmentBuilder;
    use crate::segment::reader::SegmentReader;

    /// Indexes one document per entry of `tags` into a single segment with a
    /// keyword field named `tag`, then runs `check` against a searcher over
    /// that segment.
    ///
    /// Everything the searcher borrows stays alive for the duration of
    /// `check`, so assertions belong inside the closure.
    fn with_tag_searcher(tags: &[&'static str], check: impl FnOnce(&Searcher)) {
        let schema = Mapping::builder().field("tag", FieldType::Keyword).build();
        let mut builder = SegmentBuilder::new(SegmentId::new(1), &schema);
        for tag in tags {
            builder.add_document(
                &[(FieldId::new(0), vec![Token::new(*tag, 0, tag.len(), 0)])],
                b"{}",
            );
        }
        let reader = SegmentReader::open(builder.build()).unwrap();
        let store = crate::search::segment_store::SegmentStore::new(
            vec![reader],
            crate::analysis::AnalyzerRegistry::new(),
            None,
            None,
        );
        let searcher = Searcher::new(&store);
        check(&searcher);
    }

    /// Regexp query against the `tag` field used by every test here.
    fn tag_regexp(pattern: &str) -> RegexpQuery {
        RegexpQuery {
            field: "tag".into(),
            pattern: pattern.into(),
        }
    }

    #[test]
    fn regexp_basic() {
        with_tag_searcher(
            &["technology", "technical", "tennis", "science"],
            |searcher| {
                let results = searcher
                    .search_query(&tag_regexp("tech.*"), 10, 0)
                    .unwrap();
                // technology, technical
                assert_eq!(results.total_hits.value, 2);
            },
        );
    }

    #[test]
    fn regexp_character_class() {
        with_tag_searcher(&["cat", "cut", "cot", "cart", "cit"], |searcher| {
            let results = searcher
                .search_query(&tag_regexp("c[aou]t"), 10, 0)
                .unwrap();
            // cat, cut, cot
            assert_eq!(results.total_hits.value, 3);
        });
    }

    #[test]
    fn regexp_alternation() {
        with_tag_searcher(&["red", "blue", "green"], |searcher| {
            let results = searcher
                .search_query(&tag_regexp("red|blue"), 10, 0)
                .unwrap();
            assert_eq!(results.total_hits.value, 2);
        });
    }

    #[test]
    fn regexp_constant_score_all_ones() {
        // Multi-term regexp match: every hit should score exactly 1.0,
        // matching ES's CONSTANT_SCORE_BLENDED_REWRITE behavior.
        // See [[optimization-multiterm-constant-score-rewrite]].
        with_tag_searcher(&["cat", "cut", "cot", "cart", "cit"], |searcher| {
            let results = searcher
                .search_query(&tag_regexp("c[aou]t"), 10, 0)
                .unwrap();
            // cat, cut, cot — 3 matches
            assert_eq!(results.total_hits.value, 3);
            for hit in &results.hits {
                assert_eq!(
                    hit.score, 1.0,
                    "regexp hit should have constant score 1.0, got {}",
                    hit.score
                );
            }
        });
    }

    #[test]
    fn regexp_no_matches() {
        with_tag_searcher(&["hello"], |searcher| {
            let results = searcher
                .search_query(&tag_regexp("xyz.*"), 10, 0)
                .unwrap();
            assert_eq!(results.total_hits.value, 0);
        });
    }
}