lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Adapter from `regex_automata::dfa::dense::DFA` to `fst::Automaton`.
//!
//! Used by `regexp.rs` and `wildcard.rs` to drive
//! `term_dict.automaton_search` with regex DFAs. The FST walk prunes
//! non-matching subtrees via `can_match`, replacing the quadratic
//! "decode every term, then `regex.is_match`" loop with O(matching
//! subtrees) FST/DFA intersection. Same primitive `levenshtein_automata::DFA`
//! uses for fuzzy.
//!
//! See [[optimization-regexp-wildcard-fst-automaton]].

use crate::core::{LuciError, Result};
use regex_automata::Anchored;
use regex_automata::dfa::Automaton as DfaAutomaton;
use regex_automata::dfa::StartKind;
use regex_automata::dfa::dense::{Builder, Config, DFA};
use regex_automata::util::primitives::StateID;
use regex_automata::util::start::Config as StartConfig;

/// Compiled regex DFA + a cached anchored start state, adapted to
/// `fst::Automaton`. The DFA is built with `StartKind::Anchored` so
/// that matches must start at byte 0 of each FST term (left-anchored).
/// The right-anchor (`$`) is implemented in `is_match` via
/// `next_eoi_state` — a state is a match state only if signaling
/// EOI from it leads to a match state.
pub(crate) struct RegexAutomaton {
    dfa: DFA<Vec<u32>>,
    start: StateID,
}

impl RegexAutomaton {
    /// Compile a regex pattern into an FST-walkable automaton.
    /// The pattern matches the *entire* term, not a substring.
    pub(crate) fn new(pattern: &str) -> Result<Self> {
        let dfa = Builder::new()
            .configure(Config::new().start_kind(StartKind::Anchored))
            .build(pattern)
            .map_err(|e| LuciError::InvalidQuery(format!("invalid regexp: {e}")))?;
        let start = dfa
            .start_state(&StartConfig::new().anchored(Anchored::Yes))
            .map_err(|e| LuciError::InvalidQuery(format!("regex start state: {e}")))?;
        Ok(Self { dfa, start })
    }
}

impl fst::Automaton for RegexAutomaton {
    type State = StateID;

    fn start(&self) -> StateID {
        self.start
    }

    /// Called at every prefix during FST iteration. We need full-term
    /// matching, so we check the EOI state — if signaling EOI from
    /// the current state lands in a match state, the current FST
    /// prefix is a complete match.
    fn is_match(&self, state: &StateID) -> bool {
        let eoi = self.dfa.next_eoi_state(*state);
        self.dfa.is_match_state(eoi)
    }

    /// Pruning: return false if the DFA is in a dead state, meaning
    /// no extension can ever lead to a match. The FST walker uses
    /// this to skip entire subtrees.
    fn can_match(&self, state: &StateID) -> bool {
        !self.dfa.is_dead_state(*state)
    }

    fn accept(&self, state: &StateID, byte: u8) -> StateID {
        self.dfa.next_state(*state, byte)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use fst::Automaton;

    /// Walk a string through the automaton and return whether it
    /// matches at end of input.
    fn matches(aut: &RegexAutomaton, input: &str) -> bool {
        let mut state = aut.start();
        for &byte in input.as_bytes() {
            state = aut.accept(&state, byte);
        }
        aut.is_match(&state)
    }

    #[test]
    fn matches_full_string() {
        let aut = RegexAutomaton::new("foo").unwrap();
        assert!(matches(&aut, "foo"));
    }

    #[test]
    fn does_not_match_substring_at_end() {
        // Anchored regex — "foo" should not match "foobar"
        let aut = RegexAutomaton::new("foo").unwrap();
        assert!(!matches(&aut, "foobar"));
    }

    #[test]
    fn does_not_match_prefix() {
        // "foo" should not match "fo"
        let aut = RegexAutomaton::new("foo").unwrap();
        assert!(!matches(&aut, "fo"));
    }

    #[test]
    fn matches_character_class() {
        let aut = RegexAutomaton::new("c[aou]t").unwrap();
        for word in &["cat", "cut", "cot"] {
            assert!(matches(&aut, word), "{} should match", word);
        }
        for word in &["cit", "cart", "ct"] {
            assert!(!matches(&aut, word), "{} should not match", word);
        }
    }

    #[test]
    fn matches_alternation() {
        let aut = RegexAutomaton::new("red|blue").unwrap();
        assert!(matches(&aut, "red"));
        assert!(matches(&aut, "blue"));
        assert!(!matches(&aut, "green"));
        assert!(!matches(&aut, "redblue"));
    }

    #[test]
    fn matches_with_quantifier() {
        let aut = RegexAutomaton::new("a+b").unwrap();
        assert!(matches(&aut, "ab"));
        assert!(matches(&aut, "aaab"));
        assert!(!matches(&aut, "b"));
        assert!(!matches(&aut, "abx"));
    }

    #[test]
    fn pruning_via_can_match_dead_state() {
        // For "foo.*", after seeing 'x' the DFA should be dead
        // (can't match because the regex requires literal "foo" first).
        let aut = RegexAutomaton::new("foo.*").unwrap();
        let dead = aut.accept(&aut.start(), b'x');
        assert!(!aut.can_match(&dead));
    }

    #[test]
    fn pruning_alive_on_partial_match() {
        // For "foo.*", after seeing 'f' the DFA is alive (could
        // still match if next byte is 'o').
        let aut = RegexAutomaton::new("foo.*").unwrap();
        let s = aut.accept(&aut.start(), b'f');
        assert!(aut.can_match(&s));
    }

    #[test]
    fn matches_dot_star() {
        // Edge case: ".*" matches everything (including empty).
        let aut = RegexAutomaton::new(".*").unwrap();
        assert!(matches(&aut, ""));
        assert!(matches(&aut, "anything"));
        assert!(matches(&aut, "with spaces and stuff"));
    }

    #[test]
    fn matches_empty_pattern_is_empty_string() {
        // Empty pattern matches only the empty string.
        let aut = RegexAutomaton::new("").unwrap();
        assert!(matches(&aut, ""));
        assert!(!matches(&aut, "x"));
    }

    #[test]
    fn invalid_regex_returns_error() {
        let result = RegexAutomaton::new("[unclosed");
        assert!(result.is_err());
    }

    #[test]
    fn unicode_byte_walk() {
        // The fst::Automaton trait is byte-based, but the regex DFA
        // also operates byte-by-byte. Verify a multi-byte UTF-8 char
        // round-trips correctly.
        let aut = RegexAutomaton::new("café").unwrap();
        assert!(matches(&aut, "café"));
        assert!(!matches(&aut, "cafe"));
    }
}