use crate::core::{LuciError, Result};
use regex_automata::Anchored;
use regex_automata::dfa::Automaton as DfaAutomaton;
use regex_automata::dfa::StartKind;
use regex_automata::dfa::dense::{Builder, Config, DFA};
use regex_automata::util::primitives::StateID;
use regex_automata::util::start::Config as StartConfig;
pub(crate) struct RegexAutomaton {
dfa: DFA<Vec<u32>>,
start: StateID,
}
impl RegexAutomaton {
pub(crate) fn new(pattern: &str) -> Result<Self> {
let dfa = Builder::new()
.configure(Config::new().start_kind(StartKind::Anchored))
.build(pattern)
.map_err(|e| LuciError::InvalidQuery(format!("invalid regexp: {e}")))?;
let start = dfa
.start_state(&StartConfig::new().anchored(Anchored::Yes))
.map_err(|e| LuciError::InvalidQuery(format!("regex start state: {e}")))?;
Ok(Self { dfa, start })
}
}
impl fst::Automaton for RegexAutomaton {
type State = StateID;
fn start(&self) -> StateID {
self.start
}
fn is_match(&self, state: &StateID) -> bool {
let eoi = self.dfa.next_eoi_state(*state);
self.dfa.is_match_state(eoi)
}
fn can_match(&self, state: &StateID) -> bool {
!self.dfa.is_dead_state(*state)
}
fn accept(&self, state: &StateID, byte: u8) -> StateID {
self.dfa.next_state(*state, byte)
}
}
#[cfg(test)]
mod tests {
use super::*;
use fst::Automaton;
fn matches(aut: &RegexAutomaton, input: &str) -> bool {
let mut state = aut.start();
for &byte in input.as_bytes() {
state = aut.accept(&state, byte);
}
aut.is_match(&state)
}
#[test]
fn matches_full_string() {
let aut = RegexAutomaton::new("foo").unwrap();
assert!(matches(&aut, "foo"));
}
#[test]
fn does_not_match_substring_at_end() {
let aut = RegexAutomaton::new("foo").unwrap();
assert!(!matches(&aut, "foobar"));
}
#[test]
fn does_not_match_prefix() {
let aut = RegexAutomaton::new("foo").unwrap();
assert!(!matches(&aut, "fo"));
}
#[test]
fn matches_character_class() {
let aut = RegexAutomaton::new("c[aou]t").unwrap();
for word in &["cat", "cut", "cot"] {
assert!(matches(&aut, word), "{} should match", word);
}
for word in &["cit", "cart", "ct"] {
assert!(!matches(&aut, word), "{} should not match", word);
}
}
#[test]
fn matches_alternation() {
let aut = RegexAutomaton::new("red|blue").unwrap();
assert!(matches(&aut, "red"));
assert!(matches(&aut, "blue"));
assert!(!matches(&aut, "green"));
assert!(!matches(&aut, "redblue"));
}
#[test]
fn matches_with_quantifier() {
let aut = RegexAutomaton::new("a+b").unwrap();
assert!(matches(&aut, "ab"));
assert!(matches(&aut, "aaab"));
assert!(!matches(&aut, "b"));
assert!(!matches(&aut, "abx"));
}
#[test]
fn pruning_via_can_match_dead_state() {
let aut = RegexAutomaton::new("foo.*").unwrap();
let dead = aut.accept(&aut.start(), b'x');
assert!(!aut.can_match(&dead));
}
#[test]
fn pruning_alive_on_partial_match() {
let aut = RegexAutomaton::new("foo.*").unwrap();
let s = aut.accept(&aut.start(), b'f');
assert!(aut.can_match(&s));
}
#[test]
fn matches_dot_star() {
let aut = RegexAutomaton::new(".*").unwrap();
assert!(matches(&aut, ""));
assert!(matches(&aut, "anything"));
assert!(matches(&aut, "with spaces and stuff"));
}
#[test]
fn matches_empty_pattern_is_empty_string() {
let aut = RegexAutomaton::new("").unwrap();
assert!(matches(&aut, ""));
assert!(!matches(&aut, "x"));
}
#[test]
fn invalid_regex_returns_error() {
let result = RegexAutomaton::new("[unclosed");
assert!(result.is_err());
}
#[test]
fn unicode_byte_walk() {
let aut = RegexAutomaton::new("café").unwrap();
assert!(matches(&aut, "café"));
assert!(!matches(&aut, "cafe"));
}
}