use crate::regex_ast::Ast;
use crate::trigram;
const REPEAT_INLINE_CAP: u32 = 2;
#[must_use]
pub fn required_trigrams(ast: &Ast) -> Vec<[u8; 3]> {
let mut state = LinState::new();
state.append(ast);
state.flush();
state.required.sort_unstable();
state.required.dedup();
state.required
}
#[must_use]
pub fn required_trigram_hashes(ast: &Ast) -> Vec<u64> {
let mut hashes: Vec<u64> = required_trigrams(ast)
.iter()
.map(|t| trigram::hash_trigram(t))
.collect();
hashes.sort_unstable();
hashes.dedup();
hashes
}
struct LinState {
run: Vec<u8>,
required: Vec<[u8; 3]>,
}
impl LinState {
fn new() -> Self {
Self {
run: Vec::new(),
required: Vec::new(),
}
}
fn flush(&mut self) {
for w in self.run.windows(trigram::TRIGRAM_LEN) {
let t: [u8; 3] = [w[0], w[1], w[2]];
if !self.required.contains(&t) {
self.required.push(t);
}
}
self.run.clear();
}
fn append(&mut self, ast: &Ast) {
match ast {
Ast::Empty => {}
Ast::Literal(bytes) => self.run.extend_from_slice(bytes),
Ast::AnyChar | Ast::Anchor => self.flush(),
Ast::Concat(parts) => {
for p in parts {
self.append(p);
}
}
Ast::Alt(branches) => self.append_alt(branches),
Ast::Repeat { sub, min, max: _ } => self.append_repeat(sub, *min),
}
}
fn append_alt(&mut self, branches: &[Ast]) {
self.flush();
let Some((first, rest)) = branches.split_first() else {
return;
};
let mut common: Vec<[u8; 3]> = {
let mut sub = LinState::new();
sub.append(first);
sub.flush();
sub.required
};
for b in rest {
if common.is_empty() {
return;
}
let mut sub = LinState::new();
sub.append(b);
sub.flush();
common.retain(|t| sub.required.contains(t));
}
for t in common {
if !self.required.contains(&t) {
self.required.push(t);
}
}
}
fn append_repeat(&mut self, sub: &Ast, min: u32) {
if min == 0 {
self.flush();
return;
}
let copies = min.min(REPEAT_INLINE_CAP);
for _ in 0..copies {
self.append(sub);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::regex_ast::parse;
fn extract(pattern: &str) -> Vec<[u8; 3]> {
let ast = parse(pattern).expect("parses");
required_trigrams(&ast)
}
#[test]
fn empty_pattern_yields_no_trigrams() {
assert!(extract("").is_empty());
}
#[test]
fn literal_yields_its_trigrams() {
let tris = extract("abcde");
assert_eq!(tris.len(), 3);
assert!(tris.contains(b"abc"));
assert!(tris.contains(b"bcd"));
assert!(tris.contains(b"cde"));
}
#[test]
fn dot_breaks_the_run() {
let tris = extract("ab.cd");
assert!(tris.is_empty());
}
#[test]
fn anchors_break_the_run() {
let tris = extract("ab^cd");
assert!(tris.is_empty());
}
#[test]
fn repeated_literal_inlines_twice() {
let tris = extract("(ab){2,}");
assert!(tris.contains(b"aba"));
assert!(tris.contains(b"bab"));
}
#[test]
fn optional_group_yields_empty() {
let tris = extract("(foo)?");
assert!(tris.is_empty());
}
}