use crate::regex_ast::Ast;
use crate::trigram;
const REPEAT_INLINE_CAP_MULTI: u32 = 2;
const REPEAT_INLINE_CAP_SINGLE: u32 = 3;
#[must_use]
pub fn required_trigrams(ast: &Ast) -> Vec<[u8; 3]> {
let mut state = LinState::new();
state.append(ast);
state.flush();
state.required.sort_unstable();
state.required.dedup();
state.required
}
#[must_use]
pub fn required_trigram_hashes(ast: &Ast) -> Vec<u64> {
let mut hashes: Vec<u64> = required_trigrams(ast)
.iter()
.map(|t| trigram::hash_trigram(t))
.collect();
hashes.sort_unstable();
hashes.dedup();
hashes
}
#[must_use]
pub fn extract_literal_runs(ast: &Ast) -> Vec<Vec<u8>> {
let mut state = RunState::new();
state.append(ast);
state.flush();
state.runs
}
#[must_use]
pub fn has_top_level_start_anchor(ast: &Ast) -> bool {
match ast {
Ast::Anchor => true,
Ast::Concat(parts) => parts.first().is_some_and(has_top_level_start_anchor),
_ => false,
}
}
#[must_use]
pub fn anchored_prefix(ast: &Ast) -> Option<Vec<u8>> {
let Ast::Concat(parts) = ast else {
return None;
};
let mut iter = parts.iter();
match iter.next()? {
Ast::Anchor => {}
_ => return None,
}
let mut prefix = Vec::new();
for p in iter {
match p {
Ast::Literal(bytes) => prefix.extend_from_slice(bytes),
_ => break,
}
}
if prefix.is_empty() {
None
} else {
Some(prefix)
}
}
struct LinState {
run: Vec<u8>,
required: Vec<[u8; 3]>,
}
impl LinState {
fn new() -> Self {
Self {
run: Vec::new(),
required: Vec::new(),
}
}
fn flush(&mut self) {
for w in self.run.windows(trigram::TRIGRAM_LEN) {
let t: [u8; 3] = [w[0], w[1], w[2]];
if !self.required.contains(&t) {
self.required.push(t);
}
}
self.run.clear();
}
fn append(&mut self, ast: &Ast) {
match ast {
Ast::Empty => {}
Ast::Literal(bytes) => self.run.extend_from_slice(bytes),
Ast::AnyChar | Ast::Anchor => self.flush(),
Ast::Concat(parts) => self.append_concat(parts),
Ast::Alt(branches) => self.append_alt(branches, &[], &[]),
Ast::Repeat { sub, min, max: _ } => self.append_repeat(sub, *min),
}
}
fn append_concat(&mut self, parts: &[Ast]) {
let n = parts.len();
let mut i = 0;
while i < n {
let part = &parts[i];
if let Ast::Alt(branches) = part {
let prefix = self.run.clone();
let suffix = next_literal_prefix(parts, i + 1);
self.append_alt(branches, &prefix, &suffix);
i += 1;
continue;
}
self.append(part);
i += 1;
}
}
fn append_alt(&mut self, branches: &[Ast], context_prefix: &[u8], context_suffix: &[u8]) {
self.flush();
let Some((first, rest)) = branches.split_first() else {
return;
};
let mut common: Vec<[u8; 3]> = branch_trigrams(first, context_prefix, context_suffix);
for b in rest {
if common.is_empty() {
return;
}
let other = branch_trigrams(b, context_prefix, context_suffix);
common.retain(|t| other.contains(t));
}
for t in common {
if !self.required.contains(&t) {
self.required.push(t);
}
}
}
fn append_repeat(&mut self, sub: &Ast, min: u32) {
if min == 0 {
self.flush();
return;
}
let cap = if let Ast::Literal(bytes) = sub {
if bytes.len() == 1 {
REPEAT_INLINE_CAP_SINGLE
} else {
REPEAT_INLINE_CAP_MULTI
}
} else {
REPEAT_INLINE_CAP_MULTI
};
let copies = min.min(cap);
for _ in 0..copies {
self.append(sub);
}
}
}
struct RunState {
run: Vec<u8>,
runs: Vec<Vec<u8>>,
}
impl RunState {
fn new() -> Self {
Self {
run: Vec::new(),
runs: Vec::new(),
}
}
fn flush(&mut self) {
if !self.run.is_empty() {
self.runs.push(std::mem::take(&mut self.run));
}
}
fn append(&mut self, ast: &Ast) {
match ast {
Ast::Empty => {}
Ast::Literal(bytes) => self.run.extend_from_slice(bytes),
Ast::AnyChar | Ast::Anchor => self.flush(),
Ast::Concat(parts) => {
for p in parts {
self.append(p);
}
}
Ast::Alt(branches) => {
self.flush();
let Some((first, rest)) = branches.split_first() else {
return;
};
let first_runs = extract_literal_runs(first);
let mut common: Vec<Vec<u8>> = first_runs;
for b in rest {
if common.is_empty() {
return;
}
let other = extract_literal_runs(b);
common.retain(|r| other.iter().any(|o| o == r));
}
self.runs.extend(common);
}
Ast::Repeat { sub, min, max: _ } => {
if *min == 0 {
self.flush();
return;
}
let cap = if let Ast::Literal(bytes) = sub.as_ref() {
if bytes.len() == 1 {
REPEAT_INLINE_CAP_SINGLE
} else {
REPEAT_INLINE_CAP_MULTI
}
} else {
REPEAT_INLINE_CAP_MULTI
};
let copies = (*min).min(cap);
for _ in 0..copies {
self.append(sub);
}
}
}
}
}
fn next_literal_prefix(parts: &[Ast], start: usize) -> Vec<u8> {
let mut out = Vec::new();
for p in parts.iter().skip(start) {
match p {
Ast::Literal(bytes) => out.extend_from_slice(bytes),
_ => break,
}
}
out
}
fn branch_trigrams(branch: &Ast, prefix: &[u8], suffix: &[u8]) -> Vec<[u8; 3]> {
let mut sub = LinState::new();
sub.run.extend_from_slice(prefix);
sub.append(branch);
sub.run.extend_from_slice(suffix);
sub.flush();
sub.required
}
#[cfg(test)]
mod tests {
use super::*;
use crate::regex_ast::parse;
fn extract(pattern: &str) -> Vec<[u8; 3]> {
let ast = parse(pattern).expect("parses");
required_trigrams(&ast)
}
fn runs(pattern: &str) -> Vec<Vec<u8>> {
let ast = parse(pattern).expect("parses");
extract_literal_runs(&ast)
}
#[test]
fn empty_pattern_yields_no_trigrams() {
assert!(extract("").is_empty());
}
#[test]
fn literal_yields_its_trigrams() {
let tris = extract("abcde");
assert_eq!(tris.len(), 3);
assert!(tris.contains(b"abc"));
assert!(tris.contains(b"bcd"));
assert!(tris.contains(b"cde"));
}
#[test]
fn dot_breaks_the_run() {
let tris = extract("ab.cd");
assert!(tris.is_empty());
}
#[test]
fn anchors_break_the_run() {
let tris = extract("ab^cd");
assert!(tris.is_empty());
}
#[test]
fn repeated_literal_inlines_twice() {
let tris = extract("(ab){2,}");
assert!(tris.contains(b"aba"));
assert!(tris.contains(b"bab"));
}
#[test]
fn optional_group_yields_empty() {
let tris = extract("(foo)?");
assert!(tris.is_empty());
}
#[test]
fn single_byte_repeat_min_three_yields_homogeneous_trigram() {
let tris = extract("a{3,}");
assert!(tris.contains(b"aaa"), "expected 'aaa' in {tris:?}");
}
#[test]
fn alternation_with_common_suffix_extracts_suffix_trigrams() {
let tris = extract("(foo|bar)baz");
assert!(tris.contains(b"baz"), "expected 'baz' in {tris:?}");
}
#[test]
fn alternation_with_common_prefix_extracts_prefix_trigrams() {
let tris = extract("pre(foo|bar)");
assert!(tris.contains(b"pre"), "expected 'pre' in {tris:?}");
}
#[test]
fn extract_literal_runs_single_literal() {
let r = runs("hello");
assert_eq!(r, vec![b"hello".to_vec()]);
}
#[test]
fn extract_literal_runs_split_by_dot() {
let r = runs("foo.bar");
assert_eq!(r, vec![b"foo".to_vec(), b"bar".to_vec()]);
}
#[test]
fn extract_literal_runs_split_by_class() {
let r = runs(r"abc\w+def");
assert_eq!(r, vec![b"abc".to_vec(), b"def".to_vec()]);
}
#[test]
fn extract_literal_runs_anchor_does_not_emit_runs() {
let r = runs("^abc$");
assert_eq!(r, vec![b"abc".to_vec()]);
}
#[test]
fn extract_literal_runs_optional_group_drops_run() {
let r = runs("(abc)?def");
assert_eq!(r, vec![b"def".to_vec()]);
}
#[test]
fn anchored_prefix_returns_prefix_after_caret() {
let ast = parse("^errno: ").expect("parses");
assert_eq!(anchored_prefix(&ast), Some(b"errno: ".to_vec()));
}
#[test]
fn anchored_prefix_none_for_unanchored() {
let ast = parse("errno: ").expect("parses");
assert!(anchored_prefix(&ast).is_none());
}
#[test]
fn anchored_prefix_none_when_anchor_followed_by_non_literal() {
let ast = parse(r"^\w+abc").expect("parses");
assert!(anchored_prefix(&ast).is_none());
}
#[test]
fn has_top_level_start_anchor_detects_caret() {
let ast = parse("^abc").expect("parses");
assert!(has_top_level_start_anchor(&ast));
let ast = parse("abc").expect("parses");
assert!(!has_top_level_start_anchor(&ast));
}
}