use aho_corasick::AhoCorasick;
use regex_automata::meta;
use super::Match;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum Anchor {
Anywhere,
AtStart,
AtEnd,
Exact,
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct SmallByteSet {
bytes: [u8; 3],
n: u8,
}
impl SmallByteSet {
pub(crate) fn new(src: &[u8]) -> Self {
assert!(
!src.is_empty() && src.len() <= 3,
"SmallByteSet expects 1..=3 bytes, got {}",
src.len()
);
let mut bytes = [0_u8; 3];
bytes[..src.len()].copy_from_slice(src);
Self {
bytes,
n: u8::try_from(src.len()).expect("1..=3 bounds checked above"),
}
}
#[inline]
pub(crate) fn contains_any_in(self, hay: &[u8]) -> Option<usize> {
match self.n {
1 => memchr::memchr(self.bytes[0], hay),
2 => memchr::memchr2(self.bytes[0], self.bytes[1], hay),
3 => memchr::memchr3(self.bytes[0], self.bytes[1], self.bytes[2], hay),
_ => None,
}
}
#[inline]
pub(crate) fn n(self) -> u8 {
self.n
}
#[inline]
pub(crate) fn byte(self, i: usize) -> u8 {
self.bytes[i]
}
}
#[derive(Debug, Clone)]
pub(crate) enum ShapeOp {
ContainsByte(u8),
ByteSet(SmallByteSet),
Contains(Box<memchr::memmem::Finder<'static>>),
StartsWith(Box<[u8]>),
EndsWith(Box<[u8]>),
ExactMatch(Box<[u8]>),
}
pub(crate) enum Plan {
LiteralOnly {
ac: Box<AhoCorasick>,
anchor: Anchor,
literals: Box<[Box<[u8]>]>,
case_insensitive: bool,
},
Shape(ShapeOp),
Meta(Box<meta::Regex>),
}
impl Plan {
#[inline]
pub(crate) fn is_match(&self, hay: &[u8]) -> bool {
match self {
Self::LiteralOnly { ac, anchor, .. } => match_set_is(ac, *anchor, hay),
Self::Shape(op) => match_shape_is(op, hay),
Self::Meta(r) => r.is_match(hay),
}
}
#[inline]
pub(crate) fn find(&self, hay: &[u8]) -> Option<Match> {
match self {
Self::LiteralOnly { ac, anchor, .. } => match_set_find(ac, *anchor, hay),
Self::Shape(op) => match_shape_find(op, hay),
Self::Meta(r) => r.find(hay).map(|m| Match {
start: m.start(),
end: m.end(),
}),
}
}
pub(crate) fn collect_matches(&self, hay: &[u8], out: &mut Vec<Match>) {
match self {
Self::LiteralOnly { ac, anchor, .. } => collect_set(ac, *anchor, hay, out),
Self::Shape(op) => collect_shape(op, hay, out),
Self::Meta(r) => {
for m in r.find_iter(hay) {
out.push(Match {
start: m.start(),
end: m.end(),
});
}
}
}
}
pub(crate) fn merge_literals(&self) -> Option<(Vec<Vec<u8>>, bool)> {
match self {
Self::LiteralOnly {
anchor: Anchor::Anywhere,
literals,
case_insensitive,
..
} => Some((
literals.iter().map(|l| l.to_vec()).collect(),
*case_insensitive,
)),
Self::Shape(ShapeOp::ContainsByte(b)) => Some((vec![vec![*b]], false)),
Self::Shape(ShapeOp::ByteSet(s)) => {
let n = s.n() as usize;
Some(((0..n).map(|i| vec![s.byte(i)]).collect(), false))
}
Self::Shape(ShapeOp::Contains(f)) => Some((vec![f.needle().to_vec()], false)),
Self::LiteralOnly { .. } | Self::Shape(_) | Self::Meta(_) => None,
}
}
}
#[inline]
fn match_set_is(ac: &AhoCorasick, anchor: Anchor, hay: &[u8]) -> bool {
match anchor {
Anchor::Anywhere => ac.find(hay).is_some(),
Anchor::AtStart => ac.find(hay).is_some_and(|m| m.start() == 0),
Anchor::AtEnd => ac.find_iter(hay).any(|m| m.end() == hay.len()),
Anchor::Exact => ac
.find_iter(hay)
.any(|m| m.start() == 0 && m.end() == hay.len()),
}
}
#[inline]
fn match_set_find(ac: &AhoCorasick, anchor: Anchor, hay: &[u8]) -> Option<Match> {
match anchor {
Anchor::Anywhere => ac.find(hay).map(|m| Match {
start: m.start(),
end: m.end(),
}),
Anchor::AtStart => ac.find(hay).filter(|m| m.start() == 0).map(|m| Match {
start: m.start(),
end: m.end(),
}),
Anchor::AtEnd => ac
.find_iter(hay)
.find(|m| m.end() == hay.len())
.map(|m| Match {
start: m.start(),
end: m.end(),
}),
Anchor::Exact => ac
.find_iter(hay)
.find(|m| m.start() == 0 && m.end() == hay.len())
.map(|m| Match {
start: m.start(),
end: m.end(),
}),
}
}
#[inline]
fn match_shape_is(op: &ShapeOp, hay: &[u8]) -> bool {
match op {
ShapeOp::ContainsByte(b) => memchr::memchr(*b, hay).is_some(),
ShapeOp::ByteSet(s) => s.contains_any_in(hay).is_some(),
ShapeOp::Contains(f) => f.find(hay).is_some(),
ShapeOp::StartsWith(lit) => hay.starts_with(lit),
ShapeOp::EndsWith(lit) => hay.ends_with(lit),
ShapeOp::ExactMatch(lit) => hay == lit.as_ref(),
}
}
#[inline]
fn match_shape_find(op: &ShapeOp, hay: &[u8]) -> Option<Match> {
match op {
ShapeOp::ContainsByte(b) => memchr::memchr(*b, hay).map(|i| Match {
start: i,
end: i + 1,
}),
ShapeOp::ByteSet(s) => s.contains_any_in(hay).map(|i| Match {
start: i,
end: i + 1,
}),
ShapeOp::Contains(f) => f.find(hay).map(|i| Match {
start: i,
end: i + f.needle().len(),
}),
ShapeOp::StartsWith(lit) => hay.starts_with(lit).then_some(Match {
start: 0,
end: lit.len(),
}),
ShapeOp::EndsWith(lit) => hay.ends_with(lit).then(|| Match {
start: hay.len() - lit.len(),
end: hay.len(),
}),
ShapeOp::ExactMatch(lit) => (hay == lit.as_ref()).then_some(Match {
start: 0,
end: hay.len(),
}),
}
}
fn collect_set(ac: &AhoCorasick, anchor: Anchor, hay: &[u8], out: &mut Vec<Match>) {
for m in ac.find_iter(hay) {
let keep = match anchor {
Anchor::Anywhere => true,
Anchor::AtStart => m.start() == 0,
Anchor::AtEnd => m.end() == hay.len(),
Anchor::Exact => m.start() == 0 && m.end() == hay.len(),
};
if keep {
out.push(Match {
start: m.start(),
end: m.end(),
});
if !matches!(anchor, Anchor::Anywhere) {
break;
}
}
}
}
fn collect_shape(op: &ShapeOp, hay: &[u8], out: &mut Vec<Match>) {
match op {
ShapeOp::ContainsByte(b) => {
for i in memchr::memchr_iter(*b, hay) {
out.push(Match {
start: i,
end: i + 1,
});
}
}
ShapeOp::ByteSet(s) => match s.n() {
1 => {
for i in memchr::memchr_iter(s.byte(0), hay) {
out.push(Match {
start: i,
end: i + 1,
});
}
}
2 => {
for i in memchr::memchr2_iter(s.byte(0), s.byte(1), hay) {
out.push(Match {
start: i,
end: i + 1,
});
}
}
3 => {
for i in memchr::memchr3_iter(s.byte(0), s.byte(1), s.byte(2), hay) {
out.push(Match {
start: i,
end: i + 1,
});
}
}
_ => {}
},
ShapeOp::Contains(f) => {
for i in f.find_iter(hay) {
out.push(Match {
start: i,
end: i + f.needle().len(),
});
}
}
ShapeOp::StartsWith(_) | ShapeOp::EndsWith(_) | ShapeOp::ExactMatch(_) => {
if let Some(m) = match_shape_find(op, hay) {
out.push(m);
}
}
}
}