#[cfg(feature = "simd")]
use crate::simd::Teddy;
use super::Literals;
pub enum Prefilter {
None,
SingleByte(u8),
#[allow(dead_code)]
InnerByte {
byte: u8,
max_lookback: usize,
},
Literal(memchr::memmem::Finder<'static>),
StartsWithDigit,
AhoCorasick {
ac: aho_corasick::AhoCorasick,
},
AhoCorasickFull {
ac: aho_corasick::AhoCorasick,
},
#[cfg(feature = "simd")]
Teddy(Teddy),
#[cfg(feature = "simd")]
TeddyFull {
teddy: Teddy,
lengths: Vec<usize>,
},
}
impl std::fmt::Debug for Prefilter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::None => write!(f, "Prefilter::None"),
Self::SingleByte(b) => write!(f, "Prefilter::SingleByte({:#04x})", b),
Self::InnerByte { byte, max_lookback } => {
write!(
f,
"Prefilter::InnerByte({:#04x}, lookback={})",
byte, max_lookback
)
}
Self::Literal(_) => write!(f, "Prefilter::Literal"),
Self::StartsWithDigit => write!(f, "Prefilter::StartsWithDigit"),
Self::AhoCorasick { ac } => {
write!(f, "Prefilter::AhoCorasick({} patterns)", ac.patterns_len())
}
Self::AhoCorasickFull { ac } => write!(
f,
"Prefilter::AhoCorasickFull({} patterns)",
ac.patterns_len()
),
#[cfg(feature = "simd")]
Self::Teddy(t) => write!(f, "Prefilter::Teddy({} patterns)", t.pattern_count()),
#[cfg(feature = "simd")]
Self::TeddyFull { teddy, .. } => write!(
f,
"Prefilter::TeddyFull({} patterns)",
teddy.pattern_count()
),
}
}
}
impl Prefilter {
pub fn from_literals(literals: &Literals) -> Self {
if literals.prefixes.is_empty() {
if literals.starts_with_digit {
return Self::StartsWithDigit;
}
return Self::None;
}
if literals.prefixes.len() == 1 {
let prefix = &literals.prefixes[0];
if prefix.len() == 1 {
return Self::SingleByte(prefix[0]);
}
if !prefix.is_empty() {
let finder = memchr::memmem::Finder::new(prefix).into_owned();
return Self::Literal(finder);
}
return Self::None;
}
let ac = aho_corasick::AhoCorasick::builder()
.match_kind(aho_corasick::MatchKind::LeftmostFirst)
.build(&literals.prefixes)
.expect("aho-corasick build should not fail");
if literals.prefix_complete {
return Self::AhoCorasickFull { ac };
}
Self::AhoCorasick { ac }
}
pub fn find_candidate(&self, haystack: &[u8], pos: usize) -> Option<usize> {
if pos >= haystack.len() {
return None;
}
match self {
Self::None => {
if pos < haystack.len() {
Some(pos)
} else {
None
}
}
Self::SingleByte(needle) => {
let slice = &haystack[pos..];
memchr::memchr(*needle, slice).map(|i| pos + i)
}
Self::InnerByte { byte, .. } => {
let slice = &haystack[pos..];
memchr::memchr(*byte, slice).map(|i| pos + i)
}
Self::Literal(finder) => {
let slice = &haystack[pos..];
finder.find(slice).map(|i| pos + i)
}
Self::StartsWithDigit => {
let slice = &haystack[pos..];
#[cfg(feature = "simd")]
{
crate::simd::memchr_range(b'0', b'9', slice).map(|i| pos + i)
}
#[cfg(not(feature = "simd"))]
{
slice
.iter()
.position(|&b| b >= b'0' && b <= b'9')
.map(|i| pos + i)
}
}
Self::AhoCorasick { ac } => {
let input = aho_corasick::Input::new(haystack).span(pos..haystack.len());
ac.find(input).map(|m| m.start())
}
Self::AhoCorasickFull { ac } => {
let input = aho_corasick::Input::new(haystack).span(pos..haystack.len());
ac.find(input).map(|m| m.start())
}
#[cfg(feature = "simd")]
Self::Teddy(teddy) => {
let slice = &haystack[pos..];
teddy.find(slice).map(|(_, i)| pos + i)
}
#[cfg(feature = "simd")]
Self::TeddyFull { teddy, .. } => {
let slice = &haystack[pos..];
teddy.find(slice).map(|(_, i)| pos + i)
}
}
}
pub fn find_full_match(&self, haystack: &[u8], pos: usize) -> Option<(usize, usize)> {
if pos >= haystack.len() {
return None;
}
match self {
Self::AhoCorasickFull { ac } => {
let input = aho_corasick::Input::new(haystack).span(pos..haystack.len());
ac.find(input).map(|m| (m.start(), m.end()))
}
#[cfg(feature = "simd")]
Self::TeddyFull { teddy, lengths } => {
let slice = &haystack[pos..];
if let Some((pattern_id, match_pos)) = teddy.find(slice) {
let abs_pos = pos + match_pos;
let len = lengths[pattern_id];
return Some((abs_pos, abs_pos + len));
}
None
}
_ => None,
}
}
pub fn is_full_match(&self) -> bool {
matches!(self, Self::AhoCorasickFull { .. }) || {
#[cfg(feature = "simd")]
{
matches!(self, Self::TeddyFull { .. })
}
#[cfg(not(feature = "simd"))]
{
false
}
}
}
pub fn find_candidates<'a, 'h>(&'a self, haystack: &'h [u8]) -> CandidateIter<'a, 'h> {
CandidateIter {
prefilter: self,
haystack,
pos: 0,
}
}
pub fn find_full_matches<'a, 'h>(&'a self, haystack: &'h [u8]) -> FullMatchIter<'a, 'h> {
FullMatchIter::new(self, haystack)
}
pub fn is_none(&self) -> bool {
matches!(self, Self::None)
}
pub fn is_effective(&self) -> bool {
match self {
Self::SingleByte(_) => true,
Self::Literal(_) => true,
Self::AhoCorasick { .. } => true,
Self::AhoCorasickFull { .. } => true,
#[cfg(feature = "simd")]
Self::Teddy(_) => true,
#[cfg(feature = "simd")]
Self::TeddyFull { .. } => true,
Self::None => false,
Self::StartsWithDigit => false, Self::InnerByte { .. } => false, }
}
pub fn is_inner_byte(&self) -> bool {
matches!(self, Self::InnerByte { .. })
}
pub fn inner_byte_lookback(&self) -> usize {
match self {
Self::InnerByte { max_lookback, .. } => *max_lookback,
_ => 0,
}
}
}
pub struct CandidateIter<'a, 'h> {
prefilter: &'a Prefilter,
haystack: &'h [u8],
pos: usize,
}
impl<'a, 'h> Iterator for CandidateIter<'a, 'h> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
let candidate = self.prefilter.find_candidate(self.haystack, self.pos)?;
self.pos = candidate + 1;
Some(candidate)
}
}
pub struct FullMatchIter<'a, 'h> {
inner: FullMatchIterInner<'a, 'h>,
}
enum FullMatchIterInner<'a, 'h> {
AhoCorasickFull {
ac_iter: aho_corasick::FindIter<'a, 'h>,
last_end: usize,
},
#[cfg(feature = "simd")]
TeddyFull {
teddy_iter: crate::simd::TeddyIter<'a, 'h>,
lengths: &'a [usize],
last_end: usize,
},
Empty,
}
impl<'a, 'h> FullMatchIter<'a, 'h> {
pub(crate) fn new(prefilter: &'a Prefilter, haystack: &'h [u8]) -> Self {
let inner = match prefilter {
Prefilter::AhoCorasickFull { ac } => FullMatchIterInner::AhoCorasickFull {
ac_iter: ac.find_iter(haystack),
last_end: 0,
},
#[cfg(feature = "simd")]
Prefilter::TeddyFull { teddy, lengths } => FullMatchIterInner::TeddyFull {
teddy_iter: teddy.find_iter(haystack),
lengths,
last_end: 0,
},
_ => FullMatchIterInner::Empty,
};
Self { inner }
}
}
impl<'a, 'h> Iterator for FullMatchIter<'a, 'h> {
type Item = (usize, usize);
fn next(&mut self) -> Option<Self::Item> {
match &mut self.inner {
FullMatchIterInner::AhoCorasickFull { ac_iter, last_end } => {
loop {
let m = ac_iter.next()?;
if m.start() >= *last_end {
*last_end = m.end();
return Some((m.start(), m.end()));
}
}
}
#[cfg(feature = "simd")]
FullMatchIterInner::TeddyFull {
teddy_iter,
lengths,
last_end,
} => {
loop {
let (pattern_id, pos) = teddy_iter.next()?;
if pos >= *last_end {
let len = lengths[pattern_id];
let end = pos + len;
*last_end = end;
return Some((pos, end));
}
}
}
FullMatchIterInner::Empty => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_single_byte_prefilter() {
let pf = Prefilter::SingleByte(b'x');
assert_eq!(pf.find_candidate(b"hello x world", 0), Some(6));
assert_eq!(pf.find_candidate(b"hello x world", 7), None);
}
#[test]
fn test_literal_prefilter() {
let finder = memchr::memmem::Finder::new(b"world").into_owned();
let pf = Prefilter::Literal(finder);
assert_eq!(pf.find_candidate(b"hello world", 0), Some(6));
assert_eq!(pf.find_candidate(b"hello world", 7), None);
}
#[test]
fn test_none_prefilter() {
let pf = Prefilter::None;
assert_eq!(pf.find_candidate(b"hello", 0), Some(0));
assert_eq!(pf.find_candidate(b"hello", 3), Some(3));
assert_eq!(pf.find_candidate(b"hello", 5), None);
}
#[test]
fn test_from_literals_single() {
let literals = Literals {
prefixes: vec![b"hello".to_vec()],
suffixes: vec![],
prefix_complete: true,
starts_with_digit: false,
};
let pf = Prefilter::from_literals(&literals);
assert!(matches!(pf, Prefilter::Literal(_)));
}
#[test]
fn test_from_literals_single_byte() {
let literals = Literals {
prefixes: vec![b"h".to_vec()],
suffixes: vec![],
prefix_complete: true,
starts_with_digit: false,
};
let pf = Prefilter::from_literals(&literals);
assert!(matches!(pf, Prefilter::SingleByte(b'h')));
}
#[test]
fn test_from_literals_empty() {
let literals = Literals {
prefixes: vec![],
suffixes: vec![],
prefix_complete: false,
starts_with_digit: false,
};
let pf = Prefilter::from_literals(&literals);
assert!(pf.is_none());
}
#[test]
fn test_multi_literal_prefilter() {
let literals = Literals {
prefixes: vec![b"hello".to_vec(), b"world".to_vec()],
suffixes: vec![],
prefix_complete: true,
starts_with_digit: false,
};
let pf = Prefilter::from_literals(&literals);
assert!(matches!(pf, Prefilter::AhoCorasickFull { .. }));
assert_eq!(pf.find_candidate(b"say hello there", 0), Some(4));
assert_eq!(pf.find_candidate(b"hello world", 6), Some(6));
assert_eq!(pf.find_full_match(b"say hello there", 0), Some((4, 9)));
assert_eq!(pf.find_full_match(b"hello world", 0), Some((0, 5)));
}
#[test]
fn test_multi_literal_prefilter_incomplete() {
let literals = Literals {
prefixes: vec![b"hello".to_vec(), b"world".to_vec()],
suffixes: vec![],
prefix_complete: false, starts_with_digit: false,
};
let pf = Prefilter::from_literals(&literals);
assert!(matches!(pf, Prefilter::AhoCorasick { .. }));
assert_eq!(pf.find_candidate(b"say hello there", 0), Some(4));
}
#[test]
fn test_candidate_iter() {
let pf = Prefilter::SingleByte(b'o');
let candidates: Vec<_> = pf.find_candidates(b"hello world").collect();
assert_eq!(candidates, vec![4, 7]);
}
}