use core::{
fmt::{self, Debug},
hash::Hash,
mem,
};
use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder, MatchKind};
use memchr::{memchr, memchr2, memchr3, memmem};
use crate::sparse::SparseSet;
fn build_aho_corasick(parsed: &Parsed<I>) -> Option<AhoCorasick<u32>> {
if parsed.reprs.len() != 1 {
return None;
}
return None;
}
#[unconst]
#[derive_const(Clone)]
#[derive(Debug)]
pub struct LiteralSearcher<I: ~const Integral> {
complete: bool,
lcp: Memmem,
lcs: Memmem,
matcher: Matcher<I>,
}
#[unconst]
#[derive_const(Clone)]
#[derive(Debug)]
enum Matcher<I: ~const Integral + ~const Hash> {
Empty,
Seq(SeqSet<I>),
Memmem(Memmem),
AC {
ac: AhoCorasick<u32>,
lits: Vec<Literal<I>>,
},
Packed {
s: packed::Searcher,
lits: Vec<Literal<I>>,
},
}
#[unconst]
impl<I: ~const Integral> LiteralSearcher<I> {
pub fn empty() -> Self {
Self::new(Literals::empty(), Matcher::Empty)
}
pub fn prefixes(lits: Literals<I>) -> Self {
let matcher = Matcher::prefixes(&lits);
Self::new(lits, matcher)
}
pub fn suffixes(lits: Literals<I>) -> Self {
let matcher = Matcher::suffixes(&lits);
Self::new(lits, matcher)
}
fn new(lits: Literals<I>, matcher: Matcher<I>) -> Self {
let complete = lits.all_complete();
LiteralSearcher {
complete,
lcp: Memmem::new(lits.longest_common_prefix()),
lcs: Memmem::new(lits.longest_common_suffix()),
matcher,
}
}
pub fn complete(&self) -> bool {
self.complete && !self.is_empty()
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn find(&self, context: &[I]) -> Option<(usize, usize)> {
use self::Matcher::*;
match self.matcher {
Empty => Some((0, 0)),
Seq(ref sset) => sset.find(context).map(|i| (i, i + 1)),
Memmem(ref s) => s.find(context).map(|i| (i, i + s.len())),
AC { ref ac, .. } => ac.find(context).map(|m| (m.start(), m.end())),
Packed { ref s, .. } => s.find(context).map(|m| (m.start(), m.end())),
}
}
pub fn find_start(&self, context: &Context<I>) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > context.len() {
continue;
}
if lit == &context[0..lit.len()] {
return Some((0, lit.len()));
}
}
None
}
pub fn find_end(&self, context: &Context<I>) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > context.len() {
continue;
}
if lit == &context[context.len() - lit.len()..] {
return Some((context.len() - lit.len(), context.len()));
}
}
None
}
pub fn iter(&self) -> LiteralIter<'_, I> {
match self.matcher {
Matcher::Empty => LiteralIter::Empty,
Matcher::Seq(ref sset) => LiteralIter::Seq(&sset.dense),
Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
}
}
pub fn lcp(&self) -> &Memmem {
&self.lcp
}
pub fn lcs(&self) -> &Memmem {
&self.lcs
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn len(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Seq(ref sset) => sset.dense.len(),
Memmem(_) => 1,
AC { ref ac, .. } => ac.pattern_count(),
Packed { ref lits, .. } => lits.len(),
}
}
pub fn approximate_size(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Seq(ref sset) => sset.approximate_size(),
Memmem(ref single) => single.approximate_size(),
AC { ref ac, .. } => ac.heap_bytes(),
Packed { ref s, .. } => s.heap_bytes(),
}
}
}
#[unconst]
impl<I: ~const Integral> Matcher<I> {
fn prefixes(lits: &Literals<I>) -> Self {
let sset = SeqSet::prefixes(lits);
Matcher::new(lits, sset)
}
fn suffixes(lits: &Literals<I>) -> Self {
let sset = SeqSet::suffixes(lits);
Matcher::new(lits, sset)
}
fn new(lits: &Literals<I>, sset: SeqSet<I>) -> Self {
if lits.literals().is_empty() {
return Matcher::Empty;
}
if sset.dense.len() >= 26 {
return Matcher::Empty;
}
if sset.complete {
return Matcher::Seq(sset);
}
if lits.literals().len() == 1 {
return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
}
let pats = lits.literals().to_owned();
let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
if lits.literals().len() <= 100 && !is_aho_corasick_fast {
let mut builder = packed::Config::new()
.match_kind(packed::MatchKind::LeftmostFirst)
.builder();
if let Some(s) = builder.extend(&pats).build() {
return Matcher::Packed { s, lits: pats };
}
}
let ac = AhoCorasickBuilder::new()
.match_kind(aho_corasick::MatchKind::LeftmostFirst)
.dfa(true)
.build_with_size::<u32, _, _>(&pats)
.unwrap();
Matcher::AC { ac, lits: pats }
}
}
#[unconst]
#[derive(Debug)]
pub enum LiteralIter<'a, I: ~const Integral> {
Empty,
Seq(&'a [I]),
Single(&'a [I]),
AC(&'a [Literal<I>]),
Packed(&'a [Literal<I>]),
}
#[unconst]
impl<'a, I: ~const Integral> const Iterator for LiteralIter<'a, I> {
type Item = &'a [I];
fn next(&mut self) -> Option<Self::Item> {
match *self {
LiteralIter::Empty => None,
LiteralIter::Seq(ref mut many) => {
if many.is_empty() {
None
} else {
let next = &many[0..1];
*many = &many[1..];
Some(next)
}
}
LiteralIter::Single(ref mut one) => {
if one.is_empty() {
None
} else {
let next = &one[..];
*one = &[];
Some(next)
}
}
LiteralIter::AC(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
LiteralIter::Packed(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
}
}
}
#[unconst]
#[derive(Clone, Debug)]
struct SeqSet<I: ~const Integral + ~const Hash> {
sparse: Vec<bool>,
dense: Vec<I>,
complete: bool,
all_ascii: bool,
}
#[unconst]
impl<I: ~const Integral> SeqSet<I> {
fn new() -> SeqSet<I> {
SeqSet {
sparse: vec![false; 256],
dense: vec![],
complete: true,
all_ascii: true,
}
}
fn prefixes(lits: &Literals<I>) -> SeqSet<I> {
let mut sset = SeqSet::new();
for lit in lits.literals() {
sset.complete = sset.complete && lit.len() == 1;
if let Some(&b) = lit.get(0) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
}
sset.dense.push(b);
sset.sparse[b as usize] = true;
}
}
}
sset
}
fn suffixes(lits: &Literals<I>) -> SeqSet<I> {
let mut sset = SeqSet::new();
for lit in lits.literals() {
sset.complete = sset.complete && lit.len() == 1;
if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
}
sset.dense.push(b);
sset.sparse[b as usize] = true;
}
}
}
sset
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find(&self, context: &[I]) -> Option<usize> {
match self.dense.len() {
0 => None,
1 => memchr(self.dense[0], context),
2 => memchr2(self.dense[0], self.dense[1], context),
3 => memchr3(self.dense[0], self.dense[1], self.dense[2], context),
_ => self._find(context),
}
}
fn _find(&self, context: &[I]) -> Option<usize> {
for (i, &b) in context.iter().enumerate() {
if self.sparse[b as usize] {
return Some(i);
}
}
None
}
fn approximate_size(&self) -> usize {
(self.dense.len() * mem::size_of::<u8>()) + (self.sparse.len() * mem::size_of::<bool>())
}
}
#[derive(Clone, Debug)]
pub struct Memmem {
finder: Finder<'static>,
char_len: usize,
}
#[derive(Clone, Debug)]
pub struct Finder<'n> {
searcher: Searcher<'n>,
}
impl<'n> Finder<'n> {
#[inline]
pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> {
FinderBuilder::new().build_forward(needle)
}
pub fn find(&self, haystack: &[u8]) -> Option<usize> {
self.searcher
.find(&mut self.searcher.prefilter_state(), haystack)
}
#[inline]
pub fn find_iter<'a, 'h>(&'a self, haystack: &'h [u8]) -> FindIter<'h, 'a> {
FindIter::new(haystack, self.as_ref())
}
#[cfg(feature = "std")]
#[inline]
pub fn into_owned(self) -> Finder<'static> {
Finder {
searcher: self.searcher.into_owned(),
}
}
#[inline]
pub fn as_ref(&self) -> Finder<'_> {
Finder {
searcher: self.searcher.as_ref(),
}
}
#[inline]
pub fn needle(&self) -> &[u8] {
self.searcher.needle()
}
}
#[derive(Clone, Debug)]
struct Searcher<'n> {
needle: CowBytes<'n>,
ninfo: NeedleInfo,
prefn: Option<PrefilterFn>,
kind: SearcherKind,
}
impl<'n> Searcher<'n> {
fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
use self::SearcherKind::*;
let ninfo = NeedleInfo::new(needle);
let mk = |kind: SearcherKind| {
let prefn = prefilter::forward(&config.prefilter, &ninfo.rarebytes, needle);
Searcher {
needle: CowBytes::new(needle),
ninfo,
prefn,
kind,
}
};
if needle.len() == 0 {
return mk(Empty);
}
if needle.len() == 1 {
return mk(OneByte(needle[0]));
}
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
{
if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) {
return mk(GenericSIMD256(fwd));
} else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) {
return mk(GenericSIMD128(fwd));
}
}
#[cfg(all(target_arch = "wasm32", memchr_runtime_simd))]
{
if let Some(fwd) = wasm::Forward::new(&ninfo, needle) {
return mk(GenericSIMD128(fwd));
}
}
mk(TwoWay(twoway::Forward::new(needle)))
}
fn prefilter_state(&self) -> PrefilterState {
if self.prefn.is_none() {
PrefilterState::inert()
} else {
PrefilterState::new()
}
}
fn needle(&self) -> &[u8] {
self.needle.as_slice()
}
fn as_ref(&self) -> Searcher<'_> {
use self::SearcherKind::*;
let kind = match self.kind {
Empty => Empty,
OneByte(b) => OneByte(b),
TwoWay(tw) => TwoWay(tw),
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(gs) => GenericSIMD128(gs),
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
GenericSIMD256(gs) => GenericSIMD256(gs),
};
Searcher {
needle: CowBytes::new(self.needle()),
ninfo: self.ninfo,
prefn: self.prefn,
kind,
}
}
#[cfg(feature = "std")]
fn into_owned(self) -> Searcher<'static> {
use self::SearcherKind::*;
let kind = match self.kind {
Empty => Empty,
OneByte(b) => OneByte(b),
TwoWay(tw) => TwoWay(tw),
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(gs) => GenericSIMD128(gs),
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
GenericSIMD256(gs) => GenericSIMD256(gs),
};
Searcher {
needle: self.needle.into_owned(),
ninfo: self.ninfo,
prefn: self.prefn,
kind,
}
}
#[inline(always)]
fn find(&self, state: &mut PrefilterState, haystack: &[u8]) -> Option<usize> {
use self::SearcherKind::*;
let needle = self.needle();
if haystack.len() < needle.len() {
return None;
}
match self.kind {
Empty => Some(0),
OneByte(b) => crate::memchr(b, haystack),
TwoWay(ref tw) => {
if rabinkarp::is_fast(haystack, needle) {
rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
} else {
self.find_tw(tw, state, haystack, needle)
}
}
#[cfg(all(not(miri), memchr_runtime_simd))]
GenericSIMD128(ref gs) => {
if haystack.len() < gs.min_haystack_len() {
rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
} else {
gs.find(haystack, needle)
}
}
#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))]
GenericSIMD256(ref gs) => {
if haystack.len() < gs.min_haystack_len() {
rabinkarp::find_with(&self.ninfo.nhash, haystack, needle)
} else {
gs.find(haystack, needle)
}
}
}
}
#[inline(never)]
fn find_tw(
&self,
tw: &twoway::Forward,
state: &mut PrefilterState,
haystack: &[u8],
needle: &[u8],
) -> Option<usize> {
if let Some(prefn) = self.prefn {
if state.is_effective() {
let mut pre = Pre {
state,
prefn,
ninfo: &self.ninfo,
};
return tw.find(Some(&mut pre), haystack, needle);
}
}
tw.find(None, haystack, needle)
}
}
impl NeedleInfo {
pub(crate) fn new(needle: &[u8]) -> NeedleInfo {
NeedleInfo {
rarebytes: RareNeedleBytes::forward(needle),
nhash: NeedleHash::forward(needle),
}
}
}
impl Memmem {
fn new(context: &Context<I>) -> Memmem {
Memmem {
finder: Finder::new(context).into_owned(),
char_len: char_len_lossy(context),
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn find(&self, context: &Context<I>) -> Option<usize> {
self.finder.find(context)
}
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn is_suffix(&self, context: &Context<I>) -> bool {
if context.len() < self.len() {
return false;
}
&context[context.len() - self.len()..] == self.finder.needle()
}
pub fn len(&self) -> usize {
self.finder.needle().len()
}
pub fn char_len(&self) -> usize {
self.char_len
}
fn approximate_size(&self) -> usize {
self.finder.needle().len() * mem::size_of::<u8>()
}
}