use std::mem;
use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
use memchr::{memchr, memchr2, memchr3};
use syntax;
use freqs::BYTE_FREQUENCIES;
use simd_accel::teddy128::{Teddy, is_teddy_128_available};
/// Searches a haystack for any literal from a fixed set.
///
/// Bundles the matcher chosen for the set together with single-pattern
/// searchers for the set's longest common prefix and suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
/// Result of `all_complete()` on the literal set at construction time.
/// The public `complete()` accessor additionally requires `len() > 0`.
complete: bool,
/// Searcher built from the literal set's longest common prefix.
lcp: SingleSearch,
/// Searcher built from the literal set's longest common suffix.
lcs: SingleSearch,
/// Strategy used by `find`, `iter`, `len` and `approximate_size`.
matcher: Matcher,
}
/// The concrete search strategy selected for a literal set by `Matcher::new`.
#[derive(Clone, Debug)]
enum Matcher {
/// No literals to search for (or too many distinct single bytes);
/// `LiteralSearcher::find` reports `Some((0, 0))` for this variant.
Empty,
/// A set of single bytes; selected when every literal has length 1.
Bytes(SingleByteSet),
/// Exactly one literal.
Single(SingleSearch),
/// An Aho-Corasick automaton built over all literals.
AC(FullAcAutomaton<syntax::Lit>),
/// A Teddy matcher from `simd_accel::teddy128`, tried only when
/// `is_teddy_128_available()` is true and there are at most 32 literals.
Teddy128(Teddy),
}
impl LiteralSearcher {
pub fn empty() -> Self {
Self::new(syntax::Literals::empty(), Matcher::Empty)
}
pub fn prefixes(lits: syntax::Literals) -> Self {
let matcher = Matcher::prefixes(&lits);
Self::new(lits, matcher)
}
pub fn suffixes(lits: syntax::Literals) -> Self {
let matcher = Matcher::suffixes(&lits);
Self::new(lits, matcher)
}
fn new(lits: syntax::Literals, matcher: Matcher) -> Self {
let complete = lits.all_complete();
LiteralSearcher {
complete: complete,
lcp: SingleSearch::new(lits.longest_common_prefix().to_vec()),
lcs: SingleSearch::new(lits.longest_common_suffix().to_vec()),
matcher: matcher,
}
}
pub fn complete(&self) -> bool {
self.complete && self.len() > 0
}
#[inline(always)] pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
use self::Matcher::*;
match self.matcher {
Empty => Some((0, 0)),
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
Single(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)),
}
}
pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > haystack.len() {
continue;
}
if lit == &haystack[0..lit.len()] {
return Some((0, lit.len()));
}
}
None
}
pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
for lit in self.iter() {
if lit.len() > haystack.len() {
continue;
}
if lit == &haystack[haystack.len() - lit.len()..] {
return Some((haystack.len() - lit.len(), haystack.len()));
}
}
None
}
pub fn iter(&self) -> LiteralIter {
match self.matcher {
Matcher::Empty => LiteralIter::Empty,
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
Matcher::Single(ref s) => LiteralIter::Single(&s.pat),
Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
Matcher::Teddy128(ref ted) => {
LiteralIter::Teddy128(ted.patterns())
}
}
}
pub fn lcp(&self) -> &SingleSearch {
&self.lcp
}
pub fn lcs(&self) -> &SingleSearch {
&self.lcs
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn len(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Bytes(ref sset) => sset.dense.len(),
Single(_) => 1,
AC(ref aut) => aut.len(),
Teddy128(ref ted) => ted.len(),
}
}
pub fn approximate_size(&self) -> usize {
use self::Matcher::*;
match self.matcher {
Empty => 0,
Bytes(ref sset) => sset.approximate_size(),
Single(ref single) => single.approximate_size(),
AC(ref aut) => aut.heap_bytes(),
Teddy128(ref ted) => ted.approximate_size(),
}
}
}
impl Matcher {
    /// Chooses a matcher for `lits`, using the set of their first bytes.
    fn prefixes(lits: &syntax::Literals) -> Self {
        Matcher::new(lits, SingleByteSet::prefixes(lits))
    }

    /// Chooses a matcher for `lits`, using the set of their last bytes.
    fn suffixes(lits: &syntax::Literals) -> Self {
        Matcher::new(lits, SingleByteSet::suffixes(lits))
    }

    /// Selects the search strategy for `lits`.
    ///
    /// `sset` is the single-byte summary of the literals (first or last
    /// bytes, depending on the caller). The checks run in this order:
    /// empty set or too many distinct bytes, all single-byte literals,
    /// exactly one literal, Teddy, and finally Aho-Corasick.
    fn new(lits: &syntax::Literals, sset: SingleByteSet) -> Self {
        const MAX_TEDDY_LITERALS: usize = 32;

        let literals = lits.literals();
        // Nothing to search for, or 26+ distinct bytes (presumably too many
        // for a byte scan to pay off): fall back to the empty matcher.
        if literals.is_empty() || sset.dense.len() >= 26 {
            return Matcher::Empty;
        }
        if sset.complete {
            return Matcher::Bytes(sset);
        }
        if literals.len() == 1 {
            let only = literals[0].to_vec();
            return Matcher::Single(SingleSearch::new(only));
        }

        // When every literal shares one ASCII byte, Teddy is skipped and the
        // Aho-Corasick automaton is used directly.
        let single_ascii_byte = sset.dense.len() == 1 && sset.all_ascii;
        if is_teddy_128_available()
            && !single_ascii_byte
            && literals.len() <= MAX_TEDDY_LITERALS
        {
            if let Some(ted) = Teddy::new(lits) {
                return Matcher::Teddy128(ted);
            }
        }

        Matcher::AC(AcAutomaton::new(literals.to_owned()).into_full())
    }
}
/// Iterator over the literals of a `LiteralSearcher`, yielding each literal
/// as a byte slice. Each variant borrows the storage of the corresponding
/// `Matcher` variant.
pub enum LiteralIter<'a> {
/// No literals; iteration ends immediately.
Empty,
/// Remaining single bytes; each is yielded as a one-byte slice.
Bytes(&'a [u8]),
/// The one literal, or an empty slice once it has been yielded.
Single(&'a [u8]),
/// Remaining Aho-Corasick patterns.
AC(&'a [syntax::Lit]),
/// Remaining Teddy patterns.
Teddy128(&'a [Vec<u8>]),
}
impl<'a> Iterator for LiteralIter<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
match *self {
LiteralIter::Empty => None,
LiteralIter::Bytes(ref mut many) => {
if many.is_empty() {
None
} else {
let next = &many[0..1];
*many = &many[1..];
Some(next)
}
}
LiteralIter::Single(ref mut one) => {
if one.is_empty() {
None
} else {
let next = &one[..];
*one = &[];
Some(next)
}
}
LiteralIter::AC(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
LiteralIter::Teddy128(ref mut lits) => {
if lits.is_empty() {
None
} else {
let next = &lits[0];
*lits = &lits[1..];
Some(&**next)
}
}
}
}
}
/// A set of bytes collected from the first (or last) byte of each literal.
#[derive(Clone, Debug)]
struct SingleByteSet {
/// Membership table with 256 entries, indexed by byte value.
sparse: Vec<bool>,
/// The distinct member bytes, in the order they were first inserted.
dense: Vec<u8>,
/// True only if every literal seen had length exactly 1.
complete: bool,
/// True only if no member byte is greater than 0x7F.
all_ascii: bool,
}
impl SingleByteSet {
    /// Creates an empty set: no members, `complete` and `all_ascii` both
    /// true until a literal or byte proves otherwise.
    fn new() -> SingleByteSet {
        SingleByteSet {
            sparse: vec![false; 256],
            dense: vec![],
            complete: true,
            all_ascii: true,
        }
    }

    /// Builds the set of first bytes of every literal in `lits`.
    ///
    /// `complete` ends up true only if every literal has length 1. An empty
    /// literal contributes no byte (and clears `complete`).
    fn prefixes(lits: &syntax::Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(0) {
                sset.insert(b);
            }
        }
        sset
    }

    /// Builds the set of last bytes of every literal in `lits`.
    ///
    /// `complete` ends up true only if every literal has length 1. An empty
    /// literal contributes no byte (and clears `complete`), mirroring
    /// `prefixes`; it must not panic on the `len - 1` computation.
    fn suffixes(lits: &syntax::Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            // `checked_sub` yields `None` for an empty literal, which is then
            // skipped instead of unwrapped.
            let last = lit.len().checked_sub(1).and_then(|i| lit.get(i));
            if let Some(&b) = last {
                sset.insert(b);
            }
        }
        sset
    }

    /// Adds `b` to the set if it is not already a member, keeping `sparse`,
    /// `dense` and `all_ascii` in sync.
    fn insert(&mut self, b: u8) {
        if self.sparse[b as usize] {
            return;
        }
        if b > 0x7F {
            self.all_ascii = false;
        }
        self.dense.push(b);
        self.sparse[b as usize] = true;
    }

    /// Returns the offset of the first byte in `text` that is a member of
    /// this set. Sets of one to three bytes go through `memchr`/`memchr2`/
    /// `memchr3`; larger sets use the table scan in `_find`.
    #[inline(always)]
    fn find(&self, text: &[u8]) -> Option<usize> {
        match self.dense.len() {
            0 => None,
            1 => memchr(self.dense[0], text),
            2 => memchr2(self.dense[0], self.dense[1], text),
            3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
            _ => self._find(text),
        }
    }

    /// Table-driven fallback: returns the offset of the first byte of
    /// `haystack` whose `sparse` entry is set.
    fn _find(&self, haystack: &[u8]) -> Option<usize> {
        haystack.iter().position(|&b| self.sparse[b as usize])
    }

    /// Returns the combined element storage of `dense` and `sparse`, in
    /// bytes.
    fn approximate_size(&self) -> usize {
        (self.dense.len() * mem::size_of::<u8>())
            + (self.sparse.len() * mem::size_of::<bool>())
    }
}
/// A searcher for a single byte-string pattern.
///
/// Caches two bytes of the pattern (`rare1`, `rare2`) chosen by their
/// `BYTE_FREQUENCIES` entries, plus the last position of each in the pattern.
#[derive(Clone, Debug)]
pub struct SingleSearch {
/// The pattern bytes.
pat: Vec<u8>,
/// Number of `char`s in the lossy UTF-8 decoding of `pat`.
char_len: usize,
/// The pattern byte with the smallest `BYTE_FREQUENCIES` entry.
rare1: u8,
/// Index of the last occurrence of `rare1` in `pat`.
rare1i: usize,
/// A second pattern byte, preferring one different from `rare1` with a
/// small `BYTE_FREQUENCIES` entry; equals `rare1` when the pattern has only
/// one distinct byte.
rare2: u8,
/// Index of the last occurrence of `rare2` in `pat`.
rare2i: usize,
}
impl SingleSearch {
    /// Builds a searcher for `pat`, choosing the two bytes used to drive
    /// `find`. An empty pattern yields `SingleSearch::empty()`.
    fn new(pat: Vec<u8>) -> SingleSearch {
        fn freq_rank(b: u8) -> usize {
            BYTE_FREQUENCIES[b as usize] as usize
        }

        if pat.is_empty() {
            return SingleSearch::empty();
        }

        let first = pat[0];
        // Lowest-ranked byte in the pattern; ties keep the earliest byte.
        let rare1 = pat[1..].iter().fold(first, |best, &b| {
            if freq_rank(b) < freq_rank(best) { b } else { best }
        });
        // Second choice: as long as the candidate still equals `rare1`, take
        // whatever byte comes next; afterwards only accept a byte that
        // differs from `rare1` and ranks strictly lower than the candidate.
        let rare2 = pat.iter().fold(first, |candidate, &b| {
            if candidate == rare1 {
                b
            } else if b != rare1 && freq_rank(b) < freq_rank(candidate) {
                b
            } else {
                candidate
            }
        });

        // Both bytes were taken from `pat`, so these lookups cannot fail.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
        let char_len = char_len_lossy(&pat);
        SingleSearch {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    /// Builds a searcher with an empty pattern and zeroed bookkeeping.
    fn empty() -> SingleSearch {
        SingleSearch {
            pat: Vec::new(),
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    /// Returns the start offset of the leftmost occurrence of the pattern in
    /// `haystack`, or `None` if it does not occur.
    ///
    /// An empty pattern never matches. The scan jumps between occurrences of
    /// `rare1` with `memchr`, checks `rare2` at its expected offset, and only
    /// then compares the whole window.
    #[inline(always)]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if pat.is_empty() || haystack.len() < pat.len() {
            return None;
        }

        // `rare1` cannot sit before offset `rare1i` in any match, so the scan
        // starts there.
        let mut pos = self.rare1i;
        while pos < haystack.len() {
            let hit = match memchr(self.rare1, &haystack[pos..]) {
                Some(offset) => pos + offset,
                None => return None,
            };
            let start = hit - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                // Later hits would place the window even further right.
                return None;
            }
            let window = &haystack[start..end];
            if window[self.rare2i] == self.rare2 && window == pat {
                return Some(start);
            }
            pos = hit + 1;
        }
        None
    }

    /// Returns true if `text` ends with the pattern.
    #[inline(always)]
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        text.ends_with(&self.pat)
    }

    /// Returns the pattern length in bytes.
    pub fn len(&self) -> usize {
        self.pat.len()
    }

    /// Returns the pattern length in `char`s, as computed at construction
    /// from a lossy UTF-8 decoding.
    pub fn char_len(&self) -> usize {
        self.char_len
    }

    /// Returns the number of bytes occupied by the pattern's elements.
    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}
/// Counts the `char`s in `bytes` after a lossy UTF-8 decoding, so invalid
/// sequences are counted as the replacement characters they decode to.
fn char_len_lossy(bytes: &[u8]) -> usize {
    let decoded = String::from_utf8_lossy(bytes);
    decoded.chars().count()
}