mod mask;
mod quality;
use std::collections::Bound;
use std::iter;
use std::iter::zip;
use std::ops::{RangeBounds, RangeInclusive};
use std::slice::SliceIndex;
use itertools::{Itertools, MultiProduct};
use regex_syntax::hir::literal::Literal;
use serde::{Deserialize, Serialize};
use smallvec::{SmallVec, ToSmallVec, smallvec};
pub(crate) use crate::compiler::atoms::mask::ByteMaskCombinator;
pub(crate) use crate::compiler::atoms::quality::AtomsQuality;
pub(crate) use crate::compiler::atoms::quality::best_atom_in_bytes;
pub(crate) use crate::compiler::atoms::quality::best_range_in_bytes;
pub(crate) use crate::compiler::atoms::quality::best_range_in_masked_bytes;
use crate::compiler::SubPatternFlags;
pub(crate) const DESIRED_ATOM_SIZE: usize = 4;
pub(crate) const MAX_ATOMS_PER_REGEXP: usize = 10000;
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub(crate) struct Atom {
bytes: SmallVec<[u8; DESIRED_ATOM_SIZE]>,
exact: bool,
backtrack: u16,
}
impl AsRef<[u8]> for Atom {
fn as_ref(&self) -> &[u8] {
self.bytes.as_ref()
}
}
impl From<&[u8]> for Atom {
#[inline]
fn from(value: &[u8]) -> Self {
Self {
bytes: value.to_smallvec(),
backtrack: 0,
exact: !value.is_empty(),
}
}
}
impl From<Vec<u8>> for Atom {
#[inline]
fn from(value: Vec<u8>) -> Self {
Self {
bytes: value.to_smallvec(),
backtrack: 0,
exact: !value.is_empty(),
}
}
}
impl From<SmallVec<[u8; DESIRED_ATOM_SIZE]>> for Atom {
#[inline]
fn from(value: SmallVec<[u8; DESIRED_ATOM_SIZE]>) -> Self {
let exact = !value.is_empty();
Self { bytes: value, backtrack: 0, exact }
}
}
impl From<&Literal> for Atom {
#[inline]
fn from(value: &Literal) -> Self {
Self {
bytes: value.as_bytes().to_smallvec(),
backtrack: 0,
exact: value.is_exact(),
}
}
}
impl Atom {
pub fn from_slice_range<R>(s: &[u8], range: R) -> Self
where
R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
{
let backtrack = match range.start_bound() {
Bound::Included(b) => *b as u16,
Bound::Excluded(b) => (*b + 1) as u16,
Bound::Unbounded => 0,
};
let atom: &[u8] = &s[range];
Self {
bytes: atom.to_smallvec(),
backtrack,
exact: atom.len() == s.len(),
}
}
#[inline]
pub fn len(&self) -> usize {
self.bytes.len()
}
#[inline]
pub fn backtrack(&self) -> u16 {
self.backtrack
}
#[inline]
pub fn set_backtrack(&mut self, b: u16) {
self.backtrack = b;
}
#[inline]
pub fn is_exact(&self) -> bool {
self.exact
}
#[inline]
pub fn set_exact(&mut self, exact: bool) {
self.exact = exact;
}
#[inline]
pub fn make_inexact(&mut self) -> &mut Self {
self.exact = false;
self
}
pub fn make_wide(mut self) -> Self {
let atom_len = self.bytes.len();
self.backtrack *= 2;
self.bytes = self
.bytes
.into_iter()
.interleave(itertools::repeat_n(0x00, atom_len))
.collect();
self
}
#[allow(dead_code)]
pub fn exact<T: AsRef<[u8]>>(v: T) -> Self {
let mut atom = Self::from(v.as_ref().to_vec());
atom.exact = true;
atom
}
#[allow(dead_code)]
pub fn inexact<T: AsRef<[u8]>>(v: T) -> Self {
let mut atom = Self::from(v.as_ref().to_vec());
atom.exact = false;
atom
}
pub fn xor_combinations(
self,
range: RangeInclusive<u8>,
) -> XorCombinations {
XorCombinations::new(self, range)
}
pub fn case_combinations(self) -> CaseCombinations {
CaseCombinations::new(self)
}
#[allow(dead_code)]
pub fn mask_combinations(self, mask: &[u8]) -> MaskCombinations {
MaskCombinations::new(self, mask)
}
}
pub(crate) fn extract_atoms(
literal_bytes: &[u8],
flags: SubPatternFlags,
) -> Box<dyn Iterator<Item = Atom>> {
let mut best_atom = best_atom_in_bytes(literal_bytes);
if flags.intersects(
SubPatternFlags::FullwordLeft | SubPatternFlags::FullwordRight,
) {
best_atom.make_inexact();
}
if flags.contains(SubPatternFlags::Nocase) {
Box::new(CaseCombinations::new(best_atom))
} else {
Box::new(iter::once(best_atom))
}
}
pub(super) fn make_wide(s: &[u8]) -> Vec<u8> {
itertools::interleave(
s.iter().cloned(),
itertools::repeat_n(0_u8, s.len()),
)
.collect()
}
pub(crate) struct MaskCombinations {
cartesian_product: MultiProduct<ByteMaskCombinator>,
backtrack: u16,
exact: bool,
}
impl MaskCombinations {
fn new(atom: Atom, mask: &[u8]) -> Self {
Self {
exact: atom.exact,
backtrack: atom.backtrack,
cartesian_product: zip(atom.bytes, mask)
.map(|(byte, mask)| ByteMaskCombinator::new(byte, *mask))
.multi_cartesian_product(),
}
}
}
impl Iterator for MaskCombinations {
type Item = Atom;
fn next(&mut self) -> Option<Self::Item> {
let mut atom = Atom::from(self.cartesian_product.next()?);
atom.backtrack = self.backtrack;
atom.exact = self.exact;
Some(atom)
}
}
pub(crate) struct CaseCombinations {
cartesian_product:
MultiProduct<smallvec::IntoIter<[u8; DESIRED_ATOM_SIZE]>>,
backtrack: u16,
exact: bool,
}
impl CaseCombinations {
fn new(atom: Atom) -> Self {
Self {
exact: atom.exact,
backtrack: atom.backtrack,
cartesian_product: atom
.bytes
.to_ascii_lowercase()
.into_iter()
.map(|byte| {
if byte.is_ascii_alphabetic() {
smallvec![byte, byte.to_ascii_uppercase()]
} else {
smallvec![byte]
}
})
.multi_cartesian_product(),
}
}
}
impl Iterator for CaseCombinations {
type Item = Atom;
fn next(&mut self) -> Option<Self::Item> {
let mut atom = Atom::from(self.cartesian_product.next()?);
atom.backtrack = self.backtrack;
atom.exact = self.exact;
Some(atom)
}
}
pub(crate) struct XorCombinations {
atom: Atom,
range: RangeInclusive<u8>,
}
impl XorCombinations {
fn new(atom: Atom, range: RangeInclusive<u8>) -> Self {
Self { atom, range }
}
}
impl Iterator for XorCombinations {
type Item = Atom;
fn next(&mut self) -> Option<Self::Item> {
let i = self.range.next()?;
let mut atom = Atom::from(
self.atom.bytes.iter().map(|b| b ^ i).collect::<Vec<u8>>(),
);
atom.backtrack = self.atom.backtrack;
atom.exact = false;
Some(atom)
}
}
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use crate::compiler::atoms::Atom;
#[test]
fn mask_combinations() {
let atom = Atom::exact([0x11, 0x22, 0x33, 0x44]);
let mut c = atom.mask_combinations(&[0xff, 0xf0, 0xff, 0xff]);
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x20, 0x33, 0x44])));
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x21, 0x33, 0x44])));
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x22, 0x33, 0x44])));
let mut c = c.skip(10);
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x2d, 0x33, 0x44])));
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x2e, 0x33, 0x44])));
assert_eq!(c.next(), Some(Atom::exact([0x11, 0x2f, 0x33, 0x44])));
assert_eq!(c.next(), None);
}
#[test]
fn case_combinations() {
let atom = Atom::exact(b"a1B2c");
let mut c = atom.case_combinations();
assert_eq!(c.next(), Some(Atom::exact(b"a1b2c")));
assert_eq!(c.next(), Some(Atom::exact(b"a1b2C")));
assert_eq!(c.next(), Some(Atom::exact(b"a1B2c")));
assert_eq!(c.next(), Some(Atom::exact(b"a1B2C")));
assert_eq!(c.next(), Some(Atom::exact(b"A1b2c")));
assert_eq!(c.next(), Some(Atom::exact(b"A1b2C")));
assert_eq!(c.next(), Some(Atom::exact(b"A1B2c")));
assert_eq!(c.next(), Some(Atom::exact(b"A1B2C")));
assert_eq!(c.next(), None);
let mut atom = Atom::exact([0x00_u8, 0x01, 0x02]);
atom.set_backtrack(2);
let mut c = atom.clone().case_combinations();
assert_eq!(c.next(), Some(atom));
assert_eq!(c.next(), None);
}
#[test]
fn xor_combinations() {
let atom = Atom::exact([0x00_u8, 0x01, 0x02]);
let mut c = atom.xor_combinations(0..=1);
assert_eq!(c.next(), Some(Atom::inexact([0x00, 0x01, 0x02])));
assert_eq!(c.next(), Some(Atom::inexact([0x01, 0x00, 0x03])));
assert_eq!(c.next(), None);
}
#[test]
fn make_wide() {
let mut atom = Atom::exact([0x01_u8, 0x02, 0x03]);
atom.set_backtrack(2);
let atom = atom.make_wide();
assert_eq!(
atom.bytes.as_slice(),
&[0x01, 0x00, 0x02, 0x00, 0x03, 0x00]
);
assert_eq!(atom.backtrack, 4);
}
}