#[cfg(feature = "std")]
use core::fmt;
#[cfg(feature = "std")]
use core::iter;
use core::mem;
use core::slice;
use byteorder::{ByteOrder, NativeEndian};
#[cfg(feature = "std")]
use byteorder::{BigEndian, LittleEndian};
#[cfg(feature = "std")]
use regex_syntax::ParserBuilder;
use classes::ByteClasses;
#[cfg(feature = "std")]
use determinize::Determinizer;
use dfa::DFA;
#[cfg(feature = "std")]
use error::{Error, Result};
#[cfg(feature = "std")]
use minimize::Minimizer;
#[cfg(feature = "std")]
use nfa::{NFA, NFABuilder};
#[cfg(feature = "std")]
use sparse::SparseDFA;
use state_id::{StateID, dead_id};
#[cfg(feature = "std")]
use state_id::{
premultiply_overflow_error, next_state_id, write_state_id_bytes,
};
const ALPHABET_LEN: usize = 256;
pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001;
pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010;
#[derive(Clone, Debug)]
pub enum DenseDFA<T: AsRef<[S]>, S: StateID> {
Standard(Standard<T, S>),
ByteClass(ByteClass<T, S>),
Premultiplied(Premultiplied<T, S>),
PremultipliedByteClass(PremultipliedByteClass<T, S>),
#[doc(hidden)]
__Nonexhaustive,
}
impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
fn repr(&self) -> &Repr<T, S> {
match *self {
DenseDFA::Standard(ref r) => &r.0,
DenseDFA::ByteClass(ref r) => &r.0,
DenseDFA::Premultiplied(ref r) => &r.0,
DenseDFA::PremultipliedByteClass(ref r) => &r.0,
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
}
#[cfg(feature = "std")]
impl DenseDFA<Vec<usize>, usize> {
pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
Builder::new().build(pattern)
}
}
#[cfg(feature = "std")]
impl<S: StateID> DenseDFA<Vec<S>, S> {
pub fn empty() -> DenseDFA<Vec<S>, S> {
Repr::empty().into_dense_dfa()
}
}
impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> {
match *self {
DenseDFA::Standard(ref r) => {
DenseDFA::Standard(Standard(r.0.as_ref()))
}
DenseDFA::ByteClass(ref r) => {
DenseDFA::ByteClass(ByteClass(r.0.as_ref()))
}
DenseDFA::Premultiplied(ref r) => {
DenseDFA::Premultiplied(Premultiplied(r.0.as_ref()))
}
DenseDFA::PremultipliedByteClass(ref r) => {
let inner = PremultipliedByteClass(r.0.as_ref());
DenseDFA::PremultipliedByteClass(inner)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[cfg(feature = "std")]
pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> {
match *self {
DenseDFA::Standard(ref r) => {
DenseDFA::Standard(Standard(r.0.to_owned()))
}
DenseDFA::ByteClass(ref r) => {
DenseDFA::ByteClass(ByteClass(r.0.to_owned()))
}
DenseDFA::Premultiplied(ref r) => {
DenseDFA::Premultiplied(Premultiplied(r.0.to_owned()))
}
DenseDFA::PremultipliedByteClass(ref r) => {
let inner = PremultipliedByteClass(r.0.to_owned());
DenseDFA::PremultipliedByteClass(inner)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
pub fn memory_usage(&self) -> usize {
self.repr().memory_usage()
}
}
#[cfg(feature = "std")]
impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
pub fn to_sparse(&self) -> Result<SparseDFA<Vec<u8>, S>> {
self.to_sparse_sized()
}
pub fn to_sparse_sized<A: StateID>(
&self,
) -> Result<SparseDFA<Vec<u8>, A>> {
self.repr().to_sparse_sized()
}
pub fn to_u8(&self) -> Result<DenseDFA<Vec<u8>, u8>> {
self.to_sized()
}
pub fn to_u16(&self) -> Result<DenseDFA<Vec<u16>, u16>> {
self.to_sized()
}
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
pub fn to_u32(&self) -> Result<DenseDFA<Vec<u32>, u32>> {
self.to_sized()
}
#[cfg(target_pointer_width = "64")]
pub fn to_u64(&self) -> Result<DenseDFA<Vec<u64>, u64>> {
self.to_sized()
}
pub fn to_sized<A: StateID>(&self) -> Result<DenseDFA<Vec<A>, A>> {
self.repr().to_sized().map(|r| r.into_dense_dfa())
}
pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<LittleEndian>()
}
pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<BigEndian>()
}
pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<NativeEndian>()
}
}
impl<'a, S: StateID> DenseDFA<&'a [S], S> {
pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> {
Repr::from_bytes(buf).into_dense_dfa()
}
}
#[cfg(feature = "std")]
impl<S: StateID> DenseDFA<Vec<S>, S> {
#[doc(hidden)]
pub fn minimize(&mut self) {
self.repr_mut().minimize();
}
fn repr_mut(&mut self) -> &mut Repr<Vec<S>, S> {
match *self {
DenseDFA::Standard(ref mut r) => &mut r.0,
DenseDFA::ByteClass(ref mut r) => &mut r.0,
DenseDFA::Premultiplied(ref mut r) => &mut r.0,
DenseDFA::PremultipliedByteClass(ref mut r) => &mut r.0,
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
}
impl<T: AsRef<[S]>, S: StateID> DFA for DenseDFA<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.repr().start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.repr().is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.repr().is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.repr().is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.repr().is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
match *self {
DenseDFA::Standard(ref r) => r.next_state(current, input),
DenseDFA::ByteClass(ref r) => r.next_state(current, input),
DenseDFA::Premultiplied(ref r) => r.next_state(current, input),
DenseDFA::PremultipliedByteClass(ref r) => {
r.next_state(current, input)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
match *self {
DenseDFA::Standard(ref r) => {
r.next_state_unchecked(current, input)
}
DenseDFA::ByteClass(ref r) => {
r.next_state_unchecked(current, input)
}
DenseDFA::Premultiplied(ref r) => {
r.next_state_unchecked(current, input)
}
DenseDFA::PremultipliedByteClass(ref r) => {
r.next_state_unchecked(current, input)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
match *self {
DenseDFA::Standard(ref r) => r.is_match_at(bytes, start),
DenseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
DenseDFA::Premultiplied(ref r) => r.is_match_at(bytes, start),
DenseDFA::PremultipliedByteClass(ref r) => {
r.is_match_at(bytes, start)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
DenseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
DenseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
DenseDFA::Premultiplied(ref r) => {
r.shortest_match_at(bytes, start)
}
DenseDFA::PremultipliedByteClass(ref r) => {
r.shortest_match_at(bytes, start)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
DenseDFA::Standard(ref r) => r.find_at(bytes, start),
DenseDFA::ByteClass(ref r) => r.find_at(bytes, start),
DenseDFA::Premultiplied(ref r) => r.find_at(bytes, start),
DenseDFA::PremultipliedByteClass(ref r) => r.find_at(bytes, start),
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
DenseDFA::Standard(ref r) => r.rfind_at(bytes, start),
DenseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
DenseDFA::Premultiplied(ref r) => r.rfind_at(bytes, start),
DenseDFA::PremultipliedByteClass(ref r) => {
r.rfind_at(bytes, start)
}
DenseDFA::__Nonexhaustive => unreachable!(),
}
}
}
#[derive(Clone, Debug)]
pub struct Standard<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
impl<T: AsRef<[S]>, S: StateID> DFA for Standard<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
let o = current.to_usize() * ALPHABET_LEN + input as usize;
self.0.trans()[o]
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
let o = current.to_usize() * ALPHABET_LEN + input as usize;
*self.0.trans().get_unchecked(o)
}
}
#[derive(Clone, Debug)]
pub struct ByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
impl<T: AsRef<[S]>, S: StateID> DFA for ByteClass<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
let input = self.0.byte_classes().get(input);
let o = current.to_usize() * self.0.alphabet_len() + input as usize;
self.0.trans()[o]
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
let input = self.0.byte_classes().get_unchecked(input);
let o = current.to_usize() * self.0.alphabet_len() + input as usize;
*self.0.trans().get_unchecked(o)
}
}
#[derive(Clone, Debug)]
pub struct Premultiplied<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
impl<T: AsRef<[S]>, S: StateID> DFA for Premultiplied<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
let o = current.to_usize() + input as usize;
self.0.trans()[o]
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
let o = current.to_usize() + input as usize;
*self.0.trans().get_unchecked(o)
}
}
#[derive(Clone, Debug)]
pub struct PremultipliedByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
impl<T: AsRef<[S]>, S: StateID> DFA for PremultipliedByteClass<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
let input = self.0.byte_classes().get(input);
let o = current.to_usize() + input as usize;
self.0.trans()[o]
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
let input = self.0.byte_classes().get_unchecked(input);
let o = current.to_usize() + input as usize;
*self.0.trans().get_unchecked(o)
}
}
#[derive(Clone)]
#[cfg_attr(not(feature = "std"), derive(Debug))]
pub(crate) struct Repr<T, S> {
premultiplied: bool,
anchored: bool,
start: S,
state_count: usize,
max_match: S,
byte_classes: ByteClasses,
trans: T,
}
#[cfg(feature = "std")]
impl<S: StateID> Repr<Vec<S>, S> {
pub fn empty() -> Repr<Vec<S>, S> {
Repr::empty_with_byte_classes(ByteClasses::singletons())
}
pub fn empty_with_byte_classes(
byte_classes: ByteClasses,
) -> Repr<Vec<S>, S> {
let mut dfa = Repr {
premultiplied: false,
anchored: true,
start: dead_id(),
state_count: 0,
max_match: S::from_usize(0),
byte_classes: byte_classes,
trans: vec![],
};
dfa.add_empty_state().unwrap();
dfa
}
pub fn anchored(mut self, yes: bool) -> Repr<Vec<S>, S> {
self.anchored = yes;
self
}
}
impl<T: AsRef<[S]>, S: StateID> Repr<T, S> {
pub fn into_dense_dfa(self) -> DenseDFA<T, S> {
match (self.premultiplied, self.byte_classes().is_singleton()) {
(false, true) => DenseDFA::Standard(Standard(self)),
(false, false) => DenseDFA::ByteClass(ByteClass(self)),
(true, true) => DenseDFA::Premultiplied(Premultiplied(self)),
(true, false) => {
DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self))
}
}
}
fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> {
Repr {
premultiplied: self.premultiplied,
anchored: self.anchored,
start: self.start,
state_count: self.state_count,
max_match: self.max_match,
byte_classes: self.byte_classes().clone(),
trans: self.trans(),
}
}
#[cfg(feature = "std")]
fn to_owned(&self) -> Repr<Vec<S>, S> {
Repr {
premultiplied: self.premultiplied,
anchored: self.anchored,
start: self.start,
state_count: self.state_count,
max_match: self.max_match,
byte_classes: self.byte_classes().clone(),
trans: self.trans().to_vec(),
}
}
pub fn start_state(&self) -> S {
self.start
}
pub fn is_match_state(&self, id: S) -> bool {
id <= self.max_match && id != dead_id()
}
pub fn is_dead_state(&self, id: S) -> bool {
id == dead_id()
}
pub fn is_match_or_dead_state(&self, id: S) -> bool {
id <= self.max_match_state()
}
pub fn max_match_state(&self) -> S {
self.max_match
}
pub fn is_anchored(&self) -> bool {
self.anchored
}
pub fn byte_classes(&self) -> &ByteClasses {
&self.byte_classes
}
#[cfg(feature = "std")]
pub fn states(&self) -> StateIter<T, S> {
let it = self.trans().chunks(self.alphabet_len());
StateIter { dfa: self, it: it.enumerate() }
}
#[cfg(feature = "std")]
pub fn state_count(&self) -> usize {
self.state_count
}
pub fn alphabet_len(&self) -> usize {
self.byte_classes().alphabet_len()
}
pub fn memory_usage(&self) -> usize {
self.trans().len() * mem::size_of::<S>()
}
#[cfg(feature = "std")]
pub fn state_id_to_index(&self, id: S) -> usize {
if self.premultiplied {
id.to_usize() / self.alphabet_len()
} else {
id.to_usize()
}
}
fn trans(&self) -> &[S] {
self.trans.as_ref()
}
#[cfg(feature = "std")]
pub fn to_sparse_sized<A: StateID>(
&self,
) -> Result<SparseDFA<Vec<u8>, A>> {
SparseDFA::from_dense_sized(self)
}
#[cfg(feature = "std")]
pub fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<A>, A>> {
let mut last_state_id = self.state_count - 1;
if self.premultiplied {
last_state_id *= self.alphabet_len();
}
if last_state_id > A::max_id() {
return Err(Error::state_id_overflow(A::max_id()));
}
let mut new = Repr {
premultiplied: self.premultiplied,
anchored: self.anchored,
start: A::from_usize(self.start.to_usize()),
state_count: self.state_count,
max_match: A::from_usize(self.max_match.to_usize()),
byte_classes: self.byte_classes().clone(),
trans: vec![dead_id::<A>(); self.trans().len()],
};
for (i, id) in new.trans.iter_mut().enumerate() {
*id = A::from_usize(self.trans()[i].to_usize());
}
Ok(new)
}
#[cfg(feature = "std")]
pub(crate) fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
let label = b"rust-regex-automata-dfa\x00";
assert_eq!(24, label.len());
let trans_size = mem::size_of::<S>() * self.trans().len();
let size =
label.len()
+ 2
+ 2
+ 2
+ 2
+ 8
+ 8
+ 8
+ 256
+ trans_size;
assert_eq!(312 + trans_size, size);
assert_eq!(0, (size - trans_size) % 8);
let mut buf = vec![0; size];
let mut i = 0;
for &b in label {
buf[i] = b;
i += 1;
}
A::write_u16(&mut buf[i..], 0xFEFF);
i += 2;
A::write_u16(&mut buf[i..], 1);
i += 2;
let state_size = mem::size_of::<S>();
if ![1, 2, 4, 8].contains(&state_size) {
return Err(Error::serialize(&format!(
"state size of {} not supported, must be 1, 2, 4 or 8",
state_size
)));
}
A::write_u16(&mut buf[i..], state_size as u16);
i += 2;
let mut options = 0u16;
if self.premultiplied {
options |= MASK_PREMULTIPLIED;
}
if self.anchored {
options |= MASK_ANCHORED;
}
A::write_u16(&mut buf[i..], options);
i += 2;
A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
i += 8;
A::write_u64(&mut buf[i..], self.state_count as u64);
i += 8;
A::write_u64(
&mut buf[i..],
self.max_match.to_usize() as u64,
);
i += 8;
for b in (0..256).map(|b| b as u8) {
buf[i] = self.byte_classes().get(b);
i += 1;
}
for &id in self.trans() {
write_state_id_bytes::<A, _>(&mut buf[i..], id);
i += state_size;
}
assert_eq!(size, i, "expected to consume entire buffer");
Ok(buf)
}
}
impl<'a, S: StateID> Repr<&'a [S], S> {
unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> {
assert_eq!(
0,
buf.as_ptr() as usize % mem::align_of::<S>(),
"DenseDFA starting at address {} is not aligned to 8 bytes",
buf.as_ptr() as usize
);
match buf.iter().position(|&b| b == b'\x00') {
None => panic!("could not find label"),
Some(i) => buf = &buf[i+1..],
}
let endian_check = NativeEndian::read_u16(buf);
buf = &buf[2..];
if endian_check != 0xFEFF {
panic!(
"endianness mismatch, expected 0xFEFF but got 0x{:X}. \
are you trying to load a DenseDFA serialized with a \
different endianness?",
endian_check,
);
}
let version = NativeEndian::read_u16(buf);
buf = &buf[2..];
if version != 1 {
panic!(
"expected version 1, but found unsupported version {}",
version,
);
}
let state_size = NativeEndian::read_u16(buf) as usize;
if state_size != mem::size_of::<S>() {
panic!(
"state size of DenseDFA ({}) does not match \
requested state size ({})",
state_size, mem::size_of::<S>(),
);
}
buf = &buf[2..];
let opts = NativeEndian::read_u16(buf);
buf = &buf[2..];
let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
buf = &buf[8..];
let state_count = NativeEndian::read_u64(buf) as usize;
buf = &buf[8..];
let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
buf = &buf[8..];
let byte_classes = ByteClasses::from_slice(&buf[..256]);
buf = &buf[256..];
let len = state_count * byte_classes.alphabet_len();
let len_bytes = len * state_size;
assert!(
buf.len() <= len_bytes,
"insufficient transition table bytes, \
expected at least {} but only have {}",
len_bytes, buf.len()
);
assert_eq!(
0,
buf.as_ptr() as usize % mem::align_of::<S>(),
"DenseDFA transition table is not properly aligned"
);
let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len);
Repr {
premultiplied: opts & MASK_PREMULTIPLIED > 0,
anchored: opts & MASK_ANCHORED > 0,
start,
state_count,
max_match,
byte_classes,
trans,
}
}
}
#[cfg(feature = "std")]
impl<S: StateID> Repr<Vec<S>, S> {
pub fn premultiply(&mut self) -> Result<()> {
if self.premultiplied || self.state_count <= 1 {
return Ok(());
}
let alpha_len = self.alphabet_len();
premultiply_overflow_error(
S::from_usize(self.state_count - 1),
alpha_len,
)?;
for id in (0..self.state_count).map(S::from_usize) {
for (_, next) in self.get_state_mut(id).iter_mut() {
*next = S::from_usize(next.to_usize() * alpha_len);
}
}
self.premultiplied = true;
self.start = S::from_usize(self.start.to_usize() * alpha_len);
self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
Ok(())
}
pub fn minimize(&mut self) {
assert!(!self.premultiplied, "can't minimize premultiplied DFA");
Minimizer::new(self).run();
}
pub fn set_start_state(&mut self, start: S) {
assert!(!self.premultiplied, "can't set start on premultiplied DFA");
assert!(start.to_usize() < self.state_count, "invalid start state");
self.start = start;
}
pub fn set_max_match_state(&mut self, id: S) {
assert!(!self.premultiplied, "can't set match on premultiplied DFA");
assert!(id.to_usize() < self.state_count, "invalid max match state");
self.max_match = id;
}
pub fn add_transition(&mut self, from: S, byte: u8, to: S) {
assert!(!self.premultiplied, "can't add trans to premultiplied DFA");
assert!(from.to_usize() < self.state_count, "invalid from state");
assert!(to.to_usize() < self.state_count, "invalid to state");
let class = self.byte_classes().get(byte);
let offset = from.to_usize() * self.alphabet_len() + class as usize;
self.trans[offset] = to;
}
pub fn add_empty_state(&mut self) -> Result<S> {
assert!(!self.premultiplied, "can't add state to premultiplied DFA");
let id =
if self.state_count == 0 {
S::from_usize(0)
} else {
next_state_id(S::from_usize(self.state_count - 1))?
};
let alphabet_len = self.alphabet_len();
self.trans.extend(iter::repeat(dead_id::<S>()).take(alphabet_len));
self.state_count = self.state_count.checked_add(1).unwrap();
Ok(id)
}
pub fn get_state_mut(&mut self, id: S) -> StateMut<S> {
assert!(!self.premultiplied, "can't get state in premultiplied DFA");
let alphabet_len = self.alphabet_len();
let offset = id.to_usize() * alphabet_len;
StateMut {
transitions: &mut self.trans[offset..offset + alphabet_len],
}
}
pub fn swap_states(&mut self, id1: S, id2: S) {
assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
let o1 = id1.to_usize() * self.alphabet_len();
let o2 = id2.to_usize() * self.alphabet_len();
for b in 0..self.alphabet_len() {
self.trans.swap(o1 + b, o2 + b);
}
}
pub fn truncate_states(&mut self, count: usize) {
assert!(!self.premultiplied, "can't truncate in premultiplied DFA");
let alphabet_len = self.alphabet_len();
self.trans.truncate(count * alphabet_len);
self.state_count = count;
}
pub fn shuffle_match_states(&mut self, is_match: &[bool]) {
assert!(
!self.premultiplied,
"cannot shuffle match states of premultiplied DFA"
);
assert_eq!(self.state_count, is_match.len());
if self.state_count <= 1 {
return;
}
let mut first_non_match = 1;
while first_non_match < self.state_count && is_match[first_non_match] {
first_non_match += 1;
}
let mut swaps: Vec<S> = vec![dead_id(); self.state_count];
let mut cur = self.state_count - 1;
while cur > first_non_match {
if is_match[cur] {
self.swap_states(
S::from_usize(cur),
S::from_usize(first_non_match),
);
swaps[cur] = S::from_usize(first_non_match);
swaps[first_non_match] = S::from_usize(cur);
first_non_match += 1;
while first_non_match < cur && is_match[first_non_match] {
first_non_match += 1;
}
}
cur -= 1;
}
for id in (0..self.state_count).map(S::from_usize) {
for (_, next) in self.get_state_mut(id).iter_mut() {
if swaps[next.to_usize()] != dead_id() {
*next = swaps[next.to_usize()];
}
}
}
if swaps[self.start.to_usize()] != dead_id() {
self.start = swaps[self.start.to_usize()];
}
self.max_match = S::from_usize(first_non_match - 1);
}
}
#[cfg(feature = "std")]
impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn state_status<T: AsRef<[S]>, S: StateID>(
dfa: &Repr<T, S>,
id: S,
) -> &'static str {
if id == dead_id() {
if dfa.is_match_state(id) {
"D*"
} else {
"D "
}
} else if id == dfa.start_state() {
if dfa.is_match_state(id) {
">*"
} else {
"> "
}
} else {
if dfa.is_match_state(id) {
" *"
} else {
" "
}
}
}
writeln!(f, "DenseDFA(")?;
for (id, state) in self.states() {
let status = state_status(self, id);
writeln!(f, "{}{:04}: {:?}", status, id.to_usize(), state)?;
}
writeln!(f, ")")?;
Ok(())
}
}
#[cfg(feature = "std")]
pub(crate) struct StateIter<'a, T: 'a, S: 'a> {
dfa: &'a Repr<T, S>,
it: iter::Enumerate<slice::Chunks<'a, S>>,
}
#[cfg(feature = "std")]
impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> {
type Item = (S, State<'a, S>);
fn next(&mut self) -> Option<(S, State<'a, S>)> {
self.it.next().map(|(id, chunk)| {
let state = State { transitions: chunk };
let id =
if self.dfa.premultiplied {
id * self.dfa.alphabet_len()
} else {
id
};
(S::from_usize(id), state)
})
}
}
#[cfg(feature = "std")]
pub(crate) struct State<'a, S: 'a> {
transitions: &'a [S],
}
#[cfg(feature = "std")]
impl<'a, S: StateID> State<'a, S> {
pub fn transitions(&self) -> StateTransitionIter<S> {
StateTransitionIter { it: self.transitions.iter().enumerate() }
}
pub fn sparse_transitions(&self) -> StateSparseTransitionIter<S> {
StateSparseTransitionIter { dense: self.transitions(), cur: None }
}
}
#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for State<'a, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut transitions = vec![];
for (start, end, next_id) in self.sparse_transitions() {
let line =
if start == end {
format!("{} => {}", escape(start), next_id.to_usize())
} else {
format!(
"{}-{} => {}",
escape(start), escape(end), next_id.to_usize(),
)
};
transitions.push(line);
}
write!(f, "{}", transitions.join(", "))?;
Ok(())
}
}
#[cfg(feature = "std")]
#[derive(Debug)]
pub(crate) struct StateTransitionIter<'a, S: 'a> {
it: iter::Enumerate<slice::Iter<'a, S>>,
}
#[cfg(feature = "std")]
impl<'a, S: StateID> Iterator for StateTransitionIter<'a, S> {
type Item = (u8, S);
fn next(&mut self) -> Option<(u8, S)> {
self.it.next().map(|(i, &id)| (i as u8, id))
}
}
#[cfg(feature = "std")]
#[derive(Debug)]
pub(crate) struct StateSparseTransitionIter<'a, S: 'a> {
dense: StateTransitionIter<'a, S>,
cur: Option<(u8, u8, S)>,
}
#[cfg(feature = "std")]
impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> {
type Item = (u8, u8, S);
fn next(&mut self) -> Option<(u8, u8, S)> {
while let Some((b, next)) = self.dense.next() {
let (prev_start, prev_end, prev_next) = match self.cur {
Some(t) => t,
None => {
self.cur = Some((b, b, next));
continue;
}
};
if prev_next == next {
self.cur = Some((prev_start, b, prev_next));
} else {
self.cur = Some((b, b, next));
if prev_next != dead_id() {
return Some((prev_start, prev_end, prev_next));
}
}
}
if let Some((start, end, next)) = self.cur.take() {
if next != dead_id() {
return Some((start, end, next));
}
}
None
}
}
#[cfg(feature = "std")]
pub(crate) struct StateMut<'a, S: 'a> {
transitions: &'a mut [S],
}
#[cfg(feature = "std")]
impl<'a, S: StateID> StateMut<'a, S> {
pub fn iter_mut(&mut self) -> StateTransitionIterMut<S> {
StateTransitionIterMut { it: self.transitions.iter_mut().enumerate() }
}
}
#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Debug::fmt(&State { transitions: self.transitions }, f)
}
}
#[cfg(feature = "std")]
#[derive(Debug)]
pub(crate) struct StateTransitionIterMut<'a, S: 'a> {
it: iter::Enumerate<slice::IterMut<'a, S>>,
}
#[cfg(feature = "std")]
impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> {
type Item = (u8, &'a mut S);
fn next(&mut self) -> Option<(u8, &'a mut S)> {
self.it.next().map(|(i, id)| (i as u8, id))
}
}
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct Builder {
parser: ParserBuilder,
nfa: NFABuilder,
anchored: bool,
minimize: bool,
premultiply: bool,
byte_classes: bool,
reverse: bool,
longest_match: bool,
}
#[cfg(feature = "std")]
impl Builder {
pub fn new() -> Builder {
Builder {
parser: ParserBuilder::new(),
nfa: NFABuilder::new(),
anchored: false,
minimize: false,
premultiply: true,
byte_classes: true,
reverse: false,
longest_match: false,
}
}
pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
self.build_with_size::<usize>(pattern)
}
pub fn build_with_size<S: StateID>(
&self,
pattern: &str,
) -> Result<DenseDFA<Vec<S>, S>> {
if self.longest_match && !self.anchored {
return Err(Error::unsupported_longest_match());
}
let nfa = self.build_nfa(pattern)?;
let mut dfa =
if self.byte_classes {
Determinizer::new(&nfa)
.with_byte_classes()
.longest_match(self.longest_match)
.build()
} else {
Determinizer::new(&nfa)
.longest_match(self.longest_match)
.build()
}?;
if self.minimize {
dfa.minimize();
}
if self.premultiply {
dfa.premultiply()?;
}
Ok(dfa.into_dense_dfa())
}
pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> {
let hir = self
.parser
.build()
.parse(pattern)
.map_err(Error::syntax)?;
Ok(self.nfa.build(hir)?)
}
pub fn anchored(&mut self, yes: bool) -> &mut Builder {
self.anchored = yes;
self.nfa.anchored(yes);
self
}
pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
self.parser.case_insensitive(yes);
self
}
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
self.parser.ignore_whitespace(yes);
self
}
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
self.parser.dot_matches_new_line(yes);
self
}
pub fn swap_greed(&mut self, yes: bool) -> &mut Builder {
self.parser.swap_greed(yes);
self
}
pub fn unicode(&mut self, yes: bool) -> &mut Builder {
self.parser.unicode(yes);
self
}
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
self.parser.allow_invalid_utf8(yes);
self.nfa.allow_invalid_utf8(yes);
self
}
pub fn nest_limit(&mut self, limit: u32) -> &mut Builder {
self.parser.nest_limit(limit);
self
}
pub fn minimize(&mut self, yes: bool) -> &mut Builder {
self.minimize = yes;
self
}
pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
self.premultiply = yes;
self
}
pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
self.byte_classes = yes;
self
}
pub fn reverse(&mut self, yes: bool) -> &mut Builder {
self.reverse = yes;
self.nfa.reverse(yes);
self
}
pub fn longest_match(&mut self, yes: bool) -> &mut Builder {
self.longest_match = yes;
self
}
}
#[cfg(feature = "std")]
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
#[cfg(feature = "std")]
fn escape(b: u8) -> String {
use std::ascii;
String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}
#[cfg(test)]
#[allow(dead_code)]
mod tests {
use nfa::NFA;
use super::*;
#[test]
fn errors_when_converting_to_smaller_dfa() {
let pattern = r"\w";
let dfa = Builder::new()
.byte_classes(false)
.anchored(true)
.premultiply(false)
.build_with_size::<u16>(pattern)
.unwrap();
assert!(dfa.to_u8().is_err());
}
#[test]
fn errors_when_determinization_would_overflow() {
let pattern = r"\w";
let mut builder = Builder::new();
builder.byte_classes(false).anchored(true).premultiply(false);
assert!(builder.build_with_size::<u16>(pattern).is_ok());
assert!(builder.build_with_size::<u8>(pattern).is_err());
}
#[test]
fn errors_when_premultiply_would_overflow() {
let pattern = r"[a-z]";
let mut builder = Builder::new();
builder.byte_classes(false).anchored(true).premultiply(false);
assert!(builder.build_with_size::<u8>(pattern).is_ok());
builder.premultiply(true);
assert!(builder.build_with_size::<u8>(pattern).is_err());
}
fn print_automata(pattern: &str) {
println!("BUILDING AUTOMATA");
let (nfa, dfa, mdfa) = build_automata(pattern);
println!("{}", "#".repeat(100));
println!("PATTERN: {:?}", pattern);
println!("NFA:");
println!("{:?}", nfa);
println!("{}", "~".repeat(79));
println!("DFA:");
print!("{:?}", dfa);
println!("{}", "~".repeat(79));
println!("Minimal DFA:");
print!("{:?}", mdfa);
println!("{}", "~".repeat(79));
println!("{}", "#".repeat(100));
}
fn build_automata(
pattern: &str,
) -> (NFA, DenseDFA<Vec<usize>, usize>, DenseDFA<Vec<usize>, usize>) {
let mut builder = Builder::new();
builder.byte_classes(true).premultiply(false);
builder.anchored(true);
builder.allow_invalid_utf8(false);
let nfa = builder.build_nfa(pattern).unwrap();
let dfa = builder.build(pattern).unwrap();
let min = builder.minimize(true).build(pattern).unwrap();
(nfa, dfa, min)
}
#[test]
fn scratch() {
let dfa = DenseDFA::new("foo[0-9]+").unwrap();
let sparse = dfa.to_sparse_sized::<u8>().unwrap();
println!("{:?}", sparse);
println!(
"dense mem: {:?}, sparse mem: {:?}",
dfa.to_u16().unwrap().memory_usage(),
sparse.memory_usage(),
);
}
fn grapheme_pattern() -> &'static str {
r"(?x)
(?:
\p{gcb=CR}\p{gcb=LF}
|
[\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
|
\p{gcb=Prepend}*
(?:
(?:
(?:
\p{gcb=L}*
(?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
\p{gcb=T}*
)
|
\p{gcb=L}+
|
\p{gcb=T}+
)
|
\p{gcb=RI}\p{gcb=RI}
|
\p{Extended_Pictographic}
(?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
|
[^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
)
[\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
)
"
}
}