#[cfg(feature = "std")]
use std::collections::HashMap;
#[cfg(feature = "std")]
use core::fmt;
#[cfg(feature = "std")]
use core::iter;
use core::marker::PhantomData;
use core::mem::size_of;
use byteorder::{ByteOrder, NativeEndian};
#[cfg(feature = "std")]
use byteorder::{BigEndian, LittleEndian};
use classes::ByteClasses;
use dense;
use dfa::DFA;
#[cfg(feature = "std")]
use error::{Error, Result};
#[cfg(feature = "std")]
use state_id::{StateID, dead_id, usize_to_state_id, write_state_id_bytes};
#[cfg(not(feature = "std"))]
use state_id::{StateID, dead_id};
#[derive(Clone, Debug)]
pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
Standard(Standard<T, S>),
ByteClass(ByteClass<T, S>),
#[doc(hidden)]
__Nonexhaustive,
}
#[cfg(feature = "std")]
impl SparseDFA<Vec<u8>, usize> {
pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
dense::Builder::new()
.build(pattern)
.and_then(|dense| dense.to_sparse())
}
}
#[cfg(feature = "std")]
impl<S: StateID> SparseDFA<Vec<u8>, S> {
pub fn empty() -> SparseDFA<Vec<u8>, S> {
dense::DenseDFA::empty().to_sparse().unwrap()
}
pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
dfa: &dense::Repr<T, S>,
) -> Result<SparseDFA<Vec<u8>, A>> {
Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
}
}
impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
match *self {
SparseDFA::Standard(Standard(ref r)) => {
SparseDFA::Standard(Standard(r.as_ref()))
}
SparseDFA::ByteClass(ByteClass(ref r)) => {
SparseDFA::ByteClass(ByteClass(r.as_ref()))
}
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
#[cfg(feature = "std")]
pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
match *self {
SparseDFA::Standard(Standard(ref r)) => {
SparseDFA::Standard(Standard(r.to_owned()))
}
SparseDFA::ByteClass(ByteClass(ref r)) => {
SparseDFA::ByteClass(ByteClass(r.to_owned()))
}
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
pub fn memory_usage(&self) -> usize {
self.repr().memory_usage()
}
fn repr(&self) -> &Repr<T, S> {
match *self {
SparseDFA::Standard(ref r) => &r.0,
SparseDFA::ByteClass(ref r) => &r.0,
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
}
#[cfg(feature = "std")]
impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
self.to_sized()
}
pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
self.to_sized()
}
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
self.to_sized()
}
#[cfg(target_pointer_width = "64")]
pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
self.to_sized()
}
pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
self.repr().to_sized().map(|r| r.into_sparse_dfa())
}
pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<LittleEndian>()
}
pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<BigEndian>()
}
pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
self.repr().to_bytes::<NativeEndian>()
}
}
impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
Repr::from_bytes(buf).into_sparse_dfa()
}
}
impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.repr().start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.repr().is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.repr().is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.repr().is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.repr().is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
match *self {
SparseDFA::Standard(ref r) => r.next_state(current, input),
SparseDFA::ByteClass(ref r) => r.next_state(current, input),
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
self.next_state(current, input)
}
#[inline]
fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
match *self {
SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
SparseDFA::Standard(ref r) => r.find_at(bytes, start),
SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
#[inline]
fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
match *self {
SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
SparseDFA::__Nonexhaustive => unreachable!(),
}
}
}
#[derive(Clone, Debug)]
pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(
Repr<T, S>,
);
impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
self.0.state(current).next(input)
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
self.next_state(current, input)
}
}
#[derive(Clone, Debug)]
pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(
Repr<T, S>,
);
impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
type ID = S;
#[inline]
fn start_state(&self) -> S {
self.0.start_state()
}
#[inline]
fn is_match_state(&self, id: S) -> bool {
self.0.is_match_state(id)
}
#[inline]
fn is_dead_state(&self, id: S) -> bool {
self.0.is_dead_state(id)
}
#[inline]
fn is_match_or_dead_state(&self, id: S) -> bool {
self.0.is_match_or_dead_state(id)
}
#[inline]
fn is_anchored(&self) -> bool {
self.0.is_anchored()
}
#[inline]
fn next_state(&self, current: S, input: u8) -> S {
let input = self.0.byte_classes.get(input);
self.0.state(current).next(input)
}
#[inline]
unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
self.next_state(current, input)
}
}
#[derive(Clone)]
#[cfg_attr(not(feature = "std"), derive(Debug))]
struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
anchored: bool,
start: S,
state_count: usize,
max_match: S,
byte_classes: ByteClasses,
trans: T,
}
impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
fn into_sparse_dfa(self) -> SparseDFA<T, S> {
if self.byte_classes.is_singleton() {
SparseDFA::Standard(Standard(self))
} else {
SparseDFA::ByteClass(ByteClass(self))
}
}
fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
Repr {
anchored: self.anchored,
start: self.start,
state_count: self.state_count,
max_match: self.max_match,
byte_classes: self.byte_classes.clone(),
trans: self.trans(),
}
}
#[cfg(feature = "std")]
fn to_owned(&self) -> Repr<Vec<u8>, S> {
Repr {
anchored: self.anchored,
start: self.start,
state_count: self.state_count,
max_match: self.max_match,
byte_classes: self.byte_classes.clone(),
trans: self.trans().to_vec(),
}
}
#[inline]
fn state<'a>(&'a self, id: S) -> State<'a, S> {
let mut pos = id.to_usize();
let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
pos += 2;
let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
pos += 2 * ntrans;
let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
}
#[cfg(feature = "std")]
fn states<'a>(&'a self) -> StateIter<'a, T, S> {
StateIter { dfa: self, id: dead_id() }
}
fn memory_usage(&self) -> usize {
self.trans().len()
}
fn start_state(&self) -> S {
self.start
}
fn is_match_state(&self, id: S) -> bool {
self.is_match_or_dead_state(id) && !self.is_dead_state(id)
}
fn is_dead_state(&self, id: S) -> bool {
id == dead_id()
}
fn is_match_or_dead_state(&self, id: S) -> bool {
id <= self.max_match
}
fn is_anchored(&self) -> bool {
self.anchored
}
fn trans(&self) -> &[u8] {
self.trans.as_ref()
}
#[cfg(feature = "std")]
fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
for (old_id, state) in self.states() {
let pos = trans.len();
map.insert(old_id, usize_to_state_id(pos)?);
let n = state.ntrans;
let zeros = 2 + (n * 2) + (n * size_of::<A>());
trans.extend(iter::repeat(0).take(zeros));
NativeEndian::write_u16(&mut trans[pos..], n as u16);
let (s, e) = (pos + 2, pos + 2 + (n * 2));
trans[s..e].copy_from_slice(state.input_ranges);
}
let mut new = Repr {
anchored: self.anchored,
start: map[&self.start],
state_count: self.state_count,
max_match: map[&self.max_match],
byte_classes: self.byte_classes.clone(),
trans: trans,
};
for (&old_id, &new_id) in map.iter() {
let old_state = self.state(old_id);
let mut new_state = new.state_mut(new_id);
for i in 0..new_state.ntrans {
let next = map[&old_state.next_at(i)];
new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
}
}
new.start = map[&self.start];
new.max_match = map[&self.max_match];
Ok(new)
}
#[cfg(feature = "std")]
fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
let label = b"rust-regex-automata-sparse-dfa\x00";
let size =
label.len()
+ 2
+ 2
+ 2
+ 2
+ 8
+ 8
+ 8
+ 256
+ self.trans().len();
let mut i = 0;
let mut buf = vec![0; size];
for &b in label {
buf[i] = b;
i += 1;
}
A::write_u16(&mut buf[i..], 0xFEFF);
i += 2;
A::write_u16(&mut buf[i..], 1);
i += 2;
let state_size = size_of::<S>();
if ![1, 2, 4, 8].contains(&state_size) {
return Err(Error::serialize(&format!(
"state size of {} not supported, must be 1, 2, 4 or 8",
state_size
)));
}
A::write_u16(&mut buf[i..], state_size as u16);
i += 2;
let mut options = 0u16;
if self.anchored {
options |= dense::MASK_ANCHORED;
}
A::write_u16(&mut buf[i..], options);
i += 2;
A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
i += 8;
A::write_u64(&mut buf[i..], self.state_count as u64);
i += 8;
A::write_u64(
&mut buf[i..],
self.max_match.to_usize() as u64,
);
i += 8;
for b in (0..256).map(|b| b as u8) {
buf[i] = self.byte_classes.get(b);
i += 1;
}
for (_, state) in self.states() {
A::write_u16(&mut buf[i..], state.ntrans as u16);
i += 2;
buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
i += state.ntrans * 2;
for j in 0..state.ntrans {
write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
i += size_of::<S>();
}
}
assert_eq!(size, i, "expected to consume entire buffer");
Ok(buf)
}
}
impl<'a, S: StateID> Repr<&'a [u8], S> {
unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
match buf.iter().position(|&b| b == b'\x00') {
None => panic!("could not find label"),
Some(i) => buf = &buf[i+1..],
}
let endian_check = NativeEndian::read_u16(buf);
buf = &buf[2..];
if endian_check != 0xFEFF {
panic!(
"endianness mismatch, expected 0xFEFF but got 0x{:X}. \
are you trying to load a SparseDFA serialized with a \
different endianness?",
endian_check,
);
}
let version = NativeEndian::read_u16(buf);
buf = &buf[2..];
if version != 1 {
panic!(
"expected version 1, but found unsupported version {}",
version,
);
}
let state_size = NativeEndian::read_u16(buf) as usize;
if state_size != size_of::<S>() {
panic!(
"state size of SparseDFA ({}) does not match \
requested state size ({})",
state_size, size_of::<S>(),
);
}
buf = &buf[2..];
let opts = NativeEndian::read_u16(buf);
buf = &buf[2..];
let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
buf = &buf[8..];
let state_count = NativeEndian::read_u64(buf) as usize;
buf = &buf[8..];
let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
buf = &buf[8..];
let byte_classes = ByteClasses::from_slice(&buf[..256]);
buf = &buf[256..];
Repr {
anchored: opts & dense::MASK_ANCHORED > 0,
start,
state_count,
max_match,
byte_classes,
trans: buf,
}
}
}
#[cfg(feature = "std")]
impl<S: StateID> Repr<Vec<u8>, S> {
fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
dfa: &dense::Repr<T, S>,
) -> Result<Repr<Vec<u8>, A>> {
let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
for (old_id, state) in dfa.states() {
let pos = trans.len();
remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
trans.push(0);
trans.push(0);
let mut trans_count = 0;
for (b1, b2, _) in state.sparse_transitions() {
trans_count += 1;
trans.push(b1);
trans.push(b2);
}
NativeEndian::write_u16(&mut trans[pos..], trans_count);
let zeros = trans_count as usize * size_of::<A>();
trans.extend(iter::repeat(0).take(zeros));
}
let mut new = Repr {
anchored: dfa.is_anchored(),
start: remap[dfa.state_id_to_index(dfa.start_state())],
state_count: dfa.state_count(),
max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
byte_classes: dfa.byte_classes().clone(),
trans: trans,
};
for (old_id, old_state) in dfa.states() {
let new_id = remap[dfa.state_id_to_index(old_id)];
let mut new_state = new.state_mut(new_id);
let sparse = old_state.sparse_transitions();
for (i, (_, _, next)) in sparse.enumerate() {
let next = remap[dfa.state_id_to_index(next)];
new_state.set_next_at(i, next);
}
}
Ok(new)
}
fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
let mut pos = id.to_usize();
let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
pos += 2;
let size = (ntrans * 2) + (ntrans * size_of::<S>());
let ranges_and_next = &mut self.trans[pos..pos + size];
let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
}
}
#[cfg(feature = "std")]
impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn state_status<T: AsRef<[u8]>, S: StateID>(
dfa: &Repr<T, S>,
id: S,
) -> &'static str {
if id == dead_id() {
if dfa.is_match_state(id) {
"D*"
} else {
"D "
}
} else if id == dfa.start_state() {
if dfa.is_match_state(id) {
">*"
} else {
"> "
}
} else {
if dfa.is_match_state(id) {
" *"
} else {
" "
}
}
}
writeln!(f, "SparseDFA(")?;
for (id, state) in self.states() {
let status = state_status(self, id);
writeln!(f, "{}{:04}: {:?}", status, id.to_usize(), state)?;
}
writeln!(f, ")")?;
Ok(())
}
}
#[cfg(feature = "std")]
#[derive(Debug)]
struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
dfa: &'a Repr<T, S>,
id: S,
}
#[cfg(feature = "std")]
impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
type Item = (S, State<'a, S>);
fn next(&mut self) -> Option<(S, State<'a, S>)> {
if self.id.to_usize() >= self.dfa.trans().len() {
return None;
}
let id = self.id;
let state = self.dfa.state(id);
self.id = S::from_usize(self.id.to_usize() + state.bytes());
Some((id, state))
}
}
#[derive(Clone)]
struct State<'a, S: StateID = usize> {
_state_id_repr: PhantomData<S>,
ntrans: usize,
input_ranges: &'a [u8],
next: &'a [u8],
}
impl<'a, S: StateID> State<'a, S> {
fn next(&self, input: u8) -> S {
for i in 0..self.ntrans {
let (start, end) = self.range(i);
if start <= input && input <= end {
return self.next_at(i)
}
}
dead_id()
}
fn range(&self, i: usize) -> (u8, u8) {
(self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
}
fn next_at(&self, i: usize) -> S {
S::read_bytes(&self.next[i * size_of::<S>()..])
}
#[cfg(feature = "std")]
fn bytes(&self) -> usize {
2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
}
}
#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for State<'a, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut transitions = vec![];
for i in 0..self.ntrans {
let next = self.next_at(i);
if next == dead_id() {
continue;
}
let (start, end) = self.range(i);
if start == end {
transitions.push(
format!("{} => {}", escape(start), next.to_usize()),
);
} else {
transitions.push(
format!(
"{}-{} => {}",
escape(start),
escape(end),
next.to_usize(),
),
);
}
}
write!(f, "{}", transitions.join(", "))
}
}
#[cfg(feature = "std")]
struct StateMut<'a, S: StateID = usize> {
_state_id_repr: PhantomData<S>,
ntrans: usize,
input_ranges: &'a mut [u8],
next: &'a mut [u8],
}
#[cfg(feature = "std")]
impl<'a, S: StateID> StateMut<'a, S> {
fn set_next_at(&mut self, i: usize, next: S) {
next.write_bytes(&mut self.next[i * size_of::<S>()..]);
}
}
#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let state = State {
_state_id_repr: self._state_id_repr,
ntrans: self.ntrans,
input_ranges: self.input_ranges,
next: self.next,
};
fmt::Debug::fmt(&state, f)
}
}
#[cfg(feature = "std")]
fn escape(b: u8) -> String {
use std::ascii;
String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
}
#[inline(always)]
#[allow(dead_code)]
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
debug_assert!(ranges.len() <= 512, "ranges should be short");
let (mut left, mut right) = (0, ranges.len() / 2);
while left < right {
let mid = (left + right) / 2;
let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
if needle < b1 {
right = mid;
} else if needle > b2 {
left = mid + 1;
} else {
return Some(mid);
}
}
None
}