regex-automata 0.1.3

Automata construction and matching using regular expressions.
Documentation
use core::fmt::Debug;
use core::hash::Hash;
use core::mem::size_of;

use byteorder::{ByteOrder, NativeEndian};

#[cfg(feature = "std")]
pub use self::std::*;

#[cfg(feature = "std")]
mod std {
    use core::mem::size_of;
    use byteorder::ByteOrder;
    use error::{Error, Result};

    use super::StateID;

    /// Check that the premultiplication of the given state identifier can
    /// fit into the representation indicated by `S`. If it cannot, or if it
    /// overflows `usize` itself, then an error is returned.
    pub fn premultiply_overflow_error<S: StateID>(
        last_state: S,
        alphabet_len: usize,
    ) -> Result<()> {
        let requested = match last_state.to_usize().checked_mul(alphabet_len) {
            Some(requested) => requested,
            None => return Err(Error::premultiply_overflow(0, 0)),
        };
        if requested > S::max_id() {
            return Err(Error::premultiply_overflow(S::max_id(), requested));
        }
        Ok(())
    }

    /// Allocate the next sequential identifier for a fresh state given
    /// the previously constructed state identified by `current`. If the
    /// next sequential identifier would overflow `usize` or the chosen
    /// representation indicated by `S`, then an error is returned.
    pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
        let next = match current.to_usize().checked_add(1) {
            Some(next) => next,
            None => return Err(Error::state_id_overflow(::std::usize::MAX)),
        };
        if next > S::max_id() {
            return Err(Error::state_id_overflow(S::max_id()));
        }
        Ok(S::from_usize(next))
    }

    /// Convert the given `usize` to the chosen state identifier
    /// representation. If the given value cannot fit in the chosen
    /// representation, then an error is returned.
    pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
        if value > S::max_id() {
            Err(Error::state_id_overflow(S::max_id()))
        } else {
            Ok(S::from_usize(value))
        }
    }

    /// Write the given identifier to the given slice of bytes using the
    /// specified endianness. The given slice must have length at least
    /// `size_of::<S>()`.
    ///
    /// The given state identifier representation must have size 1, 2, 4 or 8.
    pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
        slice: &mut [u8],
        id: S,
    ) {
        assert!(
            1 == size_of::<S>()
            || 2 == size_of::<S>()
            || 4 == size_of::<S>()
            || 8 == size_of::<S>()
        );

        match size_of::<S>() {
            1 => slice[0] = id.to_usize() as u8,
            2 => E::write_u16(slice, id.to_usize() as u16),
            4 => E::write_u32(slice, id.to_usize() as u32),
            8 => E::write_u64(slice, id.to_usize() as u64),
            _ => unreachable!(),
        }
    }
}

/// Return the unique identifier for a DFA's dead state in the chosen
/// representation indicated by `S`.
pub fn dead_id<S: StateID>() -> S {
    S::from_usize(0)
}

/// A trait describing the representation of a DFA's state identifier.
///
/// The purpose of this trait is to safely express both the possible state
/// identifier representations that can be used in a DFA and to convert between
/// state identifier representations and types that can be used to efficiently
/// index memory (such as `usize`).
///
/// In general, one should not need to implement this trait explicitly. In
/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
/// represent all corresponding values in a `usize`.)
///
/// # Safety
///
/// This trait is unsafe because the correctness of its implementations may be
/// relied upon by other unsafe code. For example, one possible way to
/// implement this trait incorrectly would be to return a maximum identifier
/// in `max_id` that is greater than the real maximum identifier. This will
/// likely result in wrap-on-overflow semantics in release mode, which can in
/// turn produce incorrect state identifiers. Those state identifiers may then
/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
/// checks are explicitly elided for performance reasons.
pub unsafe trait StateID:
    Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
{
    /// Convert from a `usize` to this implementation's representation.
    ///
    /// Implementors may assume that `n <= Self::max_id`. That is, implementors
    /// do not need to check whether `n` can fit inside this implementation's
    /// representation.
    fn from_usize(n: usize) -> Self;

    /// Convert this implementation's representation to a `usize`.
    ///
    /// Implementors must not return a `usize` value greater than
    /// `Self::max_id` and must not permit overflow when converting between the
    /// implementor's representation and `usize`. In general, the preferred
    /// way for implementors to achieve this is to simply not provide
    /// implementations of `StateID` that cannot fit into the target platform's
    /// `usize`.
    fn to_usize(self) -> usize;

    /// Return the maximum state identifier supported by this representation.
    ///
    /// Implementors must return a correct bound. Doing otherwise may result
    /// in memory unsafety.
    fn max_id() -> usize;

    /// Read a single state identifier from the given slice of bytes in native
    /// endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn read_bytes(slice: &[u8]) -> Self;

    /// Write this state identifier to the given slice of bytes in native
    /// endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn write_bytes(self, slice: &mut [u8]);
}

unsafe impl StateID for usize {
    #[inline]
    fn from_usize(n: usize) -> usize { n }

    #[inline]
    fn to_usize(self) -> usize { self }

    #[inline]
    fn max_id() -> usize { ::core::usize::MAX }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_uint(slice, size_of::<usize>()) as usize
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
    }
}

unsafe impl StateID for u8 {
    #[inline]
    fn from_usize(n: usize) -> u8 { n as u8 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u8::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        slice[0]
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        slice[0] = self;
    }
}

unsafe impl StateID for u16 {
    #[inline]
    fn from_usize(n: usize) -> u16 { n as u16 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u16::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u16(slice)
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u16(slice, self)
    }
}

#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
unsafe impl StateID for u32 {
    #[inline]
    fn from_usize(n: usize) -> u32 { n as u32 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u32::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u32(slice)
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u32(slice, self)
    }
}

#[cfg(target_pointer_width = "64")]
unsafe impl StateID for u64 {
    #[inline]
    fn from_usize(n: usize) -> u64 { n as u64 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u64::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u64(slice)
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u64(slice, self)
    }
}