vyre-std 0.1.0

Vyre standard library: GPU DFA assembly pipeline, Aho-Corasick construction, and compositional arithmetic helpers
Documentation
//! Shared types for the vyre-std DFA assembly pipeline.
//!
//! The pipeline stages pass these types between each other:
//!
//! ```text
//! regex_to_nfa → Nfa → nfa_to_dfa → Dfa → dfa_minimize → Dfa → dfa_pack → PackedDfa
//! ```
//!
//! Each stage is a pure function; the final `PackedDfa` is a POD byte
//! layout suitable for direct upload to a GPU buffer.

/// Sentinel state id used to mark "no transition on this byte."
pub const INVALID_STATE: u32 = u32::MAX;

/// Construction error shared across the pipeline.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum PatternError {
    /// Regex source contained a character outside the supported subset or a
    /// malformed sequence.
    ///
    /// Carries the byte offset and a human-readable description. Every
    /// message starts with `Fix:` so consumers can surface the remediation
    /// directly to end users.
    ParseError {
        /// Byte offset at which the parser gave up.
        offset: usize,
        /// Actionable `Fix:` prose describing what went wrong.
        message: String,
    },
    /// Regex source used an escape sequence reserved for regex shorthand or
    /// anchor semantics that this byte-oriented frontend does not implement.
    UnsupportedEscape {
        /// Byte offset of the leading backslash.
        offset: usize,
        /// Escaped ASCII byte after the backslash.
        escape: char,
    },
    /// Regex source exceeded the parser's bounded recursive group nesting.
    RecursionLimit {
        /// Maximum group nesting depth permitted.
        limit: usize,
    },
    /// The NFA or DFA exceeded a configured state budget.
    StateOverflow {
        /// Maximum state count permitted.
        limit: u32,
    },
    /// The pattern set was empty or otherwise structurally invalid for
    /// composite assembly (e.g., mixing empty regex with literal patterns).
    EmptyPatternSet,
}

impl core::fmt::Display for PatternError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::ParseError { offset, message } => {
                write!(f, "pattern parse error at byte {offset}: {message}")
            }
            Self::UnsupportedEscape { offset, escape } => {
                write!(
                    f,
                    "pattern parse error at byte {offset}: Fix: unsupported escape `\\{escape}`; spell the byte set explicitly with a character class."
                )
            }
            Self::RecursionLimit { limit } => {
                write!(
                    f,
                    "Fix: regex group nesting exceeded the recursion limit ({limit}); flatten or split the expression."
                )
            }
            Self::StateOverflow { limit } => {
                write!(
                    f,
                    "Fix: state count exceeded the pipeline budget ({limit}); split the pattern set into smaller batches."
                )
            }
            Self::EmptyPatternSet => write!(
                f,
                "Fix: pattern set must contain at least one non-empty pattern."
            ),
        }
    }
}

impl std::error::Error for PatternError {}

/// Thompson NFA state identifier.
pub type NfaStateId = u32;

/// NFA transition: `(from, label, to)` where `None` label is epsilon.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NfaEdge {
    /// Source state id.
    pub from: NfaStateId,
    /// Byte label, or `None` for an epsilon transition.
    pub byte: Option<u8>,
    /// Destination state id.
    pub to: NfaStateId,
}

/// Non-deterministic finite automaton in Thompson form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Nfa {
    /// Total state count; states are `0..state_count`.
    pub state_count: u32,
    /// Edges including both byte-labelled and epsilon transitions.
    pub edges: Vec<NfaEdge>,
    /// Index of the start state. Always present for a well-formed NFA.
    pub start: NfaStateId,
    /// Accept-state bitmap.
    pub accept: Vec<bool>,
}

/// Deterministic finite automaton with a dense transition table.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Dfa {
    /// Total state count.
    pub state_count: u32,
    /// Transitions laid out row-major: `transitions[state * 256 + byte]`.
    /// Values equal to [`INVALID_STATE`] mean "no transition."
    pub transitions: Vec<u32>,
    /// Start state id.
    pub start: u32,
    /// Accept-state bitmap.
    pub accept: Vec<bool>,
}

impl Dfa {
    /// Read the transition for `(state, byte)`.
    ///
    /// Returns [`INVALID_STATE`] when no transition is defined.
    #[must_use]
    #[inline]
    pub fn go(&self, state: u32, byte: u8) -> u32 {
        self.transitions[(state as usize) * 256 + byte as usize]
    }

    /// Write a transition for `(state, byte)`.
    #[inline]
    pub fn set(&mut self, state: u32, byte: u8, next: u32) {
        self.transitions[(state as usize) * 256 + byte as usize] = next;
    }
}

/// Packing format selector for [`super::dfa_pack::dfa_pack`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum DfaPackFormat {
    /// Dense row-major: fastest scan, largest memory. 256 × 4 bytes per state.
    Dense,
    /// Byte equivalence-class compression: smallest when effective alphabet
    /// is narrow. Emits a 256-entry class table + `state × num_classes`.
    EquivClass,
}

/// Packed DFA ready for upload to a GPU buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PackedDfa {
    /// Which format the bytes encode.
    pub format: DfaPackFormat,
    /// Number of DFA states before packing.
    pub state_count: u32,
    /// Start state id.
    pub start: u32,
    /// Raw serialized bytes (GPU-uploadable).
    pub bytes: Vec<u8>,
}