Skip to main content

vyre_std/pattern/
types.rs

1//! Shared types for the vyre-std DFA assembly pipeline.
2//!
3//! The pipeline stages pass these types between each other:
4//!
5//! ```text
6//! regex_to_nfa → Nfa → nfa_to_dfa → Dfa → dfa_minimize → Dfa → dfa_pack → PackedDfa
7//! ```
8//!
9//! Each stage is a pure function; the final `PackedDfa` is a POD byte
10//! layout suitable for direct upload to a GPU buffer.
11
12/// Sentinel state id used to mark "no transition on this byte."
13pub const INVALID_STATE: u32 = u32::MAX;
14
15/// Construction error shared across the pipeline.
16#[derive(Debug, Clone, PartialEq, Eq)]
17#[non_exhaustive]
18pub enum PatternError {
19    /// Regex source contained a character outside the supported subset or a
20    /// malformed sequence.
21    ///
22    /// Carries the byte offset and a human-readable description. Every
23    /// message starts with `Fix:` so consumers can surface the remediation
24    /// directly to end users.
25    ParseError {
26        /// Byte offset at which the parser gave up.
27        offset: usize,
28        /// Actionable `Fix:` prose describing what went wrong.
29        message: String,
30    },
31    /// Regex source used an escape sequence reserved for regex shorthand or
32    /// anchor semantics that this byte-oriented frontend does not implement.
33    UnsupportedEscape {
34        /// Byte offset of the leading backslash.
35        offset: usize,
36        /// Escaped ASCII byte after the backslash.
37        escape: char,
38    },
39    /// Regex source exceeded the parser's bounded recursive group nesting.
40    RecursionLimit {
41        /// Maximum group nesting depth permitted.
42        limit: usize,
43    },
44    /// The NFA or DFA exceeded a configured state budget.
45    StateOverflow {
46        /// Maximum state count permitted.
47        limit: u32,
48    },
49    /// The pattern set was empty or otherwise structurally invalid for
50    /// composite assembly (e.g., mixing empty regex with literal patterns).
51    EmptyPatternSet,
52}
53
54impl core::fmt::Display for PatternError {
55    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
56        match self {
57            Self::ParseError { offset, message } => {
58                write!(f, "pattern parse error at byte {offset}: {message}")
59            }
60            Self::UnsupportedEscape { offset, escape } => {
61                write!(
62                    f,
63                    "pattern parse error at byte {offset}: Fix: unsupported escape `\\{escape}`; spell the byte set explicitly with a character class."
64                )
65            }
66            Self::RecursionLimit { limit } => {
67                write!(
68                    f,
69                    "Fix: regex group nesting exceeded the recursion limit ({limit}); flatten or split the expression."
70                )
71            }
72            Self::StateOverflow { limit } => {
73                write!(
74                    f,
75                    "Fix: state count exceeded the pipeline budget ({limit}); split the pattern set into smaller batches."
76                )
77            }
78            Self::EmptyPatternSet => write!(
79                f,
80                "Fix: pattern set must contain at least one non-empty pattern."
81            ),
82        }
83    }
84}
85
86impl std::error::Error for PatternError {}
87
88/// Thompson NFA state identifier.
89pub type NfaStateId = u32;
90
91/// NFA transition: `(from, label, to)` where `None` label is epsilon.
92#[derive(Debug, Clone, PartialEq, Eq)]
93pub struct NfaEdge {
94    /// Source state id.
95    pub from: NfaStateId,
96    /// Byte label, or `None` for an epsilon transition.
97    pub byte: Option<u8>,
98    /// Destination state id.
99    pub to: NfaStateId,
100}
101
102/// Non-deterministic finite automaton in Thompson form.
103#[derive(Debug, Clone, PartialEq, Eq)]
104pub struct Nfa {
105    /// Total state count; states are `0..state_count`.
106    pub state_count: u32,
107    /// Edges including both byte-labelled and epsilon transitions.
108    pub edges: Vec<NfaEdge>,
109    /// Index of the start state. Always present for a well-formed NFA.
110    pub start: NfaStateId,
111    /// Accept-state bitmap.
112    pub accept: Vec<bool>,
113}
114
115/// Deterministic finite automaton with a dense transition table.
116#[derive(Debug, Clone, PartialEq, Eq)]
117pub struct Dfa {
118    /// Total state count.
119    pub state_count: u32,
120    /// Transitions laid out row-major: `transitions[state * 256 + byte]`.
121    /// Values equal to [`INVALID_STATE`] mean "no transition."
122    pub transitions: Vec<u32>,
123    /// Start state id.
124    pub start: u32,
125    /// Accept-state bitmap.
126    pub accept: Vec<bool>,
127}
128
129impl Dfa {
130    /// Read the transition for `(state, byte)`.
131    ///
132    /// Returns [`INVALID_STATE`] when no transition is defined.
133    #[must_use]
134    #[inline]
135    pub fn go(&self, state: u32, byte: u8) -> u32 {
136        self.transitions[(state as usize) * 256 + byte as usize]
137    }
138
139    /// Write a transition for `(state, byte)`.
140    #[inline]
141    pub fn set(&mut self, state: u32, byte: u8, next: u32) {
142        self.transitions[(state as usize) * 256 + byte as usize] = next;
143    }
144}
145
146/// Packing format selector for [`super::dfa_pack::dfa_pack`].
147#[derive(Debug, Clone, Copy, PartialEq, Eq)]
148#[non_exhaustive]
149pub enum DfaPackFormat {
150    /// Dense row-major: fastest scan, largest memory. 256 × 4 bytes per state.
151    Dense,
152    /// Byte equivalence-class compression: smallest when effective alphabet
153    /// is narrow. Emits a 256-entry class table + `state × num_classes`.
154    EquivClass,
155}
156
157/// Packed DFA ready for upload to a GPU buffer.
158#[derive(Debug, Clone, PartialEq, Eq)]
159pub struct PackedDfa {
160    /// Which format the bytes encode.
161    pub format: DfaPackFormat,
162    /// Number of DFA states before packing.
163    pub state_count: u32,
164    /// Start state id.
165    pub start: u32,
166    /// Raw serialized bytes (GPU-uploadable).
167    pub bytes: Vec<u8>,
168}