Skip to main content

vyre_std/pattern/
dfa_assemble.rs

1//! Composite entry point for the vyre-std DFA assembly pipeline.
2//!
3//! `dfa_assemble` is the 10-line-consumer API. Give it a slice of patterns
4//! (literal bytes or regex strings), get back a [`PackedDfa`] ready to
5//! upload to a GPU buffer and drive [`vyre::match_ops::dfa_scan`].
6
7use super::types::{DfaPackFormat, PackedDfa, PatternError};
8use super::{dfa_minimize, dfa_pack, nfa_to_dfa, regex_to_nfa};
9
10/// Input pattern variants recognized by [`dfa_assemble`].
11#[derive(Debug, Clone, PartialEq, Eq)]
12pub enum Pattern<'a> {
13    /// Raw literal bytes to match verbatim.
14    Literal(&'a [u8]),
15    /// Regex source string compiled by [`super::regex_to_nfa`].
16    Regex(&'a str),
17}
18
19/// Options controlling the assembly pipeline.
20#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21#[non_exhaustive]
22pub struct AssembleOptions {
23    /// Requested pack format. `Dense` if you want fastest scan; `EquivClass`
24    /// if you want the smallest GPU buffer for small alphabets.
25    pub format: DfaPackFormat,
26    /// Whether to run [`super::dfa_minimize::dfa_minimize`] before packing.
27    /// Default `true` — skipping minimization is only useful when the
28    /// pipeline output is re-consumed by another minimizer.
29    pub minimize: bool,
30}
31
32impl Default for AssembleOptions {
33    fn default() -> Self {
34        Self {
35            format: DfaPackFormat::Dense,
36            minimize: true,
37        }
38    }
39}
40
41/// Compile a set of patterns into a GPU-ready packed DFA.
42///
43/// # Examples
44///
45/// ```
46/// use vyre_std::pattern::dfa_assemble::{dfa_assemble, AssembleOptions, Pattern};
47///
48/// let patterns = [
49///     Pattern::Literal(b"GET /api"),
50///     Pattern::Regex("POST /api/v[0-9]+"),
51/// ];
52/// let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
53/// assert!(!packed.bytes.is_empty());
54/// ```
55///
56/// # Errors
57///
58/// - [`PatternError::EmptyPatternSet`] if `patterns` is empty or contains an
59///   empty literal byte string.
60/// - [`PatternError::ParseError`] if any regex pattern is malformed.
61/// - [`PatternError::StateOverflow`] if the combined NFA→DFA subset
62///   construction blows through the state budget.
63#[inline]
64pub fn dfa_assemble(
65    patterns: &[Pattern<'_>],
66    options: AssembleOptions,
67) -> Result<PackedDfa, PatternError> {
68    if patterns.is_empty()
69        || patterns
70            .iter()
71            .any(|pattern| matches!(pattern, Pattern::Literal(bytes) if bytes.is_empty()))
72    {
73        return Err(PatternError::EmptyPatternSet);
74    }
75
76    let combined_regex = combined_pattern_regex(patterns);
77    let nfa = regex_to_nfa::regex_to_nfa(&combined_regex)?;
78    let dfa = nfa_to_dfa::nfa_to_dfa(&nfa)?;
79    let final_dfa = if options.minimize {
80        dfa_minimize::dfa_minimize(&dfa)
81    } else {
82        dfa
83    };
84    Ok(dfa_pack::dfa_pack(&final_dfa, options.format))
85}
86
87fn combined_pattern_regex(patterns: &[Pattern<'_>]) -> String {
88    let mut out = String::with_capacity(patterns.iter().map(pattern_regex_len_hint).sum());
89    for (index, pattern) in patterns.iter().enumerate() {
90        if index != 0 {
91            out.push('|');
92        }
93        push_pattern_regex(&mut out, pattern);
94    }
95    out
96}
97
98fn pattern_regex_len_hint(pattern: &Pattern<'_>) -> usize {
99    match pattern {
100        Pattern::Literal(bytes) => bytes.len().saturating_mul(4).max(1),
101        Pattern::Regex(source) => source.len(),
102    }
103}
104
105fn push_pattern_regex(out: &mut String, pattern: &Pattern<'_>) {
106    match pattern {
107        Pattern::Literal(bytes) => {
108            for &b in *bytes {
109                escape_literal_byte(out, b);
110            }
111        }
112        Pattern::Regex(source) => out.push_str(source),
113    }
114}
115
116fn escape_literal_byte(out: &mut String, byte: u8) {
117    if matches!(
118        byte,
119        b'.' | b'*' | b'+' | b'?' | b'|' | b'(' | b')' | b'[' | b']' | b'\\' | b'^' | b'$'
120    ) {
121        out.push('\\');
122        out.push(byte as char);
123    } else if byte.is_ascii() {
124        out.push(byte as char);
125    } else {
126        // Non-ASCII literal: emit a character class with just that byte so
127        // the regex parser treats it as a single-byte atom.
128        out.push_str("[\\x");
129        push_hex_byte(out, byte);
130        out.push(']');
131    }
132}
133
134fn push_hex_byte(out: &mut String, byte: u8) {
135    const HEX: &[u8; 16] = b"0123456789ABCDEF";
136    out.push(HEX[(byte >> 4) as usize] as char);
137    out.push(HEX[(byte & 0x0F) as usize] as char);
138}
139
140#[cfg(test)]
141mod tests {
142    use super::*;
143    use crate::pattern::{dfa_pack::dfa_unpack, types::INVALID_STATE};
144
145    fn accepts(packed: &PackedDfa, input: &[u8]) -> bool {
146        let dfa = dfa_unpack(packed).expect("unpack");
147        let mut state = dfa.start;
148        for &b in input {
149            let next = dfa.go(state, b);
150            if next == INVALID_STATE {
151                return false;
152            }
153            state = next;
154        }
155        dfa.accept[state as usize]
156    }
157
158    #[test]
159    fn single_literal_pattern() {
160        let patterns = [Pattern::Literal(b"hello")];
161        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
162        assert!(accepts(&packed, b"hello"));
163        assert!(!accepts(&packed, b"hell"));
164        assert!(!accepts(&packed, b"world"));
165    }
166
167    #[test]
168    fn multiple_literal_patterns() {
169        let patterns = [Pattern::Literal(b"foo"), Pattern::Literal(b"bar")];
170        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
171        assert!(accepts(&packed, b"foo"));
172        assert!(accepts(&packed, b"bar"));
173        assert!(!accepts(&packed, b"baz"));
174    }
175
176    #[test]
177    fn regex_pattern() {
178        let patterns = [Pattern::Regex("a(b|c)*d")];
179        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
180        assert!(accepts(&packed, b"ad"));
181        assert!(accepts(&packed, b"abd"));
182        assert!(accepts(&packed, b"abcbcd"));
183        assert!(!accepts(&packed, b"a"));
184    }
185
186    #[test]
187    fn literal_escapes_metachars() {
188        let patterns = [Pattern::Literal(b"a.b*c")];
189        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
190        assert!(accepts(&packed, b"a.b*c"));
191        assert!(!accepts(&packed, b"axbxc"));
192    }
193
194    #[test]
195    fn literal_non_ascii_byte_stays_single_byte() {
196        let patterns = [Pattern::Literal(&[0xE9])];
197        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
198        assert!(accepts(&packed, &[0xE9]));
199        assert!(!accepts(&packed, "\u{00E9}".as_bytes()));
200    }
201
202    #[test]
203    fn mixed_literal_and_regex() {
204        let patterns = [Pattern::Literal(b"exact"), Pattern::Regex("[0-9]+")];
205        let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
206        assert!(accepts(&packed, b"exact"));
207        assert!(accepts(&packed, b"12345"));
208        assert!(!accepts(&packed, b"hello"));
209    }
210
211    #[test]
212    fn empty_pattern_set_errors() {
213        let err = dfa_assemble(&[], AssembleOptions::default()).unwrap_err();
214        assert!(matches!(err, PatternError::EmptyPatternSet));
215    }
216
217    #[test]
218    fn empty_literal_pattern_errors() {
219        let err = dfa_assemble(&[Pattern::Literal(b"")], AssembleOptions::default()).unwrap_err();
220        assert!(matches!(err, PatternError::EmptyPatternSet));
221    }
222
223    #[test]
224    fn malformed_regex_errors() {
225        let patterns = [Pattern::Regex("(unclosed")];
226        assert!(matches!(
227            dfa_assemble(&patterns, AssembleOptions::default()),
228            Err(PatternError::ParseError { .. })
229        ));
230    }
231
232    #[test]
233    fn equiv_class_format() {
234        let patterns = [Pattern::Literal(b"abc")];
235        let options = AssembleOptions {
236            format: DfaPackFormat::EquivClass,
237            minimize: true,
238        };
239        let packed = dfa_assemble(&patterns, options).unwrap();
240        assert_eq!(packed.format, DfaPackFormat::EquivClass);
241        assert!(accepts(&packed, b"abc"));
242    }
243
244    #[test]
245    fn minimize_off_preserves_language() {
246        let patterns = [Pattern::Literal(b"xy")];
247        let options = AssembleOptions {
248            format: DfaPackFormat::Dense,
249            minimize: false,
250        };
251        let packed = dfa_assemble(&patterns, options).unwrap();
252        assert!(accepts(&packed, b"xy"));
253        assert!(!accepts(&packed, b"xz"));
254    }
255}