vyre_std/pattern/
dfa_assemble.rs1use super::types::{DfaPackFormat, PackedDfa, PatternError};
8use super::{dfa_minimize, dfa_pack, nfa_to_dfa, regex_to_nfa};
9
10#[derive(Debug, Clone, PartialEq, Eq)]
12pub enum Pattern<'a> {
13 Literal(&'a [u8]),
15 Regex(&'a str),
17}
18
19#[derive(Debug, Clone, Copy, PartialEq, Eq)]
21#[non_exhaustive]
22pub struct AssembleOptions {
23 pub format: DfaPackFormat,
26 pub minimize: bool,
30}
31
32impl Default for AssembleOptions {
33 fn default() -> Self {
34 Self {
35 format: DfaPackFormat::Dense,
36 minimize: true,
37 }
38 }
39}
40
41#[inline]
64pub fn dfa_assemble(
65 patterns: &[Pattern<'_>],
66 options: AssembleOptions,
67) -> Result<PackedDfa, PatternError> {
68 if patterns.is_empty()
69 || patterns
70 .iter()
71 .any(|pattern| matches!(pattern, Pattern::Literal(bytes) if bytes.is_empty()))
72 {
73 return Err(PatternError::EmptyPatternSet);
74 }
75
76 let combined_regex = combined_pattern_regex(patterns);
77 let nfa = regex_to_nfa::regex_to_nfa(&combined_regex)?;
78 let dfa = nfa_to_dfa::nfa_to_dfa(&nfa)?;
79 let final_dfa = if options.minimize {
80 dfa_minimize::dfa_minimize(&dfa)
81 } else {
82 dfa
83 };
84 Ok(dfa_pack::dfa_pack(&final_dfa, options.format))
85}
86
87fn combined_pattern_regex(patterns: &[Pattern<'_>]) -> String {
88 let mut out = String::with_capacity(patterns.iter().map(pattern_regex_len_hint).sum());
89 for (index, pattern) in patterns.iter().enumerate() {
90 if index != 0 {
91 out.push('|');
92 }
93 push_pattern_regex(&mut out, pattern);
94 }
95 out
96}
97
98fn pattern_regex_len_hint(pattern: &Pattern<'_>) -> usize {
99 match pattern {
100 Pattern::Literal(bytes) => bytes.len().saturating_mul(4).max(1),
101 Pattern::Regex(source) => source.len(),
102 }
103}
104
105fn push_pattern_regex(out: &mut String, pattern: &Pattern<'_>) {
106 match pattern {
107 Pattern::Literal(bytes) => {
108 for &b in *bytes {
109 escape_literal_byte(out, b);
110 }
111 }
112 Pattern::Regex(source) => out.push_str(source),
113 }
114}
115
116fn escape_literal_byte(out: &mut String, byte: u8) {
117 if matches!(
118 byte,
119 b'.' | b'*' | b'+' | b'?' | b'|' | b'(' | b')' | b'[' | b']' | b'\\' | b'^' | b'$'
120 ) {
121 out.push('\\');
122 out.push(byte as char);
123 } else if byte.is_ascii() {
124 out.push(byte as char);
125 } else {
126 out.push_str("[\\x");
129 push_hex_byte(out, byte);
130 out.push(']');
131 }
132}
133
134fn push_hex_byte(out: &mut String, byte: u8) {
135 const HEX: &[u8; 16] = b"0123456789ABCDEF";
136 out.push(HEX[(byte >> 4) as usize] as char);
137 out.push(HEX[(byte & 0x0F) as usize] as char);
138}
139
140#[cfg(test)]
141mod tests {
142 use super::*;
143 use crate::pattern::{dfa_pack::dfa_unpack, types::INVALID_STATE};
144
145 fn accepts(packed: &PackedDfa, input: &[u8]) -> bool {
146 let dfa = dfa_unpack(packed).expect("unpack");
147 let mut state = dfa.start;
148 for &b in input {
149 let next = dfa.go(state, b);
150 if next == INVALID_STATE {
151 return false;
152 }
153 state = next;
154 }
155 dfa.accept[state as usize]
156 }
157
158 #[test]
159 fn single_literal_pattern() {
160 let patterns = [Pattern::Literal(b"hello")];
161 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
162 assert!(accepts(&packed, b"hello"));
163 assert!(!accepts(&packed, b"hell"));
164 assert!(!accepts(&packed, b"world"));
165 }
166
167 #[test]
168 fn multiple_literal_patterns() {
169 let patterns = [Pattern::Literal(b"foo"), Pattern::Literal(b"bar")];
170 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
171 assert!(accepts(&packed, b"foo"));
172 assert!(accepts(&packed, b"bar"));
173 assert!(!accepts(&packed, b"baz"));
174 }
175
176 #[test]
177 fn regex_pattern() {
178 let patterns = [Pattern::Regex("a(b|c)*d")];
179 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
180 assert!(accepts(&packed, b"ad"));
181 assert!(accepts(&packed, b"abd"));
182 assert!(accepts(&packed, b"abcbcd"));
183 assert!(!accepts(&packed, b"a"));
184 }
185
186 #[test]
187 fn literal_escapes_metachars() {
188 let patterns = [Pattern::Literal(b"a.b*c")];
189 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
190 assert!(accepts(&packed, b"a.b*c"));
191 assert!(!accepts(&packed, b"axbxc"));
192 }
193
194 #[test]
195 fn literal_non_ascii_byte_stays_single_byte() {
196 let patterns = [Pattern::Literal(&[0xE9])];
197 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
198 assert!(accepts(&packed, &[0xE9]));
199 assert!(!accepts(&packed, "\u{00E9}".as_bytes()));
200 }
201
202 #[test]
203 fn mixed_literal_and_regex() {
204 let patterns = [Pattern::Literal(b"exact"), Pattern::Regex("[0-9]+")];
205 let packed = dfa_assemble(&patterns, AssembleOptions::default()).unwrap();
206 assert!(accepts(&packed, b"exact"));
207 assert!(accepts(&packed, b"12345"));
208 assert!(!accepts(&packed, b"hello"));
209 }
210
211 #[test]
212 fn empty_pattern_set_errors() {
213 let err = dfa_assemble(&[], AssembleOptions::default()).unwrap_err();
214 assert!(matches!(err, PatternError::EmptyPatternSet));
215 }
216
217 #[test]
218 fn empty_literal_pattern_errors() {
219 let err = dfa_assemble(&[Pattern::Literal(b"")], AssembleOptions::default()).unwrap_err();
220 assert!(matches!(err, PatternError::EmptyPatternSet));
221 }
222
223 #[test]
224 fn malformed_regex_errors() {
225 let patterns = [Pattern::Regex("(unclosed")];
226 assert!(matches!(
227 dfa_assemble(&patterns, AssembleOptions::default()),
228 Err(PatternError::ParseError { .. })
229 ));
230 }
231
232 #[test]
233 fn equiv_class_format() {
234 let patterns = [Pattern::Literal(b"abc")];
235 let options = AssembleOptions {
236 format: DfaPackFormat::EquivClass,
237 minimize: true,
238 };
239 let packed = dfa_assemble(&patterns, options).unwrap();
240 assert_eq!(packed.format, DfaPackFormat::EquivClass);
241 assert!(accepts(&packed, b"abc"));
242 }
243
244 #[test]
245 fn minimize_off_preserves_language() {
246 let patterns = [Pattern::Literal(b"xy")];
247 let options = AssembleOptions {
248 format: DfaPackFormat::Dense,
249 minimize: false,
250 };
251 let packed = dfa_assemble(&patterns, options).unwrap();
252 assert!(accepts(&packed, b"xy"));
253 assert!(!accepts(&packed, b"xz"));
254 }
255}