dyntext/regex_ast.rs
1//! Internal regex AST used by the Phase 2 prefix extractor.
2//!
3//! The full `regex` crate parses and compiles a pattern down to
4//! a finite-state machine; for *prefix extraction* we only need
5//! the structural shape of the pattern, not its matcher. We
6//! reuse [`regex_syntax`]'s high-level intermediate
7//! representation for the heavy lifting (tokenisation,
8//! `\w`/`\d`/`\s` expansion, character class normalisation,
9//! repetition canonicalisation) and project it into a tiny AST
10//! that the prefix extractor in [`crate::prefix_extract`] walks.
11//!
12//! # Supported subset
13//!
14//! The AST and its [`parse`] entry point accept the subset of
15//! regular expressions that [`regex_syntax`] can lower to its
16//! HIR -- this excludes lookarounds, backreferences, and other
17//! features the [`regex`] crate does not implement. Inside that
18//! subset, named capture groups (`(?P<foo>...)`) are flagged
19//! as [`RegexError::PrefixUnsupported`]. All other shapes
20//! (literals, anchors, `.`, character classes, alternation,
21//! grouping, and `*`/`+`/`?`/`{m,n}` repetition) project into
22//! the AST below.
23//!
24//! Callers that hit [`RegexError::PrefixUnsupported`] can still
25//! evaluate the regex; they just have to fall back to a full
26//! scan + recheck instead of the trigram-driven candidate
27//! pruning.
28//!
29//! # Examples
30//!
31//! ```
32//! use dyntext::regex_ast::{parse, Ast};
33//!
34//! let ast = parse("foo|bar").expect("parses");
35//! assert!(matches!(ast, Ast::Alt(_)));
36//! ```
37
38use regex_syntax::hir::{Hir, HirKind};
39use thiserror::Error;
40
41/// Errors surfaced by the regex parser and prefix extractor.
42#[derive(Debug, Error)]
43pub enum RegexError {
44 /// The pattern is syntactically invalid or uses a feature
45 /// outside [`regex_syntax`]'s supported subset (lookahead,
46 /// backreferences, etc).
47 #[error("regex parse error: {0}")]
48 Parse(String),
49
50 /// The pattern parsed but uses a feature the prefix
51 /// extractor cannot lower into trigram constraints.
52 /// Callers should fall back to a full scan.
53 #[error("regex prefix extraction unsupported: {0}")]
54 PrefixUnsupported(&'static str),
55}
56
57/// Internal regex AST used by the prefix extractor.
58///
59/// Variants are intentionally coarse: the prefix extractor only
60/// distinguishes nodes that contribute literal bytes, nodes that
61/// break a literal run (any opaque-character node), and the
62/// structural combinators (concat, alt, repeat).
63#[derive(Debug, Clone, PartialEq, Eq)]
64pub enum Ast {
65 /// Empty regex (matches the empty string). Produced by
66 /// `()`, `a{0}`, etc.
67 Empty,
68
69 /// A run of literal bytes that must appear contiguously in
70 /// any matching string. The byte vector is non-empty.
71 Literal(Vec<u8>),
72
73 /// A zero-width assertion: `^`, `$`, `\b`, `\B`, and the
74 /// multi-line variants. Carries no literal-byte
75 /// information and is treated by the extractor as a hard
76 /// break in any surrounding literal run, since adjacent
77 /// literals across an anchor are not guaranteed to be
78 /// adjacent in the matching text.
79 Anchor,
80
81 /// Any single character: `.`, a character class `[abc]`, or
82 /// `\w`/`\d`/`\s` expansions. The extractor treats this as
83 /// opaque and breaks any surrounding literal run.
84 AnyChar,
85
86 /// Concatenation of sub-patterns. Always has at least two
87 /// children when produced by [`parse`]; nullary or unary
88 /// concatenations collapse to [`Ast::Empty`] or the single
89 /// child respectively.
90 Concat(Vec<Ast>),
91
92 /// Alternation of sub-patterns. Always has at least two
93 /// children when produced by [`parse`].
94 Alt(Vec<Ast>),
95
96 /// Repetition operator with a minimum and optional maximum
97 /// count. Covers `*`, `+`, `?`, `{m}`, `{m,}`, `{m,n}`.
98 Repeat {
99 /// Sub-expression being repeated.
100 sub: Box<Ast>,
101 /// Lower bound on iteration count.
102 min: u32,
103 /// Upper bound, or [`None`] for unbounded repetition.
104 max: Option<u32>,
105 },
106}
107
108impl Ast {
109 /// Parse a pattern into the internal AST.
110 ///
111 /// # Errors
112 ///
113 /// * [`RegexError::Parse`] if the pattern is syntactically
114 /// invalid or uses a feature [`regex_syntax`] does not
115 /// accept (lookarounds, backreferences, ...).
116 /// * [`RegexError::PrefixUnsupported`] if the pattern
117 /// parses but contains a feature the extractor cannot
118 /// lower (named capture groups).
119 pub fn parse(pattern: &str) -> Result<Self, RegexError> {
120 let hir = regex_syntax::parse(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
121 Self::from_hir(&hir)
122 }
123
124 fn from_hir(hir: &Hir) -> Result<Self, RegexError> {
125 match hir.kind() {
126 HirKind::Empty => Ok(Ast::Empty),
127 HirKind::Literal(lit) => {
128 if lit.0.is_empty() {
129 // Defensive: HIR's smart constructors are
130 // supposed to elide empty literals into
131 // `Empty`, but we collapse here just in
132 // case a future regex-syntax version slips
133 // one through.
134 Ok(Ast::Empty)
135 } else {
136 Ok(Ast::Literal(lit.0.to_vec()))
137 }
138 }
139 HirKind::Class(_) => Ok(Ast::AnyChar),
140 HirKind::Look(_) => Ok(Ast::Anchor),
141 HirKind::Repetition(rep) => {
142 let sub = Self::from_hir(&rep.sub)?;
143 Ok(Ast::Repeat {
144 sub: Box::new(sub),
145 min: rep.min,
146 max: rep.max,
147 })
148 }
149 HirKind::Capture(cap) => {
150 if cap.name.is_some() {
151 return Err(RegexError::PrefixUnsupported("named capture group"));
152 }
153 Self::from_hir(&cap.sub)
154 }
155 HirKind::Concat(parts) => {
156 let mut v = Vec::with_capacity(parts.len());
157 for p in parts {
158 v.push(Self::from_hir(p)?);
159 }
160 Ok(Self::collapse_concat(v))
161 }
162 HirKind::Alternation(parts) => {
163 let mut v = Vec::with_capacity(parts.len());
164 for p in parts {
165 v.push(Self::from_hir(p)?);
166 }
167 if v.len() == 1 {
168 Ok(v.into_iter().next().expect("len == 1"))
169 } else {
170 Ok(Ast::Alt(v))
171 }
172 }
173 }
174 }
175
176 fn collapse_concat(parts: Vec<Ast>) -> Ast {
177 match parts.len() {
178 0 => Ast::Empty,
179 1 => parts.into_iter().next().expect("len == 1"),
180 _ => Ast::Concat(parts),
181 }
182 }
183}
184
185/// Convenience free-function form of [`Ast::parse`].
186///
187/// # Examples
188///
189/// ```
190/// use dyntext::regex_ast::{parse, Ast};
191///
192/// let ast = parse("ab+").expect("parses");
193/// // "ab+" is concat(literal "a", repeat(literal "b", min=1)).
194/// assert!(matches!(ast, Ast::Concat(_)));
195/// ```
196///
197/// # Errors
198///
199/// Returns [`RegexError`] under the same conditions as
200/// [`Ast::parse`].
201pub fn parse(pattern: &str) -> Result<Ast, RegexError> {
202 Ast::parse(pattern)
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208
209 #[test]
210 fn parse_simple_literal_yields_literal_bytes() {
211 let ast = parse("hello").expect("parses");
212 assert_eq!(ast, Ast::Literal(b"hello".to_vec()));
213 }
214
215 #[test]
216 fn parse_concat_of_literal_and_dot_yields_concat() {
217 let ast = parse("ab.").expect("parses");
218 match ast {
219 Ast::Concat(parts) => {
220 assert_eq!(parts.len(), 2);
221 assert_eq!(parts[0], Ast::Literal(b"ab".to_vec()));
222 assert_eq!(parts[1], Ast::AnyChar);
223 }
224 other => panic!("expected concat, got {other:?}"),
225 }
226 }
227
228 #[test]
229 fn parse_alternation_two_branches_yields_alt() {
230 let ast = parse("foo|bar").expect("parses");
231 match ast {
232 Ast::Alt(branches) => {
233 assert_eq!(branches.len(), 2);
234 }
235 other => panic!("expected alt, got {other:?}"),
236 }
237 }
238
239 #[test]
240 fn parse_unsupported_lookahead_returns_error() {
241 // Lookarounds are not representable in regex-syntax's
242 // HIR; the parser rejects them.
243 let err = parse("(?=foo)").expect_err("lookahead must error");
244 assert!(matches!(err, RegexError::Parse(_)));
245 }
246
247 #[test]
248 fn parse_named_capture_returns_prefix_unsupported() {
249 let err = parse("(?P<name>abc)").expect_err("named capture errors");
250 assert!(matches!(err, RegexError::PrefixUnsupported(_)));
251 }
252
253 #[test]
254 fn parse_anchors_show_up_as_anchor_nodes() {
255 let ast = parse("^a$").expect("parses");
256 match ast {
257 Ast::Concat(parts) => {
258 assert_eq!(parts.len(), 3);
259 assert_eq!(parts[0], Ast::Anchor);
260 assert_eq!(parts[1], Ast::Literal(b"a".to_vec()));
261 assert_eq!(parts[2], Ast::Anchor);
262 }
263 other => panic!("expected concat, got {other:?}"),
264 }
265 }
266
267 #[test]
268 fn parse_grouping_with_quantifier_yields_repeat() {
269 let ast = parse("(ab)+").expect("parses");
270 match ast {
271 Ast::Repeat { sub, min, max } => {
272 assert_eq!(*sub, Ast::Literal(b"ab".to_vec()));
273 assert_eq!(min, 1);
274 assert!(max.is_none());
275 }
276 other => panic!("expected repeat, got {other:?}"),
277 }
278 }
279}