Skip to main content

dyntext/
regex_ast.rs

1//! Internal regex AST used by the Phase 2 prefix extractor.
2//!
3//! The full `regex` crate parses and compiles a pattern down to
4//! a finite-state machine; for *prefix extraction* we only need
5//! the structural shape of the pattern, not its matcher. We
6//! reuse [`regex_syntax`]'s high-level intermediate
7//! representation for the heavy lifting (tokenisation,
8//! `\w`/`\d`/`\s` expansion, character class normalisation,
9//! repetition canonicalisation) and project it into a tiny AST
10//! that the prefix extractor in [`crate::prefix_extract`] walks.
11//!
12//! # Supported subset
13//!
14//! The AST and its [`parse`] entry point accept the subset of
15//! regular expressions that [`regex_syntax`] can lower to its
16//! HIR -- this excludes lookarounds, backreferences, and other
17//! features the [`regex`] crate does not implement. Inside that
18//! subset, named capture groups (`(?P<foo>...)`) are flagged
19//! as [`RegexError::PrefixUnsupported`]. All other shapes
20//! (literals, anchors, `.`, character classes, alternation,
21//! grouping, and `*`/`+`/`?`/`{m,n}` repetition) project into
22//! the AST below.
23//!
24//! Callers that hit [`RegexError::PrefixUnsupported`] can still
25//! evaluate the regex; they just have to fall back to a full
26//! scan + recheck instead of the trigram-driven candidate
27//! pruning.
28//!
29//! # Examples
30//!
31//! ```
32//! use dyntext::regex_ast::{parse, Ast};
33//!
34//! let ast = parse("foo|bar").expect("parses");
35//! assert!(matches!(ast, Ast::Alt(_)));
36//! ```
37
38use regex_syntax::hir::{Hir, HirKind};
39use thiserror::Error;
40
41/// Errors surfaced by the regex parser and prefix extractor.
42#[derive(Debug, Error)]
43pub enum RegexError {
44    /// The pattern is syntactically invalid or uses a feature
45    /// outside [`regex_syntax`]'s supported subset (lookahead,
46    /// backreferences, etc).
47    #[error("regex parse error: {0}")]
48    Parse(String),
49
50    /// The pattern parsed but uses a feature the prefix
51    /// extractor cannot lower into trigram constraints.
52    /// Callers should fall back to a full scan.
53    #[error("regex prefix extraction unsupported: {0}")]
54    PrefixUnsupported(&'static str),
55}
56
57/// Internal regex AST used by the prefix extractor.
58///
59/// Variants are intentionally coarse: the prefix extractor only
60/// distinguishes nodes that contribute literal bytes, nodes that
61/// break a literal run (any opaque-character node), and the
62/// structural combinators (concat, alt, repeat).
63#[derive(Debug, Clone, PartialEq, Eq)]
64pub enum Ast {
65    /// Empty regex (matches the empty string). Produced by
66    /// `()`, `a{0}`, etc.
67    Empty,
68
69    /// A run of literal bytes that must appear contiguously in
70    /// any matching string. The byte vector is non-empty.
71    Literal(Vec<u8>),
72
73    /// A zero-width assertion: `^`, `$`, `\b`, `\B`, and the
74    /// multi-line variants. Carries no literal-byte
75    /// information and is treated by the extractor as a hard
76    /// break in any surrounding literal run, since adjacent
77    /// literals across an anchor are not guaranteed to be
78    /// adjacent in the matching text.
79    Anchor,
80
81    /// Any single character: `.`, a character class `[abc]`, or
82    /// `\w`/`\d`/`\s` expansions. The extractor treats this as
83    /// opaque and breaks any surrounding literal run.
84    AnyChar,
85
86    /// Concatenation of sub-patterns. Always has at least two
87    /// children when produced by [`parse`]; nullary or unary
88    /// concatenations collapse to [`Ast::Empty`] or the single
89    /// child respectively.
90    Concat(Vec<Ast>),
91
92    /// Alternation of sub-patterns. Always has at least two
93    /// children when produced by [`parse`].
94    Alt(Vec<Ast>),
95
96    /// Repetition operator with a minimum and optional maximum
97    /// count. Covers `*`, `+`, `?`, `{m}`, `{m,}`, `{m,n}`.
98    Repeat {
99        /// Sub-expression being repeated.
100        sub: Box<Ast>,
101        /// Lower bound on iteration count.
102        min: u32,
103        /// Upper bound, or [`None`] for unbounded repetition.
104        max: Option<u32>,
105    },
106}
107
108impl Ast {
109    /// Parse a pattern into the internal AST.
110    ///
111    /// # Errors
112    ///
113    /// * [`RegexError::Parse`] if the pattern is syntactically
114    ///   invalid or uses a feature [`regex_syntax`] does not
115    ///   accept (lookarounds, backreferences, ...).
116    /// * [`RegexError::PrefixUnsupported`] if the pattern
117    ///   parses but contains a feature the extractor cannot
118    ///   lower (named capture groups).
119    pub fn parse(pattern: &str) -> Result<Self, RegexError> {
120        let hir = regex_syntax::parse(pattern).map_err(|e| RegexError::Parse(e.to_string()))?;
121        Self::from_hir(&hir)
122    }
123
124    fn from_hir(hir: &Hir) -> Result<Self, RegexError> {
125        match hir.kind() {
126            HirKind::Empty => Ok(Ast::Empty),
127            HirKind::Literal(lit) => {
128                if lit.0.is_empty() {
129                    // Defensive: HIR's smart constructors are
130                    // supposed to elide empty literals into
131                    // `Empty`, but we collapse here just in
132                    // case a future regex-syntax version slips
133                    // one through.
134                    Ok(Ast::Empty)
135                } else {
136                    Ok(Ast::Literal(lit.0.to_vec()))
137                }
138            }
139            HirKind::Class(_) => Ok(Ast::AnyChar),
140            HirKind::Look(_) => Ok(Ast::Anchor),
141            HirKind::Repetition(rep) => {
142                let sub = Self::from_hir(&rep.sub)?;
143                Ok(Ast::Repeat {
144                    sub: Box::new(sub),
145                    min: rep.min,
146                    max: rep.max,
147                })
148            }
149            HirKind::Capture(cap) => {
150                if cap.name.is_some() {
151                    return Err(RegexError::PrefixUnsupported("named capture group"));
152                }
153                Self::from_hir(&cap.sub)
154            }
155            HirKind::Concat(parts) => {
156                let mut v = Vec::with_capacity(parts.len());
157                for p in parts {
158                    v.push(Self::from_hir(p)?);
159                }
160                Ok(Self::collapse_concat(v))
161            }
162            HirKind::Alternation(parts) => {
163                let mut v = Vec::with_capacity(parts.len());
164                for p in parts {
165                    v.push(Self::from_hir(p)?);
166                }
167                if v.len() == 1 {
168                    Ok(v.into_iter().next().expect("len == 1"))
169                } else {
170                    Ok(Ast::Alt(v))
171                }
172            }
173        }
174    }
175
176    fn collapse_concat(parts: Vec<Ast>) -> Ast {
177        match parts.len() {
178            0 => Ast::Empty,
179            1 => parts.into_iter().next().expect("len == 1"),
180            _ => Ast::Concat(parts),
181        }
182    }
183}
184
185/// Convenience free-function form of [`Ast::parse`].
186///
187/// # Examples
188///
189/// ```
190/// use dyntext::regex_ast::{parse, Ast};
191///
192/// let ast = parse("ab+").expect("parses");
193/// // "ab+" is concat(literal "a", repeat(literal "b", min=1)).
194/// assert!(matches!(ast, Ast::Concat(_)));
195/// ```
196///
197/// # Errors
198///
199/// Returns [`RegexError`] under the same conditions as
200/// [`Ast::parse`].
201pub fn parse(pattern: &str) -> Result<Ast, RegexError> {
202    Ast::parse(pattern)
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208
209    #[test]
210    fn parse_simple_literal_yields_literal_bytes() {
211        let ast = parse("hello").expect("parses");
212        assert_eq!(ast, Ast::Literal(b"hello".to_vec()));
213    }
214
215    #[test]
216    fn parse_concat_of_literal_and_dot_yields_concat() {
217        let ast = parse("ab.").expect("parses");
218        match ast {
219            Ast::Concat(parts) => {
220                assert_eq!(parts.len(), 2);
221                assert_eq!(parts[0], Ast::Literal(b"ab".to_vec()));
222                assert_eq!(parts[1], Ast::AnyChar);
223            }
224            other => panic!("expected concat, got {other:?}"),
225        }
226    }
227
228    #[test]
229    fn parse_alternation_two_branches_yields_alt() {
230        let ast = parse("foo|bar").expect("parses");
231        match ast {
232            Ast::Alt(branches) => {
233                assert_eq!(branches.len(), 2);
234            }
235            other => panic!("expected alt, got {other:?}"),
236        }
237    }
238
239    #[test]
240    fn parse_unsupported_lookahead_returns_error() {
241        // Lookarounds are not representable in regex-syntax's
242        // HIR; the parser rejects them.
243        let err = parse("(?=foo)").expect_err("lookahead must error");
244        assert!(matches!(err, RegexError::Parse(_)));
245    }
246
247    #[test]
248    fn parse_named_capture_returns_prefix_unsupported() {
249        let err = parse("(?P<name>abc)").expect_err("named capture errors");
250        assert!(matches!(err, RegexError::PrefixUnsupported(_)));
251    }
252
253    #[test]
254    fn parse_anchors_show_up_as_anchor_nodes() {
255        let ast = parse("^a$").expect("parses");
256        match ast {
257            Ast::Concat(parts) => {
258                assert_eq!(parts.len(), 3);
259                assert_eq!(parts[0], Ast::Anchor);
260                assert_eq!(parts[1], Ast::Literal(b"a".to_vec()));
261                assert_eq!(parts[2], Ast::Anchor);
262            }
263            other => panic!("expected concat, got {other:?}"),
264        }
265    }
266
267    #[test]
268    fn parse_grouping_with_quantifier_yields_repeat() {
269        let ast = parse("(ab)+").expect("parses");
270        match ast {
271            Ast::Repeat { sub, min, max } => {
272                assert_eq!(*sub, Ast::Literal(b"ab".to_vec()));
273                assert_eq!(min, 1);
274                assert!(max.is_none());
275            }
276            other => panic!("expected repeat, got {other:?}"),
277        }
278    }
279}