// oxc_regular_expression/ast.rs

use bitflags::bitflags;

use oxc_allocator::{Box, CloneIn, GetAddress, Vec};
use oxc_ast_macros::ast;
use oxc_span::{Atom, ContentEq, Span};

7/// The root of the `PatternParser` result.
8#[ast]
9#[derive(Debug)]
10#[generate_derive(CloneIn, ContentEq)]
11pub struct Pattern<'a> {
12    pub span: Span,
13    pub body: Disjunction<'a>,
14}
15
16/// Pile of [`Alternative`]s separated by `|`.
17#[ast]
18#[derive(Debug)]
19#[generate_derive(CloneIn, ContentEq)]
20pub struct Disjunction<'a> {
21    pub span: Span,
22    pub body: Vec<'a, Alternative<'a>>,
23}
24
25/// Single unit of `|` separated alternatives.
26#[ast]
27#[derive(Debug)]
28#[generate_derive(CloneIn, ContentEq)]
29pub struct Alternative<'a> {
30    pub span: Span,
31    pub body: Vec<'a, Term<'a>>,
32}
33
34/// Single unit of [`Alternative`], containing various kinds.
35#[ast]
36#[derive(Debug)]
37#[generate_derive(CloneIn, ContentEq)]
38pub enum Term<'a> {
39    // Assertion, QuantifiableAssertion
40    BoundaryAssertion(Box<'a, BoundaryAssertion>) = 0,
41    LookAroundAssertion(Box<'a, LookAroundAssertion<'a>>) = 1,
42    // Quantifier
43    Quantifier(Box<'a, Quantifier<'a>>) = 2,
44    // Atom, ExtendedAtom
45    Character(Box<'a, Character>) = 3,
46    Dot(Dot) = 4,
47    CharacterClassEscape(Box<'a, CharacterClassEscape>) = 5,
48    UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 6,
49    CharacterClass(Box<'a, CharacterClass<'a>>) = 7,
50    CapturingGroup(Box<'a, CapturingGroup<'a>>) = 8,
51    IgnoreGroup(Box<'a, IgnoreGroup<'a>>) = 9,
52    IndexedReference(Box<'a, IndexedReference>) = 10,
53    NamedReference(Box<'a, NamedReference<'a>>) = 11,
54}
55
56/// Simple form of assertion.
57/// e.g. `^`, `$`, `\b`, `\B`
58#[ast]
59#[derive(Debug)]
60#[generate_derive(CloneIn, ContentEq)]
61pub struct BoundaryAssertion {
62    pub span: Span,
63    pub kind: BoundaryAssertionKind,
64}
65
66#[ast]
67#[derive(Debug, Clone, Copy, PartialEq, Eq)]
68#[generate_derive(CloneIn, ContentEq)]
69pub enum BoundaryAssertionKind {
70    Start = 0,
71    End = 1,
72    Boundary = 2,
73    NegativeBoundary = 3,
74}
75
76/// Lookaround assertion.
77/// e.g. `(?=...)`, `(?!...)`, `(?<=...)`, `(?<!...)`
78#[ast]
79#[derive(Debug)]
80#[generate_derive(CloneIn, ContentEq)]
81pub struct LookAroundAssertion<'a> {
82    pub span: Span,
83    pub kind: LookAroundAssertionKind,
84    pub body: Disjunction<'a>,
85}
86
87#[ast]
88#[derive(Debug, Clone, Copy, PartialEq, Eq)]
89#[generate_derive(CloneIn, ContentEq)]
90pub enum LookAroundAssertionKind {
91    Lookahead = 0,
92    NegativeLookahead = 1,
93    Lookbehind = 2,
94    NegativeLookbehind = 3,
95}
96
97/// Quantifier holding a [`Term`] and its repetition count.
98/// e.g. `a*`, `b+`, `c?`, `d{3}`, `e{4,}`, `f{5,6}`
99#[ast]
100#[derive(Debug)]
101#[generate_derive(CloneIn, ContentEq)]
102pub struct Quantifier<'a> {
103    pub span: Span,
104    pub min: u64,
105    /// `None` means no upper bound.
106    pub max: Option<u64>,
107    pub greedy: bool,
108    pub body: Term<'a>,
109}
110
111/// Single character.
112#[ast]
113#[derive(Debug, Clone, Copy)]
114#[generate_derive(CloneIn, ContentEq)]
115pub struct Character {
116    /// This will be invalid position when `UnicodeMode` is disabled and `value` is a surrogate pair.
117    pub span: Span,
118    pub kind: CharacterKind,
119    /// Unicode code point or UTF-16 code unit.
120    pub value: u32,
121}
122
123#[ast]
124#[derive(Debug, Clone, Copy, PartialEq, Eq)]
125#[generate_derive(CloneIn, ContentEq)]
126pub enum CharacterKind {
127    ControlLetter = 0,
128    HexadecimalEscape = 1,
129    Identifier = 2,
130    Null = 3,
131    // To distinguish leading 0 cases like `\00` and `\000`
132    Octal1 = 4,
133    Octal2 = 5,
134    Octal3 = 6,
135    SingleEscape = 7,
136    Symbol = 8,
137    UnicodeEscape = 9,
138}
139
140/// Character class.
141/// e.g. `\d`, `\D`, `\s`, `\S`, `\w`, `\W`
142#[ast]
143#[derive(Debug)]
144#[generate_derive(CloneIn, ContentEq)]
145pub struct CharacterClassEscape {
146    pub span: Span,
147    pub kind: CharacterClassEscapeKind,
148}
149
150#[ast]
151#[derive(Debug, Clone, Copy, PartialEq, Eq)]
152#[generate_derive(CloneIn, ContentEq)]
153pub enum CharacterClassEscapeKind {
154    D = 0,
155    NegativeD = 1,
156    S = 2,
157    NegativeS = 3,
158    W = 4,
159    NegativeW = 5,
160}
161
162/// Unicode property.
163/// e.g. `\p{ASCII}`, `\P{ASCII}`, `\p{sc=Hiragana}`, `\P{sc=Hiragana}`
164#[ast]
165#[derive(Debug)]
166#[generate_derive(CloneIn, ContentEq)]
167pub struct UnicodePropertyEscape<'a> {
168    pub span: Span,
169    pub negative: bool,
170    /// `true` if `UnicodeSetsMode` and `name` matches unicode property of strings.
171    pub strings: bool,
172    pub name: Atom<'a>,
173    pub value: Option<Atom<'a>>,
174}
175
176/// The `.`.
177#[ast]
178#[derive(Debug, Clone, Copy)]
179#[generate_derive(CloneIn, ContentEq)]
180pub struct Dot {
181    pub span: Span,
182}
183
184/// Character class wrapped by `[]`.
185/// e.g. `[a-z]`, `[^A-Z]`, `[abc]`, `[a&&b&&c]`, `[[a-z]--x--y]`
186#[ast]
187#[derive(Debug)]
188#[generate_derive(CloneIn, ContentEq)]
189pub struct CharacterClass<'a> {
190    pub span: Span,
191    pub negative: bool,
192    /// `true` if:
193    /// - `body` contains [`UnicodePropertyEscape`], nested [`CharacterClass`] or [`ClassStringDisjunction`] which `strings` is `true`
194    /// - and matches each logic depends on `kind`
195    pub strings: bool,
196    pub kind: CharacterClassContentsKind,
197    pub body: Vec<'a, CharacterClassContents<'a>>,
198}
199
200#[ast]
201#[derive(Debug, Clone, Copy, PartialEq, Eq)]
202#[generate_derive(CloneIn, ContentEq)]
203pub enum CharacterClassContentsKind {
204    Union = 0,
205    /// `UnicodeSetsMode` only.
206    Intersection = 1,
207    /// `UnicodeSetsMode` only.
208    Subtraction = 2,
209}
210
211#[ast]
212#[derive(Debug)]
213#[generate_derive(CloneIn, ContentEq, GetAddress)]
214pub enum CharacterClassContents<'a> {
215    CharacterClassRange(Box<'a, CharacterClassRange>) = 0,
216    CharacterClassEscape(Box<'a, CharacterClassEscape>) = 1,
217    UnicodePropertyEscape(Box<'a, UnicodePropertyEscape<'a>>) = 2,
218    Character(Box<'a, Character>) = 3,
219    /// `UnicodeSetsMode` only
220    NestedCharacterClass(Box<'a, CharacterClass<'a>>) = 4,
221    /// `UnicodeSetsMode` only
222    ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>) = 5,
223}
224
225/// `-` separated range of characters.
226/// e.g. `a-z`, `A-Z`, `0-9`
227#[ast]
228#[derive(Debug)]
229#[generate_derive(CloneIn, ContentEq)]
230pub struct CharacterClassRange {
231    pub span: Span,
232    pub min: Character,
233    pub max: Character,
234}
235
236/// `|` separated string of characters wrapped by `\q{}`.
237#[ast]
238#[derive(Debug)]
239#[generate_derive(CloneIn, ContentEq)]
240pub struct ClassStringDisjunction<'a> {
241    pub span: Span,
242    /// `true` if body is empty or contains [`ClassString`] which `strings` is `true`.
243    pub strings: bool,
244    pub body: Vec<'a, ClassString<'a>>,
245}
246
247/// Single unit of [`ClassStringDisjunction`].
248#[ast]
249#[derive(Debug)]
250#[generate_derive(CloneIn, ContentEq)]
251pub struct ClassString<'a> {
252    pub span: Span,
253    /// `true` if body is empty or contain 2 more characters.
254    pub strings: bool,
255    pub body: Vec<'a, Character>,
256}
257
258/// Named or unnamed capturing group.
259/// e.g. `(...)`, `(?<name>...)`
260#[ast]
261#[derive(Debug)]
262#[generate_derive(CloneIn, ContentEq)]
263pub struct CapturingGroup<'a> {
264    pub span: Span,
265    /// Group name to be referenced by [`NamedReference`].
266    pub name: Option<Atom<'a>>,
267    pub body: Disjunction<'a>,
268}
269
270/// Pseudo-group for ignoring.
271/// e.g. `(?:...)`
272#[ast]
273#[derive(Debug)]
274#[generate_derive(CloneIn, ContentEq)]
275pub struct IgnoreGroup<'a> {
276    pub span: Span,
277    pub modifiers: Option<Modifiers>,
278    pub body: Disjunction<'a>,
279}
280
281/// Modifiers in [`IgnoreGroup`].
282/// e.g. `i` in `(?i:...)`, `-s` in `(?-s:...)`
283#[ast]
284#[derive(Debug)]
285#[generate_derive(CloneIn, ContentEq)]
286pub struct Modifiers {
287    pub span: Span,
288    pub enabling: Modifier,
289    pub disabling: Modifier,
290}
291
292bitflags! {
293    /// Each part of modifier in [`Modifiers`].
294    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
295    pub struct Modifier: u8 {
296        /// Ignore case flag
297        const I = 1 << 0;
298        /// Multiline flag
299        const M = 1 << 1;
300        /// DotAll flag
301        const S = 1 << 2;
302    }
303}
304/// Dummy type to communicate the content of `Modifier` to `oxc_ast_tools`.
305#[ast(foreign = Modifier)]
306#[expect(dead_code)]
307struct ModifierAlias(u8);
308
309/// Backreference by index.
310/// e.g. `\1`, `\2`, `\3`
311#[ast]
312#[derive(Debug)]
313#[generate_derive(CloneIn, ContentEq)]
314pub struct IndexedReference {
315    pub span: Span,
316    pub index: u32,
317}
318
319/// Backreference by name.
320/// e.g. `\k<name>`
321#[ast]
322#[derive(Debug)]
323#[generate_derive(CloneIn, ContentEq)]
324pub struct NamedReference<'a> {
325    pub span: Span,
326    pub name: Atom<'a>,
327}
328
// See `oxc_ast/src/lib.rs` for the details
//
// Pins the size of the two boxed-variant enums on 64-bit targets so that
// accidental growth is caught by the test suite.
#[cfg(target_pointer_width = "64")]
#[test]
fn size_asserts() {
    use std::mem::size_of;

    // `assert_eq!` prints the actual size on failure, unlike `assert!(a == b)`.
    assert_eq!(size_of::<Term>(), 16);
    assert_eq!(size_of::<CharacterClassContents>(), 16);
}