Skip to main content

regress/
ir.rs

1//! Intermediate representation for a regex
2
3use crate::api;
4use crate::types::{BracketContents, CaptureGroupID, CaptureGroupName};
5#[cfg(not(feature = "std"))]
6use alloc::{boxed::Box, string::ToString, vec::Vec};
7use core::fmt;
8
9#[derive(Debug, Copy, Clone)]
10pub enum AnchorType {
11    StartOfLine, // ^
12    EndOfLine,   // $
13}
14
15/// A Quantifier.
16#[derive(Debug, Copy, Clone)]
17pub struct Quantifier {
18    /// Minimum number of iterations of the loop, inclusive.
19    pub min: usize,
20
21    /// Maximum number of iterations of the loop, inclusive;
22    /// or None if unbounded.
23    pub max: Option<usize>,
24
25    /// Whether the loop is greedy.
26    pub greedy: bool,
27}
28
29/// The node types of our IR.
30#[derive(Debug)]
31pub enum Node {
32    /// Matches the empty string.
33    Empty,
34
35    /// Reaching this node terminates the match successfully.
36    Goal,
37
38    /// Match a literal character.
39    /// If icase is true, then `c` MUST be already folded.
40    Char { c: u32, icase: bool },
41
42    /// Match a literal sequence of bytes.
43    ByteSequence(Vec<u8>),
44
45    /// Match any of a set of *bytes*.
46    /// This may not exceed length MAX_BYTE_SET_LENGTH.
47    ByteSet(Vec<u8>),
48
49    /// Match any of a set of *chars*, case-insensitive.
50    /// This may not exceed length MAX_CHAR_SET_LENGTH.
51    CharSet(Vec<u32>),
52
53    /// Match the catenation of multiple nodes.
54    Cat(Vec<Node>),
55
56    /// Match an alternation like a|b.
57    Alt(Box<Node>, Box<Node>),
58
59    /// Match anything including newlines.
60    MatchAny,
61
62    /// Match anything except a newline.
63    MatchAnyExceptLineTerminator,
64
65    /// Match an anchor like ^ or $. The `multiline` flag controls whether the
66    /// anchor should treat line terminators as boundaries.
67    Anchor {
68        anchor_type: AnchorType,
69        multiline: bool,
70    },
71
72    /// Word boundary (\b or \B).
73    WordBoundary { invert: bool },
74
75    /// A capturing group.
76    /// If the name is set, it is guaranteed nonempty.
77    /// Note that multiple capture groups may share the same name in different alternations.
78    CaptureGroup {
79        id: CaptureGroupID,
80        contents: Box<Node>,
81        name: Option<CaptureGroupName>,
82    },
83
84    /// A backreference.
85    /// Indices are 1-based. That is, it matches JS syntax: \1 is the first capture group,
86    /// not everything.
87    /// A backreference that logically may match multiple capture groups (through shared names)
88    /// are emitted through alternations of backreferences.
89    BackRef(u32),
90
91    /// A bracket.
92    Bracket(BracketContents),
93
94    /// A lookaround assertions like (?:) or (?!).
95    LookaroundAssertion {
96        negate: bool,
97        backwards: bool,
98        start_group: CaptureGroupID,
99        end_group: CaptureGroupID,
100        contents: Box<Node>,
101    },
102
103    /// A loop like /.*/ or /x{3, 5}?/
104    Loop {
105        loopee: Box<Node>,
106        quant: Quantifier,
107        enclosed_groups: core::ops::Range<u16>,
108    },
109
110    /// A loop whose body matches exactly one character.
111    /// Enclosed capture groups are forbidden here.
112    Loop1CharBody {
113        loopee: Box<Node>,
114        quant: Quantifier,
115    },
116}
117
118pub type NodeList = Vec<Node>;
119
120impl Node {
121    /// Helper to return an "always fails" node.
122    pub fn make_always_fails() -> Node {
123        Node::CharSet(Vec::new())
124    }
125
126    /// Reverse the children of \p self if in a lookbehind.
127    /// Used as a parameter to walk_mut.
128    pub fn reverse_cats(&mut self, w: &mut Walk) {
129        match self {
130            Node::Cat(nodes) if w.in_lookbehind => nodes.reverse(),
131            Node::ByteSequence(..) => panic!("Should not be reversing literal bytes"),
132            _ => {}
133        }
134    }
135
136    /// \return whether this is an Empty node.
137    pub fn is_empty(&self) -> bool {
138        matches!(self, Node::Empty)
139    }
140
141    /// \return whether this is a Cat node.
142    pub fn is_cat(&self) -> bool {
143        matches!(self, Node::Cat(..))
144    }
145
146    /// \return whether this node is known to match exactly one char.
147    /// This is best-effort: a false return is always safe.
148    pub fn matches_exactly_one_char(&self) -> bool {
149        match self {
150            Node::Char { .. } => true,
151            Node::CharSet(contents) => !contents.is_empty(),
152            Node::Bracket(contents) => !contents.is_empty(),
153            Node::MatchAny => true,
154            Node::MatchAnyExceptLineTerminator => true,
155            _ => false,
156        }
157    }
158
159    /// \return true if this node will always fail to match.
160    /// Note this is different than matching the empty string.
161    /// For example, an empty bracket /[]/ tries to match one char
162    /// from an empty set.
163    pub fn match_always_fails(&self) -> bool {
164        match self {
165            Node::ByteSet(bytes) => bytes.is_empty(),
166            Node::CharSet(contents) => contents.is_empty(),
167            Node::Bracket(contents) => contents.is_empty(),
168            _ => false,
169        }
170    }
171
172    /// Duplicate a node, perhaps assigning new loop IDs. Note we must never
173    /// copy a capture group.
174    ///
175    /// Returns None if the depth is too high.
176    pub fn try_duplicate(&self, mut depth: usize) -> Option<Node> {
177        if depth > 100 {
178            return None;
179        }
180        depth += 1;
181        Some(match self {
182            Node::Empty => Node::Empty,
183            Node::Goal => Node::Goal,
184            &Node::Char { c, icase } => Node::Char { c, icase },
185            Node::ByteSequence(bytes) => Node::ByteSequence(bytes.clone()),
186            Node::ByteSet(bytes) => Node::ByteSet(bytes.clone()),
187            Node::CharSet(chars) => Node::CharSet(chars.clone()),
188            Node::Cat(nodes) => {
189                let mut new_nodes = Vec::with_capacity(nodes.len());
190                for n in nodes {
191                    new_nodes.push(n.try_duplicate(depth)?);
192                }
193                Node::Cat(new_nodes)
194            }
195            Node::Alt(left, right) => Node::Alt(
196                Box::new(left.try_duplicate(depth)?),
197                Box::new(right.try_duplicate(depth)?),
198            ),
199            Node::MatchAny => Node::MatchAny,
200            Node::MatchAnyExceptLineTerminator => Node::MatchAnyExceptLineTerminator,
201            &Node::Anchor {
202                anchor_type,
203                multiline,
204            } => Node::Anchor {
205                anchor_type,
206                multiline,
207            },
208
209            Node::Loop {
210                loopee,
211                quant,
212                enclosed_groups,
213            } => {
214                assert!(
215                    enclosed_groups.start >= enclosed_groups.end,
216                    "Cannot duplicate a loop with enclosed groups"
217                );
218                Node::Loop {
219                    loopee: Box::new(loopee.as_ref().try_duplicate(depth)?),
220                    quant: *quant,
221                    enclosed_groups: enclosed_groups.clone(),
222                }
223            }
224
225            Node::Loop1CharBody { loopee, quant } => Node::Loop1CharBody {
226                loopee: Box::new(loopee.as_ref().try_duplicate(depth)?),
227                quant: *quant,
228            },
229
230            Node::CaptureGroup { .. } => {
231                panic!("Refusing to duplicate a capture group");
232            }
233            &Node::WordBoundary { invert } => Node::WordBoundary { invert },
234            &Node::BackRef(idx) => Node::BackRef(idx),
235            Node::Bracket(bc) => Node::Bracket(bc.clone()),
236            // Do not reverse into lookarounds, they already have the right sense.
237            Node::LookaroundAssertion {
238                negate,
239                backwards,
240                start_group,
241                end_group,
242                contents,
243            } => {
244                assert!(
245                    start_group >= end_group,
246                    "Cannot duplicate an assertion with enclosed groups"
247                );
248                Node::LookaroundAssertion {
249                    negate: *negate,
250                    backwards: *backwards,
251                    start_group: *start_group,
252                    end_group: *end_group,
253                    contents: Box::new((*contents).try_duplicate(depth)?),
254                }
255            }
256        })
257    }
258}
259
260/// A helper type for walking.
261#[derive(Debug, Clone)]
262pub struct Walk {
263    // It set to true, skip the children of this node.
264    pub skip_children: bool,
265
266    // The current depth of the walk.
267    pub depth: usize,
268
269    // If true, we are in a lookbehind (and so the cursor will move backwards).
270    pub in_lookbehind: bool,
271
272    // If the regex is in unicode mode.
273    pub unicode: bool,
274}
275
276impl Walk {
277    fn new(unicode: bool) -> Self {
278        Self {
279            skip_children: false,
280            depth: 0,
281            in_lookbehind: false,
282            unicode,
283        }
284    }
285}
286
287#[derive(Debug)]
288struct Walker<'a, F>
289where
290    F: FnMut(&Node, &mut Walk),
291{
292    func: &'a mut F,
293    postorder: bool,
294    walk: Walk,
295}
296
297impl<F> Walker<'_, F>
298where
299    F: FnMut(&Node, &mut Walk),
300{
301    fn process_children(&mut self, n: &Node) {
302        match n {
303            Node::Empty
304            | Node::Goal
305            | Node::Char { .. }
306            | Node::ByteSequence(..)
307            | Node::ByteSet(..)
308            | Node::CharSet(..)
309            | Node::WordBoundary { .. }
310            | Node::BackRef { .. }
311            | Node::Bracket { .. }
312            | Node::MatchAny
313            | Node::MatchAnyExceptLineTerminator
314            | Node::Anchor { .. } => {}
315            Node::Cat(nodes) => {
316                for node in nodes {
317                    self.process(node);
318                }
319            }
320            Node::Alt(left, right) => {
321                self.process(left.as_ref());
322                self.process(right.as_ref());
323            }
324
325            Node::Loop { loopee, .. } | Node::Loop1CharBody { loopee, .. } => self.process(loopee),
326            Node::CaptureGroup { contents, .. } => self.process(contents.as_ref()),
327
328            Node::LookaroundAssertion {
329                backwards,
330                contents,
331                ..
332            } => {
333                let saved = self.walk.in_lookbehind;
334                self.walk.in_lookbehind = *backwards;
335                self.process(contents.as_ref());
336                self.walk.in_lookbehind = saved;
337            }
338        }
339    }
340
341    fn process(&mut self, n: &Node) {
342        self.walk.skip_children = false;
343        if !self.postorder {
344            (self.func)(n, &mut self.walk);
345        }
346        if !self.walk.skip_children {
347            self.walk.depth += 1;
348            self.process_children(n);
349            self.walk.depth -= 1;
350        }
351        if self.postorder {
352            (self.func)(n, &mut self.walk)
353        }
354    }
355}
356
357#[derive(Debug)]
358struct MutWalker<'a, F>
359where
360    F: FnMut(&mut Node, &mut Walk),
361{
362    func: &'a mut F,
363    postorder: bool,
364    walk: Walk,
365}
366
367impl<F> MutWalker<'_, F>
368where
369    F: FnMut(&mut Node, &mut Walk),
370{
371    fn process_children(&mut self, n: &mut Node) {
372        match n {
373            Node::Empty
374            | Node::Goal
375            | Node::Char { .. }
376            | Node::ByteSequence(..)
377            | Node::ByteSet(..)
378            | Node::CharSet(..)
379            | Node::MatchAny
380            | Node::MatchAnyExceptLineTerminator
381            | Node::Anchor { .. }
382            | Node::WordBoundary { .. }
383            | Node::BackRef { .. }
384            | Node::Bracket { .. } => {}
385            Node::Cat(nodes) => {
386                nodes.iter_mut().for_each(|node| self.process(node));
387            }
388            Node::Alt(left, right) => {
389                self.process(left.as_mut());
390                self.process(right.as_mut());
391            }
392
393            Node::Loop { loopee, .. } | Node::Loop1CharBody { loopee, .. } => {
394                self.process(loopee);
395            }
396            Node::CaptureGroup { contents, .. } => self.process(contents.as_mut()),
397
398            Node::LookaroundAssertion {
399                backwards,
400                contents,
401                ..
402            } => {
403                let saved = self.walk.in_lookbehind;
404                self.walk.in_lookbehind = *backwards;
405                self.process(contents.as_mut());
406                self.walk.in_lookbehind = saved;
407            }
408        }
409    }
410
411    fn process(&mut self, n: &mut Node) {
412        self.walk.skip_children = false;
413        if !self.postorder {
414            (self.func)(n, &mut self.walk);
415        }
416        if !self.walk.skip_children {
417            self.walk.depth += 1;
418            self.process_children(n);
419            self.walk.depth -= 1;
420        }
421        if self.postorder {
422            (self.func)(n, &mut self.walk);
423        }
424    }
425}
426
427/// Call a function on every Node.
428/// If \p postorder is true, then process children before the node;
429/// otherwise process children after the node.
430pub fn walk<F>(postorder: bool, unicode: bool, n: &Node, func: &mut F)
431where
432    F: FnMut(&Node, &mut Walk),
433{
434    let mut walker = Walker {
435        func,
436        postorder,
437        walk: Walk::new(unicode),
438    };
439    walker.process(n);
440}
441
442/// Call a function on every Node, which may mutate the node.
443/// If \p postorder is true, then process children before the node;
444/// otherwise process children after the node.
445/// If postorder is false, the function should return true to process children,
446/// false to avoid descending into children. If postorder is true, the return
447/// value is ignored.
448pub fn walk_mut<F>(postorder: bool, unicode: bool, n: &mut Node, func: &mut F)
449where
450    F: FnMut(&mut Node, &mut Walk),
451{
452    let mut walker = MutWalker {
453        func,
454        postorder,
455        walk: Walk::new(unicode),
456    };
457    walker.process(n);
458}
459
460/// A regex in IR form.
461pub struct Regex {
462    pub node: Node,
463    pub flags: api::Flags,
464}
465
466impl Regex {}
467
468fn display_node(node: &Node, depth: usize, f: &mut fmt::Formatter) -> fmt::Result {
469    for _ in 0..depth {
470        write!(f, "..")?;
471    }
472    match node {
473        Node::Empty => {
474            writeln!(f, "Empty")?;
475        }
476        Node::Goal => {
477            writeln!(f, "Goal")?;
478        }
479        Node::Char { c, icase: _ } => {
480            writeln!(f, "'{}'", &c.to_string())?;
481        }
482        Node::ByteSequence(bytes) => {
483            write!(f, "ByteSeq{} 0x", bytes.len())?;
484            for &b in bytes {
485                write!(f, "{:x}", b)?;
486            }
487            writeln!(f)?;
488        }
489        Node::ByteSet(bytes) => {
490            let len = bytes.len();
491            write!(f, "ByteSet{}", len)?;
492            for &b in bytes {
493                write!(f, " 0x{:x}", b)?;
494            }
495            writeln!(f)?;
496        }
497        Node::CharSet(chars) => {
498            write!(f, "CharSet ")?;
499            let mut first = true;
500            for &c in chars {
501                if !first {
502                    write!(f, ", ")?;
503                }
504                first = false;
505                write!(f, "0x{:x}", { c })?;
506            }
507            writeln!(f)?;
508        }
509        Node::Cat(..) => {
510            writeln!(f, "Cat")?;
511        }
512        Node::Alt(..) => {
513            writeln!(f, "Alt")?;
514        }
515        Node::MatchAny => {
516            writeln!(f, "MatchAny")?;
517        }
518        Node::MatchAnyExceptLineTerminator => {
519            writeln!(f, "MatchAnyExceptLineTerminator")?;
520        }
521        Node::Anchor {
522            anchor_type,
523            multiline,
524        } => {
525            writeln!(f, "Anchor {:?} multiline={}", anchor_type, multiline)?;
526        }
527        Node::Loop {
528            quant,
529            enclosed_groups,
530            ..
531        } => {
532            writeln!(f, "Loop (groups {:?}) {:?}", enclosed_groups, quant)?;
533        }
534        Node::Loop1CharBody { quant, .. } => {
535            writeln!(f, "Loop1Char {:?}", quant)?;
536        }
537        Node::CaptureGroup { id, name, .. } => {
538            if let Some(name) = name {
539                writeln!(f, "CaptureGroup {:?} {:?}", id, name)?;
540            } else {
541                writeln!(f, "CaptureGroup {:?}", id)?;
542            }
543        }
544        &Node::WordBoundary { invert } => {
545            let kind = if invert { "\\B" } else { "\\b" };
546            writeln!(f, "WordBoundary {:?} ", kind)?;
547        }
548        &Node::BackRef(group) => {
549            writeln!(f, "BackRef {:?} ", group)?;
550        }
551        Node::Bracket(contents) => {
552            writeln!(f, "Bracket {:?}", contents)?;
553        }
554
555        &Node::LookaroundAssertion {
556            negate,
557            backwards,
558            start_group,
559            end_group,
560            ..
561        } => {
562            let sense = if negate { "negative" } else { "positive" };
563            let direction = if backwards { "backwards" } else { "forwards" };
564            writeln!(
565                f,
566                "LookaroundAssertion {} {} {:?} {:?}",
567                sense, direction, start_group, end_group
568            )?;
569        }
570    }
571    Ok(())
572}
573
574impl fmt::Display for Regex {
575    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
576        //display_node(&self.node, 0, f)
577        let mut result = Ok(());
578        walk(
579            false,
580            self.flags.unicode,
581            &self.node,
582            &mut |node: &Node, walk: &mut Walk| {
583                if result.is_ok() {
584                    result = display_node(node, walk.depth, f)
585                }
586            },
587        );
588        result
589    }
590}