1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
//! The intermediate representation of a RegEx //! in a tree based structure. use crate::Span; use bitflags::bitflags; bitflags! { pub struct Flags: u8 { /// With this flag the search looks for all matches, without this flag /// only the first match is returned const G = 0b00000001; /// Multiline mode const M = 0b00000010; /// Case-insensitive search const I = 0b00000100; /// "dotall" mode, that allows `.` to match newlines (`\n`) const S = 0b00001000; /// Enables full unicode support const U = 0b00010000; /// "Sticky" mode const Y = 0b00100000; } } /// The structure that represents a regular expression. /// /// It contains the actual RegEx node, and the flags for this expression. #[derive(Debug, Clone)] pub struct Regex { pub node: Node, pub flags: Flags, } /// The tree structure that is used to represent parsed /// RegEx patterns. #[derive(Debug, Clone, PartialEq, Eq)] pub enum Node { /// An empty regex node. Empty, /// An "either or". (e.g. `a|b`) Disjunction(Span, Vec<Node>), /// A single assertion. Assertion(Span, AssertionKind), /// A concatenation of regex nodes. (e.g. `ab`) Alternative(Span, Vec<Node>), /// A single character literal. Literal(Span, char), /// Matches a character class (e.g. `\d` or `\w`). /// /// The bool argument indicates if this perl class is negated. PerlClass(Span, ClassPerlKind, bool), /// A back reference to a previous group (`\1`, `\2`, ...). BackReference(Span, u32), /// A `.` that matches everything. Dot(Span), /// A class of multiple characters such as `[A-Z0-9]` CharacterClass(Span, CharacterClass), /// A grouped pattern Group(Span, Group), /// A quantifier which optionally matches or matches multiple times. /// `bool` indicates whether a lazy quantifier (`?`) is present after it. Quantifier(Span, Box<Node>, QuantifierKind, bool), /// A reference to a group using a name NamedBackReference(Span, String), } impl Node { pub fn span(&self) -> Option<Span> { Some( match self { Node::Empty => return None, Node::Disjunction(s, _) => s, Node::Assertion(s, _) => s, Node::Alternative(s, _) => s, Node::Literal(s, _) => s, Node::PerlClass(s, _, _) => s, Node::BackReference(s, _) => s, Node::Dot(s) => s, Node::CharacterClass(s, _) => s, Node::Group(s, _) => s, Node::Quantifier(s, _, _, _) => s, Node::NamedBackReference(s, _) => s, } .to_owned(), ) } } /// A grouped pattern which can later be referred to #[derive(Debug, Clone, PartialEq, Eq)] pub struct Group { /// Whether this group cannot be later referred to with `$0` for example pub noncapturing: bool, pub inner: Box<Node>, pub name: Option<String>, } #[derive(Debug, Clone, PartialEq, Eq)] pub enum QuantifierKind { /// `?` Optional, /// `*` Multiple, /// `+` AtLeastOne, /// `{number}` Number(u32), /// `{number,number}`. if the second option is None it is "between X and unlimited times" Between(u32, Option<u32>), } /// A class matching multiple characters or ranges of characters #[derive(Debug, Clone, PartialEq, Eq)] pub struct CharacterClass { pub negated: bool, pub members: Vec<CharacterClassMember>, } #[derive(Debug, Clone, PartialEq, Eq)] pub enum CharacterClassMember { Range(Node, Node), Single(Node), } #[derive(Debug, Clone, PartialEq, Eq)] pub enum AssertionKind { /// `^` StartOfLine, /// `$` EndOfLine, /// `\b` WordBoundary, /// `\B` NonWordBoundary, /// `x(?=y)` Lookahead(Box<Node>), /// `x(?!y)` NegativeLookahead(Box<Node>), /// `(?<=y)x` Lookbehind(Box<Node>), /// `(?<!y)x` NegativeLookbehind(Box<Node>), } #[derive(Debug, Clone, PartialEq, Eq)] pub enum ClassPerlKind { Digit, Word, Space, Unicode(Option<String>, String), }