1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
//! The intermediate representation of a regular expression (RegEx),
//! stored as a tree-based structure.

use crate::Span;
use bitflags::bitflags;

bitflags! {
    /// The set of flags that can be attached to a regular expression.
    ///
    /// Every flag occupies its own bit of the underlying `u8`, so any
    /// combination of flags can be stored in a single value.
    pub struct Flags: u8 {
        /// With this flag the search looks for all matches; without it,
        /// only the first match is returned.
        const G = 0b00000001;
        /// Multiline mode.
        const M = 0b00000010;
        /// Case-insensitive search.
        const I = 0b00000100;
        /// "dotall" mode, which allows `.` to match newlines (`\n`).
        const S = 0b00001000;
        /// Enables full unicode support.
        const U = 0b00010000;
        /// "Sticky" mode.
        const Y = 0b00100000;
    }
}

/// The structure that represents a regular expression.
///
/// It contains the actual RegEx node, and the flags for this expression.
#[derive(Debug, Clone)]
pub struct Regex {
    /// The node representing the pattern of this expression.
    pub node: Node,
    /// The flags that were set for this expression.
    pub flags: Flags,
}

/// The tree structure that is used to represent parsed
/// RegEx patterns.
///
/// Every variant except [`Node::Empty`] carries a [`Span`] as its first
/// field (presumably the location of the node in the pattern source;
/// confirm against the `Span` definition).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Node {
    /// An empty regex node. It is the only variant without a span.
    Empty,
    /// An "either or" (e.g. `a|b`), holding the list of alternatives.
    Disjunction(Span, Vec<Node>),
    /// A single assertion; see [`AssertionKind`] for the possible kinds.
    Assertion(Span, AssertionKind),
    /// A concatenation of regex nodes (e.g. `ab`).
    Alternative(Span, Vec<Node>),
    /// A single character literal.
    Literal(Span, char),
    /// Matches a character class (e.g. `\d` or `\w`).
    ///
    /// The bool argument indicates if this perl class is negated.
    PerlClass(Span, ClassPerlKind, bool),
    /// A back reference to a previous group (`\1`, `\2`, ...).
    ///
    /// The `u32` is presumably the number of the referenced group.
    BackReference(Span, u32),
    /// A `.` that matches everything.
    Dot(Span),
    /// A class of multiple characters such as `[A-Z0-9]`.
    CharacterClass(Span, CharacterClass),
    /// A grouped pattern.
    Group(Span, Group),
    /// A quantifier which optionally matches or matches multiple times.
    ///
    /// The boxed node is the node the quantifier wraps, and the
    /// `bool` indicates whether a lazy quantifier (`?`) is present after it.
    Quantifier(Span, Box<Node>, QuantifierKind, bool),
    /// A reference to a group using a name.
    NamedBackReference(Span, String),
}

impl Node {
    /// Returns a copy of the span stored in this node.
    ///
    /// [`Node::Empty`] is the only variant that does not hold a span, so it
    /// yields `None`; every other variant yields `Some` with its span.
    pub fn span(&self) -> Option<Span> {
        match self {
            // The empty node has no span to report.
            Node::Empty => None,
            // All remaining variants keep their span as the first field.
            Node::Disjunction(span, _)
            | Node::Assertion(span, _)
            | Node::Alternative(span, _)
            | Node::Literal(span, _)
            | Node::PerlClass(span, _, _)
            | Node::BackReference(span, _)
            | Node::Dot(span)
            | Node::CharacterClass(span, _)
            | Node::Group(span, _)
            | Node::Quantifier(span, _, _, _)
            | Node::NamedBackReference(span, _) => Some(span.clone()),
        }
    }
}

/// A grouped pattern which can later be referred to.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Group {
    /// Whether this group cannot be later referred to with `$0` for example.
    pub noncapturing: bool,
    /// The pattern contained inside the group.
    pub inner: Box<Node>,
    /// The name of the group, or `None` if the group has no name.
    pub name: Option<String>,
}

/// The different kinds of quantifiers that can be applied to a node.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QuantifierKind {
    /// `?`
    Optional,
    /// `*`
    Multiple,
    /// `+`
    AtLeastOne,
    /// `{number}`
    Number(u32),
    /// `{number,number}`. If the second value is `None`, it means
    /// "between X and unlimited times".
    Between(u32, Option<u32>),
}

/// A class matching multiple characters or ranges of characters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharacterClass {
    /// Whether this class is negated.
    pub negated: bool,
    /// The members (single entries or ranges) that make up this class.
    pub members: Vec<CharacterClassMember>,
}

/// A single member of a [`CharacterClass`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CharacterClassMember {
    /// A range described by two nodes
    /// (presumably start and end, as in the `A-Z` of `[A-Z0-9]`).
    Range(Node, Node),
    /// A single node that is not part of a range.
    Single(Node),
}

/// The different kinds of assertions that can appear in a pattern.
///
/// The lookaround variants carry the node of the pattern they assert on
/// (the `y` in the examples below).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AssertionKind {
    /// `^`
    StartOfLine,
    /// `$`
    EndOfLine,
    /// `\b`
    WordBoundary,
    /// `\B`
    NonWordBoundary,
    /// `x(?=y)`
    Lookahead(Box<Node>),
    /// `x(?!y)`
    NegativeLookahead(Box<Node>),
    /// `(?<=y)x`
    Lookbehind(Box<Node>),
    /// `(?<!y)x`
    NegativeLookbehind(Box<Node>),
}

/// The kind of character class matched by a `Node::PerlClass`
/// (e.g. `\d` or `\w`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ClassPerlKind {
    /// The digit class (presumably `\d`).
    Digit,
    /// The word class (presumably `\w`).
    Word,
    /// The whitespace class (presumably `\s`).
    Space,
    /// A unicode class, described by an optional string and a string.
    ///
    /// NOTE(review): this looks like a unicode property escape, with the
    /// optional property name first and the value second. Confirm against
    /// the parser.
    Unicode(Option<String>, String),
}