posix_regex/
compile.rs

1//! The regex "compiler", which parses the regex itself.
2//! Produces a matcher ready to match input.
3
4#[cfg(feature = "no_std")]
5use std::prelude::*;
6
7use std::borrow::Cow;
8use std::collections::HashMap;
9use std::fmt;
10use {ctype, PosixRegex};
11use tree::*;
12
13/// Repetition bounds, for example + is (1, None), and ? is (0, Some(1))
14#[derive(Clone, Copy, PartialEq, Eq)]
15pub struct Range(pub u32, pub Option<u32>);
16impl fmt::Debug for Range {
17    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
18        match self {
19            Range(start, None) => write!(f, "{}..", start),
20            Range(start, Some(end)) => write!(f, "{}..{}", start, end),
21        }
22    }
23}
24
25/// An item inside square brackets, like `[abc]` or `[[:digit:]]`
26#[derive(Clone, PartialEq, Eq)]
27pub enum Collation {
28    Char(u8),
29    Class(fn(u8) -> bool)
30}
31impl Collation {
32    /// Compare this collation to a character
33    pub fn matches(&self, other: u8, insensitive: bool) -> bool {
34        match *self {
35            Collation::Char(me) if insensitive => if ctype::is_alpha(me) && ctype::is_alpha(other) {
36                me | 32 == other | 32
37            } else {
38                me == other
39            },
40            Collation::Char(me) => me == other,
41            Collation::Class(f) => f(other)
42        }
43    }
44}
45impl fmt::Debug for Collation {
46    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
47        match *self {
48            Collation::Char(c) => write!(f, "{:?}", c as char),
49            Collation::Class(c) => write!(f, "{:p}", c),
50        }
51    }
52}
53
54/// A single "compiled" token, such as a `.` or a character literal
55#[derive(Clone, PartialEq, Eq)]
56pub enum Token {
57    /// Internal token used to find matches that might be anywhere in the text
58    InternalStart,
59
60    Alternative,
61    Any,
62    BackRef(u32),
63    Char(u8),
64    End,
65    Group(usize),
66    OneOf {
67        invert: bool,
68        list: Vec<Collation>
69    },
70    Root,
71    Start,
72    WordEnd,
73    WordStart
74}
75impl fmt::Debug for Token {
76    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
77        match *self {
78            Token::InternalStart => write!(f, "<START>"),
79
80            Token::Alternative => write!(f, "Alternative"),
81            Token::Any => write!(f, "."),
82            Token::BackRef(id) => write!(f, "\\{}", id),
83            Token::Char(c) => write!(f, "{:?}", c as char),
84            Token::End => write!(f, "$"),
85            Token::Group(id) => write!(f, "Group({})", id),
86            Token::OneOf { invert, ref list } => write!(f, "{{invert: {}, {:?}}}", invert, list),
87            Token::Root => write!(f, "Root"),
88            Token::Start => write!(f, "^"),
89            Token::WordEnd => write!(f, ">"),
90            Token::WordStart => write!(f, "<")
91        }
92    }
93}
94/// An error that occurred while compiling the regex
95#[derive(Clone, Debug, PartialEq, Eq)]
96pub enum Error {
97    EOF,
98    EmptyRepetition,
99    Expected(u8, Option<u8>),
100    IllegalRange,
101    IntegerOverflow,
102    InvalidBackRef(u32),
103    LeadingRepetition,
104    UnclosedRepetition,
105    UnexpectedToken(u8),
106    UnknownClass(Vec<u8>),
107    UnknownCollation
108}
109
110/// A regex builder struct
111pub struct PosixRegexBuilder<'a> {
112    input: &'a [u8],
113    classes: HashMap<&'a [u8], fn(u8) -> bool>,
114    group_id: usize,
115    builder: TreeBuilder
116}
117impl<'a> PosixRegexBuilder<'a> {
118    /// Create a new instance that is ready to parse the regex `input`
119    pub fn new(input: &'a [u8]) -> Self {
120        Self {
121            input,
122            classes: HashMap::new(),
123            group_id: 1,
124            builder: TreeBuilder::default()
125        }
126    }
127    /// Add a custom collation class, for use within square brackets (such as `[[:digit:]]`)
128    pub fn with_class(mut self, name: &'a [u8], callback: fn(u8) -> bool) -> Self {
129        self.classes.insert(name, callback);
130        self
131    }
132    /// Add all the default collation classes, like `[[:digit:]]` and `[[:alnum:]]`
133    pub fn with_default_classes(mut self) -> Self {
134        #[cfg(not(feature = "no_std"))]
135        self.classes.reserve(12);
136
137        self.classes.insert(b"alnum", ctype::is_alnum);
138        self.classes.insert(b"alpha", ctype::is_alpha);
139        self.classes.insert(b"blank", ctype::is_blank);
140        self.classes.insert(b"cntrl", ctype::is_cntrl);
141        self.classes.insert(b"digit", ctype::is_digit);
142        self.classes.insert(b"graph", ctype::is_graph);
143        self.classes.insert(b"lower", ctype::is_lower);
144        self.classes.insert(b"print", ctype::is_print);
145        self.classes.insert(b"punct", ctype::is_punct);
146        self.classes.insert(b"space", ctype::is_space);
147        self.classes.insert(b"upper", ctype::is_upper);
148        self.classes.insert(b"xdigit", ctype::is_xdigit);
149
150        self
151    }
152    /// "Compile" this regex to a struct ready to match input
153    pub fn compile(self) -> Result<PosixRegex<'static>, Error> {
154        let tree = self.compile_tokens()?;
155        Ok(PosixRegex::new(Cow::Owned(tree)))
156    }
157    pub fn compile_tokens(mut self) -> Result<Tree, Error> {
158        self.builder.start_internal(Token::Root, Range(1, Some(1)));
159        self.parse()?;
160        self.builder.finish_internal();
161        Ok(self.builder.finish())
162    }
163
164    fn consume(&mut self, amount: usize) {
165        self.input = &self.input[amount..];
166    }
167    fn take_int(&mut self) -> Result<Option<u32>, Error> {
168        let mut out: Option<u32> = None;
169        while let Some(&c @ b'0'..=b'9') = self.input.first() {
170            self.consume(1);
171            out = Some(out.unwrap_or(0)
172                .checked_mul(10)
173                .and_then(|out| out.checked_add((c - b'0') as u32))
174                .ok_or(Error::IntegerOverflow)?);
175        }
176        Ok(out)
177    }
178    fn next(&mut self) -> Result<u8, Error> {
179        self.input.first()
180            .map(|&c| { self.consume(1); c })
181            .ok_or(Error::EOF)
182    }
183    fn expect(&mut self, c: u8) -> Result<(), Error> {
184        if self.input.first() != Some(&c) {
185            return Err(Error::Expected(c, self.input.first().cloned()));
186        }
187        self.consume(1);
188        Ok(())
189    }
190    fn parse_range(&mut self) -> Result<Range, Error> {
191        let mut range = Range(1, Some(1));
192        if let Some(&c) = self.input.first() {
193            let new = match c {
194                b'*' => Some((1, Range(0, None))),
195                b'\\' => match self.input.get(1) {
196                    Some(b'?') => Some((2, Range(0, Some(1)))),
197                    Some(b'+') => Some((2, Range(1, None))),
198                    Some(b'{') => {
199                        self.consume(2);
200                        let first = self.take_int()?.ok_or(Error::EmptyRepetition)?;
201                        let mut second = Some(first);
202                        if let Some(b',') = self.input.first() {
203                            self.consume(1);
204                            second = self.take_int()?;
205                        }
206                        if self.input.first() == Some(&b'}') {
207                            self.consume(1);
208                        } else if self.input.starts_with(br"\}") {
209                            self.consume(2);
210                        } else {
211                            return Err(Error::UnclosedRepetition);
212                        }
213                        if second.map(|second| first > second).unwrap_or(false) {
214                            return Err(Error::IllegalRange);
215                        }
216                        range = Range(first, second);
217                        None
218                    },
219                    _ => None
220                },
221                _ => None
222            };
223            if let Some((consume, new)) = new {
224                range = new;
225                self.consume(consume);
226            }
227        }
228        Ok(range)
229    }
230    fn parse(&mut self) -> Result<(), Error> {
231        self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
232        while let Ok(c) = self.next() {
233            let token = match c {
234                b'^' => Token::Start,
235                b'$' => Token::End,
236                b'.' => Token::Any,
237                b'[' => {
238                    let mut list = Vec::new();
239                    let invert = self.input.first() == Some(&b'^');
240
241                    if invert {
242                        self.consume(1);
243                    }
244
245                    loop {
246                        let mut c = self.next()?;
247
248                        let mut push = true;
249
250                        if c == b'[' {
251                            // TODO: Handle collation characters properly,
252                            // because currently idk what they are and only
253                            // have the behavior of `grep` to go on.
254                            match self.next()? {
255                                b'.' => {
256                                    c = self.next()?;
257                                    self.expect(b'.')?;
258                                    self.expect(b']')?;
259                                },
260                                b'=' => {
261                                    c = self.next()?;
262                                    self.expect(b'=')?;
263                                    self.expect(b']')?;
264                                },
265                                b':' => {
266                                    let end = self.input.iter().position(|&c| c == b':').ok_or(Error::EOF)?;
267                                    let key = &self.input[..end];
268                                    let class = *self.classes.get(key).ok_or_else(|| Error::UnknownClass(key.to_vec()))?;
269                                    self.consume(end + 1);
270                                    self.expect(b']')?;
271
272                                    list.push(Collation::Class(class));
273                                    push = false;
274                                },
275                                _ => return Err(Error::UnknownCollation)
276                            }
277                        }
278
279                        if push {
280                            list.push(Collation::Char(c));
281
282                            if self.input.first() == Some(&b'-') && self.input.get(1) != Some(&b']') {
283                                self.consume(1);
284                                let dest = self.next()?;
285                                for c in (c+1)..=dest {
286                                    list.push(Collation::Char(c));
287                                }
288                            }
289                        }
290
291                        if self.input.first() == Some(&b']') {
292                            self.consume(1);
293                            break;
294                        }
295                    }
296
297                    Token::OneOf {
298                        invert,
299                        list
300                    }
301                },
302                b'\\' if self.input.first().map(|&c| (c as char).is_digit(10)).unwrap_or(false) => {
303                    let id = self.take_int()?.unwrap();
304                    if (id as usize) >= self.group_id {
305                        return Err(Error::InvalidBackRef(id));
306                    }
307                    Token::BackRef(id)
308                }
309                b'\\' => match self.next()? {
310                    b'(' => {
311                        let id = self.group_id;
312                        self.group_id += 1;
313                        let checkpoint = self.builder.checkpoint();
314                        self.parse()?;
315                        let range = self.parse_range()?;
316                        self.builder.start_internal_at(checkpoint, Token::Group(id), range);
317                        self.builder.finish_internal();
318                        continue;
319                    },
320                    b')' => break,
321                    b'|' => {
322                        self.builder.finish_internal();
323                        self.builder.start_internal(Token::Alternative, Range(1, Some(1)));
324                        continue;
325                    },
326                    b'<' => Token::WordStart,
327                    b'>' => Token::WordEnd,
328                    b'a' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_alnum)] },
329                    b'd' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_digit)] },
330                    b's' => Token::OneOf { invert: false, list: vec![Collation::Class(ctype::is_space)] },
331                    b'S' => Token::OneOf { invert: true,  list: vec![Collation::Class(ctype::is_space)] },
332                    b'n' => Token::Char(b'\n'),
333                    b'r' => Token::Char(b'\r'),
334                    b't' => Token::Char(b'\t'),
335                    c => Token::Char(c)
336                },
337                c => Token::Char(c)
338            };
339            let range = self.parse_range()?;
340            self.builder.leaf(token, range);
341        }
342        self.builder.finish_internal();
343        Ok(())
344    }
345}
346
347#[cfg(test)]
348mod tests {
349    use super::*;
350
351    fn compile(input: &[u8]) -> String {
352        format!(
353            "{:?}",
354            PosixRegexBuilder::new(input)
355                .with_default_classes()
356                .compile_tokens()
357                .expect("error compiling regex")
358        )
359    }
360
361    #[test]
362    fn basic() {
363        assert_eq!(
364            compile(b"abc"),
365            "\
366Root 1..1
367  Alternative 1..1
368    'a' 1..1
369    'b' 1..1
370    'c' 1..1
371"
372        );
373    }
374    #[test]
375    fn groups() {
376        assert_eq!(
377            compile(br"\(abc\|bcd\|cde\)"),
378            "\
379Root 1..1
380  Alternative 1..1
381    Group(1) 1..1
382      Alternative 1..1
383        'a' 1..1
384        'b' 1..1
385        'c' 1..1
386      Alternative 1..1
387        'b' 1..1
388        'c' 1..1
389        'd' 1..1
390      Alternative 1..1
391        'c' 1..1
392        'd' 1..1
393        'e' 1..1
394"
395        );
396        assert_eq!(
397            compile(br"\(abc\|\(bcd\|cde\)\)"),
398            "\
399Root 1..1
400  Alternative 1..1
401    Group(1) 1..1
402      Alternative 1..1
403        'a' 1..1
404        'b' 1..1
405        'c' 1..1
406      Alternative 1..1
407        Group(2) 1..1
408          Alternative 1..1
409            'b' 1..1
410            'c' 1..1
411            'd' 1..1
412          Alternative 1..1
413            'c' 1..1
414            'd' 1..1
415            'e' 1..1
416"
417        );
418    }
419    #[test]
420    fn words() {
421        assert_eq!(
422            compile(br"\<word\>"),
423            "\
424Root 1..1
425  Alternative 1..1
426    < 1..1
427    'w' 1..1
428    'o' 1..1
429    'r' 1..1
430    'd' 1..1
431    > 1..1
432"
433        );
434    }
435    #[test]
436    fn repetitions() {
437        assert_eq!(
438            compile(br"yeee*"),
439            "\
440Root 1..1
441  Alternative 1..1
442    'y' 1..1
443    'e' 1..1
444    'e' 1..1
445    'e' 0..
446"
447        );
448        assert_eq!(
449            compile(br"yee\?"),
450            "\
451Root 1..1
452  Alternative 1..1
453    'y' 1..1
454    'e' 1..1
455    'e' 0..1
456"
457        );
458        assert_eq!(
459            compile(br"yee\+"),
460            "\
461Root 1..1
462  Alternative 1..1
463    'y' 1..1
464    'e' 1..1
465    'e' 1..
466"
467        );
468        assert_eq!(
469            compile(br"ye\{2}"),
470            "\
471Root 1..1
472  Alternative 1..1
473    'y' 1..1
474    'e' 2..2
475"
476        );
477        assert_eq!(
478            compile(br"ye\{2,}"),
479            "\
480Root 1..1
481  Alternative 1..1
482    'y' 1..1
483    'e' 2..
484"
485        );
486        assert_eq!(
487            compile(br"ye\{2,3}"),
488            "\
489Root 1..1
490  Alternative 1..1
491    'y' 1..1
492    'e' 2..3
493"
494        );
495    }
496    #[test]
497    fn bracket() {
498        assert_eq!(
499            compile(b"[abc]"),
500            "\
501Root 1..1
502  Alternative 1..1
503    {invert: false, ['a', 'b', 'c']} 1..1
504"
505        );
506        assert_eq!(
507            compile(b"[^abc]"),
508            "\
509Root 1..1
510  Alternative 1..1
511    {invert: true, ['a', 'b', 'c']} 1..1
512"
513        );
514        assert_eq!(
515            compile(b"[]] [^]]"),
516            "\
517Root 1..1
518  Alternative 1..1
519    {invert: false, [']']} 1..1
520    ' ' 1..1
521    {invert: true, [']']} 1..1
522"
523        );
524        assert_eq!(
525            compile(b"[0-3] [a-c] [-1] [1-]"),
526            "\
527Root 1..1
528  Alternative 1..1
529    {invert: false, ['0', '1', '2', '3']} 1..1
530    ' ' 1..1
531    {invert: false, ['a', 'b', 'c']} 1..1
532    ' ' 1..1
533    {invert: false, ['-', '1']} 1..1
534    ' ' 1..1
535    {invert: false, ['1', '-']} 1..1
536"
537        );
538        assert_eq!(
539            compile(b"[[.-.]-/]"),
540            "\
541Root 1..1
542  Alternative 1..1
543    {invert: false, ['-', '.', '/']} 1..1
544"
545        );
546        assert_eq!(
547            compile(b"[[:digit:][:upper:]]"),
548            format!("\
549Root 1..1
550  Alternative 1..1
551    {{invert: false, [{:p}, {:p}]}} 1..1
552", ctype::is_digit as fn(u8) -> bool, ctype::is_upper as fn(u8) -> bool)
553        );
554    }
555    #[test]
556    fn newline() {
557        assert_eq!(
558            compile(br"\r\n"),
559            "\
560Root 1..1
561  Alternative 1..1
562    '\\r' 1..1
563    '\\n' 1..1
564"
565        );
566    }
567    #[test]
568    fn backref() {
569        assert_eq!(
570            compile(br"\([abc]\)\1"),
571            "\
572Root 1..1
573  Alternative 1..1
574    Group(1) 1..1
575      Alternative 1..1
576        {invert: false, ['a', 'b', 'c']} 1..1
577    \\1 1..1
578"
579        )
580    }
581}