beans/regex/
parsing.rs

1use std::{iter::Enumerate, str::Chars};
2
3use super::matching::{Instruction, Program};
4use crate::regex::matching::InstructionPointer;
5use unbounded_interval_tree::interval_tree::IntervalTree;
6
7#[cfg(test)]
8pub mod tests {
9    use super::*;
10    use crate::lexer::TerminalId;
11
12    /// Compile a regex into a program executable on the VM.
13    pub fn compile(regex: &str, id: TerminalId) -> Result<(Program, usize), RegexError> {
14        let mut program = Program::new();
15        let (regex, nb_groups) = read(regex, 0)?;
16        build(regex, &mut program);
17        program.push(Instruction::Match(id));
18        Ok((program, nb_groups))
19    }
20
21    #[test]
22    fn read_char() {
23        use Regex::*;
24        assert_eq!(read("a", 0).unwrap(), (Char('a'), 0));
25    }
26
27    #[test]
28    fn read_multiline_comment() {
29        use std::ops::Bound::Included;
30        use Regex::*;
31        let a = read(r"/\*([^*]|\*[^/])*\*/", 0).unwrap();
32        let mut first_class = IntervalTree::default();
33        first_class.insert((Included('*'), Included('*')));
34        let mut second_class = IntervalTree::default();
35        second_class.insert((Included('/'), Included('/')));
36        // let b = Concat(
37        //     Box::new(Char('/')),
38        //     Box::new(Concat(
39        //         Box::new(Char('*')),
40        //         Box::new(Concat(
41        //             Box::new(KleeneStar(Box::new(Group(
42        //                 Box::new(Option(
43        //                     Box::new(CharacterClass(first_class, true)),
44        //                     Box::new(Concat(
45        //                         Box::new(Char('*')),
46        //                         Box::new(CharacterClass(second_class, true)),
47        //                     )),
48        //                 )),
49        //                 0,
50        //             )))),
51        //             Box::new(Concat(Box::new(Char('*')), Box::new(Char('/')))),
52        //         )),
53        //     )),
54        // );
55        let b = Concat(
56            Box::new(Concat(
57                Box::new(Concat(
58                    Box::new(Concat(Box::new(Char('/')), Box::new(Char('*')))),
59                    Box::new(KleeneStar(Box::new(Group(
60                        Box::new(Option(
61                            Box::new(CharacterClass(first_class, true)),
62                            Box::new(Concat(
63                                Box::new(Char('*')),
64                                Box::new(CharacterClass(second_class, true)),
65                            )),
66                        )),
67                        0,
68                    )))),
69                )),
70                Box::new(Char('*')),
71            )),
72            Box::new(Char('/')),
73        );
74        assert_eq!(a, (b, 1));
75    }
76
77    #[test]
78    fn read_word_boundary() {
79        use Regex::*;
80        assert_eq!(read(r"\b", 0).unwrap(), (WordBoundary, 0));
81    }
82
83    #[test]
84    fn read_eof() {
85        use Regex::*;
86        assert_eq!(read(r"\Z", 0).unwrap(), (EOF, 0));
87        assert_eq!(read(r"\z", 0).unwrap(), (EOF, 0));
88    }
89
90    #[test]
91    fn read_char_class() {
92        use std::ops::Bound::Included;
93        use Regex::*;
94        let mut tree = IntervalTree::default();
95        tree.insert((Included('a'), Included('a')));
96        assert_eq!(read("[a]", 0).unwrap(), (CharacterClass(tree, false), 0));
97
98        let mut tree = IntervalTree::default();
99        tree.insert((Included('a'), Included('z')));
100        tree.insert((Included('A'), Included('Z')));
101        tree.insert((Included('0'), Included('9')));
102        tree.insert((Included('_'), Included('_')));
103        assert_eq!(
104            read("[a-zA-Z0-9_]", 0).unwrap(),
105            (CharacterClass(tree, false), 0)
106        );
107
108        let mut tree = IntervalTree::default();
109        tree.insert((Included('a'), Included('z')));
110        tree.insert((Included('A'), Included('Z')));
111        tree.insert((Included('0'), Included('9')));
112        tree.insert((Included('_'), Included('_')));
113        assert_eq!(
114            read("[^a-zA-Z0-9_]", 0).unwrap(),
115            (CharacterClass(tree, true), 0)
116        );
117
118        assert_eq!(
119            read("[a-zb-z]", 0).unwrap_err(),
120            RegexError {
121                position: 8,
122                message: String::from(
123                    "Found 1 overlaps in character class from position 0 to position 8: (a-z,b-z)"
124                ),
125            }
126        );
127    }
128
129    #[test]
130    fn read_escaped() {
131        use Regex::*;
132        assert_eq!(read(r"\w", 0).unwrap(), (WordChar, 0));
133    }
134
135    #[test]
136    #[should_panic]
137    fn read_wrong_escaped() {
138        read(r"\l", 0).unwrap();
139    }
140
141    #[test]
142    fn read_group() {
143        use Regex::*;
144        assert_eq!(read("(a)", 0).unwrap(), (Group(Box::new(Char('a')), 0), 1));
145    }
146
147    #[test]
148    fn read_repetition() {
149        use Regex::*;
150        assert_eq!(
151            read("a+b+", 0).unwrap(),
152            (
153                Concat(
154                    Box::new(Repetition(Box::new(Char('a')))),
155                    Box::new(Repetition(Box::new(Char('b'))))
156                ),
157                0
158            )
159        );
160
161        assert_eq!(
162            read("(a+)(b+)", 0).unwrap(),
163            (
164                Concat(
165                    Box::new(Group(Box::new(Repetition(Box::new(Char('a')))), 0)),
166                    Box::new(Group(Box::new(Repetition(Box::new(Char('b')))), 1))
167                ),
168                2
169            )
170        );
171    }
172
173    #[test]
174    fn read_kleene_star() {
175        use Regex::*;
176
177        assert_eq!(
178            read("a*b*", 0).unwrap(),
179            (
180                Concat(
181                    Box::new(KleeneStar(Box::new(Char('a')))),
182                    Box::new(KleeneStar(Box::new(Char('b'))))
183                ),
184                0
185            )
186        );
187
188        assert_eq!(
189            read("(a*)(b*)", 0).unwrap(),
190            (
191                Concat(
192                    Box::new(Group(Box::new(KleeneStar(Box::new(Char('a')))), 0)),
193                    Box::new(Group(Box::new(KleeneStar(Box::new(Char('b')))), 1))
194                ),
195                2
196            )
197        );
198    }
199
200    #[test]
201    fn read_option() {
202        use Regex::*;
203
204        assert_eq!(
205            read("abc|bcd", 0).unwrap(),
206            (
207                Option(
208                    Box::new(Concat(
209                        Box::new(Concat(Box::new(Char('a')), Box::new(Char('b')),)),
210                        Box::new(Char('c'))
211                    )),
212                    Box::new(Concat(
213                        Box::new(Concat(Box::new(Char('b')), Box::new(Char('c')),)),
214                        Box::new(Char('d'))
215                    )),
216                ),
217                0
218            )
219        );
220
221        assert_eq!(
222            read("ab|cd|ef", 0).unwrap(),
223            (
224                Option(
225                    Box::new(Option(
226                        Box::new(Concat(Box::new(Char('a')), Box::new(Char('b')))),
227                        Box::new(Concat(Box::new(Char('c')), Box::new(Char('d'))))
228                    )),
229                    Box::new(Concat(Box::new(Char('e')), Box::new(Char('f'))))
230                ),
231                0
232            )
233        );
234    }
235
236    #[test]
237    fn read_any() {
238        use Regex::*;
239        assert_eq!(read(".", 0).unwrap(), (Any, 0));
240        assert_eq!(read(".*", 0).unwrap(), (KleeneStar(Box::new(Any)), 0));
241    }
242
243    #[test]
244    fn read_malformed() {
245        let wrong_regex = [
246            ("a**", 2),
247            ("a*?", 2),
248            ("a?*", 2),
249            ("a*+", 2),
250            ("a+*", 2),
251            ("a??", 2),
252            ("a?+", 2),
253            ("a+?", 2),
254            ("a++", 2),
255            ("*", 0),
256            ("?", 0),
257            ("+", 0),
258        ];
259        for &(regex, p) in wrong_regex.iter() {
260            if let Err(RegexError { position: pos, .. }) = read(regex, 0) {
261                assert_eq!(pos, p);
262            } else {
263                panic!("(/{}/ shouldn't succeed.", regex);
264            }
265        }
266    }
267
268    #[test]
269    fn build_basic() {
270        use Instruction::*;
271        let program = compile("a+b+", TerminalId(0)).unwrap();
272        assert_eq!(
273            program,
274            (
275                Program::from(vec![
276                    Char('a'),
277                    Split(InstructionPointer(0), InstructionPointer(2)),
278                    Char('b'),
279                    Split(InstructionPointer(2), InstructionPointer(4)),
280                    Match(TerminalId(0))
281                ]),
282                0
283            )
284        );
285    }
286
287    #[test]
288    fn build_char() {
289        use Instruction::*;
290        let program = compile("a", TerminalId(0)).unwrap();
291        assert_eq!(
292            program,
293            (Program::from(vec![Char('a'), Match(TerminalId(0))]), 0)
294        );
295    }
296
297    #[test]
298    fn build_char_class() {
299        use std::ops::Bound::Included;
300        use Instruction::*;
301        let mut tree = IntervalTree::default();
302        tree.insert((Included('a'), Included('z')));
303        tree.insert((Included('A'), Included('Z')));
304        tree.insert((Included('0'), Included('9')));
305        tree.insert((Included('_'), Included('_')));
306        let program = compile("[a-zA-Z0-9_]", TerminalId(0)).unwrap();
307        assert_eq!(
308            program,
309            (
310                Program::from(vec![CharacterClass(tree, false), Match(TerminalId(0))]),
311                0
312            )
313        );
314    }
315
316    #[test]
317    fn build_word_boundary() {
318        use Instruction::*;
319        let program = compile(r"\b", TerminalId(0)).unwrap();
320        assert_eq!(
321            program,
322            (Program::from(vec![WordBoundary, Match(TerminalId(0))]), 0)
323        );
324    }
325
326    #[test]
327    fn build_eof() {
328        use Instruction::*;
329        let program = compile(r"\z", TerminalId(0)).unwrap();
330        assert_eq!(program, (Program::from(vec![EOF, Match(TerminalId(0))]), 0));
331        let program = compile(r"\Z", TerminalId(0)).unwrap();
332        assert_eq!(program, (Program::from(vec![EOF, Match(TerminalId(0))]), 0));
333    }
334
335    #[test]
336    fn build_escaped() {
337        use Instruction::*;
338        let program = compile(r"\w", TerminalId(0)).unwrap();
339        assert_eq!(
340            program,
341            (Program::from(vec![WordChar, Match(TerminalId(0))]), 0)
342        );
343    }
344
345    #[test]
346    fn build_concat() {
347        use Instruction::*;
348        let program = compile("ab", TerminalId(0)).unwrap();
349        assert_eq!(
350            program,
351            (
352                Program::from(vec![Char('a'), Char('b'), Match(TerminalId(0))]),
353                0
354            )
355        );
356    }
357
358    #[test]
359    fn build_option() {
360        use Instruction::*;
361        let program = compile("a|b", TerminalId(0)).unwrap();
362        assert_eq!(
363            program,
364            (
365                Program::from(vec![
366                    Split(InstructionPointer(1), InstructionPointer(3)),
367                    Char('a'),
368                    Jump(InstructionPointer(4)),
369                    Char('b'),
370                    Match(TerminalId(0))
371                ]),
372                0
373            )
374        );
375    }
376
377    #[test]
378    fn build_optional() {
379        use Instruction::*;
380        let program = compile("a?", TerminalId(0)).unwrap();
381        assert_eq!(
382            program,
383            (
384                Program::from(vec![
385                    Split(InstructionPointer(1), InstructionPointer(2)),
386                    Char('a'),
387                    Match(TerminalId(0))
388                ]),
389                0
390            )
391        );
392    }
393
394    #[test]
395    fn build_kleene_star() {
396        use Instruction::*;
397        let program = compile("a*", TerminalId(0)).unwrap();
398        assert_eq!(
399            program,
400            (
401                Program::from(vec![
402                    Split(InstructionPointer(1), InstructionPointer(3)),
403                    Char('a'),
404                    Jump(InstructionPointer(0)),
405                    Match(TerminalId(0))
406                ]),
407                0
408            )
409        );
410    }
411
412    #[test]
413    fn build_repetition() {
414        use Instruction::*;
415        let program = compile("a+", TerminalId(0)).unwrap();
416        assert_eq!(
417            program,
418            (
419                Program::from(vec![
420                    Char('a'),
421                    Split(InstructionPointer(0), InstructionPointer(2)),
422                    Match(TerminalId(0))
423                ]),
424                0
425            )
426        );
427    }
428
429    #[test]
430    fn build_any() {
431        use Instruction::*;
432        let program = compile(".", TerminalId(0)).unwrap();
433        assert_eq!(program, (Program::from(vec![Any, Match(TerminalId(0))]), 0));
434    }
435
436    #[test]
437    fn build_groups() {
438        use Instruction::*;
439        let program = compile("(a+)(b+)", TerminalId(0)).unwrap();
440        assert_eq!(
441            program,
442            (
443                Program::from(vec![
444                    Save(0),
445                    Char('a'),
446                    Split(InstructionPointer(1), InstructionPointer(3)),
447                    Save(1),
448                    Save(2),
449                    Char('b'),
450                    Split(InstructionPointer(5), InstructionPointer(7)),
451                    Save(3),
452                    Match(TerminalId(0))
453                ]),
454                2
455            )
456        )
457    }
458
459    #[test]
460    fn build_string() {
461        use std::collections::Bound::Included;
462        use Instruction::*;
463        let program = compile(r"'(([^'\\]|\\[^\\]|\\\\)*)'", TerminalId(0)).unwrap();
464        let mut first_char_class = IntervalTree::default();
465        first_char_class.insert((Included('\''), Included('\'')));
466        first_char_class.insert((Included('\\'), Included('\\')));
467        let mut second_char_class = IntervalTree::default();
468        second_char_class.insert((Included('\\'), Included('\\')));
469        let correct = Program::from(vec![
470            Char('\''),
471            Save(0),
472            Split(InstructionPointer(3), InstructionPointer(15)),
473            Save(2),
474            Split(InstructionPointer(5), InstructionPointer(11)),
475            Split(InstructionPointer(6), InstructionPointer(8)),
476            CharacterClass(first_char_class, true),
477            Jump(InstructionPointer(10)),
478            Char('\\'),
479            CharacterClass(second_char_class, true),
480            Jump(InstructionPointer(13)),
481            Char('\\'),
482            Char('\\'),
483            Save(3),
484            Jump(InstructionPointer(2)),
485            Save(1),
486            Char('\''),
487            Match(TerminalId(0)),
488        ]);
489        assert_eq!(program, (correct, 2));
490    }
491}
492
493/// # Summary
494///
495/// `Regex` represents any successfully parsed regex.
496#[cfg_attr(test, derive(PartialEq))]
497#[derive(Debug)]
498pub enum Regex {
499    Char(char),
500    Option(Box<Regex>, Box<Regex>),
501    Optional(Box<Regex>),
502    Repetition(Box<Regex>),
503    KleeneStar(Box<Regex>),
504    Concat(Box<Regex>, Box<Regex>),
505    Group(Box<Regex>, usize),
506    CharacterClass(IntervalTree<char>, bool),
507    WordChar,
508    Digit,
509    Whitespace,
510    WordBoundary,
511    EOF,
512    Any,
513    Empty,
514}
515
516/// # Summary
517///
518/// `RegexError` is an alias of a couple (usize, String).
519/// The first element is the position at which the error occured,
520/// and the second one is a description of the said error.
521#[derive(Debug, PartialEq, Eq)]
522pub struct RegexError {
523    pub position: usize,
524    pub message: String,
525}
526
527impl From<(Regex, Option<Regex>)> for Regex {
528    fn from(t: (Regex, Option<Regex>)) -> Self {
529        match t {
530            (r, Some(r2)) => Regex::Option(Box::new(r2), Box::new(r)),
531            (r, None) => r,
532        }
533    }
534}
535
536/// Take a `Regex`, and a reference to a `Program` and
537/// appends the implementation of the `Regex` at the
538/// end of the `Program`.
539///
540/// **Warning**: this function is recursive, and a malicious input
541/// may lead to a stack overflow. This is because this library
542/// is really meant to be used in a context in which the end user
543/// (the one providing the input) is also the one using the library.
544pub fn build(regex: Regex, program: &mut Program) {
545    match regex {
546        Regex::Char(c) => {
547            program.push(Instruction::Char(c));
548        }
549        Regex::Option(r1, r2) => {
550            let split_pos = InstructionPointer(program.len());
551            program.push(Instruction::Split(
552                InstructionPointer(0),
553                InstructionPointer(0),
554            ));
555            build(*r1, program);
556            let jmp_pos = program.len_ip();
557            program.push(Instruction::Jump(InstructionPointer(0)));
558            build(*r2, program);
559            program[split_pos] = Instruction::Split(split_pos.incr(), jmp_pos.incr());
560            program[jmp_pos] = Instruction::Jump(program.len_ip());
561        }
562        Regex::Concat(r1, r2) => {
563            build(*r1, program);
564            build(*r2, program);
565        }
566        Regex::Optional(r) => {
567            let split_pos = program.len_ip();
568            program.push(Instruction::Split(
569                InstructionPointer(0),
570                InstructionPointer(0),
571            ));
572            build(*r, program);
573            program[split_pos] = Instruction::Split(split_pos.incr(), program.len_ip());
574        }
575        Regex::KleeneStar(r) => {
576            let split_pos = program.len_ip();
577            program.push(Instruction::Split(
578                InstructionPointer(0),
579                InstructionPointer(0),
580            ));
581            build(*r, program);
582            program.push(Instruction::Jump(split_pos));
583            program[split_pos] = Instruction::Split(split_pos.incr(), program.len_ip());
584        }
585        Regex::Repetition(r) => {
586            let init_pos = program.len_ip();
587            build(*r, program);
588            program.push(Instruction::Split(init_pos, program.len_ip().incr()));
589        }
590        Regex::Group(r, i) => {
591            program.push(Instruction::Save(2 * i));
592            build(*r, program);
593            program.push(Instruction::Save(2 * i + 1));
594        }
595        Regex::Any => program.push(Instruction::Any),
596        Regex::WordChar => program.push(Instruction::WordChar),
597        Regex::Digit => program.push(Instruction::Digit),
598        Regex::WordBoundary => program.push(Instruction::WordBoundary),
599        Regex::CharacterClass(class, negated) => {
600            program.push(Instruction::CharacterClass(class, negated))
601        }
602        Regex::EOF => program.push(Instruction::EOF),
603        Regex::Whitespace => program.push(Instruction::Whitespace),
604        Regex::Empty => {}
605    };
606}
607
608/// Parse a regex. The parsing technique is quite efficient,
609/// essentially linear time.
610/// **This is a private function, please use the API instead.**
611pub fn read(regex: &str, mut groups: usize) -> Result<(Regex, usize), RegexError> {
612    /// Parse a character class.
613    fn read_char_class(
614        input: &mut std::iter::Enumerate<std::str::Chars<'_>>,
615        size: usize,
616        actual: usize,
617    ) -> Result<Regex, RegexError> {
618        use std::ops::{Bound, Bound::Included};
619        fn insert(
620            interval: (Bound<char>, Bound<char>),
621            tree: &mut IntervalTree<char>,
622            overlaps: &mut Vec<Vec<(char, char)>>,
623        ) {
624            let over = tree.get_interval_overlaps(&interval);
625            if !over.is_empty() {
626                overlaps.push(
627                    over.into_iter()
628                        .chain(std::iter::once(&interval))
629                        .map(|(start, end)| {
630                            let s = match start {
631                                Included(s) => *s,
632                                _ => panic!(),
633                            };
634                            let e = match end {
635                                Included(e) => *e,
636                                _ => panic!(),
637                            };
638                            (s, e)
639                        })
640                        .collect(),
641                );
642            }
643            tree.insert(interval);
644        }
645        let mut tree = IntervalTree::default();
646        let mut last = None;
647        let mut overlaps = Vec::new();
648        let mut negated = false;
649
650        fn read_escaped_char(
651            input: &mut Enumerate<Chars<'_>>,
652            pos: usize,
653        ) -> Result<char, RegexError> {
654            if let Some((pos, c)) = input.next() {
655                match c {
656                    ']' => Ok(c),
657                    't' => Ok('\t'),
658                    'n' => Ok('\n'),
659                    '^' => Ok('^'),
660                    '-' => Ok('-'),
661                    '\\' => Ok('\\'),
662                    _ => {
663                        Err(RegexError {
664                            position: pos,
665                            message: format!(
666                                "Cannot escape character inside character class {}, try replacing /\\{}/ with /{}/",
667                                c, c, c
668                            ),
669                        })
670                    }
671                }
672            } else {
673                Err(RegexError {
674                    position: pos,
675                    message: String::from(
676                        "Expected something after '\', but found EOF instead",
677                    ),
678                })
679            }
680        }
681
682        while let Some((pos, chr)) = input.next() {
683            match chr {
684                '\\' => {
685                    if let Some(cr) = last {
686                        insert((Included(cr), Included(cr)), &mut tree, &mut overlaps);
687                    }
688                    last = Some(read_escaped_char(input, pos)?);
689                }
690                '^' if pos == actual + 1 => {
691                    negated = true;
692                }
693                '-' => {
694                    if let Some(c1) = last {
695                        if let Some((_, c2)) = input.next() {
696                            let chr = if c2 == '\\' {
697                                read_escaped_char(input, pos)?
698                            } else {
699                                c2
700                            };
701                            insert((Included(c1), Included(chr)), &mut tree, &mut overlaps);
702                        } else {
703                            return Err(RegexError {
704                                position: pos,
705                                message: String::from(
706                                    "Expected something after '-', but found EOF instead",
707                                ),
708                            });
709                        }
710                        last = None;
711                    } else {
712                        insert((Included('-'), Included('-')), &mut tree, &mut overlaps);
713                    }
714                }
715                ']' => {
716                    if let Some(c) = last {
717                        insert((Included(c), Included(c)), &mut tree, &mut overlaps);
718                    }
719                    if !overlaps.is_empty() {
720                        let mut err_str = format!("Found {} overlaps in character class from position {} to position {}: ", overlaps.len(), actual, pos+1);
721                        for overlap in overlaps {
722                            err_str.push('(');
723                            for (s, e) in overlap {
724                                err_str.push(s);
725                                err_str.push('-');
726                                err_str.push(e);
727                                err_str.push(',');
728                            }
729                            err_str.pop();
730                            err_str.push_str("), ");
731                        }
732                        err_str.pop();
733                        err_str.pop();
734                        return Err(RegexError {
735                            position: pos + 1,
736                            message: err_str,
737                        });
738                    } else {
739                        return Ok(Regex::CharacterClass(tree, negated));
740                    }
741                }
742                c => {
743                    if let Some(cr) = last {
744                        insert((Included(cr), Included(cr)), &mut tree, &mut overlaps);
745                    }
746                    last = Some(c);
747                }
748            }
749        }
750        Err(RegexError {position: size, message: format!("Expected end of character class, but found EOF. Try adding ']' if you really meant to have a character class, or adding '\\' at position {} if you didn't want a character class", actual)})
751    }
752
753    fn add(regex: Regex, stack: &mut Vec<(Regex, Option<Regex>, usize)>) {
754        let (last, remainder, group) = stack.pop().unwrap();
755        stack.push((concat(last, regex), remainder, group));
756    }
757
758    fn concat(left: Regex, right: Regex) -> Regex {
759        if let Regex::Empty = left {
760            right
761        } else {
762            Regex::Concat(Box::new(left), Box::new(right))
763        }
764    }
765
766    fn kleene_star(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
767        match exp {
768            Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(kleene_star(*r2, pos)?))),
769            Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(kleene_star(*r2, pos)?))),
770            Regex::Empty => Err(RegexError {
771                position: pos,
772                message: String::from("Cannot apply kleene star to empty regex."),
773            }),
774            Regex::KleeneStar(..) => Err(RegexError {
775                position: pos,
776                message: String::from("Cannot apply kleene star twice."),
777            }),
778            Regex::Optional(..) => Err(RegexError {
779                position: pos,
780                message: String::from("Cannot apply kleene star on an optional group."),
781            }),
782            Regex::Repetition(..) => Err(RegexError {
783                position: pos,
784                message: String::from("Cannot apply kleene star and repetition."),
785            }),
786            r => Ok(Regex::KleeneStar(Box::new(r))),
787        }
788    }
789
790    fn repetition(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
791        match exp {
792            Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(repetition(*r2, pos)?))),
793            Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(repetition(*r2, pos)?))),
794            Regex::Empty => Err(RegexError {
795                position: pos,
796                message: String::from("Cannot apply repetition to empty regex."),
797            }),
798            Regex::KleeneStar(..) => Err(RegexError {
799                position: pos,
800                message: String::from("Cannot apply repetition twice."),
801            }),
802            Regex::Optional(..) => Err(RegexError {
803                position: pos,
804                message: String::from("Cannot apply repetition on an optional group."),
805            }),
806            Regex::Repetition(..) => Err(RegexError {
807                position: pos,
808                message: String::from("Cannot apply repetition and kleene star."),
809            }),
810            r => Ok(Regex::Repetition(Box::new(r))),
811        }
812    }
813
814    fn optional(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
815        match exp {
816            Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(optional(*r2, pos)?))),
817            Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(optional(*r2, pos)?))),
818            Regex::Empty => Err(RegexError {
819                position: pos,
820                message: String::from("Cannot apply optional to empty regex."),
821            }),
822            Regex::KleeneStar(..) => Err(RegexError {
823                position: pos,
824                message: String::from("Non-greedy kleene star is not supported."),
825            }),
826            Regex::Optional(..) => Err(RegexError {
827                position: pos,
828                message: String::from("Non-greedy optional is not supported."),
829            }),
830            Regex::Repetition(..) => Err(RegexError {
831                position: pos,
832                message: String::from("Non-greedy repetition is not supported."),
833            }),
834            r => Ok(Regex::Optional(Box::new(r))),
835        }
836    }
837
838    let mut stack = vec![(Regex::Empty, None, 0)];
839    let mut chrs = regex.chars().enumerate();
840    let size = regex.chars().count();
841    while let Some((pos, chr)) = chrs.next() {
842        match chr {
843            '(' => {
844                stack.push((Regex::Empty, None, groups));
845		groups += 1;
846            }
847            ')' => {
848                if stack.len() <= 1 {
849                    return Err(RegexError {
850                        position: pos,
851                        message: String::from("Closing parenthesis doesn't match any previously opened."),
852                    });
853                }
854                {
855		    let (last, remainder, nb_group) = stack.pop().unwrap();
856                    let group = Regex::Group(Box::new((last, remainder).into()), nb_group);
857                    let (last, remainder, nb_group) = stack.pop().unwrap();
858                    stack.push((concat(last, group), remainder, nb_group));
859                }
860            }
861            '?' => {
862                let (last, remainder, nb_group) = stack.pop().unwrap();
863                stack.push((optional(last, pos)?, remainder, nb_group));
864            }
865            '*' => {
866                let (last, remainder, nb_group) = stack.pop().unwrap();
867                stack.push((kleene_star(last, pos)?, remainder, nb_group));
868            }
869            '+' => {
870                let (last, remainder, nb_group) = stack.pop().unwrap();
871                stack.push((repetition(last, pos)?, remainder, nb_group));
872            }
873            '|' => {
874		let (l, remainder, group) = stack.pop().unwrap();
875                let last = (l, remainder).into();
876                stack.push((Regex::Empty, Some(last), group));
877            }
878	    '$' => return Err(RegexError {
879		position: pos,
880		message: String::from("End-of-line /$/ is not supported. Matches are anchored anyways. Try /\\n/ instead.")
881	    }),
882	    '^' => return Err(RegexError {
883		position: pos,
884		message: String::from("Start-of-line /^/ is not supported. Matches are anchored anyways.")
885	    }),
886	    ']' => return Err(RegexError {
887		position: pos,
888		message: String::from("Closing bracket doesn't match any previsouly opened."),
889	    }),
890            '.' => add(Regex::Any, &mut stack),
891            '\\' => {
892                if let Some((pos, chr)) = chrs.next() {
893                    match chr {
894			'\\' | '.' | '(' | ')' | '?' | '+' | '*' | '|' | '$' | '^' | '[' | ']' => add(Regex::Char(chr), &mut stack),
895			'A' => return Err(RegexError {
896			    position: pos,
897			    message: String::from("Start of the string anchor /\\A/ is not supported.")
898			}),
899			'N' => return Err(RegexError {
900			    position: pos,
901			    message: String::from("Not a line break shorthand /\\N/ is not supported. Try /[^\\n]/ instead.")
902			}),
903			'Z' => add(Regex::EOF, &mut stack),
904			'b' => add(Regex::WordBoundary, &mut stack),
905			'd' => add(Regex::Digit, &mut stack),
906			'h' => return Err(RegexError {
907			    position: pos,
908			    message: String::from("Horizontal whitespace / hexadecimal digit shorthand /\\h/ is not supported. Try [0-9a-f] instead if you wanted hexadecimal digit.")
909			}),
910			'l' => return Err(RegexError {
911			    position: pos,
912			    message: String::from("Lowercase shorthand /\\l/ is not supported. Try /[a-z]/ instead.")
913			}),
914			'n' => add(Regex::Char('\n'), &mut stack),
915			'r' => return Err(RegexError {
916			    position: pos,
917			    message: String::from("Nonsense EOL /\\r/ is not supported. Try /\\n/ instead.")
918			}),
919			's' => add(Regex::Whitespace, &mut stack),
920			't' => add(Regex::Char('\t'), &mut stack),
921			'u' => return Err(RegexError {
922			    position: pos,
923			    message: String::from("Uppercase shorthand /\\u/ is not supported. Try /[A-Z]/ instead.")
924			}),
925			'v' => return Err(RegexError {
926			    position: pos,
927			    message: String::from("Vertical whitespace shorthand /\\v/ is not supported.")
928			}),
929                        'w' => add(Regex::WordChar, &mut stack),
930			'x' => return Err(RegexError {
931			    position: pos,
932			    message: String::from("Hexadecimal escape /\\xFF/ is not supported.")
933			}),
934			'z' => add(Regex::EOF, &mut stack),
935                        _ => {
936                            return Err(RegexError {
937                                position: pos,
938                                message: format!(
939                                    "Cannot escape character {}, try replacing /\\{}/ with /{}/",
940                                    chr, chr, chr
941                                ),
942                            })
943                        }
944                    }
945                } else {
946                    return Err(RegexError {
947                        position: pos,
948                        message: String::from(
949                            "Expected something after character '\', but found EOF instead",
950                        ),
951                    });
952                }
953            }
954            '[' => {
955                add(read_char_class(&mut chrs, size, pos)?, &mut stack);
956            }
957            c => add(Regex::Char(c), &mut stack),
958        }
959    }
960    if stack.len() != 1 {
961        Err(RegexError {
962            position: size,
963            message: format!(
964                "Expected {} closing parenthesis, found EOF",
965                stack.len() - 1
966            ),
967        })
968    } else {
969        Ok((
970            {
971                let (last, remainder, _) = stack.pop().unwrap();
972                (last, remainder).into()
973            },
974            groups,
975        ))
976    }
977}