1use std::{iter::Enumerate, str::Chars};
2
3use super::matching::{Instruction, Program};
4use crate::regex::matching::InstructionPointer;
5use unbounded_interval_tree::interval_tree::IntervalTree;
6
7#[cfg(test)]
8pub mod tests {
9 use super::*;
10 use crate::lexer::TerminalId;
11
12 pub fn compile(regex: &str, id: TerminalId) -> Result<(Program, usize), RegexError> {
14 let mut program = Program::new();
15 let (regex, nb_groups) = read(regex, 0)?;
16 build(regex, &mut program);
17 program.push(Instruction::Match(id));
18 Ok((program, nb_groups))
19 }
20
21 #[test]
22 fn read_char() {
23 use Regex::*;
24 assert_eq!(read("a", 0).unwrap(), (Char('a'), 0));
25 }
26
27 #[test]
28 fn read_multiline_comment() {
29 use std::ops::Bound::Included;
30 use Regex::*;
31 let a = read(r"/\*([^*]|\*[^/])*\*/", 0).unwrap();
32 let mut first_class = IntervalTree::default();
33 first_class.insert((Included('*'), Included('*')));
34 let mut second_class = IntervalTree::default();
35 second_class.insert((Included('/'), Included('/')));
36 let b = Concat(
56 Box::new(Concat(
57 Box::new(Concat(
58 Box::new(Concat(Box::new(Char('/')), Box::new(Char('*')))),
59 Box::new(KleeneStar(Box::new(Group(
60 Box::new(Option(
61 Box::new(CharacterClass(first_class, true)),
62 Box::new(Concat(
63 Box::new(Char('*')),
64 Box::new(CharacterClass(second_class, true)),
65 )),
66 )),
67 0,
68 )))),
69 )),
70 Box::new(Char('*')),
71 )),
72 Box::new(Char('/')),
73 );
74 assert_eq!(a, (b, 1));
75 }
76
77 #[test]
78 fn read_word_boundary() {
79 use Regex::*;
80 assert_eq!(read(r"\b", 0).unwrap(), (WordBoundary, 0));
81 }
82
83 #[test]
84 fn read_eof() {
85 use Regex::*;
86 assert_eq!(read(r"\Z", 0).unwrap(), (EOF, 0));
87 assert_eq!(read(r"\z", 0).unwrap(), (EOF, 0));
88 }
89
90 #[test]
91 fn read_char_class() {
92 use std::ops::Bound::Included;
93 use Regex::*;
94 let mut tree = IntervalTree::default();
95 tree.insert((Included('a'), Included('a')));
96 assert_eq!(read("[a]", 0).unwrap(), (CharacterClass(tree, false), 0));
97
98 let mut tree = IntervalTree::default();
99 tree.insert((Included('a'), Included('z')));
100 tree.insert((Included('A'), Included('Z')));
101 tree.insert((Included('0'), Included('9')));
102 tree.insert((Included('_'), Included('_')));
103 assert_eq!(
104 read("[a-zA-Z0-9_]", 0).unwrap(),
105 (CharacterClass(tree, false), 0)
106 );
107
108 let mut tree = IntervalTree::default();
109 tree.insert((Included('a'), Included('z')));
110 tree.insert((Included('A'), Included('Z')));
111 tree.insert((Included('0'), Included('9')));
112 tree.insert((Included('_'), Included('_')));
113 assert_eq!(
114 read("[^a-zA-Z0-9_]", 0).unwrap(),
115 (CharacterClass(tree, true), 0)
116 );
117
118 assert_eq!(
119 read("[a-zb-z]", 0).unwrap_err(),
120 RegexError {
121 position: 8,
122 message: String::from(
123 "Found 1 overlaps in character class from position 0 to position 8: (a-z,b-z)"
124 ),
125 }
126 );
127 }
128
129 #[test]
130 fn read_escaped() {
131 use Regex::*;
132 assert_eq!(read(r"\w", 0).unwrap(), (WordChar, 0));
133 }
134
135 #[test]
136 #[should_panic]
137 fn read_wrong_escaped() {
138 read(r"\l", 0).unwrap();
139 }
140
141 #[test]
142 fn read_group() {
143 use Regex::*;
144 assert_eq!(read("(a)", 0).unwrap(), (Group(Box::new(Char('a')), 0), 1));
145 }
146
147 #[test]
148 fn read_repetition() {
149 use Regex::*;
150 assert_eq!(
151 read("a+b+", 0).unwrap(),
152 (
153 Concat(
154 Box::new(Repetition(Box::new(Char('a')))),
155 Box::new(Repetition(Box::new(Char('b'))))
156 ),
157 0
158 )
159 );
160
161 assert_eq!(
162 read("(a+)(b+)", 0).unwrap(),
163 (
164 Concat(
165 Box::new(Group(Box::new(Repetition(Box::new(Char('a')))), 0)),
166 Box::new(Group(Box::new(Repetition(Box::new(Char('b')))), 1))
167 ),
168 2
169 )
170 );
171 }
172
173 #[test]
174 fn read_kleene_star() {
175 use Regex::*;
176
177 assert_eq!(
178 read("a*b*", 0).unwrap(),
179 (
180 Concat(
181 Box::new(KleeneStar(Box::new(Char('a')))),
182 Box::new(KleeneStar(Box::new(Char('b'))))
183 ),
184 0
185 )
186 );
187
188 assert_eq!(
189 read("(a*)(b*)", 0).unwrap(),
190 (
191 Concat(
192 Box::new(Group(Box::new(KleeneStar(Box::new(Char('a')))), 0)),
193 Box::new(Group(Box::new(KleeneStar(Box::new(Char('b')))), 1))
194 ),
195 2
196 )
197 );
198 }
199
200 #[test]
201 fn read_option() {
202 use Regex::*;
203
204 assert_eq!(
205 read("abc|bcd", 0).unwrap(),
206 (
207 Option(
208 Box::new(Concat(
209 Box::new(Concat(Box::new(Char('a')), Box::new(Char('b')),)),
210 Box::new(Char('c'))
211 )),
212 Box::new(Concat(
213 Box::new(Concat(Box::new(Char('b')), Box::new(Char('c')),)),
214 Box::new(Char('d'))
215 )),
216 ),
217 0
218 )
219 );
220
221 assert_eq!(
222 read("ab|cd|ef", 0).unwrap(),
223 (
224 Option(
225 Box::new(Option(
226 Box::new(Concat(Box::new(Char('a')), Box::new(Char('b')))),
227 Box::new(Concat(Box::new(Char('c')), Box::new(Char('d'))))
228 )),
229 Box::new(Concat(Box::new(Char('e')), Box::new(Char('f'))))
230 ),
231 0
232 )
233 );
234 }
235
236 #[test]
237 fn read_any() {
238 use Regex::*;
239 assert_eq!(read(".", 0).unwrap(), (Any, 0));
240 assert_eq!(read(".*", 0).unwrap(), (KleeneStar(Box::new(Any)), 0));
241 }
242
243 #[test]
244 fn read_malformed() {
245 let wrong_regex = [
246 ("a**", 2),
247 ("a*?", 2),
248 ("a?*", 2),
249 ("a*+", 2),
250 ("a+*", 2),
251 ("a??", 2),
252 ("a?+", 2),
253 ("a+?", 2),
254 ("a++", 2),
255 ("*", 0),
256 ("?", 0),
257 ("+", 0),
258 ];
259 for &(regex, p) in wrong_regex.iter() {
260 if let Err(RegexError { position: pos, .. }) = read(regex, 0) {
261 assert_eq!(pos, p);
262 } else {
263 panic!("(/{}/ shouldn't succeed.", regex);
264 }
265 }
266 }
267
268 #[test]
269 fn build_basic() {
270 use Instruction::*;
271 let program = compile("a+b+", TerminalId(0)).unwrap();
272 assert_eq!(
273 program,
274 (
275 Program::from(vec![
276 Char('a'),
277 Split(InstructionPointer(0), InstructionPointer(2)),
278 Char('b'),
279 Split(InstructionPointer(2), InstructionPointer(4)),
280 Match(TerminalId(0))
281 ]),
282 0
283 )
284 );
285 }
286
287 #[test]
288 fn build_char() {
289 use Instruction::*;
290 let program = compile("a", TerminalId(0)).unwrap();
291 assert_eq!(
292 program,
293 (Program::from(vec![Char('a'), Match(TerminalId(0))]), 0)
294 );
295 }
296
297 #[test]
298 fn build_char_class() {
299 use std::ops::Bound::Included;
300 use Instruction::*;
301 let mut tree = IntervalTree::default();
302 tree.insert((Included('a'), Included('z')));
303 tree.insert((Included('A'), Included('Z')));
304 tree.insert((Included('0'), Included('9')));
305 tree.insert((Included('_'), Included('_')));
306 let program = compile("[a-zA-Z0-9_]", TerminalId(0)).unwrap();
307 assert_eq!(
308 program,
309 (
310 Program::from(vec![CharacterClass(tree, false), Match(TerminalId(0))]),
311 0
312 )
313 );
314 }
315
316 #[test]
317 fn build_word_boundary() {
318 use Instruction::*;
319 let program = compile(r"\b", TerminalId(0)).unwrap();
320 assert_eq!(
321 program,
322 (Program::from(vec![WordBoundary, Match(TerminalId(0))]), 0)
323 );
324 }
325
326 #[test]
327 fn build_eof() {
328 use Instruction::*;
329 let program = compile(r"\z", TerminalId(0)).unwrap();
330 assert_eq!(program, (Program::from(vec![EOF, Match(TerminalId(0))]), 0));
331 let program = compile(r"\Z", TerminalId(0)).unwrap();
332 assert_eq!(program, (Program::from(vec![EOF, Match(TerminalId(0))]), 0));
333 }
334
335 #[test]
336 fn build_escaped() {
337 use Instruction::*;
338 let program = compile(r"\w", TerminalId(0)).unwrap();
339 assert_eq!(
340 program,
341 (Program::from(vec![WordChar, Match(TerminalId(0))]), 0)
342 );
343 }
344
345 #[test]
346 fn build_concat() {
347 use Instruction::*;
348 let program = compile("ab", TerminalId(0)).unwrap();
349 assert_eq!(
350 program,
351 (
352 Program::from(vec![Char('a'), Char('b'), Match(TerminalId(0))]),
353 0
354 )
355 );
356 }
357
358 #[test]
359 fn build_option() {
360 use Instruction::*;
361 let program = compile("a|b", TerminalId(0)).unwrap();
362 assert_eq!(
363 program,
364 (
365 Program::from(vec![
366 Split(InstructionPointer(1), InstructionPointer(3)),
367 Char('a'),
368 Jump(InstructionPointer(4)),
369 Char('b'),
370 Match(TerminalId(0))
371 ]),
372 0
373 )
374 );
375 }
376
377 #[test]
378 fn build_optional() {
379 use Instruction::*;
380 let program = compile("a?", TerminalId(0)).unwrap();
381 assert_eq!(
382 program,
383 (
384 Program::from(vec![
385 Split(InstructionPointer(1), InstructionPointer(2)),
386 Char('a'),
387 Match(TerminalId(0))
388 ]),
389 0
390 )
391 );
392 }
393
394 #[test]
395 fn build_kleene_star() {
396 use Instruction::*;
397 let program = compile("a*", TerminalId(0)).unwrap();
398 assert_eq!(
399 program,
400 (
401 Program::from(vec![
402 Split(InstructionPointer(1), InstructionPointer(3)),
403 Char('a'),
404 Jump(InstructionPointer(0)),
405 Match(TerminalId(0))
406 ]),
407 0
408 )
409 );
410 }
411
412 #[test]
413 fn build_repetition() {
414 use Instruction::*;
415 let program = compile("a+", TerminalId(0)).unwrap();
416 assert_eq!(
417 program,
418 (
419 Program::from(vec![
420 Char('a'),
421 Split(InstructionPointer(0), InstructionPointer(2)),
422 Match(TerminalId(0))
423 ]),
424 0
425 )
426 );
427 }
428
429 #[test]
430 fn build_any() {
431 use Instruction::*;
432 let program = compile(".", TerminalId(0)).unwrap();
433 assert_eq!(program, (Program::from(vec![Any, Match(TerminalId(0))]), 0));
434 }
435
436 #[test]
437 fn build_groups() {
438 use Instruction::*;
439 let program = compile("(a+)(b+)", TerminalId(0)).unwrap();
440 assert_eq!(
441 program,
442 (
443 Program::from(vec![
444 Save(0),
445 Char('a'),
446 Split(InstructionPointer(1), InstructionPointer(3)),
447 Save(1),
448 Save(2),
449 Char('b'),
450 Split(InstructionPointer(5), InstructionPointer(7)),
451 Save(3),
452 Match(TerminalId(0))
453 ]),
454 2
455 )
456 )
457 }
458
459 #[test]
460 fn build_string() {
461 use std::collections::Bound::Included;
462 use Instruction::*;
463 let program = compile(r"'(([^'\\]|\\[^\\]|\\\\)*)'", TerminalId(0)).unwrap();
464 let mut first_char_class = IntervalTree::default();
465 first_char_class.insert((Included('\''), Included('\'')));
466 first_char_class.insert((Included('\\'), Included('\\')));
467 let mut second_char_class = IntervalTree::default();
468 second_char_class.insert((Included('\\'), Included('\\')));
469 let correct = Program::from(vec![
470 Char('\''),
471 Save(0),
472 Split(InstructionPointer(3), InstructionPointer(15)),
473 Save(2),
474 Split(InstructionPointer(5), InstructionPointer(11)),
475 Split(InstructionPointer(6), InstructionPointer(8)),
476 CharacterClass(first_char_class, true),
477 Jump(InstructionPointer(10)),
478 Char('\\'),
479 CharacterClass(second_char_class, true),
480 Jump(InstructionPointer(13)),
481 Char('\\'),
482 Char('\\'),
483 Save(3),
484 Jump(InstructionPointer(2)),
485 Save(1),
486 Char('\''),
487 Match(TerminalId(0)),
488 ]);
489 assert_eq!(program, (correct, 2));
490 }
491}
492
493#[cfg_attr(test, derive(PartialEq))]
497#[derive(Debug)]
498pub enum Regex {
499 Char(char),
500 Option(Box<Regex>, Box<Regex>),
501 Optional(Box<Regex>),
502 Repetition(Box<Regex>),
503 KleeneStar(Box<Regex>),
504 Concat(Box<Regex>, Box<Regex>),
505 Group(Box<Regex>, usize),
506 CharacterClass(IntervalTree<char>, bool),
507 WordChar,
508 Digit,
509 Whitespace,
510 WordBoundary,
511 EOF,
512 Any,
513 Empty,
514}
515
516#[derive(Debug, PartialEq, Eq)]
522pub struct RegexError {
523 pub position: usize,
524 pub message: String,
525}
526
527impl From<(Regex, Option<Regex>)> for Regex {
528 fn from(t: (Regex, Option<Regex>)) -> Self {
529 match t {
530 (r, Some(r2)) => Regex::Option(Box::new(r2), Box::new(r)),
531 (r, None) => r,
532 }
533 }
534}
535
536pub fn build(regex: Regex, program: &mut Program) {
545 match regex {
546 Regex::Char(c) => {
547 program.push(Instruction::Char(c));
548 }
549 Regex::Option(r1, r2) => {
550 let split_pos = InstructionPointer(program.len());
551 program.push(Instruction::Split(
552 InstructionPointer(0),
553 InstructionPointer(0),
554 ));
555 build(*r1, program);
556 let jmp_pos = program.len_ip();
557 program.push(Instruction::Jump(InstructionPointer(0)));
558 build(*r2, program);
559 program[split_pos] = Instruction::Split(split_pos.incr(), jmp_pos.incr());
560 program[jmp_pos] = Instruction::Jump(program.len_ip());
561 }
562 Regex::Concat(r1, r2) => {
563 build(*r1, program);
564 build(*r2, program);
565 }
566 Regex::Optional(r) => {
567 let split_pos = program.len_ip();
568 program.push(Instruction::Split(
569 InstructionPointer(0),
570 InstructionPointer(0),
571 ));
572 build(*r, program);
573 program[split_pos] = Instruction::Split(split_pos.incr(), program.len_ip());
574 }
575 Regex::KleeneStar(r) => {
576 let split_pos = program.len_ip();
577 program.push(Instruction::Split(
578 InstructionPointer(0),
579 InstructionPointer(0),
580 ));
581 build(*r, program);
582 program.push(Instruction::Jump(split_pos));
583 program[split_pos] = Instruction::Split(split_pos.incr(), program.len_ip());
584 }
585 Regex::Repetition(r) => {
586 let init_pos = program.len_ip();
587 build(*r, program);
588 program.push(Instruction::Split(init_pos, program.len_ip().incr()));
589 }
590 Regex::Group(r, i) => {
591 program.push(Instruction::Save(2 * i));
592 build(*r, program);
593 program.push(Instruction::Save(2 * i + 1));
594 }
595 Regex::Any => program.push(Instruction::Any),
596 Regex::WordChar => program.push(Instruction::WordChar),
597 Regex::Digit => program.push(Instruction::Digit),
598 Regex::WordBoundary => program.push(Instruction::WordBoundary),
599 Regex::CharacterClass(class, negated) => {
600 program.push(Instruction::CharacterClass(class, negated))
601 }
602 Regex::EOF => program.push(Instruction::EOF),
603 Regex::Whitespace => program.push(Instruction::Whitespace),
604 Regex::Empty => {}
605 };
606}
607
608pub fn read(regex: &str, mut groups: usize) -> Result<(Regex, usize), RegexError> {
612 fn read_char_class(
614 input: &mut std::iter::Enumerate<std::str::Chars<'_>>,
615 size: usize,
616 actual: usize,
617 ) -> Result<Regex, RegexError> {
618 use std::ops::{Bound, Bound::Included};
619 fn insert(
620 interval: (Bound<char>, Bound<char>),
621 tree: &mut IntervalTree<char>,
622 overlaps: &mut Vec<Vec<(char, char)>>,
623 ) {
624 let over = tree.get_interval_overlaps(&interval);
625 if !over.is_empty() {
626 overlaps.push(
627 over.into_iter()
628 .chain(std::iter::once(&interval))
629 .map(|(start, end)| {
630 let s = match start {
631 Included(s) => *s,
632 _ => panic!(),
633 };
634 let e = match end {
635 Included(e) => *e,
636 _ => panic!(),
637 };
638 (s, e)
639 })
640 .collect(),
641 );
642 }
643 tree.insert(interval);
644 }
645 let mut tree = IntervalTree::default();
646 let mut last = None;
647 let mut overlaps = Vec::new();
648 let mut negated = false;
649
650 fn read_escaped_char(
651 input: &mut Enumerate<Chars<'_>>,
652 pos: usize,
653 ) -> Result<char, RegexError> {
654 if let Some((pos, c)) = input.next() {
655 match c {
656 ']' => Ok(c),
657 't' => Ok('\t'),
658 'n' => Ok('\n'),
659 '^' => Ok('^'),
660 '-' => Ok('-'),
661 '\\' => Ok('\\'),
662 _ => {
663 Err(RegexError {
664 position: pos,
665 message: format!(
666 "Cannot escape character inside character class {}, try replacing /\\{}/ with /{}/",
667 c, c, c
668 ),
669 })
670 }
671 }
672 } else {
673 Err(RegexError {
674 position: pos,
675 message: String::from(
676 "Expected something after '\', but found EOF instead",
677 ),
678 })
679 }
680 }
681
682 while let Some((pos, chr)) = input.next() {
683 match chr {
684 '\\' => {
685 if let Some(cr) = last {
686 insert((Included(cr), Included(cr)), &mut tree, &mut overlaps);
687 }
688 last = Some(read_escaped_char(input, pos)?);
689 }
690 '^' if pos == actual + 1 => {
691 negated = true;
692 }
693 '-' => {
694 if let Some(c1) = last {
695 if let Some((_, c2)) = input.next() {
696 let chr = if c2 == '\\' {
697 read_escaped_char(input, pos)?
698 } else {
699 c2
700 };
701 insert((Included(c1), Included(chr)), &mut tree, &mut overlaps);
702 } else {
703 return Err(RegexError {
704 position: pos,
705 message: String::from(
706 "Expected something after '-', but found EOF instead",
707 ),
708 });
709 }
710 last = None;
711 } else {
712 insert((Included('-'), Included('-')), &mut tree, &mut overlaps);
713 }
714 }
715 ']' => {
716 if let Some(c) = last {
717 insert((Included(c), Included(c)), &mut tree, &mut overlaps);
718 }
719 if !overlaps.is_empty() {
720 let mut err_str = format!("Found {} overlaps in character class from position {} to position {}: ", overlaps.len(), actual, pos+1);
721 for overlap in overlaps {
722 err_str.push('(');
723 for (s, e) in overlap {
724 err_str.push(s);
725 err_str.push('-');
726 err_str.push(e);
727 err_str.push(',');
728 }
729 err_str.pop();
730 err_str.push_str("), ");
731 }
732 err_str.pop();
733 err_str.pop();
734 return Err(RegexError {
735 position: pos + 1,
736 message: err_str,
737 });
738 } else {
739 return Ok(Regex::CharacterClass(tree, negated));
740 }
741 }
742 c => {
743 if let Some(cr) = last {
744 insert((Included(cr), Included(cr)), &mut tree, &mut overlaps);
745 }
746 last = Some(c);
747 }
748 }
749 }
750 Err(RegexError {position: size, message: format!("Expected end of character class, but found EOF. Try adding ']' if you really meant to have a character class, or adding '\\' at position {} if you didn't want a character class", actual)})
751 }
752
753 fn add(regex: Regex, stack: &mut Vec<(Regex, Option<Regex>, usize)>) {
754 let (last, remainder, group) = stack.pop().unwrap();
755 stack.push((concat(last, regex), remainder, group));
756 }
757
758 fn concat(left: Regex, right: Regex) -> Regex {
759 if let Regex::Empty = left {
760 right
761 } else {
762 Regex::Concat(Box::new(left), Box::new(right))
763 }
764 }
765
766 fn kleene_star(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
767 match exp {
768 Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(kleene_star(*r2, pos)?))),
769 Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(kleene_star(*r2, pos)?))),
770 Regex::Empty => Err(RegexError {
771 position: pos,
772 message: String::from("Cannot apply kleene star to empty regex."),
773 }),
774 Regex::KleeneStar(..) => Err(RegexError {
775 position: pos,
776 message: String::from("Cannot apply kleene star twice."),
777 }),
778 Regex::Optional(..) => Err(RegexError {
779 position: pos,
780 message: String::from("Cannot apply kleene star on an optional group."),
781 }),
782 Regex::Repetition(..) => Err(RegexError {
783 position: pos,
784 message: String::from("Cannot apply kleene star and repetition."),
785 }),
786 r => Ok(Regex::KleeneStar(Box::new(r))),
787 }
788 }
789
790 fn repetition(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
791 match exp {
792 Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(repetition(*r2, pos)?))),
793 Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(repetition(*r2, pos)?))),
794 Regex::Empty => Err(RegexError {
795 position: pos,
796 message: String::from("Cannot apply repetition to empty regex."),
797 }),
798 Regex::KleeneStar(..) => Err(RegexError {
799 position: pos,
800 message: String::from("Cannot apply repetition twice."),
801 }),
802 Regex::Optional(..) => Err(RegexError {
803 position: pos,
804 message: String::from("Cannot apply repetition on an optional group."),
805 }),
806 Regex::Repetition(..) => Err(RegexError {
807 position: pos,
808 message: String::from("Cannot apply repetition and kleene star."),
809 }),
810 r => Ok(Regex::Repetition(Box::new(r))),
811 }
812 }
813
814 fn optional(exp: Regex, pos: usize) -> Result<Regex, RegexError> {
815 match exp {
816 Regex::Concat(r1, r2) => Ok(Regex::Concat(r1, Box::new(optional(*r2, pos)?))),
817 Regex::Option(r1, r2) => Ok(Regex::Option(r1, Box::new(optional(*r2, pos)?))),
818 Regex::Empty => Err(RegexError {
819 position: pos,
820 message: String::from("Cannot apply optional to empty regex."),
821 }),
822 Regex::KleeneStar(..) => Err(RegexError {
823 position: pos,
824 message: String::from("Non-greedy kleene star is not supported."),
825 }),
826 Regex::Optional(..) => Err(RegexError {
827 position: pos,
828 message: String::from("Non-greedy optional is not supported."),
829 }),
830 Regex::Repetition(..) => Err(RegexError {
831 position: pos,
832 message: String::from("Non-greedy repetition is not supported."),
833 }),
834 r => Ok(Regex::Optional(Box::new(r))),
835 }
836 }
837
838 let mut stack = vec![(Regex::Empty, None, 0)];
839 let mut chrs = regex.chars().enumerate();
840 let size = regex.chars().count();
841 while let Some((pos, chr)) = chrs.next() {
842 match chr {
843 '(' => {
844 stack.push((Regex::Empty, None, groups));
845 groups += 1;
846 }
847 ')' => {
848 if stack.len() <= 1 {
849 return Err(RegexError {
850 position: pos,
851 message: String::from("Closing parenthesis doesn't match any previously opened."),
852 });
853 }
854 {
855 let (last, remainder, nb_group) = stack.pop().unwrap();
856 let group = Regex::Group(Box::new((last, remainder).into()), nb_group);
857 let (last, remainder, nb_group) = stack.pop().unwrap();
858 stack.push((concat(last, group), remainder, nb_group));
859 }
860 }
861 '?' => {
862 let (last, remainder, nb_group) = stack.pop().unwrap();
863 stack.push((optional(last, pos)?, remainder, nb_group));
864 }
865 '*' => {
866 let (last, remainder, nb_group) = stack.pop().unwrap();
867 stack.push((kleene_star(last, pos)?, remainder, nb_group));
868 }
869 '+' => {
870 let (last, remainder, nb_group) = stack.pop().unwrap();
871 stack.push((repetition(last, pos)?, remainder, nb_group));
872 }
873 '|' => {
874 let (l, remainder, group) = stack.pop().unwrap();
875 let last = (l, remainder).into();
876 stack.push((Regex::Empty, Some(last), group));
877 }
878 '$' => return Err(RegexError {
879 position: pos,
880 message: String::from("End-of-line /$/ is not supported. Matches are anchored anyways. Try /\\n/ instead.")
881 }),
882 '^' => return Err(RegexError {
883 position: pos,
884 message: String::from("Start-of-line /^/ is not supported. Matches are anchored anyways.")
885 }),
886 ']' => return Err(RegexError {
887 position: pos,
888 message: String::from("Closing bracket doesn't match any previsouly opened."),
889 }),
890 '.' => add(Regex::Any, &mut stack),
891 '\\' => {
892 if let Some((pos, chr)) = chrs.next() {
893 match chr {
894 '\\' | '.' | '(' | ')' | '?' | '+' | '*' | '|' | '$' | '^' | '[' | ']' => add(Regex::Char(chr), &mut stack),
895 'A' => return Err(RegexError {
896 position: pos,
897 message: String::from("Start of the string anchor /\\A/ is not supported.")
898 }),
899 'N' => return Err(RegexError {
900 position: pos,
901 message: String::from("Not a line break shorthand /\\N/ is not supported. Try /[^\\n]/ instead.")
902 }),
903 'Z' => add(Regex::EOF, &mut stack),
904 'b' => add(Regex::WordBoundary, &mut stack),
905 'd' => add(Regex::Digit, &mut stack),
906 'h' => return Err(RegexError {
907 position: pos,
908 message: String::from("Horizontal whitespace / hexadecimal digit shorthand /\\h/ is not supported. Try [0-9a-f] instead if you wanted hexadecimal digit.")
909 }),
910 'l' => return Err(RegexError {
911 position: pos,
912 message: String::from("Lowercase shorthand /\\l/ is not supported. Try /[a-z]/ instead.")
913 }),
914 'n' => add(Regex::Char('\n'), &mut stack),
915 'r' => return Err(RegexError {
916 position: pos,
917 message: String::from("Nonsense EOL /\\r/ is not supported. Try /\\n/ instead.")
918 }),
919 's' => add(Regex::Whitespace, &mut stack),
920 't' => add(Regex::Char('\t'), &mut stack),
921 'u' => return Err(RegexError {
922 position: pos,
923 message: String::from("Uppercase shorthand /\\u/ is not supported. Try /[A-Z]/ instead.")
924 }),
925 'v' => return Err(RegexError {
926 position: pos,
927 message: String::from("Vertical whitespace shorthand /\\v/ is not supported.")
928 }),
929 'w' => add(Regex::WordChar, &mut stack),
930 'x' => return Err(RegexError {
931 position: pos,
932 message: String::from("Hexadecimal escape /\\xFF/ is not supported.")
933 }),
934 'z' => add(Regex::EOF, &mut stack),
935 _ => {
936 return Err(RegexError {
937 position: pos,
938 message: format!(
939 "Cannot escape character {}, try replacing /\\{}/ with /{}/",
940 chr, chr, chr
941 ),
942 })
943 }
944 }
945 } else {
946 return Err(RegexError {
947 position: pos,
948 message: String::from(
949 "Expected something after character '\', but found EOF instead",
950 ),
951 });
952 }
953 }
954 '[' => {
955 add(read_char_class(&mut chrs, size, pos)?, &mut stack);
956 }
957 c => add(Regex::Char(c), &mut stack),
958 }
959 }
960 if stack.len() != 1 {
961 Err(RegexError {
962 position: size,
963 message: format!(
964 "Expected {} closing parenthesis, found EOF",
965 stack.len() - 1
966 ),
967 })
968 } else {
969 Ok((
970 {
971 let (last, remainder, _) = stack.pop().unwrap();
972 (last, remainder).into()
973 },
974 groups,
975 ))
976 }
977}