1use log::trace;
2use std::{iter::Peekable, str::Chars};
3
4mod unicode;
5mod unicode_tables;
6
7#[derive(Debug)]
8pub struct Error {
9 pub msg: String,
10 pub idx: usize,
11}
12
13impl std::fmt::Display for Error {
14 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
15 write!(f, "{} at {}", self.msg, self.idx)
16 }
17}
18
19impl std::error::Error for Error {}
20
21impl Error {
22 fn new(idx: usize, msg: &str) -> Self {
23 Self {
24 idx,
25 msg: msg.to_string(),
26 }
27 }
28}
29
30pub struct RegexParser<'a> {
31 pattern: &'a str,
32 chars: Peekable<Chars<'a>>,
33 state: State<'a>,
34}
35
36impl<'a> RegexParser<'a> {
37 pub fn new(js: &'a str) -> Result<Self, Error> {
38 if !js.starts_with('/') {
39 return Err(Error::new(
40 0,
41 "regular expression literals must start with a /",
42 ));
43 }
44 let pat_end_idx = if let Some(end_idx) = js.rfind('/') {
45 if end_idx == 0 {
46 return Err(Error::new(0, "regular expression literals must have 2 `/`"));
47 } else {
48 end_idx
49 }
50 } else {
51 return Err(Error::new(0, "regular expression literals must have 2 `/`"));
52 };
53 let pattern = if let Some(pattern) = js.get(1..pat_end_idx) {
54 pattern
55 } else {
56 return Err(Error::new(0, "Invalid regular expression"));
57 };
58 let flags = if let Some(flag_str) = js.get(pat_end_idx + 1..) {
59 let mut flags = RegExFlags::default();
60 for (i, c) in flag_str.chars().enumerate() {
61 flags.add_flag(c, pat_end_idx + i + 1)?;
62 }
63 flags
64 } else {
65 return Err(Error::new(pat_end_idx, "invalid flags"));
66 };
67 Ok(Self {
68 pattern,
69 chars: pattern.chars().peekable(),
70 state: State::new(pattern.len(), flags.unicode),
71 })
72 }
73
74 pub fn validate(&mut self) -> Result<(), Error> {
75 trace!("parse {:?}", self.current());
76 self.pattern()?;
77 if !self.state.n && !self.state.group_names.is_empty() {
78 self.pattern()?;
79 }
80 Ok(())
81 }
82 fn pattern(&mut self) -> Result<(), Error> {
88 trace!("pattern {:?}", self.current(),);
89 if self.state.pos > 0 {
90 self.chars = self.pattern.chars().peekable();
91 self.state.reset();
92 }
93 self.disjunction()?;
94 if self.state.pos != self.state.len {
95 if self.eat(')') {
96 return Err(Error::new(self.state.pos, "Unmatched `)`"));
97 }
98 if self.eat(']') || self.eat('}') {
99 return Err(Error::new(self.state.pos, "Lone quantifier brackets"));
100 }
101 }
102 if self.state.max_back_refs > self.state.num_capturing_parens {
103 return Err(Error::new(self.state.pos, "Invalid escape"));
104 }
105 for name in &self.state.back_ref_names {
106 if !self.state.group_names.contains(name) {
107 return Err(Error::new(
108 self.state.pos,
109 "Invalid named capture referenced",
110 ));
111 }
112 }
113 Ok(())
114 }
115 fn disjunction(&mut self) -> Result<(), Error> {
120 trace!("disjunction {:?}", self.current(),);
121 self.alternative()?;
122 while self.eat('|') {
123 self.alternative()?;
124 }
125 if self.eat_quantifier(true)? {
126 return Err(Error::new(self.state.pos, "Nothing to repeat"));
127 }
128 if self.eat('{') {
129 return Err(Error::new(self.state.pos, "lone quantifier brackets"));
130 }
131 Ok(())
132 }
133 fn alternative(&mut self) -> Result<(), Error> {
138 trace!("alternative {:?}", self.current(),);
139 while self.state.pos < self.state.len && self.eat_term()? {}
140 Ok(())
141 }
142 fn eat_quantifier(&mut self, no_error: bool) -> Result<bool, Error> {
150 trace!("eat_quantifier {:?}", self.current(),);
151 Ok(if self.eat_quantifier_prefix(no_error)? {
152 self.eat('?');
153 true
154 } else {
155 false
156 })
157 }
158 fn eat_quantifier_prefix(&mut self, no_error: bool) -> Result<bool, Error> {
161 trace!("eat_quantifier_prefix {:?}", self.current(),);
162 let ret = self.eat('*')
163 || self.eat('+')
164 || self.eat('?')
165 || self.eat_braced_quantifier(no_error)?;
166 Ok(ret)
167 }
168 fn eat_braced_quantifier(&mut self, no_error: bool) -> Result<bool, Error> {
181 trace!("eat_braced_quantifier {:?}", self.current(),);
182 let start = self.state.pos;
183 if self.eat('{') {
184 if self.eat_digits(10) {
185 let min = self.state.last_int_value;
186 let max = if self.eat(',') && self.eat_digits(10) {
187 self.state.last_int_value
188 } else {
189 None
190 };
191 if self.eat('}') {
192 if let (Some(max), Some(min)) = (max, min) {
193 if max < min && !no_error {
194 return Err(Error::new(
195 self.state.pos,
196 &format!("numbers out of order in {{{},{}}}", min, max),
197 ));
198 }
199 }
200 return Ok(true);
201 }
202 }
203 if self.state.u && !no_error {
204 return Err(Error::new(self.state.pos, "Incomplete quantifier"));
205 }
206 self.reset_to(start);
207 }
208 Ok(false)
209 }
210 fn eat_term(&mut self) -> Result<bool, Error> {
218 trace!("eat_term {:?}", self.current(),);
219 if self.eat_assertion()? {
220 if self.state.last_assert_is_quant && self.eat_quantifier(false)? && self.state.n {
221 return Err(Error::new(self.state.pos, "Invalid quantifier"));
222 }
223 return Ok(true);
224 }
225 if self.state.u {
226 if self.eat_atom()? {
227 self.eat_quantifier(false)?;
228 return Ok(true);
229 }
230 } else if self.eat_extended_atom()? {
231 self.eat_quantifier(false)?;
232 return Ok(true);
233 }
234 Ok(false)
235 }
236 fn eat_atom(&mut self) -> Result<bool, Error> {
243 trace!("eat_atom {:?}", self.current(),);
244 let ret = self.eat_pattern_characters()
245 || self.eat('.')
246 || self.eat_reverse_solidus_atom_escape()?
247 || self.eat_character_class()?
248 || self.eat_uncapturing_group()?
249 || self.eat_capturing_group()?;
250 Ok(ret)
251 }
252 fn eat_extended_atom(&mut self) -> Result<bool, Error> {
255 trace!("eat_extended_atom {:?}", self.current(),);
256 let ret = self.eat('.')
257 || self.eat_reverse_solidus_atom_escape()?
258 || self.eat_character_class()?
259 || self.eat_uncapturing_group()?
260 || self.eat_capturing_group()?
261 || self.eat_invalid_braced_quantifier()?
262 || self.eat_extended_pattern_character();
263 Ok(ret)
264 }
265 fn eat_invalid_braced_quantifier(&mut self) -> Result<bool, Error> {
268 trace!("eat_invalid_braced_quantifier {:?}", self.current(),);
269 if self.eat_braced_quantifier(true)? {
270 return Err(Error::new(self.state.pos, "Nothing to repeat"));
271 }
272 Ok(false)
273 }
274 fn eat_extended_pattern_character(&mut self) -> bool {
277 trace!("eat_extended_pattern_character {:?}", self.current(),);
278 if let Some(ch) = self.chars.peek() {
279 if *ch != '$'
280 && !(*ch >= '(' && *ch <= '+')
281 && *ch != '.'
282 && *ch != '?'
283 && *ch != '['
284 && *ch != '^'
285 && *ch != '|'
286 {
287 self.advance();
288 return true;
289 }
290 }
291 false
292 }
293 fn eat_pattern_characters(&mut self) -> bool {
296 trace!("eat_pattern_characters {:?}", self.current(),);
297 let start = self.state.pos;
298 while let Some(next) = self.chars.peek() {
299 if !Self::is_syntax_ch(*next) {
300 self.advance();
301 } else {
302 break;
303 }
304 }
305 self.state.pos != start
306 }
307 fn is_syntax_ch(ch: char) -> bool {
311 ch == '$'
312 || ch >= '(' && ch <= '+'
313 || ch == '.'
314 || ch == '?'
315 || ch >= '[' && ch <= '^'
316 || ch >= '{' && ch <= '}'
317 }
318
319 fn eat_reverse_solidus_atom_escape(&mut self) -> Result<bool, Error> {
321 trace!("eat_reverse_solidus_atom_escape {:?}", self.current(),);
322 let start = self.state.pos;
323 if self.eat('\\') {
324 if self.eat_atom_escape()? {
325 return Ok(true);
326 }
327 self.reset_to(start);
328 }
329 Ok(false)
330 }
331 fn eat_atom_escape(&mut self) -> Result<bool, Error> {
333 trace!("eat_atom_escape {}", self.state.u,);
334 if self.eat_back_ref()
335 || self.eat_character_class_escape()?
336 || self.eat_character_escape()?
337 || self.state.n && self.eat_k_group_name()?
338 {
339 return Ok(true);
340 }
341 trace!("previous check failed, {}", self.state.u);
342 if self.state.u {
343 trace!("previous all failed, with unicode flag");
344 if let Some(next) = self.current() {
345 if *next == 'c' {
346 return Err(Error::new(self.state.pos, "Invalid unicode escape"));
347 }
348 }
349 trace!("returning error");
350 return Err(Error::new(self.state.pos, "Invalid escape"));
351 }
352 Ok(false)
353 }
354 fn eat_back_ref(&mut self) -> bool {
362 trace!("eat_back_ref {:?}", self.current(),);
363 let start = self.state.pos;
364 if self.eat_decimal_escape() {
365 let n = if let Some(n) = self.state.last_int_value {
366 n
367 } else {
368 return true;
369 };
370 if self.state.u {
371 if n > self.state.max_back_refs {
372 self.state.max_back_refs = n;
373 }
374 return true;
375 }
376 if n <= self.state.num_capturing_parens {
377 return true;
378 }
379 self.reset_to(start);
380 }
381 false
382 }
383 fn eat_decimal_escape(&mut self) -> bool {
385 trace!("eat_decimal_escape {:?}", self.current(),);
386 let start = self.state.pos;
387 let mut last_int_value = 0;
388 while let Some(next) = self.chars.peek() {
389 if let Some(n) = next.to_digit(10) {
390 last_int_value = 10 * last_int_value + n;
391 self.advance()
392 } else {
393 break;
394 }
395 }
396 self.state.last_int_value = Some(last_int_value);
397 self.state.pos != start
398 }
399 fn eat_character_class_escape(&mut self) -> Result<bool, Error> {
404 trace!("eat_character_class_escape {:?}", self.current(),);
405 if let Some(next) = self.chars.peek() {
406 if Self::is_character_class_escape(*next) {
407 self.state.last_int_value = None;
408 self.advance();
409 return Ok(true);
410 }
411 if self.state.u && (*next == 'P' || *next == 'p') {
412 self.state.last_int_value = None;
413 self.advance();
414 if self.eat('{') && self.eat_unicode_property_value_expression()? && self.eat('}') {
415 return Ok(true);
416 }
417 return Err(Error::new(self.state.pos, "Invalid property name"));
418 }
419 }
420 Ok(false)
421 }
422 fn eat_unicode_property_value_expression(&mut self) -> Result<bool, Error> {
425 trace!("eat_unicode_property_value_expression {:?}", self.current(),);
426 let start = self.state.pos;
427 if self.eat_unicode_property_name() && self.eat('=') {
428 let name = self.state.last_string_value;
429 if self.eat_unicode_property_value() {
430 self.validate_unicode_property_name_and_value(
431 &name,
432 &self.state.last_string_value,
433 )?;
434 return Ok(true);
435 }
436 }
437 self.reset_to(start);
438 if self.eat_lone_unicode_property_name_or_value() {
439 self.validate_unicode_property_name_or_value(&self.state.last_string_value)?;
440 return Ok(true);
441 }
442 Ok(false)
443 }
444 fn eat_unicode_property_name(&mut self) -> bool {
452 trace!("eat_unicode_property_name {:?}", self.current(),);
453 let start = self.state.pos;
454 self.state.last_string_value = None;
455 while let Some(ch) = self.chars.peek() {
456 if Self::is_unicode_property_name_character(*ch) {
457 self.advance();
458 } else {
459 break;
460 }
461 }
462 if self.state.pos != start {
463 self.state.last_string_value = self.pattern.get(start..self.state.pos)
464 }
465 self.state.last_string_value.is_some()
466 }
467 fn eat_unicode_property_value(&mut self) -> bool {
470 trace!("eat_unicode_property_value {:?}", self.current(),);
471 let start = self.state.pos;
472 while let Some(next) = self.chars.peek() {
473 if Self::is_unicode_property_value_character(*next) {
474 self.advance();
475 } else {
476 break;
477 }
478 }
479 if start != self.state.pos {
480 self.state.last_string_value = self.pattern.get(start..self.state.pos);
481 }
482 self.state.last_string_value.is_some()
483 }
484 fn eat_lone_unicode_property_name_or_value(&mut self) -> bool {
487 trace!(
488 "eat_lone_unicode_property_name_or_value {:?}",
489 self.current(),
490 );
491 self.eat_unicode_property_value()
492 }
493 fn validate_unicode_property_name_and_value(
496 &self,
497 name: &Option<&'a str>,
498 value: &Option<&'a str>,
499 ) -> Result<(), Error> {
500 if let (Some(name), Some(value)) = (name, value) {
501 if !unicode::validate_name_and_value(name, value) {
502 Err(Error {
503 idx: self.state.pos,
504 msg: format!(
505 "Unable to validate unicode property name and value ({:?} and {:?})",
506 name, value
507 ),
508 })
509 } else {
510 Ok(())
511 }
512 } else {
513 Err(Error {
514 idx: self.state.pos,
515 msg: "Invalid unicode property name & value provided".to_string(),
516 })
517 }
518 }
519 fn validate_unicode_property_name_or_value(
522 &self,
523 name_or_value: &Option<&'a str>,
524 ) -> Result<(), Error> {
525 if let Some(name) = name_or_value {
526 if !unicode::validate_name_or_value(name) {
527 Err(Error {
528 idx: self.state.pos,
529 msg: format!(
530 "Unable to validate unicode property name or value ({:?})",
531 name_or_value
532 ),
533 })
534 } else {
535 Ok(())
536 }
537 } else {
538 Err(Error {
539 idx: self.state.pos,
540 msg: "Invalid unicoe property name or value".to_string(),
541 })
542 }
543 }
544 fn is_unicode_property_name_character(ch: char) -> bool {
546 Self::is_control_letter(ch) || ch == '_'
547 }
548 fn is_unicode_property_value_character(ch: char) -> bool {
550 Self::is_unicode_property_name_character(ch) || ch.is_digit(10)
551 }
552 fn is_control_letter(ch: char) -> bool {
554 (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')
555 }
556 fn is_character_class_escape(ch: char) -> bool {
558 ch == 'd' || ch == 'D' || ch == 's' || ch == 'S' || ch == 'w' || ch == 'W'
559 }
560 fn eat_character_escape(&mut self) -> Result<bool, Error> {
562 trace!("eat_character_escape {:?}", self.current(),);
563 let ret = self.eat_control_escape()
564 || self.eat_c_control_letter()
565 || self.eat_zero()
566 || self.eat_hex_escape_sequence()?
567 || self.eat_unicode_escape_sequence()?
568 || (!self.state.u && self.eat_legacy_octal_escape_sequence())
569 || self.eat_identity_escape();
570 Ok(ret)
571 }
572 fn current(&mut self) -> Option<&char> {
574 self.chars.peek()
575 }
576 fn eat_control_escape(&mut self) -> bool {
582 trace!("eat_control_escape {:?}", self.current(),);
583 if let Some(ch) = self.chars.peek() {
584 match ch {
585 't' => self.state.last_int_value = Some(9),
586 'n' => self.state.last_int_value = Some(10),
587 'v' => self.state.last_int_value = Some(11),
588 'f' => self.state.last_int_value = Some(12),
589 'r' => self.state.last_int_value = Some(13),
590 _ => return false,
591 }
592 self.advance();
593 return true;
594 }
595 false
596 }
597 fn eat_c_control_letter(&mut self) -> bool {
608 trace!("eat_c_control_letter {:?}", self.current(),);
609 let start = self.state.pos;
610 if self.eat('c') {
611 if self.eat_control_letter() {
612 return true;
613 }
614 self.reset_to(start);
615 }
616 false
617 }
618 fn eat_control_letter(&mut self) -> bool {
620 trace!("eat_control_letter {:?}", self.current(),);
621 if let Some(next) = self.chars.peek() {
622 if Self::is_control_letter(*next) {
623 let n: u32 = (*next).into();
624 self.state.last_int_value = Some(n % 0x20);
625 self.advance();
626 return true;
627 }
628 }
629 false
630 }
631 fn eat_zero(&mut self) -> bool {
633 trace!("eat_zero {:?}", self.current(),);
634 if let Some(zero) = self.chars.peek() {
635 if *zero == '0' {
636 self.state.last_int_value = Some(0);
637 self.advance();
638 return true;
639 }
640 }
641 false
642 }
643 fn eat_hex_escape_sequence(&mut self) -> Result<bool, Error> {
645 trace!("eat_hex_escape_sequence {:?}", self.current(),);
646 let start = self.state.pos;
647 if self.eat('x') {
648 if self.eat_fixed_hex_digits(2) {
649 return Ok(true);
650 }
651 if self.state.u {
652 return Err(Error::new(start, "Invalid escape"));
653 }
654 self.reset_to(start)
655 }
656 Ok(false)
657 }
658 fn eat_fixed_hex_digits(&mut self, len: usize) -> bool {
661 trace!("eat_fixed_hex_digits {:?}", self.current(),);
662 let start = self.state.pos;
663 self.state.last_int_value = Some(0);
664 for _ in 0..len {
665 if let Some(n) = self.eat_digit(16) {
666 let last_int_value = self.state.last_int_value.unwrap_or(0);
667 self.state.last_int_value = Some(16 * last_int_value + n);
668 } else {
669 self.reset_to(start);
670 return false;
671 }
672 }
673 true
674 }
675 fn eat_legacy_octal_escape_sequence(&mut self) -> bool {
677 trace!("eat_legacy_octal_escape_sequence {:?}", self.current(),);
678 let last_int_value;
679 if let Some(n1) = self.eat_digit(8) {
680 if let Some(n2) = self.eat_digit(8) {
681 if n1 <= 3 {
682 if let Some(n3) = self.eat_digit(8) {
683 last_int_value = n1 * 64 + n2 * 8 + n3;
684 } else {
685 last_int_value = n1 * 8 + n2;
686 }
687 } else {
688 last_int_value = n1 * 8 + n2;
689 }
690 } else {
691 last_int_value = n1;
692 }
693 self.state.last_int_value = Some(last_int_value);
694 return true;
695 }
696 false
697 }
698 fn eat_digit(&mut self, radix: u32) -> Option<u32> {
701 trace!("eat_digit {:?}", self.current(),);
702 if let Some(next) = self.chars.peek() {
703 if next.is_digit(radix) {
704 let n = next.to_digit(radix);
705 self.advance();
706 return n;
707 }
708 }
709 None
710 }
711
712 fn eat_identity_escape(&mut self) -> bool {
713 trace!("eat_identity_escape {:?}", self.current(),);
714 if self.state.u {
715 if self.eat_syntax_character() {
716 return true;
717 }
718 if self.eat('/') {
719 self.state.last_int_value = Some(0x2f);
720 return true;
721 }
722 return false;
723 }
724 if let Some(ch) = self.chars.peek() {
725 if *ch != 'c' && (!self.state.n || *ch != 'k') {
726 let n = (*ch).into();
727 self.state.last_int_value = Some(n);
728 self.advance();
729 true
730 } else {
731 false
732 }
733 } else {
734 true
735 }
736 }
737 fn eat_syntax_character(&mut self) -> bool {
739 trace!("eat_syntax_character {:?}", self.current(),);
740 if let Some(ch) = self.chars.peek() {
741 if Self::is_syntax_ch(*ch) {
742 self.state.last_int_value = Some((*ch).into());
743 self.advance();
744 return true;
745 }
746 }
747 false
748 }
749 fn eat_unicode_escape_sequence(&mut self) -> Result<bool, Error> {
754 trace!("eat_regex_unicode_escape_sequence {:?}", self.current(),);
755 let start = self.state.pos;
756 if self.eat('u') {
757 if self.eat_fixed_hex_digits(4) {
758 let lead = self.state.last_int_value.unwrap_or(0);
759 if self.state.u && lead >= 0xD800 && lead <= 0xDBFF {
760 let lead_end = self.state.pos;
761 if self.eat('\\') && self.eat('u') && self.eat_fixed_hex_digits(4) {
762 let tail = self.state.last_int_value.unwrap_or(0);
763 if tail >= 0xDC00 && tail <= 0xDFFF {
764 self.state.last_int_value =
765 Some((lead - 0xD800) * 0x400 + (tail - 0xDC00) + 0x10000);
766 return Ok(true);
767 }
768 }
769 self.reset_to(lead_end);
770 self.state.last_int_value = Some(lead);
771 }
772 return Ok(true);
773 }
774 if self.state.u
775 && self.eat('{')
776 && self.eat_digits(16)
777 && self.eat('}')
778 && self
779 .state
780 .last_int_value
781 .map(|v| v <= 0x10_FFFF)
782 .unwrap_or(true)
783 {
784 return Ok(true);
785 }
786
787 if self.state.u {
788 return Err(Error::new(self.state.pos, "Invalid unicode escape"));
789 }
790
791 self.reset_to(start)
792 }
793 Ok(false)
794 }
795 fn eat_character_class(&mut self) -> Result<bool, Error> {
800 trace!("eat_character_class {:?}", self.current(),);
801 if self.eat('[') {
802 self.eat('^');
803 self.class_ranges()?;
804 if self.eat(']') {
805 Ok(true)
806 } else {
807 Err(Error::new(self.state.pos, "Unterminated character class"))
808 }
809 } else {
810 Ok(false)
811 }
812 }
813 fn class_ranges(&mut self) -> Result<(), Error> {
818 trace!("class_ranges {:?}", self.current(),);
819 while self.eat_class_atom()? {
820 let left = self.state.last_int_value;
821 if self.eat('-') && self.eat_class_atom()? {
822 let right = self.state.last_int_value;
823 if self.state.u && (left.is_none() || right.is_none()) {
824 return Err(Error::new(self.state.pos, "Invalid character class"));
825 }
826 if let (Some(left), Some(right)) = (left, right) {
827 if left > right {
828 return Err(Error::new(
829 self.state.pos,
830 &format!(
831 "Range out of order in character class ({} > {})",
832 left, right
833 ),
834 ));
835 }
836 }
837 }
838 }
839 Ok(())
840 }
841 fn eat_class_atom(&mut self) -> Result<bool, Error> {
843 trace!("eat_class_atom {:?}", self.current(),);
844 let start = self.state.pos;
845 if self.eat('\\') {
846 if self.eat_class_escape()? {
847 return Ok(true);
848 }
849 if self.state.u {
850 if let Some(ch) = self.chars.peek() {
851 if *ch == 'c' || ch.is_digit(8) {
852 return Err(Error::new(self.state.pos, "Invalid class escape"));
853 }
854 return Err(Error::new(self.state.pos, "Invalid escape"));
855 }
856 }
857 self.reset_to(start);
858 }
859 if let Some(ch) = self.chars.peek() {
860 if *ch != ']' {
861 self.state.last_int_value = Some((*ch).into());
862 self.advance();
863 return Ok(true);
864 }
865 }
866 Ok(false)
867 }
868 fn eat_class_escape(&mut self) -> Result<bool, Error> {
870 trace!("eat_class_escape {:?}", self.current(),);
871 let start = self.state.pos;
872 if self.eat('b') {
873 self.state.last_int_value = Some(0x08);
874 return Ok(true);
875 }
876 if self.state.u && self.eat('-') {
877 self.state.last_int_value = Some(0x2D);
878 return Ok(true);
879 }
880 if self.state.u && self.eat('c') {
881 if self.eat_class_control_letter() {
882 return Ok(true);
883 }
884 self.reset_to(start);
885 }
886 let ret = self.eat_character_class_escape()? || self.eat_character_escape()?;
887 Ok(ret)
888 }
889 fn eat_class_control_letter(&mut self) -> bool {
891 trace!("eat_class_control_letter {:?}", self.current(),);
892 if let Some(ch) = self.chars.peek() {
893 if ch.is_digit(10) || *ch == '_' {
894 let n: u32 = (*ch).into();
895 self.state.last_int_value = Some(n % 0x20);
896 self.advance();
897 return true;
898 }
899 }
900 false
901 }
902 fn eat_k_group_name(&mut self) -> Result<bool, Error> {
904 trace!("eat_k_group_name {:?}", self.current(),);
905 if self.eat('k') {
906 if self.eat_group_name()? {
907 if let Some(name) = self.state.last_string_value {
908 self.state.back_ref_names.push(name);
909 return Ok(true);
910 }
911 }
912 return Err(Error::new(self.state.pos, "Invalid named reference"));
913 }
914 Ok(false)
915 }
916 fn eat_group_name(&mut self) -> Result<bool, Error> {
918 trace!("eat_group_name {:?}", self.current(),);
919 self.state.last_string_value = None;
920 if self.eat('<') {
921 if self.eat_regex_identifier_name()? && self.eat('>') {
922 return Ok(true);
923 }
924 return Err(Error::new(self.state.pos, "Invalid capture group name"));
925 }
926 Ok(false)
927 }
928 fn eat_regex_identifier_name(&mut self) -> Result<bool, Error> {
930 trace!("eat_regex_identifier_name {:?}", self.current(),);
931 let start = self.state.pos;
932 self.state.last_string_value = None;
933 if self.eat_ident_start()? {
934 while self.eat_ident_part()? {}
935 self.state.last_string_value = Some(&self.pattern[start..self.state.pos]);
936 return Ok(true);
937 }
938 Ok(false)
939 }
940 fn eat_ident_start(&mut self) -> Result<bool, Error> {
942 trace!("eat_ident_start {:?}", self.current(),);
943 let start = self.state.pos;
944 self.state.last_string_value = None;
945 let mut ch = if let Some(ch) = self.chars.peek() {
946 *ch
947 } else {
948 return Ok(false);
949 };
950 self.advance();
951 if ch == '\\' && self.eat_unicode_escape_sequence()? {
952 if let Some(n) = self.state.last_int_value {
953 if let Some(n) = std::char::from_u32(n) {
954 ch = n;
955 }
956 }
957 }
958 if Self::is_id_start(ch) {
959 self.state.last_int_value = Some(ch.into());
960 return Ok(true);
961 }
962 self.reset_to(start);
963 Ok(false)
964 }
965
966 fn eat_ident_part(&mut self) -> Result<bool, Error> {
967 trace!("eat_ident_part {:?}", self.current(),);
968 let start = self.state.pos;
969 let mut ch = if let Some(ch) = self.chars.peek() {
970 *ch
971 } else {
972 return Ok(false);
973 };
974 self.advance();
975 if ch == '\\' && self.eat_unicode_escape_sequence()? {
976 if let Some(n) = self.state.last_int_value {
977 if let Some(n) = std::char::from_u32(n) {
978 ch = n;
979 }
980 }
981 }
982 if Self::is_id_continue(ch) {
983 self.state.last_int_value = Some(ch.into());
984 return Ok(true);
985 }
986 self.reset_to(start);
987 Ok(false)
988 }
989
990 fn is_id_start(ch: char) -> bool {
991 (ch >= 'A' && ch <= 'Z')
992 || (ch >= 'a' && ch <= 'z')
993 || ch == '$'
994 || ch == '_'
995 || unic_ucd_ident::is_id_start(ch)
996 }
997
998 fn is_id_continue(ch: char) -> bool {
999 (ch >= 'A' && ch <= 'Z')
1000 || (ch >= 'a' && ch <= 'z')
1001 || (ch >= '0' && ch <= '9')
1002 || ch == '$'
1003 || ch == '_'
1004 || unic_ucd_ident::is_id_continue(ch)
1005 }
1006
1007 fn eat_uncapturing_group(&mut self) -> Result<bool, Error> {
1008 trace!("eat_uncapturing_group {:?}", self.current(),);
1009 let start = self.state.pos;
1010 if self.eat('(') {
1011 if self.eat('?') && self.eat(':') {
1012 self.disjunction()?;
1013 if self.eat(')') {
1014 return Ok(true);
1015 }
1016 return Err(Error::new(start, "Unterminated group"));
1017 }
1018 self.reset_to(start)
1019 }
1020 Ok(false)
1021 }
1022
1023 fn eat_capturing_group(&mut self) -> Result<bool, Error> {
1024 trace!("eat_capturing_group {:?}", self.current(),);
1025 if self.eat('(') {
1026 self.group_specifier()?;
1027 self.disjunction()?;
1028 if self.eat(')') {
1029 self.state.num_capturing_parens += 1;
1030 Ok(true)
1031 } else {
1032 Err(Error::new(self.state.pos, "Unterminated group"))
1033 }
1034 } else {
1035 Ok(false)
1036 }
1037 }
1038
1039 fn group_specifier(&mut self) -> Result<(), Error> {
1040 trace!("group_specifier {:?}", self.current(),);
1041 if self.eat('?') {
1042 if self.eat_group_name()? {
1043 if let Some(name) = self.state.last_string_value {
1044 if self.state.group_names.contains(&name) {
1045 return Err(Error::new(self.state.pos, "Duplicate capture group name"));
1046 } else {
1047 self.state.group_names.push(name);
1048 return Ok(());
1049 }
1050 }
1051 }
1052 return Err(Error::new(self.state.pos, "Invalid group"));
1053 }
1054 Ok(())
1055 }
1056
1057 fn eat_assertion(&mut self) -> Result<bool, Error> {
1058 trace!("eat_assertion {:?}", self.current(),);
1059 let start = self.state.pos;
1060 self.state.last_assert_is_quant = false;
1061 if self.eat('^') || self.eat('$') {
1062 return Ok(true);
1063 }
1064 if self.eat('\\') {
1065 if self.eat('B') || self.eat('b') {
1066 return Ok(true);
1067 }
1068 self.reset_to(start);
1069 }
1070 if self.eat('(') && self.eat('?') {
1071 let look_behind = self.eat('<');
1072 if self.eat('=') || self.eat('!') {
1073 self.disjunction()?;
1074 if !self.eat(')') {
1075 return Err(Error::new(self.state.pos, "Unterminated group"));
1076 }
1077 self.state.last_assert_is_quant = !look_behind;
1078 return Ok(true);
1079 }
1080 }
1081 self.reset_to(start);
1082 Ok(false)
1083 }
1084
1085 fn eat_digits(&mut self, radix: u32) -> bool {
1086 trace!("eat_digits {:?}", self.current(),);
1087 let start = self.state.pos;
1088 self.state.last_int_value = Some(0);
1089 while let Some(next) = self.chars.peek() {
1090 log::debug!("next digit: {}", next);
1091 if let Some(n) = next.to_digit(radix) {
1092 log::debug!("digit as u32: {}", n);
1093 let last_int_value = self.state.last_int_value.unwrap_or(0);
1094 log::debug!("last_int_value: {}", last_int_value);
1095 self.state.last_int_value = Some(radix * last_int_value + n);
1096 self.advance();
1097 } else {
1098 log::debug!("next not digit");
1099 break;
1100 }
1101 }
1102 self.state.pos != start
1103 }
1104
1105 fn eat(&mut self, ch: char) -> bool {
1106 if let Some(next) = self.chars.peek() {
1107 if *next == ch {
1108 self.advance();
1109 return true;
1110 }
1111 }
1112 false
1113 }
1114
1115 fn advance(&mut self) {
1116 if let Some(ch) = self.chars.next() {
1117 self.state.pos += ch.len_utf8();
1118 log::debug!("adv: {} ({})", ch, self.state.pos);
1119 } else {
1120 log::debug!("adv at end");
1121 }
1122 }
1123
1124 fn reset_to(&mut self, idx: usize) {
1125 let remaining = &self.pattern[idx..];
1126 self.chars = remaining.chars().peekable();
1127 log::debug!("res: {} ({})", self.chars.peek().unwrap_or(&' '), idx);
1128 self.state.pos = idx;
1129 }
1130}
1131
1132struct State<'a> {
1133 pos: usize,
1134 len: usize,
1135 last_int_value: Option<u32>,
1136 last_string_value: Option<&'a str>,
1137 last_assert_is_quant: bool,
1138 num_capturing_parens: u32,
1139 max_back_refs: u32,
1140 group_names: Vec<&'a str>,
1141 back_ref_names: Vec<&'a str>,
1142 n: bool,
1143 u: bool,
1144}
1145
1146impl<'a> State<'a> {
1147 pub fn new(len: usize, u: bool) -> Self {
1148 Self {
1149 pos: 0,
1150 len,
1151 last_int_value: None,
1152 last_string_value: None,
1153 last_assert_is_quant: false,
1154 num_capturing_parens: 0,
1155 max_back_refs: 0,
1156 group_names: Vec::new(),
1157 back_ref_names: Vec::new(),
1158 n: u,
1159 u,
1160 }
1161 }
1162 pub fn reset(&mut self) {
1163 self.pos = 0;
1164 self.last_int_value = None;
1165 self.last_string_value = None;
1166 self.num_capturing_parens = 0;
1167 self.max_back_refs = 0;
1168 self.group_names.clear();
1169 self.back_ref_names.clear();
1170 }
1171}
1172
1173#[derive(Debug)]
1174struct RegExFlags {
1175 case_insensitive: bool,
1176 multi_line: bool,
1177 dot_matches_new_line: bool,
1178 unicode: bool,
1179 global: bool,
1180 sticky: bool,
1181 has_indicies: bool,
1182}
1183
1184impl Default for RegExFlags {
1185 fn default() -> Self {
1186 RegExFlags {
1187 case_insensitive: false,
1188 multi_line: false,
1189 dot_matches_new_line: false,
1190 unicode: false,
1191 global: false,
1192 sticky: false,
1193 has_indicies: false,
1194 }
1195 }
1196}
1197
1198impl RegExFlags {
1199 fn add_flag(&mut self, c: char, pos: usize) -> Result<(), Error> {
1200 match c {
1201 'g' => {
1202 if self.global {
1203 Err(Error::new(pos, "duplicate g flag"))
1204 } else {
1205 self.global = true;
1206 Ok(())
1207 }
1208 }
1209 'i' => {
1210 if self.case_insensitive {
1211 Err(Error::new(pos, "duplicate i flag"))
1212 } else {
1213 self.case_insensitive = true;
1214 Ok(())
1215 }
1216 }
1217 'm' => {
1218 if self.multi_line {
1219 Err(Error::new(pos, "duplicate m flag"))
1220 } else {
1221 self.multi_line = true;
1222 Ok(())
1223 }
1224 }
1225 's' => {
1226 if self.dot_matches_new_line {
1227 Err(Error::new(pos, "duplicate s flag"))
1228 } else {
1229 self.dot_matches_new_line = true;
1230 Ok(())
1231 }
1232 }
1233 'u' => {
1234 if self.unicode {
1235 Err(Error::new(pos, "duplicate u flag"))
1236 } else {
1237 self.unicode = true;
1238 Ok(())
1239 }
1240 }
1241 'y' => {
1242 if self.sticky {
1243 Err(Error::new(pos, "duplicate y flag"))
1244 } else {
1245 self.sticky = true;
1246 Ok(())
1247 }
1248 }
1249 'd' => {
1250 if self.has_indicies {
1251 Err(Error::new(pos, "duplicate d flag"))
1252 } else {
1253 self.has_indicies = true;
1254 Ok(())
1255 }
1256 }
1257 _ => Err(Error::new(pos, &format!("invalid flag {:?}", c))),
1258 }
1259 }
1260}
1261
1262#[cfg(test)]
1263mod tests {
1264 use super::*;
1265 #[test]
1266 fn lots_of_regexes() {
1267 run_test("/asdf|fdsa/g").unwrap();
1268 }
1269 #[test]
1270 #[should_panic = "Invalid escape"]
1271 fn decimal_escape_with_u() {
1272 run_test(r"/\1/u").unwrap()
1273 }
1274
1275 #[test]
1276 #[should_panic = "invalid flag"]
1277 fn invalid_regex_flag() {
1278 run_test("/./G").unwrap();
1279 }
1280
1281 #[test]
1282 #[should_panic = "Nothing to repeat"]
1283 fn bad_look_behind() {
1284 run_test(r"/.(?<=.)?/").unwrap();
1285 }
1286
1287 #[test]
1288 #[should_panic]
1289 fn bad_quant() {
1290 run_test(r"/{2}/").unwrap();
1291 }
1292
1293 #[test]
1294 #[should_panic]
1295 fn id_continue_u() {
1296 run_test(r"/\M/u").unwrap();
1297 }
1298
1299 #[test]
1300 #[should_panic]
1301 fn cant_start_with_star() {
1302 run_test("/*/").unwrap();
1303 }
1304
1305 #[test]
1306 fn unicode_name_and_value() {
1307 for value in unicode_tables::general_category::GC {
1308 run_test(&format!(r"/\p{{General_Category={}}}/u", value))
1309 .expect(&format!("failed at General_category={}", value));
1310 run_test(&format!(r"/\p{{gc={}}}/u", value)).expect(&format!("failed at gc={}", value));
1311 }
1312 for value in unicode_tables::script_values::SCRIPT {
1313 run_test(&format!(r"/\p{{Script={}}}/u", value))
1314 .expect(&format!("failed at Script={}", value));
1315 run_test(&format!(r"/\p{{sc={}}}/u", value)).expect(&format!("failed at sc={}", value));
1316 run_test(&format!(r"/\p{{Script_Extensions={}}}/u", value))
1317 .expect(&format!("failed at Script_Extensions={}", value));
1318 run_test(&format!(r"/\p{{scx={}}}/u", value))
1319 .expect(&format!("failed at scx={}", value));
1320 }
1321 }
1322 #[test]
1323 #[should_panic]
1324 fn unicode_name_and_value_bad_name() {
1325 run_test(r"/\p{junk=Greek}/u").unwrap();
1326 }
1327 #[test]
1328 #[should_panic]
1329 fn unicode_name_and_value_bad_value() {
1330 run_test(r"/\p{General_Category=Geek}/u").unwrap();
1331 }
1332 #[test]
1333 #[should_panic]
1334 fn unicode_name_or_value_bad_value() {
1335 run_test(r"/\p{junk}/u").unwrap();
1336 }
1337 #[test]
1338 fn unicode_name_or_value() {
1339 for value in unicode_tables::GC_AND_BP {
1340 run_test(&format!(r"/\p{{{}}}/u", value)).unwrap();
1341 }
1342 }
1343
1344 #[test]
1345 fn named_group() {
1346 run_test(r"/(?<x>a)|b/").unwrap();
1347 }
1348
1349 #[test]
1350 fn control_range() {
1351 run_test(r"/[\t-\r]/").unwrap();
1352 }
1353
1354 #[test]
1355 fn known_flags() {
1356 let flags = &['d', 'g', 'i', 'm', 's', 'u', 'y'];
1357 for flag in flags {
1358 run_test(&format!("/.+/{}", flag)).unwrap();
1359 }
1360 let all: String = flags.iter().collect();
1361 run_test(&format!("/.+/{}", all)).unwrap();
1362 }
1363
1364 #[test]
1365 fn duplicate_known_flags() {
1366 let flags = &['d', 'g', 'i', 'm', 's', 'u', 'y'];
1367 for flag in flags {
1368 run_test(&format!("/.+/{0}{}0", flag)).unwrap_err();
1369 }
1370 }
1371 #[test]
1372 fn long_or_sequences() {
1373 run_test(r#"/((?:[^BEGHLMOSWYZabcdhmswyz']+)|(?:'(?:[^']|'')*')|(?:G{1,5}|y{1,4}|Y{1,4}|M{1,5}|L{1,5}|w{1,2}|W{1}|d{1,2}|E{1,6}|c{1,6}|a{1,5}|b{1,5}|B{1,5}|h{1,2}|H{1,2}|m{1,2}|s{1,2}|S{1,3}|z{1,4}|Z{1,5}|O{1,4}))([\s\S]*)/"#).unwrap();
1374 }
1375
1376 fn run_test(regex: &str) -> Result<(), Error> {
1377 let _ = pretty_env_logger::try_init();
1378 let mut parser = RegexParser::new(regex)?;
1379 parser.validate()?;
1380 Ok(())
1381 }
1382}