res_regex/
lib.rs

1use log::trace;
2use std::{iter::Peekable, str::Chars};
3
4mod unicode;
5mod unicode_tables;
6
7#[derive(Debug)]
8pub struct Error {
9    pub msg: String,
10    pub idx: usize,
11}
12
13impl std::fmt::Display for Error {
14    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
15        write!(f, "{} at {}", self.msg, self.idx)
16    }
17}
18
19impl std::error::Error for Error {}
20
21impl Error {
22    fn new(idx: usize, msg: &str) -> Self {
23        Self {
24            idx,
25            msg: msg.to_string(),
26        }
27    }
28}
29
30pub struct RegexParser<'a> {
31    pattern: &'a str,
32    chars: Peekable<Chars<'a>>,
33    state: State<'a>,
34}
35
36impl<'a> RegexParser<'a> {
37    pub fn new(js: &'a str) -> Result<Self, Error> {
38        if !js.starts_with('/') {
39            return Err(Error::new(
40                0,
41                "regular expression literals must start with a /",
42            ));
43        }
44        let pat_end_idx = if let Some(end_idx) = js.rfind('/') {
45            if end_idx == 0 {
46                return Err(Error::new(0, "regular expression literals must have 2 `/`"));
47            } else {
48                end_idx
49            }
50        } else {
51            return Err(Error::new(0, "regular expression literals must have 2 `/`"));
52        };
53        let pattern = if let Some(pattern) = js.get(1..pat_end_idx) {
54            pattern
55        } else {
56            return Err(Error::new(0, "Invalid regular expression"));
57        };
58        let flags = if let Some(flag_str) = js.get(pat_end_idx + 1..) {
59            let mut flags = RegExFlags::default();
60            for (i, c) in flag_str.chars().enumerate() {
61                flags.add_flag(c, pat_end_idx + i + 1)?;
62            }
63            flags
64        } else {
65            return Err(Error::new(pat_end_idx, "invalid flags"));
66        };
67        Ok(Self {
68            pattern,
69            chars: pattern.chars().peekable(),
70            state: State::new(pattern.len(), flags.unicode),
71        })
72    }
73
74    pub fn validate(&mut self) -> Result<(), Error> {
75        trace!("parse {:?}", self.current());
76        self.pattern()?;
77        if !self.state.n && !self.state.group_names.is_empty() {
78            self.pattern()?;
79        }
80        Ok(())
81    }
82    /// The primary entry point, `Pattern` is technically
83    /// the target for all the characters inbetween the `/`s
84    /// ```js
85    /// let re = /pattern/
86    /// ```
87    fn pattern(&mut self) -> Result<(), Error> {
88        trace!("pattern {:?}", self.current(),);
89        if self.state.pos > 0 {
90            self.chars = self.pattern.chars().peekable();
91            self.state.reset();
92        }
93        self.disjunction()?;
94        if self.state.pos != self.state.len {
95            if self.eat(')') {
96                return Err(Error::new(self.state.pos, "Unmatched `)`"));
97            }
98            if self.eat(']') || self.eat('}') {
99                return Err(Error::new(self.state.pos, "Lone quantifier brackets"));
100            }
101        }
102        if self.state.max_back_refs > self.state.num_capturing_parens {
103            return Err(Error::new(self.state.pos, "Invalid escape"));
104        }
105        for name in &self.state.back_ref_names {
106            if !self.state.group_names.contains(name) {
107                return Err(Error::new(
108                    self.state.pos,
109                    "Invalid named capture referenced",
110                ));
111            }
112        }
113        Ok(())
114    }
115    /// A disjunction will be items separated by a `|`
116    /// ```js
117    /// let re = /dis|junction/
118    /// ```
119    fn disjunction(&mut self) -> Result<(), Error> {
120        trace!("disjunction {:?}", self.current(),);
121        self.alternative()?;
122        while self.eat('|') {
123            self.alternative()?;
124        }
125        if self.eat_quantifier(true)? {
126            return Err(Error::new(self.state.pos, "Nothing to repeat"));
127        }
128        if self.eat('{') {
129            return Err(Error::new(self.state.pos, "lone quantifier brackets"));
130        }
131        Ok(())
132    }
133    /// An alternative is either side of a `disjunction`
134    /// ```js
135    /// let re = /alt1|alt2/;
136    /// ```
137    fn alternative(&mut self) -> Result<(), Error> {
138        trace!("alternative {:?}", self.current(),);
139        while self.state.pos < self.state.len && self.eat_term()? {}
140        Ok(())
141    }
142    /// a quantifier is appended to an item to say how
143    /// many of that item should exist, this includes `*` (0 or more)
144    /// `+` (1 or more), `?` (0 or 1) or `{1}`, `{1,2}`
145    ///
146    /// ```js
147    /// let re = /s*p+q?a{1}b{1,2}/;
148    /// ```
149    fn eat_quantifier(&mut self, no_error: bool) -> Result<bool, Error> {
150        trace!("eat_quantifier {:?}", self.current(),);
151        Ok(if self.eat_quantifier_prefix(no_error)? {
152            self.eat('?');
153            true
154        } else {
155            false
156        })
157    }
158    /// A prefix is either then characer `*`, `+`, `?` or
159    /// the full braced quantifier `{1} or `{1,2}`
160    fn eat_quantifier_prefix(&mut self, no_error: bool) -> Result<bool, Error> {
161        trace!("eat_quantifier_prefix {:?}", self.current(),);
162        let ret = self.eat('*')
163            || self.eat('+')
164            || self.eat('?')
165            || self.eat_braced_quantifier(no_error)?;
166        Ok(ret)
167    }
168    /// A braced quantifier either 1 or two numbers wrapped in
169    /// curly braces separated by a comma. The first number
170    /// refers to the minimum number of repeated items and the
171    /// second number refers to the maximum. The second number
172    /// is optional
173    ///
174    /// ```js
175    /// let re = /a{1,100}/;
176    /// if (re.text('a'.repeat(101))) {
177    ///     throw new Error('re will only match up to 100 repeated `a`s');
178    /// }
179    /// ```
180    fn eat_braced_quantifier(&mut self, no_error: bool) -> Result<bool, Error> {
181        trace!("eat_braced_quantifier {:?}", self.current(),);
182        let start = self.state.pos;
183        if self.eat('{') {
184            if self.eat_digits(10) {
185                let min = self.state.last_int_value;
186                let max = if self.eat(',') && self.eat_digits(10) {
187                    self.state.last_int_value
188                } else {
189                    None
190                };
191                if self.eat('}') {
192                    if let (Some(max), Some(min)) = (max, min) {
193                        if max < min && !no_error {
194                            return Err(Error::new(
195                                self.state.pos,
196                                &format!("numbers out of order in {{{},{}}}", min, max),
197                            ));
198                        }
199                    }
200                    return Ok(true);
201                }
202            }
203            if self.state.u && !no_error {
204                return Err(Error::new(self.state.pos, "Incomplete quantifier"));
205            }
206            self.reset_to(start);
207        }
208        Ok(false)
209    }
210    /// A term is the body of an `alternate`
211    /// it may include an `assertion` or an `atom`
212    /// or an `atom` followed by a `quantifier`
213    ///
214    /// ```js
215    /// let re = /term/
216    /// ```
217    fn eat_term(&mut self) -> Result<bool, Error> {
218        trace!("eat_term {:?}", self.current(),);
219        if self.eat_assertion()? {
220            if self.state.last_assert_is_quant && self.eat_quantifier(false)? && self.state.n {
221                return Err(Error::new(self.state.pos, "Invalid quantifier"));
222            }
223            return Ok(true);
224        }
225        if self.state.u {
226            if self.eat_atom()? {
227                self.eat_quantifier(false)?;
228                return Ok(true);
229            }
230        } else if self.eat_extended_atom()? {
231            self.eat_quantifier(false)?;
232            return Ok(true);
233        }
234        Ok(false)
235    }
236    /// An atom is a single character or representative
237    /// set of characters. This includes things like
238    /// groups and classes
239    /// ```js
240    /// let re = /a(b)[a-b]/;
241    /// ```
242    fn eat_atom(&mut self) -> Result<bool, Error> {
243        trace!("eat_atom {:?}", self.current(),);
244        let ret = self.eat_pattern_characters()
245            || self.eat('.')
246            || self.eat_reverse_solidus_atom_escape()?
247            || self.eat_character_class()?
248            || self.eat_uncapturing_group()?
249            || self.eat_capturing_group()?;
250        Ok(ret)
251    }
252    /// An extended version of the normal `atom`, this includes
253    /// exotic classes and groups
254    fn eat_extended_atom(&mut self) -> Result<bool, Error> {
255        trace!("eat_extended_atom {:?}", self.current(),);
256        let ret = self.eat('.')
257            || self.eat_reverse_solidus_atom_escape()?
258            || self.eat_character_class()?
259            || self.eat_uncapturing_group()?
260            || self.eat_capturing_group()?
261            || self.eat_invalid_braced_quantifier()?
262            || self.eat_extended_pattern_character();
263        Ok(ret)
264    }
265    /// attempts to consume a braced quantifier
266    /// in an invalid position.
267    fn eat_invalid_braced_quantifier(&mut self) -> Result<bool, Error> {
268        trace!("eat_invalid_braced_quantifier {:?}", self.current(),);
269        if self.eat_braced_quantifier(true)? {
270            return Err(Error::new(self.state.pos, "Nothing to repeat"));
271        }
272        Ok(false)
273    }
274    /// extended pattern characters include symbols
275    /// like `(` or `|`
276    fn eat_extended_pattern_character(&mut self) -> bool {
277        trace!("eat_extended_pattern_character {:?}", self.current(),);
278        if let Some(ch) = self.chars.peek() {
279            if *ch != '$'
280                && !(*ch >= '(' && *ch <= '+')
281                && *ch != '.'
282                && *ch != '?'
283                && *ch != '['
284                && *ch != '^'
285                && *ch != '|'
286            {
287                self.advance();
288                return true;
289            }
290        }
291        false
292    }
293    /// A pattern character is any non-syntax
294    /// character
295    fn eat_pattern_characters(&mut self) -> bool {
296        trace!("eat_pattern_characters {:?}", self.current(),);
297        let start = self.state.pos;
298        while let Some(next) = self.chars.peek() {
299            if !Self::is_syntax_ch(*next) {
300                self.advance();
301            } else {
302                break;
303            }
304        }
305        self.state.pos != start
306    }
307    /// Syntax characters are operators
308    /// that have special meanin in a regular expression
309    /// like `?` or `.`
310    fn is_syntax_ch(ch: char) -> bool {
311        ch == '$'
312            || ch >= '(' && ch <= '+'
313            || ch == '.'
314            || ch == '?'
315            || ch >= '[' && ch <= '^'
316            || ch >= '{' && ch <= '}'
317    }
318
319    /// a reverse solidus is a really fancy name for `\`
320    fn eat_reverse_solidus_atom_escape(&mut self) -> Result<bool, Error> {
321        trace!("eat_reverse_solidus_atom_escape {:?}", self.current(),);
322        let start = self.state.pos;
323        if self.eat('\\') {
324            if self.eat_atom_escape()? {
325                return Ok(true);
326            }
327            self.reset_to(start);
328        }
329        Ok(false)
330    }
331    /// Picking up after a `\`
332    fn eat_atom_escape(&mut self) -> Result<bool, Error> {
333        trace!("eat_atom_escape {}", self.state.u,);
334        if self.eat_back_ref()
335            || self.eat_character_class_escape()?
336            || self.eat_character_escape()?
337            || self.state.n && self.eat_k_group_name()?
338        {
339            return Ok(true);
340        }
341        trace!("previous check failed, {}", self.state.u);
342        if self.state.u {
343            trace!("previous all failed, with unicode flag");
344            if let Some(next) = self.current() {
345                if *next == 'c' {
346                    return Err(Error::new(self.state.pos, "Invalid unicode escape"));
347                }
348            }
349            trace!("returning error");
350            return Err(Error::new(self.state.pos, "Invalid escape"));
351        }
352        Ok(false)
353    }
354    /// A back reference is a reference to a
355    /// previous capture group
356    /// ```js
357    /// let re = /(abc)\1/;
358    /// ```
359    ///
360    /// in the above, we would match "abcabc" only
361    fn eat_back_ref(&mut self) -> bool {
362        trace!("eat_back_ref {:?}", self.current(),);
363        let start = self.state.pos;
364        if self.eat_decimal_escape() {
365            let n = if let Some(n) = self.state.last_int_value {
366                n
367            } else {
368                return true;
369            };
370            if self.state.u {
371                if n > self.state.max_back_refs {
372                    self.state.max_back_refs = n;
373                }
374                return true;
375            }
376            if n <= self.state.num_capturing_parens {
377                return true;
378            }
379            self.reset_to(start);
380        }
381        false
382    }
383    /// an escaped decimal number
384    fn eat_decimal_escape(&mut self) -> bool {
385        trace!("eat_decimal_escape {:?}", self.current(),);
386        let start = self.state.pos;
387        let mut last_int_value = 0;
388        while let Some(next) = self.chars.peek() {
389            if let Some(n) = next.to_digit(10) {
390                last_int_value = 10 * last_int_value + n;
391                self.advance()
392            } else {
393                break;
394            }
395        }
396        self.state.last_int_value = Some(last_int_value);
397        self.state.pos != start
398    }
399    /// An escaped character class
400    /// this include `\d`, `\s`, and `\w`
401    /// if the regex has the `u` flag, it would also
402    /// include `\p{General_Category=Greek}`
403    fn eat_character_class_escape(&mut self) -> Result<bool, Error> {
404        trace!("eat_character_class_escape {:?}", self.current(),);
405        if let Some(next) = self.chars.peek() {
406            if Self::is_character_class_escape(*next) {
407                self.state.last_int_value = None;
408                self.advance();
409                return Ok(true);
410            }
411            if self.state.u && (*next == 'P' || *next == 'p') {
412                self.state.last_int_value = None;
413                self.advance();
414                if self.eat('{') && self.eat_unicode_property_value_expression()? && self.eat('}') {
415                    return Ok(true);
416                }
417                return Err(Error::new(self.state.pos, "Invalid property name"));
418            }
419        }
420        Ok(false)
421    }
422    /// After an escaped p (`\p{`), with unicode enabled would
423    /// allow for unicode category classes
424    fn eat_unicode_property_value_expression(&mut self) -> Result<bool, Error> {
425        trace!("eat_unicode_property_value_expression {:?}", self.current(),);
426        let start = self.state.pos;
427        if self.eat_unicode_property_name() && self.eat('=') {
428            let name = self.state.last_string_value;
429            if self.eat_unicode_property_value() {
430                self.validate_unicode_property_name_and_value(
431                    &name,
432                    &self.state.last_string_value,
433                )?;
434                return Ok(true);
435            }
436        }
437        self.reset_to(start);
438        if self.eat_lone_unicode_property_name_or_value() {
439            self.validate_unicode_property_name_or_value(&self.state.last_string_value)?;
440            return Ok(true);
441        }
442        Ok(false)
443    }
444    /// This will be one of the following
445    ///  * `General_Category`
446    ///  * `gc`
447    ///  * `Script`
448    ///  * `sc`
449    ///  * `Script_Extensions`
450    ///  * `scx`
451    fn eat_unicode_property_name(&mut self) -> bool {
452        trace!("eat_unicode_property_name {:?}", self.current(),);
453        let start = self.state.pos;
454        self.state.last_string_value = None;
455        while let Some(ch) = self.chars.peek() {
456            if Self::is_unicode_property_name_character(*ch) {
457                self.advance();
458            } else {
459                break;
460            }
461        }
462        if self.state.pos != start {
463            self.state.last_string_value = self.pattern.get(start..self.state.pos)
464        }
465        self.state.last_string_value.is_some()
466    }
467    /// This should match a value in the corresponding
468    /// category lists
469    fn eat_unicode_property_value(&mut self) -> bool {
470        trace!("eat_unicode_property_value {:?}", self.current(),);
471        let start = self.state.pos;
472        while let Some(next) = self.chars.peek() {
473            if Self::is_unicode_property_value_character(*next) {
474                self.advance();
475            } else {
476                break;
477            }
478        }
479        if start != self.state.pos {
480            self.state.last_string_value = self.pattern.get(start..self.state.pos);
481        }
482        self.state.last_string_value.is_some()
483    }
484    /// This could be any General_Category or Binary Property
485    /// entry
486    fn eat_lone_unicode_property_name_or_value(&mut self) -> bool {
487        trace!(
488            "eat_lone_unicode_property_name_or_value {:?}",
489            self.current(),
490        );
491        self.eat_unicode_property_value()
492    }
493    /// Validates that the name and value
494    /// are valid
495    fn validate_unicode_property_name_and_value(
496        &self,
497        name: &Option<&'a str>,
498        value: &Option<&'a str>,
499    ) -> Result<(), Error> {
500        if let (Some(name), Some(value)) = (name, value) {
501            if !unicode::validate_name_and_value(name, value) {
502                Err(Error {
503                    idx: self.state.pos,
504                    msg: format!(
505                        "Unable to validate unicode property name and value ({:?} and {:?})",
506                        name, value
507                    ),
508                })
509            } else {
510                Ok(())
511            }
512        } else {
513            Err(Error {
514                idx: self.state.pos,
515                msg: "Invalid unicode property name & value provided".to_string(),
516            })
517        }
518    }
519    /// Validates that a lone name or value
520    /// is valid
521    fn validate_unicode_property_name_or_value(
522        &self,
523        name_or_value: &Option<&'a str>,
524    ) -> Result<(), Error> {
525        if let Some(name) = name_or_value {
526            if !unicode::validate_name_or_value(name) {
527                Err(Error {
528                    idx: self.state.pos,
529                    msg: format!(
530                        "Unable to validate unicode property name or value ({:?})",
531                        name_or_value
532                    ),
533                })
534            } else {
535                Ok(())
536            }
537        } else {
538            Err(Error {
539                idx: self.state.pos,
540                msg: "Invalid unicoe property name or value".to_string(),
541            })
542        }
543    }
544    /// This will be any control letter plus `_`
545    fn is_unicode_property_name_character(ch: char) -> bool {
546        Self::is_control_letter(ch) || ch == '_'
547    }
548    /// This will be any name character plus and decimal digit
549    fn is_unicode_property_value_character(ch: char) -> bool {
550        Self::is_unicode_property_name_character(ch) || ch.is_digit(10)
551    }
552    /// Any capital or lowercase english character
553    fn is_control_letter(ch: char) -> bool {
554        (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')
555    }
556    /// `d`, `D`, `s`, `S`, `w`, `W`
557    fn is_character_class_escape(ch: char) -> bool {
558        ch == 'd' || ch == 'D' || ch == 's' || ch == 'S' || ch == 'w' || ch == 'W'
559    }
560    /// This would consume any valid character after a `\`
561    fn eat_character_escape(&mut self) -> Result<bool, Error> {
562        trace!("eat_character_escape {:?}", self.current(),);
563        let ret = self.eat_control_escape()
564            || self.eat_c_control_letter()
565            || self.eat_zero()
566            || self.eat_hex_escape_sequence()?
567            || self.eat_unicode_escape_sequence()?
568            || (!self.state.u && self.eat_legacy_octal_escape_sequence())
569            || self.eat_identity_escape();
570        Ok(ret)
571    }
572    /// Peek at the current look ahead token
573    fn current(&mut self) -> Option<&char> {
574        self.chars.peek()
575    }
576    /// control escapes include `\t`, `\n`, `\v`, `\f` and `\r`
577    ///
578    /// ```js
579    /// let re = /\n\t/;
580    /// ```
581    fn eat_control_escape(&mut self) -> bool {
582        trace!("eat_control_escape {:?}", self.current(),);
583        if let Some(ch) = self.chars.peek() {
584            match ch {
585                't' => self.state.last_int_value = Some(9),
586                'n' => self.state.last_int_value = Some(10),
587                'v' => self.state.last_int_value = Some(11),
588                'f' => self.state.last_int_value = Some(12),
589                'r' => self.state.last_int_value = Some(13),
590                _ => return false,
591            }
592            self.advance();
593            return true;
594        }
595        false
596    }
597    /// An escaped control character is any `\c` followed
598    /// by a single english letter (upper or lower)
599    ///
600    /// ```js
601    /// let re = /\cI/;
602    /// ```
603    /// These characters represent an old
604    /// form of control escapes like \t (in the example above)
605    ///
606    /// (wikipedia)[https://en.wikipedia.org/wiki/Control_character]
607    fn eat_c_control_letter(&mut self) -> bool {
608        trace!("eat_c_control_letter {:?}", self.current(),);
609        let start = self.state.pos;
610        if self.eat('c') {
611            if self.eat_control_letter() {
612                return true;
613            }
614            self.reset_to(start);
615        }
616        false
617    }
618    /// Eat a letter after a `\c`
619    fn eat_control_letter(&mut self) -> bool {
620        trace!("eat_control_letter {:?}", self.current(),);
621        if let Some(next) = self.chars.peek() {
622            if Self::is_control_letter(*next) {
623                let n: u32 = (*next).into();
624                self.state.last_int_value = Some(n % 0x20);
625                self.advance();
626                return true;
627            }
628        }
629        false
630    }
631    /// Eat a zero character
632    fn eat_zero(&mut self) -> bool {
633        trace!("eat_zero {:?}", self.current(),);
634        if let Some(zero) = self.chars.peek() {
635            if *zero == '0' {
636                self.state.last_int_value = Some(0);
637                self.advance();
638                return true;
639            }
640        }
641        false
642    }
643    /// eat a hexidecimal number escape sequence
644    fn eat_hex_escape_sequence(&mut self) -> Result<bool, Error> {
645        trace!("eat_hex_escape_sequence {:?}", self.current(),);
646        let start = self.state.pos;
647        if self.eat('x') {
648            if self.eat_fixed_hex_digits(2) {
649                return Ok(true);
650            }
651            if self.state.u {
652                return Err(Error::new(start, "Invalid escape"));
653            }
654            self.reset_to(start)
655        }
656        Ok(false)
657    }
658    /// Attempt to consume a fixed number of hexidecimal
659    /// characters in a row
660    fn eat_fixed_hex_digits(&mut self, len: usize) -> bool {
661        trace!("eat_fixed_hex_digits {:?}", self.current(),);
662        let start = self.state.pos;
663        self.state.last_int_value = Some(0);
664        for _ in 0..len {
665            if let Some(n) = self.eat_digit(16) {
666                let last_int_value = self.state.last_int_value.unwrap_or(0);
667                self.state.last_int_value = Some(16 * last_int_value + n);
668            } else {
669                self.reset_to(start);
670                return false;
671            }
672        }
673        true
674    }
675    /// Eat a sequence of numbers starting with 0, all below 8
676    fn eat_legacy_octal_escape_sequence(&mut self) -> bool {
677        trace!("eat_legacy_octal_escape_sequence {:?}", self.current(),);
678        let last_int_value;
679        if let Some(n1) = self.eat_digit(8) {
680            if let Some(n2) = self.eat_digit(8) {
681                if n1 <= 3 {
682                    if let Some(n3) = self.eat_digit(8) {
683                        last_int_value = n1 * 64 + n2 * 8 + n3;
684                    } else {
685                        last_int_value = n1 * 8 + n2;
686                    }
687                } else {
688                    last_int_value = n1 * 8 + n2;
689                }
690            } else {
691                last_int_value = n1;
692            }
693            self.state.last_int_value = Some(last_int_value);
694            return true;
695        }
696        false
697    }
698    /// Attempt to consume a digit of the provided
699    /// radix
700    fn eat_digit(&mut self, radix: u32) -> Option<u32> {
701        trace!("eat_digit {:?}", self.current(),);
702        if let Some(next) = self.chars.peek() {
703            if next.is_digit(radix) {
704                let n = next.to_digit(radix);
705                self.advance();
706                return n;
707            }
708        }
709        None
710    }
711
712    fn eat_identity_escape(&mut self) -> bool {
713        trace!("eat_identity_escape {:?}", self.current(),);
714        if self.state.u {
715            if self.eat_syntax_character() {
716                return true;
717            }
718            if self.eat('/') {
719                self.state.last_int_value = Some(0x2f);
720                return true;
721            }
722            return false;
723        }
724        if let Some(ch) = self.chars.peek() {
725            if *ch != 'c' && (!self.state.n || *ch != 'k') {
726                let n = (*ch).into();
727                self.state.last_int_value = Some(n);
728                self.advance();
729                true
730            } else {
731                false
732            }
733        } else {
734            true
735        }
736    }
737    /// Attempt to consume a syntax character like `{`
738    fn eat_syntax_character(&mut self) -> bool {
739        trace!("eat_syntax_character {:?}", self.current(),);
740        if let Some(ch) = self.chars.peek() {
741            if Self::is_syntax_ch(*ch) {
742                self.state.last_int_value = Some((*ch).into());
743                self.advance();
744                return true;
745            }
746        }
747        false
748    }
749    /// A fixed 4 digit or curly brace unicode escape character
750    /// ```js
751    /// let re = /\u{61}\u0062/;
752    /// ```
753    fn eat_unicode_escape_sequence(&mut self) -> Result<bool, Error> {
754        trace!("eat_regex_unicode_escape_sequence {:?}", self.current(),);
755        let start = self.state.pos;
756        if self.eat('u') {
757            if self.eat_fixed_hex_digits(4) {
758                let lead = self.state.last_int_value.unwrap_or(0);
759                if self.state.u && lead >= 0xD800 && lead <= 0xDBFF {
760                    let lead_end = self.state.pos;
761                    if self.eat('\\') && self.eat('u') && self.eat_fixed_hex_digits(4) {
762                        let tail = self.state.last_int_value.unwrap_or(0);
763                        if tail >= 0xDC00 && tail <= 0xDFFF {
764                            self.state.last_int_value =
765                                Some((lead - 0xD800) * 0x400 + (tail - 0xDC00) + 0x10000);
766                            return Ok(true);
767                        }
768                    }
769                    self.reset_to(lead_end);
770                    self.state.last_int_value = Some(lead);
771                }
772                return Ok(true);
773            }
774            if self.state.u
775                && self.eat('{')
776                && self.eat_digits(16)
777                && self.eat('}')
778                && self
779                    .state
780                    .last_int_value
781                    .map(|v| v <= 0x10_FFFF)
782                    .unwrap_or(true)
783            {
784                return Ok(true);
785            }
786
787            if self.state.u {
788                return Err(Error::new(self.state.pos, "Invalid unicode escape"));
789            }
790
791            self.reset_to(start)
792        }
793        Ok(false)
794    }
795    /// Attempt to consume a character class
796    /// ```js
797    /// let re = /[clas]/;
798    /// ```
799    fn eat_character_class(&mut self) -> Result<bool, Error> {
800        trace!("eat_character_class {:?}", self.current(),);
801        if self.eat('[') {
802            self.eat('^');
803            self.class_ranges()?;
804            if self.eat(']') {
805                Ok(true)
806            } else {
807                Err(Error::new(self.state.pos, "Unterminated character class"))
808            }
809        } else {
810            Ok(false)
811        }
812    }
813    /// Attempt to consume a class range
814    /// ```js
815    /// let re = /[c-r]/;
816    /// ```
817    fn class_ranges(&mut self) -> Result<(), Error> {
818        trace!("class_ranges {:?}", self.current(),);
819        while self.eat_class_atom()? {
820            let left = self.state.last_int_value;
821            if self.eat('-') && self.eat_class_atom()? {
822                let right = self.state.last_int_value;
823                if self.state.u && (left.is_none() || right.is_none()) {
824                    return Err(Error::new(self.state.pos, "Invalid character class"));
825                }
826                if let (Some(left), Some(right)) = (left, right) {
827                    if left > right {
828                        return Err(Error::new(
829                            self.state.pos,
830                            &format!(
831                                "Range out of order in character class ({} > {})",
832                                left, right
833                            ),
834                        ));
835                    }
836                }
837            }
838        }
839        Ok(())
840    }
841    /// Attempt to consume a single part of a class
842    fn eat_class_atom(&mut self) -> Result<bool, Error> {
843        trace!("eat_class_atom {:?}", self.current(),);
844        let start = self.state.pos;
845        if self.eat('\\') {
846            if self.eat_class_escape()? {
847                return Ok(true);
848            }
849            if self.state.u {
850                if let Some(ch) = self.chars.peek() {
851                    if *ch == 'c' || ch.is_digit(8) {
852                        return Err(Error::new(self.state.pos, "Invalid class escape"));
853                    }
854                    return Err(Error::new(self.state.pos, "Invalid escape"));
855                }
856            }
857            self.reset_to(start);
858        }
859        if let Some(ch) = self.chars.peek() {
860            if *ch != ']' {
861                self.state.last_int_value = Some((*ch).into());
862                self.advance();
863                return Ok(true);
864            }
865        }
866        Ok(false)
867    }
868    /// attempt to consume an escaped part of a class
869    fn eat_class_escape(&mut self) -> Result<bool, Error> {
870        trace!("eat_class_escape {:?}", self.current(),);
871        let start = self.state.pos;
872        if self.eat('b') {
873            self.state.last_int_value = Some(0x08);
874            return Ok(true);
875        }
876        if self.state.u && self.eat('-') {
877            self.state.last_int_value = Some(0x2D);
878            return Ok(true);
879        }
880        if self.state.u && self.eat('c') {
881            if self.eat_class_control_letter() {
882                return Ok(true);
883            }
884            self.reset_to(start);
885        }
886        let ret = self.eat_character_class_escape()? || self.eat_character_escape()?;
887        Ok(ret)
888    }
889    /// attempt to consume a control letter
890    fn eat_class_control_letter(&mut self) -> bool {
891        trace!("eat_class_control_letter {:?}", self.current(),);
892        if let Some(ch) = self.chars.peek() {
893            if ch.is_digit(10) || *ch == '_' {
894                let n: u32 = (*ch).into();
895                self.state.last_int_value = Some(n % 0x20);
896                self.advance();
897                return true;
898            }
899        }
900        false
901    }
902    /// attempt to consume a `\k` group
903    fn eat_k_group_name(&mut self) -> Result<bool, Error> {
904        trace!("eat_k_group_name {:?}", self.current(),);
905        if self.eat('k') {
906            if self.eat_group_name()? {
907                if let Some(name) = self.state.last_string_value {
908                    self.state.back_ref_names.push(name);
909                    return Ok(true);
910                }
911            }
912            return Err(Error::new(self.state.pos, "Invalid named reference"));
913        }
914        Ok(false)
915    }
916    /// attempt to consume a named group
917    fn eat_group_name(&mut self) -> Result<bool, Error> {
918        trace!("eat_group_name {:?}", self.current(),);
919        self.state.last_string_value = None;
920        if self.eat('<') {
921            if self.eat_regex_identifier_name()? && self.eat('>') {
922                return Ok(true);
923            }
924            return Err(Error::new(self.state.pos, "Invalid capture group name"));
925        }
926        Ok(false)
927    }
928    /// Attempt to consume an identifier name
929    fn eat_regex_identifier_name(&mut self) -> Result<bool, Error> {
930        trace!("eat_regex_identifier_name {:?}", self.current(),);
931        let start = self.state.pos;
932        self.state.last_string_value = None;
933        if self.eat_ident_start()? {
934            while self.eat_ident_part()? {}
935            self.state.last_string_value = Some(&self.pattern[start..self.state.pos]);
936            return Ok(true);
937        }
938        Ok(false)
939    }
940    /// attempt to consume an identifer start
941    fn eat_ident_start(&mut self) -> Result<bool, Error> {
942        trace!("eat_ident_start {:?}", self.current(),);
943        let start = self.state.pos;
944        self.state.last_string_value = None;
945        let mut ch = if let Some(ch) = self.chars.peek() {
946            *ch
947        } else {
948            return Ok(false);
949        };
950        self.advance();
951        if ch == '\\' && self.eat_unicode_escape_sequence()? {
952            if let Some(n) = self.state.last_int_value {
953                if let Some(n) = std::char::from_u32(n) {
954                    ch = n;
955                }
956            }
957        }
958        if Self::is_id_start(ch) {
959            self.state.last_int_value = Some(ch.into());
960            return Ok(true);
961        }
962        self.reset_to(start);
963        Ok(false)
964    }
965
966    fn eat_ident_part(&mut self) -> Result<bool, Error> {
967        trace!("eat_ident_part {:?}", self.current(),);
968        let start = self.state.pos;
969        let mut ch = if let Some(ch) = self.chars.peek() {
970            *ch
971        } else {
972            return Ok(false);
973        };
974        self.advance();
975        if ch == '\\' && self.eat_unicode_escape_sequence()? {
976            if let Some(n) = self.state.last_int_value {
977                if let Some(n) = std::char::from_u32(n) {
978                    ch = n;
979                }
980            }
981        }
982        if Self::is_id_continue(ch) {
983            self.state.last_int_value = Some(ch.into());
984            return Ok(true);
985        }
986        self.reset_to(start);
987        Ok(false)
988    }
989
990    fn is_id_start(ch: char) -> bool {
991        (ch >= 'A' && ch <= 'Z')
992            || (ch >= 'a' && ch <= 'z')
993            || ch == '$'
994            || ch == '_'
995            || unic_ucd_ident::is_id_start(ch)
996    }
997
998    fn is_id_continue(ch: char) -> bool {
999        (ch >= 'A' && ch <= 'Z')
1000            || (ch >= 'a' && ch <= 'z')
1001            || (ch >= '0' && ch <= '9')
1002            || ch == '$'
1003            || ch == '_'
1004            || unic_ucd_ident::is_id_continue(ch)
1005    }
1006
1007    fn eat_uncapturing_group(&mut self) -> Result<bool, Error> {
1008        trace!("eat_uncapturing_group {:?}", self.current(),);
1009        let start = self.state.pos;
1010        if self.eat('(') {
1011            if self.eat('?') && self.eat(':') {
1012                self.disjunction()?;
1013                if self.eat(')') {
1014                    return Ok(true);
1015                }
1016                return Err(Error::new(start, "Unterminated group"));
1017            }
1018            self.reset_to(start)
1019        }
1020        Ok(false)
1021    }
1022
1023    fn eat_capturing_group(&mut self) -> Result<bool, Error> {
1024        trace!("eat_capturing_group {:?}", self.current(),);
1025        if self.eat('(') {
1026            self.group_specifier()?;
1027            self.disjunction()?;
1028            if self.eat(')') {
1029                self.state.num_capturing_parens += 1;
1030                Ok(true)
1031            } else {
1032                Err(Error::new(self.state.pos, "Unterminated group"))
1033            }
1034        } else {
1035            Ok(false)
1036        }
1037    }
1038
1039    fn group_specifier(&mut self) -> Result<(), Error> {
1040        trace!("group_specifier {:?}", self.current(),);
1041        if self.eat('?') {
1042            if self.eat_group_name()? {
1043                if let Some(name) = self.state.last_string_value {
1044                    if self.state.group_names.contains(&name) {
1045                        return Err(Error::new(self.state.pos, "Duplicate capture group name"));
1046                    } else {
1047                        self.state.group_names.push(name);
1048                        return Ok(());
1049                    }
1050                }
1051            }
1052            return Err(Error::new(self.state.pos, "Invalid group"));
1053        }
1054        Ok(())
1055    }
1056
1057    fn eat_assertion(&mut self) -> Result<bool, Error> {
1058        trace!("eat_assertion {:?}", self.current(),);
1059        let start = self.state.pos;
1060        self.state.last_assert_is_quant = false;
1061        if self.eat('^') || self.eat('$') {
1062            return Ok(true);
1063        }
1064        if self.eat('\\') {
1065            if self.eat('B') || self.eat('b') {
1066                return Ok(true);
1067            }
1068            self.reset_to(start);
1069        }
1070        if self.eat('(') && self.eat('?') {
1071            let look_behind = self.eat('<');
1072            if self.eat('=') || self.eat('!') {
1073                self.disjunction()?;
1074                if !self.eat(')') {
1075                    return Err(Error::new(self.state.pos, "Unterminated group"));
1076                }
1077                self.state.last_assert_is_quant = !look_behind;
1078                return Ok(true);
1079            }
1080        }
1081        self.reset_to(start);
1082        Ok(false)
1083    }
1084
1085    fn eat_digits(&mut self, radix: u32) -> bool {
1086        trace!("eat_digits {:?}", self.current(),);
1087        let start = self.state.pos;
1088        self.state.last_int_value = Some(0);
1089        while let Some(next) = self.chars.peek() {
1090            log::debug!("next digit: {}", next);
1091            if let Some(n) = next.to_digit(radix) {
1092                log::debug!("digit as u32: {}", n);
1093                let last_int_value = self.state.last_int_value.unwrap_or(0);
1094                log::debug!("last_int_value: {}", last_int_value);
1095                self.state.last_int_value = Some(radix * last_int_value + n);
1096                self.advance();
1097            } else {
1098                log::debug!("next not digit");
1099                break;
1100            }
1101        }
1102        self.state.pos != start
1103    }
1104
1105    fn eat(&mut self, ch: char) -> bool {
1106        if let Some(next) = self.chars.peek() {
1107            if *next == ch {
1108                self.advance();
1109                return true;
1110            }
1111        }
1112        false
1113    }
1114
1115    fn advance(&mut self) {
1116        if let Some(ch) = self.chars.next() {
1117            self.state.pos += ch.len_utf8();
1118            log::debug!("adv: {} ({})", ch, self.state.pos);
1119        } else {
1120            log::debug!("adv at end");
1121        }
1122    }
1123
1124    fn reset_to(&mut self, idx: usize) {
1125        let remaining = &self.pattern[idx..];
1126        self.chars = remaining.chars().peekable();
1127        log::debug!("res: {} ({})", self.chars.peek().unwrap_or(&' '), idx);
1128        self.state.pos = idx;
1129    }
1130}
1131
1132struct State<'a> {
1133    pos: usize,
1134    len: usize,
1135    last_int_value: Option<u32>,
1136    last_string_value: Option<&'a str>,
1137    last_assert_is_quant: bool,
1138    num_capturing_parens: u32,
1139    max_back_refs: u32,
1140    group_names: Vec<&'a str>,
1141    back_ref_names: Vec<&'a str>,
1142    n: bool,
1143    u: bool,
1144}
1145
1146impl<'a> State<'a> {
1147    pub fn new(len: usize, u: bool) -> Self {
1148        Self {
1149            pos: 0,
1150            len,
1151            last_int_value: None,
1152            last_string_value: None,
1153            last_assert_is_quant: false,
1154            num_capturing_parens: 0,
1155            max_back_refs: 0,
1156            group_names: Vec::new(),
1157            back_ref_names: Vec::new(),
1158            n: u,
1159            u,
1160        }
1161    }
1162    pub fn reset(&mut self) {
1163        self.pos = 0;
1164        self.last_int_value = None;
1165        self.last_string_value = None;
1166        self.num_capturing_parens = 0;
1167        self.max_back_refs = 0;
1168        self.group_names.clear();
1169        self.back_ref_names.clear();
1170    }
1171}
1172
1173#[derive(Debug)]
1174struct RegExFlags {
1175    case_insensitive: bool,
1176    multi_line: bool,
1177    dot_matches_new_line: bool,
1178    unicode: bool,
1179    global: bool,
1180    sticky: bool,
1181    has_indicies: bool,
1182}
1183
1184impl Default for RegExFlags {
1185    fn default() -> Self {
1186        RegExFlags {
1187            case_insensitive: false,
1188            multi_line: false,
1189            dot_matches_new_line: false,
1190            unicode: false,
1191            global: false,
1192            sticky: false,
1193            has_indicies: false,
1194        }
1195    }
1196}
1197
1198impl RegExFlags {
1199    fn add_flag(&mut self, c: char, pos: usize) -> Result<(), Error> {
1200        match c {
1201            'g' => {
1202                if self.global {
1203                    Err(Error::new(pos, "duplicate g flag"))
1204                } else {
1205                    self.global = true;
1206                    Ok(())
1207                }
1208            }
1209            'i' => {
1210                if self.case_insensitive {
1211                    Err(Error::new(pos, "duplicate i flag"))
1212                } else {
1213                    self.case_insensitive = true;
1214                    Ok(())
1215                }
1216            }
1217            'm' => {
1218                if self.multi_line {
1219                    Err(Error::new(pos, "duplicate m flag"))
1220                } else {
1221                    self.multi_line = true;
1222                    Ok(())
1223                }
1224            }
1225            's' => {
1226                if self.dot_matches_new_line {
1227                    Err(Error::new(pos, "duplicate s flag"))
1228                } else {
1229                    self.dot_matches_new_line = true;
1230                    Ok(())
1231                }
1232            }
1233            'u' => {
1234                if self.unicode {
1235                    Err(Error::new(pos, "duplicate u flag"))
1236                } else {
1237                    self.unicode = true;
1238                    Ok(())
1239                }
1240            }
1241            'y' => {
1242                if self.sticky {
1243                    Err(Error::new(pos, "duplicate y flag"))
1244                } else {
1245                    self.sticky = true;
1246                    Ok(())
1247                }
1248            }
1249            'd' => {
1250                if self.has_indicies {
1251                    Err(Error::new(pos, "duplicate d flag"))
1252                } else {
1253                    self.has_indicies = true;
1254                    Ok(())
1255                }
1256            }
1257            _ => Err(Error::new(pos, &format!("invalid flag {:?}", c))),
1258        }
1259    }
1260}
1261
1262#[cfg(test)]
1263mod tests {
1264    use super::*;
1265    #[test]
1266    fn lots_of_regexes() {
1267        run_test("/asdf|fdsa/g").unwrap();
1268    }
1269    #[test]
1270    #[should_panic = "Invalid escape"]
1271    fn decimal_escape_with_u() {
1272        run_test(r"/\1/u").unwrap()
1273    }
1274
1275    #[test]
1276    #[should_panic = "invalid flag"]
1277    fn invalid_regex_flag() {
1278        run_test("/./G").unwrap();
1279    }
1280
1281    #[test]
1282    #[should_panic = "Nothing to repeat"]
1283    fn bad_look_behind() {
1284        run_test(r"/.(?<=.)?/").unwrap();
1285    }
1286
1287    #[test]
1288    #[should_panic]
1289    fn bad_quant() {
1290        run_test(r"/{2}/").unwrap();
1291    }
1292
1293    #[test]
1294    #[should_panic]
1295    fn id_continue_u() {
1296        run_test(r"/\M/u").unwrap();
1297    }
1298
1299    #[test]
1300    #[should_panic]
1301    fn cant_start_with_star() {
1302        run_test("/*/").unwrap();
1303    }
1304
1305    #[test]
1306    fn unicode_name_and_value() {
1307        for value in unicode_tables::general_category::GC {
1308            run_test(&format!(r"/\p{{General_Category={}}}/u", value))
1309                .expect(&format!("failed at General_category={}", value));
1310            run_test(&format!(r"/\p{{gc={}}}/u", value)).expect(&format!("failed at gc={}", value));
1311        }
1312        for value in unicode_tables::script_values::SCRIPT {
1313            run_test(&format!(r"/\p{{Script={}}}/u", value))
1314                .expect(&format!("failed at Script={}", value));
1315            run_test(&format!(r"/\p{{sc={}}}/u", value)).expect(&format!("failed at sc={}", value));
1316            run_test(&format!(r"/\p{{Script_Extensions={}}}/u", value))
1317                .expect(&format!("failed at Script_Extensions={}", value));
1318            run_test(&format!(r"/\p{{scx={}}}/u", value))
1319                .expect(&format!("failed at scx={}", value));
1320        }
1321    }
1322    #[test]
1323    #[should_panic]
1324    fn unicode_name_and_value_bad_name() {
1325        run_test(r"/\p{junk=Greek}/u").unwrap();
1326    }
1327    #[test]
1328    #[should_panic]
1329    fn unicode_name_and_value_bad_value() {
1330        run_test(r"/\p{General_Category=Geek}/u").unwrap();
1331    }
1332    #[test]
1333    #[should_panic]
1334    fn unicode_name_or_value_bad_value() {
1335        run_test(r"/\p{junk}/u").unwrap();
1336    }
1337    #[test]
1338    fn unicode_name_or_value() {
1339        for value in unicode_tables::GC_AND_BP {
1340            run_test(&format!(r"/\p{{{}}}/u", value)).unwrap();
1341        }
1342    }
1343
1344    #[test]
1345    fn named_group() {
1346        run_test(r"/(?<x>a)|b/").unwrap();
1347    }
1348
1349    #[test]
1350    fn control_range() {
1351        run_test(r"/[\t-\r]/").unwrap();
1352    }
1353
1354    #[test]
1355    fn known_flags() {
1356        let flags = &['d', 'g', 'i', 'm', 's', 'u', 'y'];
1357        for flag in flags {
1358            run_test(&format!("/.+/{}", flag)).unwrap();
1359        }
1360        let all: String = flags.iter().collect();
1361        run_test(&format!("/.+/{}", all)).unwrap();
1362    }
1363
1364    #[test]
1365    fn duplicate_known_flags() {
1366        let flags = &['d', 'g', 'i', 'm', 's', 'u', 'y'];
1367        for flag in flags {
1368            run_test(&format!("/.+/{0}{}0", flag)).unwrap_err();
1369        }
1370    }
1371    #[test]
1372    fn long_or_sequences() {
1373        run_test(r#"/((?:[^BEGHLMOSWYZabcdhmswyz']+)|(?:'(?:[^']|'')*')|(?:G{1,5}|y{1,4}|Y{1,4}|M{1,5}|L{1,5}|w{1,2}|W{1}|d{1,2}|E{1,6}|c{1,6}|a{1,5}|b{1,5}|B{1,5}|h{1,2}|H{1,2}|m{1,2}|s{1,2}|S{1,3}|z{1,4}|Z{1,5}|O{1,4}))([\s\S]*)/"#).unwrap();
1374    }
1375
1376    fn run_test(regex: &str) -> Result<(), Error> {
1377        let _ = pretty_env_logger::try_init();
1378        let mut parser = RegexParser::new(regex)?;
1379        parser.validate()?;
1380        Ok(())
1381    }
1382}