js_regex/
validator.rs

1// Copyright (C) 2020 Quentin M. Kniep <hello@quentinkniep.com>
2// Distributed under terms of the MIT license.
3
4use std::collections::HashSet;
5use std::ops::{Deref, DerefMut};
6
7use crate::reader::Reader;
8use crate::unicode::*;
9
/// Returns `true` when `cp` is one of the RegExp `SyntaxCharacter`s:
/// `^ $ \ . * + ? ( ) [ ] { } |`.
fn is_syntax_character(cp: char) -> bool {
    matches!(
        cp,
        '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
    )
}
26
/// Returns `true` when `cp` may appear in a Unicode property name:
/// an ASCII letter or `_`.
fn is_unicode_property_name_character(cp: char) -> bool {
    matches!(cp, 'a'..='z' | 'A'..='Z' | '_')
}
30
/// Returns `true` when `cp` may appear in a Unicode property value:
/// an ASCII letter, an ASCII decimal digit or `_`.
fn is_unicode_property_value_character(cp: char) -> bool {
    matches!(cp, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
34
35fn is_regexp_identifier_start(cp: char) -> bool {
36    is_id_start(cp) || cp == '$' || cp == '_'
37}
38
39fn is_regexp_identifier_part(cp: char) -> bool {
40    is_id_continue(cp) ||
41    cp == '$' ||
42    cp == '_' ||
43    cp == '\u{200c}' ||  // unicode zero-width non-joiner
44    cp == '\u{200d}' // unicode zero-width joiner
45}
46
47fn is_id_start(cp: char) -> bool {
48    if (cp as u32) < 0x41 {
49        false
50    } else if (cp as u32) < 0x5b {
51        true
52    } else if (cp as u32) < 0x61 {
53        false
54    } else if (cp as u32) < 0x7b {
55        true
56    } else {
57        is_large_id_start(cp)
58    }
59}
60
61fn is_id_continue(cp: char) -> bool {
62    if (cp as u32) < 0x30 {
63        false
64    } else if (cp as u32) < 0x3a {
65        true
66    } else if (cp as u32) < 0x41 {
67        false
68    } else if (cp as u32) < 0x5b {
69        true
70    } else if (cp as u32) == 0x5f {
71        true
72    } else if (cp as u32) < 0x61 {
73        false
74    } else if (cp as u32) < 0x7b {
75        true
76    } else {
77        is_large_id_start(cp) || is_large_id_continue(cp)
78    }
79}
80
/// Returns `true` when `cp` lies in the Unicode code point range
/// `0..=0x10ffff`.
///
/// The value is a signed integer, so negative inputs (such as the `-1`
/// marker this file stores in `last_int_value`) are rejected explicitly.
fn is_valid_unicode(cp: i64) -> bool {
    (0..=0x10ffff).contains(&cp)
}
84
/// Returns `true` when `cp` is a UTF-16 lead (high) surrogate,
/// i.e. within `0xd800..=0xdbff`.
fn is_lead_surrogate(cp: i64) -> bool {
    (0xd800..=0xdbff).contains(&cp)
}
88
/// Returns `true` when `cp` is a UTF-16 trail (low) surrogate,
/// i.e. within `0xdc00..=0xdfff`.
fn is_trail_surrogate(cp: i64) -> bool {
    (0xdc00..=0xdfff).contains(&cp)
}
92
/// Combines a UTF-16 surrogate pair into the code point it encodes.
///
/// `lead` contributes the upper ten bits and `trail` the lower ten bits of
/// the offset above `0x10000`. The inputs are not checked for being
/// surrogates.
fn combine_surrogate_pair(lead: i64, trail: i64) -> i64 {
    let high = (lead - 0xd800) * 0x400;
    let low = trail - 0xdc00;
    high + low + 0x10000
}
96
/// ECMAScript edition that a regular expression is validated against.
///
/// Variants are declared oldest first, so the derived ordering allows
/// comparisons such as `version >= EcmaVersion::ES2018`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum EcmaVersion {
    /// ECMAScript 5.
    ES5,
    /// ECMAScript 2015; the `u` and `y` flags are accepted from here on.
    ES2015,
    /// ECMAScript 2016.
    ES2016,
    /// ECMAScript 2017.
    ES2017,
    /// ECMAScript 2018; the `s` flag, lookbehind assertions, named groups
    /// and Unicode property escapes are accepted from here on.
    ES2018,
    /// ECMAScript 2019.
    ES2019,
    /// ECMAScript 2020.
    ES2020,
    /// ECMAScript 2021.
    ES2021,
}
108
109#[derive(Debug)]
110pub struct EcmaRegexValidator {
111    reader: Reader,
112    strict: bool,
113    ecma_version: EcmaVersion,
114    u_flag: bool,
115    n_flag: bool,
116    last_int_value: i64,
117    last_min_value: i64,
118    last_max_value: i64,
119    last_str_value: String,
120    last_key_value: String,
121    last_val_value: String,
122    last_assertion_is_quantifiable: bool,
123    num_capturing_parens: u32,
124    group_names: HashSet<String>,
125    backreference_names: HashSet<String>,
126}
127
128impl Deref for EcmaRegexValidator {
129    type Target = Reader;
130
131    fn deref(&self) -> &Self::Target {
132        &self.reader
133    }
134}
135
136impl DerefMut for EcmaRegexValidator {
137    fn deref_mut(&mut self) -> &mut Self::Target {
138        &mut self.reader
139    }
140}
141
142impl EcmaRegexValidator {
143    pub fn new(ecma_version: EcmaVersion) -> Self {
144        EcmaRegexValidator {
145            reader: Reader::new(),
146            strict: false,
147            ecma_version,
148            u_flag: false,
149            n_flag: false,
150            last_int_value: 0,
151            last_min_value: 0,
152            last_max_value: 0,
153            last_str_value: "".to_string(),
154            last_key_value: "".to_string(),
155            last_val_value: "".to_string(),
156            last_assertion_is_quantifiable: false,
157            num_capturing_parens: 0,
158            group_names: HashSet::new(),
159            backreference_names: HashSet::new(),
160        }
161    }
162
163    /// Validates flags of a EcmaScript regular expression.
164    pub fn validate_flags(&self, flags: &str) -> Result<(), String> {
165        let mut existing_flags = HashSet::<char>::new();
166
167        for flag in flags.chars() {
168            if existing_flags.contains(&flag) {
169                return Err(format!("Duplicated flag {}", flag));
170            }
171            existing_flags.insert(flag);
172
173            if flag == 'g'
174                || flag == 'i'
175                || flag == 'm'
176                || (flag == 'u' && self.ecma_version >= EcmaVersion::ES2015)
177                || (flag == 'y' && self.ecma_version >= EcmaVersion::ES2015)
178                || (flag == 's' && self.ecma_version >= EcmaVersion::ES2018)
179            {
180                // do nothing
181            } else {
182                return Err(format!("Invalid flag {}", flag));
183            }
184        }
185        Ok(())
186    }
187
188    /// Validates the pattern of a EcmaScript regular expression.
189    pub fn validate_pattern(&mut self, source: &str, u_flag: bool) -> Result<(), String> {
190        self.strict = u_flag; // TODO: allow toggling strict independently of u flag
191        self.u_flag = u_flag && self.ecma_version >= EcmaVersion::ES2015;
192        self.n_flag = u_flag && self.ecma_version >= EcmaVersion::ES2018;
193        //self.reset(source, 0, source.len(), u_flag);
194        self.reset(source, 0, source.chars().count(), u_flag);
195        self.consume_pattern()?;
196
197        if !self.n_flag && self.ecma_version >= EcmaVersion::ES2018 && self.group_names.len() > 0 {
198            self.n_flag = true;
199            self.rewind(0);
200            self.consume_pattern()?;
201        }
202
203        return Ok(());
204    }
205
206    /// Validate the next characters as a RegExp `Pattern` production.
207    /// ```grammar
208    /// Pattern[U, N]::
209    ///     Disjunction[?U, ?N]
210    /// ```
211    fn consume_pattern(&mut self) -> Result<(), String> {
212        self.num_capturing_parens = self.count_capturing_parens();
213        self.group_names.clear();
214        self.backreference_names.clear();
215
216        self.consume_disjunction()?;
217
218        if let Some(cp) = self.code_point_with_offset(0) {
219            if cp == ')' {
220                return Err("Unmatched ')'".to_string());
221            } else if cp == '\\' {
222                return Err("\\ at end of pattern".to_string());
223            } else if cp == ']' || cp == '}' {
224                return Err("Lone quantifier brackets".to_string());
225            }
226            return Err(format!("Unexpected character {}", cp));
227        }
228
229        for name in self.backreference_names.difference(&self.group_names) {
230            return Err(format!("Invalid named capture referenced: {}", name));
231        }
232        return Ok(());
233    }
234
235    /// Validate the next characters as a RegExp `Disjunction` production.
236    /// ```grammar
237    /// Disjunction[U, N]::
238    ///      Alternative[?U, ?N]
239    ///      Alternative[?U, ?N] `|` Disjunction[?U, ?N]
240    /// ```
241    fn consume_disjunction(&mut self) -> Result<(), String> {
242        self.consume_alternative()?;
243        while self.eat('|') {
244            self.consume_alternative()?;
245        }
246
247        if self.consume_quantifier(true)? {
248            return Err("Nothing to repeat".to_string());
249        } else if self.eat('{') {
250            return Err("Lone quantifier brackets".to_string());
251        }
252        return Ok(());
253    }
254
255    /// Validate the next characters as a RegExp `Alternative` production.
256    /// ```grammar
257    /// Alternative[U, N]::
258    ///      ε
259    ///      Alternative[?U, ?N] Term[?U, ?N]
260    /// ```
261    fn consume_alternative(&mut self) -> Result<(), String> {
262        while self.code_point_with_offset(0).is_some() && self.consume_term()? {
263            // do nothing
264        }
265        Ok(())
266    }
267
268    /// Validate the next characters as a RegExp `Term` production if possible.
269    /// ```grammar
270    /// Term[U, N]::
271    ///      [strict] Assertion[+U, ?N]
272    ///      [strict] Atom[+U, ?N]
273    ///      [strict] Atom[+U, ?N] Quantifier
274    ///      [annexB][+U] Assertion[+U, ?N]
275    ///      [annexB][+U] Atom[+U, ?N]
276    ///      [annexB][+U] Atom[+U, ?N] Quantifier
277    ///      [annexB][~U] QuantifiableAssertion[?N] Quantifier
278    ///      [annexB][~U] Assertion[~U, ?N]
279    ///      [annexB][~U] ExtendedAtom[?N] Quantifier
280    ///      [annexB][~U] ExtendedAtom[?N]
281    /// ```
282    /// Returns `true` if it consumed the next characters successfully.
283    fn consume_term(&mut self) -> Result<bool, String> {
284        if self.u_flag || self.strict {
285            return Ok(self.consume_assertion()?
286                || (self.consume_atom()? && self.consume_optional_quantifier()?));
287        }
288        return Ok((self.consume_assertion()?
289            && (!self.last_assertion_is_quantifiable || self.consume_optional_quantifier()?))
290            || (self.consume_extended_atom()? && self.consume_optional_quantifier()?));
291    }
292
293    fn consume_optional_quantifier(&mut self) -> Result<bool, String> {
294        self.consume_quantifier(false)?;
295        Ok(true)
296    }
297
298    /// Validate the next characters as a RegExp `Term` production if possible.
299    /// Set `self.last_assertion_is_quantifiable` if the consumed assertion was a
300    /// `QuantifiableAssertion` production.
301    /// ```grammar
302    /// Assertion[U, N]::
303    ///      `^`
304    ///      `$`
305    ///      `\b`
306    ///      `\B`
307    ///      [strict] `(?=` Disjunction[+U, ?N] `)`
308    ///      [strict] `(?!` Disjunction[+U, ?N] `)`
309    ///      [annexB][+U] `(?=` Disjunction[+U, ?N] `)`
310    ///      [annexB][+U] `(?!` Disjunction[+U, ?N] `)`
311    ///      [annexB][~U] QuantifiableAssertion[?N]
312    ///      `(?<=` Disjunction[?U, ?N] `)`
313    ///      `(?<!` Disjunction[?U, ?N] `)`
314    /// QuantifiableAssertion[N]::
315    ///      `(?=` Disjunction[~U, ?N] `)`
316    ///      `(?!` Disjunction[~U, ?N] `)`
317    /// ```
318    /// Returns `true` if it consumed the next characters successfully.
319    fn consume_assertion(&mut self) -> Result<bool, String> {
320        let start = self.index();
321        self.last_assertion_is_quantifiable = false;
322
323        if self.eat('^') || self.eat('$') || self.eat2('\\', 'B') || self.eat2('\\', 'b') {
324            return Ok(true);
325        }
326
327        // Lookahead / Lookbehind
328        if self.eat2('(', '?') {
329            let lookbehind = self.ecma_version >= EcmaVersion::ES2018 && self.eat('<');
330            let mut flag = self.eat('=');
331            if !flag {
332                flag = self.eat('!');
333            }
334            if flag {
335                self.consume_disjunction()?;
336                if !self.eat(')') {
337                    return Err("Unterminated group".to_string());
338                }
339                self.last_assertion_is_quantifiable = !lookbehind && !self.strict;
340                return Ok(true);
341            }
342            self.rewind(start);
343        }
344        Ok(false)
345    }
346
347    /// Validate the next characters as a RegExp `Quantifier` production if possible.
348    /// ```grammar
349    /// Quantifier::
350    ///      QuantifierPrefix
351    ///      QuantifierPrefix `?`
352    /// QuantifierPrefix::
353    ///      `*`
354    ///      `+`
355    ///      `?`
356    ///      `{` DecimalDigits `}`
357    ///      `{` DecimalDigits `,}`
358    ///      `{` DecimalDigits `,` DecimalDigits `}`
359    /// ```
360    /// Returns `true` if it consumed the next characters successfully.
361    fn consume_quantifier(&mut self, no_consume: bool) -> Result<bool, String> {
362        // QuantifierPrefix
363        if !self.eat('*')
364            && !self.eat('+')
365            && !self.eat('?')
366            && !self.eat_braced_quantifier(no_consume)?
367        {
368            return Ok(false);
369        }
370
371        self.eat('?');
372        return Ok(true);
373    }
374
375    /// Eats the next characters as the following alternatives if possible.
376    /// Sets `self.last_min_value` and `self.last_max_value` if it consumed the next characters
377    /// successfully.
378    /// ```grammar
379    ///      `{` DecimalDigits `}`
380    ///      `{` DecimalDigits `,}`
381    ///      `{` DecimalDigits `,` DecimalDigits `}`
382    /// ```
383    /// Returns `true` if it consumed the next characters successfully.
384    fn eat_braced_quantifier(&mut self, no_error: bool) -> Result<bool, &str> {
385        let start = self.index();
386        if self.eat('{') {
387            self.last_min_value = 0;
388            self.last_max_value = i64::MAX;
389            if self.eat_decimal_digits() {
390                self.last_min_value = self.last_int_value;
391                self.last_max_value = self.last_int_value;
392                if self.eat(',') {
393                    self.last_max_value = if self.eat_decimal_digits() {
394                        self.last_int_value
395                    } else {
396                        i64::MAX
397                    }
398                }
399                if self.eat('}') {
400                    if !no_error && self.last_max_value < self.last_min_value {
401                        return Err("numbers out of order in {} quantifier");
402                    }
403                    return Ok(true);
404                }
405            }
406            if !no_error && (self.u_flag || self.strict) {
407                return Err("Incomplete quantifier");
408            }
409            self.rewind(start);
410        }
411        return Ok(false);
412    }
413
414    /// Validate the next characters as a RegExp `Atom` production if possible.
415    /// ```grammar
416    /// Atom[U, N]::
417    ///      PatternCharacter
418    ///      `.`
419    ///      `\\` AtomEscape[?U, ?N]
420    ///      CharacterClass[?U]
421    ///      `(?:` Disjunction[?U, ?N] )
422    ///      `(` GroupSpecifier[?U] Disjunction[?U, ?N] `)`
423    /// ```
424    /// Returns `true` if it consumed the next characters successfully.
425    fn consume_atom(&mut self) -> Result<bool, String> {
426        Ok(self.consume_pattern_character()
427            || self.consume_dot()
428            || self.consume_reverse_solidus_atom_escape()?
429            || self.consume_character_class()?
430            || self.consume_uncapturing_group()?
431            || self.consume_capturing_group()?)
432    }
433
434    /// Validate the next characters as the following alternatives if possible.
435    /// ```grammar
436    ///      `.`
437    /// ```
438    /// Returns `true` if it consumed the next characters successfully.
439    fn consume_dot(&mut self) -> bool {
440        if self.eat('.') {
441            return true;
442        }
443        return false;
444    }
445
446    /// Validate the next characters as the following alternatives if possible.
447    /// ```grammar
448    ///      `\\` AtomEscape[?U, ?N]
449    /// ```
450    /// Returns `true` if it consumed the next characters successfully.
451    fn consume_reverse_solidus_atom_escape(&mut self) -> Result<bool, String> {
452        let start = self.index();
453        if self.eat('\\') {
454            if self.consume_atom_escape()? {
455                return Ok(true);
456            }
457            self.rewind(start);
458        }
459        return Ok(false);
460    }
461
462    /// Validate the next characters as the following alternatives if possible.
463    /// ```grammar
464    ///      `(?:` Disjunction[?U, ?N] )
465    /// ```
466    /// Returns `true` if it consumed the next characters successfully.
467    fn consume_uncapturing_group(&mut self) -> Result<bool, String> {
468        if self.eat3('(', '?', ':') {
469            self.consume_disjunction()?;
470            if !self.eat(')') {
471                return Err("Unterminated group".to_string());
472            }
473            return Ok(true);
474        }
475        return Ok(false);
476    }
477
478    /// Validate the next characters as the following alternatives if possible.
479    /// ```grammar
480    ///      `(` GroupSpecifier[?U] Disjunction[?U, ?N] `)`
481    /// ```
482    /// Returns `true` if it consumed the next characters successfully.
483    fn consume_capturing_group(&mut self) -> Result<bool, String> {
484        if !self.eat('(') {
485            return Ok(false);
486        }
487
488        if self.ecma_version >= EcmaVersion::ES2018 {
489            self.consume_group_specifier()?;
490        } else if self.code_point_with_offset(0) == Some('?') {
491            return Err("Invalid group".to_string());
492        }
493
494        self.consume_disjunction()?;
495        if !self.eat(')') {
496            return Err("Unterminated group".to_string());
497        }
498        Ok(true)
499    }
500
501    /// Validate the next characters as a RegExp `ExtendedAtom` production if possible.
502    /// ```grammar
503    /// ExtendedAtom[N]::
504    ///      `.`
505    ///      `\` AtomEscape[~U, ?N]
506    ///      `\` [lookahead = c]
507    ///      CharacterClass[~U]
508    ///      `(?:` Disjunction[~U, ?N] `)`
509    ///      `(` Disjunction[~U, ?N] `)`
510    ///      InvalidBracedQuantifier
511    ///      ExtendedPatternCharacter
512    /// ```
513    /// Returns `true` if it consumed the next characters successfully.
514    fn consume_extended_atom(&mut self) -> Result<bool, String> {
515        Ok(self.eat('.')
516            || self.consume_reverse_solidus_atom_escape()?
517            || self.consume_reverse_solidus_followed_by_c()
518            || self.consume_character_class()?
519            || self.consume_uncapturing_group()?
520            || self.consume_capturing_group()?
521            || self.consume_invalid_braced_quantifier()?
522            || self.consume_extended_pattern_character())
523    }
524
525    /// Validate the next characters as the following alternatives if possible.
526    /// ```grammar
527    ///      `\` [lookahead = c]
528    /// ```
529    /// Returns `true` if it consumed the next characters successfully.
530    fn consume_reverse_solidus_followed_by_c(&mut self) -> bool {
531        if self.code_point_with_offset(0) == Some('\\')
532            && self.code_point_with_offset(1) == Some('c')
533        {
534            self.last_int_value = '\\' as i64;
535            self.advance();
536            return true;
537        }
538        return false;
539    }
540
541    /// Validate the next characters as a RegExp `InvalidBracedQuantifier`
542    /// production if possible.
543    /// ```grammar
544    /// InvalidBracedQuantifier::
545    ///      `{` DecimalDigits `}`
546    ///      `{` DecimalDigits `,}`
547    ///      `{` DecimalDigits `,` DecimalDigits `}`
548    /// ```
549    /// Returns `true` if it consumed the next characters successfully.
550    fn consume_invalid_braced_quantifier(&mut self) -> Result<bool, &str> {
551        if self.eat_braced_quantifier(true)? {
552            return Err("Nothing to repeat");
553        }
554        Ok(false)
555    }
556
557    /// Validate the next characters as a RegExp `PatternCharacter` production if
558    /// possible.
559    /// ```grammar
560    /// PatternCharacter::
561    ///      SourceCharacter but not SyntaxCharacter
562    /// ```
563    /// Returns `true` if it consumed the next characters successfully.
564    fn consume_pattern_character(&mut self) -> bool {
565        if let Some(cp) = self.code_point_with_offset(0) {
566            if !is_syntax_character(cp) {
567                self.advance();
568                return true;
569            }
570        }
571        return false;
572    }
573
574    /// Validate the next characters as a RegExp `ExtendedPatternCharacter`
575    /// production if possible.
576    /// ```grammar
577    /// ExtendedPatternCharacter::
578    ///      SourceCharacter but not one of ^ $ \ . * + ? ( ) [ |
579    /// ```
580    /// Returns `true` if it consumed the next characters successfully.
581    fn consume_extended_pattern_character(&mut self) -> bool {
582        if let Some(cp) = self.code_point_with_offset(0) {
583            if cp != '^'
584                && cp != '$'
585                && cp != '\\'
586                && cp != '.'
587                && cp != '*'
588                && cp != '+'
589                && cp != '?'
590                && cp != '('
591                && cp != ')'
592                && cp != '['
593                && cp != '|'
594            {
595                self.advance();
596                return true;
597            }
598        }
599        return false;
600    }
601
602    /// Validate the next characters as a RegExp `GroupSpecifier` production.
603    /// Set `self.last_str_value` if the group name existed.
604    /// ```grammar
605    /// GroupSpecifier[U]::
606    ///      ε
607    ///      `?` GroupName[?U]
608    /// ```
609    /// Returns `true` if the group name existed.
610    fn consume_group_specifier(&mut self) -> Result<bool, String> {
611        if self.eat('?') {
612            if self.eat_group_name()? {
613                if !self.group_names.contains(&self.last_str_value) {
614                    self.group_names.insert(self.last_str_value.clone());
615                    return Ok(true);
616                }
617                return Err("Duplicate capture group name".to_string());
618            }
619            return Err("Invalid group".to_string());
620        }
621        return Ok(false);
622    }
623
624    /// Validate the next characters as a RegExp `AtomEscape` production if possible.
625    /// ```grammar
626    /// AtomEscape[U, N]::
627    ///      [strict] DecimalEscape
628    ///      [annexB][+U] DecimalEscape
629    ///      [annexB][~U] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape is <= NcapturingParens
630    ///      CharacterClassEscape[?U]
631    ///      [strict] CharacterEscape[?U]
632    ///      [annexB] CharacterEscape[?U, ?N]
633    ///      [+N] `k` GroupName[?U]
634    /// ```
635    /// Returns `Ok(true)` if it consumed the next characters successfully.
636    fn consume_atom_escape(&mut self) -> Result<bool, String> {
637        if self.consume_backreference()?
638            || self.consume_character_class_escape()?
639            || self.consume_character_escape()?
640            || (self.n_flag && self.consume_k_group_name()?)
641        {
642            return Ok(true);
643        }
644        if self.strict || self.u_flag {
645            return Err("Invalid escape".to_string());
646        }
647        return Ok(false);
648    }
649
650    /// Validate the next characters as the follwoing alternatives if possible.
651    /// ```grammar
652    ///      [strict] DecimalEscape
653    ///      [annexB][+U] DecimalEscape
654    ///      [annexB][~U] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape is <= NcapturingParens
655    /// ```
656    /// Returns `Ok(true)` if it consumed the next characters successfully.
657    fn consume_backreference(&mut self) -> Result<bool, &str> {
658        let start = self.index();
659        if self.eat_decimal_escape() {
660            if self.last_int_value <= self.num_capturing_parens as i64 {
661                return Ok(true);
662            } else if self.strict || self.u_flag {
663                return Err("Invalid escape");
664            }
665            self.rewind(start);
666        }
667        Ok(false)
668    }
669
670    /// Validate the next characters as a RegExp `DecimalEscape` production if possible.
671    /// Set `-1` to `self.last_int_value` as meaning of a character set if it ate the next
672    /// characters successfully.
673    /// ```grammar
674    /// CharacterClassEscape[U]::
675    ///      `d`
676    ///      `D`
677    ///      `s`
678    ///      `S`
679    ///      `w`
680    ///      `W`
681    ///      [+U] `p{` UnicodePropertyValueExpression `}`
682    ///      [+U] `P{` UnicodePropertyValueExpression `}`
683    /// ```
684    /// Returns `true` if it consumed the next characters successfully.
685    fn consume_character_class_escape(&mut self) -> Result<bool, String> {
686        if self.eat('d')
687            || self.eat('D')
688            || self.eat('s')
689            || self.eat('S')
690            || self.eat('w')
691            || self.eat('W')
692        {
693            self.last_int_value = -1;
694            return Ok(true);
695        }
696
697        if self.u_flag
698            && self.ecma_version >= EcmaVersion::ES2018
699            && (self.eat('p') || self.eat('P'))
700        {
701            self.last_int_value = -1;
702            if self.eat('{') && self.eat_unicode_property_value_expression()? && self.eat('}') {
703                return Ok(true);
704            }
705            return Err("Invalid property name".to_string());
706        }
707        Ok(false)
708    }
709
710    /// Validate the next characters as a RegExp `CharacterEscape` production if possible.
711    /// ```grammar
712    /// CharacterEscape[U, N]::
713    ///      ControlEscape
714    ///      `c` ControlLetter
715    ///      `0` [lookahead ∉ DecimalDigit]
716    ///      HexEscapeSequence
717    ///      RegExpUnicodeEscapeSequence[?U]
718    ///      [annexB][~U] LegacyOctalEscapeSequence
719    ///      IdentityEscape[?U, ?N]
720    /// ```
721    /// Returns `true` if it consumed the next characters successfully.
722    fn consume_character_escape(&mut self) -> Result<bool, String> {
723        Ok(self.eat_control_escape()
724            || self.eat_c_control_letter()
725            || self.eat_zero()
726            || self.eat_hex_escape_sequence()?
727            || self.eat_regexp_unicode_escape_sequence(false)?
728            || (!self.strict && !self.u_flag && self.eat_legacy_octal_escape_sequence())
729            || self.eat_identity_escape())
730    }
731
732    /// Validate the next characters as the follwoing alternatives if possible.
733    /// ```grammar
734    ///      `k` GroupName[?U]
735    /// ```
736    /// Returns `Ok(true)` if it consumed the next characters successfully.
737    fn consume_k_group_name(&mut self) -> Result<bool, String> {
738        if self.eat('k') {
739            if self.eat_group_name()? {
740                let group_name = self.last_str_value.clone();
741                self.backreference_names.insert(group_name);
742                return Ok(true);
743            }
744            return Err("Invalid named reference".to_string());
745        }
746        Ok(false)
747    }
748
749    /// Validate the next characters as a RegExp `CharacterClass` production if possible.
750    /// ```grammar
751    /// CharacterClass[U]::
752    ///      `[` [lookahead ≠ ^] ClassRanges[?U] `]`
753    ///      `[^` ClassRanges[?U] `]`
754    /// ```
755    /// Returns `true` if it consumed the next characters successfully.
756    fn consume_character_class(&mut self) -> Result<bool, String> {
757        if !self.eat('[') {
758            return Ok(false);
759        }
760        self.consume_class_ranges()?;
761        if !self.eat(']') {
762            return Err("Unterminated character class".to_string());
763        }
764        Ok(true)
765    }
766
767    /// Validate the next characters as a RegExp `ClassRanges` production.
768    /// ```grammar
769    /// ClassRanges[U]::
770    ///      ε
771    ///      NonemptyClassRanges[?U]
772    /// NonemptyClassRanges[U]::
773    ///      ClassAtom[?U]
774    ///      ClassAtom[?U] NonemptyClassRangesNoDash[?U]
775    ///      ClassAtom[?U] `-` ClassAtom[?U] ClassRanges[?U]
776    /// NonemptyClassRangesNoDash[U]::
777    ///      ClassAtom[?U]
778    ///      ClassAtomNoDash[?U] NonemptyClassRangesNoDash[?U]
779    ///      ClassAtomNoDash[?U] `-` ClassAtom[?U] ClassRanges[?U]
780    /// ```
781    fn consume_class_ranges(&mut self) -> Result<(), String> {
782        loop {
783            // Consume the first ClassAtom
784            if !self.consume_class_atom()? {
785                break;
786            }
787            let min = self.last_int_value;
788
789            // Consume `-`
790            if !self.eat('-') {
791                continue;
792            }
793
794            // Consume the second ClassAtom
795            if !self.consume_class_atom()? {
796                break;
797            }
798            let max = self.last_int_value;
799
800            // Validate
801            if min == -1 || max == -1 {
802                if self.strict {
803                    return Err("Invalid character class".to_string());
804                }
805                continue;
806            }
807
808            println!("min: {},  max: {}", min, max);
809            if min > max {
810                return Err("Range out of order in character class".to_string());
811            }
812        }
813        Ok(())
814    }
815
816    /// Validate the next characters as a RegExp `ClassAtom` production if possible.
817    /// Set `self.last_int_value` if it consumed the next characters successfully.
818    /// ```grammar
819    /// ClassAtom[U, N]::
820    ///      `-`
821    ///      ClassAtomNoDash[?U, ?N]
822    /// ClassAtomNoDash[U, N]::
823    ///      SourceCharacter but not one of \ ] -
824    ///      `\` ClassEscape[?U, ?N]
825    ///      [annexB] `\` [lookahead = c]
826    /// ```
827    /// Returns `Ok(true)` if it consumed the next characters successfully.
828    fn consume_class_atom(&mut self) -> Result<bool, String> {
829        let start = self.index();
830
831        if let Some(cp) = self.code_point_with_offset(0) {
832            if cp != '\\' && cp != ']' {
833                self.advance();
834                self.last_int_value = cp as i64;
835                return Ok(true);
836            }
837        }
838
839        if self.eat('\\') {
840            if self.consume_class_escape()? {
841                return Ok(true);
842            }
843            if !self.strict && self.code_point_with_offset(0) == Some('c') {
844                self.last_int_value = '\\' as i64;
845                return Ok(true);
846            }
847            if self.strict || self.u_flag {
848                return Err("Invalid escape".to_string());
849            }
850            self.rewind(start);
851        }
852        Ok(false)
853    }
854
855    /// Validate the next characters as a RegExp `ClassEscape` production if possible.
856    /// Set `self.last_int_value` if it consumed the next characters successfully.
857    /// ```grammar
858    /// ClassEscape[U, N]::
859    ///      `b`
860    ///      [+U] `-`
861    ///      [annexB][~U] `c` ClassControlLetter
862    ///      CharacterClassEscape[?U]
863    ///      CharacterEscape[?U, ?N]
864    /// ClassControlLetter::
865    ///      DecimalDigit
866    ///      `_`
867    /// ```
868    /// Returns `Ok(true)` if it consumed the next characters successfully.
869    fn consume_class_escape(&mut self) -> Result<bool, String> {
870        if self.eat('b') {
871            self.last_int_value = 0x08; // backspace
872            return Ok(true);
873        }
874
875        // [+U] `-`
876        if self.u_flag && self.eat('-') {
877            self.last_int_value = '-' as i64;
878            return Ok(true);
879        }
880
881        // [annexB][~U] `c` ClassControlLetter
882        if !self.strict && !self.u_flag && self.code_point_with_offset(0) == Some('c') {
883            if let Some(cp) = self.code_point_with_offset(1) {
884                if cp.is_digit(10) || cp == '_' {
885                    self.advance();
886                    self.advance();
887                    self.last_int_value = cp as i64 % 0x20;
888                    return Ok(true);
889                }
890            }
891        }
892
893        Ok(self.consume_character_class_escape()? || self.consume_character_escape()?)
894    }
895
896    /// Eat the next characters as a RegExp `GroupName` production if possible.
897    /// Set `self.last_str_value` if the group name existed.
898    /// ```grammar
899    /// GroupName[U]::
900    ///      `<` RegExpIdentifierName[?U] `>`
901    /// ```
902    /// Returns `true` if it ate the next characters successfully.
903    fn eat_group_name(&mut self) -> Result<bool, String> {
904        if self.eat('<') {
905            if self.eat_regexp_identifier_name()? && self.eat('>') {
906                return Ok(true);
907            }
908            return Err("Invalid capture group name".to_string());
909        }
910        return Ok(false);
911    }
912
913    /// Eat the next characters as a RegExp `RegExpIdentifierName` production if
914    /// possible.
915    /// Set `self.last_str_value` if the identifier name existed.
916    /// ```grammar
917    /// RegExpIdentifierName[U]::
918    ///      RegExpIdentifierStart[?U]
919    ///      RegExpIdentifierName[?U] RegExpIdentifierPart[?U]
920    /// ```
921    /// Returns `true` if it ate the next characters successfully.
922    fn eat_regexp_identifier_name(&mut self) -> Result<bool, String> {
923        if self.eat_regexp_identifier_start()? {
924            self.last_str_value = std::char::from_u32(self.last_int_value as u32)
925                .unwrap()
926                .to_string();
927            while self.eat_regexp_identifier_part()? {
928                self.last_str_value
929                    .push(std::char::from_u32(self.last_int_value as u32).unwrap());
930            }
931            return Ok(true);
932        }
933        return Ok(false);
934    }
935
936    /// Eat the next characters as a RegExp `RegExpIdentifierStart` production if
937    /// possible.
938    /// Set `self.last_int_value` if the identifier start existed.
939    /// ```grammar
940    /// RegExpIdentifierStart[U] ::
941    ///      UnicodeIDStart
942    ///      `$`
943    ///      `_`
944    ///      `\` RegExpUnicodeEscapeSequence[+U]
945    ///      [~U] UnicodeLeadSurrogate UnicodeTrailSurrogate
946    /// ```
947    /// Returns `true` if it ate the next characters successfully.
948    fn eat_regexp_identifier_start(&mut self) -> Result<bool, String> {
949        let start = self.index();
950        let force_u_flag = !self.u_flag && self.ecma_version >= EcmaVersion::ES2020;
951
952        if let Some(mut cp) = self.code_point_with_offset(0) {
953            self.advance();
954            let cp1 = self.code_point_with_offset(0);
955            if cp == '\\' && self.eat_regexp_unicode_escape_sequence(force_u_flag)? {
956                cp = std::char::from_u32(self.last_int_value as u32).unwrap();
957            } else if force_u_flag
958                && is_lead_surrogate(cp as i64)
959                && cp1.is_some()
960                && is_trail_surrogate(cp1.unwrap() as i64)
961            {
962                cp = std::char::from_u32(
963                    combine_surrogate_pair(cp as i64, cp1.unwrap() as i64) as u32,
964                )
965                .unwrap();
966                self.advance();
967            }
968
969            if is_regexp_identifier_start(cp) {
970                self.last_int_value = cp as i64;
971                return Ok(true);
972            }
973        }
974
975        if self.index() != start {
976            self.rewind(start);
977        }
978        return Ok(false);
979    }
980
981    /// Eat the next characters as a RegExp `RegExpIdentifierPart` production if
982    /// possible.
983    /// Set `self.last_int_value` if the identifier part existed.
984    /// ```grammar
985    /// RegExpIdentifierPart[U] ::
986    ///      UnicodeIDContinue
987    ///      `$`
988    ///      `_`
989    ///      `\` RegExpUnicodeEscapeSequence[+U]
990    ///      [~U] UnicodeLeadSurrogate UnicodeTrailSurrogate
991    ///      <ZWNJ>
992    ///      <ZWJ>
993    /// ```
994    /// Returns `true` if it ate the next characters successfully.
995    fn eat_regexp_identifier_part(&mut self) -> Result<bool, String> {
996        let start = self.index();
997        let force_u_flag = !self.u_flag && self.ecma_version >= EcmaVersion::ES2020;
998        let mut cp = self.code_point_with_offset(0);
999        self.advance();
1000        let cp1 = self.code_point_with_offset(0);
1001
1002        if cp == Some('\\') && self.eat_regexp_unicode_escape_sequence(force_u_flag)? {
1003            // TODO: convert unicode code point to char
1004            cp = std::char::from_u32(self.last_int_value as u32);
1005        } else if force_u_flag
1006            && is_lead_surrogate(cp.unwrap() as i64)
1007            && is_trail_surrogate(cp1.unwrap() as i64)
1008        {
1009            cp = std::char::from_u32(combine_surrogate_pair(
1010                cp.unwrap() as i64,
1011                cp1.unwrap() as i64,
1012            ) as u32);
1013            self.advance();
1014        }
1015
1016        if cp.is_some() && is_regexp_identifier_part(cp.unwrap()) {
1017            self.last_int_value = cp.unwrap() as i64;
1018            return Ok(true);
1019        }
1020
1021        if self.index() != start {
1022            self.rewind(start);
1023        }
1024        Ok(false)
1025    }
1026
1027    /// Eat the next characters as the follwoing alternatives if possible.
1028    /// Set `self.last_int_value` if it ate the next characters successfully.
1029    /// ```grammar
1030    ///      `c` ControlLetter
1031    /// ```
1032    /// Returns `true` if it ate the next characters successfully.
1033    fn eat_c_control_letter(&mut self) -> bool {
1034        let start = self.index();
1035        if self.eat('c') {
1036            if self.eat_control_letter() {
1037                return true;
1038            }
1039            self.rewind(start);
1040        }
1041        false
1042    }
1043
1044    /// Eat the next characters as the follwoing alternatives if possible.
1045    /// Set `self.last_int_value` if it ate the next characters successfully.
1046    /// ```grammar
1047    ///      `0` [lookahead ∉ DecimalDigit]
1048    /// ```
1049    /// Returns `true` if it ate the next characters successfully.
1050    fn eat_zero(&mut self) -> bool {
1051        if self.code_point_with_offset(0) != Some('0') {
1052            return false;
1053        }
1054        if let Some(cp) = self.code_point_with_offset(1) {
1055            if cp.is_digit(10) {
1056                return false;
1057            }
1058        }
1059        self.last_int_value = 0;
1060        self.advance();
1061        return true;
1062    }
1063
1064    /// Eat the next characters as a RegExp `ControlEscape` production if
1065    /// possible.
1066    /// Set `self.last_int_value` if it ate the next characters successfully.
1067    /// ```grammar
1068    /// ControlEscape:: one of
1069    ///      f n r t v
1070    /// ```
1071    /// Returns `true` if it ate the next characters successfully.
1072    fn eat_control_escape(&mut self) -> bool {
1073        if self.eat('f') {
1074            self.last_int_value = 0x0c; // formfeed
1075            return true;
1076        }
1077        if self.eat('n') {
1078            self.last_int_value = 0x0a; // linefeed
1079            return true;
1080        }
1081        if self.eat('r') {
1082            self.last_int_value = 0x0d; // carriage return
1083            return true;
1084        }
1085        if self.eat('t') {
1086            self.last_int_value = 0x09; // character tabulation
1087            return true;
1088        }
1089        if self.eat('v') {
1090            self.last_int_value = 0x0b; // line tabulation
1091            return true;
1092        }
1093        false
1094    }
1095
1096    /// Eat the next characters as a RegExp `ControlLetter` production if possible.
1097    /// Set `self.last_int_value` if it ate the next characters successfully.
1098    /// ```grammar
1099    /// ControlLetter:: one of
1100    ///      a b c d e f g h i j k l m n o p q r s t u v w x y z
1101    ///      A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1102    /// ```
1103    /// Returns `true` if it ate the next characters successfully.
1104    fn eat_control_letter(&mut self) -> bool {
1105        if let Some(cp) = self.code_point_with_offset(0) {
1106            if cp.is_ascii_alphabetic() {
1107                self.advance();
1108                self.last_int_value = cp as i64 % 0x20;
1109                return true;
1110            }
1111        }
1112        false
1113    }
1114
1115    /// Eat the next characters as a RegExp `RegExpUnicodeEscapeSequence`
1116    /// production if possible.
1117    /// Set `self.last_int_value` if it ate the next characters successfully.
1118    /// ```grammar
1119    /// RegExpUnicodeEscapeSequence[U]::
1120    ///      [+U] `u` LeadSurrogate `\u` TrailSurrogate
1121    ///      [+U] `u` LeadSurrogate
1122    ///      [+U] `u` TrailSurrogate
1123    ///      [+U] `u` NonSurrogate
1124    ///      [~U] `u` Hex4Digits
1125    ///      [+U] `u{` CodePoint `}`
1126    /// ```
1127    /// Returns `true` if it ate the next characters successfully.
1128    fn eat_regexp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> Result<bool, &str> {
1129        let start = self.index();
1130        let u_flag = force_u_flag || self.u_flag;
1131
1132        if self.eat('u') {
1133            if (u_flag && self.eat_regexp_unicode_surrogate_pair_escape())
1134                || self.eat_fixed_hex_digits(4)
1135                || (u_flag && self.eat_regexp_unicode_codepoint_escape())
1136            {
1137                return Ok(true);
1138            }
1139            if self.strict || u_flag {
1140                return Err("Invalid unicode escape");
1141            }
1142            self.rewind(start);
1143        }
1144
1145        return Ok(false);
1146    }
1147
1148    /// Eat the next characters as the following alternatives if possible.
1149    /// Set `self.last_int_value` if it ate the next characters successfully.
1150    /// ```grammar
1151    ///      LeadSurrogate `\u` TrailSurrogate
1152    /// ```
1153    /// Returns `true` if it ate the next characters successfully.
1154    fn eat_regexp_unicode_surrogate_pair_escape(&mut self) -> bool {
1155        let start = self.index();
1156
1157        if self.eat_fixed_hex_digits(4) {
1158            let lead = self.last_int_value;
1159            if is_lead_surrogate(lead)
1160                && self.eat('\\')
1161                && self.eat('u')
1162                && self.eat_fixed_hex_digits(4)
1163            {
1164                let trail = self.last_int_value;
1165                if is_trail_surrogate(trail) {
1166                    self.last_int_value = combine_surrogate_pair(lead, trail);
1167                    return true;
1168                }
1169            }
1170
1171            self.rewind(start);
1172        }
1173
1174        return false;
1175    }
1176
1177    /// Eat the next characters as the following alternatives if possible.
1178    /// Set `self.last_int_value` if it ate the next characters successfully.
1179    /// ```grammar
1180    ///      `{` CodePoint `}`
1181    /// ```
1182    /// Returns `true` if it ate the next characters successfully.
1183    fn eat_regexp_unicode_codepoint_escape(&mut self) -> bool {
1184        let start = self.index();
1185
1186        if self.eat('{')
1187            && self.eat_hex_digits()
1188            && self.eat('}')
1189            && is_valid_unicode(self.last_int_value)
1190        {
1191            return true;
1192        }
1193
1194        self.rewind(start);
1195        return false;
1196    }
1197
1198    /// Eat the next characters as a RegExp `IdentityEscape` production if possible.
1199    /// Set `self.last_int_value` if it ate the next characters successfully.
1200    /// ```grammar
1201    /// IdentityEscape[U, N]::
1202    ///      [+U] SyntaxCharacter
1203    ///      [+U] `/`
1204    ///      [strict][~U] SourceCharacter but not UnicodeIDContinue
1205    ///      [annexB][~U] SourceCharacterIdentityEscape[?N]
1206    /// SourceCharacterIdentityEscape[N]::
1207    ///      [~N] SourceCharacter but not c
1208    ///      [+N] SourceCharacter but not one of c k
1209    /// ```
1210    /// Returns `true` if it ate the next characters successfully.
1211    fn eat_identity_escape(&mut self) -> bool {
1212        if let Some(cp) = self.code_point_with_offset(0) {
1213            if self.is_valid_identity_escape(cp) {
1214                self.last_int_value = cp as i64;
1215                self.advance();
1216                return true;
1217            }
1218        }
1219        return false;
1220    }
1221    fn is_valid_identity_escape(&self, cp: char) -> bool {
1222        if self.u_flag {
1223            return is_syntax_character(cp) || cp == '/';
1224        } else if self.strict {
1225            return !is_id_continue(cp);
1226        } else if self.n_flag {
1227            return !(cp == 'c' || cp == 'k');
1228        }
1229        return cp != 'c';
1230    }
1231
1232    /// Eat the next characters as a RegExp `DecimalEscape` production if possible.
1233    /// Set `self.last_int_value` if it ate the next characters successfully.
1234    /// ```grammar
1235    /// DecimalEscape::
1236    ///      NonZeroDigit DecimalDigits(opt) [lookahead ∉ DecimalDigit]
1237    /// ```
1238    /// Returns `true` if it ate the next characters successfully.
1239    fn eat_decimal_escape(&mut self) -> bool {
1240        self.last_int_value = 0;
1241        if let Some(cp) = self.code_point_with_offset(0) {
1242            if cp.is_digit(10) {
1243                self.last_int_value = 10 * self.last_int_value + cp.to_digit(10).unwrap() as i64;
1244                self.advance();
1245                while let Some(cp) = self.code_point_with_offset(0) {
1246                    if !cp.is_digit(10) {
1247                        break;
1248                    }
1249                    self.last_int_value = 10 * self.last_int_value + cp.to_digit(10).unwrap() as i64;
1250                    self.advance();
1251                }
1252                return true;
1253            }
1254        }
1255        return false;
1256    }
1257
1258    /// Eat the next characters as a RegExp `UnicodePropertyValueExpression` production if possible.
1259    /// Set `self.last_key_value` and `self.last_val_value` if it ate the next characters
1260    /// successfully.
1261    /// ```grammar
1262    /// UnicodePropertyValueExpression::
1263    ///      UnicodePropertyName `=` UnicodePropertyValue
1264    ///      LoneUnicodePropertyNameOrValue
1265    /// ```
1266    /// Returns `true` if it ate the next characters successfully.
1267    fn eat_unicode_property_value_expression(&mut self) -> Result<bool, &str> {
1268        let start = self.index();
1269
1270        // UnicodePropertyName `=` UnicodePropertyValue
1271        if self.eat_unicode_property_name() && self.eat('=') {
1272            self.last_key_value = self.last_str_value.clone();
1273            if self.eat_unicode_property_value() {
1274                self.last_val_value = self.last_str_value.clone();
1275                if is_valid_unicode_property(
1276                    self.ecma_version,
1277                    &self.last_key_value,
1278                    &self.last_val_value,
1279                ) {
1280                    return Ok(true);
1281                }
1282                return Err("Invalid property name");
1283            }
1284        }
1285        self.rewind(start);
1286
1287        // LoneUnicodePropertyNameOrValue
1288        if self.eat_lone_unicode_property_name_or_value() {
1289            let name_or_value = self.last_str_value.clone();
1290            if is_valid_unicode_property(self.ecma_version, "General_Category", &name_or_value) {
1291                self.last_key_value = "General_Category".to_string();
1292                self.last_val_value = name_or_value;
1293                return Ok(true);
1294            }
1295            if is_valid_lone_unicode_property(self.ecma_version, &name_or_value) {
1296                self.last_key_value = name_or_value;
1297                self.last_val_value = "".to_string();
1298                return Ok(true);
1299            }
1300            return Err("Invalid property name");
1301        }
1302        Ok(false)
1303    }
1304
1305    /// Eat the next characters as a RegExp `UnicodePropertyName` production if possible.
1306    /// Set `self.last_str_value` if it ate the next characters successfully.
1307    /// ```grammar
1308    /// UnicodePropertyName::
1309    ///      UnicodePropertyNameCharacters
1310    /// ```
1311    /// Returns `true` if it ate the next characters successfully.
1312    fn eat_unicode_property_name(&mut self) -> bool {
1313        self.last_str_value = "".to_string();
1314        while let Some(cp) = self.code_point_with_offset(0) {
1315            if !is_unicode_property_name_character(cp) {
1316                break;
1317            }
1318            self.last_str_value.push(cp);
1319            self.advance();
1320        }
1321        self.last_str_value != ""
1322    }
1323
1324    /// Eat the next characters as a RegExp `UnicodePropertyValue` production if possible.
1325    /// Set `self.last_str_value` if it ate the next characters successfully.
1326    /// ```grammar
1327    /// UnicodePropertyValue::
1328    ///      UnicodePropertyValueCharacters
1329    /// ```
1330    /// Returns `true` if it ate the next characters successfully.
1331    fn eat_unicode_property_value(&mut self) -> bool {
1332        self.last_str_value = "".to_string();
1333        while let Some(cp) = self.code_point_with_offset(0) {
1334            if !is_unicode_property_value_character(cp) {
1335                break;
1336            }
1337            self.last_str_value.push(cp);
1338            self.advance();
1339        }
1340        self.last_str_value != ""
1341    }
1342
1343    /// Eat the next characters as a RegExp `UnicodePropertyValue` production if possible.
1344    /// Set `self.last_str_value` if it ate the next characters successfully.
1345    /// ```grammar
1346    /// LoneUnicodePropertyNameOrValue::
1347    ///      UnicodePropertyValueCharacters
1348    /// ```
1349    /// Returns `true` if it ate the next characters successfully.
1350    fn eat_lone_unicode_property_name_or_value(&mut self) -> bool {
1351        self.eat_unicode_property_value()
1352    }
1353
1354    /// Eat the next characters as a `HexEscapeSequence` production if possible.
1355    /// Set `self.last_int_value` if it ate the next characters successfully.
1356    /// ```grammar
1357    /// HexEscapeSequence::
1358    ///      `x` HexDigit HexDigit
1359    /// HexDigit:: one of
1360    ///      0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
1361    /// ```
1362    /// Returns `true` if it ate the next characters successfully.
1363    fn eat_hex_escape_sequence(&mut self) -> Result<bool, &str> {
1364        let start = self.index();
1365        if self.eat('x') {
1366            if self.eat_fixed_hex_digits(2) {
1367                return Ok(true);
1368            }
1369            if self.u_flag || self.strict {
1370                return Err("Invalid escape");
1371            }
1372            self.rewind(start);
1373        }
1374        Ok(false)
1375    }
1376
1377    /// Eat the next characters as a `DecimalDigits` production if possible.
1378    /// Set `self.last_int_value` if it ate the next characters successfully.
1379    /// ```grammar
1380    /// DecimalDigits::
1381    ///      DecimalDigit
1382    ///      DecimalDigits DecimalDigit
1383    /// DecimalDigit:: one of
1384    ///      0 1 2 3 4 5 6 7 8 9
1385    /// ```
1386    /// Returns `true` if it ate the next characters successfully.
1387    fn eat_decimal_digits(&mut self) -> bool {
1388        let start = self.index();
1389
1390        self.last_int_value = 0;
1391        while let Some(cp) = self.code_point_with_offset(0) {
1392            if !cp.is_digit(10) {
1393                break;
1394            }
1395            self.last_int_value = 10 * self.last_int_value
1396                + self
1397                    .code_point_with_offset(0)
1398                    .unwrap()
1399                    .to_digit(10)
1400                    .unwrap() as i64;
1401            self.advance();
1402        }
1403
1404        return self.index() != start;
1405    }
1406
1407    /// Eat the next characters as a `HexDigits` production if possible.
1408    /// Set `self.last_int_value` if it ate the next characters successfully.
1409    /// ```grammar
1410    /// HexDigits::
1411    ///      HexDigit
1412    ///      HexDigits HexDigit
1413    /// HexDigit:: one of
1414    ///      0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
1415    /// ```
1416    /// Returns `true` if it ate the next characters successfully.
1417    fn eat_hex_digits(&mut self) -> bool {
1418        let start = self.index();
1419        self.last_int_value = 0;
1420        while let Some(cp) = self.code_point_with_offset(0) {
1421            if !cp.is_digit(16) {
1422                break;
1423            }
1424            self.last_int_value = 16 * self.last_int_value + cp.to_digit(16).unwrap() as i64;
1425            self.advance();
1426        }
1427        return self.index() != start;
1428    }
1429
1430    /// Eat the next characters as a `HexDigits` production if possible.
1431    /// Set `self.last_int_value` if it ate the next characters successfully.
1432    /// ```grammar
1433    /// LegacyOctalEscapeSequence::
1434    ///      OctalDigit [lookahead ∉ OctalDigit]
1435    ///      ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
1436    ///      FourToSeven OctalDigit
1437    ///      ZeroToThree OctalDigit OctalDigit
1438    /// OctalDigit:: one of
1439    ///      0 1 2 3 4 5 6 7
1440    /// ZeroToThree:: one of
1441    ///      0 1 2 3
1442    /// FourToSeven:: one of
1443    ///      4 5 6 7
1444    /// ```
1445    /// Returns `true` if it ate the next characters successfully.
1446    fn eat_legacy_octal_escape_sequence(&mut self) -> bool {
1447        if self.eat_octal_digit() {
1448            let n1 = self.last_int_value;
1449            if self.eat_octal_digit() {
1450                let n2 = self.last_int_value;
1451                if n1 <= 3 && self.eat_octal_digit() {
1452                    self.last_int_value = n1 * 64 + n2 * 8 + self.last_int_value
1453                } else {
1454                    self.last_int_value = n1 * 8 + n2;
1455                }
1456            } else {
1457                self.last_int_value = n1;
1458            }
1459            return true;
1460        }
1461        return false;
1462    }
1463
1464    /// Eat the next characters as a `OctalDigit` production if possible.
1465    /// Set `self.last_int_value` if it ate the next characters successfully.
1466    /// ```grammar
1467    /// OctalDigit:: one of
1468    ///      0 1 2 3 4 5 6 7
1469    /// ```
1470    /// Returns `true` if it ate the next characters successfully.
1471    fn eat_octal_digit(&mut self) -> bool {
1472        if let Some(cp) = self.code_point_with_offset(0) {
1473            if cp.is_digit(8) {
1474                self.advance();
1475                self.last_int_value = cp.to_digit(8).unwrap() as i64;
1476                return true;
1477            }
1478        }
1479        self.last_int_value = 0;
1480        return false;
1481    }
1482
1483    /// Eat the next characters as the given number of `HexDigit` productions if possible.
1484    /// Set `self.last_int_value` if it ate the next characters successfully.
1485    /// ```grammar
1486    /// HexDigit:: one of
1487    ///      0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
1488    /// ```
1489    /// Returns `true` if it ate the next characters successfully.
1490    fn eat_fixed_hex_digits(&mut self, length: i64) -> bool {
1491        let start = self.index();
1492        self.last_int_value = 0;
1493        for _ in 0..length {
1494            let cp = self.code_point_with_offset(0);
1495            if cp.is_none() || !cp.unwrap().is_digit(16) {
1496                self.rewind(start);
1497                return false;
1498            }
1499            self.last_int_value =
1500                16 * self.last_int_value + cp.unwrap().to_digit(16).unwrap() as i64;
1501            self.advance();
1502        }
1503        return true;
1504    }
1505
1506    fn count_capturing_parens(&mut self) -> u32 {
1507        let start = self.index();
1508        let mut in_class = false;
1509        let mut escaped = false;
1510        let mut count = 0;
1511
1512        while let Some(cp) = self.code_point_with_offset(0) {
1513            if escaped {
1514                escaped = false;
1515            } else if cp == '\\' {
1516                escaped = true;
1517            } else if cp == '[' {
1518                in_class = true;
1519            } else if cp == ']' {
1520                in_class = false;
1521            } else if cp == '('
1522                && !in_class
1523                && (self.code_point_with_offset(1) != Some('?')
1524                    || (self.code_point_with_offset(2) == Some('<')
1525                        && self.code_point_with_offset(3) != Some('=')
1526                        && self.code_point_with_offset(3) != Some('!')))
1527            {
1528                count += 1
1529            }
1530            self.advance();
1531        }
1532
1533        self.rewind(start);
1534        count
1535    }
1536}
1537
#[cfg(test)]
mod tests {
    use super::*;

    /// `count_capturing_parens` counts plain groups, skips non-capturing
    /// groups and handles nesting.
    #[test]
    fn count_capturing_parens_test() {
        let mut validator = EcmaRegexValidator::new(EcmaVersion::ES2018);
        // (pattern, expected number of capturing groups)
        let cases = [
            ("foo|(abc)de", 1),
            ("foo|(?:abc)de", 0),
            ("((foo)|(abc)de)", 3),
        ];
        for &(pattern, expected) in cases.iter() {
            validator.reset(pattern, 0, pattern.len(), false);
            assert_eq!(validator.count_capturing_parens(), expected);
        }
    }
}
js_regex/validator.rs

js_regex/
validator.rs