anyxml_automata/
xsregexp.rs

1//! Implement the regular expressions described in
2//! [XML Schema Part 2: Datatypes Second Edition Appendix F Regular Expressions](https://www.w3.org/TR/xmlschema-2/#regexs).
3
4use std::{
5    iter::{empty, once},
6    sync::LazyLock,
7};
8
9use crate::{
10    ast::ASTNode,
11    fa::DFA,
12    unicode::{
13        GENERAL_CATEGORY_CC, GENERAL_CATEGORY_CF, GENERAL_CATEGORY_CO, GENERAL_CATEGORY_LL,
14        GENERAL_CATEGORY_LM, GENERAL_CATEGORY_LO, GENERAL_CATEGORY_LT, GENERAL_CATEGORY_LU,
15        GENERAL_CATEGORY_MC, GENERAL_CATEGORY_ME, GENERAL_CATEGORY_MN, GENERAL_CATEGORY_ND,
16        GENERAL_CATEGORY_NL, GENERAL_CATEGORY_NO, GENERAL_CATEGORY_PC, GENERAL_CATEGORY_PD,
17        GENERAL_CATEGORY_PE, GENERAL_CATEGORY_PF, GENERAL_CATEGORY_PI, GENERAL_CATEGORY_PO,
18        GENERAL_CATEGORY_PS, GENERAL_CATEGORY_SC, GENERAL_CATEGORY_SK, GENERAL_CATEGORY_SM,
19        GENERAL_CATEGORY_SO, GENERAL_CATEGORY_ZL, GENERAL_CATEGORY_ZP, GENERAL_CATEGORY_ZS,
20        iterate_general_category_c, iterate_general_category_l, iterate_general_category_m,
21        iterate_general_category_n, iterate_general_category_p, iterate_general_category_s,
22        iterate_general_category_z, seach_block_range,
23    },
24    util::{complement_ranges, difference_ranges, union_ranges},
25};
26
/// Errors reported while parsing an XML Schema regular expression.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RegexpError {
    /// The input does not follow the regular expression grammar.
    SyntaxError,
    /// A quantity inside `{...}` could not be parsed as a `usize`.
    TooLargeQuantity,
    /// The lower bound of a `{min,max}` quantifier is greater than the upper bound.
    InvalidQuantifier,
    /// A character appears at a position where the grammar does not allow it.
    InvalidCharacter,
    /// The start of a character range is greater than its end.
    InvalidCharRange,
    /// The property in `\p{...}` / `\P{...}` is not a recognized category or block form.
    InvalidCharProp,
    /// The block name following `Is` in `\p{...}` / `\P{...}` was not found.
    InvalidBlock,
}
37
38impl std::fmt::Display for RegexpError {
39    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40        write!(f, "{self:?}")
41    }
42}
43
// `RegexpError` carries no source error, so the default `Error` methods suffice.
impl std::error::Error for RegexpError {}
45
/// A compiled XML Schema regular expression.
#[derive(Debug)]
pub struct XSRegexp {
    /// Automaton used for matching. It is produced by the boxed closure the
    /// first time the `LazyLock` is dereferenced.
    fa: LazyLock<DFA<char>, Box<dyn Fn() -> DFA<char>>>,
}
50
51impl XSRegexp {
52    pub fn compile(mut regexp: &str) -> Result<XSRegexp, RegexpError> {
53        let ast = parse_regexp(&mut regexp, false)?;
54        Ok(Self {
55            fa: LazyLock::new(Box::new(move || DFA::assemble(ast.as_ref()).unwrap())),
56        })
57    }
58
59    pub fn is_match(&self, input: &str) -> bool {
60        self.fa.is_match(input.chars())
61    }
62}
63
64fn parse_regexp(regexp: &mut &str, inner: bool) -> Result<Option<ASTNode<char>>, RegexpError> {
65    let mut res = parse_branch(regexp)?;
66    while let Some(rem) = regexp.strip_prefix('|') {
67        *regexp = rem;
68        if let Some(right) = parse_branch(regexp)? {
69            if let Some(left) = res {
70                res = Some(ASTNode::Alternation(Box::new(left), Box::new(right)));
71            } else {
72                res = Some(right);
73            }
74        }
75    }
76
77    if inner && regexp.starts_with(')') {
78        *regexp = &regexp[1..];
79        Ok(res)
80    } else if !regexp.is_empty() {
81        Err(RegexpError::SyntaxError)
82    } else {
83        Ok(res)
84    }
85}
86
87/// [2] branch ::= piece*
88fn parse_branch(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
89    let mut ret = None;
90    while !regexp.starts_with([')', '|']) && !regexp.is_empty() {
91        if let Some(right) = parse_piece(regexp)? {
92            if let Some(left) = ret {
93                ret = Some(ASTNode::Catenation(Box::new(left), Box::new(right)));
94            } else {
95                ret = Some(right)
96            }
97        }
98    }
99    Ok(ret)
100}
101
102/// [3] piece ::= atom quantifier?
103fn parse_piece(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
104    if let Some(atom) = parse_atom(regexp)? {
105        parse_quantifier(regexp, atom).map(Some)
106    } else {
107        Ok(None)
108    }
109}
110
111/// [4] quantifier ::= [?*+] | ( '{' quantity '}' )
112fn parse_quantifier(regexp: &mut &str, atom: ASTNode<char>) -> Result<ASTNode<char>, RegexpError> {
113    match regexp.as_bytes() {
114        [b'?', ..] => {
115            *regexp = &regexp[1..];
116            Ok(ASTNode::ZeroOrOne(Box::new(atom)))
117        }
118        [b'*', ..] => {
119            *regexp = &regexp[1..];
120            Ok(ASTNode::ZeroOrMore(Box::new(atom)))
121        }
122        [b'+', ..] => {
123            *regexp = &regexp[1..];
124            Ok(ASTNode::OneOrMore(Box::new(atom)))
125        }
126        [b'{', ..] => {
127            *regexp = &regexp[1..];
128            match parse_quantity(regexp)? {
129                Quantity::QuantRange(at_least, at_most) => Ok(ASTNode::Repeat {
130                    node: Box::new(atom),
131                    at_least,
132                    at_most,
133                }),
134                Quantity::QuantExact(exact) => Ok(ASTNode::RepeatExact(Box::new(atom), exact)),
135            }
136        }
137        _ => Ok(atom),
138    }
139}
140
/// Parsed contents of a `{...}` quantifier.
enum Quantity {
    /// `{min,max}` or `{min,}`: the lower bound and the optional upper bound.
    QuantRange(usize, Option<usize>),
    /// `{n}`: an exact repetition count.
    QuantExact(usize),
}
145
146/// [5] quantity ::= quantRange | quantMin | QuantExact
147fn parse_quantity(regexp: &mut &str) -> Result<Quantity, RegexpError> {
148    let pos = regexp
149        .as_bytes()
150        .iter()
151        .position(|&c| !c.is_ascii_digit())
152        .ok_or(RegexpError::SyntaxError)?;
153    let p = regexp[..pos]
154        .parse::<usize>()
155        .or(Err(RegexpError::TooLargeQuantity))?;
156    *regexp = &regexp[pos..];
157
158    if regexp.is_empty() {
159        return Err(RegexpError::SyntaxError);
160    }
161
162    let head = regexp.as_bytes()[0];
163    if head == b'}' {
164        Ok(Quantity::QuantExact(p))
165    } else if head == b',' {
166        *regexp = &regexp[1..];
167
168        if regexp.is_empty() {
169            return Err(RegexpError::SyntaxError);
170        }
171
172        let head = regexp.as_bytes()[0];
173        if head == b'}' {
174            Ok(Quantity::QuantRange(p, None))
175        } else if head.is_ascii_digit() {
176            let pos = regexp
177                .as_bytes()
178                .iter()
179                .position(|&c| !c.is_ascii_digit())
180                .ok_or(RegexpError::SyntaxError)?;
181            let q = regexp[..pos]
182                .parse::<usize>()
183                .or(Err(RegexpError::TooLargeQuantity))?;
184
185            if p > q {
186                Err(RegexpError::InvalidQuantifier)
187            } else {
188                Ok(Quantity::QuantRange(p, Some(q)))
189            }
190        } else {
191            Err(RegexpError::SyntaxError)
192        }
193    } else {
194        Err(RegexpError::SyntaxError)
195    }
196}
197
198/// [9] atom ::= Char | charClass | ( '(' regExp ')' )
199fn parse_atom(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
200    if regexp.is_empty() {
201        return Err(RegexpError::SyntaxError);
202    }
203
204    match regexp.as_bytes() {
205        [b'(', ..] => {
206            // for 'regExp'
207            *regexp = &regexp[1..];
208            parse_regexp(regexp, true)
209        }
210        // [11] charClass         ::= charClassEsc | charClassExpr | WildcardEsc
211        // [23] charClassEsc      ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
212        // [25] catEsc        ::= '\p{' charProp '}'
213        [b'\\', b'p', ..] => parse_cat_esc(regexp).map(ASTNode::alternate_all),
214        // [26] complEsc      ::= '\P{' charProp '}'
215        [b'\\', b'P', ..] => parse_compl_esc(regexp).map(ASTNode::alternate_all),
216        // [37] MultiCharEsc  ::= '\' [sSiIcCdDwW]
217        [
218            b'\\',
219            b's' | b'S' | b'i' | b'I' | b'c' | b'C' | b'd' | b'D' | b'w' | b'W',
220            ..,
221        ] => parse_multi_char_esc(regexp).map(ASTNode::alternate_all),
222        // [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
223        [
224            b'\\',
225            b'n' | b'r' | b't' | b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{'
226            | b'}' | b'\x2D' | b'\x5B' | b'\x5D' | b'\x5E',
227            ..,
228        ] => parse_single_char_esc(regexp).map(ASTNode::alternate_all),
229        // [12] charClassExpr ::= '[' charGroup ']'
230        [b'[', ..] => {
231            *regexp = &regexp[1..];
232            let ret = parse_char_group(regexp).map(ASTNode::alternate_all);
233            if !regexp.starts_with(']') {
234                return Err(RegexpError::SyntaxError);
235            }
236            *regexp = &regexp[1..];
237            ret
238        }
239        // [37a] WildcardEsc  ::= '.'
240        [b'.', ..] => {
241            *regexp = &regexp[1..];
242            Ok(ASTNode::negate_all(
243                [('\n', '\n'), ('\r', '\r')].into_iter(),
244            ))
245        }
246        // [10] Char          ::= [^.\?*+()|#x5B#x5D]
247        [c, ..] => {
248            if matches!(
249                *c,
250                b'.' | b'\\' | b'?' | b'*' | b'+' | b'(' | b')' | b'|' | b'\x5B' | b'\x5D'
251            ) {
252                return Err(RegexpError::InvalidCharacter);
253            }
254            let c = regexp.chars().next().unwrap();
255            *regexp = &regexp[c.len_utf8()..];
256            Ok(Some(ASTNode::Charcters {
257                start: c,
258                end: c,
259                negation: false,
260            }))
261        }
262        [] => Ok(None),
263    }
264}
265
266/// [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
267fn parse_char_group(
268    regexp: &mut &str,
269) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
270    let mut negation = false;
271    if let Some(rem) = regexp.strip_prefix('^') {
272        *regexp = rem;
273        negation = true;
274    }
275
276    let mut pos_char_group = parse_pos_char_group(regexp)?;
277    if negation {
278        pos_char_group = Box::new(complement_ranges(pos_char_group));
279    }
280    let ret = if regexp.starts_with("-[") {
281        *regexp = &regexp[2..];
282        let sub = parse_char_group(regexp)?;
283        if !regexp.starts_with(']') {
284            return Err(RegexpError::SyntaxError);
285        }
286        *regexp = &regexp[1..];
287        Box::new(difference_ranges(pos_char_group, sub))
288    } else {
289        pos_char_group
290    };
291    Ok(ret)
292}
293
294/// [14] posCharGroup ::= ( charRange | charClassEsc )+
295/// [15] negCharGroup ::= '^' posCharGroup
296fn parse_pos_char_group(
297    regexp: &mut &str,
298) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
299    if regexp.is_empty() {
300        return Err(RegexpError::SyntaxError);
301    }
302
303    fn parse_pos_char_group_once(
304        regexp: &mut &str,
305        hypen: &mut bool,
306    ) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
307        if regexp.starts_with("-") {
308            // A hyphen is only allowed at this position in XmlCharIncDash.
309            //
310            // [17] charRange      ::= seRange | XmlCharIncDash
311            // [22] XmlCharIncDash ::= [^\#x5B#x5D]
312            *regexp = &regexp[1..];
313            *hypen = true;
314            return Ok(Box::new(once(('-', '-'))));
315        }
316
317        let start = if regexp.starts_with('\\') {
318            if regexp.len() < 2 {
319                return Err(RegexpError::SyntaxError);
320            }
321            // If the first character is SingleCharEsc, it is necessary
322            // to determine whether it is charRange or charClassEsc.
323            if matches!(
324                regexp.as_bytes()[1],
325                b'n' | b'r'
326                    | b't'
327                    | b'\\'
328                    | b'|'
329                    | b'.'
330                    | b'?'
331                    | b'*'
332                    | b'+'
333                    | b'('
334                    | b')'
335                    | b'{'
336                    | b'}'
337                    | b'\x2D'
338                    | b'\x5B'
339                    | b'\x5D'
340                    | b'\x5E'
341            ) {
342                parse_single_char_esc(regexp).unwrap().next().unwrap().0
343            } else {
344                return parse_char_class_esc(regexp);
345            }
346        } else {
347            let start = regexp.chars().next().unwrap();
348            *regexp = &regexp[start.len_utf8()..];
349            if matches!(start, '\x2D' | '\x5B' | '\x5D') {
350                return Err(RegexpError::InvalidCharacter);
351            }
352            start
353        };
354        if !regexp.starts_with('-') || regexp.starts_with("-[") {
355            // If there are no consecutive a hyphen, this is XmlCharIncDash or charClassEsc.
356            // If “-” continues, character class subtraction begins,
357            // so it is still XmlCharIncDash or charClassEsc.
358            return Ok(Box::new(once((start, start))));
359        }
360        *regexp = &regexp[1..];
361
362        if regexp.is_empty() {
363            return Err(RegexpError::SyntaxError);
364        }
365
366        // If there are consecutive a hyphen, this is seRange.
367        // Furthermore, if there are consecutive backslashes, it is SingleCharEsc.
368        let end = if regexp.starts_with('\\') {
369            parse_single_char_esc(regexp)?.next().unwrap().0
370        } else {
371            let end = regexp.chars().next().unwrap();
372            *regexp = &regexp[end.len_utf8()..];
373            if matches!(end, '\x2D' | '\x5B' | '\x5D') {
374                return Err(RegexpError::InvalidCharacter);
375            }
376            end
377        };
378
379        if start > end {
380            return Err(RegexpError::InvalidCharRange);
381        }
382        Ok(Box::new(once((start, end))))
383    }
384
385    let mut res = Box::new(empty()) as Box<dyn Iterator<Item = (char, char)>>;
386    let mut hypen = false;
387    let mut init = false;
388    let mut last_hyphen = false;
389    while !regexp.starts_with("]") && !regexp.starts_with("-[") {
390        res = Box::new(union_ranges(
391            res,
392            parse_pos_char_group_once(regexp, &mut hypen)?,
393        ));
394        if last_hyphen {
395            // If the hyphen that should be at the end is already present, it is incorrect.
396            return Err(RegexpError::SyntaxError);
397        }
398        if init && hypen {
399            // If there is a hyphen at any point other than the start of a loop,
400            // it must be the last hyphen.
401            last_hyphen = true;
402        }
403        hypen = false;
404        init = true;
405    }
406
407    Ok(res)
408}
409
410fn parse_char_class_esc<'a>(
411    regexp: &mut &str,
412) -> Result<Box<dyn Iterator<Item = (char, char)> + 'a>, RegexpError> {
413    match regexp.as_bytes() {
414        // [25] catEsc        ::= '\p{' charProp '}'
415        [b'\\', b'p', ..] => parse_cat_esc(regexp).map(|iter| Box::new(iter) as _),
416        // [26] complEsc      ::= '\P{' charProp '}'
417        [b'\\', b'P', ..] => parse_compl_esc(regexp).map(|iter| Box::new(iter) as _),
418        // [37] MultiCharEsc  ::= '\' [sSiIcCdDwW]
419        [
420            b'\\',
421            b's' | b'S' | b'i' | b'I' | b'c' | b'C' | b'd' | b'D' | b'w' | b'W',
422            ..,
423        ] => parse_multi_char_esc(regexp),
424        // [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
425        [
426            b'\\',
427            b'n' | b'r' | b't' | b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{'
428            | b'}' | b'\x2D' | b'\x5B' | b'\x5D' | b'\x5E',
429            ..,
430        ] => parse_single_char_esc(regexp).map(|iter| Box::new(iter) as _),
431        _ => Err(RegexpError::SyntaxError),
432    }
433}
434
435/// [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] /* N.B.:  #x2D = '-', #x5B = '[', #x5D = ']', #x5E = '^' */
436fn parse_single_char_esc<'a>(
437    regexp: &mut &str,
438) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
439    if regexp.starts_with('\\') {
440        return Err(RegexpError::SyntaxError);
441    }
442    *regexp = &regexp[1..];
443    if regexp.is_empty() {
444        return Err(RegexpError::SyntaxError);
445    }
446    let c = regexp.as_bytes()[0];
447    let c = match c {
448        b'n' => '\n',
449        b'r' => '\r',
450        b't' => '\t',
451        c @ (b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{' | b'}' | b'-' | b'['
452        | b']' | b'^') => c as char,
453        _ => return Err(RegexpError::InvalidCharacter),
454    };
455    *regexp = &regexp[1..];
456    Ok(once((c, c)))
457}
458
459/// [25] catEsc ::= '\p{' charProp '}'
460fn parse_cat_esc<'a>(
461    regexp: &mut &str,
462) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
463    if !regexp.starts_with("\\p{") {
464        return Err(RegexpError::SyntaxError);
465    }
466    *regexp = &regexp[2..];
467    let ret = parse_char_prop(regexp)?;
468    if !regexp.starts_with("}") {
469        return Err(RegexpError::SyntaxError);
470    }
471    *regexp = &regexp[1..];
472    Ok(ret)
473}
474
475/// [26] complEsc ::= '\P{' charProp '}'
476fn parse_compl_esc<'a>(
477    regexp: &mut &str,
478) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
479    if !regexp.starts_with("\\P{") {
480        return Err(RegexpError::SyntaxError);
481    }
482    *regexp = &regexp[2..];
483    let ret = complement_ranges(parse_char_prop(regexp)?);
484    if !regexp.starts_with("}") {
485        return Err(RegexpError::SyntaxError);
486    }
487    *regexp = &regexp[1..];
488    Ok(ret)
489}
490
491/// [27] charProp ::= IsCategory | IsBlock
492fn parse_char_prop(
493    regexp: &mut &str,
494) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
495    if regexp.is_empty() {
496        return Err(RegexpError::SyntaxError);
497    }
498
499    let mut trim = 2;
500    let ret: Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> = match regexp.as_bytes() {
501        [b'L', b'u', ..] => Ok(Box::new(GENERAL_CATEGORY_LU.iter().copied())),
502        [b'L', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_LL.iter().copied())),
503        [b'L', b't', ..] => Ok(Box::new(GENERAL_CATEGORY_LT.iter().copied())),
504        [b'L', b'm', ..] => Ok(Box::new(GENERAL_CATEGORY_LM.iter().copied())),
505        [b'L', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_LO.iter().copied())),
506        [b'L', ..] => {
507            trim = 1;
508            Ok(Box::new(iterate_general_category_l()))
509        }
510        [b'M', b'n', ..] => Ok(Box::new(GENERAL_CATEGORY_MN.iter().copied())),
511        [b'M', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_MC.iter().copied())),
512        [b'M', b'e', ..] => Ok(Box::new(GENERAL_CATEGORY_ME.iter().copied())),
513        [b'M', ..] => {
514            trim = 1;
515            Ok(Box::new(iterate_general_category_m()))
516        }
517        [b'N', b'd', ..] => Ok(Box::new(GENERAL_CATEGORY_ND.iter().copied())),
518        [b'N', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_NL.iter().copied())),
519        [b'N', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_NO.iter().copied())),
520        [b'N', ..] => {
521            trim = 1;
522            Ok(Box::new(iterate_general_category_n()))
523        }
524        [b'P', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_PC.iter().copied())),
525        [b'P', b'd', ..] => Ok(Box::new(GENERAL_CATEGORY_PD.iter().copied())),
526        [b'P', b's', ..] => Ok(Box::new(GENERAL_CATEGORY_PS.iter().copied())),
527        [b'P', b'e', ..] => Ok(Box::new(GENERAL_CATEGORY_PE.iter().copied())),
528        [b'P', b'i', ..] => Ok(Box::new(GENERAL_CATEGORY_PI.iter().copied())),
529        [b'P', b'f', ..] => Ok(Box::new(GENERAL_CATEGORY_PF.iter().copied())),
530        [b'P', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_PO.iter().copied())),
531        [b'P', ..] => {
532            trim = 1;
533            Ok(Box::new(iterate_general_category_p()))
534        }
535        [b'Z', b's', ..] => Ok(Box::new(GENERAL_CATEGORY_ZS.iter().copied())),
536        [b'Z', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_ZL.iter().copied())),
537        [b'Z', b'p', ..] => Ok(Box::new(GENERAL_CATEGORY_ZP.iter().copied())),
538        [b'Z', ..] => {
539            trim = 1;
540            Ok(Box::new(iterate_general_category_z()))
541        }
542        [b'S', b'm', ..] => Ok(Box::new(GENERAL_CATEGORY_SM.iter().copied())),
543        [b'S', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_SC.iter().copied())),
544        [b'S', b'k', ..] => Ok(Box::new(GENERAL_CATEGORY_SK.iter().copied())),
545        [b'S', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_SO.iter().copied())),
546        [b'S', ..] => {
547            trim = 1;
548            Ok(Box::new(iterate_general_category_s()))
549        }
550        [b'C', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_CC.iter().copied())),
551        [b'C', b'f', ..] => Ok(Box::new(GENERAL_CATEGORY_CF.iter().copied())),
552        [b'C', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_CO.iter().copied())),
553        // [b'C', b'n', ..] => Ok(GENERAL_CATEGORY_CN.iter().copied()),
554        [b'C', ..] => {
555            trim = 1;
556            Ok(Box::new(iterate_general_category_c()))
557        }
558        [b'I', b's', ..] => {
559            while trim < regexp.len()
560                && matches!(regexp.as_bytes()[trim], b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'\x2D')
561            {
562                trim += 1;
563            }
564            let Some((start, end)) = seach_block_range(&regexp[2..trim]) else {
565                return Err(RegexpError::InvalidBlock);
566            };
567            Ok(Box::new(once((start, end))))
568        }
569        _ => return Err(RegexpError::InvalidCharProp),
570    };
571
572    *regexp = &regexp[trim..];
573    ret
574}
575
576/// [37] MultiCharEsc  ::= '\' [sSiIcCdDwW]
577fn parse_multi_char_esc(
578    regexp: &mut &str,
579) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
580    if regexp.starts_with('\\') {
581        return Err(RegexpError::SyntaxError);
582    }
583    *regexp = &regexp[1..];
584    if regexp.is_empty() {
585        return Err(RegexpError::SyntaxError);
586    }
587    let res: Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> =
588        match regexp.as_bytes()[0] {
589            c @ (b's' | b'S') => {
590                let arr = [('\t', '\t'), ('\n', '\n'), ('\r', '\r')];
591                if c.is_ascii_lowercase() {
592                    Ok(Box::new(arr.into_iter()))
593                } else {
594                    Ok(Box::new(complement_ranges(arr.into_iter())))
595                }
596            }
597            c @ (b'i' | b'I') => {
598                let arr = [
599                    (':', ':'),
600                    ('A', 'Z'),
601                    ('_', '_'),
602                    ('a', 'z'),
603                    ('\u{C0}', '\u{D6}'),
604                    ('\u{D8}', '\u{F6}'),
605                    ('\u{C0}', '\u{D6}'),
606                    ('\u{D8}', '\u{F6}'),
607                    ('\u{F8}', '\u{2FF}'),
608                    ('\u{370}', '\u{37D}'),
609                    ('\u{37F}', '\u{1FFF}'),
610                    ('\u{200C}', '\u{200D}'),
611                    ('\u{2070}', '\u{218F}'),
612                    ('\u{2C00}', '\u{2FEF}'),
613                    ('\u{3001}', '\u{D7FF}'),
614                    ('\u{F900}', '\u{FDCF}'),
615                    ('\u{FDF0}', '\u{FFFD}'),
616                    ('\u{10000}', '\u{EFFFF}'),
617                ];
618                if c.is_ascii_lowercase() {
619                    Ok(Box::new(arr.into_iter()))
620                } else {
621                    Ok(Box::new(complement_ranges(arr.into_iter())))
622                }
623            }
624            c @ (b'c' | b'C') => {
625                let arr = [
626                    ('-', '.'),
627                    ('0', '9'),
628                    (':', ':'),
629                    ('A', 'Z'),
630                    ('_', '_'),
631                    ('a', 'z'),
632                    ('\u{B7}', '\u{B7}'),
633                    ('\u{C0}', '\u{D6}'),
634                    ('\u{D8}', '\u{F6}'),
635                    ('\u{C0}', '\u{D6}'),
636                    ('\u{D8}', '\u{F6}'),
637                    ('\u{F8}', '\u{2FF}'),
638                    ('\u{300}', '\u{37D}'),
639                    ('\u{37F}', '\u{1FFF}'),
640                    ('\u{200C}', '\u{200D}'),
641                    ('\u{203F}', '\u{2040}'),
642                    ('\u{2070}', '\u{218F}'),
643                    ('\u{2C00}', '\u{2FEF}'),
644                    ('\u{3001}', '\u{D7FF}'),
645                    ('\u{F900}', '\u{FDCF}'),
646                    ('\u{FDF0}', '\u{FFFD}'),
647                    ('\u{10000}', '\u{EFFFF}'),
648                ];
649                if c.is_ascii_lowercase() {
650                    Ok(Box::new(arr.into_iter()))
651                } else {
652                    Ok(Box::new(complement_ranges(arr.into_iter())))
653                }
654            }
655            b'd' => Ok(Box::new(GENERAL_CATEGORY_ND.iter().copied())),
656            b'D' => Ok(Box::new(complement_ranges(
657                GENERAL_CATEGORY_ND.iter().copied(),
658            ))),
659            c @ (b'w' | b'W') => {
660                let iter = iterate_general_category_p()
661                    .chain(iterate_general_category_z().chain(iterate_general_category_c()));
662                if c.is_ascii_lowercase() {
663                    Ok(Box::new(iter))
664                } else {
665                    Ok(Box::new(complement_ranges(iter)))
666                }
667            }
668            _ => Err(RegexpError::SyntaxError),
669        };
670
671    if res.is_ok() {
672        *regexp = &regexp[1..];
673    }
674    res
675}
676
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `pattern` and asserts that every string in `accepted` matches
    /// and that no string in `rejected` does. Failure messages name the
    /// pattern and the offending input.
    fn check(pattern: &str, accepted: &[&str], rejected: &[&str]) {
        let re = XSRegexp::compile(pattern).unwrap();
        for input in accepted {
            assert!(re.is_match(input), "{pattern:?} should match {input:?}");
        }
        for input in rejected {
            assert!(
                !re.is_match(input),
                "{pattern:?} should not match {input:?}"
            );
        }
    }

    #[test]
    fn regex_parse_tests() {
        let ast = parse_regexp(&mut "a", false).unwrap().unwrap();
        assert_eq!(format!("{ast}"), "a");

        let ast = parse_regexp(&mut "aa", false).unwrap().unwrap();
        assert_eq!(format!("{ast}"), "aa");

        // TODO: It is difficult to output correctly without adding `Group` to AST.
        // let mut test = "a+(c|b?)c";
        // let ast = parse_regexp(&mut test, false);
        // assert_eq!(test, "");
        // let ast = ast.unwrap().unwrap();
        // assert_eq!(format!("{ast}"), "a+(c|b?)c");
    }

    #[test]
    fn regex_matching_tests() {
        // Literals and concatenation.
        check("", &[""], &["   ", "a"]);
        check("a", &["a"], &["", "   ", "aa", "A", "b"]);
        check("aa", &["aa"], &["a", "aaa", "", "   ", "AA", "b"]);
        check("ab", &["ab"], &["aa", "a", "b", "", " ab ", "AB"]);

        // Quantifiers.
        check(
            "a*",
            &["", "a", "aa", "aaa"],
            &["ab", "b", " aaa", "aaa ", "aaA"],
        );
        check(
            "a+",
            &["a", "aa", "aaa"],
            &["", "ab", "b", " aaa", "aaa ", "aaA"],
        );
        check(
            "a?",
            &["", "a"],
            &["aa", "aaa", "ab", "b", " aaa", "aaa ", "aaA"],
        );

        // Alternation and grouping.
        check(
            "a|b",
            &["a", "b"],
            &["", "aa", "aaa", "ab", " aaa", "aaa ", "A", "B"],
        );
        check(
            "a+|b?",
            &["", "a", "aa", "aaa", "b"],
            &["bb", "ab", " aaa", "aaa ", "A", "B"],
        );
        check(
            "a+c|b?c",
            &["ac", "aac", "bc", "c"],
            &[
                "", "a", "aa", "aaa", "b", "bb", "bbc", "ab", "abc", " aaa", "aaa ", "A", "B",
                "C",
            ],
        );
        check(
            "a+(c|b?)c",
            &["ac", "aac", "abc"],
            &[
                "bc", "c", "", "a", "aa", "aaa", "b", "bb", "bbc", "ab", " aaa", "aaa ", "A",
                "B", "C",
            ],
        );

        // Character classes.
        check("[abde]", &["a", "b", "d", "e"], &["", "c", "ab", "f"]);
        check("[^abde]", &["c", "f"], &["", "a", "b", "d", "e", "ab"]);
        check("[a-ce-g]", &["a", "b", "c", "e", "f", "g"], &["d", "h"]);
        check("[^a-ce-g]", &["d", "h"], &["a", "b", "c", "e", "f", "g"]);

        // Character class subtraction.
        check("[abde-[a-b]]", &["d", "e"], &["a", "b", "c", "f"]);
        check("[a-g-[d]]", &["a", "b", "c", "e", "f", "g"], &["d", "h"]);
        check(
            "[a-g-[i]]",
            &["a", "b", "c", "d", "e", "f", "g"],
            &["h", "i"],
        );
        check(
            "[^a-g-[i]]",
            &["h", "j"],
            &["a", "b", "c", "d", "e", "f", "g", "i"],
        );
        check(
            "[a-gik-m-[c-el]]",
            &["a", "b", "f", "g", "i", "k", "m"],
            &["c", "d", "e", "h", "j", "l"],
        );
    }
}
924}