1use std::{
5 iter::{empty, once},
6 sync::LazyLock,
7};
8
9use crate::{
10 ast::ASTNode,
11 fa::DFA,
12 unicode::{
13 GENERAL_CATEGORY_CC, GENERAL_CATEGORY_CF, GENERAL_CATEGORY_CO, GENERAL_CATEGORY_LL,
14 GENERAL_CATEGORY_LM, GENERAL_CATEGORY_LO, GENERAL_CATEGORY_LT, GENERAL_CATEGORY_LU,
15 GENERAL_CATEGORY_MC, GENERAL_CATEGORY_ME, GENERAL_CATEGORY_MN, GENERAL_CATEGORY_ND,
16 GENERAL_CATEGORY_NL, GENERAL_CATEGORY_NO, GENERAL_CATEGORY_PC, GENERAL_CATEGORY_PD,
17 GENERAL_CATEGORY_PE, GENERAL_CATEGORY_PF, GENERAL_CATEGORY_PI, GENERAL_CATEGORY_PO,
18 GENERAL_CATEGORY_PS, GENERAL_CATEGORY_SC, GENERAL_CATEGORY_SK, GENERAL_CATEGORY_SM,
19 GENERAL_CATEGORY_SO, GENERAL_CATEGORY_ZL, GENERAL_CATEGORY_ZP, GENERAL_CATEGORY_ZS,
20 iterate_general_category_c, iterate_general_category_l, iterate_general_category_m,
21 iterate_general_category_n, iterate_general_category_p, iterate_general_category_s,
22 iterate_general_category_z, seach_block_range,
23 },
24 util::{complement_ranges, difference_ranges, union_ranges},
25};
26
/// Errors reported while parsing a regular expression.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RegexpError {
    /// The expression is structurally malformed (unexpected end of input,
    /// missing delimiter, unexpected token).
    SyntaxError,
    /// A bound inside `{...}` could not be parsed as a `usize`.
    TooLargeQuantity,
    /// The lower bound of a `{min,max}` quantifier is greater than the upper bound.
    InvalidQuantifier,
    /// A character appeared where it is not allowed (e.g. an unescaped
    /// metacharacter used as a literal, or an unknown escape letter).
    InvalidCharacter,
    /// A character range `a-b` has a start greater than its end.
    InvalidCharRange,
    /// The name inside `\p{...}` / `\P{...}` is not a recognized property.
    InvalidCharProp,
    /// The block name inside `\p{Is...}` was not found by the block lookup.
    InvalidBlock,
}
37
38impl std::fmt::Display for RegexpError {
39 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40 write!(f, "{self:?}")
41 }
42}
43
// Marker impl so `RegexpError` can be used wherever a `std::error::Error` is expected;
// all trait methods keep their default implementations.
impl std::error::Error for RegexpError {}
45
/// A compiled regular expression.
///
/// The automaton is not built when the value is created; it is stored behind a
/// [`LazyLock`] and constructed the first time it is dereferenced.
#[derive(Debug)]
pub struct XSRegexp {
    // Lazily initialized DFA over `char` input, built by the boxed closure.
    fa: LazyLock<DFA<char>, Box<dyn Fn() -> DFA<char>>>,
}
50
51impl XSRegexp {
52 pub fn compile(mut regexp: &str) -> Result<XSRegexp, RegexpError> {
53 let ast = parse_regexp(&mut regexp, false)?;
54 Ok(Self {
55 fa: LazyLock::new(Box::new(move || DFA::assemble(ast.as_ref()).unwrap())),
56 })
57 }
58
59 pub fn is_match(&self, input: &str) -> bool {
60 self.fa.is_match(input.chars())
61 }
62}
63
64fn parse_regexp(regexp: &mut &str, inner: bool) -> Result<Option<ASTNode<char>>, RegexpError> {
65 let mut res = parse_branch(regexp)?;
66 while let Some(rem) = regexp.strip_prefix('|') {
67 *regexp = rem;
68 if let Some(right) = parse_branch(regexp)? {
69 if let Some(left) = res {
70 res = Some(ASTNode::Alternation(Box::new(left), Box::new(right)));
71 } else {
72 res = Some(right);
73 }
74 }
75 }
76
77 if inner && regexp.starts_with(')') {
78 *regexp = ®exp[1..];
79 Ok(res)
80 } else if !regexp.is_empty() {
81 Err(RegexpError::SyntaxError)
82 } else {
83 Ok(res)
84 }
85}
86
87fn parse_branch(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
89 let mut ret = None;
90 while !regexp.starts_with([')', '|']) && !regexp.is_empty() {
91 if let Some(right) = parse_piece(regexp)? {
92 if let Some(left) = ret {
93 ret = Some(ASTNode::Catenation(Box::new(left), Box::new(right)));
94 } else {
95 ret = Some(right)
96 }
97 }
98 }
99 Ok(ret)
100}
101
102fn parse_piece(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
104 if let Some(atom) = parse_atom(regexp)? {
105 parse_quantifier(regexp, atom).map(Some)
106 } else {
107 Ok(None)
108 }
109}
110
111fn parse_quantifier(regexp: &mut &str, atom: ASTNode<char>) -> Result<ASTNode<char>, RegexpError> {
113 match regexp.as_bytes() {
114 [b'?', ..] => {
115 *regexp = ®exp[1..];
116 Ok(ASTNode::ZeroOrOne(Box::new(atom)))
117 }
118 [b'*', ..] => {
119 *regexp = ®exp[1..];
120 Ok(ASTNode::ZeroOrMore(Box::new(atom)))
121 }
122 [b'+', ..] => {
123 *regexp = ®exp[1..];
124 Ok(ASTNode::OneOrMore(Box::new(atom)))
125 }
126 [b'{', ..] => {
127 *regexp = ®exp[1..];
128 match parse_quantity(regexp)? {
129 Quantity::QuantRange(at_least, at_most) => Ok(ASTNode::Repeat {
130 node: Box::new(atom),
131 at_least,
132 at_most,
133 }),
134 Quantity::QuantExact(exact) => Ok(ASTNode::RepeatExact(Box::new(atom), exact)),
135 }
136 }
137 _ => Ok(atom),
138 }
139}
140
/// Repetition count parsed from the inside of a `{...}` quantifier.
enum Quantity {
    /// `{min,}` (upper bound `None`) or `{min,max}` (upper bound `Some(max)`).
    QuantRange(usize, Option<usize>),
    /// `{n}`: exactly `n` repetitions.
    QuantExact(usize),
}
145
146fn parse_quantity(regexp: &mut &str) -> Result<Quantity, RegexpError> {
148 let pos = regexp
149 .as_bytes()
150 .iter()
151 .position(|&c| !c.is_ascii_digit())
152 .ok_or(RegexpError::SyntaxError)?;
153 let p = regexp[..pos]
154 .parse::<usize>()
155 .or(Err(RegexpError::TooLargeQuantity))?;
156 *regexp = ®exp[pos..];
157
158 if regexp.is_empty() {
159 return Err(RegexpError::SyntaxError);
160 }
161
162 let head = regexp.as_bytes()[0];
163 if head == b'}' {
164 Ok(Quantity::QuantExact(p))
165 } else if head == b',' {
166 *regexp = ®exp[1..];
167
168 if regexp.is_empty() {
169 return Err(RegexpError::SyntaxError);
170 }
171
172 let head = regexp.as_bytes()[0];
173 if head == b'}' {
174 Ok(Quantity::QuantRange(p, None))
175 } else if head.is_ascii_digit() {
176 let pos = regexp
177 .as_bytes()
178 .iter()
179 .position(|&c| !c.is_ascii_digit())
180 .ok_or(RegexpError::SyntaxError)?;
181 let q = regexp[..pos]
182 .parse::<usize>()
183 .or(Err(RegexpError::TooLargeQuantity))?;
184
185 if p > q {
186 Err(RegexpError::InvalidQuantifier)
187 } else {
188 Ok(Quantity::QuantRange(p, Some(q)))
189 }
190 } else {
191 Err(RegexpError::SyntaxError)
192 }
193 } else {
194 Err(RegexpError::SyntaxError)
195 }
196}
197
198fn parse_atom(regexp: &mut &str) -> Result<Option<ASTNode<char>>, RegexpError> {
200 if regexp.is_empty() {
201 return Err(RegexpError::SyntaxError);
202 }
203
204 match regexp.as_bytes() {
205 [b'(', ..] => {
206 *regexp = ®exp[1..];
208 parse_regexp(regexp, true)
209 }
210 [b'\\', b'p', ..] => parse_cat_esc(regexp).map(ASTNode::alternate_all),
214 [b'\\', b'P', ..] => parse_compl_esc(regexp).map(ASTNode::alternate_all),
216 [
218 b'\\',
219 b's' | b'S' | b'i' | b'I' | b'c' | b'C' | b'd' | b'D' | b'w' | b'W',
220 ..,
221 ] => parse_multi_char_esc(regexp).map(ASTNode::alternate_all),
222 [
224 b'\\',
225 b'n' | b'r' | b't' | b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{'
226 | b'}' | b'\x2D' | b'\x5B' | b'\x5D' | b'\x5E',
227 ..,
228 ] => parse_single_char_esc(regexp).map(ASTNode::alternate_all),
229 [b'[', ..] => {
231 *regexp = ®exp[1..];
232 let ret = parse_char_group(regexp).map(ASTNode::alternate_all);
233 if !regexp.starts_with(']') {
234 return Err(RegexpError::SyntaxError);
235 }
236 *regexp = ®exp[1..];
237 ret
238 }
239 [b'.', ..] => {
241 *regexp = ®exp[1..];
242 Ok(ASTNode::negate_all(
243 [('\n', '\n'), ('\r', '\r')].into_iter(),
244 ))
245 }
246 [c, ..] => {
248 if matches!(
249 *c,
250 b'.' | b'\\' | b'?' | b'*' | b'+' | b'(' | b')' | b'|' | b'\x5B' | b'\x5D'
251 ) {
252 return Err(RegexpError::InvalidCharacter);
253 }
254 let c = regexp.chars().next().unwrap();
255 *regexp = ®exp[c.len_utf8()..];
256 Ok(Some(ASTNode::Charcters {
257 start: c,
258 end: c,
259 negation: false,
260 }))
261 }
262 [] => Ok(None),
263 }
264}
265
266fn parse_char_group(
268 regexp: &mut &str,
269) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
270 let mut negation = false;
271 if let Some(rem) = regexp.strip_prefix('^') {
272 *regexp = rem;
273 negation = true;
274 }
275
276 let mut pos_char_group = parse_pos_char_group(regexp)?;
277 if negation {
278 pos_char_group = Box::new(complement_ranges(pos_char_group));
279 }
280 let ret = if regexp.starts_with("-[") {
281 *regexp = ®exp[2..];
282 let sub = parse_char_group(regexp)?;
283 if !regexp.starts_with(']') {
284 return Err(RegexpError::SyntaxError);
285 }
286 *regexp = ®exp[1..];
287 Box::new(difference_ranges(pos_char_group, sub))
288 } else {
289 pos_char_group
290 };
291 Ok(ret)
292}
293
294fn parse_pos_char_group(
297 regexp: &mut &str,
298) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
299 if regexp.is_empty() {
300 return Err(RegexpError::SyntaxError);
301 }
302
303 fn parse_pos_char_group_once(
304 regexp: &mut &str,
305 hypen: &mut bool,
306 ) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
307 if regexp.starts_with("-") {
308 *regexp = ®exp[1..];
313 *hypen = true;
314 return Ok(Box::new(once(('-', '-'))));
315 }
316
317 let start = if regexp.starts_with('\\') {
318 if regexp.len() < 2 {
319 return Err(RegexpError::SyntaxError);
320 }
321 if matches!(
324 regexp.as_bytes()[1],
325 b'n' | b'r'
326 | b't'
327 | b'\\'
328 | b'|'
329 | b'.'
330 | b'?'
331 | b'*'
332 | b'+'
333 | b'('
334 | b')'
335 | b'{'
336 | b'}'
337 | b'\x2D'
338 | b'\x5B'
339 | b'\x5D'
340 | b'\x5E'
341 ) {
342 parse_single_char_esc(regexp).unwrap().next().unwrap().0
343 } else {
344 return parse_char_class_esc(regexp);
345 }
346 } else {
347 let start = regexp.chars().next().unwrap();
348 *regexp = ®exp[start.len_utf8()..];
349 if matches!(start, '\x2D' | '\x5B' | '\x5D') {
350 return Err(RegexpError::InvalidCharacter);
351 }
352 start
353 };
354 if !regexp.starts_with('-') || regexp.starts_with("-[") {
355 return Ok(Box::new(once((start, start))));
359 }
360 *regexp = ®exp[1..];
361
362 if regexp.is_empty() {
363 return Err(RegexpError::SyntaxError);
364 }
365
366 let end = if regexp.starts_with('\\') {
369 parse_single_char_esc(regexp)?.next().unwrap().0
370 } else {
371 let end = regexp.chars().next().unwrap();
372 *regexp = ®exp[end.len_utf8()..];
373 if matches!(end, '\x2D' | '\x5B' | '\x5D') {
374 return Err(RegexpError::InvalidCharacter);
375 }
376 end
377 };
378
379 if start > end {
380 return Err(RegexpError::InvalidCharRange);
381 }
382 Ok(Box::new(once((start, end))))
383 }
384
385 let mut res = Box::new(empty()) as Box<dyn Iterator<Item = (char, char)>>;
386 let mut hypen = false;
387 let mut init = false;
388 let mut last_hyphen = false;
389 while !regexp.starts_with("]") && !regexp.starts_with("-[") {
390 res = Box::new(union_ranges(
391 res,
392 parse_pos_char_group_once(regexp, &mut hypen)?,
393 ));
394 if last_hyphen {
395 return Err(RegexpError::SyntaxError);
397 }
398 if init && hypen {
399 last_hyphen = true;
402 }
403 hypen = false;
404 init = true;
405 }
406
407 Ok(res)
408}
409
410fn parse_char_class_esc<'a>(
411 regexp: &mut &str,
412) -> Result<Box<dyn Iterator<Item = (char, char)> + 'a>, RegexpError> {
413 match regexp.as_bytes() {
414 [b'\\', b'p', ..] => parse_cat_esc(regexp).map(|iter| Box::new(iter) as _),
416 [b'\\', b'P', ..] => parse_compl_esc(regexp).map(|iter| Box::new(iter) as _),
418 [
420 b'\\',
421 b's' | b'S' | b'i' | b'I' | b'c' | b'C' | b'd' | b'D' | b'w' | b'W',
422 ..,
423 ] => parse_multi_char_esc(regexp),
424 [
426 b'\\',
427 b'n' | b'r' | b't' | b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{'
428 | b'}' | b'\x2D' | b'\x5B' | b'\x5D' | b'\x5E',
429 ..,
430 ] => parse_single_char_esc(regexp).map(|iter| Box::new(iter) as _),
431 _ => Err(RegexpError::SyntaxError),
432 }
433}
434
435fn parse_single_char_esc<'a>(
437 regexp: &mut &str,
438) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
439 if regexp.starts_with('\\') {
440 return Err(RegexpError::SyntaxError);
441 }
442 *regexp = ®exp[1..];
443 if regexp.is_empty() {
444 return Err(RegexpError::SyntaxError);
445 }
446 let c = regexp.as_bytes()[0];
447 let c = match c {
448 b'n' => '\n',
449 b'r' => '\r',
450 b't' => '\t',
451 c @ (b'\\' | b'|' | b'.' | b'?' | b'*' | b'+' | b'(' | b')' | b'{' | b'}' | b'-' | b'['
452 | b']' | b'^') => c as char,
453 _ => return Err(RegexpError::InvalidCharacter),
454 };
455 *regexp = ®exp[1..];
456 Ok(once((c, c)))
457}
458
459fn parse_cat_esc<'a>(
461 regexp: &mut &str,
462) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
463 if !regexp.starts_with("\\p{") {
464 return Err(RegexpError::SyntaxError);
465 }
466 *regexp = ®exp[2..];
467 let ret = parse_char_prop(regexp)?;
468 if !regexp.starts_with("}") {
469 return Err(RegexpError::SyntaxError);
470 }
471 *regexp = ®exp[1..];
472 Ok(ret)
473}
474
475fn parse_compl_esc<'a>(
477 regexp: &mut &str,
478) -> Result<impl Iterator<Item = (char, char)> + 'a, RegexpError> {
479 if !regexp.starts_with("\\P{") {
480 return Err(RegexpError::SyntaxError);
481 }
482 *regexp = ®exp[2..];
483 let ret = complement_ranges(parse_char_prop(regexp)?);
484 if !regexp.starts_with("}") {
485 return Err(RegexpError::SyntaxError);
486 }
487 *regexp = ®exp[1..];
488 Ok(ret)
489}
490
491fn parse_char_prop(
493 regexp: &mut &str,
494) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
495 if regexp.is_empty() {
496 return Err(RegexpError::SyntaxError);
497 }
498
499 let mut trim = 2;
500 let ret: Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> = match regexp.as_bytes() {
501 [b'L', b'u', ..] => Ok(Box::new(GENERAL_CATEGORY_LU.iter().copied())),
502 [b'L', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_LL.iter().copied())),
503 [b'L', b't', ..] => Ok(Box::new(GENERAL_CATEGORY_LT.iter().copied())),
504 [b'L', b'm', ..] => Ok(Box::new(GENERAL_CATEGORY_LM.iter().copied())),
505 [b'L', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_LO.iter().copied())),
506 [b'L', ..] => {
507 trim = 1;
508 Ok(Box::new(iterate_general_category_l()))
509 }
510 [b'M', b'n', ..] => Ok(Box::new(GENERAL_CATEGORY_MN.iter().copied())),
511 [b'M', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_MC.iter().copied())),
512 [b'M', b'e', ..] => Ok(Box::new(GENERAL_CATEGORY_ME.iter().copied())),
513 [b'M', ..] => {
514 trim = 1;
515 Ok(Box::new(iterate_general_category_m()))
516 }
517 [b'N', b'd', ..] => Ok(Box::new(GENERAL_CATEGORY_ND.iter().copied())),
518 [b'N', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_NL.iter().copied())),
519 [b'N', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_NO.iter().copied())),
520 [b'N', ..] => {
521 trim = 1;
522 Ok(Box::new(iterate_general_category_n()))
523 }
524 [b'P', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_PC.iter().copied())),
525 [b'P', b'd', ..] => Ok(Box::new(GENERAL_CATEGORY_PD.iter().copied())),
526 [b'P', b's', ..] => Ok(Box::new(GENERAL_CATEGORY_PS.iter().copied())),
527 [b'P', b'e', ..] => Ok(Box::new(GENERAL_CATEGORY_PE.iter().copied())),
528 [b'P', b'i', ..] => Ok(Box::new(GENERAL_CATEGORY_PI.iter().copied())),
529 [b'P', b'f', ..] => Ok(Box::new(GENERAL_CATEGORY_PF.iter().copied())),
530 [b'P', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_PO.iter().copied())),
531 [b'P', ..] => {
532 trim = 1;
533 Ok(Box::new(iterate_general_category_p()))
534 }
535 [b'Z', b's', ..] => Ok(Box::new(GENERAL_CATEGORY_ZS.iter().copied())),
536 [b'Z', b'l', ..] => Ok(Box::new(GENERAL_CATEGORY_ZL.iter().copied())),
537 [b'Z', b'p', ..] => Ok(Box::new(GENERAL_CATEGORY_ZP.iter().copied())),
538 [b'Z', ..] => {
539 trim = 1;
540 Ok(Box::new(iterate_general_category_z()))
541 }
542 [b'S', b'm', ..] => Ok(Box::new(GENERAL_CATEGORY_SM.iter().copied())),
543 [b'S', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_SC.iter().copied())),
544 [b'S', b'k', ..] => Ok(Box::new(GENERAL_CATEGORY_SK.iter().copied())),
545 [b'S', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_SO.iter().copied())),
546 [b'S', ..] => {
547 trim = 1;
548 Ok(Box::new(iterate_general_category_s()))
549 }
550 [b'C', b'c', ..] => Ok(Box::new(GENERAL_CATEGORY_CC.iter().copied())),
551 [b'C', b'f', ..] => Ok(Box::new(GENERAL_CATEGORY_CF.iter().copied())),
552 [b'C', b'o', ..] => Ok(Box::new(GENERAL_CATEGORY_CO.iter().copied())),
553 [b'C', ..] => {
555 trim = 1;
556 Ok(Box::new(iterate_general_category_c()))
557 }
558 [b'I', b's', ..] => {
559 while trim < regexp.len()
560 && matches!(regexp.as_bytes()[trim], b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'\x2D')
561 {
562 trim += 1;
563 }
564 let Some((start, end)) = seach_block_range(®exp[2..trim]) else {
565 return Err(RegexpError::InvalidBlock);
566 };
567 Ok(Box::new(once((start, end))))
568 }
569 _ => return Err(RegexpError::InvalidCharProp),
570 };
571
572 *regexp = ®exp[trim..];
573 ret
574}
575
576fn parse_multi_char_esc(
578 regexp: &mut &str,
579) -> Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> {
580 if regexp.starts_with('\\') {
581 return Err(RegexpError::SyntaxError);
582 }
583 *regexp = ®exp[1..];
584 if regexp.is_empty() {
585 return Err(RegexpError::SyntaxError);
586 }
587 let res: Result<Box<dyn Iterator<Item = (char, char)>>, RegexpError> =
588 match regexp.as_bytes()[0] {
589 c @ (b's' | b'S') => {
590 let arr = [('\t', '\t'), ('\n', '\n'), ('\r', '\r')];
591 if c.is_ascii_lowercase() {
592 Ok(Box::new(arr.into_iter()))
593 } else {
594 Ok(Box::new(complement_ranges(arr.into_iter())))
595 }
596 }
597 c @ (b'i' | b'I') => {
598 let arr = [
599 (':', ':'),
600 ('A', 'Z'),
601 ('_', '_'),
602 ('a', 'z'),
603 ('\u{C0}', '\u{D6}'),
604 ('\u{D8}', '\u{F6}'),
605 ('\u{C0}', '\u{D6}'),
606 ('\u{D8}', '\u{F6}'),
607 ('\u{F8}', '\u{2FF}'),
608 ('\u{370}', '\u{37D}'),
609 ('\u{37F}', '\u{1FFF}'),
610 ('\u{200C}', '\u{200D}'),
611 ('\u{2070}', '\u{218F}'),
612 ('\u{2C00}', '\u{2FEF}'),
613 ('\u{3001}', '\u{D7FF}'),
614 ('\u{F900}', '\u{FDCF}'),
615 ('\u{FDF0}', '\u{FFFD}'),
616 ('\u{10000}', '\u{EFFFF}'),
617 ];
618 if c.is_ascii_lowercase() {
619 Ok(Box::new(arr.into_iter()))
620 } else {
621 Ok(Box::new(complement_ranges(arr.into_iter())))
622 }
623 }
624 c @ (b'c' | b'C') => {
625 let arr = [
626 ('-', '.'),
627 ('0', '9'),
628 (':', ':'),
629 ('A', 'Z'),
630 ('_', '_'),
631 ('a', 'z'),
632 ('\u{B7}', '\u{B7}'),
633 ('\u{C0}', '\u{D6}'),
634 ('\u{D8}', '\u{F6}'),
635 ('\u{C0}', '\u{D6}'),
636 ('\u{D8}', '\u{F6}'),
637 ('\u{F8}', '\u{2FF}'),
638 ('\u{300}', '\u{37D}'),
639 ('\u{37F}', '\u{1FFF}'),
640 ('\u{200C}', '\u{200D}'),
641 ('\u{203F}', '\u{2040}'),
642 ('\u{2070}', '\u{218F}'),
643 ('\u{2C00}', '\u{2FEF}'),
644 ('\u{3001}', '\u{D7FF}'),
645 ('\u{F900}', '\u{FDCF}'),
646 ('\u{FDF0}', '\u{FFFD}'),
647 ('\u{10000}', '\u{EFFFF}'),
648 ];
649 if c.is_ascii_lowercase() {
650 Ok(Box::new(arr.into_iter()))
651 } else {
652 Ok(Box::new(complement_ranges(arr.into_iter())))
653 }
654 }
655 b'd' => Ok(Box::new(GENERAL_CATEGORY_ND.iter().copied())),
656 b'D' => Ok(Box::new(complement_ranges(
657 GENERAL_CATEGORY_ND.iter().copied(),
658 ))),
659 c @ (b'w' | b'W') => {
660 let iter = iterate_general_category_p()
661 .chain(iterate_general_category_z().chain(iterate_general_category_c()));
662 if c.is_ascii_lowercase() {
663 Ok(Box::new(iter))
664 } else {
665 Ok(Box::new(complement_ranges(iter)))
666 }
667 }
668 _ => Err(RegexpError::SyntaxError),
669 };
670
671 if res.is_ok() {
672 *regexp = ®exp[1..];
673 }
674 res
675}
676
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `pattern` and checks that every string in `accepted` matches
    /// and every string in `rejected` does not.
    fn check(pattern: &str, accepted: &[&str], rejected: &[&str]) {
        let re = XSRegexp::compile(pattern).unwrap();
        for input in accepted {
            assert!(re.is_match(input), "{pattern:?} should match {input:?}");
        }
        for input in rejected {
            assert!(!re.is_match(input), "{pattern:?} should not match {input:?}");
        }
    }

    #[test]
    fn regex_parse_tests() {
        // The parsed tree must print back as the source text.
        for source in ["a", "aa"] {
            let mut cursor = source;
            let ast = parse_regexp(&mut cursor, false).unwrap().unwrap();
            assert_eq!(ast.to_string(), source);
        }
    }

    #[test]
    fn regex_matching_tests() {
        // Literals and concatenation.
        check("", &[""], &[" ", "a"]);
        check("a", &["a"], &["", " ", "aa", "A", "b"]);
        check("aa", &["aa"], &["a", "aaa", "", " ", "AA", "b"]);
        check("ab", &["ab"], &["aa", "a", "b", "", " ab ", "AB"]);

        // Quantifiers.
        check(
            "a*",
            &["", "a", "aa", "aaa"],
            &["ab", "b", " aaa", "aaa ", "aaA"],
        );
        check(
            "a+",
            &["a", "aa", "aaa"],
            &["", "ab", "b", " aaa", "aaa ", "aaA"],
        );
        check(
            "a?",
            &["", "a"],
            &["aa", "aaa", "ab", "b", " aaa", "aaa ", "aaA"],
        );

        // Alternation and grouping.
        check(
            "a|b",
            &["a", "b"],
            &["", "aa", "aaa", "ab", " aaa", "aaa ", "A", "B"],
        );
        check(
            "a+|b?",
            &["", "a", "aa", "aaa", "b"],
            &["bb", "ab", " aaa", "aaa ", "A", "B"],
        );
        check(
            "a+c|b?c",
            &["ac", "aac", "bc", "c"],
            &[
                "", "a", "aa", "aaa", "b", "bb", "bbc", "ab", "abc", " aaa", "aaa ", "A", "B",
                "C",
            ],
        );
        check(
            "a+(c|b?)c",
            &["ac", "aac", "abc"],
            &[
                "bc", "c", "", "a", "aa", "aaa", "b", "bb", "bbc", "ab", " aaa", "aaa ", "A",
                "B", "C",
            ],
        );

        // Character classes.
        check("[abde]", &["a", "b", "d", "e"], &["", "c", "ab", "f"]);
        check("[^abde]", &["c", "f"], &["", "a", "b", "d", "e", "ab"]);
        check("[a-ce-g]", &["a", "b", "c", "e", "f", "g"], &["d", "h"]);
        check("[^a-ce-g]", &["d", "h"], &["a", "b", "c", "e", "f", "g"]);

        // Character class subtraction.
        check("[abde-[a-b]]", &["d", "e"], &["a", "b", "c", "f"]);
        check("[a-g-[d]]", &["a", "b", "c", "e", "f", "g"], &["d", "h"]);
        check(
            "[a-g-[i]]",
            &["a", "b", "c", "d", "e", "f", "g"],
            &["h", "i"],
        );
        check(
            "[^a-g-[i]]",
            &["h", "j"],
            &["a", "b", "c", "d", "e", "f", "g", "i"],
        );
        check(
            "[a-gik-m-[c-el]]",
            &["a", "b", "f", "g", "i", "k", "m"],
            &["c", "d", "e", "h", "j", "l"],
        );
    }
}