1use crate::flags::Flags;
2use std::fmt;
3
4#[derive(Debug, Clone, PartialEq)]
6pub enum AstNode {
7 Literal(char),
9
10 CharClass(CharClass),
12
13 StartAnchor,
15 EndAnchor,
17 WordBoundary,
19 StartWord,
21 EndWord,
23 SetMatchStart,
25 SetMatchEnd,
27
28 ZeroOrMore {
30 node: Box<AstNode>,
32 greedy: bool,
34 },
35 OneOrMore {
37 node: Box<AstNode>,
39 greedy: bool,
41 },
42 Optional {
44 node: Box<AstNode>,
46 greedy: bool,
48 },
49 Exact {
51 node: Box<AstNode>,
53 count: usize,
55 },
56 Range {
58 node: Box<AstNode>,
60 min: usize,
62 max: Option<usize>,
64 greedy: bool,
66 },
67
68 Group {
70 nodes: Vec<AstNode>,
72 name: Option<String>,
74 capture: bool,
76 index: Option<usize>,
78 },
79 Alternation(Vec<Vec<AstNode>>),
81
82 Backref(usize),
84
85 LookAhead {
87 nodes: Vec<AstNode>,
89 positive: bool,
91 },
92 LookBehind {
94 nodes: Vec<AstNode>,
96 positive: bool,
98 },
99}
100
101#[derive(Debug, Clone, PartialEq)]
103pub enum CharClass {
104 Digit,
107 NonDigit,
109 Word,
111 NonWord,
113 Whitespace,
115 NonWhitespace,
117
118 Lowercase,
121 NonLowercase,
123 Uppercase,
125 NonUppercase,
127 Hex,
129 NonHex,
131 Octal,
133 NonOctal,
135 WordStart,
137 NonWordStart,
139 Punctuation,
141 NonPunctuation,
143 Alphanumeric,
145 NonAlphanumeric,
147
148 Set {
151 chars: Vec<CharRange>,
153 negated: bool,
155 },
156
157 Dot,
159}
160
161#[derive(Debug, Clone, PartialEq)]
163pub struct CharRange {
164 pub start: char,
166 pub end: char,
168}
169
170#[derive(Debug, Clone)]
172pub struct Parser {
173 input: Vec<char>,
174 pos: usize,
175 flags: Flags,
176 group_count: usize,
177}
178
179#[derive(Debug)]
181pub enum ParseError {
182 UnexpectedChar(char, usize),
183 UnexpectedEof,
184 InvalidQuantifier(String),
185 UnmatchedParen,
186 InvalidGroupName(String),
187 InvalidEscape(char),
188 InvalidCharClass,
189 DuplicateGroupName(String),
190 InvalidBackref(usize),
191 InvalidLineNumber(String),
192 InvalidGroup(String),
193}
194
195impl fmt::Display for ParseError {
196 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
197 match self {
198 ParseError::UnexpectedChar(c, pos) => {
199 write!(f, "Unexpected '{}' at position {}", c, pos)
200 }
201 ParseError::UnexpectedEof => write!(f, "Unexpected end of input"),
202 ParseError::InvalidQuantifier(s) => {
203 write!(f, "Invalid quantifier: {}", s)
204 }
205 ParseError::UnmatchedParen => write!(f, "Unmatched parenthesis"),
206 ParseError::InvalidGroupName(s) => {
207 write!(f, "Invalid group name: {}", s)
208 }
209 ParseError::InvalidEscape(c) => {
210 write!(f, "Invalid escape sequence: \\{}", c)
211 }
212 ParseError::InvalidCharClass => {
213 write!(f, "Invalid character class")
214 }
215 ParseError::DuplicateGroupName(s) => {
216 write!(f, "Duplicate group name: {}", s)
217 }
218 ParseError::InvalidBackref(n) => {
219 write!(f, "Invalid backreference: \\{}", n)
220 }
221 ParseError::InvalidLineNumber(s) => {
222 write!(f, "Invalid line number: {}", s)
223 }
224 ParseError::InvalidGroup(s) => {
225 write!(f, "Invalid group syntax: {}", s)
226 }
227 }
228 }
229}
230
231impl std::error::Error for ParseError {}
232
233impl Parser {
234 pub fn new(pattern: &str, flags: Flags) -> Self {
236 Parser {
237 input: pattern.chars().collect(),
238 pos: 0,
239 flags,
240 group_count: 0,
241 }
242 }
243
244 pub fn parse(&mut self) -> Result<Vec<AstNode>, ParseError> {
246 self.parse_alternation()
247 }
248
249 fn parse_alternation(&mut self) -> Result<Vec<AstNode>, ParseError> {
251 let mut alternatives = vec![];
252 let mut current = self.parse_sequence()?;
253
254 while self.peek() == Some('|') {
255 self.consume()?;
256 alternatives.push(current);
257 current = self.parse_sequence()?;
258 }
259 alternatives.push(current);
260
261 if alternatives.len() == 1 {
262 Ok(alternatives.pop().unwrap())
263 } else {
264 Ok(vec![AstNode::Alternation(alternatives)])
265 }
266 }
267
268 fn skip_whitespace_and_comments(&mut self) {
269 if !self.flags.verbose {
270 return;
271 }
272 while self.pos < self.input.len() {
273 let ch = self.input[self.pos];
274 if ch.is_whitespace() {
275 self.pos += 1;
276 } else if ch == '#' {
277 self.pos += 1;
278 while self.pos < self.input.len() && self.input[self.pos] != '\n' {
279 self.pos += 1;
280 }
281 } else {
282 break;
283 }
284 }
285 }
286
287 fn parse_sequence(&mut self) -> Result<Vec<AstNode>, ParseError> {
289 let mut nodes = vec![];
290
291 loop {
292 self.skip_whitespace_and_comments();
293 match self.current() {
294 Some(&'|') | Some(&')') | None => break,
295 _ => {
296 let node = self.parse_atom()?;
297 let node = self.apply_quantifier(node)?;
298 nodes.push(node);
299 }
300 }
301 }
302
303 Ok(nodes)
304 }
305
306 fn parse_atom(&mut self) -> Result<AstNode, ParseError> {
308 match self.current() {
309 None => Err(ParseError::UnexpectedEof),
310 Some(&'.') => {
311 self.consume()?;
312 Ok(AstNode::CharClass(CharClass::Dot))
313 }
314 Some(&'^') => {
315 self.consume()?;
316 Ok(AstNode::StartAnchor)
317 }
318 Some(&'$') => {
319 self.consume()?;
320 Ok(AstNode::EndAnchor)
321 }
322 Some(&'[') => self.parse_char_class(),
323 Some(&'(') => self.parse_group(),
324 Some(&'\\') => self.parse_escape(),
325 Some(&ch) => {
326 self.consume()?;
327 Ok(AstNode::Literal(ch))
328 }
329 }
330 }
331
332 fn parse_escape(&mut self) -> Result<AstNode, ParseError> {
334 self.consume()?; match self.current() {
337 None => Err(ParseError::UnexpectedEof),
338 Some(&'d') => {
339 self.consume()?;
340 Ok(AstNode::CharClass(CharClass::Digit))
341 }
342 Some(&'D') => {
343 self.consume()?;
344 Ok(AstNode::CharClass(CharClass::NonDigit))
345 }
346 Some(&'w') => {
347 self.consume()?;
348 Ok(AstNode::CharClass(CharClass::Word))
349 }
350 Some(&'W') => {
351 self.consume()?;
352 Ok(AstNode::CharClass(CharClass::NonWord))
353 }
354 Some(&'s') => {
355 self.consume()?;
356 Ok(AstNode::CharClass(CharClass::Whitespace))
357 }
358 Some(&'S') => {
359 self.consume()?;
360 Ok(AstNode::CharClass(CharClass::NonWhitespace))
361 }
362 Some(&'l') => {
363 self.consume()?;
364 Ok(AstNode::CharClass(CharClass::Lowercase))
365 }
366 Some(&'L') => {
367 self.consume()?;
368 Ok(AstNode::CharClass(CharClass::NonLowercase))
369 }
370 Some(&'u') => {
371 self.consume()?;
372 Ok(AstNode::CharClass(CharClass::Uppercase))
373 }
374 Some(&'U') => {
375 self.consume()?;
376 Ok(AstNode::CharClass(CharClass::NonUppercase))
377 }
378 Some(&'x') => {
379 self.consume()?;
380 Ok(AstNode::CharClass(CharClass::Hex))
381 }
382 Some(&'X') => {
383 self.consume()?;
384 Ok(AstNode::CharClass(CharClass::NonHex))
385 }
386 Some(&'o') => {
387 self.consume()?;
388 Ok(AstNode::CharClass(CharClass::Octal))
389 }
390 Some(&'O') => {
391 self.consume()?;
392 Ok(AstNode::CharClass(CharClass::NonOctal))
393 }
394 Some(&'h') => {
395 self.consume()?;
396 Ok(AstNode::CharClass(CharClass::WordStart))
397 }
398 Some(&'H') => {
399 self.consume()?;
400 Ok(AstNode::CharClass(CharClass::NonWordStart))
401 }
402 Some(&'p') => {
403 self.consume()?;
404 Ok(AstNode::CharClass(CharClass::Punctuation))
405 }
406 Some(&'P') => {
407 self.consume()?;
408 Ok(AstNode::CharClass(CharClass::NonPunctuation))
409 }
410 Some(&'a') => {
411 self.consume()?;
412 Ok(AstNode::CharClass(CharClass::Alphanumeric))
413 }
414 Some(&'A') => {
415 self.consume()?;
416 Ok(AstNode::CharClass(CharClass::NonAlphanumeric))
417 }
418 Some(&'b') => {
419 self.consume()?;
420 Ok(AstNode::WordBoundary)
421 }
422 Some(&'<') => {
423 self.consume()?;
424 Ok(AstNode::StartWord)
425 }
426 Some(&'>') => {
427 self.consume()?;
428 Ok(AstNode::EndWord)
429 }
430 Some(&'z') => {
431 self.consume()?;
432 match self.current() {
433 Some(&'s') => {
434 self.consume()?;
435 Ok(AstNode::SetMatchStart)
436 }
437 Some(&'e') => {
438 self.consume()?;
439 Ok(AstNode::SetMatchEnd)
440 }
441 _ => Err(ParseError::InvalidEscape('z')),
442 }
443 }
444 Some(&c @ '0'..='9') => {
445 self.consume()?;
446 let digit = c.to_digit(10).unwrap() as usize;
447 Ok(AstNode::Backref(digit))
448 }
449 Some(&'n') => {
450 self.consume()?;
451 Ok(AstNode::Literal('\n'))
452 }
453 Some(&'t') => {
454 self.consume()?;
455 Ok(AstNode::Literal('\t'))
456 }
457 Some(&'r') => {
458 self.consume()?;
459 Ok(AstNode::Literal('\r'))
460 }
461 Some(&'f') => {
462 self.consume()?;
463 Ok(AstNode::Literal('\x0C'))
464 }
465 Some(&'v') => {
466 self.consume()?;
467 Ok(AstNode::Literal('\x0B'))
468 }
469 Some(&'\\') => {
470 self.consume()?;
471 Ok(AstNode::Literal('\\'))
472 }
473 Some(&ch) => {
474 self.consume()?;
475 Ok(AstNode::Literal(ch))
477 }
478 }
479 }
480
481 fn parse_group(&mut self) -> Result<AstNode, ParseError> {
483 self.consume()?; if self.current() == Some(&'?') {
486 self.consume()?;
487 self.parse_extended_group()
488 } else {
489 self.group_count += 1;
491 let index = self.group_count;
492 let nodes = self.parse_alternation()?;
493 self.expect_close_paren()?;
494 Ok(AstNode::Group {
495 nodes,
496 name: None,
497 capture: true,
498 index: Some(index),
499 })
500 }
501 }
502
503 fn parse_extended_group(&mut self) -> Result<AstNode, ParseError> {
504 match self.current() {
505 Some(&':') => {
506 self.consume()?;
507 let nodes = self.parse_alternation()?;
508 self.expect_close_paren()?;
509 Ok(AstNode::Group {
510 nodes,
511 name: None,
512 capture: false,
513 index: None,
514 })
515 }
516 Some(&'<') => {
517 self.consume()?;
518 match self.current() {
520 Some(&'=') => {
521 self.consume()?;
522 let nodes = self.parse_alternation()?;
523 self.expect_close_paren()?;
524 Ok(AstNode::LookBehind {
525 nodes,
526 positive: true,
527 })
528 }
529 Some(&'!') => {
530 self.consume()?;
531 let nodes = self.parse_alternation()?;
532 self.expect_close_paren()?;
533 Ok(AstNode::LookBehind {
534 nodes,
535 positive: false,
536 })
537 }
538 _ => {
539 let name = self.parse_group_name()?;
541 if self.current() != Some(&'>') {
542 return Err(ParseError::InvalidGroupName("expected '>'".to_string()));
543 }
544 self.consume()?;
545
546 self.group_count += 1;
547 let index = self.group_count;
548
549 let nodes = self.parse_alternation()?;
550 self.expect_close_paren()?;
551 Ok(AstNode::Group {
552 nodes,
553 name: Some(name),
554 capture: true,
555 index: Some(index),
556 })
557 }
558 }
559 }
560 Some(&'>') => {
561 self.consume()?;
562 match self.current() {
563 Some(&'=') => {
564 self.consume()?;
565 let nodes = self.parse_alternation()?;
566 self.expect_close_paren()?;
567 Ok(AstNode::LookAhead {
568 nodes,
569 positive: true,
570 })
571 }
572 Some(&'!') => {
573 self.consume()?;
574 let nodes = self.parse_alternation()?;
575 self.expect_close_paren()?;
576 Ok(AstNode::LookAhead {
577 nodes,
578 positive: false,
579 })
580 }
581 _ => Err(ParseError::InvalidGroup(
582 "Expected = or ! after ?>".to_string(),
583 )),
584 }
585 }
586 _ => Err(ParseError::InvalidGroup("Unknown extension ?".to_string())),
587 }
588 }
589
590 fn parse_group_name(&mut self) -> Result<String, ParseError> {
592 let mut name = String::new();
593
594 loop {
595 match self.current() {
596 Some(&c) if c.is_alphanumeric() || c == '_' => {
597 name.push(c);
598 self.consume()?;
599 }
600 _ => break,
601 }
602 }
603
604 if name.is_empty() {
605 return Err(ParseError::InvalidGroupName("empty name".to_string()));
606 }
607
608 Ok(name)
609 }
610
611 fn parse_char_class(&mut self) -> Result<AstNode, ParseError> {
613 self.consume()?; let negated = if self.current() == Some(&'^') {
616 self.consume()?;
617 true
618 } else {
619 false
620 };
621
622 let mut ranges = vec![];
623
624 loop {
625 match self.current() {
626 None => return Err(ParseError::UnexpectedEof),
627 Some(&']') => {
628 self.consume()?;
629 break;
630 }
631 Some(&'\\') => {
632 self.consume()?;
634 match self.current() {
635 Some(&c) => {
636 self.consume()?;
637 ranges.push(CharRange { start: c, end: c });
638 }
639 None => return Err(ParseError::UnexpectedEof),
640 }
641 }
642 Some(&c) => {
643 self.consume()?;
644 if self.current() == Some(&'-')
646 && self.peek_ahead(1).is_some()
647 && self.peek_ahead(1) != Some(&']')
648 {
649 self.consume()?;
650 match self.current() {
651 Some(&end) => {
652 self.consume()?;
653 ranges.push(CharRange { start: c, end });
654 }
655 None => return Err(ParseError::UnexpectedEof),
656 }
657 } else {
658 ranges.push(CharRange { start: c, end: c });
659 }
660 }
661 }
662 }
663
664 Ok(AstNode::CharClass(CharClass::Set {
665 chars: ranges,
666 negated,
667 }))
668 }
669
670 fn apply_quantifier(&mut self, node: AstNode) -> Result<AstNode, ParseError> {
672 self.skip_whitespace_and_comments();
673 match self.current() {
674 Some(&'*') => {
675 self.consume()?;
676 let greedy = self.current() != Some(&'?');
677 if !greedy {
678 self.consume()?;
679 }
680 Ok(AstNode::ZeroOrMore {
681 node: Box::new(node),
682 greedy,
683 })
684 }
685 Some(&'+') => {
686 self.consume()?;
687 let greedy = self.current() != Some(&'?');
688 if !greedy {
689 self.consume()?;
690 }
691 Ok(AstNode::OneOrMore {
692 node: Box::new(node),
693 greedy,
694 })
695 }
696 Some(&'?') => {
697 self.consume()?;
698 let greedy = self.current() != Some(&'?');
699 if !greedy {
700 self.consume()?;
701 }
702 Ok(AstNode::Optional {
703 node: Box::new(node),
704 greedy,
705 })
706 }
707 Some(&'{') => self.parse_bounded_quantifier(node),
708 _ => Ok(node),
709 }
710 }
711
712 fn parse_bounded_quantifier(&mut self, node: AstNode) -> Result<AstNode, ParseError> {
714 self.consume()?; let min = if self.current() == Some(&',') {
718 0
719 } else {
720 self.parse_number()?
721 };
722
723 match self.current() {
724 Some(&',') => {
725 self.consume()?;
726 let max = if self.current() == Some(&'}') {
728 None
729 } else {
730 Some(self.parse_number()?)
731 };
732
733 if self.current() != Some(&'}') {
734 return Err(ParseError::InvalidQuantifier("expected '}'".to_string()));
735 }
736 self.consume()?;
737
738 let greedy = self.current() != Some(&'?');
739 if !greedy {
740 self.consume()?;
741 }
742
743 Ok(AstNode::Range {
744 node: Box::new(node),
745 min,
746 max,
747 greedy,
748 })
749 }
750 Some(&'}') => {
751 self.consume()?;
752 Ok(AstNode::Exact {
753 node: Box::new(node),
754 count: min,
755 })
756 }
757 _ => Err(ParseError::InvalidQuantifier(
758 "expected ',' or '}'".to_string(),
759 )),
760 }
761 }
762
763 fn parse_number(&mut self) -> Result<usize, ParseError> {
765 let mut num = 0;
766 let mut found = false;
767
768 while let Some(&c @ '0'..='9') = self.current() {
769 found = true;
770 num = num * 10 + (c.to_digit(10).unwrap() as usize);
771 self.consume()?;
772 }
773
774 if !found {
775 return Err(ParseError::InvalidLineNumber("expected digits".to_string()));
776 }
777
778 Ok(num)
779 }
780
781 fn expect_close_paren(&mut self) -> Result<(), ParseError> {
782 if self.current() != Some(&')') {
783 return Err(ParseError::UnmatchedParen);
784 }
785 self.consume()?;
786 Ok(())
787 }
788
789 fn current(&self) -> Option<&char> {
791 self.input.get(self.pos)
792 }
793
794 fn peek_ahead(&self, n: usize) -> Option<&char> {
796 self.input.get(self.pos + n)
797 }
798
799 fn peek(&self) -> Option<char> {
801 self.input.get(self.pos).copied()
802 }
803
804 fn consume(&mut self) -> Result<char, ParseError> {
806 match self.current() {
807 Some(&ch) => {
808 self.pos += 1;
809 Ok(ch)
810 }
811 None => Err(ParseError::UnexpectedEof),
812 }
813 }
814}