1use crate::error::{Error, Result};
4
5#[derive(Debug, Clone, PartialEq)]
7pub enum Token {
8 Char(char),
10 Escaped(char),
12 NamedClass(NamedClassToken),
14 NamedList(String),
16 Backreference(usize),
18 OpenParen,
20 CloseParen,
22 OpenBracket,
24 CloseBracket,
26 OpenBrace,
28 CloseBrace,
30 Pipe,
32 Caret,
34 Dollar,
36 Dot,
38 Star,
40 Plus,
42 Question,
44 StarPossessive,
46 PlusPossessive,
48 QuestionPossessive,
50 Tilde,
52 Hyphen,
54 NonCapturing,
56 PositiveLookahead,
58 NegativeLookahead,
60 PositiveLookbehind,
62 NegativeLookbehind,
64 NamedGroup(String),
66 BestMatch,
68 EnhanceMatch,
70 PosixMatch,
72 Verbose,
74 DotAll,
76 MultiLine,
78 Ungreedy,
80 CaseInsensitive,
82 Global,
84 Unicode,
86 ResetMatchStart,
88 AtomicGroup,
90 RecursivePattern,
92 RecursiveGroup(usize),
94 RecursiveNamedGroup(String),
96 Eof,
98}
99
100#[derive(Debug, Clone, Copy, PartialEq, Eq)]
102pub enum NamedClassToken {
103 Digit,
105 NotDigit,
107 Word,
109 NotWord,
111 Whitespace,
113 NotWhitespace,
115 WordBoundary,
117 NotWordBoundary,
119}
120
121pub struct Lexer<'a> {
123 input: &'a str,
124 chars: std::iter::Peekable<std::str::CharIndices<'a>>,
125 position: usize,
126 verbose: bool,
128}
129
130impl<'a> Lexer<'a> {
131 #[must_use]
133 pub fn new(input: &'a str) -> Self {
134 Lexer {
135 input,
136 chars: input.char_indices().peekable(),
137 position: 0,
138 verbose: false,
139 }
140 }
141
142 #[must_use]
144 pub fn new_with_flags(input: &'a str, verbose: bool) -> Self {
145 Lexer {
146 input,
147 chars: input.char_indices().peekable(),
148 position: 0,
149 verbose,
150 }
151 }
152
153 #[must_use]
155 pub fn position(&self) -> usize {
156 self.position
157 }
158
159 pub fn set_position(&mut self, pos: usize) {
161 self.position = pos;
162 self.chars = self.input[pos..].char_indices().peekable();
164 }
165
166 fn peek_char(&mut self) -> Option<char> {
168 self.chars.peek().map(|(_, ch)| *ch)
169 }
170
171 fn next_char(&mut self) -> Option<char> {
173 if let Some((pos, ch)) = self.chars.next() {
174 self.position = pos + ch.len_utf8();
175 Some(ch)
176 } else {
177 None
178 }
179 }
180
181 pub fn try_match(&mut self, s: &str) -> bool {
186 let remaining = &self.input[self.position..];
187 if remaining.starts_with(s) {
188 for _ in 0..s.chars().count() {
189 self.next_char();
190 }
191 true
192 } else {
193 false
194 }
195 }
196
197 fn skip_verbose_whitespace(&mut self) {
199 while let Some(ch) = self.peek_char() {
200 if ch.is_whitespace() {
201 self.next_char();
202 } else if ch == '#' {
203 self.next_char(); while let Some(c) = self.peek_char() {
206 if c == '\n' {
207 self.next_char();
208 break;
209 }
210 self.next_char();
211 }
212 } else {
213 break;
214 }
215 }
216 }
217
218 pub fn next_token(&mut self) -> Result<Token> {
223 if self.verbose {
225 self.skip_verbose_whitespace();
226 }
227
228 let Some(ch) = self.next_char() else {
229 return Ok(Token::Eof);
230 };
231
232 match ch {
233 '(' => self.lex_group_start(),
234 ')' => Ok(Token::CloseParen),
235 '[' => Ok(Token::OpenBracket),
236 ']' => Ok(Token::CloseBracket),
237 '{' => Ok(Token::OpenBrace),
238 '}' => Ok(Token::CloseBrace),
239 '|' => Ok(Token::Pipe),
240 '^' => Ok(Token::Caret),
241 '$' => Ok(Token::Dollar),
242 '.' => Ok(Token::Dot),
243 '*' => Ok(Token::Star),
244 '+' => Ok(Token::Plus),
245 '?' => Ok(Token::Question),
246 '~' => Ok(Token::Tilde),
247 '-' => Ok(Token::Hyphen),
248 '\\' => self.lex_escape(),
249 _ => Ok(Token::Char(ch)),
250 }
251 }
252
253 #[allow(clippy::too_many_lines)]
254 fn lex_group_start(&mut self) -> Result<Token> {
255 if self.peek_char() == Some('?') {
257 self.next_char(); match self.peek_char() {
260 Some(':') => {
261 self.next_char();
262 Ok(Token::NonCapturing)
263 }
264 Some('=') => {
265 self.next_char();
266 Ok(Token::PositiveLookahead)
267 }
268 Some('!') => {
269 self.next_char();
270 Ok(Token::NegativeLookahead)
271 }
272 Some('<') => {
273 self.next_char();
274 match self.peek_char() {
275 Some('=') => {
276 self.next_char();
277 Ok(Token::PositiveLookbehind)
278 }
279 Some('!') => {
280 self.next_char();
281 Ok(Token::NegativeLookbehind)
282 }
283 Some(c) if c.is_alphabetic() || c == '_' => self.lex_named_group(),
284 _ => Err(Error::parse(
285 self.position,
286 "expected '=', '!', or group name after '(?<'",
287 )),
288 }
289 }
290 Some('P') => {
291 self.next_char();
292 match self.peek_char() {
293 Some('<') => {
294 self.next_char();
295 self.lex_named_group()
296 }
297 Some('>') => {
298 self.next_char();
300 self.lex_recursive_name()
301 }
302 _ => Err(Error::parse(
303 self.position,
304 "expected '<' or '>' after '(?P'",
305 )),
306 }
307 }
308 Some('b') => {
309 self.next_char();
310 if self.peek_char() == Some(')') {
312 self.next_char();
313 Ok(Token::BestMatch)
314 } else {
315 Err(Error::parse(self.position, "expected ')' after '(?b'"))
316 }
317 }
318 Some('e') => {
319 self.next_char();
320 if self.peek_char() == Some(')') {
322 self.next_char();
323 Ok(Token::EnhanceMatch)
324 } else {
325 Err(Error::parse(self.position, "expected ')' after '(?e'"))
326 }
327 }
328 Some('p') => {
329 self.next_char();
330 if self.peek_char() == Some(')') {
332 self.next_char();
333 Ok(Token::PosixMatch)
334 } else {
335 Err(Error::parse(self.position, "expected ')' after '(?p'"))
336 }
337 }
338 Some('x') => {
339 self.next_char();
340 if self.peek_char() == Some(')') {
341 self.next_char();
342 self.verbose = true;
344 Ok(Token::Verbose)
345 } else {
346 Err(Error::parse(self.position, "expected ')' after '(?x'"))
347 }
348 }
349 Some('s') => {
350 self.next_char();
351 if self.peek_char() == Some(')') {
352 self.next_char();
353 Ok(Token::DotAll)
354 } else {
355 Err(Error::parse(self.position, "expected ')' after '(?s'"))
356 }
357 }
358 Some('m') => {
359 self.next_char();
360 if self.peek_char() == Some(')') {
361 self.next_char();
362 Ok(Token::MultiLine)
363 } else {
364 Err(Error::parse(self.position, "expected ')' after '(?m'"))
365 }
366 }
367 Some('U') => {
368 self.next_char();
369 if self.peek_char() == Some(')') {
370 self.next_char();
371 Ok(Token::Ungreedy)
372 } else {
373 Err(Error::parse(self.position, "expected ')' after '(?U'"))
374 }
375 }
376 Some('i') => {
377 self.next_char();
378 if self.peek_char() == Some(')') {
379 self.next_char();
380 Ok(Token::CaseInsensitive)
381 } else {
382 Err(Error::parse(self.position, "expected ')' after '(?i'"))
383 }
384 }
385 Some('g') => {
386 self.next_char();
387 if self.peek_char() == Some(')') {
388 self.next_char();
389 Ok(Token::Global)
390 } else {
391 Err(Error::parse(self.position, "expected ')' after '(?g'"))
392 }
393 }
394 Some('u') => {
395 self.next_char();
396 if self.peek_char() == Some(')') {
397 self.next_char();
398 Ok(Token::Unicode)
399 } else {
400 Err(Error::parse(self.position, "expected ')' after '(?u'"))
401 }
402 }
403 Some('>') => {
404 self.next_char();
406 Ok(Token::AtomicGroup)
407 }
408 Some('R') => {
409 self.next_char();
411 if self.peek_char() == Some(')') {
412 self.next_char();
413 Ok(Token::RecursivePattern)
414 } else {
415 Err(Error::parse(self.position, "expected ')' after '(?R'"))
416 }
417 }
418 Some('&') => {
419 self.next_char();
421 self.lex_recursive_name()
422 }
423 Some(c) if c.is_ascii_digit() => {
424 self.lex_recursive_number()
426 }
427 _ => Err(Error::parse(
428 self.position,
429 "invalid group syntax after '(?'",
430 )),
431 }
432 } else {
433 Ok(Token::OpenParen)
434 }
435 }
436
437 fn lex_named_group(&mut self) -> Result<Token> {
439 let mut name = String::new();
440
441 while let Some(ch) = self.peek_char() {
442 if ch == '>' {
443 self.next_char();
444 if name.is_empty() {
445 return Err(Error::parse(self.position, "empty group name"));
446 }
447 return Ok(Token::NamedGroup(name));
448 } else if ch.is_alphanumeric() || ch == '_' {
449 name.push(ch);
450 self.next_char();
451 } else {
452 return Err(Error::parse(
453 self.position,
454 format!("invalid character in group name: '{ch}'"),
455 ));
456 }
457 }
458
459 Err(Error::unclosed("named group", self.position))
460 }
461
462 fn lex_recursive_number(&mut self) -> Result<Token> {
464 let mut num = String::new();
465
466 while let Some(ch) = self.peek_char() {
467 if ch == ')' {
468 self.next_char();
469 if num.is_empty() {
470 return Err(Error::parse(self.position, "empty recursive group number"));
471 }
472 let group_num: usize = num
473 .parse()
474 .map_err(|_| Error::parse(self.position, "invalid recursive group number"))?;
475 return Ok(Token::RecursiveGroup(group_num));
476 } else if ch.is_ascii_digit() {
477 num.push(ch);
478 self.next_char();
479 } else {
480 return Err(Error::parse(
481 self.position,
482 format!("invalid character in recursive group: '{ch}'"),
483 ));
484 }
485 }
486
487 Err(Error::unclosed("recursive group", self.position))
488 }
489
490 fn lex_recursive_name(&mut self) -> Result<Token> {
492 let mut name = String::new();
493
494 while let Some(ch) = self.peek_char() {
495 if ch == ')' {
496 self.next_char();
497 if name.is_empty() {
498 return Err(Error::parse(self.position, "empty recursive group name"));
499 }
500 return Ok(Token::RecursiveNamedGroup(name));
501 } else if ch.is_alphanumeric() || ch == '_' {
502 name.push(ch);
503 self.next_char();
504 } else {
505 return Err(Error::parse(
506 self.position,
507 format!("invalid character in recursive group name: '{ch}'"),
508 ));
509 }
510 }
511
512 Err(Error::unclosed("recursive group", self.position))
513 }
514
515 fn lex_escape(&mut self) -> Result<Token> {
517 let Some(ch) = self.next_char() else {
518 return Err(Error::parse(self.position, "unexpected end after '\\'"));
519 };
520
521 match ch {
522 'd' => Ok(Token::NamedClass(NamedClassToken::Digit)),
524 'D' => Ok(Token::NamedClass(NamedClassToken::NotDigit)),
525 'w' => Ok(Token::NamedClass(NamedClassToken::Word)),
526 'W' => Ok(Token::NamedClass(NamedClassToken::NotWord)),
527 's' => Ok(Token::NamedClass(NamedClassToken::Whitespace)),
528 'S' => Ok(Token::NamedClass(NamedClassToken::NotWhitespace)),
529 'b' => Ok(Token::NamedClass(NamedClassToken::WordBoundary)),
530 'B' => Ok(Token::NamedClass(NamedClassToken::NotWordBoundary)),
531
532 'L' => self.lex_named_list(),
534
535 'n' => Ok(Token::Escaped('\n')),
537 'r' => Ok(Token::Escaped('\r')),
538 't' => Ok(Token::Escaped('\t')),
539 'f' => Ok(Token::Escaped('\x0C')),
540 'v' => Ok(Token::Escaped('\x0B')),
541 '0' => Ok(Token::Escaped('\0')),
542
543 '1'..='9' => {
545 let mut num = ch.to_digit(10).unwrap() as usize;
546 while let Some(next_ch) = self.peek_char() {
548 if let Some(digit) = next_ch.to_digit(10) {
549 num = num * 10 + digit as usize;
550 self.next_char();
551 } else {
552 break;
553 }
554 }
555 Ok(Token::Backreference(num))
556 }
557
558 'x' => self.lex_hex_escape(),
560
561 'u' => self.lex_unicode_escape(),
563
564 '\\' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '.' | '*' | '+' | '?'
566 | '~' | '-' | '/' => Ok(Token::Escaped(ch)),
567
568 'K' => Ok(Token::ResetMatchStart),
570
571 _ => Err(Error::invalid_escape(ch, self.position - 1)),
572 }
573 }
574
575 fn lex_hex_escape(&mut self) -> Result<Token> {
577 let mut hex = String::new();
578
579 for _ in 0..2 {
580 match self.next_char() {
581 Some(ch) if ch.is_ascii_hexdigit() => hex.push(ch),
582 Some(ch) => {
583 return Err(Error::parse(
584 self.position,
585 format!("invalid hex digit: '{ch}'"),
586 ));
587 }
588 None => return Err(Error::parse(self.position, "incomplete hex escape")),
589 }
590 }
591
592 let code = u8::from_str_radix(&hex, 16).unwrap();
593 Ok(Token::Escaped(code as char))
594 }
595
596 fn lex_unicode_escape(&mut self) -> Result<Token> {
598 let braced = self.peek_char() == Some('{');
599 if braced {
600 self.next_char();
601 }
602
603 let mut hex = String::new();
604 let max_digits = if braced { 6 } else { 4 };
605
606 for i in 0..max_digits {
607 match self.peek_char() {
608 Some('}') if braced => {
609 self.next_char();
610 break;
611 }
612 Some(ch) if ch.is_ascii_hexdigit() => {
613 hex.push(ch);
614 self.next_char();
615 }
616 Some(_) if !braced && i >= 4 => break,
617 Some(ch) => {
618 return Err(Error::parse(
619 self.position,
620 format!("invalid unicode digit: '{ch}'"),
621 ));
622 }
623 None => return Err(Error::parse(self.position, "incomplete unicode escape")),
624 }
625 }
626
627 if braced && self.peek_char() != Some('}') && hex.len() < max_digits {
628 return Err(Error::unclosed("unicode escape", self.position));
629 }
630
631 let code = u32::from_str_radix(&hex, 16)
632 .map_err(|_| Error::parse(self.position, format!("invalid unicode value: {hex}")))?;
633
634 char::from_u32(code)
635 .ok_or_else(|| {
636 Error::parse(
637 self.position,
638 format!("invalid unicode code point: U+{code:04X}"),
639 )
640 })
641 .map(Token::Escaped)
642 }
643
644 fn lex_named_list(&mut self) -> Result<Token> {
646 let Some(ch) = self.peek_char() else {
648 return Err(Error::parse(self.position, "unexpected end after '\\L'"));
649 };
650
651 if ch != '<' {
652 return Err(Error::parse(self.position, "expected '<' after '\\L'"));
653 }
654 self.next_char(); let mut name = String::new();
658 while let Some(ch) = self.peek_char() {
659 if ch == '>' {
660 self.next_char(); return Ok(Token::NamedList(name));
662 }
663 if ch.is_alphanumeric() || ch == '_' {
664 name.push(ch);
665 self.next_char();
666 } else {
667 return Err(Error::parse(
668 self.position,
669 format!("invalid character in named list: '{ch}'"),
670 ));
671 }
672 }
673
674 Err(Error::unclosed("named list", self.position))
675 }
676
677 pub fn peek_token(&mut self) -> Result<Token> {
682 let saved_position = self.position;
683 let saved_chars = self.chars.clone();
684 let token = self.next_token()?;
685 self.position = saved_position;
686 self.chars = saved_chars;
687 Ok(token)
688 }
689
690 pub fn is_eof(&mut self) -> bool {
692 self.peek_char().is_none()
693 }
694
695 #[must_use]
697 pub fn remaining(&self) -> &'a str {
698 &self.input[self.position..]
699 }
700}
701
702#[cfg(test)]
703mod tests {
704 use super::*;
705
706 #[test]
707 fn test_simple_chars() {
708 let mut lexer = Lexer::new("abc");
709 assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
710 assert_eq!(lexer.next_token().unwrap(), Token::Char('b'));
711 assert_eq!(lexer.next_token().unwrap(), Token::Char('c'));
712 assert_eq!(lexer.next_token().unwrap(), Token::Eof);
713 }
714
715 #[test]
716 fn test_metacharacters() {
717 let mut lexer = Lexer::new(".*+?");
718 assert_eq!(lexer.next_token().unwrap(), Token::Dot);
719 assert_eq!(lexer.next_token().unwrap(), Token::Star);
720 assert_eq!(lexer.next_token().unwrap(), Token::Plus);
721 assert_eq!(lexer.next_token().unwrap(), Token::Question);
722 }
723
724 #[test]
725 fn test_escapes() {
726 let mut lexer = Lexer::new(r"\d\w\s\n\t");
727 assert_eq!(
728 lexer.next_token().unwrap(),
729 Token::NamedClass(NamedClassToken::Digit)
730 );
731 assert_eq!(
732 lexer.next_token().unwrap(),
733 Token::NamedClass(NamedClassToken::Word)
734 );
735 assert_eq!(
736 lexer.next_token().unwrap(),
737 Token::NamedClass(NamedClassToken::Whitespace)
738 );
739 assert_eq!(lexer.next_token().unwrap(), Token::Escaped('\n'));
740 assert_eq!(lexer.next_token().unwrap(), Token::Escaped('\t'));
741 }
742
743 #[test]
744 fn test_backreference() {
745 let mut lexer = Lexer::new(r"\1\12");
746 assert_eq!(lexer.next_token().unwrap(), Token::Backreference(1));
747 assert_eq!(lexer.next_token().unwrap(), Token::Backreference(12));
748 }
749
750 #[test]
751 fn test_groups() {
752 let mut lexer = Lexer::new("(a)(?:b)(?=c)(?!d)");
753 assert_eq!(lexer.next_token().unwrap(), Token::OpenParen);
754 assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
755 assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
756 assert_eq!(lexer.next_token().unwrap(), Token::NonCapturing);
757 assert_eq!(lexer.next_token().unwrap(), Token::Char('b'));
758 assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
759 assert_eq!(lexer.next_token().unwrap(), Token::PositiveLookahead);
760 assert_eq!(lexer.next_token().unwrap(), Token::Char('c'));
761 assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
762 assert_eq!(lexer.next_token().unwrap(), Token::NegativeLookahead);
763 assert_eq!(lexer.next_token().unwrap(), Token::Char('d'));
764 assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
765 }
766
767 #[test]
768 fn test_named_group() {
769 let mut lexer = Lexer::new("(?<name>a)(?P<other>b)");
770 assert_eq!(
771 lexer.next_token().unwrap(),
772 Token::NamedGroup("name".into())
773 );
774 assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
775 assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
776 assert_eq!(
777 lexer.next_token().unwrap(),
778 Token::NamedGroup("other".into())
779 );
780 }
781
782 #[test]
783 fn test_fuzziness_marker() {
784 let mut lexer = Lexer::new("hello~2");
785 for ch in "hello".chars() {
786 assert_eq!(lexer.next_token().unwrap(), Token::Char(ch));
787 }
788 assert_eq!(lexer.next_token().unwrap(), Token::Tilde);
789 assert_eq!(lexer.next_token().unwrap(), Token::Char('2'));
790 }
791}