fuzzy_regex/parser/
lexer.rs

1//! Lexer for tokenizing fuzzy regex patterns.
2
3use crate::error::{Error, Result};
4
5/// Token produced by the lexer.
6#[derive(Debug, Clone, PartialEq)]
7pub enum Token {
8    /// Literal character.
9    Char(char),
10    /// Escaped character (e.g., `\n`, `\t`).
11    Escaped(char),
12    /// Named class escape (e.g., `\d`, `\w`, `\s`).
13    NamedClass(NamedClassToken),
14    /// Named list reference `\L<name>`.
15    NamedList(String),
16    /// Backreference `\1`, `\2`, etc.
17    Backreference(usize),
18    /// `(` - open group.
19    OpenParen,
20    /// `)` - close group.
21    CloseParen,
22    /// `[` - open character class.
23    OpenBracket,
24    /// `]` - close character class.
25    CloseBracket,
26    /// `{` - open quantifier.
27    OpenBrace,
28    /// `}` - close quantifier.
29    CloseBrace,
30    /// `|` - alternation.
31    Pipe,
32    /// `^` - start anchor or negation in char class.
33    Caret,
34    /// `$` - end anchor.
35    Dollar,
36    /// `.` - any character.
37    Dot,
38    /// `*` - zero or more.
39    Star,
40    /// `+` - one or more.
41    Plus,
42    /// `?` - zero or one, or non-greedy modifier.
43    Question,
44    /// `*+` - possessive zero or more (only after atom, not after another quantifier).
45    StarPossessive,
46    /// `++` - possessive one or more (only after atom).
47    PlusPossessive,
48    /// `?+` - possessive zero or one (only after atom).
49    QuestionPossessive,
50    /// `~` - fuzziness marker.
51    Tilde,
52    /// `-` - range in character class.
53    Hyphen,
54    /// `(?:` - non-capturing group.
55    NonCapturing,
56    /// `(?=` - positive lookahead.
57    PositiveLookahead,
58    /// `(?!` - negative lookahead.
59    NegativeLookahead,
60    /// `(?<=` - positive lookbehind.
61    PositiveLookbehind,
62    /// `(?<!` - negative lookbehind.
63    NegativeLookbehind,
64    /// `(?P<name>` or `(?<name>` - named group.
65    NamedGroup(String),
66    /// `(?b)` - BESTMATCH flag (search for best match instead of first).
67    BestMatch,
68    /// `(?e)` - ENHANCEMATCH flag (improve fit of fuzzy match).
69    EnhanceMatch,
70    /// `(?p)` - POSIX leftmost-longest matching.
71    PosixMatch,
72    /// `(?x)` - Verbose mode (ignore whitespace, allow comments).
73    Verbose,
74    /// `(?s)` - Dot-all mode (`.` matches newlines).
75    DotAll,
76    /// `(?m)` - Multi-line mode (`^`/`$` match at line boundaries).
77    MultiLine,
78    /// `(?U)` - Ungreedy mode (invert default greediness).
79    Ungreedy,
80    /// `(?i)` - Case-insensitive mode.
81    CaseInsensitive,
82    /// `(?g)` - Global mode (find all matches).
83    Global,
84    /// `(?u)` - Unicode mode (enable Unicode character classes).
85    Unicode,
86    /// `\K` - Reset match start (keep everything before it out of the match).
87    ResetMatchStart,
88    /// `(?>` - Atomic group (prevent backtracking).
89    AtomicGroup,
90    /// `(?R)` - Recursive entire pattern.
91    RecursivePattern,
92    /// `(?1)`, `(?2)`, etc. - Recursive numbered group.
93    RecursiveGroup(usize),
94    /// `(?&name)` - Recursive named group.
95    RecursiveNamedGroup(String),
96    /// End of input.
97    Eof,
98}
99
100/// Named class tokens from escape sequences.
101#[derive(Debug, Clone, Copy, PartialEq, Eq)]
102pub enum NamedClassToken {
103    /// Digit class `\d` - matches any digit character.
104    Digit,
105    /// Non-digit class `\D` - matches any non-digit character.
106    NotDigit,
107    /// Word class `\w` - matches any word character (alphanumeric or underscore).
108    Word,
109    /// Non-word class `\W` - matches any non-word character.
110    NotWord,
111    /// Whitespace class `\s` - matches any whitespace character.
112    Whitespace,
113    /// Non-whitespace class `\S` - matches any non-whitespace character.
114    NotWhitespace,
115    /// Word boundary `\b` - matches at a word boundary position.
116    WordBoundary,
117    /// Non-word boundary `\B` - matches at a non-word boundary position.
118    NotWordBoundary,
119}
120
121/// Lexer for regex patterns.
122pub struct Lexer<'a> {
123    input: &'a str,
124    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
125    position: usize,
126    /// Verbose mode - skip whitespace and comments.
127    verbose: bool,
128}
129
130impl<'a> Lexer<'a> {
131    /// Create a new lexer for the given input.
132    #[must_use]
133    pub fn new(input: &'a str) -> Self {
134        Lexer {
135            input,
136            chars: input.char_indices().peekable(),
137            position: 0,
138            verbose: false,
139        }
140    }
141
142    /// Create a new lexer with verbose mode.
143    #[must_use]
144    pub fn new_with_flags(input: &'a str, verbose: bool) -> Self {
145        Lexer {
146            input,
147            chars: input.char_indices().peekable(),
148            position: 0,
149            verbose,
150        }
151    }
152
153    /// Get the current position in the input.
154    #[must_use]
155    pub fn position(&self) -> usize {
156        self.position
157    }
158
159    /// Set the position in the input (for backtracking).
160    pub fn set_position(&mut self, pos: usize) {
161        self.position = pos;
162        // Rebuild the chars iterator from the new position
163        self.chars = self.input[pos..].char_indices().peekable();
164    }
165
166    /// Peek at the next character without consuming it.
167    fn peek_char(&mut self) -> Option<char> {
168        self.chars.peek().map(|(_, ch)| *ch)
169    }
170
171    /// Consume and return the next character.
172    fn next_char(&mut self) -> Option<char> {
173        if let Some((pos, ch)) = self.chars.next() {
174            self.position = pos + ch.len_utf8();
175            Some(ch)
176        } else {
177            None
178        }
179    }
180
181    /// Try to match a specific string, advancing if successful.
182    ///
183    /// This is useful for parser extensions that need to match multi-character
184    /// sequences like keywords or specific syntax.
185    pub fn try_match(&mut self, s: &str) -> bool {
186        let remaining = &self.input[self.position..];
187        if remaining.starts_with(s) {
188            for _ in 0..s.chars().count() {
189                self.next_char();
190            }
191            true
192        } else {
193            false
194        }
195    }
196
197    /// Skip whitespace and comments in verbose mode.
198    fn skip_verbose_whitespace(&mut self) {
199        while let Some(ch) = self.peek_char() {
200            if ch.is_whitespace() {
201                self.next_char();
202            } else if ch == '#' {
203                // Skip comment until end of line
204                self.next_char(); // consume '#'
205                while let Some(c) = self.peek_char() {
206                    if c == '\n' {
207                        self.next_char();
208                        break;
209                    }
210                    self.next_char();
211                }
212            } else {
213                break;
214            }
215        }
216    }
217
218    /// Get the next token.
219    ///
220    /// # Errors
221    /// Returns an error if an invalid escape sequence is encountered.
222    pub fn next_token(&mut self) -> Result<Token> {
223        // In verbose mode, skip whitespace and comments
224        if self.verbose {
225            self.skip_verbose_whitespace();
226        }
227
228        let Some(ch) = self.next_char() else {
229            return Ok(Token::Eof);
230        };
231
232        match ch {
233            '(' => self.lex_group_start(),
234            ')' => Ok(Token::CloseParen),
235            '[' => Ok(Token::OpenBracket),
236            ']' => Ok(Token::CloseBracket),
237            '{' => Ok(Token::OpenBrace),
238            '}' => Ok(Token::CloseBrace),
239            '|' => Ok(Token::Pipe),
240            '^' => Ok(Token::Caret),
241            '$' => Ok(Token::Dollar),
242            '.' => Ok(Token::Dot),
243            '*' => Ok(Token::Star),
244            '+' => Ok(Token::Plus),
245            '?' => Ok(Token::Question),
246            '~' => Ok(Token::Tilde),
247            '-' => Ok(Token::Hyphen),
248            '\\' => self.lex_escape(),
249            _ => Ok(Token::Char(ch)),
250        }
251    }
252
253    #[allow(clippy::too_many_lines)]
254    fn lex_group_start(&mut self) -> Result<Token> {
255        // Check for special group syntax
256        if self.peek_char() == Some('?') {
257            self.next_char(); // consume '?'
258
259            match self.peek_char() {
260                Some(':') => {
261                    self.next_char();
262                    Ok(Token::NonCapturing)
263                }
264                Some('=') => {
265                    self.next_char();
266                    Ok(Token::PositiveLookahead)
267                }
268                Some('!') => {
269                    self.next_char();
270                    Ok(Token::NegativeLookahead)
271                }
272                Some('<') => {
273                    self.next_char();
274                    match self.peek_char() {
275                        Some('=') => {
276                            self.next_char();
277                            Ok(Token::PositiveLookbehind)
278                        }
279                        Some('!') => {
280                            self.next_char();
281                            Ok(Token::NegativeLookbehind)
282                        }
283                        Some(c) if c.is_alphabetic() || c == '_' => self.lex_named_group(),
284                        _ => Err(Error::parse(
285                            self.position,
286                            "expected '=', '!', or group name after '(?<'",
287                        )),
288                    }
289                }
290                Some('P') => {
291                    self.next_char();
292                    match self.peek_char() {
293                        Some('<') => {
294                            self.next_char();
295                            self.lex_named_group()
296                        }
297                        Some('>') => {
298                            // Recursive named group: (?P>name)
299                            self.next_char();
300                            self.lex_recursive_name()
301                        }
302                        _ => Err(Error::parse(
303                            self.position,
304                            "expected '<' or '>' after '(?P'",
305                        )),
306                    }
307                }
308                Some('b') => {
309                    self.next_char();
310                    // Expect closing paren for flag
311                    if self.peek_char() == Some(')') {
312                        self.next_char();
313                        Ok(Token::BestMatch)
314                    } else {
315                        Err(Error::parse(self.position, "expected ')' after '(?b'"))
316                    }
317                }
318                Some('e') => {
319                    self.next_char();
320                    // Expect closing paren for flag
321                    if self.peek_char() == Some(')') {
322                        self.next_char();
323                        Ok(Token::EnhanceMatch)
324                    } else {
325                        Err(Error::parse(self.position, "expected ')' after '(?e'"))
326                    }
327                }
328                Some('p') => {
329                    self.next_char();
330                    // Expect closing paren for flag
331                    if self.peek_char() == Some(')') {
332                        self.next_char();
333                        Ok(Token::PosixMatch)
334                    } else {
335                        Err(Error::parse(self.position, "expected ')' after '(?p'"))
336                    }
337                }
338                Some('x') => {
339                    self.next_char();
340                    if self.peek_char() == Some(')') {
341                        self.next_char();
342                        // Switch to verbose mode for subsequent tokens
343                        self.verbose = true;
344                        Ok(Token::Verbose)
345                    } else {
346                        Err(Error::parse(self.position, "expected ')' after '(?x'"))
347                    }
348                }
349                Some('s') => {
350                    self.next_char();
351                    if self.peek_char() == Some(')') {
352                        self.next_char();
353                        Ok(Token::DotAll)
354                    } else {
355                        Err(Error::parse(self.position, "expected ')' after '(?s'"))
356                    }
357                }
358                Some('m') => {
359                    self.next_char();
360                    if self.peek_char() == Some(')') {
361                        self.next_char();
362                        Ok(Token::MultiLine)
363                    } else {
364                        Err(Error::parse(self.position, "expected ')' after '(?m'"))
365                    }
366                }
367                Some('U') => {
368                    self.next_char();
369                    if self.peek_char() == Some(')') {
370                        self.next_char();
371                        Ok(Token::Ungreedy)
372                    } else {
373                        Err(Error::parse(self.position, "expected ')' after '(?U'"))
374                    }
375                }
376                Some('i') => {
377                    self.next_char();
378                    if self.peek_char() == Some(')') {
379                        self.next_char();
380                        Ok(Token::CaseInsensitive)
381                    } else {
382                        Err(Error::parse(self.position, "expected ')' after '(?i'"))
383                    }
384                }
385                Some('g') => {
386                    self.next_char();
387                    if self.peek_char() == Some(')') {
388                        self.next_char();
389                        Ok(Token::Global)
390                    } else {
391                        Err(Error::parse(self.position, "expected ')' after '(?g'"))
392                    }
393                }
394                Some('u') => {
395                    self.next_char();
396                    if self.peek_char() == Some(')') {
397                        self.next_char();
398                        Ok(Token::Unicode)
399                    } else {
400                        Err(Error::parse(self.position, "expected ')' after '(?u'"))
401                    }
402                }
403                Some('>') => {
404                    // Atomic group: (?>...)
405                    self.next_char();
406                    Ok(Token::AtomicGroup)
407                }
408                Some('R') => {
409                    // Recursive entire pattern: (?R)
410                    self.next_char();
411                    if self.peek_char() == Some(')') {
412                        self.next_char();
413                        Ok(Token::RecursivePattern)
414                    } else {
415                        Err(Error::parse(self.position, "expected ')' after '(?R'"))
416                    }
417                }
418                Some('&') => {
419                    // Recursive named group: (?&name)
420                    self.next_char();
421                    self.lex_recursive_name()
422                }
423                Some(c) if c.is_ascii_digit() => {
424                    // Recursive numbered group: (?1), (?2), etc.
425                    self.lex_recursive_number()
426                }
427                _ => Err(Error::parse(
428                    self.position,
429                    "invalid group syntax after '(?'",
430                )),
431            }
432        } else {
433            Ok(Token::OpenParen)
434        }
435    }
436
437    /// Lex a named group name.
438    fn lex_named_group(&mut self) -> Result<Token> {
439        let mut name = String::new();
440
441        while let Some(ch) = self.peek_char() {
442            if ch == '>' {
443                self.next_char();
444                if name.is_empty() {
445                    return Err(Error::parse(self.position, "empty group name"));
446                }
447                return Ok(Token::NamedGroup(name));
448            } else if ch.is_alphanumeric() || ch == '_' {
449                name.push(ch);
450                self.next_char();
451            } else {
452                return Err(Error::parse(
453                    self.position,
454                    format!("invalid character in group name: '{ch}'"),
455                ));
456            }
457        }
458
459        Err(Error::unclosed("named group", self.position))
460    }
461
462    /// Lex a recursive group number: (?1), (?2), etc.
463    fn lex_recursive_number(&mut self) -> Result<Token> {
464        let mut num = String::new();
465
466        while let Some(ch) = self.peek_char() {
467            if ch == ')' {
468                self.next_char();
469                if num.is_empty() {
470                    return Err(Error::parse(self.position, "empty recursive group number"));
471                }
472                let group_num: usize = num
473                    .parse()
474                    .map_err(|_| Error::parse(self.position, "invalid recursive group number"))?;
475                return Ok(Token::RecursiveGroup(group_num));
476            } else if ch.is_ascii_digit() {
477                num.push(ch);
478                self.next_char();
479            } else {
480                return Err(Error::parse(
481                    self.position,
482                    format!("invalid character in recursive group: '{ch}'"),
483                ));
484            }
485        }
486
487        Err(Error::unclosed("recursive group", self.position))
488    }
489
490    /// Lex a recursive group name: (?&name) or (?P>name)
491    fn lex_recursive_name(&mut self) -> Result<Token> {
492        let mut name = String::new();
493
494        while let Some(ch) = self.peek_char() {
495            if ch == ')' {
496                self.next_char();
497                if name.is_empty() {
498                    return Err(Error::parse(self.position, "empty recursive group name"));
499                }
500                return Ok(Token::RecursiveNamedGroup(name));
501            } else if ch.is_alphanumeric() || ch == '_' {
502                name.push(ch);
503                self.next_char();
504            } else {
505                return Err(Error::parse(
506                    self.position,
507                    format!("invalid character in recursive group name: '{ch}'"),
508                ));
509            }
510        }
511
512        Err(Error::unclosed("recursive group", self.position))
513    }
514
515    /// Lex an escape sequence.
516    fn lex_escape(&mut self) -> Result<Token> {
517        let Some(ch) = self.next_char() else {
518            return Err(Error::parse(self.position, "unexpected end after '\\'"));
519        };
520
521        match ch {
522            // Named classes
523            'd' => Ok(Token::NamedClass(NamedClassToken::Digit)),
524            'D' => Ok(Token::NamedClass(NamedClassToken::NotDigit)),
525            'w' => Ok(Token::NamedClass(NamedClassToken::Word)),
526            'W' => Ok(Token::NamedClass(NamedClassToken::NotWord)),
527            's' => Ok(Token::NamedClass(NamedClassToken::Whitespace)),
528            'S' => Ok(Token::NamedClass(NamedClassToken::NotWhitespace)),
529            'b' => Ok(Token::NamedClass(NamedClassToken::WordBoundary)),
530            'B' => Ok(Token::NamedClass(NamedClassToken::NotWordBoundary)),
531
532            // Named list \L<name>
533            'L' => self.lex_named_list(),
534
535            // Common escapes
536            'n' => Ok(Token::Escaped('\n')),
537            'r' => Ok(Token::Escaped('\r')),
538            't' => Ok(Token::Escaped('\t')),
539            'f' => Ok(Token::Escaped('\x0C')),
540            'v' => Ok(Token::Escaped('\x0B')),
541            '0' => Ok(Token::Escaped('\0')),
542
543            // Backreference (1-9)
544            '1'..='9' => {
545                let mut num = ch.to_digit(10).unwrap() as usize;
546                // Check for multi-digit backreference
547                while let Some(next_ch) = self.peek_char() {
548                    if let Some(digit) = next_ch.to_digit(10) {
549                        num = num * 10 + digit as usize;
550                        self.next_char();
551                    } else {
552                        break;
553                    }
554                }
555                Ok(Token::Backreference(num))
556            }
557
558            // Hex escape \xHH
559            'x' => self.lex_hex_escape(),
560
561            // Unicode escape \u{HHHH} or \uHHHH
562            'u' => self.lex_unicode_escape(),
563
564            // Escaped metacharacters and literals
565            '\\' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '.' | '*' | '+' | '?'
566            | '~' | '-' | '/' => Ok(Token::Escaped(ch)),
567
568            // \K - reset match start
569            'K' => Ok(Token::ResetMatchStart),
570
571            _ => Err(Error::invalid_escape(ch, self.position - 1)),
572        }
573    }
574
575    /// Lex a hex escape \xHH.
576    fn lex_hex_escape(&mut self) -> Result<Token> {
577        let mut hex = String::new();
578
579        for _ in 0..2 {
580            match self.next_char() {
581                Some(ch) if ch.is_ascii_hexdigit() => hex.push(ch),
582                Some(ch) => {
583                    return Err(Error::parse(
584                        self.position,
585                        format!("invalid hex digit: '{ch}'"),
586                    ));
587                }
588                None => return Err(Error::parse(self.position, "incomplete hex escape")),
589            }
590        }
591
592        let code = u8::from_str_radix(&hex, 16).unwrap();
593        Ok(Token::Escaped(code as char))
594    }
595
596    /// Lex a unicode escape \u{HHHH} or \uHHHH.
597    fn lex_unicode_escape(&mut self) -> Result<Token> {
598        let braced = self.peek_char() == Some('{');
599        if braced {
600            self.next_char();
601        }
602
603        let mut hex = String::new();
604        let max_digits = if braced { 6 } else { 4 };
605
606        for i in 0..max_digits {
607            match self.peek_char() {
608                Some('}') if braced => {
609                    self.next_char();
610                    break;
611                }
612                Some(ch) if ch.is_ascii_hexdigit() => {
613                    hex.push(ch);
614                    self.next_char();
615                }
616                Some(_) if !braced && i >= 4 => break,
617                Some(ch) => {
618                    return Err(Error::parse(
619                        self.position,
620                        format!("invalid unicode digit: '{ch}'"),
621                    ));
622                }
623                None => return Err(Error::parse(self.position, "incomplete unicode escape")),
624            }
625        }
626
627        if braced && self.peek_char() != Some('}') && hex.len() < max_digits {
628            return Err(Error::unclosed("unicode escape", self.position));
629        }
630
631        let code = u32::from_str_radix(&hex, 16)
632            .map_err(|_| Error::parse(self.position, format!("invalid unicode value: {hex}")))?;
633
634        char::from_u32(code)
635            .ok_or_else(|| {
636                Error::parse(
637                    self.position,
638                    format!("invalid unicode code point: U+{code:04X}"),
639                )
640            })
641            .map(Token::Escaped)
642    }
643
644    /// Lex a named list reference \L<name>.
645    fn lex_named_list(&mut self) -> Result<Token> {
646        // Expect < after \L
647        let Some(ch) = self.peek_char() else {
648            return Err(Error::parse(self.position, "unexpected end after '\\L'"));
649        };
650
651        if ch != '<' {
652            return Err(Error::parse(self.position, "expected '<' after '\\L'"));
653        }
654        self.next_char(); // consume '<'
655
656        // Read the name until we find '>'
657        let mut name = String::new();
658        while let Some(ch) = self.peek_char() {
659            if ch == '>' {
660                self.next_char(); // consume '>'
661                return Ok(Token::NamedList(name));
662            }
663            if ch.is_alphanumeric() || ch == '_' {
664                name.push(ch);
665                self.next_char();
666            } else {
667                return Err(Error::parse(
668                    self.position,
669                    format!("invalid character in named list: '{ch}'"),
670                ));
671            }
672        }
673
674        Err(Error::unclosed("named list", self.position))
675    }
676
677    /// Peek at the next token without consuming it.
678    ///
679    /// # Errors
680    /// Returns an error if an invalid escape sequence is encountered.
681    pub fn peek_token(&mut self) -> Result<Token> {
682        let saved_position = self.position;
683        let saved_chars = self.chars.clone();
684        let token = self.next_token()?;
685        self.position = saved_position;
686        self.chars = saved_chars;
687        Ok(token)
688    }
689
690    /// Check if we've reached the end of input.
691    pub fn is_eof(&mut self) -> bool {
692        self.peek_char().is_none()
693    }
694
695    /// Get remaining input from current position.
696    #[must_use]
697    pub fn remaining(&self) -> &'a str {
698        &self.input[self.position..]
699    }
700}
701
702#[cfg(test)]
703mod tests {
704    use super::*;
705
706    #[test]
707    fn test_simple_chars() {
708        let mut lexer = Lexer::new("abc");
709        assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
710        assert_eq!(lexer.next_token().unwrap(), Token::Char('b'));
711        assert_eq!(lexer.next_token().unwrap(), Token::Char('c'));
712        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
713    }
714
715    #[test]
716    fn test_metacharacters() {
717        let mut lexer = Lexer::new(".*+?");
718        assert_eq!(lexer.next_token().unwrap(), Token::Dot);
719        assert_eq!(lexer.next_token().unwrap(), Token::Star);
720        assert_eq!(lexer.next_token().unwrap(), Token::Plus);
721        assert_eq!(lexer.next_token().unwrap(), Token::Question);
722    }
723
724    #[test]
725    fn test_escapes() {
726        let mut lexer = Lexer::new(r"\d\w\s\n\t");
727        assert_eq!(
728            lexer.next_token().unwrap(),
729            Token::NamedClass(NamedClassToken::Digit)
730        );
731        assert_eq!(
732            lexer.next_token().unwrap(),
733            Token::NamedClass(NamedClassToken::Word)
734        );
735        assert_eq!(
736            lexer.next_token().unwrap(),
737            Token::NamedClass(NamedClassToken::Whitespace)
738        );
739        assert_eq!(lexer.next_token().unwrap(), Token::Escaped('\n'));
740        assert_eq!(lexer.next_token().unwrap(), Token::Escaped('\t'));
741    }
742
743    #[test]
744    fn test_backreference() {
745        let mut lexer = Lexer::new(r"\1\12");
746        assert_eq!(lexer.next_token().unwrap(), Token::Backreference(1));
747        assert_eq!(lexer.next_token().unwrap(), Token::Backreference(12));
748    }
749
750    #[test]
751    fn test_groups() {
752        let mut lexer = Lexer::new("(a)(?:b)(?=c)(?!d)");
753        assert_eq!(lexer.next_token().unwrap(), Token::OpenParen);
754        assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
755        assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
756        assert_eq!(lexer.next_token().unwrap(), Token::NonCapturing);
757        assert_eq!(lexer.next_token().unwrap(), Token::Char('b'));
758        assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
759        assert_eq!(lexer.next_token().unwrap(), Token::PositiveLookahead);
760        assert_eq!(lexer.next_token().unwrap(), Token::Char('c'));
761        assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
762        assert_eq!(lexer.next_token().unwrap(), Token::NegativeLookahead);
763        assert_eq!(lexer.next_token().unwrap(), Token::Char('d'));
764        assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
765    }
766
767    #[test]
768    fn test_named_group() {
769        let mut lexer = Lexer::new("(?<name>a)(?P<other>b)");
770        assert_eq!(
771            lexer.next_token().unwrap(),
772            Token::NamedGroup("name".into())
773        );
774        assert_eq!(lexer.next_token().unwrap(), Token::Char('a'));
775        assert_eq!(lexer.next_token().unwrap(), Token::CloseParen);
776        assert_eq!(
777            lexer.next_token().unwrap(),
778            Token::NamedGroup("other".into())
779        );
780    }
781
782    #[test]
783    fn test_fuzziness_marker() {
784        let mut lexer = Lexer::new("hello~2");
785        for ch in "hello".chars() {
786            assert_eq!(lexer.next_token().unwrap(), Token::Char(ch));
787        }
788        assert_eq!(lexer.next_token().unwrap(), Token::Tilde);
789        assert_eq!(lexer.next_token().unwrap(), Token::Char('2'));
790    }
791}
fuzzy_regex/parser/lexer.rs

fuzzy_regex/parser/
lexer.rs