// oxilean_parse/lexer/types.rs
//! Auto-generated module
//!
//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)

use crate::tokens::{Span, StringPart, Token, TokenKind};
/// A stack of lexer modes.
///
/// The lexer can enter nested modes (e.g. while lexing inside an
/// interpolated string); this stack records the active mode on top and the
/// modes to return to beneath it.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct LexerModeStack {
    /// The stack of modes; the last element is the active mode.
    pub stack: Vec<LexerMode>,
}
14impl LexerModeStack {
15    /// Create a new mode stack starting in Normal mode.
16    #[allow(dead_code)]
17    pub fn new() -> Self {
18        LexerModeStack {
19            stack: vec![LexerMode::Normal],
20        }
21    }
22    /// Push a new mode.
23    #[allow(dead_code)]
24    pub fn push(&mut self, mode: LexerMode) {
25        self.stack.push(mode);
26    }
27    /// Pop the current mode, returning to the previous.
28    #[allow(dead_code)]
29    pub fn pop(&mut self) -> Option<LexerMode> {
30        if self.stack.len() > 1 {
31            self.stack.pop()
32        } else {
33            None
34        }
35    }
36    /// Returns the current mode.
37    #[allow(dead_code)]
38    pub fn current(&self) -> &LexerMode {
39        self.stack.last().unwrap_or(&LexerMode::Normal)
40    }
41    /// Returns the depth of the stack.
42    #[allow(dead_code)]
43    pub fn depth(&self) -> usize {
44        self.stack.len()
45    }
46}
/// Lexer for tokenizing OxiLean source code.
///
/// Supports:
/// - Keywords and identifiers (including Unicode letters: Greek, math symbols)
/// - Number literals: decimal, hex (`0x`), binary (`0b`), octal (`0o`)
/// - Float literals: `3.14`, `1.0e10`, `1.5e-3`
/// - String literals with enhanced escape sequences (`\u{1234}`, `\x41`, `\0`)
/// - Char literals: `'a'`, `'\n'`, `'\x41'`
/// - Interpolated strings: `s!"hello {name}"`
/// - Doc comments: `/-- ... -/` (block) and `--- ...` (line)
/// - Operators: `>=`, `..`, `=>`, `<-`, `->`, etc.
pub struct Lexer {
    /// Input source code as a char vector (enables O(1) indexed peeking)
    input: Vec<char>,
    /// Index of the next unread char in `input`
    pos: usize,
    /// Current line (1-indexed); incremented on every `\n`
    line: usize,
    /// Current column (1-indexed); reset to 1 after `\n`
    column: usize,
}
68impl Lexer {
69    /// Create a new lexer from source code.
70    pub fn new(input: &str) -> Self {
71        Self {
72            input: input.chars().collect(),
73            pos: 0,
74            line: 1,
75            column: 1,
76        }
77    }
78    /// Get the current character without consuming it.
79    fn peek(&self) -> Option<char> {
80        self.input.get(self.pos).copied()
81    }
82    /// Get the character at offset from current position.
83    fn peek_ahead(&self, offset: usize) -> Option<char> {
84        self.input.get(self.pos + offset).copied()
85    }
86    /// Consume and return the current character.
87    fn advance(&mut self) -> Option<char> {
88        let ch = self.peek()?;
89        self.pos += 1;
90        if ch == '\n' {
91            self.line += 1;
92            self.column = 1;
93        } else {
94            self.column += 1;
95        }
96        Some(ch)
97    }
    /// Skip whitespace and ordinary comments, stopping before doc comments.
    ///
    /// Handles:
    /// - whitespace runs
    /// - `--` line comments (but `---` starts a line doc comment: stop)
    /// - nested `/- ... -/` block comments (but `/--` starts a block doc
    ///   comment: stop)
    fn skip_whitespace(&mut self) {
        while let Some(ch) = self.peek() {
            if ch.is_whitespace() {
                self.advance();
            } else if ch == '-' && self.peek_ahead(1) == Some('-') {
                // `---` is a line doc comment; leave it for the caller to lex.
                if self.peek_ahead(2) == Some('-') {
                    break;
                }
                self.advance();
                self.advance();
                // Consume the `--` comment up to (not including) the newline.
                while let Some(ch) = self.peek() {
                    if ch == '\n' {
                        break;
                    }
                    self.advance();
                }
            } else if ch == '/' && self.peek_ahead(1) == Some('-') {
                // `/--` is a block doc comment; leave it for the caller to lex.
                if self.peek_ahead(2) == Some('-') {
                    break;
                }
                self.advance();
                self.advance();
                // `/- ... -/` comments nest; track depth until balanced.
                let mut depth = 1;
                while depth > 0 {
                    match self.peek() {
                        Some('/') if self.peek_ahead(1) == Some('-') => {
                            self.advance();
                            self.advance();
                            depth += 1;
                        }
                        Some('-') if self.peek_ahead(1) == Some('/') => {
                            self.advance();
                            self.advance();
                            depth -= 1;
                        }
                        Some(_) => {
                            self.advance();
                        }
                        // Unterminated block comment: stop silently at EOF.
                        None => break,
                    }
                }
            } else {
                break;
            }
        }
    }
145    /// Lex a line doc comment: `--- ...` (everything until newline).
146    fn lex_line_doc_comment(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
147        self.advance();
148        self.advance();
149        self.advance();
150        if self.peek() == Some(' ') {
151            self.advance();
152        }
153        let mut s = String::new();
154        while let Some(ch) = self.peek() {
155            if ch == '\n' {
156                break;
157            }
158            s.push(ch);
159            self.advance();
160        }
161        Token::new(
162            TokenKind::DocComment(s),
163            Span::new(start, self.pos, start_line, start_col),
164        )
165    }
    /// Lex a `/-- ... -/` block doc comment.
    ///
    /// Nested `/- ... -/` delimiters inside the body are tracked by depth
    /// and reproduced verbatim in the comment text; only the outermost `-/`
    /// ends the token. Trailing whitespace is trimmed. An unterminated
    /// comment still yields a `DocComment` containing what was read.
    fn lex_block_doc_comment(
        &mut self,
        start: usize,
        start_line: usize,
        start_col: usize,
    ) -> Token {
        // Consume `/--`.
        self.advance();
        self.advance();
        self.advance();
        // A single space directly after `/--` is decorative; drop it.
        if self.peek() == Some(' ') {
            self.advance();
        }
        let mut s = String::new();
        let mut depth = 1;
        while depth > 0 {
            match self.peek() {
                Some('/') if self.peek_ahead(1) == Some('-') => {
                    self.advance();
                    self.advance();
                    depth += 1;
                    // Openers seen here are always nested (depth was >= 1
                    // before the increment), so keep them in the text.
                    if depth > 1 {
                        s.push_str("/-");
                    }
                }
                Some('-') if self.peek_ahead(1) == Some('/') => {
                    depth -= 1;
                    // Nested closers stay in the text; the outermost closer
                    // (depth reaches 0) is dropped.
                    if depth > 0 {
                        s.push_str("-/");
                    }
                    self.advance();
                    self.advance();
                }
                Some(ch) => {
                    s.push(ch);
                    self.advance();
                }
                // Unterminated doc comment: return what we have.
                None => break,
            }
        }
        let s = s.trim_end().to_string();
        Token::new(
            TokenKind::DocComment(s),
            Span::new(start, self.pos, start_line, start_col),
        )
    }
212    /// Lex an identifier or keyword.
213    /// Identifiers can contain Unicode letters (Greek, mathematical symbols, etc.),
214    /// digits, underscores, and primes.
215    fn lex_ident(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
216        let mut s = String::new();
217        if self.peek() == Some('_')
218            && !self
219                .peek_ahead(1)
220                .is_some_and(|c| c.is_alphanumeric() || c == '_')
221        {
222            self.advance();
223            return Token::new(
224                TokenKind::Underscore,
225                Span::new(start, self.pos, start_line, start_col),
226            );
227        }
228        while let Some(ch) = self.peek() {
229            if ch.is_alphanumeric() || ch == '_' || ch == '\'' {
230                s.push(ch);
231                self.advance();
232            } else {
233                break;
234            }
235        }
236        let kind = match s.as_str() {
237            "axiom" => TokenKind::Axiom,
238            "definition" | "def" => TokenKind::Definition,
239            "theorem" => TokenKind::Theorem,
240            "lemma" => TokenKind::Lemma,
241            "opaque" => TokenKind::Opaque,
242            "inductive" => TokenKind::Inductive,
243            "structure" => TokenKind::Structure,
244            "class" => TokenKind::Class,
245            "instance" => TokenKind::Instance,
246            "namespace" => TokenKind::Namespace,
247            "section" => TokenKind::Section,
248            "variable" => TokenKind::Variable,
249            "variables" => TokenKind::Variables,
250            "parameter" => TokenKind::Parameter,
251            "parameters" => TokenKind::Parameters,
252            "constant" => TokenKind::Constant,
253            "constants" => TokenKind::Constants,
254            "end" => TokenKind::End,
255            "import" => TokenKind::Import,
256            "export" => TokenKind::Export,
257            "open" => TokenKind::Open,
258            "attribute" => TokenKind::Attribute,
259            "return" => TokenKind::Return,
260            "Type" => TokenKind::Type,
261            "Prop" => TokenKind::Prop,
262            "Sort" => TokenKind::Sort,
263            "fun" => TokenKind::Fun,
264            "forall" => TokenKind::Forall,
265            "let" => TokenKind::Let,
266            "in" => TokenKind::In,
267            "if" => TokenKind::If,
268            "then" => TokenKind::Then,
269            "else" => TokenKind::Else,
270            "match" => TokenKind::Match,
271            "with" => TokenKind::With,
272            "do" => TokenKind::Do,
273            "have" => TokenKind::Have,
274            "suffices" => TokenKind::Suffices,
275            "show" => TokenKind::Show,
276            "where" => TokenKind::Where,
277            "from" => TokenKind::From,
278            "by" => TokenKind::By,
279            "exists" => TokenKind::Exists,
280            _ => TokenKind::Ident(s),
281        };
282        Token::new(kind, Span::new(start, self.pos, start_line, start_col))
283    }
284    /// Lex a hex number literal: `0x1A3F`.
285    fn lex_hex_number(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
286        let mut num = 0u64;
287        let mut has_digits = false;
288        while let Some(ch) = self.peek() {
289            if let Some(d) = ch.to_digit(16) {
290                num = num.wrapping_mul(16).wrapping_add(d as u64);
291                has_digits = true;
292                self.advance();
293            } else if ch == '_' {
294                self.advance();
295            } else {
296                break;
297            }
298        }
299        if !has_digits {
300            return Token::new(
301                TokenKind::Error("expected hex digits after 0x".to_string()),
302                Span::new(start, self.pos, start_line, start_col),
303            );
304        }
305        Token::new(
306            TokenKind::Nat(num),
307            Span::new(start, self.pos, start_line, start_col),
308        )
309    }
310    /// Lex a binary number literal: `0b1010`.
311    fn lex_bin_number(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
312        let mut num = 0u64;
313        let mut has_digits = false;
314        while let Some(ch) = self.peek() {
315            match ch {
316                '0' => {
317                    num = num.wrapping_mul(2);
318                    has_digits = true;
319                    self.advance();
320                }
321                '1' => {
322                    num = num.wrapping_mul(2).wrapping_add(1);
323                    has_digits = true;
324                    self.advance();
325                }
326                '_' => {
327                    self.advance();
328                }
329                _ => break,
330            }
331        }
332        if !has_digits {
333            return Token::new(
334                TokenKind::Error("expected binary digits after 0b".to_string()),
335                Span::new(start, self.pos, start_line, start_col),
336            );
337        }
338        Token::new(
339            TokenKind::Nat(num),
340            Span::new(start, self.pos, start_line, start_col),
341        )
342    }
343    /// Lex an octal number literal: `0o17`.
344    fn lex_oct_number(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
345        let mut num = 0u64;
346        let mut has_digits = false;
347        while let Some(ch) = self.peek() {
348            if let Some(d) = ch.to_digit(8) {
349                num = num.wrapping_mul(8).wrapping_add(d as u64);
350                has_digits = true;
351                self.advance();
352            } else if ch == '_' {
353                self.advance();
354            } else {
355                break;
356            }
357        }
358        if !has_digits {
359            return Token::new(
360                TokenKind::Error("expected octal digits after 0o".to_string()),
361                Span::new(start, self.pos, start_line, start_col),
362            );
363        }
364        Token::new(
365            TokenKind::Nat(num),
366            Span::new(start, self.pos, start_line, start_col),
367        )
368    }
    /// Lex a number literal: decimal integer, float, or `0x`/`0b`/`0o` radix.
    ///
    /// - Underscore separators are skipped only when a digit follows.
    /// - A `.` makes the literal a float only when a digit follows it, so
    ///   range syntax like `1..2` lexes as `Nat`, `DotDot`, `Nat`.
    /// - An `e`/`E` exponent (with optional sign) yields a `Float` even
    ///   without a decimal point (`1e9`).
    ///
    /// NOTE(review): malformed or overflowing literals silently become 0 via
    /// `unwrap_or` instead of an `Error` token — e.g. `1e` with no exponent
    /// digits parses as `Float(0.0)`, and an integer above `u64::MAX` as
    /// `Nat(0)`. Confirm this is the intended recovery behavior.
    fn lex_number(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
        // Radix prefixes only apply when the literal starts with `0`.
        if self.peek() == Some('0') {
            match self.peek_ahead(1) {
                Some('x') | Some('X') => {
                    self.advance();
                    self.advance();
                    return self.lex_hex_number(start, start_line, start_col);
                }
                Some('b') | Some('B') => {
                    self.advance();
                    self.advance();
                    return self.lex_bin_number(start, start_line, start_col);
                }
                Some('o') | Some('O') => {
                    self.advance();
                    self.advance();
                    return self.lex_oct_number(start, start_line, start_col);
                }
                _ => {}
            }
        }
        // Integer part; `_` is consumed only when another digit follows.
        let mut int_str = String::new();
        while let Some(ch) = self.peek() {
            if ch.is_ascii_digit() {
                int_str.push(ch);
                self.advance();
            } else if ch == '_' && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit()) {
                self.advance();
            } else {
                break;
            }
        }
        // `.` followed by a digit switches to the float path (keeps `..` intact).
        let is_float =
            self.peek() == Some('.') && self.peek_ahead(1).is_some_and(|c| c.is_ascii_digit());
        if is_float {
            self.advance();
            let mut frac_str = String::new();
            frac_str.push_str(&int_str);
            frac_str.push('.');
            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    frac_str.push(ch);
                    self.advance();
                } else {
                    break;
                }
            }
            // Optional exponent: `e`/`E`, optional sign, digits.
            if self.peek() == Some('e') || self.peek() == Some('E') {
                frac_str.push('e');
                self.advance();
                if self.peek() == Some('+') || self.peek() == Some('-') {
                    frac_str.push(self.advance().expect("peek confirmed '+' or '-' exists"));
                }
                while let Some(ch) = self.peek() {
                    if ch.is_ascii_digit() {
                        frac_str.push(ch);
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
            let val: f64 = frac_str.parse().unwrap_or(0.0);
            return Token::new(
                TokenKind::Float(val),
                Span::new(start, self.pos, start_line, start_col),
            );
        }
        // Exponent without a decimal point (`1e9`) still yields a float.
        if self.peek() == Some('e') || self.peek() == Some('E') {
            let mut exp_str = String::new();
            exp_str.push_str(&int_str);
            exp_str.push('e');
            self.advance();
            if self.peek() == Some('+') || self.peek() == Some('-') {
                exp_str.push(self.advance().expect("peek confirmed '+' or '-' exists"));
            }
            while let Some(ch) = self.peek() {
                if ch.is_ascii_digit() {
                    exp_str.push(ch);
                    self.advance();
                } else {
                    break;
                }
            }
            let val: f64 = exp_str.parse().unwrap_or(0.0);
            return Token::new(
                TokenKind::Float(val),
                Span::new(start, self.pos, start_line, start_col),
            );
        }
        // Plain decimal integer.
        let num: u64 = int_str.parse().unwrap_or(0);
        Token::new(
            TokenKind::Nat(num),
            Span::new(start, self.pos, start_line, start_col),
        )
    }
466    /// Parse an escape sequence inside a string or char literal.
467    /// Returns the resolved character or None on error.
468    fn parse_escape_char(&mut self) -> Option<char> {
469        match self.peek() {
470            Some('n') => {
471                self.advance();
472                Some('\n')
473            }
474            Some('t') => {
475                self.advance();
476                Some('\t')
477            }
478            Some('r') => {
479                self.advance();
480                Some('\r')
481            }
482            Some('\\') => {
483                self.advance();
484                Some('\\')
485            }
486            Some('"') => {
487                self.advance();
488                Some('"')
489            }
490            Some('\'') => {
491                self.advance();
492                Some('\'')
493            }
494            Some('0') => {
495                self.advance();
496                Some('\0')
497            }
498            Some('x') => {
499                self.advance();
500                let hi = self.advance().and_then(|c| c.to_digit(16))?;
501                let lo = self.advance().and_then(|c| c.to_digit(16))?;
502                let val = (hi * 16 + lo) as u8;
503                Some(val as char)
504            }
505            Some('u') => {
506                self.advance();
507                if self.peek() != Some('{') {
508                    return None;
509                }
510                self.advance();
511                let mut code = 0u32;
512                let mut has_digit = false;
513                while let Some(ch) = self.peek() {
514                    if ch == '}' {
515                        self.advance();
516                        break;
517                    }
518                    if let Some(d) = ch.to_digit(16) {
519                        code = code * 16 + d;
520                        has_digit = true;
521                        self.advance();
522                    } else {
523                        return None;
524                    }
525                }
526                if !has_digit {
527                    return None;
528                }
529                char::from_u32(code)
530            }
531            Some(ch) => {
532                self.advance();
533                Some(ch)
534            }
535            None => None,
536        }
537    }
538    /// Lex a string literal (with enhanced escape sequences).
539    fn lex_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
540        self.advance();
541        let mut s = String::new();
542        while let Some(ch) = self.peek() {
543            if ch == '"' {
544                self.advance();
545                return Token::new(
546                    TokenKind::String(s),
547                    Span::new(start, self.pos, start_line, start_col),
548                );
549            } else if ch == '\\' {
550                self.advance();
551                if let Some(escaped) = self.parse_escape_char() {
552                    s.push(escaped);
553                } else {
554                    s.push('?');
555                }
556            } else {
557                s.push(ch);
558                self.advance();
559            }
560        }
561        Token::new(
562            TokenKind::Error("unterminated string".to_string()),
563            Span::new(start, self.pos, start_line, start_col),
564        )
565    }
566    /// Lex a char literal: `'a'`, `'\n'`, `'\x41'`, `'\u{03B1}'`.
567    fn lex_char(&mut self, start: usize, start_line: usize, start_col: usize) -> Token {
568        self.advance();
569        let ch = if self.peek() == Some('\\') {
570            self.advance();
571            match self.parse_escape_char() {
572                Some(c) => c,
573                None => {
574                    return Token::new(
575                        TokenKind::Error("invalid escape in char literal".to_string()),
576                        Span::new(start, self.pos, start_line, start_col),
577                    );
578                }
579            }
580        } else {
581            match self.advance() {
582                Some(c) => c,
583                None => {
584                    return Token::new(
585                        TokenKind::Error("unterminated char literal".to_string()),
586                        Span::new(start, self.pos, start_line, start_col),
587                    );
588                }
589            }
590        };
591        if self.peek() == Some('\'') {
592            self.advance();
593            Token::new(
594                TokenKind::Char(ch),
595                Span::new(start, self.pos, start_line, start_col),
596            )
597        } else {
598            Token::new(
599                TokenKind::Error("unterminated char literal".to_string()),
600                Span::new(start, self.pos, start_line, start_col),
601            )
602        }
603    }
    /// Lex an interpolated string: `s!"hello {name}"`.
    ///
    /// Assumes none of `s`, `!`, `"` has been consumed yet (the caller has
    /// verified all three are present). Produces an `InterpolatedString`
    /// whose parts alternate between `Literal` text and `Interpolation`
    /// token sequences lexed via `next_token_raw`.
    fn lex_interpolated_string(
        &mut self,
        start: usize,
        start_line: usize,
        start_col: usize,
    ) -> Token {
        // Consume `s`, `!`, and the opening `"`.
        self.advance();
        self.advance();
        self.advance();
        let mut parts: Vec<StringPart> = Vec::new();
        let mut current_lit = String::new();
        while let Some(ch) = self.peek() {
            if ch == '"' {
                // Closing quote: flush any pending literal text and finish.
                self.advance();
                if !current_lit.is_empty() {
                    parts.push(StringPart::Literal(current_lit));
                }
                return Token::new(
                    TokenKind::InterpolatedString(parts),
                    Span::new(start, self.pos, start_line, start_col),
                );
            } else if ch == '{' {
                // `{expr}`: flush the pending literal, then sub-lex tokens
                // until the matching `}` (brace depth tracked so nested
                // `{ ... }` inside the expression works).
                self.advance();
                if !current_lit.is_empty() {
                    parts.push(StringPart::Literal(current_lit.clone()));
                    current_lit.clear();
                }
                let mut interp_tokens = Vec::new();
                let mut depth = 1;
                loop {
                    let tok = self.next_token_raw();
                    match &tok.kind {
                        TokenKind::LBrace => depth += 1,
                        TokenKind::RBrace => {
                            depth -= 1;
                            // The matching `}` closes the interpolation and
                            // is not included in the token list.
                            if depth == 0 {
                                break;
                            }
                        }
                        // EOF inside an interpolation: bail out; the outer
                        // loop then reports an unterminated string.
                        TokenKind::Eof => break,
                        _ => {}
                    }
                    interp_tokens.push(tok);
                }
                parts.push(StringPart::Interpolation(interp_tokens));
            } else if ch == '\\' {
                // Escape sequence in literal text; `?` stands in for bad ones.
                self.advance();
                if let Some(escaped) = self.parse_escape_char() {
                    current_lit.push(escaped);
                } else {
                    current_lit.push('?');
                }
            } else {
                current_lit.push(ch);
                self.advance();
            }
        }
        Token::new(
            TokenKind::Error("unterminated interpolated string".to_string()),
            Span::new(start, self.pos, start_line, start_col),
        )
    }
    /// Lex a single token for interpolated-string sub-lexing.
    ///
    /// NOTE(review): the previous comment claimed this runs "without
    /// skipping whitespace first", but the first statement below is
    /// `self.skip_whitespace()`. The actual difference from `next_token` is
    /// the reduced dispatch: this version does not recognize doc comments,
    /// nested interpolated strings, or char literals (a `'` inside `{...}`
    /// falls through to the "unexpected character" error arm).
    fn next_token_raw(&mut self) -> Token {
        self.skip_whitespace();
        let start = self.pos;
        let start_line = self.line;
        let start_col = self.column;
        let Some(ch) = self.peek() else {
            // End of input: zero-width EOF token.
            return Token::new(
                TokenKind::Eof,
                Span::new(start, start, start_line, start_col),
            );
        };
        if ch.is_alphabetic() || ch == '_' {
            return self.lex_ident(start, start_line, start_col);
        }
        if ch.is_ascii_digit() {
            return self.lex_number(start, start_line, start_col);
        }
        if ch == '"' {
            return self.lex_string(start, start_line, start_col);
        }
        self.advance();
        // Operator / punctuation dispatch; two-character operators are
        // detected by peeking after the first character was consumed.
        let kind = match ch {
            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            ',' => TokenKind::Comma,
            ';' => TokenKind::Semicolon,
            '.' if self.peek() == Some('.') => {
                self.advance();
                TokenKind::DotDot
            }
            '.' => TokenKind::Dot,
            '|' if self.peek() == Some('|') => {
                self.advance();
                TokenKind::OrOr
            }
            '|' => TokenKind::Bar,
            '@' => TokenKind::At,
            '#' => TokenKind::Hash,
            '+' => TokenKind::Plus,
            '-' if self.peek() == Some('>') => {
                self.advance();
                TokenKind::Arrow
            }
            '-' => TokenKind::Minus,
            '>' if self.peek() == Some('=') => {
                self.advance();
                TokenKind::Ge
            }
            '>' => TokenKind::Gt,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '%' => TokenKind::Percent,
            '^' => TokenKind::Caret,
            '_' => TokenKind::Underscore,
            '?' => TokenKind::Question,
            '!' if self.peek() == Some('=') => {
                self.advance();
                TokenKind::BangEq
            }
            '!' => TokenKind::Bang,
            '&' if self.peek() == Some('&') => {
                self.advance();
                TokenKind::AndAnd
            }
            ':' if self.peek() == Some('=') => {
                self.advance();
                TokenKind::Assign
            }
            ':' => TokenKind::Colon,
            '=' if self.peek() == Some('>') => {
                self.advance();
                TokenKind::FatArrow
            }
            '=' => TokenKind::Eq,
            '<' if self.peek() == Some('-') => {
                self.advance();
                TokenKind::LeftArrow
            }
            '<' if self.peek() == Some('=') => {
                self.advance();
                TokenKind::Le
            }
            '<' => TokenKind::Lt,
            // Unicode operator aliases (←, ≤/⩽, ≥/⩾, ≠, →, ⇒, ∧, ∨, ¬, ↔,
            // ∀, ∃, λ, ⟨, ⟩).
            '\u{2190}' => TokenKind::LeftArrow,
            '\u{2264}' | '\u{2A7D}' => TokenKind::Le,
            '\u{2265}' | '\u{2A7E}' => TokenKind::Ge,
            '\u{2260}' => TokenKind::Ne,
            '\u{2192}' => TokenKind::Arrow,
            '\u{21D2}' => TokenKind::FatArrow,
            '\u{2227}' => TokenKind::And,
            '\u{2228}' => TokenKind::Or,
            '\u{00AC}' => TokenKind::Not,
            '\u{2194}' => TokenKind::Iff,
            '\u{2200}' => TokenKind::Forall,
            '\u{2203}' => TokenKind::Exists,
            '\u{03BB}' => TokenKind::Fun,
            '\u{27E8}' => TokenKind::LAngle,
            '\u{27E9}' => TokenKind::RAngle,
            _ => TokenKind::Error(format!("unexpected character: {}", ch)),
        };
        Token::new(kind, Span::new(start, self.pos, start_line, start_col))
    }
776    /// Get the next token.
777    pub fn next_token(&mut self) -> Token {
778        self.skip_whitespace();
779        let start = self.pos;
780        let start_line = self.line;
781        let start_col = self.column;
782        let Some(ch) = self.peek() else {
783            return Token::new(
784                TokenKind::Eof,
785                Span::new(start, start, start_line, start_col),
786            );
787        };
788        if ch == '/' && self.peek_ahead(1) == Some('-') && self.peek_ahead(2) == Some('-') {
789            return self.lex_block_doc_comment(start, start_line, start_col);
790        }
791        if ch == '-' && self.peek_ahead(1) == Some('-') && self.peek_ahead(2) == Some('-') {
792            return self.lex_line_doc_comment(start, start_line, start_col);
793        }
794        if ch == 's' && self.peek_ahead(1) == Some('!') && self.peek_ahead(2) == Some('"') {
795            return self.lex_interpolated_string(start, start_line, start_col);
796        }
797        if ch.is_alphabetic() || ch == '_' {
798            return self.lex_ident(start, start_line, start_col);
799        }
800        if ch.is_ascii_digit() {
801            return self.lex_number(start, start_line, start_col);
802        }
803        if ch == '"' {
804            return self.lex_string(start, start_line, start_col);
805        }
806        if ch == '\'' {
807            let is_char_lit = if self.peek_ahead(1) == Some('\\') {
808                true
809            } else {
810                self.peek_ahead(2) == Some('\'')
811            };
812            if is_char_lit {
813                return self.lex_char(start, start_line, start_col);
814            }
815        }
816        self.advance();
817        let kind = match ch {
818            '(' => TokenKind::LParen,
819            ')' => TokenKind::RParen,
820            '{' => TokenKind::LBrace,
821            '}' => TokenKind::RBrace,
822            '[' => TokenKind::LBracket,
823            ']' => TokenKind::RBracket,
824            ',' => TokenKind::Comma,
825            ';' => TokenKind::Semicolon,
826            '.' if self.peek() == Some('.') => {
827                self.advance();
828                TokenKind::DotDot
829            }
830            '.' => TokenKind::Dot,
831            '|' if self.peek() == Some('|') => {
832                self.advance();
833                TokenKind::OrOr
834            }
835            '|' => TokenKind::Bar,
836            '@' => TokenKind::At,
837            '#' => TokenKind::Hash,
838            '+' => TokenKind::Plus,
839            '-' if self.peek() == Some('>') => {
840                self.advance();
841                TokenKind::Arrow
842            }
843            '-' => TokenKind::Minus,
844            '>' if self.peek() == Some('=') => {
845                self.advance();
846                TokenKind::Ge
847            }
848            '>' => TokenKind::Gt,
849            '*' => TokenKind::Star,
850            '/' => TokenKind::Slash,
851            '%' => TokenKind::Percent,
852            '^' => TokenKind::Caret,
853            '_' => TokenKind::Underscore,
854            '?' => TokenKind::Question,
855            '\'' => TokenKind::Error("unexpected tick '".to_string()),
856            '!' if self.peek() == Some('=') => {
857                self.advance();
858                TokenKind::BangEq
859            }
860            '!' => TokenKind::Bang,
861            '&' if self.peek() == Some('&') => {
862                self.advance();
863                TokenKind::AndAnd
864            }
865            ':' if self.peek() == Some('=') => {
866                self.advance();
867                TokenKind::Assign
868            }
869            ':' => TokenKind::Colon,
870            '=' if self.peek() == Some('>') => {
871                self.advance();
872                TokenKind::FatArrow
873            }
874            '=' => TokenKind::Eq,
875            '<' if self.peek() == Some('-') => {
876                self.advance();
877                TokenKind::LeftArrow
878            }
879            '<' if self.peek() == Some('=') => {
880                self.advance();
881                TokenKind::Le
882            }
883            '<' => TokenKind::Lt,
884            '\u{2190}' => TokenKind::LeftArrow,
885            '\u{2264}' | '\u{2A7D}' => TokenKind::Le,
886            '\u{2265}' | '\u{2A7E}' => TokenKind::Ge,
887            '\u{2260}' => TokenKind::Ne,
888            '\u{2192}' => TokenKind::Arrow,
889            '\u{21D2}' => TokenKind::FatArrow,
890            '\u{2227}' => TokenKind::And,
891            '\u{2228}' => TokenKind::Or,
892            '\u{00AC}' => TokenKind::Not,
893            '\u{2194}' => TokenKind::Iff,
894            '\u{2200}' => TokenKind::Forall,
895            '\u{2203}' => TokenKind::Exists,
896            '\u{03BB}' => TokenKind::Fun,
897            '\u{27E8}' => TokenKind::LAngle,
898            '\u{27E9}' => TokenKind::RAngle,
899            _ => TokenKind::Error(format!("unexpected character: {}", ch)),
900        };
901        Token::new(kind, Span::new(start, self.pos, start_line, start_col))
902    }
903    /// Tokenize the entire input.
904    pub fn tokenize(&mut self) -> Vec<Token> {
905        let mut tokens = Vec::new();
906        loop {
907            let tok = self.next_token();
908            let is_eof = matches!(tok.kind, TokenKind::Eof);
909            tokens.push(tok);
910            if is_eof {
911                break;
912            }
913        }
914        tokens
915    }
916}
/// A simple string scanner with position tracking.
///
/// Walks a `&str` one `char` at a time while maintaining the current byte
/// offset plus 1-based line and column coordinates. Positions always sit on
/// UTF-8 character boundaries, so slicing by `pos` is safe.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct Scanner<'a> {
    /// The source text
    src: &'a str,
    /// Current byte position
    pos: usize,
    /// Current line number (1-based)
    line: usize,
    /// Current column number (1-based)
    col: usize,
}
impl<'a> Scanner<'a> {
    /// Create a new scanner positioned at the start of `src`.
    #[allow(dead_code)]
    pub fn new(src: &'a str) -> Self {
        Scanner {
            src,
            pos: 0,
            line: 1,
            col: 1,
        }
    }
    /// Peek at the current character without consuming it.
    #[allow(dead_code)]
    pub fn peek(&self) -> Option<char> {
        self.remaining().chars().next()
    }
    /// Consume the next character, updating line/column bookkeeping.
    ///
    /// Returns `None` at end of input.
    #[allow(dead_code)]
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.pos += ch.len_utf8();
        // A newline resets the column and advances the line counter.
        match ch {
            '\n' => {
                self.line += 1;
                self.col = 1;
            }
            _ => self.col += 1,
        }
        Some(ch)
    }
    /// Returns true when the whole source has been consumed.
    #[allow(dead_code)]
    pub fn is_eof(&self) -> bool {
        self.src.len() <= self.pos
    }
    /// Returns the current byte offset.
    #[allow(dead_code)]
    pub fn position(&self) -> usize {
        self.pos
    }
    /// Returns the current line (1-based).
    #[allow(dead_code)]
    pub fn line(&self) -> usize {
        self.line
    }
    /// Returns the current column (1-based).
    #[allow(dead_code)]
    pub fn col(&self) -> usize {
        self.col
    }
    /// Advance past any run of whitespace characters.
    #[allow(dead_code)]
    pub fn skip_whitespace(&mut self) {
        while matches!(self.peek(), Some(c) if c.is_whitespace()) {
            self.next();
        }
    }
    /// Consume characters while `pred` holds, returning the matched slice.
    #[allow(dead_code)]
    pub fn consume_while<F: Fn(char) -> bool>(&mut self, pred: F) -> &'a str {
        let begin = self.pos;
        while self.peek().map_or(false, |c| pred(c)) {
            self.next();
        }
        &self.src[begin..self.pos]
    }
    /// Returns the unconsumed tail of the source.
    #[allow(dead_code)]
    pub fn remaining(&self) -> &'a str {
        &self.src[self.pos..]
    }
}
/// A lexer position record for backtracking.
///
/// A snapshot of the cursor state (`offset`, `line`, `col`) that can be saved
/// before a speculative scan and restored afterwards; `Copy` makes snapshots
/// free to take and pass around.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone, Copy)]
pub struct LexerPos {
    /// Byte offset into the source
    pub offset: usize,
    /// Line number (presumably 1-based, matching `Scanner` — confirm at call sites)
    pub line: usize,
    /// Column number (presumably 1-based, matching `Scanner` — confirm at call sites)
    pub col: usize,
}
1014/// A lexer rule table.
1015#[allow(dead_code)]
1016#[allow(missing_docs)]
1017pub struct LexerRuleTable {
1018    /// All rules in this table
1019    pub rules: Vec<LexerRule>,
1020}
1021impl LexerRuleTable {
1022    /// Create a new empty table.
1023    #[allow(dead_code)]
1024    pub fn new() -> Self {
1025        LexerRuleTable { rules: Vec::new() }
1026    }
1027    /// Add a rule.
1028    #[allow(dead_code)]
1029    pub fn add(&mut self, rule: LexerRule) {
1030        self.rules.push(rule);
1031    }
1032    /// Find the first rule whose start class matches a character.
1033    #[allow(dead_code)]
1034    pub fn find_rule(&self, c: char) -> Option<&LexerRule> {
1035        self.rules.iter().find(|r| r.start.matches(c))
1036    }
1037    /// Returns the number of rules.
1038    #[allow(dead_code)]
1039    pub fn len(&self) -> usize {
1040        self.rules.len()
1041    }
1042    /// Returns true if the table is empty.
1043    #[allow(dead_code)]
1044    pub fn is_empty(&self) -> bool {
1045        self.rules.is_empty()
1046    }
1047}
/// A token category for grouping.
///
/// A coarse classification of tokens, suitable for reporting or summary
/// statistics. No code in this chunk consumes it; it carries no lexing
/// behavior of its own.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenCategory {
    /// Identifier tokens
    Ident,
    /// Keyword (reserved word) tokens
    Keyword,
    /// Literal tokens (numbers, strings, chars, …)
    Literal,
    /// Operator tokens
    Operator,
    /// Delimiter tokens (parens, brackets, etc.)
    Delimiter,
    /// Whitespace/trivia tokens (whitespace, comments)
    Trivia,
    /// End-of-file marker token
    Eof,
}
/// A token diff record showing insertions/deletions between two token sequences.
///
/// Pure counters — the record stores only tallies, not the tokens themselves,
/// so it cannot reconstruct *which* tokens differed.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone)]
pub struct TokenDiff {
    /// Number of tokens present only in the left sequence
    pub only_left: usize,
    /// Number of tokens present only in the right sequence
    pub only_right: usize,
    /// Number of tokens common to both sequences
    pub matching: usize,
}
/// A summary of a lexed file.
///
/// Simple per-category counters; all fields default to zero via
/// `#[derive(Default)]`.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Default)]
pub struct LexSummary {
    /// Total token count
    pub token_count: usize,
    /// Number of identifier tokens
    pub ident_count: usize,
    /// Number of keyword tokens
    pub keyword_count: usize,
    /// Number of operator tokens
    pub operator_count: usize,
    /// Number of literal tokens
    pub literal_count: usize,
    /// Number of comment tokens
    pub comment_count: usize,
    /// Number of whitespace tokens
    pub whitespace_count: usize,
}
impl LexSummary {
    /// Create a new, all-zero summary.
    #[allow(dead_code)]
    pub fn new() -> Self {
        Self::default()
    }
    /// Render the summary as a single `key=value` line.
    #[allow(dead_code)]
    pub fn format(&self) -> String {
        format!(
            "tokens={} idents={} keywords={} ops={} literals={} comments={} ws={}",
            self.token_count,
            self.ident_count,
            self.keyword_count,
            self.operator_count,
            self.literal_count,
            self.comment_count,
            self.whitespace_count
        )
    }
}
/// A keyword trie node for fast keyword detection.
///
/// Each node maps a character to a child node; a node whose `is_keyword`
/// flag is set marks the end of an inserted keyword. Lookup cost is
/// proportional to the query length, independent of the keyword count.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct KeywordTrie {
    /// Children by character
    children: std::collections::HashMap<char, KeywordTrie>,
    /// Whether this node is a terminal keyword
    pub is_keyword: bool,
}
impl KeywordTrie {
    /// Create an empty trie.
    #[allow(dead_code)]
    pub fn new() -> Self {
        Self {
            children: std::collections::HashMap::new(),
            is_keyword: false,
        }
    }
    /// Insert a keyword, creating intermediate nodes as needed.
    #[allow(dead_code)]
    pub fn insert(&mut self, keyword: &str) {
        let mut cursor = self;
        for ch in keyword.chars() {
            cursor = cursor.children.entry(ch).or_insert_with(KeywordTrie::new);
        }
        cursor.is_keyword = true;
    }
    /// Returns true iff `s` was previously inserted as a complete keyword.
    ///
    /// Prefixes of inserted keywords do not match unless inserted themselves.
    #[allow(dead_code)]
    pub fn contains(&self, s: &str) -> bool {
        let mut cursor = self;
        for ch in s.chars() {
            let Some(child) = cursor.children.get(&ch) else {
                return false;
            };
            cursor = child;
        }
        cursor.is_keyword
    }
}
1161/// A lexer rule: a name, a start class, and a continuation class.
1162#[allow(dead_code)]
1163#[allow(missing_docs)]
1164#[derive(Debug, Clone)]
1165pub struct LexerRule {
1166    /// Name of the token kind produced
1167    pub kind: String,
1168    /// Character class for the first character
1169    pub start: CharClass,
1170    /// Character class for continuation characters
1171    pub cont: CharClass,
1172}
1173impl LexerRule {
1174    /// Create a new lexer rule.
1175    #[allow(dead_code)]
1176    pub fn new(kind: &str, start: CharClass, cont: CharClass) -> Self {
1177        LexerRule {
1178            kind: kind.to_string(),
1179            start,
1180            cont,
1181        }
1182    }
1183}
/// A trivia (whitespace/comment) accumulator.
///
/// Collects trivia text across multiple pushes and remembers whether any
/// pushed chunk contained a newline (a fact parsers often care about when
/// deciding statement boundaries — once set, the flag stays set until
/// `clear`).
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Default)]
pub struct TriviaAccumulator {
    /// Accumulated trivia text
    pub text: String,
    /// Whether any newlines were found
    pub has_newlines: bool,
}
impl TriviaAccumulator {
    /// Create a new, empty accumulator.
    #[allow(dead_code)]
    pub fn new() -> Self {
        Self::default()
    }
    /// Append trivia text, updating the newline flag.
    #[allow(dead_code)]
    pub fn push(&mut self, s: &str) {
        // Short-circuits: once a newline was seen, later chunks aren't scanned.
        self.has_newlines = self.has_newlines || s.contains('\n');
        self.text.push_str(s);
    }
    /// Reset the accumulator to its empty state.
    #[allow(dead_code)]
    pub fn clear(&mut self) {
        self.text.clear();
        self.has_newlines = false;
    }
}
/// A simple character frequency table over a source string.
///
/// Tracks only the 7-bit ASCII range; characters at or above code point 128
/// are ignored when counting and always report a count of zero.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct CharFreqTable {
    /// Frequency counts indexed by char code (0..128)
    pub counts: [u32; 128],
}
impl CharFreqTable {
    /// Build a frequency table from source.
    #[allow(dead_code)]
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(src: &str) -> Self {
        let mut counts = [0u32; 128];
        for c in src.chars() {
            // `get_mut` silently skips non-ASCII code points (>= 128).
            if let Some(slot) = counts.get_mut(c as usize) {
                *slot += 1;
            }
        }
        CharFreqTable { counts }
    }
    /// Get the count for a character (zero for non-ASCII input).
    #[allow(dead_code)]
    pub fn count(&self, c: char) -> u32 {
        self.counts.get(c as usize).copied().unwrap_or(0)
    }
}
/// A simple tokenisation result with raw text.
///
/// Unlike the main `Token` type, this carries its kind as a plain string and
/// keeps the raw source text inline.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone)]
pub struct RawToken {
    /// Kind string
    pub kind: String,
    /// Raw text of the token
    pub text: String,
    /// Start byte offset
    pub start: usize,
    /// End byte offset
    pub end: usize,
    /// Line number
    pub line: usize,
    /// Column number
    pub col: usize,
}
/// A filtered view of a token sequence.
///
/// Drops whitespace tokens (kind `"WS"`) at construction time, then offers a
/// simple cursor-style peek/next interface over what remains.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct FilteredTokenStream {
    /// The underlying tokens
    pub tokens: Vec<RawToken>,
    /// Current index
    pos: usize,
}
impl FilteredTokenStream {
    /// Create from a token list, discarding all `"WS"` tokens.
    #[allow(dead_code)]
    pub fn new(tokens: Vec<RawToken>) -> Self {
        let mut kept = tokens;
        // In-place filter: preserves order, reuses the allocation.
        kept.retain(|t| t.kind != "WS");
        FilteredTokenStream { tokens: kept, pos: 0 }
    }
    /// Peek at the current token without advancing.
    #[allow(dead_code)]
    pub fn peek(&self) -> Option<&RawToken> {
        self.tokens.get(self.pos)
    }
    /// Consume and return the current token, or `None` at the end.
    #[allow(dead_code)]
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<&RawToken> {
        let tok = self.tokens.get(self.pos);
        if tok.is_some() {
            self.pos += 1;
        }
        tok
    }
    /// Returns true once every token has been consumed.
    #[allow(dead_code)]
    pub fn is_eof(&self) -> bool {
        self.remaining() == 0
    }
    /// Returns how many tokens are left to consume.
    #[allow(dead_code)]
    pub fn remaining(&self) -> usize {
        self.tokens.len().saturating_sub(self.pos)
    }
    /// Returns the cursor position (number of tokens consumed).
    #[allow(dead_code)]
    pub fn position(&self) -> usize {
        self.pos
    }
}
/// A simple character class for lexer rules.
///
/// Note the ASCII-only semantics of `Alpha`/`Digit`/`AlphaNum`, while
/// `Whitespace` uses the full Unicode definition.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone)]
pub enum CharClass {
    /// Matches any ASCII letter
    Alpha,
    /// Matches any ASCII digit
    Digit,
    /// Matches any ASCII alphanumeric
    AlphaNum,
    /// Matches any Unicode whitespace
    Whitespace,
    /// Matches one specific character
    Exact(char),
    /// Matches any character in the set
    OneOf(Vec<char>),
    /// Matches any character not in the set
    NoneOf(Vec<char>),
}
impl CharClass {
    /// Returns true if `c` belongs to this class.
    #[allow(dead_code)]
    pub fn matches(&self, c: char) -> bool {
        use CharClass::*;
        match self {
            Alpha => c.is_ascii_alphabetic(),
            Digit => c.is_ascii_digit(),
            AlphaNum => c.is_ascii_alphanumeric(),
            Whitespace => c.is_whitespace(),
            Exact(wanted) => *wanted == c,
            OneOf(set) => set.contains(&c),
            NoneOf(set) => !set.contains(&c),
        }
    }
}
/// A lexer mode for context-sensitive lexing.
///
/// Used as the element type of `LexerModeStack`, whose bottom entry is
/// always `Normal`.
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerMode {
    /// Normal (default) lexing mode
    Normal,
    /// Inside a string literal
    StringLit,
    /// Inside a comment
    Comment,
    /// Inside a tactic block
    Tactic,
}
/// A lexer state machine with explicit transitions.
///
/// Plain state labels only — no code in this chunk drives transitions
/// between them, so the transition relation lives elsewhere (or is unused).
#[allow(dead_code)]
#[allow(missing_docs)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerState {
    /// Initial state, before any character is classified
    Start,
    /// Scanning an identifier
    Ident,
    /// Scanning a number literal
    Number,
    /// Scanning an operator
    Operator,
    /// Scanning a string literal (just past the opening quote)
    StringStart,
    /// Scanning a line comment
    LineComment,
    /// Terminal state
    Done,
}
/// A utility to find the line boundaries in a source string.
///
/// Stores the byte offset of every line start (offset 0 plus the byte after
/// each `'\n'`), enabling O(log n) offset → line/column conversion.
#[allow(dead_code)]
#[allow(missing_docs)]
pub struct LineMap {
    /// Byte offsets of each line start
    pub line_starts: Vec<usize>,
}
impl LineMap {
    /// Build a LineMap from source text.
    #[allow(dead_code)]
    #[allow(clippy::should_implement_trait)]
    pub fn from_str(src: &str) -> Self {
        let mut line_starts = vec![0];
        // Each newline begins a new line at the byte right after it.
        line_starts.extend(src.match_indices('\n').map(|(i, _)| i + 1));
        LineMap { line_starts }
    }
    /// Returns the line number (1-based) for a byte offset.
    #[allow(dead_code)]
    pub fn line_for_offset(&self, offset: usize) -> usize {
        // The number of line starts at or before `offset` is exactly the
        // 1-based line number; `line_starts[0] == 0` guarantees it is >= 1.
        self.line_starts.partition_point(|&start| start <= offset)
    }
    /// Returns the column (1-based) for a byte offset.
    #[allow(dead_code)]
    pub fn col_for_offset(&self, offset: usize) -> usize {
        let line_idx = self.line_for_offset(offset) - 1;
        let line_start = self.line_starts.get(line_idx).copied().unwrap_or(0);
        offset - line_start + 1
    }
    /// Returns the total number of lines (a trailing newline counts as
    /// starting one more line).
    #[allow(dead_code)]
    pub fn line_count(&self) -> usize {
        self.line_starts.len()
    }
}