Skip to main content

oak_python/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use self::token_type::PythonTokenType;
5use crate::language::PythonLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::LexOutput,
9    source::{Source, TextEdit},
10};
11
12/// Python lexer state.
13pub(crate) type State<'a, S> = LexerState<'a, S, PythonLanguage>;
14
15/// Python lexer implementation.
16#[derive(Clone)]
17pub struct PythonLexer<'config> {
18    /// The Python language configuration.
19    config: &'config PythonLanguage,
20}
21
22impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
23    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PythonLanguage>) -> LexOutput<PythonLanguage> {
24        let mut state = State::new_with_cache(source, 0, cache);
25        let result = self.run(&mut state);
26        if result.is_ok() {
27            state.add_eof();
28        }
29        state.finish_with_cache(result, cache)
30    }
31}
32
33impl<'config> PythonLexer<'config> {
34    /// Creates a new Python lexer.
35    pub fn new(config: &'config PythonLanguage) -> Self {
36        Self { config }
37    }
38
39    /// Skips whitespace characters.
40    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
41        let start_pos = state.get_position();
42
43        while let Some(ch) = state.current() {
44            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
45        }
46
47        if state.get_position() > start_pos {
48            state.add_token(PythonTokenType::Whitespace, start_pos, state.get_position());
49            true
50        }
51        else {
52            false
53        }
54    }
55
56    /// Handles newline characters.
57    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) -> bool {
58        let start_pos = state.get_position();
59        let kind = if bracket_level > 0 { PythonTokenType::Whitespace } else { PythonTokenType::Newline };
60
61        if let Some('\n') = state.current() {
62            state.advance(1);
63            state.add_token(kind, start_pos, state.get_position());
64            true
65        }
66        else if let Some('\r') = state.current() {
67            state.advance(1);
68            if let Some('\n') = state.current() {
69                state.advance(1);
70            }
71            state.add_token(kind, start_pos, state.get_position());
72            true
73        }
74        else {
75            false
76        }
77    }
78
79    /// Handles comments.
80    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81        if let Some('#') = state.current() {
82            let start_pos = state.get_position();
83            state.advance(1); // Skip '#'
84
85            // Read until end of line
86            while let Some(ch) = state.current() {
87                if ch == '\n' || ch == '\r' {
88                    break;
89                }
90                state.advance(ch.len_utf8())
91            }
92
93            state.add_token(PythonTokenType::Comment, start_pos, state.get_position());
94            true
95        }
96        else {
97            false
98        }
99    }
100
101    /// Handles string literals.
102    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103        let start_pos = state.get_position();
104
105        // Check for prefixes (f, r, b, u, etc.)
106        let mut prefix = None;
107        if let Some(ch) = state.current() {
108            if "frbuFRBU".contains(ch) {
109                // Check if next char is a quote
110                if let Some(next_ch) = state.peek_next_n(ch.len_utf8()) {
111                    if next_ch == '"' || next_ch == '\'' {
112                        prefix = Some(ch.to_ascii_lowercase());
113                        state.advance(ch.len_utf8());
114                    }
115                }
116            }
117        }
118
119        // Check if it's the start of a string
120        let quote_char = match state.current() {
121            Some('"') => '"',
122            Some('\'') => '\'',
123            _ => {
124                if prefix.is_some() {
125                    // This shouldn't happen if we checked correctly above
126                    return false;
127                }
128                return false;
129            }
130        };
131
132        state.advance(1); // Skip first quote
133
134        // Check if it's a triple-quoted string
135        let is_triple = if let (Some(c1), Some(c2)) = (state.peek_next_n(0), state.peek_next_n(1)) { c1 == quote_char && c2 == quote_char } else { false };
136
137        if is_triple {
138            state.advance(2); // Skip remaining two quotes
139        }
140
141        let mut escaped = false;
142        while let Some(ch) = state.current() {
143            if escaped {
144                escaped = false;
145                state.advance(ch.len_utf8());
146                continue;
147            }
148
149            if ch == '\\' {
150                escaped = true;
151                state.advance(1);
152                continue;
153            }
154
155            if ch == quote_char {
156                if is_triple {
157                    if let (Some(c1), Some(c2)) = (state.peek_next_n(1), state.peek_next_n(2)) {
158                        if c1 == quote_char && c2 == quote_char {
159                            state.advance(3); // Skip three quotes
160                            break;
161                        }
162                    }
163                    state.advance(1);
164                    continue;
165                }
166                else {
167                    state.advance(1); // Skip closing quote
168                    break;
169                }
170            }
171            else if (ch == '\n' || ch == '\r') && !is_triple {
172                // Single-line strings cannot contain newlines
173                break;
174            }
175            else {
176                state.advance(ch.len_utf8());
177            }
178        }
179
180        let kind = match prefix {
181            Some('f') => PythonTokenType::FString,
182            Some('b') => PythonTokenType::Bytes,
183            _ => PythonTokenType::String,
184        };
185        state.add_token(kind, start_pos, state.get_position());
186        true
187    }
188
189    /// Handles number literals.
190    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
191        let start_pos = state.get_position();
192
193        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
194            return false;
195        }
196
197        // Simple implementation: only handles basic decimal numbers
198        while let Some(ch) = state.current() {
199            if ch.is_ascii_digit() || ch == '.' {
200                state.advance(1);
201            }
202            else {
203                break;
204            }
205        }
206
207        state.add_token(PythonTokenType::Number, start_pos, state.get_position());
208        true
209    }
210
211    /// Handles identifiers or keywords.
212    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
213        let start_pos = state.get_position();
214
215        // Check first character
216        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
217            return false;
218        }
219
220        // Read identifier
221        let mut text = String::new();
222        while let Some(ch) = state.current() {
223            if ch.is_ascii_alphanumeric() || ch == '_' {
224                text.push(ch);
225                state.advance(ch.len_utf8());
226            }
227            else {
228                break;
229            }
230        }
231
232        // Check if it's a keyword
233        let kind = match text.as_str() {
234            "and" => PythonTokenType::AndKeyword,
235            "as" => PythonTokenType::AsKeyword,
236            "assert" => PythonTokenType::AssertKeyword,
237            "async" => PythonTokenType::AsyncKeyword,
238            "await" => PythonTokenType::AwaitKeyword,
239            "break" => PythonTokenType::BreakKeyword,
240            "class" => PythonTokenType::ClassKeyword,
241            "continue" => PythonTokenType::ContinueKeyword,
242            "def" => PythonTokenType::DefKeyword,
243            "del" => PythonTokenType::DelKeyword,
244            "elif" => PythonTokenType::ElifKeyword,
245            "else" => PythonTokenType::ElseKeyword,
246            "except" => PythonTokenType::ExceptKeyword,
247            "False" => PythonTokenType::FalseKeyword,
248            "finally" => PythonTokenType::FinallyKeyword,
249            "for" => PythonTokenType::ForKeyword,
250            "from" => PythonTokenType::FromKeyword,
251            "global" => PythonTokenType::GlobalKeyword,
252            "if" => PythonTokenType::IfKeyword,
253            "import" => PythonTokenType::ImportKeyword,
254            "in" => PythonTokenType::InKeyword,
255            "is" => PythonTokenType::IsKeyword,
256            "lambda" => PythonTokenType::LambdaKeyword,
257            "None" => PythonTokenType::NoneKeyword,
258            "nonlocal" => PythonTokenType::NonlocalKeyword,
259            "not" => PythonTokenType::NotKeyword,
260            "or" => PythonTokenType::OrKeyword,
261            "pass" => PythonTokenType::PassKeyword,
262            "raise" => PythonTokenType::RaiseKeyword,
263            "return" => PythonTokenType::ReturnKeyword,
264            "True" => PythonTokenType::TrueKeyword,
265            "try" => PythonTokenType::TryKeyword,
266            "while" => PythonTokenType::WhileKeyword,
267            "with" => PythonTokenType::WithKeyword,
268            "yield" => PythonTokenType::YieldKeyword,
269            _ => PythonTokenType::Identifier,
270        };
271
272        state.add_token(kind, start_pos, state.get_position());
273        true
274    }
275
276    /// Handles operators.
277    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
278        let start_pos = state.get_position();
279
280        if let Some(ch) = state.current() {
281            let kind = match ch {
282                '+' => {
283                    state.advance(1);
284                    if let Some('=') = state.current() {
285                        state.advance(1);
286                        PythonTokenType::PlusAssign
287                    }
288                    else {
289                        PythonTokenType::Plus
290                    }
291                }
292                '-' => {
293                    state.advance(1);
294                    if let Some('=') = state.current() {
295                        state.advance(1);
296                        PythonTokenType::MinusAssign
297                    }
298                    else if let Some('>') = state.current() {
299                        state.advance(1);
300                        PythonTokenType::Arrow
301                    }
302                    else {
303                        PythonTokenType::Minus
304                    }
305                }
306                '*' => {
307                    state.advance(1);
308                    if let Some('=') = state.current() {
309                        state.advance(1);
310                        PythonTokenType::StarAssign
311                    }
312                    else if let Some('*') = state.current() {
313                        state.advance(1);
314                        if let Some('=') = state.current() {
315                            state.advance(1);
316                            PythonTokenType::DoubleStarAssign
317                        }
318                        else {
319                            PythonTokenType::DoubleStar
320                        }
321                    }
322                    else {
323                        PythonTokenType::Star
324                    }
325                }
326                '/' => {
327                    state.advance(1);
328                    if let Some('=') = state.current() {
329                        state.advance(1);
330                        PythonTokenType::SlashAssign
331                    }
332                    else if let Some('/') = state.current() {
333                        state.advance(1);
334                        if let Some('=') = state.current() {
335                            state.advance(1);
336                            PythonTokenType::DoubleSlashAssign
337                        }
338                        else {
339                            PythonTokenType::DoubleSlash
340                        }
341                    }
342                    else {
343                        PythonTokenType::Slash
344                    }
345                }
346                '%' => {
347                    state.advance(1);
348                    if let Some('=') = state.current() {
349                        state.advance(1);
350                        PythonTokenType::PercentAssign
351                    }
352                    else {
353                        PythonTokenType::Percent
354                    }
355                }
356                '=' => {
357                    state.advance(1);
358                    if let Some('=') = state.current() {
359                        state.advance(1);
360                        PythonTokenType::Equal
361                    }
362                    else {
363                        PythonTokenType::Assign
364                    }
365                }
366                '<' => {
367                    state.advance(1);
368                    if let Some('=') = state.current() {
369                        state.advance(1);
370                        PythonTokenType::LessEqual
371                    }
372                    else if let Some('<') = state.current() {
373                        state.advance(1);
374                        if let Some('=') = state.current() {
375                            state.advance(1);
376                            PythonTokenType::LeftShiftAssign
377                        }
378                        else {
379                            PythonTokenType::LeftShift
380                        }
381                    }
382                    else {
383                        PythonTokenType::Less
384                    }
385                }
386                '>' => {
387                    state.advance(1);
388                    if let Some('=') = state.current() {
389                        state.advance(1);
390                        PythonTokenType::GreaterEqual
391                    }
392                    else if let Some('>') = state.current() {
393                        state.advance(1);
394                        if let Some('=') = state.current() {
395                            state.advance(1);
396                            PythonTokenType::RightShiftAssign
397                        }
398                        else {
399                            PythonTokenType::RightShift
400                        }
401                    }
402                    else {
403                        PythonTokenType::Greater
404                    }
405                }
406                '!' => {
407                    state.advance(1);
408                    if let Some('=') = state.current() {
409                        state.advance(1);
410                        PythonTokenType::NotEqual
411                    }
412                    else {
413                        return false;
414                    }
415                }
416                '&' => {
417                    state.advance(1);
418                    if let Some('=') = state.current() {
419                        state.advance(1);
420                        PythonTokenType::AmpersandAssign
421                    }
422                    else {
423                        PythonTokenType::Ampersand
424                    }
425                }
426                '|' => {
427                    state.advance(1);
428                    if let Some('=') = state.current() {
429                        state.advance(1);
430                        PythonTokenType::PipeAssign
431                    }
432                    else {
433                        PythonTokenType::Pipe
434                    }
435                }
436                '^' => {
437                    state.advance(1);
438                    if let Some('=') = state.current() {
439                        state.advance(1);
440                        PythonTokenType::CaretAssign
441                    }
442                    else {
443                        PythonTokenType::Caret
444                    }
445                }
446                '~' => {
447                    state.advance(1);
448                    PythonTokenType::Tilde
449                }
450                '@' => {
451                    state.advance(1);
452                    if let Some('=') = state.current() {
453                        state.advance(1);
454                        PythonTokenType::AtAssign
455                    }
456                    else {
457                        PythonTokenType::At
458                    }
459                }
460                _ => return false,
461            };
462
463            state.add_token(kind, start_pos, state.get_position());
464            return true;
465        }
466
467        false
468    }
469
470    /// Handles delimiters.
471    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
472        let start_pos = state.get_position();
473
474        if let Some(ch) = state.current() {
475            let kind = match ch {
476                '(' => PythonTokenType::LeftParen,
477                ')' => PythonTokenType::RightParen,
478                '[' => PythonTokenType::LeftBracket,
479                ']' => PythonTokenType::RightBracket,
480                '{' => PythonTokenType::LeftBrace,
481                '}' => PythonTokenType::RightBrace,
482                ',' => PythonTokenType::Comma,
483                ':' => PythonTokenType::Colon,
484                ';' => PythonTokenType::Semicolon,
485                '.' => PythonTokenType::Dot, // Simple handling, ellipses not supported
486                _ => return false,
487            };
488
489            state.advance(1);
490            state.add_token(kind, start_pos, state.get_position());
491            return true;
492        }
493
494        false
495    }
496}
497
498impl<'config> PythonLexer<'config> {
499    pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
500        let mut indent_stack = vec![0];
501        let mut bracket_level: usize = 0;
502        let mut at_line_start = true;
503
504        while state.not_at_end() {
505            let safe_point = state.get_position();
506
507            if at_line_start && bracket_level == 0 {
508                self.handle_indentation(state, &mut indent_stack);
509                at_line_start = false;
510                continue;
511            }
512
513            if let Some(ch) = state.peek() {
514                match ch {
515                    ' ' | '\t' => {
516                        self.skip_whitespace(state);
517                    }
518                    '\n' | '\r' => {
519                        self.lex_newline(state, bracket_level);
520                        at_line_start = true;
521                    }
522                    '#' => {
523                        self.lex_comment(state);
524                    }
525                    '"' | '\'' => {
526                        self.lex_string(state);
527                    }
528                    '0'..='9' => {
529                        self.lex_number(state);
530                    }
531                    'f' | 'r' | 'b' | 'u' | 'F' | 'R' | 'B' | 'U' => {
532                        if !self.lex_string(state) {
533                            self.lex_identifier_or_keyword(state);
534                        }
535                    }
536                    'a'..='e' | 'g'..='q' | 's' | 't' | 'v'..='z' | 'A'..='E' | 'G'..='Q' | 'S' | 'T' | 'V'..='Z' | '_' => {
537                        self.lex_identifier_or_keyword(state);
538                    }
539                    '(' | '[' | '{' => {
540                        bracket_level += 1;
541                        self.lex_delimiter(state);
542                    }
543                    ')' | ']' | '}' => {
544                        bracket_level = bracket_level.saturating_sub(1);
545                        self.lex_delimiter(state);
546                    }
547                    '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '&' | '|' | '^' | '~' | '@' => {
548                        self.lex_operator(state);
549                    }
550                    ',' | ':' | ';' | '.' => {
551                        self.lex_delimiter(state);
552                    }
553                    _ => {
554                        // Fallback to error
555                        state.advance(ch.len_utf8());
556                        state.add_token(PythonTokenType::Error, safe_point, state.get_position())
557                    }
558                }
559            }
560
561            state.advance_if_dead_lock(safe_point)
562        }
563
564        // Emit remaining dedents
565        while indent_stack.len() > 1 {
566            indent_stack.pop();
567            let pos = state.get_position();
568            state.add_token(PythonTokenType::Dedent, pos, pos)
569        }
570
571        Ok(())
572    }
573
574    fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
575        let start_pos = state.get_position();
576        let current_indent;
577
578        // Skip comments and empty lines at start of line
579        let mut temp_state = state.get_position();
580        loop {
581            let mut indent = 0;
582            while let Some(ch) = state.get_char_at(temp_state) {
583                if ch == ' ' {
584                    indent += 1
585                }
586                else if ch == '\t' {
587                    indent += 8
588                }
589                // Standard Python tab width
590                else {
591                    break;
592                }
593                temp_state += 1
594            }
595
596            match state.get_char_at(temp_state) {
597                Some('\n') | Some('\r') | Some('#') => {
598                    // This is an empty line or comment-only line, ignore indentation change
599                    return;
600                }
601                None => return, // EOF
602                _ => {
603                    current_indent = indent;
604                    break;
605                }
606            }
607        }
608
609        // Advance state to skip the indentation we just measured
610        if current_indent > 0 {
611            let end_pos = state.get_position() + (temp_state - state.get_position());
612            state.add_token(PythonTokenType::Whitespace, start_pos, end_pos);
613            state.set_position(end_pos);
614        }
615
616        let last_indent = *stack.last().unwrap();
617        if current_indent > last_indent {
618            stack.push(current_indent);
619            state.add_token(PythonTokenType::Indent, state.get_position(), state.get_position())
620        }
621        else {
622            while current_indent < *stack.last().unwrap() {
623                stack.pop();
624                state.add_token(PythonTokenType::Dedent, state.get_position(), state.get_position())
625            }
626            // If current_indent doesn't match any previous level, it's an indentation error,
627            // but for now we just stop at the closest level.
628        }
629    }
630}