Skip to main content

oak_python/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use self::token_type::PythonTokenType;
5use crate::language::PythonLanguage;
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::LexOutput,
9    source::{Source, TextEdit},
10};
11
12type State<'a, S> = LexerState<'a, S, PythonLanguage>;
13
14/// Python lexer implementation.
15#[derive(Clone)]
16pub struct PythonLexer<'config> {
17    _config: &'config PythonLanguage,
18}
19
20impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PythonLanguage>) -> LexOutput<PythonLanguage> {
22        let mut state = State::new_with_cache(source, 0, cache);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> PythonLexer<'config> {
32    /// Creates a new Python lexer.
33    pub fn new(config: &'config PythonLanguage) -> Self {
34        Self { _config: config }
35    }
36
37    /// Skips whitespace characters.
38    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
39        let start_pos = state.get_position();
40
41        while let Some(ch) = state.current() {
42            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
43        }
44
45        if state.get_position() > start_pos {
46            state.add_token(PythonTokenType::Whitespace, start_pos, state.get_position());
47            true
48        }
49        else {
50            false
51        }
52    }
53
54    /// Handles newline characters.
55    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) -> bool {
56        let start_pos = state.get_position();
57        let kind = if bracket_level > 0 { PythonTokenType::Whitespace } else { PythonTokenType::Newline };
58
59        if let Some('\n') = state.current() {
60            state.advance(1);
61            state.add_token(kind, start_pos, state.get_position());
62            true
63        }
64        else if let Some('\r') = state.current() {
65            state.advance(1);
66            if let Some('\n') = state.current() {
67                state.advance(1);
68            }
69            state.add_token(kind, start_pos, state.get_position());
70            true
71        }
72        else {
73            false
74        }
75    }
76
77    /// Handles comments.
78    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79        if let Some('#') = state.current() {
80            let start_pos = state.get_position();
81            state.advance(1); // Skip '#'
82
83            // Read until end of line
84            while let Some(ch) = state.current() {
85                if ch == '\n' || ch == '\r' {
86                    break;
87                }
88                state.advance(ch.len_utf8())
89            }
90
91            state.add_token(PythonTokenType::Comment, start_pos, state.get_position());
92            true
93        }
94        else {
95            false
96        }
97    }
98
99    /// Handles string literals.
100    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
101        let start_pos = state.get_position();
102
103        // Check for prefixes (f, r, b, u, etc.)
104        let mut prefix = None;
105        if let Some(ch) = state.current() {
106            if "frbuFRBU".contains(ch) {
107                // Check if next char is a quote
108                if let Some(next_ch) = state.peek_next_n(ch.len_utf8()) {
109                    if next_ch == '"' || next_ch == '\'' {
110                        prefix = Some(ch.to_ascii_lowercase());
111                        state.advance(ch.len_utf8());
112                    }
113                }
114            }
115        }
116
117        // Check if it's the start of a string
118        let quote_char = match state.current() {
119            Some('"') => '"',
120            Some('\'') => '\'',
121            _ => {
122                if prefix.is_some() {
123                    // This shouldn't happen if we checked correctly above
124                    return false;
125                }
126                return false;
127            }
128        };
129
130        state.advance(1); // Skip first quote
131
132        // Check if it's a triple-quoted string
133        let is_triple = if let (Some(c1), Some(c2)) = (state.peek_next_n(0), state.peek_next_n(1)) { c1 == quote_char && c2 == quote_char } else { false };
134
135        if is_triple {
136            state.advance(2); // Skip remaining two quotes
137        }
138
139        let mut escaped = false;
140        while let Some(ch) = state.current() {
141            if escaped {
142                escaped = false;
143                state.advance(ch.len_utf8());
144                continue;
145            }
146
147            if ch == '\\' {
148                escaped = true;
149                state.advance(1);
150                continue;
151            }
152
153            if ch == quote_char {
154                if is_triple {
155                    if let (Some(c1), Some(c2)) = (state.peek_next_n(1), state.peek_next_n(2)) {
156                        if c1 == quote_char && c2 == quote_char {
157                            state.advance(3); // Skip three quotes
158                            break;
159                        }
160                    }
161                    state.advance(1);
162                    continue;
163                }
164                else {
165                    state.advance(1); // Skip closing quote
166                    break;
167                }
168            }
169            else if (ch == '\n' || ch == '\r') && !is_triple {
170                // Single-line strings cannot contain newlines
171                break;
172            }
173            else {
174                state.advance(ch.len_utf8());
175            }
176        }
177
178        let kind = match prefix {
179            Some('f') => PythonTokenType::FString,
180            Some('b') => PythonTokenType::Bytes,
181            _ => PythonTokenType::String,
182        };
183        state.add_token(kind, start_pos, state.get_position());
184        true
185    }
186
187    /// Handles number literals.
188    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189        let start_pos = state.get_position();
190
191        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
192            return false;
193        }
194
195        // Simple implementation: only handles basic decimal numbers
196        while let Some(ch) = state.current() {
197            if ch.is_ascii_digit() || ch == '.' {
198                state.advance(1);
199            }
200            else {
201                break;
202            }
203        }
204
205        state.add_token(PythonTokenType::Number, start_pos, state.get_position());
206        true
207    }
208
209    /// Handles identifiers or keywords.
210    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211        let start_pos = state.get_position();
212
213        // Check first character
214        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
215            return false;
216        }
217
218        // Read identifier
219        let mut text = String::new();
220        while let Some(ch) = state.current() {
221            if ch.is_ascii_alphanumeric() || ch == '_' {
222                text.push(ch);
223                state.advance(ch.len_utf8());
224            }
225            else {
226                break;
227            }
228        }
229
230        // Check if it's a keyword
231        let kind = match text.as_str() {
232            "and" => PythonTokenType::AndKeyword,
233            "as" => PythonTokenType::AsKeyword,
234            "assert" => PythonTokenType::AssertKeyword,
235            "async" => PythonTokenType::AsyncKeyword,
236            "await" => PythonTokenType::AwaitKeyword,
237            "break" => PythonTokenType::BreakKeyword,
238            "class" => PythonTokenType::ClassKeyword,
239            "continue" => PythonTokenType::ContinueKeyword,
240            "def" => PythonTokenType::DefKeyword,
241            "del" => PythonTokenType::DelKeyword,
242            "elif" => PythonTokenType::ElifKeyword,
243            "else" => PythonTokenType::ElseKeyword,
244            "except" => PythonTokenType::ExceptKeyword,
245            "False" => PythonTokenType::FalseKeyword,
246            "finally" => PythonTokenType::FinallyKeyword,
247            "for" => PythonTokenType::ForKeyword,
248            "from" => PythonTokenType::FromKeyword,
249            "global" => PythonTokenType::GlobalKeyword,
250            "if" => PythonTokenType::IfKeyword,
251            "import" => PythonTokenType::ImportKeyword,
252            "in" => PythonTokenType::InKeyword,
253            "is" => PythonTokenType::IsKeyword,
254            "lambda" => PythonTokenType::LambdaKeyword,
255            "None" => PythonTokenType::NoneKeyword,
256            "nonlocal" => PythonTokenType::NonlocalKeyword,
257            "not" => PythonTokenType::NotKeyword,
258            "or" => PythonTokenType::OrKeyword,
259            "pass" => PythonTokenType::PassKeyword,
260            "raise" => PythonTokenType::RaiseKeyword,
261            "return" => PythonTokenType::ReturnKeyword,
262            "True" => PythonTokenType::TrueKeyword,
263            "try" => PythonTokenType::TryKeyword,
264            "while" => PythonTokenType::WhileKeyword,
265            "with" => PythonTokenType::WithKeyword,
266            "yield" => PythonTokenType::YieldKeyword,
267            _ => PythonTokenType::Identifier,
268        };
269
270        state.add_token(kind, start_pos, state.get_position());
271        true
272    }
273
274    /// Handles operators.
275    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276        let start_pos = state.get_position();
277
278        if let Some(ch) = state.current() {
279            let kind = match ch {
280                '+' => {
281                    state.advance(1);
282                    if let Some('=') = state.current() {
283                        state.advance(1);
284                        PythonTokenType::PlusAssign
285                    }
286                    else {
287                        PythonTokenType::Plus
288                    }
289                }
290                '-' => {
291                    state.advance(1);
292                    if let Some('=') = state.current() {
293                        state.advance(1);
294                        PythonTokenType::MinusAssign
295                    }
296                    else if let Some('>') = state.current() {
297                        state.advance(1);
298                        PythonTokenType::Arrow
299                    }
300                    else {
301                        PythonTokenType::Minus
302                    }
303                }
304                '*' => {
305                    state.advance(1);
306                    if let Some('=') = state.current() {
307                        state.advance(1);
308                        PythonTokenType::StarAssign
309                    }
310                    else if let Some('*') = state.current() {
311                        state.advance(1);
312                        if let Some('=') = state.current() {
313                            state.advance(1);
314                            PythonTokenType::DoubleStarAssign
315                        }
316                        else {
317                            PythonTokenType::DoubleStar
318                        }
319                    }
320                    else {
321                        PythonTokenType::Star
322                    }
323                }
324                '/' => {
325                    state.advance(1);
326                    if let Some('=') = state.current() {
327                        state.advance(1);
328                        PythonTokenType::SlashAssign
329                    }
330                    else if let Some('/') = state.current() {
331                        state.advance(1);
332                        if let Some('=') = state.current() {
333                            state.advance(1);
334                            PythonTokenType::DoubleSlashAssign
335                        }
336                        else {
337                            PythonTokenType::DoubleSlash
338                        }
339                    }
340                    else {
341                        PythonTokenType::Slash
342                    }
343                }
344                '%' => {
345                    state.advance(1);
346                    if let Some('=') = state.current() {
347                        state.advance(1);
348                        PythonTokenType::PercentAssign
349                    }
350                    else {
351                        PythonTokenType::Percent
352                    }
353                }
354                '=' => {
355                    state.advance(1);
356                    if let Some('=') = state.current() {
357                        state.advance(1);
358                        PythonTokenType::Equal
359                    }
360                    else {
361                        PythonTokenType::Assign
362                    }
363                }
364                '<' => {
365                    state.advance(1);
366                    if let Some('=') = state.current() {
367                        state.advance(1);
368                        PythonTokenType::LessEqual
369                    }
370                    else if let Some('<') = state.current() {
371                        state.advance(1);
372                        if let Some('=') = state.current() {
373                            state.advance(1);
374                            PythonTokenType::LeftShiftAssign
375                        }
376                        else {
377                            PythonTokenType::LeftShift
378                        }
379                    }
380                    else {
381                        PythonTokenType::Less
382                    }
383                }
384                '>' => {
385                    state.advance(1);
386                    if let Some('=') = state.current() {
387                        state.advance(1);
388                        PythonTokenType::GreaterEqual
389                    }
390                    else if let Some('>') = state.current() {
391                        state.advance(1);
392                        if let Some('=') = state.current() {
393                            state.advance(1);
394                            PythonTokenType::RightShiftAssign
395                        }
396                        else {
397                            PythonTokenType::RightShift
398                        }
399                    }
400                    else {
401                        PythonTokenType::Greater
402                    }
403                }
404                '!' => {
405                    state.advance(1);
406                    if let Some('=') = state.current() {
407                        state.advance(1);
408                        PythonTokenType::NotEqual
409                    }
410                    else {
411                        return false;
412                    }
413                }
414                '&' => {
415                    state.advance(1);
416                    if let Some('=') = state.current() {
417                        state.advance(1);
418                        PythonTokenType::AmpersandAssign
419                    }
420                    else {
421                        PythonTokenType::Ampersand
422                    }
423                }
424                '|' => {
425                    state.advance(1);
426                    if let Some('=') = state.current() {
427                        state.advance(1);
428                        PythonTokenType::PipeAssign
429                    }
430                    else {
431                        PythonTokenType::Pipe
432                    }
433                }
434                '^' => {
435                    state.advance(1);
436                    if let Some('=') = state.current() {
437                        state.advance(1);
438                        PythonTokenType::CaretAssign
439                    }
440                    else {
441                        PythonTokenType::Caret
442                    }
443                }
444                '~' => {
445                    state.advance(1);
446                    PythonTokenType::Tilde
447                }
448                '@' => {
449                    state.advance(1);
450                    if let Some('=') = state.current() {
451                        state.advance(1);
452                        PythonTokenType::AtAssign
453                    }
454                    else {
455                        PythonTokenType::At
456                    }
457                }
458                _ => return false,
459            };
460
461            state.add_token(kind, start_pos, state.get_position());
462            return true;
463        }
464
465        false
466    }
467
468    /// Handles delimiters.
469    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
470        let start_pos = state.get_position();
471
472        if let Some(ch) = state.current() {
473            let kind = match ch {
474                '(' => PythonTokenType::LeftParen,
475                ')' => PythonTokenType::RightParen,
476                '[' => PythonTokenType::LeftBracket,
477                ']' => PythonTokenType::RightBracket,
478                '{' => PythonTokenType::LeftBrace,
479                '}' => PythonTokenType::RightBrace,
480                ',' => PythonTokenType::Comma,
481                ':' => PythonTokenType::Colon,
482                ';' => PythonTokenType::Semicolon,
483                '.' => PythonTokenType::Dot, // Simple handling, ellipses not supported
484                _ => return false,
485            };
486
487            state.advance(1);
488            state.add_token(kind, start_pos, state.get_position());
489            return true;
490        }
491
492        false
493    }
494}
495
496impl<'config> PythonLexer<'config> {
497    pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
498        let mut indent_stack = vec![0];
499        let mut bracket_level: usize = 0;
500        let mut at_line_start = true;
501
502        while state.not_at_end() {
503            let safe_point = state.get_position();
504
505            if at_line_start && bracket_level == 0 {
506                self.handle_indentation(state, &mut indent_stack);
507                at_line_start = false;
508                continue;
509            }
510
511            if let Some(ch) = state.peek() {
512                match ch {
513                    ' ' | '\t' => {
514                        self.skip_whitespace(state);
515                    }
516                    '\n' | '\r' => {
517                        self.lex_newline(state, bracket_level);
518                        at_line_start = true;
519                    }
520                    '#' => {
521                        self.lex_comment(state);
522                    }
523                    '"' | '\'' => {
524                        self.lex_string(state);
525                    }
526                    '0'..='9' => {
527                        self.lex_number(state);
528                    }
529                    'f' | 'r' | 'b' | 'u' | 'F' | 'R' | 'B' | 'U' => {
530                        if !self.lex_string(state) {
531                            self.lex_identifier_or_keyword(state);
532                        }
533                    }
534                    'a'..='e' | 'g'..='q' | 's' | 't' | 'v'..='z' | 'A'..='E' | 'G'..='Q' | 'S' | 'T' | 'V'..='Z' | '_' => {
535                        self.lex_identifier_or_keyword(state);
536                    }
537                    '(' | '[' | '{' => {
538                        bracket_level += 1;
539                        self.lex_delimiter(state);
540                    }
541                    ')' | ']' | '}' => {
542                        bracket_level = bracket_level.saturating_sub(1);
543                        self.lex_delimiter(state);
544                    }
545                    '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '&' | '|' | '^' | '~' | '@' => {
546                        self.lex_operator(state);
547                    }
548                    ',' | ':' | ';' | '.' => {
549                        self.lex_delimiter(state);
550                    }
551                    _ => {
552                        // Fallback to error
553                        state.advance(ch.len_utf8());
554                        state.add_token(PythonTokenType::Error, safe_point, state.get_position())
555                    }
556                }
557            }
558
559            state.advance_if_dead_lock(safe_point)
560        }
561
562        // Emit remaining dedents
563        while indent_stack.len() > 1 {
564            indent_stack.pop();
565            let pos = state.get_position();
566            state.add_token(PythonTokenType::Dedent, pos, pos)
567        }
568
569        Ok(())
570    }
571
572    fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
573        let start_pos = state.get_position();
574        let current_indent;
575
576        // Skip comments and empty lines at start of line
577        let mut temp_state = state.get_position();
578        loop {
579            let mut indent = 0;
580            while let Some(ch) = state.get_char_at(temp_state) {
581                if ch == ' ' {
582                    indent += 1
583                }
584                else if ch == '\t' {
585                    indent += 8
586                }
587                // Standard Python tab width
588                else {
589                    break;
590                }
591                temp_state += 1
592            }
593
594            match state.get_char_at(temp_state) {
595                Some('\n') | Some('\r') | Some('#') => {
596                    // This is an empty line or comment-only line, ignore indentation change
597                    return;
598                }
599                None => return, // EOF
600                _ => {
601                    current_indent = indent;
602                    break;
603                }
604            }
605        }
606
607        // Advance state to skip the indentation we just measured
608        if current_indent > 0 {
609            let end_pos = state.get_position() + (temp_state - state.get_position());
610            state.add_token(PythonTokenType::Whitespace, start_pos, end_pos);
611            state.set_position(end_pos);
612        }
613
614        let last_indent = *stack.last().unwrap();
615        if current_indent > last_indent {
616            stack.push(current_indent);
617            state.add_token(PythonTokenType::Indent, state.get_position(), state.get_position())
618        }
619        else {
620            while current_indent < *stack.last().unwrap() {
621                stack.pop();
622                state.add_token(PythonTokenType::Dedent, state.get_position(), state.get_position())
623            }
624            // If current_indent doesn't match any previous level, it's an indentation error,
625            // but for now we just stop at the closest level.
626        }
627    }
628}