1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use self::token_type::PythonTokenType;
5use crate::language::PythonLanguage;
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::LexOutput,
9 source::{Source, TextEdit},
10};
11
/// Shorthand for the shared lexer state specialised to the Python language.
type State<'a, S> = LexerState<'a, S, PythonLanguage>;
13
/// Hand-written lexer for Python source text.
///
/// Borrows the [`PythonLanguage`] configuration for its lifetime; the
/// configuration is currently unused by the lexing routines (hence the
/// leading-underscore field name).
#[derive(Clone)]
pub struct PythonLexer<'config> {
    // Held for future configuration-driven behaviour; not read yet.
    _config: &'config PythonLanguage,
}
19
20impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PythonLanguage>) -> LexOutput<PythonLanguage> {
22 let mut state = State::new_with_cache(source, 0, cache);
23 let result = self.run(&mut state);
24 if result.is_ok() {
25 state.add_eof();
26 }
27 state.finish_with_cache(result, cache)
28 }
29}
30
31impl<'config> PythonLexer<'config> {
32 pub fn new(config: &'config PythonLanguage) -> Self {
34 Self { _config: config }
35 }
36
37 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
39 let start_pos = state.get_position();
40
41 while let Some(ch) = state.current() {
42 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
43 }
44
45 if state.get_position() > start_pos {
46 state.add_token(PythonTokenType::Whitespace, start_pos, state.get_position());
47 true
48 }
49 else {
50 false
51 }
52 }
53
54 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) -> bool {
56 let start_pos = state.get_position();
57 let kind = if bracket_level > 0 { PythonTokenType::Whitespace } else { PythonTokenType::Newline };
58
59 if let Some('\n') = state.current() {
60 state.advance(1);
61 state.add_token(kind, start_pos, state.get_position());
62 true
63 }
64 else if let Some('\r') = state.current() {
65 state.advance(1);
66 if let Some('\n') = state.current() {
67 state.advance(1);
68 }
69 state.add_token(kind, start_pos, state.get_position());
70 true
71 }
72 else {
73 false
74 }
75 }
76
77 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79 if let Some('#') = state.current() {
80 let start_pos = state.get_position();
81 state.advance(1); while let Some(ch) = state.current() {
85 if ch == '\n' || ch == '\r' {
86 break;
87 }
88 state.advance(ch.len_utf8())
89 }
90
91 state.add_token(PythonTokenType::Comment, start_pos, state.get_position());
92 true
93 }
94 else {
95 false
96 }
97 }
98
99 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
101 let start_pos = state.get_position();
102
103 let mut prefix = None;
105 if let Some(ch) = state.current() {
106 if "frbuFRBU".contains(ch) {
107 if let Some(next_ch) = state.peek_next_n(ch.len_utf8()) {
109 if next_ch == '"' || next_ch == '\'' {
110 prefix = Some(ch.to_ascii_lowercase());
111 state.advance(ch.len_utf8());
112 }
113 }
114 }
115 }
116
117 let quote_char = match state.current() {
119 Some('"') => '"',
120 Some('\'') => '\'',
121 _ => {
122 if prefix.is_some() {
123 return false;
125 }
126 return false;
127 }
128 };
129
130 state.advance(1); let is_triple = if let (Some(c1), Some(c2)) = (state.peek_next_n(0), state.peek_next_n(1)) { c1 == quote_char && c2 == quote_char } else { false };
134
135 if is_triple {
136 state.advance(2); }
138
139 let mut escaped = false;
140 while let Some(ch) = state.current() {
141 if escaped {
142 escaped = false;
143 state.advance(ch.len_utf8());
144 continue;
145 }
146
147 if ch == '\\' {
148 escaped = true;
149 state.advance(1);
150 continue;
151 }
152
153 if ch == quote_char {
154 if is_triple {
155 if let (Some(c1), Some(c2)) = (state.peek_next_n(1), state.peek_next_n(2)) {
156 if c1 == quote_char && c2 == quote_char {
157 state.advance(3); break;
159 }
160 }
161 state.advance(1);
162 continue;
163 }
164 else {
165 state.advance(1); break;
167 }
168 }
169 else if (ch == '\n' || ch == '\r') && !is_triple {
170 break;
172 }
173 else {
174 state.advance(ch.len_utf8());
175 }
176 }
177
178 let kind = match prefix {
179 Some('f') => PythonTokenType::FString,
180 Some('b') => PythonTokenType::Bytes,
181 _ => PythonTokenType::String,
182 };
183 state.add_token(kind, start_pos, state.get_position());
184 true
185 }
186
187 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189 let start_pos = state.get_position();
190
191 if !state.current().map_or(false, |c| c.is_ascii_digit()) {
192 return false;
193 }
194
195 while let Some(ch) = state.current() {
197 if ch.is_ascii_digit() || ch == '.' {
198 state.advance(1);
199 }
200 else {
201 break;
202 }
203 }
204
205 state.add_token(PythonTokenType::Number, start_pos, state.get_position());
206 true
207 }
208
209 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211 let start_pos = state.get_position();
212
213 if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
215 return false;
216 }
217
218 let mut text = String::new();
220 while let Some(ch) = state.current() {
221 if ch.is_ascii_alphanumeric() || ch == '_' {
222 text.push(ch);
223 state.advance(ch.len_utf8());
224 }
225 else {
226 break;
227 }
228 }
229
230 let kind = match text.as_str() {
232 "and" => PythonTokenType::AndKeyword,
233 "as" => PythonTokenType::AsKeyword,
234 "assert" => PythonTokenType::AssertKeyword,
235 "async" => PythonTokenType::AsyncKeyword,
236 "await" => PythonTokenType::AwaitKeyword,
237 "break" => PythonTokenType::BreakKeyword,
238 "class" => PythonTokenType::ClassKeyword,
239 "continue" => PythonTokenType::ContinueKeyword,
240 "def" => PythonTokenType::DefKeyword,
241 "del" => PythonTokenType::DelKeyword,
242 "elif" => PythonTokenType::ElifKeyword,
243 "else" => PythonTokenType::ElseKeyword,
244 "except" => PythonTokenType::ExceptKeyword,
245 "False" => PythonTokenType::FalseKeyword,
246 "finally" => PythonTokenType::FinallyKeyword,
247 "for" => PythonTokenType::ForKeyword,
248 "from" => PythonTokenType::FromKeyword,
249 "global" => PythonTokenType::GlobalKeyword,
250 "if" => PythonTokenType::IfKeyword,
251 "import" => PythonTokenType::ImportKeyword,
252 "in" => PythonTokenType::InKeyword,
253 "is" => PythonTokenType::IsKeyword,
254 "lambda" => PythonTokenType::LambdaKeyword,
255 "None" => PythonTokenType::NoneKeyword,
256 "nonlocal" => PythonTokenType::NonlocalKeyword,
257 "not" => PythonTokenType::NotKeyword,
258 "or" => PythonTokenType::OrKeyword,
259 "pass" => PythonTokenType::PassKeyword,
260 "raise" => PythonTokenType::RaiseKeyword,
261 "return" => PythonTokenType::ReturnKeyword,
262 "True" => PythonTokenType::TrueKeyword,
263 "try" => PythonTokenType::TryKeyword,
264 "while" => PythonTokenType::WhileKeyword,
265 "with" => PythonTokenType::WithKeyword,
266 "yield" => PythonTokenType::YieldKeyword,
267 _ => PythonTokenType::Identifier,
268 };
269
270 state.add_token(kind, start_pos, state.get_position());
271 true
272 }
273
274 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276 let start_pos = state.get_position();
277
278 if let Some(ch) = state.current() {
279 let kind = match ch {
280 '+' => {
281 state.advance(1);
282 if let Some('=') = state.current() {
283 state.advance(1);
284 PythonTokenType::PlusAssign
285 }
286 else {
287 PythonTokenType::Plus
288 }
289 }
290 '-' => {
291 state.advance(1);
292 if let Some('=') = state.current() {
293 state.advance(1);
294 PythonTokenType::MinusAssign
295 }
296 else if let Some('>') = state.current() {
297 state.advance(1);
298 PythonTokenType::Arrow
299 }
300 else {
301 PythonTokenType::Minus
302 }
303 }
304 '*' => {
305 state.advance(1);
306 if let Some('=') = state.current() {
307 state.advance(1);
308 PythonTokenType::StarAssign
309 }
310 else if let Some('*') = state.current() {
311 state.advance(1);
312 if let Some('=') = state.current() {
313 state.advance(1);
314 PythonTokenType::DoubleStarAssign
315 }
316 else {
317 PythonTokenType::DoubleStar
318 }
319 }
320 else {
321 PythonTokenType::Star
322 }
323 }
324 '/' => {
325 state.advance(1);
326 if let Some('=') = state.current() {
327 state.advance(1);
328 PythonTokenType::SlashAssign
329 }
330 else if let Some('/') = state.current() {
331 state.advance(1);
332 if let Some('=') = state.current() {
333 state.advance(1);
334 PythonTokenType::DoubleSlashAssign
335 }
336 else {
337 PythonTokenType::DoubleSlash
338 }
339 }
340 else {
341 PythonTokenType::Slash
342 }
343 }
344 '%' => {
345 state.advance(1);
346 if let Some('=') = state.current() {
347 state.advance(1);
348 PythonTokenType::PercentAssign
349 }
350 else {
351 PythonTokenType::Percent
352 }
353 }
354 '=' => {
355 state.advance(1);
356 if let Some('=') = state.current() {
357 state.advance(1);
358 PythonTokenType::Equal
359 }
360 else {
361 PythonTokenType::Assign
362 }
363 }
364 '<' => {
365 state.advance(1);
366 if let Some('=') = state.current() {
367 state.advance(1);
368 PythonTokenType::LessEqual
369 }
370 else if let Some('<') = state.current() {
371 state.advance(1);
372 if let Some('=') = state.current() {
373 state.advance(1);
374 PythonTokenType::LeftShiftAssign
375 }
376 else {
377 PythonTokenType::LeftShift
378 }
379 }
380 else {
381 PythonTokenType::Less
382 }
383 }
384 '>' => {
385 state.advance(1);
386 if let Some('=') = state.current() {
387 state.advance(1);
388 PythonTokenType::GreaterEqual
389 }
390 else if let Some('>') = state.current() {
391 state.advance(1);
392 if let Some('=') = state.current() {
393 state.advance(1);
394 PythonTokenType::RightShiftAssign
395 }
396 else {
397 PythonTokenType::RightShift
398 }
399 }
400 else {
401 PythonTokenType::Greater
402 }
403 }
404 '!' => {
405 state.advance(1);
406 if let Some('=') = state.current() {
407 state.advance(1);
408 PythonTokenType::NotEqual
409 }
410 else {
411 return false;
412 }
413 }
414 '&' => {
415 state.advance(1);
416 if let Some('=') = state.current() {
417 state.advance(1);
418 PythonTokenType::AmpersandAssign
419 }
420 else {
421 PythonTokenType::Ampersand
422 }
423 }
424 '|' => {
425 state.advance(1);
426 if let Some('=') = state.current() {
427 state.advance(1);
428 PythonTokenType::PipeAssign
429 }
430 else {
431 PythonTokenType::Pipe
432 }
433 }
434 '^' => {
435 state.advance(1);
436 if let Some('=') = state.current() {
437 state.advance(1);
438 PythonTokenType::CaretAssign
439 }
440 else {
441 PythonTokenType::Caret
442 }
443 }
444 '~' => {
445 state.advance(1);
446 PythonTokenType::Tilde
447 }
448 '@' => {
449 state.advance(1);
450 if let Some('=') = state.current() {
451 state.advance(1);
452 PythonTokenType::AtAssign
453 }
454 else {
455 PythonTokenType::At
456 }
457 }
458 _ => return false,
459 };
460
461 state.add_token(kind, start_pos, state.get_position());
462 return true;
463 }
464
465 false
466 }
467
468 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
470 let start_pos = state.get_position();
471
472 if let Some(ch) = state.current() {
473 let kind = match ch {
474 '(' => PythonTokenType::LeftParen,
475 ')' => PythonTokenType::RightParen,
476 '[' => PythonTokenType::LeftBracket,
477 ']' => PythonTokenType::RightBracket,
478 '{' => PythonTokenType::LeftBrace,
479 '}' => PythonTokenType::RightBrace,
480 ',' => PythonTokenType::Comma,
481 ':' => PythonTokenType::Colon,
482 ';' => PythonTokenType::Semicolon,
483 '.' => PythonTokenType::Dot, _ => return false,
485 };
486
487 state.advance(1);
488 state.add_token(kind, start_pos, state.get_position());
489 return true;
490 }
491
492 false
493 }
494}
495
496impl<'config> PythonLexer<'config> {
497 pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
498 let mut indent_stack = vec![0];
499 let mut bracket_level: usize = 0;
500 let mut at_line_start = true;
501
502 while state.not_at_end() {
503 let safe_point = state.get_position();
504
505 if at_line_start && bracket_level == 0 {
506 self.handle_indentation(state, &mut indent_stack);
507 at_line_start = false;
508 continue;
509 }
510
511 if let Some(ch) = state.peek() {
512 match ch {
513 ' ' | '\t' => {
514 self.skip_whitespace(state);
515 }
516 '\n' | '\r' => {
517 self.lex_newline(state, bracket_level);
518 at_line_start = true;
519 }
520 '#' => {
521 self.lex_comment(state);
522 }
523 '"' | '\'' => {
524 self.lex_string(state);
525 }
526 '0'..='9' => {
527 self.lex_number(state);
528 }
529 'f' | 'r' | 'b' | 'u' | 'F' | 'R' | 'B' | 'U' => {
530 if !self.lex_string(state) {
531 self.lex_identifier_or_keyword(state);
532 }
533 }
534 'a'..='e' | 'g'..='q' | 's' | 't' | 'v'..='z' | 'A'..='E' | 'G'..='Q' | 'S' | 'T' | 'V'..='Z' | '_' => {
535 self.lex_identifier_or_keyword(state);
536 }
537 '(' | '[' | '{' => {
538 bracket_level += 1;
539 self.lex_delimiter(state);
540 }
541 ')' | ']' | '}' => {
542 bracket_level = bracket_level.saturating_sub(1);
543 self.lex_delimiter(state);
544 }
545 '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '&' | '|' | '^' | '~' | '@' => {
546 self.lex_operator(state);
547 }
548 ',' | ':' | ';' | '.' => {
549 self.lex_delimiter(state);
550 }
551 _ => {
552 state.advance(ch.len_utf8());
554 state.add_token(PythonTokenType::Error, safe_point, state.get_position())
555 }
556 }
557 }
558
559 state.advance_if_dead_lock(safe_point)
560 }
561
562 while indent_stack.len() > 1 {
564 indent_stack.pop();
565 let pos = state.get_position();
566 state.add_token(PythonTokenType::Dedent, pos, pos)
567 }
568
569 Ok(())
570 }
571
572 fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
573 let start_pos = state.get_position();
574 let current_indent;
575
576 let mut temp_state = state.get_position();
578 loop {
579 let mut indent = 0;
580 while let Some(ch) = state.get_char_at(temp_state) {
581 if ch == ' ' {
582 indent += 1
583 }
584 else if ch == '\t' {
585 indent += 8
586 }
587 else {
589 break;
590 }
591 temp_state += 1
592 }
593
594 match state.get_char_at(temp_state) {
595 Some('\n') | Some('\r') | Some('#') => {
596 return;
598 }
599 None => return, _ => {
601 current_indent = indent;
602 break;
603 }
604 }
605 }
606
607 if current_indent > 0 {
609 let end_pos = state.get_position() + (temp_state - state.get_position());
610 state.add_token(PythonTokenType::Whitespace, start_pos, end_pos);
611 state.set_position(end_pos);
612 }
613
614 let last_indent = *stack.last().unwrap();
615 if current_indent > last_indent {
616 stack.push(current_indent);
617 state.add_token(PythonTokenType::Indent, state.get_position(), state.get_position())
618 }
619 else {
620 while current_indent < *stack.last().unwrap() {
621 stack.pop();
622 state.add_token(PythonTokenType::Dedent, state.get_position(), state.get_position())
623 }
624 }
627 }
628}