1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use self::token_type::PythonTokenType;
5use crate::language::PythonLanguage;
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::LexOutput,
9 source::{Source, TextEdit},
10};
11
/// Lexer state specialised to [`PythonLanguage`]; shorthand used by every
/// scanning method in this module.
pub(crate) type State<'a, S> = LexerState<'a, S, PythonLanguage>;
14
/// Hand-written lexer for Python source text.
///
/// Produces the flat token stream (including `Indent`/`Dedent`/`Newline`
/// tokens) consumed by the parser.
#[derive(Clone)]
pub struct PythonLexer<'config> {
    // Language configuration the lexer was constructed with. NOTE(review):
    // not consulted by any scanning routine visible in this file — presumably
    // reserved for future configuration; confirm before removing.
    config: &'config PythonLanguage,
}
21
22impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
23 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PythonLanguage>) -> LexOutput<PythonLanguage> {
24 let mut state = State::new_with_cache(source, 0, cache);
25 let result = self.run(&mut state);
26 if result.is_ok() {
27 state.add_eof();
28 }
29 state.finish_with_cache(result, cache)
30 }
31}
32
33impl<'config> PythonLexer<'config> {
34 pub fn new(config: &'config PythonLanguage) -> Self {
36 Self { config }
37 }
38
39 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
41 let start_pos = state.get_position();
42
43 while let Some(ch) = state.current() {
44 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
45 }
46
47 if state.get_position() > start_pos {
48 state.add_token(PythonTokenType::Whitespace, start_pos, state.get_position());
49 true
50 }
51 else {
52 false
53 }
54 }
55
56 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, bracket_level: usize) -> bool {
58 let start_pos = state.get_position();
59 let kind = if bracket_level > 0 { PythonTokenType::Whitespace } else { PythonTokenType::Newline };
60
61 if let Some('\n') = state.current() {
62 state.advance(1);
63 state.add_token(kind, start_pos, state.get_position());
64 true
65 }
66 else if let Some('\r') = state.current() {
67 state.advance(1);
68 if let Some('\n') = state.current() {
69 state.advance(1);
70 }
71 state.add_token(kind, start_pos, state.get_position());
72 true
73 }
74 else {
75 false
76 }
77 }
78
79 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81 if let Some('#') = state.current() {
82 let start_pos = state.get_position();
83 state.advance(1); while let Some(ch) = state.current() {
87 if ch == '\n' || ch == '\r' {
88 break;
89 }
90 state.advance(ch.len_utf8())
91 }
92
93 state.add_token(PythonTokenType::Comment, start_pos, state.get_position());
94 true
95 }
96 else {
97 false
98 }
99 }
100
101 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103 let start_pos = state.get_position();
104
105 let mut prefix = None;
107 if let Some(ch) = state.current() {
108 if "frbuFRBU".contains(ch) {
109 if let Some(next_ch) = state.peek_next_n(ch.len_utf8()) {
111 if next_ch == '"' || next_ch == '\'' {
112 prefix = Some(ch.to_ascii_lowercase());
113 state.advance(ch.len_utf8());
114 }
115 }
116 }
117 }
118
119 let quote_char = match state.current() {
121 Some('"') => '"',
122 Some('\'') => '\'',
123 _ => {
124 if prefix.is_some() {
125 return false;
127 }
128 return false;
129 }
130 };
131
132 state.advance(1); let is_triple = if let (Some(c1), Some(c2)) = (state.peek_next_n(0), state.peek_next_n(1)) { c1 == quote_char && c2 == quote_char } else { false };
136
137 if is_triple {
138 state.advance(2); }
140
141 let mut escaped = false;
142 while let Some(ch) = state.current() {
143 if escaped {
144 escaped = false;
145 state.advance(ch.len_utf8());
146 continue;
147 }
148
149 if ch == '\\' {
150 escaped = true;
151 state.advance(1);
152 continue;
153 }
154
155 if ch == quote_char {
156 if is_triple {
157 if let (Some(c1), Some(c2)) = (state.peek_next_n(1), state.peek_next_n(2)) {
158 if c1 == quote_char && c2 == quote_char {
159 state.advance(3); break;
161 }
162 }
163 state.advance(1);
164 continue;
165 }
166 else {
167 state.advance(1); break;
169 }
170 }
171 else if (ch == '\n' || ch == '\r') && !is_triple {
172 break;
174 }
175 else {
176 state.advance(ch.len_utf8());
177 }
178 }
179
180 let kind = match prefix {
181 Some('f') => PythonTokenType::FString,
182 Some('b') => PythonTokenType::Bytes,
183 _ => PythonTokenType::String,
184 };
185 state.add_token(kind, start_pos, state.get_position());
186 true
187 }
188
189 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
191 let start_pos = state.get_position();
192
193 if !state.current().map_or(false, |c| c.is_ascii_digit()) {
194 return false;
195 }
196
197 while let Some(ch) = state.current() {
199 if ch.is_ascii_digit() || ch == '.' {
200 state.advance(1);
201 }
202 else {
203 break;
204 }
205 }
206
207 state.add_token(PythonTokenType::Number, start_pos, state.get_position());
208 true
209 }
210
211 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
213 let start_pos = state.get_position();
214
215 if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
217 return false;
218 }
219
220 let mut text = String::new();
222 while let Some(ch) = state.current() {
223 if ch.is_ascii_alphanumeric() || ch == '_' {
224 text.push(ch);
225 state.advance(ch.len_utf8());
226 }
227 else {
228 break;
229 }
230 }
231
232 let kind = match text.as_str() {
234 "and" => PythonTokenType::AndKeyword,
235 "as" => PythonTokenType::AsKeyword,
236 "assert" => PythonTokenType::AssertKeyword,
237 "async" => PythonTokenType::AsyncKeyword,
238 "await" => PythonTokenType::AwaitKeyword,
239 "break" => PythonTokenType::BreakKeyword,
240 "class" => PythonTokenType::ClassKeyword,
241 "continue" => PythonTokenType::ContinueKeyword,
242 "def" => PythonTokenType::DefKeyword,
243 "del" => PythonTokenType::DelKeyword,
244 "elif" => PythonTokenType::ElifKeyword,
245 "else" => PythonTokenType::ElseKeyword,
246 "except" => PythonTokenType::ExceptKeyword,
247 "False" => PythonTokenType::FalseKeyword,
248 "finally" => PythonTokenType::FinallyKeyword,
249 "for" => PythonTokenType::ForKeyword,
250 "from" => PythonTokenType::FromKeyword,
251 "global" => PythonTokenType::GlobalKeyword,
252 "if" => PythonTokenType::IfKeyword,
253 "import" => PythonTokenType::ImportKeyword,
254 "in" => PythonTokenType::InKeyword,
255 "is" => PythonTokenType::IsKeyword,
256 "lambda" => PythonTokenType::LambdaKeyword,
257 "None" => PythonTokenType::NoneKeyword,
258 "nonlocal" => PythonTokenType::NonlocalKeyword,
259 "not" => PythonTokenType::NotKeyword,
260 "or" => PythonTokenType::OrKeyword,
261 "pass" => PythonTokenType::PassKeyword,
262 "raise" => PythonTokenType::RaiseKeyword,
263 "return" => PythonTokenType::ReturnKeyword,
264 "True" => PythonTokenType::TrueKeyword,
265 "try" => PythonTokenType::TryKeyword,
266 "while" => PythonTokenType::WhileKeyword,
267 "with" => PythonTokenType::WithKeyword,
268 "yield" => PythonTokenType::YieldKeyword,
269 _ => PythonTokenType::Identifier,
270 };
271
272 state.add_token(kind, start_pos, state.get_position());
273 true
274 }
275
276 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
278 let start_pos = state.get_position();
279
280 if let Some(ch) = state.current() {
281 let kind = match ch {
282 '+' => {
283 state.advance(1);
284 if let Some('=') = state.current() {
285 state.advance(1);
286 PythonTokenType::PlusAssign
287 }
288 else {
289 PythonTokenType::Plus
290 }
291 }
292 '-' => {
293 state.advance(1);
294 if let Some('=') = state.current() {
295 state.advance(1);
296 PythonTokenType::MinusAssign
297 }
298 else if let Some('>') = state.current() {
299 state.advance(1);
300 PythonTokenType::Arrow
301 }
302 else {
303 PythonTokenType::Minus
304 }
305 }
306 '*' => {
307 state.advance(1);
308 if let Some('=') = state.current() {
309 state.advance(1);
310 PythonTokenType::StarAssign
311 }
312 else if let Some('*') = state.current() {
313 state.advance(1);
314 if let Some('=') = state.current() {
315 state.advance(1);
316 PythonTokenType::DoubleStarAssign
317 }
318 else {
319 PythonTokenType::DoubleStar
320 }
321 }
322 else {
323 PythonTokenType::Star
324 }
325 }
326 '/' => {
327 state.advance(1);
328 if let Some('=') = state.current() {
329 state.advance(1);
330 PythonTokenType::SlashAssign
331 }
332 else if let Some('/') = state.current() {
333 state.advance(1);
334 if let Some('=') = state.current() {
335 state.advance(1);
336 PythonTokenType::DoubleSlashAssign
337 }
338 else {
339 PythonTokenType::DoubleSlash
340 }
341 }
342 else {
343 PythonTokenType::Slash
344 }
345 }
346 '%' => {
347 state.advance(1);
348 if let Some('=') = state.current() {
349 state.advance(1);
350 PythonTokenType::PercentAssign
351 }
352 else {
353 PythonTokenType::Percent
354 }
355 }
356 '=' => {
357 state.advance(1);
358 if let Some('=') = state.current() {
359 state.advance(1);
360 PythonTokenType::Equal
361 }
362 else {
363 PythonTokenType::Assign
364 }
365 }
366 '<' => {
367 state.advance(1);
368 if let Some('=') = state.current() {
369 state.advance(1);
370 PythonTokenType::LessEqual
371 }
372 else if let Some('<') = state.current() {
373 state.advance(1);
374 if let Some('=') = state.current() {
375 state.advance(1);
376 PythonTokenType::LeftShiftAssign
377 }
378 else {
379 PythonTokenType::LeftShift
380 }
381 }
382 else {
383 PythonTokenType::Less
384 }
385 }
386 '>' => {
387 state.advance(1);
388 if let Some('=') = state.current() {
389 state.advance(1);
390 PythonTokenType::GreaterEqual
391 }
392 else if let Some('>') = state.current() {
393 state.advance(1);
394 if let Some('=') = state.current() {
395 state.advance(1);
396 PythonTokenType::RightShiftAssign
397 }
398 else {
399 PythonTokenType::RightShift
400 }
401 }
402 else {
403 PythonTokenType::Greater
404 }
405 }
406 '!' => {
407 state.advance(1);
408 if let Some('=') = state.current() {
409 state.advance(1);
410 PythonTokenType::NotEqual
411 }
412 else {
413 return false;
414 }
415 }
416 '&' => {
417 state.advance(1);
418 if let Some('=') = state.current() {
419 state.advance(1);
420 PythonTokenType::AmpersandAssign
421 }
422 else {
423 PythonTokenType::Ampersand
424 }
425 }
426 '|' => {
427 state.advance(1);
428 if let Some('=') = state.current() {
429 state.advance(1);
430 PythonTokenType::PipeAssign
431 }
432 else {
433 PythonTokenType::Pipe
434 }
435 }
436 '^' => {
437 state.advance(1);
438 if let Some('=') = state.current() {
439 state.advance(1);
440 PythonTokenType::CaretAssign
441 }
442 else {
443 PythonTokenType::Caret
444 }
445 }
446 '~' => {
447 state.advance(1);
448 PythonTokenType::Tilde
449 }
450 '@' => {
451 state.advance(1);
452 if let Some('=') = state.current() {
453 state.advance(1);
454 PythonTokenType::AtAssign
455 }
456 else {
457 PythonTokenType::At
458 }
459 }
460 _ => return false,
461 };
462
463 state.add_token(kind, start_pos, state.get_position());
464 return true;
465 }
466
467 false
468 }
469
470 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
472 let start_pos = state.get_position();
473
474 if let Some(ch) = state.current() {
475 let kind = match ch {
476 '(' => PythonTokenType::LeftParen,
477 ')' => PythonTokenType::RightParen,
478 '[' => PythonTokenType::LeftBracket,
479 ']' => PythonTokenType::RightBracket,
480 '{' => PythonTokenType::LeftBrace,
481 '}' => PythonTokenType::RightBrace,
482 ',' => PythonTokenType::Comma,
483 ':' => PythonTokenType::Colon,
484 ';' => PythonTokenType::Semicolon,
485 '.' => PythonTokenType::Dot, _ => return false,
487 };
488
489 state.advance(1);
490 state.add_token(kind, start_pos, state.get_position());
491 return true;
492 }
493
494 false
495 }
496}
497
498impl<'config> PythonLexer<'config> {
499 pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
500 let mut indent_stack = vec![0];
501 let mut bracket_level: usize = 0;
502 let mut at_line_start = true;
503
504 while state.not_at_end() {
505 let safe_point = state.get_position();
506
507 if at_line_start && bracket_level == 0 {
508 self.handle_indentation(state, &mut indent_stack);
509 at_line_start = false;
510 continue;
511 }
512
513 if let Some(ch) = state.peek() {
514 match ch {
515 ' ' | '\t' => {
516 self.skip_whitespace(state);
517 }
518 '\n' | '\r' => {
519 self.lex_newline(state, bracket_level);
520 at_line_start = true;
521 }
522 '#' => {
523 self.lex_comment(state);
524 }
525 '"' | '\'' => {
526 self.lex_string(state);
527 }
528 '0'..='9' => {
529 self.lex_number(state);
530 }
531 'f' | 'r' | 'b' | 'u' | 'F' | 'R' | 'B' | 'U' => {
532 if !self.lex_string(state) {
533 self.lex_identifier_or_keyword(state);
534 }
535 }
536 'a'..='e' | 'g'..='q' | 's' | 't' | 'v'..='z' | 'A'..='E' | 'G'..='Q' | 'S' | 'T' | 'V'..='Z' | '_' => {
537 self.lex_identifier_or_keyword(state);
538 }
539 '(' | '[' | '{' => {
540 bracket_level += 1;
541 self.lex_delimiter(state);
542 }
543 ')' | ']' | '}' => {
544 bracket_level = bracket_level.saturating_sub(1);
545 self.lex_delimiter(state);
546 }
547 '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '&' | '|' | '^' | '~' | '@' => {
548 self.lex_operator(state);
549 }
550 ',' | ':' | ';' | '.' => {
551 self.lex_delimiter(state);
552 }
553 _ => {
554 state.advance(ch.len_utf8());
556 state.add_token(PythonTokenType::Error, safe_point, state.get_position())
557 }
558 }
559 }
560
561 state.advance_if_dead_lock(safe_point)
562 }
563
564 while indent_stack.len() > 1 {
566 indent_stack.pop();
567 let pos = state.get_position();
568 state.add_token(PythonTokenType::Dedent, pos, pos)
569 }
570
571 Ok(())
572 }
573
574 fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
575 let start_pos = state.get_position();
576 let current_indent;
577
578 let mut temp_state = state.get_position();
580 loop {
581 let mut indent = 0;
582 while let Some(ch) = state.get_char_at(temp_state) {
583 if ch == ' ' {
584 indent += 1
585 }
586 else if ch == '\t' {
587 indent += 8
588 }
589 else {
591 break;
592 }
593 temp_state += 1
594 }
595
596 match state.get_char_at(temp_state) {
597 Some('\n') | Some('\r') | Some('#') => {
598 return;
600 }
601 None => return, _ => {
603 current_indent = indent;
604 break;
605 }
606 }
607 }
608
609 if current_indent > 0 {
611 let end_pos = state.get_position() + (temp_state - state.get_position());
612 state.add_token(PythonTokenType::Whitespace, start_pos, end_pos);
613 state.set_position(end_pos);
614 }
615
616 let last_indent = *stack.last().unwrap();
617 if current_indent > last_indent {
618 stack.push(current_indent);
619 state.add_token(PythonTokenType::Indent, state.get_position(), state.get_position())
620 }
621 else {
622 while current_indent < *stack.last().unwrap() {
623 stack.pop();
624 state.add_token(PythonTokenType::Dedent, state.get_position(), state.get_position())
625 }
626 }
629 }
630}