1use crate::{kind::PythonSyntaxKind, language::PythonLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::LexOutput,
5 source::{Source, TextEdit},
6};
7
8type State<'a, S> = LexerState<'a, S, PythonLanguage>;
9
10#[derive(Clone)]
11pub struct PythonLexer<'config> {
12 _config: &'config PythonLanguage,
13}
14
15impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
16 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<PythonLanguage>) -> LexOutput<PythonLanguage> {
17 let mut state = State::new_with_cache(source, 0, cache);
18 let result = self.run(&mut state);
19 if result.is_ok() {
20 state.add_eof();
21 }
22 state.finish_with_cache(result, cache)
23 }
24}
25
26impl<'config> PythonLexer<'config> {
27 pub fn new(config: &'config PythonLanguage) -> Self {
28 Self { _config: config }
29 }
30
31 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
33 let start_pos = state.get_position();
34
35 while let Some(ch) = state.current() {
36 if ch == ' ' || ch == '\t' {
37 state.advance(ch.len_utf8());
38 }
39 else {
40 break;
41 }
42 }
43
44 if state.get_position() > start_pos {
45 state.add_token(PythonSyntaxKind::Whitespace, start_pos, state.get_position());
46 true
47 }
48 else {
49 false
50 }
51 }
52
53 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
55 let start_pos = state.get_position();
56
57 if let Some('\n') = state.current() {
58 state.advance(1);
59 state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
60 true
61 }
62 else if let Some('\r') = state.current() {
63 state.advance(1);
64 if let Some('\n') = state.current() {
65 state.advance(1);
66 }
67 state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
68 true
69 }
70 else {
71 false
72 }
73 }
74
75 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
77 if let Some('#') = state.current() {
78 let start_pos = state.get_position();
79 state.advance(1); while let Some(ch) = state.current() {
83 if ch == '\n' || ch == '\r' {
84 break;
85 }
86 state.advance(ch.len_utf8());
87 }
88
89 state.add_token(PythonSyntaxKind::Comment, start_pos, state.get_position());
90 true
91 }
92 else {
93 false
94 }
95 }
96
97 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
99 let start_pos = state.get_position();
100
101 let quote_char = match state.current() {
103 Some('"') => '"',
104 Some('\'') => '\'',
105 _ => return false,
106 };
107
108 state.advance(1); let mut escaped = false;
112 while let Some(ch) = state.current() {
113 if escaped {
114 escaped = false;
115 state.advance(ch.len_utf8());
116 continue;
117 }
118
119 if ch == '\\' {
120 escaped = true;
121 state.advance(1);
122 continue;
123 }
124
125 if ch == quote_char {
126 state.advance(1); break;
128 }
129 else if ch == '\n' || ch == '\r' {
130 break;
132 }
133 else {
134 state.advance(ch.len_utf8());
135 }
136 }
137
138 state.add_token(PythonSyntaxKind::String, start_pos, state.get_position());
139 true
140 }
141
142 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
144 let start_pos = state.get_position();
145
146 if !state.current().map_or(false, |c| c.is_ascii_digit()) {
147 return false;
148 }
149
150 while let Some(ch) = state.current() {
152 if ch.is_ascii_digit() || ch == '.' {
153 state.advance(1);
154 }
155 else {
156 break;
157 }
158 }
159
160 state.add_token(PythonSyntaxKind::Number, start_pos, state.get_position());
161 true
162 }
163
164 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
166 let start_pos = state.get_position();
167
168 if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
170 return false;
171 }
172
173 let mut text = String::new();
175 while let Some(ch) = state.current() {
176 if ch.is_ascii_alphanumeric() || ch == '_' {
177 text.push(ch);
178 state.advance(ch.len_utf8());
179 }
180 else {
181 break;
182 }
183 }
184
185 let kind = match text.as_str() {
187 "and" => PythonSyntaxKind::AndKeyword,
188 "as" => PythonSyntaxKind::AsKeyword,
189 "assert" => PythonSyntaxKind::AssertKeyword,
190 "async" => PythonSyntaxKind::AsyncKeyword,
191 "await" => PythonSyntaxKind::AwaitKeyword,
192 "break" => PythonSyntaxKind::BreakKeyword,
193 "class" => PythonSyntaxKind::ClassKeyword,
194 "continue" => PythonSyntaxKind::ContinueKeyword,
195 "def" => PythonSyntaxKind::DefKeyword,
196 "del" => PythonSyntaxKind::DelKeyword,
197 "elif" => PythonSyntaxKind::ElifKeyword,
198 "else" => PythonSyntaxKind::ElseKeyword,
199 "except" => PythonSyntaxKind::ExceptKeyword,
200 "False" => PythonSyntaxKind::FalseKeyword,
201 "finally" => PythonSyntaxKind::FinallyKeyword,
202 "for" => PythonSyntaxKind::ForKeyword,
203 "from" => PythonSyntaxKind::FromKeyword,
204 "global" => PythonSyntaxKind::GlobalKeyword,
205 "if" => PythonSyntaxKind::IfKeyword,
206 "import" => PythonSyntaxKind::ImportKeyword,
207 "in" => PythonSyntaxKind::InKeyword,
208 "is" => PythonSyntaxKind::IsKeyword,
209 "lambda" => PythonSyntaxKind::LambdaKeyword,
210 "None" => PythonSyntaxKind::NoneKeyword,
211 "nonlocal" => PythonSyntaxKind::NonlocalKeyword,
212 "not" => PythonSyntaxKind::NotKeyword,
213 "or" => PythonSyntaxKind::OrKeyword,
214 "pass" => PythonSyntaxKind::PassKeyword,
215 "raise" => PythonSyntaxKind::RaiseKeyword,
216 "return" => PythonSyntaxKind::ReturnKeyword,
217 "True" => PythonSyntaxKind::TrueKeyword,
218 "try" => PythonSyntaxKind::TryKeyword,
219 "while" => PythonSyntaxKind::WhileKeyword,
220 "with" => PythonSyntaxKind::WithKeyword,
221 "yield" => PythonSyntaxKind::YieldKeyword,
222 _ => PythonSyntaxKind::Identifier,
223 };
224
225 state.add_token(kind, start_pos, state.get_position());
226 true
227 }
228
229 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
231 let start_pos = state.get_position();
232
233 if let Some(ch) = state.current() {
234 let kind = match ch {
235 '+' => {
236 state.advance(1);
237 if let Some('=') = state.current() {
238 state.advance(1);
239 PythonSyntaxKind::PlusAssign
240 }
241 else {
242 PythonSyntaxKind::Plus
243 }
244 }
245 '-' => {
246 state.advance(1);
247 if let Some('=') = state.current() {
248 state.advance(1);
249 PythonSyntaxKind::MinusAssign
250 }
251 else if let Some('>') = state.current() {
252 state.advance(1);
253 PythonSyntaxKind::Arrow
254 }
255 else {
256 PythonSyntaxKind::Minus
257 }
258 }
259 '*' => {
260 state.advance(1);
261 if let Some('=') = state.current() {
262 state.advance(1);
263 PythonSyntaxKind::StarAssign
264 }
265 else if let Some('*') = state.current() {
266 state.advance(1);
267 if let Some('=') = state.current() {
268 state.advance(1);
269 PythonSyntaxKind::DoubleStarAssign
270 }
271 else {
272 PythonSyntaxKind::DoubleStar
273 }
274 }
275 else {
276 PythonSyntaxKind::Star
277 }
278 }
279 '/' => {
280 state.advance(1);
281 if let Some('=') = state.current() {
282 state.advance(1);
283 PythonSyntaxKind::SlashAssign
284 }
285 else if let Some('/') = state.current() {
286 state.advance(1);
287 if let Some('=') = state.current() {
288 state.advance(1);
289 PythonSyntaxKind::DoubleSlashAssign
290 }
291 else {
292 PythonSyntaxKind::DoubleSlash
293 }
294 }
295 else {
296 PythonSyntaxKind::Slash
297 }
298 }
299 '%' => {
300 state.advance(1);
301 if let Some('=') = state.current() {
302 state.advance(1);
303 PythonSyntaxKind::PercentAssign
304 }
305 else {
306 PythonSyntaxKind::Percent
307 }
308 }
309 '=' => {
310 state.advance(1);
311 if let Some('=') = state.current() {
312 state.advance(1);
313 PythonSyntaxKind::Eq
314 }
315 else {
316 PythonSyntaxKind::Assign
317 }
318 }
319 '<' => {
320 state.advance(1);
321 if let Some('=') = state.current() {
322 state.advance(1);
323 PythonSyntaxKind::LessEqual
324 }
325 else if let Some('<') = state.current() {
326 state.advance(1);
327 if let Some('=') = state.current() {
328 state.advance(1);
329 PythonSyntaxKind::LeftShiftAssign
330 }
331 else {
332 PythonSyntaxKind::LeftShift
333 }
334 }
335 else {
336 PythonSyntaxKind::Less
337 }
338 }
339 '>' => {
340 state.advance(1);
341 if let Some('=') = state.current() {
342 state.advance(1);
343 PythonSyntaxKind::GreaterEqual
344 }
345 else if let Some('>') = state.current() {
346 state.advance(1);
347 if let Some('=') = state.current() {
348 state.advance(1);
349 PythonSyntaxKind::RightShiftAssign
350 }
351 else {
352 PythonSyntaxKind::RightShift
353 }
354 }
355 else {
356 PythonSyntaxKind::Greater
357 }
358 }
359 '!' => {
360 state.advance(1);
361 if let Some('=') = state.current() {
362 state.advance(1);
363 PythonSyntaxKind::NotEqual
364 }
365 else {
366 return false;
367 }
368 }
369 '&' => {
370 state.advance(1);
371 if let Some('=') = state.current() {
372 state.advance(1);
373 PythonSyntaxKind::AmpersandAssign
374 }
375 else {
376 PythonSyntaxKind::Ampersand
377 }
378 }
379 '|' => {
380 state.advance(1);
381 if let Some('=') = state.current() {
382 state.advance(1);
383 PythonSyntaxKind::PipeAssign
384 }
385 else {
386 PythonSyntaxKind::Pipe
387 }
388 }
389 '^' => {
390 state.advance(1);
391 if let Some('=') = state.current() {
392 state.advance(1);
393 PythonSyntaxKind::CaretAssign
394 }
395 else {
396 PythonSyntaxKind::Caret
397 }
398 }
399 '~' => {
400 state.advance(1);
401 PythonSyntaxKind::Tilde
402 }
403 '@' => {
404 state.advance(1);
405 if let Some('=') = state.current() {
406 state.advance(1);
407 PythonSyntaxKind::AtAssign
408 }
409 else {
410 PythonSyntaxKind::At
411 }
412 }
413 _ => return false,
414 };
415
416 state.add_token(kind, start_pos, state.get_position());
417 return true;
418 }
419
420 false
421 }
422
423 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
425 let start_pos = state.get_position();
426
427 if let Some(ch) = state.current() {
428 let kind = match ch {
429 '(' => PythonSyntaxKind::LeftParen,
430 ')' => PythonSyntaxKind::RightParen,
431 '[' => PythonSyntaxKind::LeftBracket,
432 ']' => PythonSyntaxKind::RightBracket,
433 '{' => PythonSyntaxKind::LeftBrace,
434 '}' => PythonSyntaxKind::RightBrace,
435 ',' => PythonSyntaxKind::Comma,
436 ':' => PythonSyntaxKind::Colon,
437 ';' => PythonSyntaxKind::Semicolon,
438 '.' => PythonSyntaxKind::Dot, _ => return false,
440 };
441
442 state.advance(1);
443 state.add_token(kind, start_pos, state.get_position());
444 return true;
445 }
446
447 false
448 }
449}
450
451impl<'config> PythonLexer<'config> {
452 pub(crate) fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
453 let mut indent_stack = vec![0];
454 let mut bracket_level: usize = 0;
455 let mut at_line_start = true;
456
457 while state.not_at_end() {
458 let safe_point = state.get_position();
459
460 if at_line_start && bracket_level == 0 {
461 self.handle_indentation(state, &mut indent_stack);
462 at_line_start = false;
463 continue;
464 }
465
466 if let Some(ch) = state.peek() {
467 match ch {
468 ' ' | '\t' => {
469 self.skip_whitespace(state);
470 }
471 '\n' | '\r' => {
472 self.lex_newline(state);
473 at_line_start = true;
474 }
475 '#' => {
476 self.lex_comment(state);
477 }
478 '"' | '\'' => {
479 self.lex_string(state);
480 }
481 '0'..='9' => {
482 self.lex_number(state);
483 }
484 'a'..='z' | 'A'..='Z' | '_' => {
485 self.lex_identifier_or_keyword(state);
486 }
487 '(' | '[' | '{' => {
488 bracket_level += 1;
489 self.lex_delimiter(state);
490 }
491 ')' | ']' | '}' => {
492 bracket_level = bracket_level.saturating_sub(1);
493 self.lex_delimiter(state);
494 }
495 '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '&' | '|' | '^' | '~' | '@' => {
496 self.lex_operator(state);
497 }
498 ',' | ':' | ';' | '.' => {
499 self.lex_delimiter(state);
500 }
501 _ => {
502 state.advance(ch.len_utf8());
504 state.add_token(PythonSyntaxKind::Error, safe_point, state.get_position());
505 }
506 }
507 }
508
509 state.advance_if_dead_lock(safe_point);
510 }
511
512 while indent_stack.len() > 1 {
514 indent_stack.pop();
515 let pos = state.get_position();
516 state.add_token(PythonSyntaxKind::Dedent, pos, pos);
517 }
518
519 Ok(())
520 }
521
522 fn handle_indentation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, stack: &mut Vec<usize>) {
523 let start_pos = state.get_position();
524 let current_indent;
525
526 let mut temp_state = state.get_position();
528 loop {
529 let mut indent = 0;
530 while let Some(ch) = state.get_char_at(temp_state) {
531 if ch == ' ' {
532 indent += 1;
533 }
534 else if ch == '\t' {
535 indent += 8;
536 }
537 else {
539 break;
540 }
541 temp_state += 1;
542 }
543
544 match state.get_char_at(temp_state) {
545 Some('\n') | Some('\r') | Some('#') => {
546 return;
548 }
549 None => return, _ => {
551 current_indent = indent;
552 break;
553 }
554 }
555 }
556
557 if current_indent > 0 {
559 let end_pos = state.get_position() + (temp_state - state.get_position());
560 state.add_token(PythonSyntaxKind::Whitespace, start_pos, end_pos);
561 state.set_position(end_pos);
562 }
563
564 let last_indent = *stack.last().unwrap();
565 if current_indent > last_indent {
566 stack.push(current_indent);
567 state.add_token(PythonSyntaxKind::Indent, state.get_position(), state.get_position());
568 }
569 else {
570 while current_indent < *stack.last().unwrap() {
571 stack.pop();
572 state.add_token(PythonSyntaxKind::Dedent, state.get_position(), state.get_position());
573 }
574 }
577 }
578}