1use crate::{kind::PythonSyntaxKind, language::PythonLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, PythonLanguage>;
5
6#[derive(Clone)]
7pub struct PythonLexer<'config> {
8 config: &'config PythonLanguage,
9}
10
11impl<'config> PythonLexer<'config> {
12 pub fn new(config: &'config PythonLanguage) -> Self {
13 Self { config }
14 }
15
16 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18 let start_pos = state.get_position();
19
20 while let Some(ch) = state.current() {
21 if ch == ' ' || ch == '\t' {
22 state.advance(ch.len_utf8());
23 }
24 else {
25 break;
26 }
27 }
28
29 if state.get_position() > start_pos {
30 state.add_token(PythonSyntaxKind::Whitespace, start_pos, state.get_position());
31 true
32 }
33 else {
34 false
35 }
36 }
37
38 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40 let start_pos = state.get_position();
41
42 if let Some('\n') = state.current() {
43 state.advance(1);
44 state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
45 true
46 }
47 else if let Some('\r') = state.current() {
48 state.advance(1);
49 if let Some('\n') = state.current() {
50 state.advance(1);
51 }
52 state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
53 true
54 }
55 else {
56 false
57 }
58 }
59
60 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
62 if let Some('#') = state.current() {
63 let start_pos = state.get_position();
64 state.advance(1); while let Some(ch) = state.current() {
68 if ch == '\n' || ch == '\r' {
69 break;
70 }
71 state.advance(ch.len_utf8());
72 }
73
74 state.add_token(PythonSyntaxKind::Comment, start_pos, state.get_position());
75 true
76 }
77 else {
78 false
79 }
80 }
81
82 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
84 let start_pos = state.get_position();
85
86 let quote_char = match state.current() {
88 Some('"') => '"',
89 Some('\'') => '\'',
90 _ => return false,
91 };
92
93 state.advance(1); let mut escaped = false;
97 while let Some(ch) = state.current() {
98 if escaped {
99 escaped = false;
100 state.advance(ch.len_utf8());
101 continue;
102 }
103
104 if ch == '\\' {
105 escaped = true;
106 state.advance(1);
107 continue;
108 }
109
110 if ch == quote_char {
111 state.advance(1); break;
113 }
114 else if ch == '\n' || ch == '\r' {
115 break;
117 }
118 else {
119 state.advance(ch.len_utf8());
120 }
121 }
122
123 state.add_token(PythonSyntaxKind::String, start_pos, state.get_position());
124 true
125 }
126
127 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
129 let start_pos = state.get_position();
130
131 if !state.current().map_or(false, |c| c.is_ascii_digit()) {
132 return false;
133 }
134
135 while let Some(ch) = state.current() {
137 if ch.is_ascii_digit() || ch == '.' {
138 state.advance(1);
139 }
140 else {
141 break;
142 }
143 }
144
145 state.add_token(PythonSyntaxKind::Number, start_pos, state.get_position());
146 true
147 }
148
149 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
151 let start_pos = state.get_position();
152
153 if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
155 return false;
156 }
157
158 while let Some(ch) = state.current() {
160 if ch.is_ascii_alphanumeric() || ch == '_' {
161 state.advance(ch.len_utf8());
162 }
163 else {
164 break;
165 }
166 }
167
168 let kind = PythonSyntaxKind::Identifier; state.add_token(kind, start_pos, state.get_position());
172 true
173 }
174
175 fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
177 let start_pos = state.get_position();
178
179 if let Some(ch) = state.current() {
181 let kind = match ch {
182 '+' => {
183 state.advance(1);
184 PythonSyntaxKind::Plus
185 }
186 '-' => {
187 state.advance(1);
188 PythonSyntaxKind::Minus
189 }
190 '*' => {
191 state.advance(1);
192 PythonSyntaxKind::Star
193 }
194 '/' => {
195 state.advance(1);
196 PythonSyntaxKind::Slash
197 }
198 '%' => {
199 state.advance(1);
200 PythonSyntaxKind::Percent
201 }
202 '=' => {
203 state.advance(1);
204 PythonSyntaxKind::Assign
205 }
206 '<' => {
207 state.advance(1);
208 PythonSyntaxKind::Less
209 }
210 '>' => {
211 state.advance(1);
212 PythonSyntaxKind::Greater
213 }
214 '&' => {
215 state.advance(1);
216 PythonSyntaxKind::Ampersand
217 }
218 '|' => {
219 state.advance(1);
220 PythonSyntaxKind::Pipe
221 }
222 '^' => {
223 state.advance(1);
224 PythonSyntaxKind::Caret
225 }
226 '~' => {
227 state.advance(1);
228 PythonSyntaxKind::Tilde
229 }
230 '@' => {
231 state.advance(1);
232 PythonSyntaxKind::At
233 }
234 _ => return false,
235 };
236
237 state.add_token(kind, start_pos, state.get_position());
238 return true;
239 }
240
241 false
242 }
243
244 fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
246 let start_pos = state.get_position();
247
248 if let Some(ch) = state.current() {
249 let kind = match ch {
250 '(' => PythonSyntaxKind::LeftParen,
251 ')' => PythonSyntaxKind::RightParen,
252 '[' => PythonSyntaxKind::LeftBracket,
253 ']' => PythonSyntaxKind::RightBracket,
254 '{' => PythonSyntaxKind::LeftBrace,
255 '}' => PythonSyntaxKind::RightBrace,
256 ',' => PythonSyntaxKind::Comma,
257 ':' => PythonSyntaxKind::Colon,
258 ';' => PythonSyntaxKind::Semicolon,
259 '.' => PythonSyntaxKind::Dot, _ => return false,
261 };
262
263 state.advance(1);
264 state.add_token(kind, start_pos, state.get_position());
265 return true;
266 }
267
268 false
269 }
270
271 fn lex_indent<S: Source>(&self, state: &mut State<S>) -> bool {
273 false
275 }
276
277 fn lex_other<S: Source>(&self, state: &mut State<S>) -> bool {
279 if let Some(ch) = state.current() {
280 let start_pos = state.get_position();
281 state.advance(ch.len_utf8());
282 state.add_token(PythonSyntaxKind::Error, start_pos, state.get_position());
283 true
284 }
285 else {
286 false
287 }
288 }
289}
290
291impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
292 fn lex(&self, source: impl Source) -> LexOutput<PythonLanguage> {
293 let mut state = LexerState::new(source);
294
295 while state.not_at_end() {
296 if self.skip_whitespace(&mut state) {
297 continue;
298 }
299
300 if self.lex_newline(&mut state) {
301 continue;
302 }
303
304 if self.lex_comment(&mut state) {
305 continue;
306 }
307
308 if self.lex_string(&mut state) {
309 continue;
310 }
311
312 if self.lex_number(&mut state) {
313 continue;
314 }
315
316 if self.lex_identifier_or_keyword(&mut state) {
317 continue;
318 }
319
320 if self.lex_operator(&mut state) {
321 continue;
322 }
323
324 if self.lex_delimiter(&mut state) {
325 continue;
326 }
327
328 if self.lex_indent(&mut state) {
329 continue;
330 }
331
332 if self.lex_other(&mut state) {
333 continue;
334 }
335
336 if let Some(ch) = state.current() {
338 let start_pos = state.get_position();
339 state.advance(ch.len_utf8());
340 state.add_token(PythonSyntaxKind::Error, start_pos, state.get_position());
341 }
342 else {
343 break;
344 }
345 }
346
347 let eof_pos = state.get_position();
349 state.add_token(PythonSyntaxKind::Eof, eof_pos, eof_pos);
350
351 state.finish(Ok(()))
352 }
353
354 fn lex_incremental(
355 &self,
356 source: impl Source,
357 _offset: usize,
358 _cache: IncrementalCache<'_, PythonLanguage>,
359 ) -> LexOutput<PythonLanguage> {
360 self.lex(source)
362 }
363}