// oak_python/lexer/mod.rs

1use crate::{kind::PythonSyntaxKind, language::PythonLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
3
/// Shorthand for the shared lexer state specialised to the Python language.
type State<S> = LexerState<S, PythonLanguage>;
5
/// A hand-written lexer for Python source code.
///
/// Borrows the language configuration for the lifetime of the lexer.
#[derive(Clone)]
pub struct PythonLexer<'config> {
    // Language configuration; not consulted by the current lexing routines.
    // NOTE(review): presumably reserved for keyword tables / dialect options — confirm.
    config: &'config PythonLanguage,
}
10
11impl<'config> PythonLexer<'config> {
12    pub fn new(config: &'config PythonLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.current() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(PythonSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.current() {
43            state.advance(1);
44            state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.current() {
48            state.advance(1);
49            if let Some('\n') = state.current() {
50                state.advance(1);
51            }
52            state.add_token(PythonSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理注释
61    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
62        if let Some('#') = state.current() {
63            let start_pos = state.get_position();
64            state.advance(1); // 跳过 '#'
65
66            // 读取到行尾
67            while let Some(ch) = state.current() {
68                if ch == '\n' || ch == '\r' {
69                    break;
70                }
71                state.advance(ch.len_utf8());
72            }
73
74            state.add_token(PythonSyntaxKind::Comment, start_pos, state.get_position());
75            true
76        }
77        else {
78            false
79        }
80    }
81
82    /// 处理字符串字面量
83    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
84        let start_pos = state.get_position();
85
86        // 检查是否是字符串开始
87        let quote_char = match state.current() {
88            Some('"') => '"',
89            Some('\'') => '\'',
90            _ => return false,
91        };
92
93        state.advance(1); // 跳过开始引号
94
95        // 检查是否是三引号字符串 - 简化实现,不支持三引号
96        let mut escaped = false;
97        while let Some(ch) = state.current() {
98            if escaped {
99                escaped = false;
100                state.advance(ch.len_utf8());
101                continue;
102            }
103
104            if ch == '\\' {
105                escaped = true;
106                state.advance(1);
107                continue;
108            }
109
110            if ch == quote_char {
111                state.advance(1); // 跳过结束引号
112                break;
113            }
114            else if ch == '\n' || ch == '\r' {
115                // 单行字符串不能包含换行符
116                break;
117            }
118            else {
119                state.advance(ch.len_utf8());
120            }
121        }
122
123        state.add_token(PythonSyntaxKind::String, start_pos, state.get_position());
124        true
125    }
126
127    /// 处理数字字面量
128    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
129        let start_pos = state.get_position();
130
131        if !state.current().map_or(false, |c| c.is_ascii_digit()) {
132            return false;
133        }
134
135        // 简化实现:只处理基本的十进制数字
136        while let Some(ch) = state.current() {
137            if ch.is_ascii_digit() || ch == '.' {
138                state.advance(1);
139            }
140            else {
141                break;
142            }
143        }
144
145        state.add_token(PythonSyntaxKind::Number, start_pos, state.get_position());
146        true
147    }
148
149    /// 处理标识符或关键字
150    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
151        let start_pos = state.get_position();
152
153        // 检查第一个字符
154        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
155            return false;
156        }
157
158        // 读取标识符
159        while let Some(ch) = state.current() {
160            if ch.is_ascii_alphanumeric() || ch == '_' {
161                state.advance(ch.len_utf8());
162            }
163            else {
164                break;
165            }
166        }
167
168        // 检查是否是关键字
169        let kind = PythonSyntaxKind::Identifier; // 简化处理,都标记为标识符
170
171        state.add_token(kind, start_pos, state.get_position());
172        true
173    }
174
175    /// 处理操作符
176    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
177        let start_pos = state.get_position();
178
179        // 简化实现:只处理单字符操作符
180        if let Some(ch) = state.current() {
181            let kind = match ch {
182                '+' => {
183                    state.advance(1);
184                    PythonSyntaxKind::Plus
185                }
186                '-' => {
187                    state.advance(1);
188                    PythonSyntaxKind::Minus
189                }
190                '*' => {
191                    state.advance(1);
192                    PythonSyntaxKind::Star
193                }
194                '/' => {
195                    state.advance(1);
196                    PythonSyntaxKind::Slash
197                }
198                '%' => {
199                    state.advance(1);
200                    PythonSyntaxKind::Percent
201                }
202                '=' => {
203                    state.advance(1);
204                    PythonSyntaxKind::Assign
205                }
206                '<' => {
207                    state.advance(1);
208                    PythonSyntaxKind::Less
209                }
210                '>' => {
211                    state.advance(1);
212                    PythonSyntaxKind::Greater
213                }
214                '&' => {
215                    state.advance(1);
216                    PythonSyntaxKind::Ampersand
217                }
218                '|' => {
219                    state.advance(1);
220                    PythonSyntaxKind::Pipe
221                }
222                '^' => {
223                    state.advance(1);
224                    PythonSyntaxKind::Caret
225                }
226                '~' => {
227                    state.advance(1);
228                    PythonSyntaxKind::Tilde
229                }
230                '@' => {
231                    state.advance(1);
232                    PythonSyntaxKind::At
233                }
234                _ => return false,
235            };
236
237            state.add_token(kind, start_pos, state.get_position());
238            return true;
239        }
240
241        false
242    }
243
244    /// 处理分隔符
245    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
246        let start_pos = state.get_position();
247
248        if let Some(ch) = state.current() {
249            let kind = match ch {
250                '(' => PythonSyntaxKind::LeftParen,
251                ')' => PythonSyntaxKind::RightParen,
252                '[' => PythonSyntaxKind::LeftBracket,
253                ']' => PythonSyntaxKind::RightBracket,
254                '{' => PythonSyntaxKind::LeftBrace,
255                '}' => PythonSyntaxKind::RightBrace,
256                ',' => PythonSyntaxKind::Comma,
257                ':' => PythonSyntaxKind::Colon,
258                ';' => PythonSyntaxKind::Semicolon,
259                '.' => PythonSyntaxKind::Dot, // 简化处理,不支持省略号
260                _ => return false,
261            };
262
263            state.advance(1);
264            state.add_token(kind, start_pos, state.get_position());
265            return true;
266        }
267
268        false
269    }
270
271    /// 处理缩进
272    fn lex_indent<S: Source>(&self, state: &mut State<S>) -> bool {
273        // 简化的缩进处理
274        false
275    }
276
277    /// 处理其他字符
278    fn lex_other<S: Source>(&self, state: &mut State<S>) -> bool {
279        if let Some(ch) = state.current() {
280            let start_pos = state.get_position();
281            state.advance(ch.len_utf8());
282            state.add_token(PythonSyntaxKind::Error, start_pos, state.get_position());
283            true
284        }
285        else {
286            false
287        }
288    }
289}
290
291impl<'config> Lexer<PythonLanguage> for PythonLexer<'config> {
292    fn lex(&self, source: impl Source) -> LexOutput<PythonLanguage> {
293        let mut state = LexerState::new(source);
294
295        while state.not_at_end() {
296            if self.skip_whitespace(&mut state) {
297                continue;
298            }
299
300            if self.lex_newline(&mut state) {
301                continue;
302            }
303
304            if self.lex_comment(&mut state) {
305                continue;
306            }
307
308            if self.lex_string(&mut state) {
309                continue;
310            }
311
312            if self.lex_number(&mut state) {
313                continue;
314            }
315
316            if self.lex_identifier_or_keyword(&mut state) {
317                continue;
318            }
319
320            if self.lex_operator(&mut state) {
321                continue;
322            }
323
324            if self.lex_delimiter(&mut state) {
325                continue;
326            }
327
328            if self.lex_indent(&mut state) {
329                continue;
330            }
331
332            if self.lex_other(&mut state) {
333                continue;
334            }
335
336            // 如果没有匹配任何规则,前进一个字符避免无限循环
337            if let Some(ch) = state.current() {
338                let start_pos = state.get_position();
339                state.advance(ch.len_utf8());
340                state.add_token(PythonSyntaxKind::Error, start_pos, state.get_position());
341            }
342            else {
343                break;
344            }
345        }
346
347        // 添加 EOF kind
348        let eof_pos = state.get_position();
349        state.add_token(PythonSyntaxKind::Eof, eof_pos, eof_pos);
350
351        state.finish(Ok(()))
352    }
353
354    fn lex_incremental(
355        &self,
356        source: impl Source,
357        _offset: usize,
358        _cache: IncrementalCache<'_, PythonLanguage>,
359    ) -> LexOutput<PythonLanguage> {
360        // 简化实现,直接调用完整的 lex 方法
361        self.lex(source)
362    }
363}