oak_c/lexer/
mod.rs

1use crate::{kind::CSyntaxKind, language::CLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3use serde::Serialize;
4use std::sync::LazyLock;
5
/// Shorthand for the shared [`LexerState`] specialised to [`CLanguage`] over a source `S`.
type State<S> = LexerState<S, CLanguage>;
7
8#[derive(Clone, Copy, Debug, Serialize)]
9pub struct CLexer<'config> {
10    config: &'config CLanguage,
11}
12
13impl<'config> CLexer<'config> {
14    pub fn new(config: &'config CLanguage) -> Self {
15        Self { config }
16    }
17
18    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
19        while state.not_at_end() {
20            if self.skip_whitespace(state) {
21                continue;
22            }
23            if self.skip_comment(state) {
24                continue;
25            }
26            if self.lex_newline(state) {
27                continue;
28            }
29            if self.lex_string(state) {
30                continue;
31            }
32            if self.lex_char(state) {
33                continue;
34            }
35            if self.lex_number(state) {
36                continue;
37            }
38            if self.lex_keyword_or_identifier(state) {
39                continue;
40            }
41            if self.lex_operator_or_delimiter(state) {
42                continue;
43            }
44            if self.lex_preprocessor(state) {
45                continue;
46            }
47            if self.lex_text(state) {
48                continue;
49            }
50            else {
51                // 如果没有匹配到任何模式,跳过当前字符
52                state.advance(1);
53            }
54        }
55        Ok(())
56    }
57
58    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
59        let start = state.get_position();
60        let mut count = 0;
61
62        while let Some(ch) = state.current() {
63            if ch.is_whitespace() && ch != '\n' && ch != '\r' {
64                state.advance(1);
65                count += 1;
66            }
67            else {
68                break;
69            }
70        }
71
72        if count > 0 {
73            state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
74            true
75        }
76        else {
77            false
78        }
79    }
80
81    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
82        let start = state.get_position();
83
84        if let Some('/') = state.current() {
85            if let Some('/') = state.peek() {
86                // 单行注释
87                state.advance(2);
88                while let Some(ch) = state.current() {
89                    if ch == '\n' || ch == '\r' {
90                        break;
91                    }
92                    state.advance(1);
93                }
94                state.add_token(CSyntaxKind::Comment, start, state.get_position());
95                return true;
96            }
97            else if let Some('*') = state.peek() {
98                // 多行注释
99                state.advance(2);
100                while let Some(ch) = state.current() {
101                    if ch == '*' && state.peek() == Some('/') {
102                        state.advance(2);
103                        break;
104                    }
105                    state.advance(1);
106                }
107                state.add_token(CSyntaxKind::Comment, start, state.get_position());
108                return true;
109            }
110        }
111        false
112    }
113
114    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
115        let start = state.get_position();
116
117        if let Some(ch) = state.current() {
118            if ch == '\n' {
119                state.advance(1);
120                state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
121                return true;
122            }
123            else if ch == '\r' {
124                state.advance(1);
125                if state.current() == Some('\n') {
126                    state.advance(1);
127                }
128                state.add_token(CSyntaxKind::Whitespace, start, state.get_position());
129                return true;
130            }
131        }
132        false
133    }
134
135    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
136        let start = state.get_position();
137
138        if let Some('"') = state.current() {
139            state.advance(1);
140            while let Some(ch) = state.current() {
141                if ch == '"' {
142                    state.advance(1);
143                    break;
144                }
145                else if ch == '\\' {
146                    state.advance(1);
147                    if state.current().is_some() {
148                        state.advance(1);
149                    }
150                }
151                else {
152                    state.advance(1);
153                }
154            }
155            state.add_token(CSyntaxKind::StringLiteral, start, state.get_position());
156            return true;
157        }
158        false
159    }
160
161    fn lex_char<S: Source>(&self, state: &mut State<S>) -> bool {
162        let start = state.get_position();
163
164        if let Some('\'') = state.current() {
165            state.advance(1);
166            while let Some(ch) = state.current() {
167                if ch == '\'' {
168                    state.advance(1);
169                    break;
170                }
171                else if ch == '\\' {
172                    state.advance(1);
173                    if state.current().is_some() {
174                        state.advance(1);
175                    }
176                }
177                else {
178                    state.advance(1);
179                }
180            }
181            state.add_token(CSyntaxKind::CharLiteral, start, state.get_position());
182            return true;
183        }
184        false
185    }
186
187    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
188        let start = state.get_position();
189
190        if let Some(ch) = state.current() {
191            if ch.is_ascii_digit() {
192                state.advance(1);
193                while let Some(ch) = state.current() {
194                    if ch.is_ascii_alphanumeric() || ch == '.' || ch == 'e' || ch == 'E' || ch == '+' || ch == '-' {
195                        state.advance(1);
196                    }
197                    else {
198                        break;
199                    }
200                }
201
202                let text = state.get_text_in((start..state.get_position()).into());
203                let kind = if text.contains('.') || text.contains('e') || text.contains('E') {
204                    CSyntaxKind::FloatLiteral
205                }
206                else {
207                    CSyntaxKind::IntegerLiteral
208                };
209                state.add_token(kind, start, state.get_position());
210                return true;
211            }
212        }
213        false
214    }
215
216    fn lex_keyword_or_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
217        let start = state.get_position();
218
219        if let Some(ch) = state.current() {
220            if ch.is_ascii_alphabetic() || ch == '_' {
221                state.advance(1);
222                while let Some(ch) = state.current() {
223                    if ch.is_ascii_alphanumeric() || ch == '_' {
224                        state.advance(1);
225                    }
226                    else {
227                        break;
228                    }
229                }
230
231                let text = state.get_text_in((start..state.get_position()).into());
232                let kind = if C_KEYWORDS.contains(&text) {
233                    match text {
234                        "auto" => CSyntaxKind::Auto,
235                        "register" => CSyntaxKind::Register,
236                        "static" => CSyntaxKind::Static,
237                        "extern" => CSyntaxKind::Extern,
238                        "typedef" => CSyntaxKind::Typedef,
239                        "void" => CSyntaxKind::Void,
240                        "char" => CSyntaxKind::Char,
241                        "short" => CSyntaxKind::Short,
242                        "int" => CSyntaxKind::Int,
243                        "long" => CSyntaxKind::Long,
244                        "float" => CSyntaxKind::Float,
245                        "double" => CSyntaxKind::Double,
246                        "signed" => CSyntaxKind::Signed,
247                        "unsigned" => CSyntaxKind::Unsigned,
248                        "struct" => CSyntaxKind::Struct,
249                        "union" => CSyntaxKind::Union,
250                        "enum" => CSyntaxKind::Enum,
251                        "const" => CSyntaxKind::Const,
252                        "volatile" => CSyntaxKind::Volatile,
253                        "restrict" => CSyntaxKind::Restrict,
254                        "if" => CSyntaxKind::If,
255                        "else" => CSyntaxKind::Else,
256                        "switch" => CSyntaxKind::Switch,
257                        "case" => CSyntaxKind::Case,
258                        "default" => CSyntaxKind::Default,
259                        "for" => CSyntaxKind::For,
260                        "while" => CSyntaxKind::While,
261                        "do" => CSyntaxKind::Do,
262                        "break" => CSyntaxKind::Break,
263                        "continue" => CSyntaxKind::Continue,
264                        "goto" => CSyntaxKind::Goto,
265                        "return" => CSyntaxKind::Return,
266                        "sizeof" => CSyntaxKind::Sizeof,
267                        "inline" => CSyntaxKind::Inline,
268                        "_Bool" => CSyntaxKind::Bool,
269                        "_Complex" => CSyntaxKind::Complex,
270                        "_Imaginary" => CSyntaxKind::Imaginary,
271                        "_Alignas" => CSyntaxKind::Alignas,
272                        "_Alignof" => CSyntaxKind::Alignof,
273                        "_Atomic" => CSyntaxKind::Atomic,
274                        "_Static_assert" => CSyntaxKind::StaticAssert,
275                        "_Thread_local" => CSyntaxKind::ThreadLocal,
276                        "_Generic" => CSyntaxKind::Generic,
277                        "_Noreturn" => CSyntaxKind::Noreturn,
278                        _ => CSyntaxKind::Identifier,
279                    }
280                }
281                else {
282                    CSyntaxKind::Identifier
283                };
284                state.add_token(kind, start, state.get_position());
285                return true;
286            }
287        }
288        false
289    }
290
291    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
292        let start = state.get_position();
293
294        if let Some(ch) = state.current() {
295            let three_char = if let Some(next_ch) = state.peek_next_n(1) {
296                if let Some(third_ch) = state.peek_next_n(2) { Some(format!("{}{}{}", ch, next_ch, third_ch)) } else { None }
297            }
298            else {
299                None
300            };
301
302            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
303
304            // 检查三字符操作符
305            if let Some(ref three) = three_char {
306                if let Some(&kind) = C_THREE_CHAR_OPERATORS.get(three.as_str()) {
307                    state.advance(3);
308                    state.add_token(kind, start, state.get_position());
309                    return true;
310                }
311            }
312
313            // 检查双字符操作符
314            if let Some(&kind) = C_TWO_CHAR_OPERATORS.get(two_char.as_str()) {
315                state.advance(2);
316                state.add_token(kind, start, state.get_position());
317                return true;
318            }
319
320            // 检查单字符操作符和分隔符
321            let kind = match ch {
322                '(' => CSyntaxKind::LeftParen,
323                ')' => CSyntaxKind::RightParen,
324                '[' => CSyntaxKind::LeftBracket,
325                ']' => CSyntaxKind::RightBracket,
326                '{' => CSyntaxKind::LeftBrace,
327                '}' => CSyntaxKind::RightBrace,
328                ',' => CSyntaxKind::Comma,
329                ';' => CSyntaxKind::Semicolon,
330                ':' => CSyntaxKind::Colon,
331                '.' => CSyntaxKind::Dot,
332                '?' => CSyntaxKind::Question,
333                '+' => CSyntaxKind::Plus,
334                '-' => CSyntaxKind::Minus,
335                '*' => CSyntaxKind::Star,
336                '/' => CSyntaxKind::Slash,
337                '%' => CSyntaxKind::Percent,
338                '=' => CSyntaxKind::Assign,
339                '<' => CSyntaxKind::Less,
340                '>' => CSyntaxKind::Greater,
341                '!' => CSyntaxKind::LogicalNot,
342                '&' => CSyntaxKind::BitAnd,
343                '|' => CSyntaxKind::BitOr,
344                '^' => CSyntaxKind::BitXor,
345                '~' => CSyntaxKind::BitNot,
346                _ => return false,
347            };
348            state.advance(1);
349            state.add_token(kind, start, state.get_position());
350            return true;
351        }
352        false
353    }
354
355    fn lex_preprocessor<S: Source>(&self, state: &mut State<S>) -> bool {
356        let start = state.get_position();
357
358        if let Some('#') = state.current() {
359            state.advance(1);
360            while let Some(ch) = state.current() {
361                if ch == '\n' || ch == '\r' {
362                    break;
363                }
364                state.advance(1);
365            }
366            state.add_token(CSyntaxKind::PreprocessorDirective, start, state.get_position());
367            return true;
368        }
369        false
370    }
371
372    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
373        let start = state.get_position();
374
375        if let Some(ch) = state.current() {
376            if !ch.is_whitespace() && !ch.is_ascii_alphanumeric() && !"()[]{},.;:?+-*/%=<>!&|^~#\"'_".contains(ch) {
377                state.advance(1);
378                state.add_token(CSyntaxKind::Text, start, state.get_position());
379                return true;
380            }
381        }
382        false
383    }
384}
385
386impl<'config> Lexer<CLanguage> for CLexer<'config> {
387    fn lex_incremental(
388        &self,
389        source: impl Source,
390        _changed: usize,
391        _cache: IncrementalCache<CLanguage>,
392    ) -> LexOutput<CLanguage> {
393        let mut state = LexerState::new_with_cache(source, _changed, _cache);
394        let result = self.run(&mut state);
395        state.finish(result)
396    }
397}
398
/// Reserved words recognised by the identifier lexer.
///
/// This is a plain constant slice: the data is known at compile time, so no
/// lazy initialisation is needed. `C_KEYWORDS.contains(&word)` works as it
/// did when the slice was wrapped in a `LazyLock`.
static C_KEYWORDS: &[&str] = &[
    "auto",
    "register",
    "static",
    "extern",
    "typedef",
    "void",
    "char",
    "short",
    "int",
    "long",
    "float",
    "double",
    "signed",
    "unsigned",
    "struct",
    "union",
    "enum",
    "const",
    "volatile",
    "restrict",
    "if",
    "else",
    "switch",
    "case",
    "default",
    "for",
    "while",
    "do",
    "break",
    "continue",
    "goto",
    "return",
    "sizeof",
    "inline",
    "_Bool",
    "_Complex",
    "_Imaginary",
    "_Alignas",
    "_Alignof",
    "_Atomic",
    "_Static_assert",
    "_Thread_local",
    "_Generic",
    "_Noreturn",
];
447
448static C_TWO_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CSyntaxKind>> = LazyLock::new(|| {
449    let mut map = std::collections::HashMap::new();
450    map.insert("+=", CSyntaxKind::PlusAssign);
451    map.insert("-=", CSyntaxKind::MinusAssign);
452    map.insert("*=", CSyntaxKind::StarAssign);
453    map.insert("/=", CSyntaxKind::SlashAssign);
454    map.insert("%=", CSyntaxKind::PercentAssign);
455    map.insert("==", CSyntaxKind::Equal);
456    map.insert("!=", CSyntaxKind::NotEqual);
457    map.insert("<=", CSyntaxKind::LessEqual);
458    map.insert(">=", CSyntaxKind::GreaterEqual);
459    map.insert("&&", CSyntaxKind::LogicalAnd);
460    map.insert("||", CSyntaxKind::LogicalOr);
461    map.insert("<<", CSyntaxKind::LeftShift);
462    map.insert(">>", CSyntaxKind::RightShift);
463    map.insert("&=", CSyntaxKind::AndAssign);
464    map.insert("|=", CSyntaxKind::OrAssign);
465    map.insert("^=", CSyntaxKind::XorAssign);
466    map.insert("++", CSyntaxKind::Increment);
467    map.insert("--", CSyntaxKind::Decrement);
468    map.insert("->", CSyntaxKind::Arrow);
469    map
470});
471
472static C_THREE_CHAR_OPERATORS: LazyLock<std::collections::HashMap<&str, CSyntaxKind>> = LazyLock::new(|| {
473    let mut map = std::collections::HashMap::new();
474    map.insert("<<=", CSyntaxKind::LeftShiftAssign);
475    map.insert(">>=", CSyntaxKind::RightShiftAssign);
476    map
477});