// oak_objective_c/lexer/mod.rs

1use crate::{kind::ObjectiveCLanguageSyntaxKind, language::ObjectiveCLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, ObjectiveCLanguage>;
10
11static OC_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static OC_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static OC_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static OC_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
16#[derive(Clone)]
17pub struct ObjectiveCLexer<'config> {
18    config: &'config ObjectiveCLanguage,
19}
20
21impl<'config> Lexer<ObjectiveCLanguage> for ObjectiveCLexer<'config> {
22    fn lex_incremental(
23        &self,
24        source: impl Source,
25        changed: usize,
26        cache: IncrementalCache<ObjectiveCLanguage>,
27    ) -> LexOutput<ObjectiveCLanguage> {
28        let mut state = LexerState::new_with_cache(source, changed, cache);
29        let result = self.run(&mut state);
30        state.finish(result)
31    }
32}
33
34impl<'config> ObjectiveCLexer<'config> {
35    pub fn new(config: &'config ObjectiveCLanguage) -> Self {
36        Self { config }
37    }
38
39    /// 主词法分析循环
40    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_char_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_operators(state) {
69                continue;
70            }
71
72            if self.lex_single_char_tokens(state) {
73                continue;
74            }
75
76            state.safe_check(safe_point);
77        }
78
79        // 添加 EOF token
80        let eof_pos = state.get_position();
81        state.add_token(ObjectiveCLanguageSyntaxKind::Eof, eof_pos, eof_pos);
82        Ok(())
83    }
84
85    /// 跳过空白字符
86    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
87        match OC_WHITESPACE.scan(state.rest(), state.get_position(), ObjectiveCLanguageSyntaxKind::Whitespace) {
88            Some(token) => {
89                state.advance_with(token);
90                return true;
91            }
92            None => {}
93        }
94        false
95    }
96
97    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
98        let start = state.get_position();
99        let rest = state.rest();
100        // line comment: // ... until newline
101        if rest.starts_with("//") {
102            state.advance(2);
103            while let Some(ch) = state.peek() {
104                if ch == '\n' || ch == '\r' {
105                    break;
106                }
107                state.advance(ch.len_utf8());
108            }
109            state.add_token(ObjectiveCLanguageSyntaxKind::CommentToken, start, state.get_position());
110            return true;
111        }
112        // block comment: /* ... */ with nesting support
113        if rest.starts_with("/*") {
114            state.advance(2);
115            let mut depth = 1usize;
116            while let Some(ch) = state.peek() {
117                if ch == '/' && state.peek_next_n(1) == Some('*') {
118                    state.advance(2);
119                    depth += 1;
120                    continue;
121                }
122                if ch == '*' && state.peek_next_n(1) == Some('/') {
123                    state.advance(2);
124                    depth -= 1;
125                    if depth == 0 {
126                        break;
127                    }
128                    continue;
129                }
130                state.advance(ch.len_utf8());
131            }
132            state.add_token(ObjectiveCLanguageSyntaxKind::CommentToken, start, state.get_position());
133            return true;
134        }
135        false
136    }
137
138    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
139        let start = state.get_position();
140
141        // Objective-C string literal: @"..."
142        if state.current() == Some('@') && state.peek_next_n(1) == Some('"') {
143            state.advance(2); // consume @"
144            let mut escaped = false;
145            while let Some(ch) = state.peek() {
146                if ch == '"' && !escaped {
147                    state.advance(1); // consume closing quote
148                    break;
149                }
150                state.advance(ch.len_utf8());
151                if escaped {
152                    escaped = false;
153                    continue;
154                }
155                if ch == '\\' {
156                    escaped = true;
157                    continue;
158                }
159                if ch == '\n' || ch == '\r' {
160                    break;
161                }
162            }
163            state.add_token(ObjectiveCLanguageSyntaxKind::String, start, state.get_position());
164            return true;
165        }
166
167        // normal string: "..."
168        if state.current() == Some('"') {
169            state.advance(1);
170            let mut escaped = false;
171            while let Some(ch) = state.peek() {
172                if ch == '"' && !escaped {
173                    state.advance(1); // consume closing quote
174                    break;
175                }
176                state.advance(ch.len_utf8());
177                if escaped {
178                    escaped = false;
179                    continue;
180                }
181                if ch == '\\' {
182                    escaped = true;
183                    continue;
184                }
185                if ch == '\n' || ch == '\r' {
186                    break;
187                }
188            }
189            state.add_token(ObjectiveCLanguageSyntaxKind::String, start, state.get_position());
190            return true;
191        }
192
193        false
194    }
195
196    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
197        let start = state.get_position();
198        if state.current() != Some('\'') {
199            return false;
200        }
201
202        state.advance(1); // opening '
203        if let Some('\\') = state.peek() {
204            state.advance(1);
205            if let Some(c) = state.peek() {
206                state.advance(c.len_utf8());
207            }
208        }
209        else if let Some(c) = state.peek() {
210            state.advance(c.len_utf8());
211        }
212        else {
213            state.set_position(start);
214            return false;
215        }
216
217        if state.peek() == Some('\'') {
218            state.advance(1);
219            state.add_token(ObjectiveCLanguageSyntaxKind::Character, start, state.get_position());
220            return true;
221        }
222
223        state.set_position(start);
224        false
225    }
226
227    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
228        let start = state.get_position();
229        let first = match state.current() {
230            Some(c) => c,
231            None => return false,
232        };
233
234        if !first.is_ascii_digit() {
235            return false;
236        }
237
238        let mut is_float = false;
239
240        // consume digits
241        state.advance(1);
242        while let Some(c) = state.peek() {
243            if c.is_ascii_digit() {
244                state.advance(1);
245            }
246            else {
247                break;
248            }
249        }
250
251        // fractional part
252        if state.peek() == Some('.') {
253            let n1 = state.peek_next_n(1);
254            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
255                is_float = true;
256                state.advance(1); // consume '.'
257                while let Some(c) = state.peek() {
258                    if c.is_ascii_digit() {
259                        state.advance(1);
260                    }
261                    else {
262                        break;
263                    }
264                }
265            }
266        }
267
268        // exponent
269        if let Some(c) = state.peek() {
270            if c == 'e' || c == 'E' {
271                let n1 = state.peek_next_n(1);
272                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
273                    is_float = true;
274                    state.advance(1);
275                    if let Some(sign) = state.peek() {
276                        if sign == '+' || sign == '-' {
277                            state.advance(1);
278                        }
279                    }
280                    while let Some(d) = state.peek() {
281                        if d.is_ascii_digit() {
282                            state.advance(1);
283                        }
284                        else {
285                            break;
286                        }
287                    }
288                }
289            }
290        }
291
292        // suffix letters (e.g., f, l, u)
293        while let Some(c) = state.peek() {
294            if c.is_ascii_alphabetic() {
295                state.advance(1);
296            }
297            else {
298                break;
299            }
300        }
301
302        let end = state.get_position();
303        state.add_token(
304            if is_float { ObjectiveCLanguageSyntaxKind::FloatLiteral } else { ObjectiveCLanguageSyntaxKind::IntegerLiteral },
305            start,
306            end,
307        );
308        true
309    }
310
311    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
312        let start = state.get_position();
313        let ch = match state.current() {
314            Some(c) => c,
315            None => return false,
316        };
317
318        if !(ch.is_ascii_alphabetic() || ch == '_') {
319            return false;
320        }
321
322        state.advance(1);
323        while let Some(c) = state.current() {
324            if c.is_ascii_alphanumeric() || c == '_' {
325                state.advance(1);
326            }
327            else {
328                break;
329            }
330        }
331
332        let end = state.get_position();
333        let text = state.get_text_in((start..end).into());
334        let kind = match text {
335            // Objective-C keywords
336            "@interface" => ObjectiveCLanguageSyntaxKind::InterfaceKeyword,
337            "@implementation" => ObjectiveCLanguageSyntaxKind::ImplementationKeyword,
338            "@end" => ObjectiveCLanguageSyntaxKind::EndKeyword,
339            "@property" => ObjectiveCLanguageSyntaxKind::PropertyKeyword,
340            "@synthesize" => ObjectiveCLanguageSyntaxKind::SynthesizeKeyword,
341            "@dynamic" => ObjectiveCLanguageSyntaxKind::DynamicKeyword,
342            "@protocol" => ObjectiveCLanguageSyntaxKind::ProtocolKeyword,
343            "@import" => ObjectiveCLanguageSyntaxKind::ImportKeyword,
344            "#import" => ObjectiveCLanguageSyntaxKind::ImportKeyword,
345            "#include" => ObjectiveCLanguageSyntaxKind::IncludeKeyword,
346
347            // C keywords
348            "if" => ObjectiveCLanguageSyntaxKind::IfKeyword,
349            "else" => ObjectiveCLanguageSyntaxKind::ElseKeyword,
350            "for" => ObjectiveCLanguageSyntaxKind::ForKeyword,
351            "while" => ObjectiveCLanguageSyntaxKind::WhileKeyword,
352            "do" => ObjectiveCLanguageSyntaxKind::DoKeyword,
353            "switch" => ObjectiveCLanguageSyntaxKind::SwitchKeyword,
354            "case" => ObjectiveCLanguageSyntaxKind::CaseKeyword,
355            "default" => ObjectiveCLanguageSyntaxKind::DefaultKeyword,
356            "break" => ObjectiveCLanguageSyntaxKind::BreakKeyword,
357            "continue" => ObjectiveCLanguageSyntaxKind::ContinueKeyword,
358            "return" => ObjectiveCLanguageSyntaxKind::ReturnKeyword,
359            "void" => ObjectiveCLanguageSyntaxKind::VoidKeyword,
360            "int" => ObjectiveCLanguageSyntaxKind::IntKeyword,
361            "float" => ObjectiveCLanguageSyntaxKind::FloatKeyword,
362            "double" => ObjectiveCLanguageSyntaxKind::DoubleKeyword,
363            "char" => ObjectiveCLanguageSyntaxKind::CharKeyword,
364            "BOOL" => ObjectiveCLanguageSyntaxKind::BoolKeyword,
365            "id" => ObjectiveCLanguageSyntaxKind::IdKeyword,
366            "self" => ObjectiveCLanguageSyntaxKind::SelfKeyword,
367            "super" => ObjectiveCLanguageSyntaxKind::SuperKeyword,
368            "nil" => ObjectiveCLanguageSyntaxKind::NilKeyword,
369            "YES" => ObjectiveCLanguageSyntaxKind::YesKeyword,
370            "NO" => ObjectiveCLanguageSyntaxKind::NoKeyword,
371
372            _ => ObjectiveCLanguageSyntaxKind::Identifier,
373        };
374
375        state.add_token(kind, start, state.get_position());
376        true
377    }
378
379    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
380        let start = state.get_position();
381        let rest = state.rest();
382
383        // prefer longest matches first
384        let patterns: &[(&str, ObjectiveCLanguageSyntaxKind)] = &[
385            ("==", ObjectiveCLanguageSyntaxKind::EqualEqual),
386            ("!=", ObjectiveCLanguageSyntaxKind::NotEqual),
387            (">=", ObjectiveCLanguageSyntaxKind::GreaterEqual),
388            ("<=", ObjectiveCLanguageSyntaxKind::LessEqual),
389            ("&&", ObjectiveCLanguageSyntaxKind::And),
390            ("||", ObjectiveCLanguageSyntaxKind::Or),
391        ];
392
393        for (pat, kind) in patterns {
394            if rest.starts_with(pat) {
395                state.advance(pat.len());
396                state.add_token(*kind, start, state.get_position());
397                return true;
398            }
399        }
400
401        if let Some(ch) = state.current() {
402            let kind = match ch {
403                '+' => Some(ObjectiveCLanguageSyntaxKind::Plus),
404                '-' => Some(ObjectiveCLanguageSyntaxKind::Minus),
405                '*' => Some(ObjectiveCLanguageSyntaxKind::Star),
406                '/' => Some(ObjectiveCLanguageSyntaxKind::Slash),
407                '%' => Some(ObjectiveCLanguageSyntaxKind::Percent),
408                '=' => Some(ObjectiveCLanguageSyntaxKind::Equal),
409                '>' => Some(ObjectiveCLanguageSyntaxKind::Greater),
410                '<' => Some(ObjectiveCLanguageSyntaxKind::Less),
411                '!' => Some(ObjectiveCLanguageSyntaxKind::Not),
412                '?' => Some(ObjectiveCLanguageSyntaxKind::Question),
413                ':' => Some(ObjectiveCLanguageSyntaxKind::Colon),
414                '.' => Some(ObjectiveCLanguageSyntaxKind::Dot),
415                _ => None,
416            };
417
418            if let Some(k) = kind {
419                state.advance(ch.len_utf8());
420                state.add_token(k, start, state.get_position());
421                return true;
422            }
423        }
424
425        false
426    }
427
428    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
429        let start = state.get_position();
430        if let Some(ch) = state.current() {
431            let kind = match ch {
432                '(' => ObjectiveCLanguageSyntaxKind::LeftParen,
433                ')' => ObjectiveCLanguageSyntaxKind::RightParen,
434                '[' => ObjectiveCLanguageSyntaxKind::LeftBracket,
435                ']' => ObjectiveCLanguageSyntaxKind::RightBracket,
436                '{' => ObjectiveCLanguageSyntaxKind::LeftBrace,
437                '}' => ObjectiveCLanguageSyntaxKind::RightBrace,
438                ',' => ObjectiveCLanguageSyntaxKind::Comma,
439                ';' => ObjectiveCLanguageSyntaxKind::Semicolon,
440                '@' => ObjectiveCLanguageSyntaxKind::At,
441                _ => return false,
442            };
443
444            state.advance(ch.len_utf8());
445            state.add_token(kind, start, state.get_position());
446            true
447        }
448        else {
449            false
450        }
451    }
452}