//! Objective-C lexer (`oak_objective_c/lexer/mod.rs`).

1use crate::{kind::ObjectiveCSyntaxKind, language::ObjectiveCLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
3
/// Lexer state specialized to [`ObjectiveCLanguage`], generic over the source type `S`.
type State<'a, S> = LexerState<'a, S, ObjectiveCLanguage>;
5
/// Lexer that turns Objective-C source text into `ObjectiveCSyntaxKind` tokens.
///
/// The lexer borrows its language configuration for the `'config` lifetime.
#[derive(Clone)]
pub struct ObjectiveCLexer<'config> {
    // `config` is stored by `new` but is not read by any method visible in this
    // file, hence the lint suppression.
    #[allow(dead_code)]
    config: &'config ObjectiveCLanguage,
}
11
12impl<'config> Lexer<ObjectiveCLanguage> for ObjectiveCLexer<'config> {
13    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ObjectiveCLanguage>) -> LexOutput<ObjectiveCLanguage> {
14        let mut state = State::new(source);
15        let result = self.run(&mut state);
16        if result.is_ok() {
17            state.add_eof();
18        }
19        state.finish_with_cache(result, cache)
20    }
21}
22
23impl<'config> ObjectiveCLexer<'config> {
24    pub fn new(config: &'config ObjectiveCLanguage) -> Self {
25        Self { config }
26    }
27
28    /// 主词法分析循环
29    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
30        while state.not_at_end() {
31            let safe_point = state.get_position();
32
33            if self.skip_whitespace(state) {
34                continue;
35            }
36
37            if self.skip_comment(state) {
38                continue;
39            }
40
41            if self.lex_string_literal(state) {
42                continue;
43            }
44
45            if self.lex_char_literal(state) {
46                continue;
47            }
48
49            if self.lex_number_literal(state) {
50                continue;
51            }
52
53            if self.lex_identifier_or_keyword(state) {
54                continue;
55            }
56
57            if self.lex_operators(state) {
58                continue;
59            }
60
61            if self.lex_single_char_tokens(state) {
62                continue;
63            }
64
65            // 如果没有匹配任何模式,添加错误 token 并前进
66            let start_pos = state.get_position();
67            if let Some(ch) = state.peek() {
68                state.advance(ch.len_utf8());
69                state.add_token(ObjectiveCSyntaxKind::Error, start_pos, state.get_position());
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        Ok(())
76    }
77
78    /// 跳过空白字符
79    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
80        let start = state.get_position();
81        while let Some(ch) = state.peek() {
82            if ch.is_whitespace() {
83                state.advance(ch.len_utf8());
84            }
85            else {
86                break;
87            }
88        }
89        if state.get_position() > start {
90            state.add_token(ObjectiveCSyntaxKind::Whitespace, start, state.get_position());
91            true
92        }
93        else {
94            false
95        }
96    }
97
98    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
99        let start = state.get_position();
100        let rest = state.rest();
101        // line comment: // ... until newline
102        if rest.starts_with("//") {
103            state.advance(2);
104            while let Some(ch) = state.peek() {
105                if ch == '\n' || ch == '\r' {
106                    break;
107                }
108                state.advance(ch.len_utf8());
109            }
110            state.add_token(ObjectiveCSyntaxKind::CommentToken, start, state.get_position());
111            return true;
112        }
113        // block comment: /* ... */ with nesting support
114        if rest.starts_with("/*") {
115            state.advance(2);
116            let mut depth = 1usize;
117            while let Some(ch) = state.peek() {
118                if ch == '/' && state.peek_next_n(1) == Some('*') {
119                    state.advance(2);
120                    depth += 1;
121                    continue;
122                }
123                if ch == '*' && state.peek_next_n(1) == Some('/') {
124                    state.advance(2);
125                    depth -= 1;
126                    if depth == 0 {
127                        break;
128                    }
129                    continue;
130                }
131                state.advance(ch.len_utf8());
132            }
133            state.add_token(ObjectiveCSyntaxKind::CommentToken, start, state.get_position());
134            return true;
135        }
136        false
137    }
138
139    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
140        let start = state.get_position();
141
142        // Objective-C string literal: @"..."
143        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
144            state.advance(2); // consume @"
145            let mut escaped = false;
146            while let Some(ch) = state.peek() {
147                if ch == '"' && !escaped {
148                    state.advance(1); // consume closing quote
149                    break;
150                }
151                state.advance(ch.len_utf8());
152                if escaped {
153                    escaped = false;
154                    continue;
155                }
156                if ch == '\\' {
157                    escaped = true;
158                    continue;
159                }
160                if ch == '\n' || ch == '\r' {
161                    break;
162                }
163            }
164            state.add_token(ObjectiveCSyntaxKind::String, start, state.get_position());
165            return true;
166        }
167
168        // normal string: "..."
169        if state.peek() == Some('"') {
170            state.advance(1);
171            let mut escaped = false;
172            while let Some(ch) = state.peek() {
173                if ch == '"' && !escaped {
174                    state.advance(1); // consume closing quote
175                    break;
176                }
177                state.advance(ch.len_utf8());
178                if escaped {
179                    escaped = false;
180                    continue;
181                }
182                if ch == '\\' {
183                    escaped = true;
184                    continue;
185                }
186                if ch == '\n' || ch == '\r' {
187                    break;
188                }
189            }
190            state.add_token(ObjectiveCSyntaxKind::String, start, state.get_position());
191            return true;
192        }
193
194        false
195    }
196
197    fn lex_char_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
198        let start = state.get_position();
199        if state.peek() != Some('\'') {
200            return false;
201        }
202
203        state.advance(1); // opening '
204        if let Some('\\') = state.peek() {
205            state.advance(1);
206            if let Some(c) = state.peek() {
207                state.advance(c.len_utf8());
208            }
209        }
210        else if let Some(c) = state.peek() {
211            state.advance(c.len_utf8());
212        }
213        else {
214            state.set_position(start);
215            return false;
216        }
217
218        if state.peek() == Some('\'') {
219            state.advance(1);
220            state.add_token(ObjectiveCSyntaxKind::Character, start, state.get_position());
221            return true;
222        }
223
224        state.set_position(start);
225        false
226    }
227
228    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
229        let start = state.get_position();
230        let first = match state.peek() {
231            Some(c) => c,
232            None => return false,
233        };
234
235        if !first.is_ascii_digit() {
236            return false;
237        }
238
239        let mut is_float = false;
240
241        // consume digits
242        state.advance(1);
243        while let Some(c) = state.peek() {
244            if c.is_ascii_digit() {
245                state.advance(1);
246            }
247            else {
248                break;
249            }
250        }
251
252        // fractional part
253        if state.peek() == Some('.') {
254            let n1 = state.peek_next_n(1);
255            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
256                is_float = true;
257                state.advance(1); // consume '.'
258                while let Some(c) = state.peek() {
259                    if c.is_ascii_digit() {
260                        state.advance(1);
261                    }
262                    else {
263                        break;
264                    }
265                }
266            }
267        }
268
269        // exponent
270        if let Some(c) = state.peek() {
271            if c == 'e' || c == 'E' {
272                let n1 = state.peek_next_n(1);
273                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
274                    is_float = true;
275                    state.advance(1);
276                    if let Some(sign) = state.peek() {
277                        if sign == '+' || sign == '-' {
278                            state.advance(1);
279                        }
280                    }
281                    while let Some(d) = state.peek() {
282                        if d.is_ascii_digit() {
283                            state.advance(1);
284                        }
285                        else {
286                            break;
287                        }
288                    }
289                }
290            }
291        }
292
293        // suffix letters (e.g., f, l, u)
294        while let Some(c) = state.peek() {
295            if c.is_ascii_alphabetic() {
296                state.advance(1);
297            }
298            else {
299                break;
300            }
301        }
302
303        let end = state.get_position();
304        state.add_token(if is_float { ObjectiveCSyntaxKind::FloatLiteral } else { ObjectiveCSyntaxKind::IntegerLiteral }, start, end);
305        true
306    }
307
308    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
309        let start = state.get_position();
310        let ch = match state.peek() {
311            Some(c) => c,
312            None => return false,
313        };
314
315        if !(ch.is_ascii_alphabetic() || ch == '_' || ch == '@' || ch == '#') {
316            return false;
317        }
318
319        state.advance(1);
320        while let Some(c) = state.peek() {
321            if c.is_ascii_alphanumeric() || c == '_' {
322                state.advance(1);
323            }
324            else {
325                break;
326            }
327        }
328
329        let end = state.get_position();
330        let text = state.get_text_in(oak_core::Range { start, end });
331        let kind = match text.as_ref() {
332            // Objective-C keywords
333            "@interface" => ObjectiveCSyntaxKind::InterfaceKeyword,
334            "@implementation" => ObjectiveCSyntaxKind::ImplementationKeyword,
335            "@end" => ObjectiveCSyntaxKind::EndKeyword,
336            "@property" => ObjectiveCSyntaxKind::PropertyKeyword,
337            "@synthesize" => ObjectiveCSyntaxKind::SynthesizeKeyword,
338            "@dynamic" => ObjectiveCSyntaxKind::DynamicKeyword,
339            "@protocol" => ObjectiveCSyntaxKind::ProtocolKeyword,
340            "@import" => ObjectiveCSyntaxKind::ImportKeyword,
341            "#import" => ObjectiveCSyntaxKind::ImportKeyword,
342            "#include" => ObjectiveCSyntaxKind::IncludeKeyword,
343
344            // C keywords
345            "if" => ObjectiveCSyntaxKind::IfKeyword,
346            "else" => ObjectiveCSyntaxKind::ElseKeyword,
347            "for" => ObjectiveCSyntaxKind::ForKeyword,
348            "while" => ObjectiveCSyntaxKind::WhileKeyword,
349            "do" => ObjectiveCSyntaxKind::DoKeyword,
350            "switch" => ObjectiveCSyntaxKind::SwitchKeyword,
351            "case" => ObjectiveCSyntaxKind::CaseKeyword,
352            "default" => ObjectiveCSyntaxKind::DefaultKeyword,
353            "break" => ObjectiveCSyntaxKind::BreakKeyword,
354            "continue" => ObjectiveCSyntaxKind::ContinueKeyword,
355            "return" => ObjectiveCSyntaxKind::ReturnKeyword,
356            "void" => ObjectiveCSyntaxKind::VoidKeyword,
357            "int" => ObjectiveCSyntaxKind::IntKeyword,
358            "float" => ObjectiveCSyntaxKind::FloatKeyword,
359            "double" => ObjectiveCSyntaxKind::DoubleKeyword,
360            "char" => ObjectiveCSyntaxKind::CharKeyword,
361            "BOOL" => ObjectiveCSyntaxKind::BoolKeyword,
362            "id" => ObjectiveCSyntaxKind::IdKeyword,
363            "self" => ObjectiveCSyntaxKind::SelfKeyword,
364            "super" => ObjectiveCSyntaxKind::SuperKeyword,
365            "nil" => ObjectiveCSyntaxKind::NilKeyword,
366            "YES" => ObjectiveCSyntaxKind::YesKeyword,
367            "NO" => ObjectiveCSyntaxKind::NoKeyword,
368
369            _ => ObjectiveCSyntaxKind::Identifier,
370        };
371
372        state.add_token(kind, start, state.get_position());
373        true
374    }
375
376    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
377        let start = state.get_position();
378        let rest = state.rest();
379
380        // prefer longest matches first
381        let patterns: &[(&str, ObjectiveCSyntaxKind)] =
382            &[("==", ObjectiveCSyntaxKind::EqualEqual), ("!=", ObjectiveCSyntaxKind::NotEqual), (">=", ObjectiveCSyntaxKind::GreaterEqual), ("<=", ObjectiveCSyntaxKind::LessEqual), ("&&", ObjectiveCSyntaxKind::And), ("||", ObjectiveCSyntaxKind::Or)];
383
384        for (pat, kind) in patterns {
385            if rest.starts_with(pat) {
386                state.advance(pat.len());
387                state.add_token(*kind, start, state.get_position());
388                return true;
389            }
390        }
391
392        if let Some(ch) = state.peek() {
393            let kind = match ch {
394                '+' => Some(ObjectiveCSyntaxKind::Plus),
395                '-' => Some(ObjectiveCSyntaxKind::Minus),
396                '*' => Some(ObjectiveCSyntaxKind::Star),
397                '/' => Some(ObjectiveCSyntaxKind::Slash),
398                '%' => Some(ObjectiveCSyntaxKind::Percent),
399                '=' => Some(ObjectiveCSyntaxKind::Equal),
400                '>' => Some(ObjectiveCSyntaxKind::Greater),
401                '<' => Some(ObjectiveCSyntaxKind::Less),
402                '!' => Some(ObjectiveCSyntaxKind::Not),
403                '?' => Some(ObjectiveCSyntaxKind::Question),
404                ':' => Some(ObjectiveCSyntaxKind::Colon),
405                '.' => Some(ObjectiveCSyntaxKind::Dot),
406                _ => None,
407            };
408
409            if let Some(k) = kind {
410                state.advance(ch.len_utf8());
411                state.add_token(k, start, state.get_position());
412                return true;
413            }
414        }
415
416        false
417    }
418
419    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
420        let start = state.get_position();
421        if let Some(ch) = state.peek() {
422            let kind = match ch {
423                '(' => ObjectiveCSyntaxKind::LeftParen,
424                ')' => ObjectiveCSyntaxKind::RightParen,
425                '[' => ObjectiveCSyntaxKind::LeftBracket,
426                ']' => ObjectiveCSyntaxKind::RightBracket,
427                '{' => ObjectiveCSyntaxKind::LeftBrace,
428                '}' => ObjectiveCSyntaxKind::RightBrace,
429                ',' => ObjectiveCSyntaxKind::Comma,
430                ';' => ObjectiveCSyntaxKind::Semicolon,
431                '@' => ObjectiveCSyntaxKind::At,
432                _ => return false,
433            };
434
435            state.advance(ch.len_utf8());
436            state.add_token(kind, start, state.get_position());
437            true
438        }
439        else {
440            false
441        }
442    }
443}