Skip to main content

oak_objective_c/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions.
3pub mod token_type;
4
5use crate::{language::ObjectiveCLanguage, lexer::token_type::ObjectiveCTokenType};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7
/// Lexer state specialised to [`ObjectiveCLanguage`]; `'a` and `S` are forwarded unchanged to `LexerState`.
pub(crate) type State<'a, S> = LexerState<'a, S, ObjectiveCLanguage>;
9
/// Objective-C lexer.
///
/// Holds a shared borrow of the language configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct ObjectiveCLexer<'config> {
    // Stored at construction time but not read by any lexing routine in this module,
    // which is why the `dead_code` lint is silenced here.
    #[allow(dead_code)]
    config: &'config ObjectiveCLanguage,
}
16
17impl<'config> Lexer<ObjectiveCLanguage> for ObjectiveCLexer<'config> {
18    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ObjectiveCLanguage>) -> LexOutput<ObjectiveCLanguage> {
19        let mut state = State::new(source);
20        let result = self.run(&mut state);
21        if result.is_ok() {
22            state.add_eof();
23        }
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> ObjectiveCLexer<'config> {
29    /// Creates a new Objective-C lexer.
30    pub fn new(config: &'config ObjectiveCLanguage) -> Self {
31        Self { config }
32    }
33
34    /// Main lexing loop.
35    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let safe_point = state.get_position();
38
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_char_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operators(state) {
64                continue;
65            }
66
67            if self.lex_single_char_tokens(state) {
68                continue;
69            }
70
71            // If no pattern matches, add an error token and advance.
72            let start_pos = state.get_position();
73            if let Some(ch) = state.peek() {
74                state.advance(ch.len_utf8());
75                state.add_token(ObjectiveCTokenType::Error, start_pos, state.get_position());
76            }
77
78            state.advance_if_dead_lock(safe_point);
79        }
80
81        Ok(())
82    }
83
84    /// Skips whitespace.
85    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
86        let start = state.get_position();
87        while let Some(ch) = state.peek() {
88            if ch.is_whitespace() {
89                state.advance(ch.len_utf8());
90            }
91            else {
92                break;
93            }
94        }
95        if state.get_position() > start {
96            state.add_token(ObjectiveCTokenType::Whitespace, start, state.get_position());
97            true
98        }
99        else {
100            false
101        }
102    }
103
104    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
105        let start = state.get_position();
106        let rest = state.rest();
107        // line comment: // ... until newline
108        if rest.starts_with("//") {
109            state.advance(2);
110            while let Some(ch) = state.peek() {
111                if ch == '\n' || ch == '\r' {
112                    break;
113                }
114                state.advance(ch.len_utf8());
115            }
116            state.add_token(ObjectiveCTokenType::CommentToken, start, state.get_position());
117            return true;
118        }
119        // block comment: /* ... */ with nesting support
120        if rest.starts_with("/*") {
121            state.advance(2);
122            let mut depth = 1usize;
123            while let Some(ch) = state.peek() {
124                if ch == '/' && state.peek_next_n(1) == Some('*') {
125                    state.advance(2);
126                    depth += 1;
127                    continue;
128                }
129                if ch == '*' && state.peek_next_n(1) == Some('/') {
130                    state.advance(2);
131                    depth -= 1;
132                    if depth == 0 {
133                        break;
134                    }
135                    continue;
136                }
137                state.advance(ch.len_utf8());
138            }
139            state.add_token(ObjectiveCTokenType::CommentToken, start, state.get_position());
140            return true;
141        }
142        false
143    }
144
145    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
146        let start = state.get_position();
147
148        // Objective-C string literal: @"..."
149        if state.peek() == Some('@') && state.peek_next_n(1) == Some('"') {
150            state.advance(2); // consume @"
151            let mut escaped = false;
152            while let Some(ch) = state.peek() {
153                if ch == '"' && !escaped {
154                    state.advance(1); // consume closing quote
155                    break;
156                }
157                state.advance(ch.len_utf8());
158                if escaped {
159                    escaped = false;
160                    continue;
161                }
162                if ch == '\\' {
163                    escaped = true;
164                    continue;
165                }
166                if ch == '\n' || ch == '\r' {
167                    break;
168                }
169            }
170            state.add_token(ObjectiveCTokenType::String, start, state.get_position());
171            return true;
172        }
173
174        // normal string: "..."
175        if state.peek() == Some('"') {
176            state.advance(1);
177            let mut escaped = false;
178            while let Some(ch) = state.peek() {
179                if ch == '"' && !escaped {
180                    state.advance(1); // consume closing quote
181                    break;
182                }
183                state.advance(ch.len_utf8());
184                if escaped {
185                    escaped = false;
186                    continue;
187                }
188                if ch == '\\' {
189                    escaped = true;
190                    continue;
191                }
192                if ch == '\n' || ch == '\r' {
193                    break;
194                }
195            }
196            state.add_token(ObjectiveCTokenType::String, start, state.get_position());
197            return true;
198        }
199
200        false
201    }
202
203    fn lex_char_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
204        let start = state.get_position();
205        if state.peek() != Some('\'') {
206            return false;
207        }
208
209        state.advance(1); // opening '
210        if let Some('\\') = state.peek() {
211            state.advance(1);
212            if let Some(c) = state.peek() {
213                state.advance(c.len_utf8());
214            }
215        }
216        else if let Some(c) = state.peek() {
217            state.advance(c.len_utf8());
218        }
219        else {
220            state.set_position(start);
221            return false;
222        }
223
224        if state.peek() == Some('\'') {
225            state.advance(1);
226            state.add_token(ObjectiveCTokenType::Character, start, state.get_position());
227            return true;
228        }
229
230        state.set_position(start);
231        false
232    }
233
234    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
235        let start = state.get_position();
236        let first = match state.peek() {
237            Some(c) => c,
238            None => return false,
239        };
240
241        if !first.is_ascii_digit() {
242            return false;
243        }
244
245        let mut is_float = false;
246
247        // consume digits
248        state.advance(1);
249        while let Some(c) = state.peek() {
250            if c.is_ascii_digit() {
251                state.advance(1);
252            }
253            else {
254                break;
255            }
256        }
257
258        // fractional part
259        if state.peek() == Some('.') {
260            let n1 = state.peek_next_n(1);
261            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
262                is_float = true;
263                state.advance(1); // consume '.'
264                while let Some(c) = state.peek() {
265                    if c.is_ascii_digit() {
266                        state.advance(1);
267                    }
268                    else {
269                        break;
270                    }
271                }
272            }
273        }
274
275        // exponent
276        if let Some(c) = state.peek() {
277            if c == 'e' || c == 'E' {
278                let n1 = state.peek_next_n(1);
279                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
280                    is_float = true;
281                    state.advance(1);
282                    if let Some(sign) = state.peek() {
283                        if sign == '+' || sign == '-' {
284                            state.advance(1);
285                        }
286                    }
287                    while let Some(d) = state.peek() {
288                        if d.is_ascii_digit() {
289                            state.advance(1);
290                        }
291                        else {
292                            break;
293                        }
294                    }
295                }
296            }
297        }
298
299        // suffix letters (e.g., f, l, u)
300        while let Some(c) = state.peek() {
301            if c.is_ascii_alphabetic() {
302                state.advance(1);
303            }
304            else {
305                break;
306            }
307        }
308
309        let end = state.get_position();
310        state.add_token(if is_float { ObjectiveCTokenType::FloatLiteral } else { ObjectiveCTokenType::IntegerLiteral }, start, end);
311        true
312    }
313
314    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
315        let start = state.get_position();
316        let ch = match state.peek() {
317            Some(c) => c,
318            None => return false,
319        };
320
321        if !(ch.is_ascii_alphabetic() || ch == '_' || ch == '@' || ch == '#') {
322            return false;
323        }
324
325        state.advance(1);
326        while let Some(c) = state.peek() {
327            if c.is_ascii_alphanumeric() || c == '_' {
328                state.advance(1);
329            }
330            else {
331                break;
332            }
333        }
334
335        let end = state.get_position();
336        let text = state.get_text_in(oak_core::Range { start, end });
337        let kind = match text.as_ref() {
338            // Objective-C keywords
339            "@interface" => ObjectiveCTokenType::InterfaceKeyword,
340            "@implementation" => ObjectiveCTokenType::ImplementationKeyword,
341            "@end" => ObjectiveCTokenType::EndKeyword,
342            "@property" => ObjectiveCTokenType::PropertyKeyword,
343            "@synthesize" => ObjectiveCTokenType::SynthesizeKeyword,
344            "@dynamic" => ObjectiveCTokenType::DynamicKeyword,
345            "@protocol" => ObjectiveCTokenType::ProtocolKeyword,
346            "@import" => ObjectiveCTokenType::ImportKeyword,
347            "#import" => ObjectiveCTokenType::ImportKeyword,
348            "#include" => ObjectiveCTokenType::IncludeKeyword,
349
350            // C keywords
351            "if" => ObjectiveCTokenType::IfKeyword,
352            "else" => ObjectiveCTokenType::ElseKeyword,
353            "for" => ObjectiveCTokenType::ForKeyword,
354            "while" => ObjectiveCTokenType::WhileKeyword,
355            "do" => ObjectiveCTokenType::DoKeyword,
356            "switch" => ObjectiveCTokenType::SwitchKeyword,
357            "case" => ObjectiveCTokenType::CaseKeyword,
358            "default" => ObjectiveCTokenType::DefaultKeyword,
359            "break" => ObjectiveCTokenType::BreakKeyword,
360            "continue" => ObjectiveCTokenType::ContinueKeyword,
361            "return" => ObjectiveCTokenType::ReturnKeyword,
362            "void" => ObjectiveCTokenType::VoidKeyword,
363            "int" => ObjectiveCTokenType::IntKeyword,
364            "float" => ObjectiveCTokenType::FloatKeyword,
365            "double" => ObjectiveCTokenType::DoubleKeyword,
366            "char" => ObjectiveCTokenType::CharKeyword,
367            "BOOL" => ObjectiveCTokenType::BoolKeyword,
368            "id" => ObjectiveCTokenType::IdKeyword,
369            "self" => ObjectiveCTokenType::SelfKeyword,
370            "super" => ObjectiveCTokenType::SuperKeyword,
371            "nil" => ObjectiveCTokenType::NilKeyword,
372            "YES" => ObjectiveCTokenType::YesKeyword,
373            "NO" => ObjectiveCTokenType::NoKeyword,
374
375            _ => ObjectiveCTokenType::Identifier,
376        };
377
378        state.add_token(kind, start, state.get_position());
379        true
380    }
381
382    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
383        let start = state.get_position();
384        let rest = state.rest();
385
386        // prefer longest matches first
387        let patterns: &[(&str, ObjectiveCTokenType)] =
388            &[("==", ObjectiveCTokenType::EqualEqual), ("!=", ObjectiveCTokenType::NotEqual), (">=", ObjectiveCTokenType::GreaterEqual), ("<=", ObjectiveCTokenType::LessEqual), ("&&", ObjectiveCTokenType::And), ("||", ObjectiveCTokenType::Or)];
389
390        for (pat, kind) in patterns {
391            if rest.starts_with(pat) {
392                state.advance(pat.len());
393                state.add_token(*kind, start, state.get_position());
394                return true;
395            }
396        }
397
398        if let Some(ch) = state.peek() {
399            let kind = match ch {
400                '+' => Some(ObjectiveCTokenType::Plus),
401                '-' => Some(ObjectiveCTokenType::Minus),
402                '*' => Some(ObjectiveCTokenType::Star),
403                '/' => Some(ObjectiveCTokenType::Slash),
404                '%' => Some(ObjectiveCTokenType::Percent),
405                '=' => Some(ObjectiveCTokenType::Equal),
406                '>' => Some(ObjectiveCTokenType::Greater),
407                '<' => Some(ObjectiveCTokenType::Less),
408                '!' => Some(ObjectiveCTokenType::Not),
409                '?' => Some(ObjectiveCTokenType::Question),
410                ':' => Some(ObjectiveCTokenType::Colon),
411                '.' => Some(ObjectiveCTokenType::Dot),
412                _ => None,
413            };
414
415            if let Some(k) = kind {
416                state.advance(ch.len_utf8());
417                state.add_token(k, start, state.get_position());
418                return true;
419            }
420        }
421
422        false
423    }
424
425    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
426        let start = state.get_position();
427        if let Some(ch) = state.peek() {
428            let kind = match ch {
429                '(' => ObjectiveCTokenType::LeftParen,
430                ')' => ObjectiveCTokenType::RightParen,
431                '[' => ObjectiveCTokenType::LeftBracket,
432                ']' => ObjectiveCTokenType::RightBracket,
433                '{' => ObjectiveCTokenType::LeftBrace,
434                '}' => ObjectiveCTokenType::RightBrace,
435                ',' => ObjectiveCTokenType::Comma,
436                ';' => ObjectiveCTokenType::Semicolon,
437                '@' => ObjectiveCTokenType::At,
438                _ => return false,
439            };
440
441            state.advance(ch.len_utf8());
442            state.add_token(kind, start, state.get_position());
443            true
444        }
445        else {
446            false
447        }
448    }
449}