1use crate::{kind::ObjectiveCLanguageSyntaxKind, language::ObjectiveCLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the Objective-C language definition; every
/// scanning method in this file takes `&mut State<S>`.
type State<S> = LexerState<S, ObjectiveCLanguage>;

// Lazily-initialised scanner configurations built from `oak_core::lexer` types.
// NOTE(review): only `OC_WHITESPACE` is referenced by the code visible in this
// file; `OC_COMMENT`, `OC_STRING` and `OC_CHAR` appear unused here because
// comments, strings and character literals are scanned by hand — confirm
// before removing them.

/// Whitespace scanner configuration consumed by `skip_whitespace`.
static OC_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration with `//` as the only marker.
static OC_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
/// String configuration: double-quote delimiter, backslash escape.
static OC_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
/// Character-literal configuration: single-quote delimiter, backslash escape.
static OC_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
/// Lexer for Objective-C source text.
///
/// Holds a borrowed [`ObjectiveCLanguage`] for the lifetime `'config`.
// NOTE(review): none of the methods visible in this file read `config` —
// presumably it is kept for future language options; confirm.
#[derive(Clone)]
pub struct ObjectiveCLexer<'config> {
    /// Language configuration this lexer was created with.
    config: &'config ObjectiveCLanguage,
}
20
21impl<'config> Lexer<ObjectiveCLanguage> for ObjectiveCLexer<'config> {
22 fn lex_incremental(
23 &self,
24 source: impl Source,
25 changed: usize,
26 cache: IncrementalCache<ObjectiveCLanguage>,
27 ) -> LexOutput<ObjectiveCLanguage> {
28 let mut state = LexerState::new_with_cache(source, changed, cache);
29 let result = self.run(&mut state);
30 state.finish(result)
31 }
32}
33
34impl<'config> ObjectiveCLexer<'config> {
35 pub fn new(config: &'config ObjectiveCLanguage) -> Self {
36 Self { config }
37 }
38
39 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
41 while state.not_at_end() {
42 let safe_point = state.get_position();
43
44 if self.skip_whitespace(state) {
45 continue;
46 }
47
48 if self.skip_comment(state) {
49 continue;
50 }
51
52 if self.lex_string_literal(state) {
53 continue;
54 }
55
56 if self.lex_char_literal(state) {
57 continue;
58 }
59
60 if self.lex_number_literal(state) {
61 continue;
62 }
63
64 if self.lex_identifier_or_keyword(state) {
65 continue;
66 }
67
68 if self.lex_operators(state) {
69 continue;
70 }
71
72 if self.lex_single_char_tokens(state) {
73 continue;
74 }
75
76 state.safe_check(safe_point);
77 }
78
79 let eof_pos = state.get_position();
81 state.add_token(ObjectiveCLanguageSyntaxKind::Eof, eof_pos, eof_pos);
82 Ok(())
83 }
84
85 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
87 match OC_WHITESPACE.scan(state.rest(), state.get_position(), ObjectiveCLanguageSyntaxKind::Whitespace) {
88 Some(token) => {
89 state.advance_with(token);
90 return true;
91 }
92 None => {}
93 }
94 false
95 }
96
97 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
98 let start = state.get_position();
99 let rest = state.rest();
100 if rest.starts_with("//") {
102 state.advance(2);
103 while let Some(ch) = state.peek() {
104 if ch == '\n' || ch == '\r' {
105 break;
106 }
107 state.advance(ch.len_utf8());
108 }
109 state.add_token(ObjectiveCLanguageSyntaxKind::CommentToken, start, state.get_position());
110 return true;
111 }
112 if rest.starts_with("/*") {
114 state.advance(2);
115 let mut depth = 1usize;
116 while let Some(ch) = state.peek() {
117 if ch == '/' && state.peek_next_n(1) == Some('*') {
118 state.advance(2);
119 depth += 1;
120 continue;
121 }
122 if ch == '*' && state.peek_next_n(1) == Some('/') {
123 state.advance(2);
124 depth -= 1;
125 if depth == 0 {
126 break;
127 }
128 continue;
129 }
130 state.advance(ch.len_utf8());
131 }
132 state.add_token(ObjectiveCLanguageSyntaxKind::CommentToken, start, state.get_position());
133 return true;
134 }
135 false
136 }
137
138 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
139 let start = state.get_position();
140
141 if state.current() == Some('@') && state.peek_next_n(1) == Some('"') {
143 state.advance(2); let mut escaped = false;
145 while let Some(ch) = state.peek() {
146 if ch == '"' && !escaped {
147 state.advance(1); break;
149 }
150 state.advance(ch.len_utf8());
151 if escaped {
152 escaped = false;
153 continue;
154 }
155 if ch == '\\' {
156 escaped = true;
157 continue;
158 }
159 if ch == '\n' || ch == '\r' {
160 break;
161 }
162 }
163 state.add_token(ObjectiveCLanguageSyntaxKind::String, start, state.get_position());
164 return true;
165 }
166
167 if state.current() == Some('"') {
169 state.advance(1);
170 let mut escaped = false;
171 while let Some(ch) = state.peek() {
172 if ch == '"' && !escaped {
173 state.advance(1); break;
175 }
176 state.advance(ch.len_utf8());
177 if escaped {
178 escaped = false;
179 continue;
180 }
181 if ch == '\\' {
182 escaped = true;
183 continue;
184 }
185 if ch == '\n' || ch == '\r' {
186 break;
187 }
188 }
189 state.add_token(ObjectiveCLanguageSyntaxKind::String, start, state.get_position());
190 return true;
191 }
192
193 false
194 }
195
196 fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
197 let start = state.get_position();
198 if state.current() != Some('\'') {
199 return false;
200 }
201
202 state.advance(1); if let Some('\\') = state.peek() {
204 state.advance(1);
205 if let Some(c) = state.peek() {
206 state.advance(c.len_utf8());
207 }
208 }
209 else if let Some(c) = state.peek() {
210 state.advance(c.len_utf8());
211 }
212 else {
213 state.set_position(start);
214 return false;
215 }
216
217 if state.peek() == Some('\'') {
218 state.advance(1);
219 state.add_token(ObjectiveCLanguageSyntaxKind::Character, start, state.get_position());
220 return true;
221 }
222
223 state.set_position(start);
224 false
225 }
226
227 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
228 let start = state.get_position();
229 let first = match state.current() {
230 Some(c) => c,
231 None => return false,
232 };
233
234 if !first.is_ascii_digit() {
235 return false;
236 }
237
238 let mut is_float = false;
239
240 state.advance(1);
242 while let Some(c) = state.peek() {
243 if c.is_ascii_digit() {
244 state.advance(1);
245 }
246 else {
247 break;
248 }
249 }
250
251 if state.peek() == Some('.') {
253 let n1 = state.peek_next_n(1);
254 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
255 is_float = true;
256 state.advance(1); while let Some(c) = state.peek() {
258 if c.is_ascii_digit() {
259 state.advance(1);
260 }
261 else {
262 break;
263 }
264 }
265 }
266 }
267
268 if let Some(c) = state.peek() {
270 if c == 'e' || c == 'E' {
271 let n1 = state.peek_next_n(1);
272 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
273 is_float = true;
274 state.advance(1);
275 if let Some(sign) = state.peek() {
276 if sign == '+' || sign == '-' {
277 state.advance(1);
278 }
279 }
280 while let Some(d) = state.peek() {
281 if d.is_ascii_digit() {
282 state.advance(1);
283 }
284 else {
285 break;
286 }
287 }
288 }
289 }
290 }
291
292 while let Some(c) = state.peek() {
294 if c.is_ascii_alphabetic() {
295 state.advance(1);
296 }
297 else {
298 break;
299 }
300 }
301
302 let end = state.get_position();
303 state.add_token(
304 if is_float { ObjectiveCLanguageSyntaxKind::FloatLiteral } else { ObjectiveCLanguageSyntaxKind::IntegerLiteral },
305 start,
306 end,
307 );
308 true
309 }
310
311 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
312 let start = state.get_position();
313 let ch = match state.current() {
314 Some(c) => c,
315 None => return false,
316 };
317
318 if !(ch.is_ascii_alphabetic() || ch == '_') {
319 return false;
320 }
321
322 state.advance(1);
323 while let Some(c) = state.current() {
324 if c.is_ascii_alphanumeric() || c == '_' {
325 state.advance(1);
326 }
327 else {
328 break;
329 }
330 }
331
332 let end = state.get_position();
333 let text = state.get_text_in((start..end).into());
334 let kind = match text {
335 "@interface" => ObjectiveCLanguageSyntaxKind::InterfaceKeyword,
337 "@implementation" => ObjectiveCLanguageSyntaxKind::ImplementationKeyword,
338 "@end" => ObjectiveCLanguageSyntaxKind::EndKeyword,
339 "@property" => ObjectiveCLanguageSyntaxKind::PropertyKeyword,
340 "@synthesize" => ObjectiveCLanguageSyntaxKind::SynthesizeKeyword,
341 "@dynamic" => ObjectiveCLanguageSyntaxKind::DynamicKeyword,
342 "@protocol" => ObjectiveCLanguageSyntaxKind::ProtocolKeyword,
343 "@import" => ObjectiveCLanguageSyntaxKind::ImportKeyword,
344 "#import" => ObjectiveCLanguageSyntaxKind::ImportKeyword,
345 "#include" => ObjectiveCLanguageSyntaxKind::IncludeKeyword,
346
347 "if" => ObjectiveCLanguageSyntaxKind::IfKeyword,
349 "else" => ObjectiveCLanguageSyntaxKind::ElseKeyword,
350 "for" => ObjectiveCLanguageSyntaxKind::ForKeyword,
351 "while" => ObjectiveCLanguageSyntaxKind::WhileKeyword,
352 "do" => ObjectiveCLanguageSyntaxKind::DoKeyword,
353 "switch" => ObjectiveCLanguageSyntaxKind::SwitchKeyword,
354 "case" => ObjectiveCLanguageSyntaxKind::CaseKeyword,
355 "default" => ObjectiveCLanguageSyntaxKind::DefaultKeyword,
356 "break" => ObjectiveCLanguageSyntaxKind::BreakKeyword,
357 "continue" => ObjectiveCLanguageSyntaxKind::ContinueKeyword,
358 "return" => ObjectiveCLanguageSyntaxKind::ReturnKeyword,
359 "void" => ObjectiveCLanguageSyntaxKind::VoidKeyword,
360 "int" => ObjectiveCLanguageSyntaxKind::IntKeyword,
361 "float" => ObjectiveCLanguageSyntaxKind::FloatKeyword,
362 "double" => ObjectiveCLanguageSyntaxKind::DoubleKeyword,
363 "char" => ObjectiveCLanguageSyntaxKind::CharKeyword,
364 "BOOL" => ObjectiveCLanguageSyntaxKind::BoolKeyword,
365 "id" => ObjectiveCLanguageSyntaxKind::IdKeyword,
366 "self" => ObjectiveCLanguageSyntaxKind::SelfKeyword,
367 "super" => ObjectiveCLanguageSyntaxKind::SuperKeyword,
368 "nil" => ObjectiveCLanguageSyntaxKind::NilKeyword,
369 "YES" => ObjectiveCLanguageSyntaxKind::YesKeyword,
370 "NO" => ObjectiveCLanguageSyntaxKind::NoKeyword,
371
372 _ => ObjectiveCLanguageSyntaxKind::Identifier,
373 };
374
375 state.add_token(kind, start, state.get_position());
376 true
377 }
378
379 fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
380 let start = state.get_position();
381 let rest = state.rest();
382
383 let patterns: &[(&str, ObjectiveCLanguageSyntaxKind)] = &[
385 ("==", ObjectiveCLanguageSyntaxKind::EqualEqual),
386 ("!=", ObjectiveCLanguageSyntaxKind::NotEqual),
387 (">=", ObjectiveCLanguageSyntaxKind::GreaterEqual),
388 ("<=", ObjectiveCLanguageSyntaxKind::LessEqual),
389 ("&&", ObjectiveCLanguageSyntaxKind::And),
390 ("||", ObjectiveCLanguageSyntaxKind::Or),
391 ];
392
393 for (pat, kind) in patterns {
394 if rest.starts_with(pat) {
395 state.advance(pat.len());
396 state.add_token(*kind, start, state.get_position());
397 return true;
398 }
399 }
400
401 if let Some(ch) = state.current() {
402 let kind = match ch {
403 '+' => Some(ObjectiveCLanguageSyntaxKind::Plus),
404 '-' => Some(ObjectiveCLanguageSyntaxKind::Minus),
405 '*' => Some(ObjectiveCLanguageSyntaxKind::Star),
406 '/' => Some(ObjectiveCLanguageSyntaxKind::Slash),
407 '%' => Some(ObjectiveCLanguageSyntaxKind::Percent),
408 '=' => Some(ObjectiveCLanguageSyntaxKind::Equal),
409 '>' => Some(ObjectiveCLanguageSyntaxKind::Greater),
410 '<' => Some(ObjectiveCLanguageSyntaxKind::Less),
411 '!' => Some(ObjectiveCLanguageSyntaxKind::Not),
412 '?' => Some(ObjectiveCLanguageSyntaxKind::Question),
413 ':' => Some(ObjectiveCLanguageSyntaxKind::Colon),
414 '.' => Some(ObjectiveCLanguageSyntaxKind::Dot),
415 _ => None,
416 };
417
418 if let Some(k) = kind {
419 state.advance(ch.len_utf8());
420 state.add_token(k, start, state.get_position());
421 return true;
422 }
423 }
424
425 false
426 }
427
428 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
429 let start = state.get_position();
430 if let Some(ch) = state.current() {
431 let kind = match ch {
432 '(' => ObjectiveCLanguageSyntaxKind::LeftParen,
433 ')' => ObjectiveCLanguageSyntaxKind::RightParen,
434 '[' => ObjectiveCLanguageSyntaxKind::LeftBracket,
435 ']' => ObjectiveCLanguageSyntaxKind::RightBracket,
436 '{' => ObjectiveCLanguageSyntaxKind::LeftBrace,
437 '}' => ObjectiveCLanguageSyntaxKind::RightBrace,
438 ',' => ObjectiveCLanguageSyntaxKind::Comma,
439 ';' => ObjectiveCLanguageSyntaxKind::Semicolon,
440 '@' => ObjectiveCLanguageSyntaxKind::At,
441 _ => return false,
442 };
443
444 state.advance(ch.len_utf8());
445 state.add_token(kind, start, state.get_position());
446 true
447 }
448 else {
449 false
450 }
451 }
452}