1use crate::{kind::ErlangSyntaxKind, language::ErlangLanguage};
2use oak_core::{
3 errors::OakError,
4 lexer::{LexOutput, Lexer, LexerCache, LexerState},
5 source::{Source, TextEdit},
6};
7use std::{collections::HashSet, sync::LazyLock};
8
/// Lexer that splits Erlang source text into [`ErlangSyntaxKind`] tokens.
///
/// The lexer only holds a borrowed language configuration, so cloning it is
/// cheap.
#[derive(Clone)]
pub struct ErlangLexer<'config> {
    /// Borrowed language configuration. None of the lexing routines in this
    /// file read it; the leading underscore silences the unused-field lint.
    _config: &'config ErlangLanguage,
}
14
15impl<'config> Lexer<ErlangLanguage> for ErlangLexer<'config> {
16 fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<ErlangLanguage>) -> LexOutput<ErlangLanguage> {
17 let mut state = LexerState::new(text);
18 let result = self.run(&mut state);
19 if result.is_ok() {
20 state.add_eof();
21 }
22 state.finish_with_cache(result, cache)
23 }
24}
25
26impl<'config> ErlangLexer<'config> {
/// Creates a lexer that borrows the given language configuration.
pub fn new(config: &'config ErlangLanguage) -> Self {
    Self { _config: config }
}
30
31 pub fn run<'a, S: Source + ?Sized>(&self, state: &mut LexerState<'a, S, ErlangLanguage>) -> Result<(), OakError> {
33 while state.not_at_end() {
34 let start_pos = state.get_position();
36
37 if self.skip_whitespace_and_comments(state) {
39 continue;
40 }
41
42 if self.lex_string_literal(state) {
44 continue;
45 }
46
47 if self.lex_character_literal(state) {
48 continue;
49 }
50
51 if self.lex_number(state) {
52 continue;
53 }
54
55 if self.lex_identifier_atom_or_keyword(state) {
56 continue;
57 }
58
59 if self.lex_operator(state) {
60 continue;
61 }
62
63 if self.lex_single_char_token(state) {
64 continue;
65 }
66
67 if state.get_position() == start_pos {
69 if let Some(ch) = state.current() {
71 state.advance(ch.len_utf8());
72 let end = state.get_position();
73 state.add_token(ErlangSyntaxKind::Error, start_pos, end);
74 }
75 }
76 }
77 Ok(())
78 }
79
80 fn skip_whitespace_and_comments<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
82 let mut skipped = false;
83
84 while let Some(ch) = state.current() {
86 if WHITESPACE.contains(&ch) {
87 let start = state.get_position();
88 if ch == '\n' {
89 state.advance(1);
90 state.add_token(ErlangSyntaxKind::Newline, start, state.get_position());
91 }
92 else {
93 while let Some(ch) = state.current() {
95 if WHITESPACE.contains(&ch) && ch != '\n' {
96 state.advance(ch.len_utf8());
97 }
98 else {
99 break;
100 }
101 }
102 state.add_token(ErlangSyntaxKind::Whitespace, start, state.get_position());
103 }
104 skipped = true;
105 }
106 else if ch == '%' {
107 let start = state.get_position();
109 state.advance(1); while let Some(ch) = state.current() {
113 if ch == '\n' {
114 break;
115 }
116 state.advance(ch.len_utf8());
117 }
118
119 state.add_token(ErlangSyntaxKind::Comment, start, state.get_position());
120 skipped = true;
121 }
122 else {
123 break;
124 }
125 }
126
127 skipped
128 }
129
130 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
132 if let Some('"') = state.current() {
133 let start = state.get_position();
134 state.advance(1); while let Some(ch) = state.current() {
137 if ch == '"' {
138 state.advance(1); let end = state.get_position();
140 state.add_token(ErlangSyntaxKind::String, start, end);
141 return true;
142 }
143 else if ch == '\\' {
144 state.advance(1); if let Some(ch) = state.current() {
146 state.advance(ch.len_utf8());
147 }
148 }
149 else {
150 state.advance(ch.len_utf8());
151 }
152 }
153
154 let end = state.get_position();
156 state.add_token(ErlangSyntaxKind::String, start, end);
157 true
158 }
159 else {
160 false
161 }
162 }
163
164 fn lex_character_literal<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
166 if let Some('$') = state.current() {
167 let start = state.get_position();
168 state.advance(1); if let Some(ch) = state.current() {
171 if ch == '\\' {
172 state.advance(1);
173 if let Some(next) = state.current() {
175 if next.is_ascii_digit() {
176 let mut count = 0;
178 while let Some(ch) = state.current() {
179 if ch.is_ascii_digit() && count < 3 {
180 state.advance(1);
181 count += 1;
182 }
183 else {
184 break;
185 }
186 }
187 }
188 else {
189 state.advance(next.len_utf8());
190 }
191 }
192 }
193 else {
194 state.advance(ch.len_utf8());
195 }
196 state.add_token(ErlangSyntaxKind::Character, start, state.get_position());
197 true
198 }
199 else {
200 state.add_token(ErlangSyntaxKind::Error, start, state.get_position());
202 true
203 }
204 }
205 else {
206 false
207 }
208 }
209
210 fn lex_number<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
212 if let Some(ch) = state.current() {
213 if ch.is_ascii_digit() {
214 let start = state.get_position();
215
216 while let Some(ch) = state.current() {
218 if ch.is_ascii_digit() {
219 state.advance(1);
220 }
221 else {
222 break;
223 }
224 }
225
226 if let Some('.') = state.current() {
228 if let Some(next_ch) = state.peek() {
229 if next_ch.is_ascii_digit() {
230 state.advance(1); while let Some(ch) = state.current() {
234 if ch.is_ascii_digit() {
235 state.advance(1);
236 }
237 else {
238 break;
239 }
240 }
241 }
242 }
243 }
244
245 if let Some(ch) = state.current() {
247 if ch == 'e' || ch == 'E' {
248 state.advance(1);
249
250 if let Some(ch) = state.current() {
252 if ch == '+' || ch == '-' {
253 state.advance(1);
254 }
255 }
256
257 while let Some(ch) = state.current() {
259 if ch.is_ascii_digit() {
260 state.advance(1);
261 }
262 else {
263 break;
264 }
265 }
266 }
267 }
268
269 state.add_token(ErlangSyntaxKind::Number, start, state.get_position());
270 true
271 }
272 else {
273 false
274 }
275 }
276 else {
277 false
278 }
279 }
280
281 fn lex_identifier_atom_or_keyword<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
283 if let Some(ch) = state.current() {
284 let start = state.get_position();
285
286 if ch.is_ascii_uppercase() || ch == '_' {
288 state.advance(1);
289 while let Some(ch) = state.current() {
290 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '@' {
291 state.advance(1);
292 }
293 else {
294 break;
295 }
296 }
297 state.add_token(ErlangSyntaxKind::Variable, start, state.get_position());
298 return true;
299 }
300
301 if ch.is_ascii_lowercase() {
303 state.advance(1);
304 while let Some(ch) = state.current() {
305 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '@' {
306 state.advance(1);
307 }
308 else {
309 break;
310 }
311 }
312 let end = state.get_position();
313 let text = state.source().get_text_in(oak_core::Range { start, end });
314
315 if KEYWORDS.contains(text.as_ref()) {
317 let kind = match text.as_ref() {
318 "after" => ErlangSyntaxKind::After,
319 "and" => ErlangSyntaxKind::And,
320 "andalso" => ErlangSyntaxKind::Andalso,
321 "band" => ErlangSyntaxKind::Band,
322 "begin" => ErlangSyntaxKind::Begin,
323 "bnot" => ErlangSyntaxKind::Bnot,
324 "bor" => ErlangSyntaxKind::Bor,
325 "bsl" => ErlangSyntaxKind::Bsl,
326 "bsr" => ErlangSyntaxKind::Bsr,
327 "bxor" => ErlangSyntaxKind::Bxor,
328 "case" => ErlangSyntaxKind::Case,
329 "catch" => ErlangSyntaxKind::Catch,
330 "cond" => ErlangSyntaxKind::Cond,
331 "div" => ErlangSyntaxKind::Div,
332 "end" => ErlangSyntaxKind::End,
333 "fun" => ErlangSyntaxKind::Fun,
334 "if" => ErlangSyntaxKind::If,
335 "let" => ErlangSyntaxKind::Let,
336 "not" => ErlangSyntaxKind::Not,
337 "of" => ErlangSyntaxKind::Of,
338 "or" => ErlangSyntaxKind::Or,
339 "orelse" => ErlangSyntaxKind::Orelse,
340 "query" => ErlangSyntaxKind::Query,
341 "receive" => ErlangSyntaxKind::Receive,
342 "rem" => ErlangSyntaxKind::Rem,
343 "try" => ErlangSyntaxKind::Try,
344 "when" => ErlangSyntaxKind::When,
345 "xor" => ErlangSyntaxKind::Xor,
346 _ => ErlangSyntaxKind::Atom,
347 };
348 state.add_token(kind, start, end);
349 }
350 else {
351 state.add_token(ErlangSyntaxKind::Atom, start, end);
352 }
353 return true;
354 }
355
356 if ch == '\'' {
358 state.advance(1);
359 while let Some(ch) = state.current() {
360 if ch == '\'' {
361 state.advance(1);
362 state.add_token(ErlangSyntaxKind::Atom, start, state.get_position());
363 return true;
364 }
365 else if ch == '\\' {
366 state.advance(1);
367 if let Some(next) = state.current() {
368 state.advance(next.len_utf8());
369 }
370 }
371 else {
372 state.advance(ch.len_utf8());
373 }
374 }
375 state.add_token(ErlangSyntaxKind::Atom, start, state.get_position());
376 return true;
377 }
378 }
379 false
380 }
381
382 fn lex_operator<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
384 if let Some(ch) = state.current() {
385 let start = state.get_position();
386
387 match ch {
388 '+' => {
389 state.advance(1);
390 if let Some('+') = state.current() {
391 state.advance(1);
392 state.add_token(ErlangSyntaxKind::PlusPlus, start, state.get_position());
393 }
394 else {
395 state.add_token(ErlangSyntaxKind::Plus, start, state.get_position());
396 }
397 true
398 }
399 '-' => {
400 state.advance(1);
401 if let Some('-') = state.current() {
402 state.advance(1);
403 state.add_token(ErlangSyntaxKind::MinusMinus, start, state.get_position());
404 }
405 else if let Some('>') = state.current() {
406 state.advance(1);
407 state.add_token(ErlangSyntaxKind::Arrow, start, state.get_position());
408 }
409 else {
410 state.add_token(ErlangSyntaxKind::Minus, start, state.get_position());
411 }
412 true
413 }
414 '*' => {
415 state.advance(1);
416 state.add_token(ErlangSyntaxKind::Star, start, state.get_position());
417 true
418 }
419 '/' => {
420 state.advance(1);
421 if let Some('=') = state.current() {
422 state.advance(1);
423 state.add_token(ErlangSyntaxKind::SlashEqual, start, state.get_position());
424 }
425 else {
426 state.add_token(ErlangSyntaxKind::Slash, start, state.get_position());
427 }
428 true
429 }
430 '=' => {
431 state.advance(1);
432 match state.current() {
433 Some('=') => {
434 state.advance(1);
435 state.add_token(ErlangSyntaxKind::EqualEqual, start, state.get_position());
436 }
437 Some(':') => {
438 state.advance(1);
439 if let Some('=') = state.current() {
440 state.advance(1);
441 state.add_token(ErlangSyntaxKind::EqualColonEqual, start, state.get_position());
442 }
443 else {
444 state.set_position(start + 1);
446 state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
447 }
448 }
449 Some('/') => {
450 state.advance(1);
451 if let Some('=') = state.current() {
452 state.advance(1);
453 state.add_token(ErlangSyntaxKind::EqualSlashEqual, start, state.get_position());
454 }
455 else {
456 state.set_position(start + 1);
458 state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
459 }
460 }
461 Some('<') => {
462 state.advance(1);
463 state.add_token(ErlangSyntaxKind::LessEqual, start, state.get_position());
464 }
465 _ => {
466 state.add_token(ErlangSyntaxKind::Equal, start, state.get_position());
467 }
468 }
469 true
470 }
471 '<' => {
472 state.advance(1);
473 state.add_token(ErlangSyntaxKind::Less, start, state.get_position());
474 true
475 }
476 '>' => {
477 state.advance(1);
478 if let Some('=') = state.current() {
479 state.advance(1);
480 state.add_token(ErlangSyntaxKind::GreaterEqual, start, state.get_position());
481 }
482 else {
483 state.add_token(ErlangSyntaxKind::Greater, start, state.get_position());
484 }
485 true
486 }
487 '!' => {
488 state.advance(1);
489 state.add_token(ErlangSyntaxKind::Exclamation, start, state.get_position());
490 true
491 }
492 '?' => {
493 state.advance(1);
494 state.add_token(ErlangSyntaxKind::Question, start, state.get_position());
495 true
496 }
497 '|' => {
498 state.advance(1);
499 if let Some('|') = state.current() {
500 state.advance(1);
501 state.add_token(ErlangSyntaxKind::PipePipe, start, state.get_position());
502 }
503 else {
504 state.add_token(ErlangSyntaxKind::Pipe, start, state.get_position());
505 }
506 true
507 }
508 '#' => {
509 state.advance(1);
510 state.add_token(ErlangSyntaxKind::Hash, start, state.get_position());
511 true
512 }
513 _ => false,
514 }
515 }
516 else {
517 false
518 }
519 }
520
521 fn lex_single_char_token<S: Source + ?Sized>(&self, state: &mut LexerState<S, ErlangLanguage>) -> bool {
523 if let Some(ch) = state.current() {
524 let start = state.get_position();
525 let kind = match ch {
526 '(' => Some(ErlangSyntaxKind::LeftParen),
527 ')' => Some(ErlangSyntaxKind::RightParen),
528 '{' => Some(ErlangSyntaxKind::LeftBrace),
529 '}' => Some(ErlangSyntaxKind::RightBrace),
530 '[' => Some(ErlangSyntaxKind::LeftBracket),
531 ']' => Some(ErlangSyntaxKind::RightBracket),
532 ',' => Some(ErlangSyntaxKind::Comma),
533 ';' => Some(ErlangSyntaxKind::Semicolon),
534 '.' => Some(ErlangSyntaxKind::Dot),
535 ':' => Some(ErlangSyntaxKind::Colon),
536 _ => None,
537 };
538
539 if let Some(kind) = kind {
540 state.advance(ch.len_utf8());
541 state.add_token(kind, start, state.get_position());
542 true
543 }
544 else {
545 false
546 }
547 }
548 else {
549 false
550 }
551 }
552}
553
/// Characters treated as whitespace by the lexer: space, tab, carriage
/// return and line feed. Built lazily on first use.
static WHITESPACE: LazyLock<HashSet<char>> = LazyLock::new(|| HashSet::from([' ', '\t', '\r', '\n']));
556
/// Reserved words that are lexed as keyword tokens instead of plain atoms.
/// Built lazily on first use.
static KEYWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "after", "and", "andalso", "band", "begin", "bnot", "bor", "bsl", "bsr", "bxor", "case", "catch", "cond", "div",
        "end", "fun", "if", "let", "not", "of", "or", "orelse", "query", "receive", "rem", "try", "when", "xor",
    ])
});