1use crate::{kind::ElixirSyntaxKind, language::ElixirLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, ElixirLanguage>;
10
11static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static ELIXIR_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["#"] });
13static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
15
16#[derive(Clone)]
17pub struct ElixirLexer<'config> {
18 config: &'config ElixirLanguage,
19}
20
21impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
22 fn lex_incremental(
23 &self,
24 source: impl Source,
25 changed: usize,
26 cache: IncrementalCache<ElixirLanguage>,
27 ) -> LexOutput<ElixirLanguage> {
28 let mut state = LexerState::new_with_cache(source, changed, cache);
29 let result = self.run(&mut state);
30 state.finish(result)
31 }
32}
33
34impl<'config> ElixirLexer<'config> {
35 pub fn new(config: &'config ElixirLanguage) -> Self {
36 Self { config }
37 }
38
39 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.skip_whitespace(state) {
44 continue;
45 }
46
47 if self.skip_comment(state) {
48 continue;
49 }
50
51 if self.lex_string_literal(state) {
52 continue;
53 }
54
55 if self.lex_char_literal(state) {
56 continue;
57 }
58
59 if self.lex_sigil(state) {
60 continue;
61 }
62
63 if self.lex_number_literal(state) {
64 continue;
65 }
66
67 if self.lex_identifier_or_keyword(state) {
68 continue;
69 }
70
71 if self.lex_atom(state) {
72 continue;
73 }
74
75 if self.lex_operators(state) {
76 continue;
77 }
78
79 state.safe_check(safe_point);
80 }
81
82 let eof_pos = state.get_position();
84 state.add_token(ElixirSyntaxKind::Eof, eof_pos, eof_pos);
85 Ok(())
86 }
87
88 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
90 match ELIXIR_WHITESPACE.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Whitespace) {
91 Some(token) => {
92 state.advance_with(token);
93 return true;
94 }
95 None => {}
96 }
97 false
98 }
99
100 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
101 match ELIXIR_COMMENT.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Comment) {
102 Some(token) => {
103 state.advance_with(token);
104 return true;
105 }
106 None => {}
107 }
108 false
109 }
110
111 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
112 let _start = state.get_position();
113 match ELIXIR_STRING.scan(state.rest(), state.get_position(), ElixirSyntaxKind::String) {
114 Some(token) => {
115 state.advance_with(token);
116 return true;
117 }
118 None => {}
119 }
120 false
121 }
122
123 fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
124 let _start = state.get_position();
125 match ELIXIR_CHAR.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Character) {
126 Some(token) => {
127 state.advance_with(token);
128 return true;
129 }
130 None => {}
131 }
132 false
133 }
134
135 fn lex_sigil<S: Source>(&self, state: &mut State<S>) -> bool {
136 let start = state.get_position();
137 let rest = state.rest();
138
139 if rest.starts_with("~") {
140 state.advance(1);
141 if let Some(sigil_type) = state.peek() {
142 if sigil_type.is_alphabetic() {
143 state.advance(sigil_type.len_utf8());
144
145 if let Some(delimiter) = state.peek() {
147 let closing_delimiter = match delimiter {
148 '(' => ')',
149 '[' => ']',
150 '{' => '}',
151 '<' => '>',
152 '/' => '/',
153 '|' => '|',
154 '"' => '"',
155 '\'' => '\'',
156 _ => delimiter,
157 };
158
159 state.advance(delimiter.len_utf8());
160
161 while let Some(ch) = state.peek() {
162 if ch == closing_delimiter {
163 state.advance(ch.len_utf8());
164 break;
165 }
166 state.advance(ch.len_utf8());
167 }
168
169 while let Some(ch) = state.peek() {
171 if ch.is_alphabetic() {
172 state.advance(ch.len_utf8());
173 }
174 else {
175 break;
176 }
177 }
178
179 state.add_token(ElixirSyntaxKind::Sigil, start, state.get_position());
180 return true;
181 }
182 }
183 }
184 }
185 false
186 }
187
188 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
189 let start = state.get_position();
190 let first = match state.current() {
191 Some(c) => c,
192 None => return false,
193 };
194 if !first.is_ascii_digit() {
195 return false;
196 }
197 let mut is_float = false;
198 if first == '0' {
199 match state.peek_next_n(1) {
200 Some('x') | Some('X') => {
201 state.advance(2);
202 while let Some(c) = state.peek() {
203 if c.is_ascii_hexdigit() || c == '_' {
204 state.advance(1);
205 }
206 else {
207 break;
208 }
209 }
210 }
211 Some('b') | Some('B') => {
212 state.advance(2);
213 while let Some(c) = state.peek() {
214 if c == '0' || c == '1' || c == '_' {
215 state.advance(1);
216 }
217 else {
218 break;
219 }
220 }
221 }
222 Some('o') | Some('O') => {
223 state.advance(2);
224 while let Some(c) = state.peek() {
225 if ('0'..='7').contains(&c) || c == '_' {
226 state.advance(1);
227 }
228 else {
229 break;
230 }
231 }
232 }
233 _ => {
234 state.advance(1);
235 while let Some(c) = state.peek() {
236 if c.is_ascii_digit() || c == '_' {
237 state.advance(1);
238 }
239 else {
240 break;
241 }
242 }
243 }
244 }
245 }
246 else {
247 state.advance(1);
248 while let Some(c) = state.peek() {
249 if c.is_ascii_digit() || c == '_' {
250 state.advance(1);
251 }
252 else {
253 break;
254 }
255 }
256 }
257 if state.peek() == Some('.') {
259 let n1 = state.peek_next_n(1);
260 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
261 is_float = true;
262 state.advance(1); while let Some(c) = state.peek() {
264 if c.is_ascii_digit() || c == '_' {
265 state.advance(1);
266 }
267 else {
268 break;
269 }
270 }
271 }
272 }
273 if let Some(c) = state.peek() {
275 if c == 'e' || c == 'E' {
276 let n1 = state.peek_next_n(1);
277 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
278 is_float = true;
279 state.advance(1);
280 if let Some(sign) = state.peek() {
281 if sign == '+' || sign == '-' {
282 state.advance(1);
283 }
284 }
285 while let Some(d) = state.peek() {
286 if d.is_ascii_digit() || d == '_' {
287 state.advance(1);
288 }
289 else {
290 break;
291 }
292 }
293 }
294 }
295 }
296 while let Some(c) = state.peek() {
298 if c.is_ascii_alphabetic() {
299 state.advance(1);
300 }
301 else {
302 break;
303 }
304 }
305 let end = state.get_position();
306 state.add_token(if is_float { ElixirSyntaxKind::Float } else { ElixirSyntaxKind::Number }, start, end);
307 true
308 }
309
310 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
311 let start = state.get_position();
312
313 if let Some(ch) = state.current() {
314 if ch.is_alphabetic() || ch == '_' {
315 state.advance(ch.len_utf8());
316
317 while let Some(next_ch) = state.peek() {
318 if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!' {
319 state.advance(next_ch.len_utf8());
320 }
321 else {
322 break;
323 }
324 }
325
326 let text = state.get_text_in((start..state.get_position()).into());
327 let kind = match text {
328 "after" => ElixirSyntaxKind::After,
329 "and" => ElixirSyntaxKind::And,
330 "case" => ElixirSyntaxKind::Case,
331 "catch" => ElixirSyntaxKind::Catch,
332 "cond" => ElixirSyntaxKind::Cond,
333 "def" => ElixirSyntaxKind::Def,
334 "defp" => ElixirSyntaxKind::Defp,
335 "defmodule" => ElixirSyntaxKind::Defmodule,
336 "defstruct" => ElixirSyntaxKind::Defstruct,
337 "defprotocol" => ElixirSyntaxKind::Defprotocol,
338 "defimpl" => ElixirSyntaxKind::Defimpl,
339 "defmacro" => ElixirSyntaxKind::Defmacro,
340 "defmacrop" => ElixirSyntaxKind::Defmacrop,
341 "do" => ElixirSyntaxKind::Do,
342 "else" => ElixirSyntaxKind::Else,
343 "elsif" => ElixirSyntaxKind::Elsif,
344 "end" => ElixirSyntaxKind::End,
345 "false" => ElixirSyntaxKind::False,
346 "fn" => ElixirSyntaxKind::Fn,
347 "if" => ElixirSyntaxKind::If,
348 "in" => ElixirSyntaxKind::In,
349 "not" => ElixirSyntaxKind::Not,
350 "or" => ElixirSyntaxKind::Or,
351 "receive" => ElixirSyntaxKind::Receive,
352 "rescue" => ElixirSyntaxKind::Rescue,
353 "true" => ElixirSyntaxKind::True,
354 "try" => ElixirSyntaxKind::Try,
355 "unless" => ElixirSyntaxKind::Unless,
356 "when" => ElixirSyntaxKind::When,
357 "with" => ElixirSyntaxKind::With,
358 _ => {
359 if text.chars().next().unwrap().is_uppercase() {
360 ElixirSyntaxKind::Variable
361 }
362 else {
363 ElixirSyntaxKind::Identifier
364 }
365 }
366 };
367
368 state.add_token(kind, start, state.get_position());
369 return true;
370 }
371 }
372 false
373 }
374
375 fn lex_atom<S: Source>(&self, state: &mut State<S>) -> bool {
376 let start = state.get_position();
377
378 if state.current() == Some(':') {
379 state.advance(1);
380
381 if state.peek() == Some('"') {
383 state.advance(1);
384 while let Some(ch) = state.peek() {
385 if ch == '"' {
386 state.advance(1);
387 break;
388 }
389 if ch == '\\' {
390 state.advance(1);
391 if let Some(escaped) = state.peek() {
392 state.advance(escaped.len_utf8());
393 }
394 }
395 else {
396 state.advance(ch.len_utf8());
397 }
398 }
399 }
400 else if let Some(ch) = state.peek() {
401 if ch.is_alphabetic() || ch == '_' {
402 state.advance(ch.len_utf8());
403 while let Some(next_ch) = state.peek() {
404 if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!' {
405 state.advance(next_ch.len_utf8());
406 }
407 else {
408 break;
409 }
410 }
411 }
412 }
413
414 state.add_token(ElixirSyntaxKind::Atom, start, state.get_position());
415 return true;
416 }
417 false
418 }
419
420 fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
421 let start = state.get_position();
422 let rest = state.rest();
423
424 if rest.starts_with("===") {
426 state.advance(3);
427 state.add_token(ElixirSyntaxKind::EqualEqualEqual, start, state.get_position());
428 return true;
429 }
430 if rest.starts_with("!==") {
431 state.advance(3);
432 state.add_token(ElixirSyntaxKind::NotEqualEqual, start, state.get_position());
433 return true;
434 }
435 if rest.starts_with("==") {
436 state.advance(2);
437 state.add_token(ElixirSyntaxKind::EqualEqual, start, state.get_position());
438 return true;
439 }
440 if rest.starts_with("!=") {
441 state.advance(2);
442 state.add_token(ElixirSyntaxKind::NotEqual, start, state.get_position());
443 return true;
444 }
445 if rest.starts_with("<=") {
446 state.advance(2);
447 state.add_token(ElixirSyntaxKind::LessEqual, start, state.get_position());
448 return true;
449 }
450 if rest.starts_with(">=") {
451 state.advance(2);
452 state.add_token(ElixirSyntaxKind::GreaterEqual, start, state.get_position());
453 return true;
454 }
455 if rest.starts_with("++") {
456 state.advance(2);
457 state.add_token(ElixirSyntaxKind::PlusPlus, start, state.get_position());
458 return true;
459 }
460 if rest.starts_with("--") {
461 state.advance(2);
462 state.add_token(ElixirSyntaxKind::MinusMinus, start, state.get_position());
463 return true;
464 }
465 if rest.starts_with("**") {
466 state.advance(2);
467 state.add_token(ElixirSyntaxKind::StarStar, start, state.get_position());
468 return true;
469 }
470 if rest.starts_with("<<") {
471 state.advance(2);
472 state.add_token(ElixirSyntaxKind::LeftShift, start, state.get_position());
473 return true;
474 }
475 if rest.starts_with(">>") {
476 state.advance(2);
477 state.add_token(ElixirSyntaxKind::RightShift, start, state.get_position());
478 return true;
479 }
480 if rest.starts_with("=~") {
481 state.advance(2);
482 state.add_token(ElixirSyntaxKind::MatchOp, start, state.get_position());
483 return true;
484 }
485 if rest.starts_with("|>") {
486 state.advance(2);
487 state.add_token(ElixirSyntaxKind::PipeRight, start, state.get_position());
488 return true;
489 }
490 if rest.starts_with("||") {
491 state.advance(2);
492 state.add_token(ElixirSyntaxKind::PipePipe, start, state.get_position());
493 return true;
494 }
495 if rest.starts_with("->") {
496 state.advance(2);
497 state.add_token(ElixirSyntaxKind::Arrow, start, state.get_position());
498 return true;
499 }
500
501 if let Some(ch) = state.current() {
503 let kind = match ch {
504 '+' => ElixirSyntaxKind::Plus,
505 '-' => ElixirSyntaxKind::Minus,
506 '*' => ElixirSyntaxKind::Star,
507 '/' => ElixirSyntaxKind::Slash,
508 '=' => ElixirSyntaxKind::Equal,
509 '<' => ElixirSyntaxKind::Less,
510 '>' => ElixirSyntaxKind::Greater,
511 '!' => ElixirSyntaxKind::Exclamation,
512 '?' => ElixirSyntaxKind::Question,
513 '&' => ElixirSyntaxKind::Ampersand,
514 '@' => ElixirSyntaxKind::At,
515 '^' => ElixirSyntaxKind::Caret,
516 '~' => ElixirSyntaxKind::Tilde,
517 '|' => ElixirSyntaxKind::Pipe,
518 '#' => ElixirSyntaxKind::Hash,
519 '(' => ElixirSyntaxKind::LeftParen,
520 ')' => ElixirSyntaxKind::RightParen,
521 '{' => ElixirSyntaxKind::LeftBrace,
522 '}' => ElixirSyntaxKind::RightBrace,
523 '[' => ElixirSyntaxKind::LeftBracket,
524 ']' => ElixirSyntaxKind::RightBracket,
525 ',' => ElixirSyntaxKind::Comma,
526 ';' => ElixirSyntaxKind::Semicolon,
527 '.' => ElixirSyntaxKind::Dot,
528 ':' => ElixirSyntaxKind::Colon,
529 '\n' => ElixirSyntaxKind::Newline,
530 _ => return false,
531 };
532
533 state.advance(ch.len_utf8());
534 state.add_token(kind, start, state.get_position());
535 return true;
536 }
537
538 false
539 }
540}