oak_elixir/lexer/
mod.rs

1use crate::{kind::ElixirSyntaxKind, language::ElixirLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, ElixirLanguage>;
10
11static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static ELIXIR_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["#"] });
13static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
15
16#[derive(Clone)]
17pub struct ElixirLexer<'config> {
18    config: &'config ElixirLanguage,
19}
20
21impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
22    fn lex_incremental(
23        &self,
24        source: impl Source,
25        changed: usize,
26        cache: IncrementalCache<ElixirLanguage>,
27    ) -> LexOutput<ElixirLanguage> {
28        let mut state = LexerState::new_with_cache(source, changed, cache);
29        let result = self.run(&mut state);
30        state.finish(result)
31    }
32}
33
34impl<'config> ElixirLexer<'config> {
35    pub fn new(config: &'config ElixirLanguage) -> Self {
36        Self { config }
37    }
38
39    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_char_literal(state) {
56                continue;
57            }
58
59            if self.lex_sigil(state) {
60                continue;
61            }
62
63            if self.lex_number_literal(state) {
64                continue;
65            }
66
67            if self.lex_identifier_or_keyword(state) {
68                continue;
69            }
70
71            if self.lex_atom(state) {
72                continue;
73            }
74
75            if self.lex_operators(state) {
76                continue;
77            }
78
79            state.safe_check(safe_point);
80        }
81
82        // 添加 EOF token
83        let eof_pos = state.get_position();
84        state.add_token(ElixirSyntaxKind::Eof, eof_pos, eof_pos);
85        Ok(())
86    }
87
88    /// 跳过空白字符
89    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
90        match ELIXIR_WHITESPACE.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Whitespace) {
91            Some(token) => {
92                state.advance_with(token);
93                return true;
94            }
95            None => {}
96        }
97        false
98    }
99
100    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
101        match ELIXIR_COMMENT.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Comment) {
102            Some(token) => {
103                state.advance_with(token);
104                return true;
105            }
106            None => {}
107        }
108        false
109    }
110
111    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
112        let _start = state.get_position();
113        match ELIXIR_STRING.scan(state.rest(), state.get_position(), ElixirSyntaxKind::String) {
114            Some(token) => {
115                state.advance_with(token);
116                return true;
117            }
118            None => {}
119        }
120        false
121    }
122
123    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
124        let _start = state.get_position();
125        match ELIXIR_CHAR.scan(state.rest(), state.get_position(), ElixirSyntaxKind::Character) {
126            Some(token) => {
127                state.advance_with(token);
128                return true;
129            }
130            None => {}
131        }
132        false
133    }
134
135    fn lex_sigil<S: Source>(&self, state: &mut State<S>) -> bool {
136        let start = state.get_position();
137        let rest = state.rest();
138
139        if rest.starts_with("~") {
140            state.advance(1);
141            if let Some(sigil_type) = state.peek() {
142                if sigil_type.is_alphabetic() {
143                    state.advance(sigil_type.len_utf8());
144
145                    // 查找分隔符
146                    if let Some(delimiter) = state.peek() {
147                        let closing_delimiter = match delimiter {
148                            '(' => ')',
149                            '[' => ']',
150                            '{' => '}',
151                            '<' => '>',
152                            '/' => '/',
153                            '|' => '|',
154                            '"' => '"',
155                            '\'' => '\'',
156                            _ => delimiter,
157                        };
158
159                        state.advance(delimiter.len_utf8());
160
161                        while let Some(ch) = state.peek() {
162                            if ch == closing_delimiter {
163                                state.advance(ch.len_utf8());
164                                break;
165                            }
166                            state.advance(ch.len_utf8());
167                        }
168
169                        // 可选的修饰符
170                        while let Some(ch) = state.peek() {
171                            if ch.is_alphabetic() {
172                                state.advance(ch.len_utf8());
173                            }
174                            else {
175                                break;
176                            }
177                        }
178
179                        state.add_token(ElixirSyntaxKind::Sigil, start, state.get_position());
180                        return true;
181                    }
182                }
183            }
184        }
185        false
186    }
187
188    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
189        let start = state.get_position();
190        let first = match state.current() {
191            Some(c) => c,
192            None => return false,
193        };
194        if !first.is_ascii_digit() {
195            return false;
196        }
197        let mut is_float = false;
198        if first == '0' {
199            match state.peek_next_n(1) {
200                Some('x') | Some('X') => {
201                    state.advance(2);
202                    while let Some(c) = state.peek() {
203                        if c.is_ascii_hexdigit() || c == '_' {
204                            state.advance(1);
205                        }
206                        else {
207                            break;
208                        }
209                    }
210                }
211                Some('b') | Some('B') => {
212                    state.advance(2);
213                    while let Some(c) = state.peek() {
214                        if c == '0' || c == '1' || c == '_' {
215                            state.advance(1);
216                        }
217                        else {
218                            break;
219                        }
220                    }
221                }
222                Some('o') | Some('O') => {
223                    state.advance(2);
224                    while let Some(c) = state.peek() {
225                        if ('0'..='7').contains(&c) || c == '_' {
226                            state.advance(1);
227                        }
228                        else {
229                            break;
230                        }
231                    }
232                }
233                _ => {
234                    state.advance(1);
235                    while let Some(c) = state.peek() {
236                        if c.is_ascii_digit() || c == '_' {
237                            state.advance(1);
238                        }
239                        else {
240                            break;
241                        }
242                    }
243                }
244            }
245        }
246        else {
247            state.advance(1);
248            while let Some(c) = state.peek() {
249                if c.is_ascii_digit() || c == '_' {
250                    state.advance(1);
251                }
252                else {
253                    break;
254                }
255            }
256        }
257        // fractional part
258        if state.peek() == Some('.') {
259            let n1 = state.peek_next_n(1);
260            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
261                is_float = true;
262                state.advance(1); // consume '.'
263                while let Some(c) = state.peek() {
264                    if c.is_ascii_digit() || c == '_' {
265                        state.advance(1);
266                    }
267                    else {
268                        break;
269                    }
270                }
271            }
272        }
273        // exponent
274        if let Some(c) = state.peek() {
275            if c == 'e' || c == 'E' {
276                let n1 = state.peek_next_n(1);
277                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
278                    is_float = true;
279                    state.advance(1);
280                    if let Some(sign) = state.peek() {
281                        if sign == '+' || sign == '-' {
282                            state.advance(1);
283                        }
284                    }
285                    while let Some(d) = state.peek() {
286                        if d.is_ascii_digit() || d == '_' {
287                            state.advance(1);
288                        }
289                        else {
290                            break;
291                        }
292                    }
293                }
294            }
295        }
296        // suffix letters (Elixir does not have explicit number suffixes like Rust, but we keep the structure for consistency if needed later)
297        while let Some(c) = state.peek() {
298            if c.is_ascii_alphabetic() {
299                state.advance(1);
300            }
301            else {
302                break;
303            }
304        }
305        let end = state.get_position();
306        state.add_token(if is_float { ElixirSyntaxKind::Float } else { ElixirSyntaxKind::Number }, start, end);
307        true
308    }
309
310    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
311        let start = state.get_position();
312
313        if let Some(ch) = state.current() {
314            if ch.is_alphabetic() || ch == '_' {
315                state.advance(ch.len_utf8());
316
317                while let Some(next_ch) = state.peek() {
318                    if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!' {
319                        state.advance(next_ch.len_utf8());
320                    }
321                    else {
322                        break;
323                    }
324                }
325
326                let text = state.get_text_in((start..state.get_position()).into());
327                let kind = match text {
328                    "after" => ElixirSyntaxKind::After,
329                    "and" => ElixirSyntaxKind::And,
330                    "case" => ElixirSyntaxKind::Case,
331                    "catch" => ElixirSyntaxKind::Catch,
332                    "cond" => ElixirSyntaxKind::Cond,
333                    "def" => ElixirSyntaxKind::Def,
334                    "defp" => ElixirSyntaxKind::Defp,
335                    "defmodule" => ElixirSyntaxKind::Defmodule,
336                    "defstruct" => ElixirSyntaxKind::Defstruct,
337                    "defprotocol" => ElixirSyntaxKind::Defprotocol,
338                    "defimpl" => ElixirSyntaxKind::Defimpl,
339                    "defmacro" => ElixirSyntaxKind::Defmacro,
340                    "defmacrop" => ElixirSyntaxKind::Defmacrop,
341                    "do" => ElixirSyntaxKind::Do,
342                    "else" => ElixirSyntaxKind::Else,
343                    "elsif" => ElixirSyntaxKind::Elsif,
344                    "end" => ElixirSyntaxKind::End,
345                    "false" => ElixirSyntaxKind::False,
346                    "fn" => ElixirSyntaxKind::Fn,
347                    "if" => ElixirSyntaxKind::If,
348                    "in" => ElixirSyntaxKind::In,
349                    "not" => ElixirSyntaxKind::Not,
350                    "or" => ElixirSyntaxKind::Or,
351                    "receive" => ElixirSyntaxKind::Receive,
352                    "rescue" => ElixirSyntaxKind::Rescue,
353                    "true" => ElixirSyntaxKind::True,
354                    "try" => ElixirSyntaxKind::Try,
355                    "unless" => ElixirSyntaxKind::Unless,
356                    "when" => ElixirSyntaxKind::When,
357                    "with" => ElixirSyntaxKind::With,
358                    _ => {
359                        if text.chars().next().unwrap().is_uppercase() {
360                            ElixirSyntaxKind::Variable
361                        }
362                        else {
363                            ElixirSyntaxKind::Identifier
364                        }
365                    }
366                };
367
368                state.add_token(kind, start, state.get_position());
369                return true;
370            }
371        }
372        false
373    }
374
375    fn lex_atom<S: Source>(&self, state: &mut State<S>) -> bool {
376        let start = state.get_position();
377
378        if state.current() == Some(':') {
379            state.advance(1);
380
381            // 处理引用的原子 :"atom"
382            if state.peek() == Some('"') {
383                state.advance(1);
384                while let Some(ch) = state.peek() {
385                    if ch == '"' {
386                        state.advance(1);
387                        break;
388                    }
389                    if ch == '\\' {
390                        state.advance(1);
391                        if let Some(escaped) = state.peek() {
392                            state.advance(escaped.len_utf8());
393                        }
394                    }
395                    else {
396                        state.advance(ch.len_utf8());
397                    }
398                }
399            }
400            else if let Some(ch) = state.peek() {
401                if ch.is_alphabetic() || ch == '_' {
402                    state.advance(ch.len_utf8());
403                    while let Some(next_ch) = state.peek() {
404                        if next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!' {
405                            state.advance(next_ch.len_utf8());
406                        }
407                        else {
408                            break;
409                        }
410                    }
411                }
412            }
413
414            state.add_token(ElixirSyntaxKind::Atom, start, state.get_position());
415            return true;
416        }
417        false
418    }
419
420    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
421        let start = state.get_position();
422        let rest = state.rest();
423
424        // 多字符操作符
425        if rest.starts_with("===") {
426            state.advance(3);
427            state.add_token(ElixirSyntaxKind::EqualEqualEqual, start, state.get_position());
428            return true;
429        }
430        if rest.starts_with("!==") {
431            state.advance(3);
432            state.add_token(ElixirSyntaxKind::NotEqualEqual, start, state.get_position());
433            return true;
434        }
435        if rest.starts_with("==") {
436            state.advance(2);
437            state.add_token(ElixirSyntaxKind::EqualEqual, start, state.get_position());
438            return true;
439        }
440        if rest.starts_with("!=") {
441            state.advance(2);
442            state.add_token(ElixirSyntaxKind::NotEqual, start, state.get_position());
443            return true;
444        }
445        if rest.starts_with("<=") {
446            state.advance(2);
447            state.add_token(ElixirSyntaxKind::LessEqual, start, state.get_position());
448            return true;
449        }
450        if rest.starts_with(">=") {
451            state.advance(2);
452            state.add_token(ElixirSyntaxKind::GreaterEqual, start, state.get_position());
453            return true;
454        }
455        if rest.starts_with("++") {
456            state.advance(2);
457            state.add_token(ElixirSyntaxKind::PlusPlus, start, state.get_position());
458            return true;
459        }
460        if rest.starts_with("--") {
461            state.advance(2);
462            state.add_token(ElixirSyntaxKind::MinusMinus, start, state.get_position());
463            return true;
464        }
465        if rest.starts_with("**") {
466            state.advance(2);
467            state.add_token(ElixirSyntaxKind::StarStar, start, state.get_position());
468            return true;
469        }
470        if rest.starts_with("<<") {
471            state.advance(2);
472            state.add_token(ElixirSyntaxKind::LeftShift, start, state.get_position());
473            return true;
474        }
475        if rest.starts_with(">>") {
476            state.advance(2);
477            state.add_token(ElixirSyntaxKind::RightShift, start, state.get_position());
478            return true;
479        }
480        if rest.starts_with("=~") {
481            state.advance(2);
482            state.add_token(ElixirSyntaxKind::MatchOp, start, state.get_position());
483            return true;
484        }
485        if rest.starts_with("|>") {
486            state.advance(2);
487            state.add_token(ElixirSyntaxKind::PipeRight, start, state.get_position());
488            return true;
489        }
490        if rest.starts_with("||") {
491            state.advance(2);
492            state.add_token(ElixirSyntaxKind::PipePipe, start, state.get_position());
493            return true;
494        }
495        if rest.starts_with("->") {
496            state.advance(2);
497            state.add_token(ElixirSyntaxKind::Arrow, start, state.get_position());
498            return true;
499        }
500
501        // 单字符操作符
502        if let Some(ch) = state.current() {
503            let kind = match ch {
504                '+' => ElixirSyntaxKind::Plus,
505                '-' => ElixirSyntaxKind::Minus,
506                '*' => ElixirSyntaxKind::Star,
507                '/' => ElixirSyntaxKind::Slash,
508                '=' => ElixirSyntaxKind::Equal,
509                '<' => ElixirSyntaxKind::Less,
510                '>' => ElixirSyntaxKind::Greater,
511                '!' => ElixirSyntaxKind::Exclamation,
512                '?' => ElixirSyntaxKind::Question,
513                '&' => ElixirSyntaxKind::Ampersand,
514                '@' => ElixirSyntaxKind::At,
515                '^' => ElixirSyntaxKind::Caret,
516                '~' => ElixirSyntaxKind::Tilde,
517                '|' => ElixirSyntaxKind::Pipe,
518                '#' => ElixirSyntaxKind::Hash,
519                '(' => ElixirSyntaxKind::LeftParen,
520                ')' => ElixirSyntaxKind::RightParen,
521                '{' => ElixirSyntaxKind::LeftBrace,
522                '}' => ElixirSyntaxKind::RightBrace,
523                '[' => ElixirSyntaxKind::LeftBracket,
524                ']' => ElixirSyntaxKind::RightBracket,
525                ',' => ElixirSyntaxKind::Comma,
526                ';' => ElixirSyntaxKind::Semicolon,
527                '.' => ElixirSyntaxKind::Dot,
528                ':' => ElixirSyntaxKind::Colon,
529                '\n' => ElixirSyntaxKind::Newline,
530                _ => return false,
531            };
532
533            state.advance(ch.len_utf8());
534            state.add_token(kind, start, state.get_position());
535            return true;
536        }
537
538        false
539    }
540}