Skip to main content

oak_elixir/lexer/
mod.rs

1use crate::{kind::ElixirSyntaxKind, language::ElixirLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState,
4    errors::OakError,
5    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
6    source::Source,
7};
8use std::sync::LazyLock;
9
10type State<'s, S> = LexerState<'s, S, ElixirLanguage>;
11
12static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
13static ELIXIR_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
14static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
16
17#[derive(Clone, Debug)]
18pub struct ElixirLexer<'config> {
19    _config: &'config ElixirLanguage,
20}
21
22impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
23    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElixirLanguage>) -> LexOutput<ElixirLanguage> {
24        let mut state = State::new_with_cache(source, 0, cache);
25        let result = self.run(&mut state);
26        if result.is_ok() {
27            state.add_eof();
28        }
29        state.finish_with_cache(result, cache)
30    }
31}
32
33impl<'config> ElixirLexer<'config> {
34    pub fn new(config: &'config ElixirLanguage) -> Self {
35        Self { _config: config }
36    }
37
38    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.skip_comment(state) {
47                continue;
48            }
49
50            if self.lex_string_literal(state) {
51                continue;
52            }
53
54            if self.lex_char_literal(state) {
55                continue;
56            }
57
58            if self.lex_sigil(state) {
59                continue;
60            }
61
62            if self.lex_number_literal(state) {
63                continue;
64            }
65
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            if self.lex_atom(state) {
71                continue;
72            }
73
74            if self.lex_operators(state) {
75                continue;
76            }
77
78            state.advance_if_dead_lock(safe_point);
79        }
80
81        Ok(())
82    }
83
84    /// 跳过空白字符
85    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86        ELIXIR_WHITESPACE.scan(state, ElixirSyntaxKind::Whitespace)
87    }
88
89    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
90        ELIXIR_COMMENT.scan(state, ElixirSyntaxKind::Comment, ElixirSyntaxKind::Comment)
91    }
92
93    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
94        ELIXIR_STRING.scan(state, ElixirSyntaxKind::String)
95    }
96
97    fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
98        ELIXIR_CHAR.scan(state, ElixirSyntaxKind::Character)
99    }
100
101    fn lex_sigil<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
102        let start = state.get_position();
103
104        if state.consume_if_starts_with("~") {
105            if let Some(sigil_type) = state.peek() {
106                if sigil_type.is_alphabetic() {
107                    state.advance(sigil_type.len_utf8());
108
109                    // 查找分隔符
110                    if let Some(delimiter) = state.peek() {
111                        let closing_delimiter = match delimiter {
112                            '(' => ')',
113                            '[' => ']',
114                            '{' => '}',
115                            '<' => '>',
116                            '/' => '/',
117                            '|' => '|',
118                            '"' => '"',
119                            '\'' => '\'',
120                            _ => delimiter,
121                        };
122
123                        state.advance(delimiter.len_utf8());
124
125                        while let Some(ch) = state.peek() {
126                            if ch == closing_delimiter {
127                                state.advance(ch.len_utf8());
128                                break;
129                            }
130                            state.advance(ch.len_utf8());
131                        }
132
133                        // 可选的修饰符
134                        state.take_while(|c| c.is_alphabetic());
135
136                        state.add_token(ElixirSyntaxKind::Sigil, start, state.get_position());
137                        return true;
138                    }
139                }
140            }
141        }
142        false
143    }
144
145    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146        let start = state.get_position();
147        let first = match state.peek() {
148            Some(c) => c,
149            None => return false,
150        };
151        if !first.is_ascii_digit() {
152            return false;
153        }
154        let mut is_float = false;
155        if first == '0' {
156            match state.peek_next_n(1) {
157                Some('x') | Some('X') => {
158                    state.advance(2);
159                    state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
160                }
161                Some('b') | Some('B') => {
162                    state.advance(2);
163                    state.take_while(|c| c == '0' || c == '1' || c == '_');
164                }
165                Some('o') | Some('O') => {
166                    state.advance(2);
167                    state.take_while(|c| ('0'..='7').contains(&c) || c == '_');
168                }
169                _ => {
170                    state.advance(1);
171                    state.take_while(|c| c.is_ascii_digit() || c == '_');
172                }
173            }
174        }
175        else {
176            state.advance(1);
177            state.take_while(|c| c.is_ascii_digit() || c == '_');
178        }
179        // fractional part
180        if state.peek() == Some('.') {
181            let n1 = state.peek_next_n(1);
182            if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
183                is_float = true;
184                state.advance(1); // consume '.'
185                state.take_while(|c| c.is_ascii_digit() || c == '_');
186            }
187        }
188        // exponent
189        if let Some(c) = state.peek() {
190            if c == 'e' || c == 'E' {
191                let n1 = state.peek_next_n(1);
192                if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
193                    is_float = true;
194                    state.advance(1);
195                    if let Some(sign) = state.peek() {
196                        if sign == '+' || sign == '-' {
197                            state.advance(1);
198                        }
199                    }
200                    state.take_while(|d| d.is_ascii_digit() || d == '_');
201                }
202            }
203        }
204        // suffix letters
205        state.take_while(|c| c.is_ascii_alphabetic());
206        let end = state.get_position();
207        state.add_token(if is_float { ElixirSyntaxKind::Float } else { ElixirSyntaxKind::Number }, start, end);
208        true
209    }
210
211    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
212        let start = state.get_position();
213
214        if let Some(ch) = state.peek() {
215            if ch.is_alphabetic() || ch == '_' {
216                state.advance(ch.len_utf8());
217                state.take_while(|next_ch| next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!');
218
219                let text = state.get_text_in((start..state.get_position()).into());
220                let kind = match text.as_ref() {
221                    "after" => ElixirSyntaxKind::After,
222                    "and" => ElixirSyntaxKind::And,
223                    "case" => ElixirSyntaxKind::Case,
224                    "catch" => ElixirSyntaxKind::Catch,
225                    "cond" => ElixirSyntaxKind::Cond,
226                    "def" => ElixirSyntaxKind::Def,
227                    "defp" => ElixirSyntaxKind::Defp,
228                    "defmodule" => ElixirSyntaxKind::Defmodule,
229                    "defstruct" => ElixirSyntaxKind::Defstruct,
230                    "defprotocol" => ElixirSyntaxKind::Defprotocol,
231                    "defimpl" => ElixirSyntaxKind::Defimpl,
232                    "defmacro" => ElixirSyntaxKind::Defmacro,
233                    "defmacrop" => ElixirSyntaxKind::Defmacrop,
234                    "do" => ElixirSyntaxKind::Do,
235                    "else" => ElixirSyntaxKind::Else,
236                    "elsif" => ElixirSyntaxKind::Elsif,
237                    "end" => ElixirSyntaxKind::End,
238                    "false" => ElixirSyntaxKind::False,
239                    "fn" => ElixirSyntaxKind::Fn,
240                    "if" => ElixirSyntaxKind::If,
241                    "in" => ElixirSyntaxKind::In,
242                    "not" => ElixirSyntaxKind::Not,
243                    "or" => ElixirSyntaxKind::Or,
244                    "receive" => ElixirSyntaxKind::Receive,
245                    "rescue" => ElixirSyntaxKind::Rescue,
246                    "true" => ElixirSyntaxKind::True,
247                    "try" => ElixirSyntaxKind::Try,
248                    "unless" => ElixirSyntaxKind::Unless,
249                    "when" => ElixirSyntaxKind::When,
250                    "with" => ElixirSyntaxKind::With,
251                    _ => {
252                        if text.as_ref().chars().next().unwrap().is_uppercase() {
253                            ElixirSyntaxKind::Variable
254                        }
255                        else {
256                            ElixirSyntaxKind::Identifier
257                        }
258                    }
259                };
260
261                state.add_token(kind, start, state.get_position());
262                return true;
263            }
264        }
265        false
266    }
267
268    fn lex_atom<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
269        let start = state.get_position();
270
271        if state.consume_if_starts_with(":") {
272            // 处理引用的原子 :"atom"
273            if state.consume_if_starts_with("\"") {
274                while let Some(ch) = state.peek() {
275                    if ch == '"' {
276                        state.advance(1);
277                        break;
278                    }
279                    if state.consume_if_starts_with("\\") {
280                        if let Some(escaped) = state.peek() {
281                            state.advance(escaped.len_utf8());
282                        }
283                    }
284                    else {
285                        state.advance(ch.len_utf8());
286                    }
287                }
288            }
289            else if let Some(ch) = state.peek() {
290                if ch.is_alphabetic() || ch == '_' {
291                    state.advance(ch.len_utf8());
292                    state.take_while(|next_ch| next_ch.is_alphanumeric() || next_ch == '_' || next_ch == '?' || next_ch == '!');
293                }
294            }
295
296            state.add_token(ElixirSyntaxKind::Atom, start, state.get_position());
297            return true;
298        }
299        false
300    }
301
302    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
303        let start = state.get_position();
304
305        // 多字符操作符
306        let ops = [
307            ("===", ElixirSyntaxKind::EqualEqualEqual),
308            ("!==", ElixirSyntaxKind::NotEqualEqual),
309            ("==", ElixirSyntaxKind::EqualEqual),
310            ("!=", ElixirSyntaxKind::NotEqual),
311            ("<=", ElixirSyntaxKind::LessEqual),
312            (">=", ElixirSyntaxKind::GreaterEqual),
313            ("++", ElixirSyntaxKind::PlusPlus),
314            ("--", ElixirSyntaxKind::MinusMinus),
315            ("**", ElixirSyntaxKind::StarStar),
316            ("<<", ElixirSyntaxKind::LeftShift),
317            (">>", ElixirSyntaxKind::RightShift),
318            ("=~", ElixirSyntaxKind::MatchOp),
319            ("|>", ElixirSyntaxKind::PipeRight),
320            ("||", ElixirSyntaxKind::PipePipe),
321            ("->", ElixirSyntaxKind::Arrow),
322        ];
323
324        for (pattern, kind) in ops {
325            if state.consume_if_starts_with(pattern) {
326                state.add_token(kind, start, state.get_position());
327                return true;
328            }
329        }
330
331        // 单字符操作符
332        if let Some(ch) = state.peek() {
333            let kind = match ch {
334                '+' => ElixirSyntaxKind::Plus,
335                '-' => ElixirSyntaxKind::Minus,
336                '*' => ElixirSyntaxKind::Star,
337                '/' => ElixirSyntaxKind::Slash,
338                '=' => ElixirSyntaxKind::Equal,
339                '<' => ElixirSyntaxKind::Less,
340                '>' => ElixirSyntaxKind::Greater,
341                '!' => ElixirSyntaxKind::Exclamation,
342                '?' => ElixirSyntaxKind::Question,
343                '&' => ElixirSyntaxKind::Ampersand,
344                '@' => ElixirSyntaxKind::At,
345                '^' => ElixirSyntaxKind::Caret,
346                '~' => ElixirSyntaxKind::Tilde,
347                '|' => ElixirSyntaxKind::Pipe,
348                '#' => ElixirSyntaxKind::Hash,
349                '(' => ElixirSyntaxKind::LeftParen,
350                ')' => ElixirSyntaxKind::RightParen,
351                '{' => ElixirSyntaxKind::LeftBrace,
352                '}' => ElixirSyntaxKind::RightBrace,
353                '[' => ElixirSyntaxKind::LeftBracket,
354                ']' => ElixirSyntaxKind::RightBracket,
355                ',' => ElixirSyntaxKind::Comma,
356                ';' => ElixirSyntaxKind::Semicolon,
357                '.' => ElixirSyntaxKind::Dot,
358                ':' => ElixirSyntaxKind::Colon,
359                '\n' => ElixirSyntaxKind::Newline,
360                _ => return false,
361            };
362
363            state.advance(ch.len_utf8());
364            state.add_token(kind, start, state.get_position());
365            return true;
366        }
367
368        false
369    }
370}