// oak_elixir/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::ElixirLanguage, lexer::token_type::ElixirTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState,
7    errors::OakError,
8    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
/// Shorthand for the shared lexer state, specialized to the Elixir language.
type State<'s, S> = LexerState<'s, S, ElixirLanguage>;
14
15static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static ELIXIR_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
17static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
19
/// Tokenizer for Elixir source code.
#[derive(Clone, Debug)]
pub struct ElixirLexer<'config> {
    // Borrowed language configuration. None of the lexing methods visible in
    // this file currently read it; kept for forward compatibility.
    config: &'config ElixirLanguage,
}
24
25impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
26    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElixirLanguage>) -> LexOutput<ElixirLanguage> {
27        let mut state = State::new_with_cache(source, 0, cache);
28        let result = self.run(&mut state);
29        if result.is_ok() {
30            state.add_eof()
31        }
32        state.finish_with_cache(result, cache)
33    }
34}
35
36impl<'config> ElixirLexer<'config> {
    /// Creates a new `ElixirLexer` borrowing the given language configuration.
    ///
    /// The configuration is stored but not read by any of the lexing routines
    /// visible in this file.
    pub fn new(config: &'config ElixirLanguage) -> Self {
        Self { config }
    }
41
42    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
43        while state.not_at_end() {
44            let safe_point = state.get_position();
45
46            if self.skip_whitespace(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_char_literal(state) {
59                continue;
60            }
61
62            if self.lex_sigil(state) {
63                continue;
64            }
65
66            if self.lex_number_literal(state) {
67                continue;
68            }
69
70            if self.lex_identifier_or_keyword(state) {
71                continue;
72            }
73
74            if self.lex_atom(state) {
75                continue;
76            }
77
78            if self.lex_operators(state) {
79                continue;
80            }
81
82            state.advance_if_dead_lock(safe_point);
83        }
84
85        Ok(())
86    }
87
    /// Skips a run of whitespace via the shared whitespace scanner, emitting
    /// a `Whitespace` token. Returns `true` when input was consumed.
    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_WHITESPACE.scan(state, ElixirTokenType::Whitespace)
    }
92
    /// Skips a `#` line comment, emitting a `Comment` token. Both the line-
    /// and block-comment type slots use `Comment`; Elixir has no block
    /// comments, so the block slot is not expected to fire.
    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_COMMENT.scan(state, ElixirTokenType::Comment, ElixirTokenType::Comment)
    }
96
    /// Lexes a double-quoted string literal via the shared string scanner.
    /// NOTE(review): heredocs (`"""`) and `#{...}` interpolation are left to
    /// the generic `StringConfig` scanner — confirm it handles them.
    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_STRING.scan(state, ElixirTokenType::String)
    }
100
    /// Lexes a single-quoted charlist literal via the shared string scanner
    /// configured in `ELIXIR_CHAR`, emitting a `Character` token.
    fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_CHAR.scan(state, ElixirTokenType::Character)
    }
104
105    fn lex_sigil<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
106        let start = state.get_position();
107        if state.peek() == Some('~') {
108            state.advance(1);
109            state.take_while(|c| c.is_alphabetic());
110            if let Some(ch) = state.peek() {
111                if "\"\'([{<".contains(ch) {
112                    // Simplified sigil lexing
113                    state.advance(ch.len_utf8());
114                    let closer = match ch {
115                        '(' => ')',
116                        '[' => ']',
117                        '{' => '}',
118                        '<' => '>',
119                        c => c,
120                    };
121                    state.take_while(|c| c != closer);
122                    if state.peek() == Some(closer) {
123                        state.advance(closer.len_utf8());
124                    }
125                }
126            }
127            state.add_token(ElixirTokenType::Sigil, start, state.get_position());
128            return true;
129        }
130        false
131    }
132
133    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
134        let start = state.get_position();
135        if let Some(ch) = state.peek() {
136            if ch.is_ascii_digit() {
137                state.take_while(|c| c.is_ascii_digit() || c == '_');
138                if state.peek() == Some('.') {
139                    state.advance(1);
140                    state.take_while(|c| c.is_ascii_digit() || c == '_');
141                    state.add_token(ElixirTokenType::Float, start, state.get_position());
142                }
143                else {
144                    state.add_token(ElixirTokenType::Number, start, state.get_position());
145                }
146                return true;
147            }
148        }
149        false
150    }
151
152    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
153        let start = state.get_position();
154        if let Some(ch) = state.peek() {
155            if ch.is_alphabetic() || ch == '_' {
156                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
157                let text = state.get_text_in(oak_core::Range { start, end: state.get_position() });
158                let token_type = match text.as_ref() {
159                    "after" => ElixirTokenType::After,
160                    "and" => ElixirTokenType::And,
161                    "case" => ElixirTokenType::Case,
162                    "catch" => ElixirTokenType::Catch,
163                    "cond" => ElixirTokenType::Cond,
164                    "def" => ElixirTokenType::Def,
165                    "defp" => ElixirTokenType::Defp,
166                    "defmodule" => ElixirTokenType::Defmodule,
167                    "defstruct" => ElixirTokenType::Defstruct,
168                    "defprotocol" => ElixirTokenType::Defprotocol,
169                    "defimpl" => ElixirTokenType::Defimpl,
170                    "defmacro" => ElixirTokenType::Defmacro,
171                    "defmacrop" => ElixirTokenType::Defmacrop,
172                    "do" => ElixirTokenType::Do,
173                    "else" => ElixirTokenType::Else,
174                    "elsif" => ElixirTokenType::Elsif,
175                    "end" => ElixirTokenType::End,
176                    "false" => ElixirTokenType::False,
177                    "fn" => ElixirTokenType::Fn,
178                    "if" => ElixirTokenType::If,
179                    "in" => ElixirTokenType::In,
180                    "not" => ElixirTokenType::Not,
181                    "or" => ElixirTokenType::Or,
182                    "receive" => ElixirTokenType::Receive,
183                    "rescue" => ElixirTokenType::Rescue,
184                    "true" => ElixirTokenType::True,
185                    "try" => ElixirTokenType::Try,
186                    "unless" => ElixirTokenType::Unless,
187                    "when" => ElixirTokenType::When,
188                    "with" => ElixirTokenType::With,
189                    _ if text.chars().next().map_or(false, |c| c.is_uppercase()) => ElixirTokenType::Variable,
190                    _ => ElixirTokenType::Identifier,
191                };
192                state.add_token(token_type, start, state.get_position());
193                return true;
194            }
195        }
196        false
197    }
198
199    fn lex_atom<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
200        let start = state.get_position();
201        if state.peek() == Some(':') {
202            state.advance(1);
203            if let Some(ch) = state.peek() {
204                if ch.is_alphabetic() || ch == '_' {
205                    state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
206                }
207                else if ch == '"' {
208                    state.advance(1);
209                    state.take_while(|c| c != '"');
210                    if state.peek() == Some('"') {
211                        state.advance(1);
212                    }
213                }
214            }
215            state.add_token(ElixirTokenType::Atom, start, state.get_position());
216            return true;
217        }
218        false
219    }
220
221    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
222        let start = state.get_position();
223        let operators = [
224            ("|>", ElixirTokenType::Pipeline),
225            ("++", ElixirTokenType::PlusPlus),
226            ("--", ElixirTokenType::MinusMinus),
227            ("<>", ElixirTokenType::Concat),
228            ("==", ElixirTokenType::EqEq),
229            ("!=", ElixirTokenType::Ne),
230            ("<=", ElixirTokenType::Le),
231            (">=", ElixirTokenType::Ge),
232            ("&&", ElixirTokenType::AndAnd),
233            ("||", ElixirTokenType::OrOr),
234            ("<<", ElixirTokenType::LeftDoubleBracket),
235            (">>", ElixirTokenType::RightDoubleBracket),
236            ("->", ElixirTokenType::Arrow),
237            ("+", ElixirTokenType::Plus),
238            ("-", ElixirTokenType::Minus),
239            ("*", ElixirTokenType::Mul),
240            ("/", ElixirTokenType::Div),
241            (".", ElixirTokenType::Dot),
242            (",", ElixirTokenType::Comma),
243            (";", ElixirTokenType::Semicolon),
244            (":", ElixirTokenType::Colon),
245            ("(", ElixirTokenType::LeftParen),
246            (")", ElixirTokenType::RightParen),
247            ("{", ElixirTokenType::LeftBrace),
248            ("}", ElixirTokenType::RightBrace),
249            ("[", ElixirTokenType::LeftBracket),
250            ("]", ElixirTokenType::RightBracket),
251            ("|", ElixirTokenType::Pipe),
252            ("=", ElixirTokenType::Eq),
253            ("<", ElixirTokenType::Lt),
254            (">", ElixirTokenType::Gt),
255            ("!", ElixirTokenType::Bang),
256            ("@", ElixirTokenType::At),
257            ("%", ElixirTokenType::Percent),
258        ];
259
260        for (op, token_type) in operators {
261            if state.starts_with(op) {
262                state.advance(op.len());
263                state.add_token(token_type, start, state.get_position());
264                return true;
265            }
266        }
267        false
268    }
269}