// oak_elixir/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::ElixirLanguage, lexer::token_type::ElixirTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState,
7    errors::OakError,
8    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
/// Shorthand for the shared lexer state, specialized to Elixir.
type State<'s, S> = LexerState<'s, S, ElixirLanguage>;

/// Whitespace scanner; `unicode_whitespace: true` also accepts non-ASCII whitespace.
static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// `#` line comments only — Elixir has no block comments; the empty
/// `block_start`/`block_end` markers presumably disable block scanning.
/// NOTE(review): confirm that `CommentConfig` treats `""` as "disabled".
static ELIXIR_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
/// Double-quoted strings with `\` escapes.
static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
/// Single-quoted charlists. NOTE(review): `escape: None` means `\` is not
/// treated specially here, but Elixir charlists do support escapes like
/// `'\n'` — confirm this is intentional simplification.
static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
19
/// Lexer for Elixir source code, borrowing its language configuration.
#[derive(Clone, Debug)]
pub struct ElixirLexer<'config> {
    // NOTE(review): stored but never read anywhere in this module —
    // presumably reserved for dialect/feature options; confirm.
    config: &'config ElixirLanguage,
}
24
25impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
26    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElixirLanguage>) -> LexOutput<ElixirLanguage> {
27        let mut state = State::new_with_cache(source, 0, cache);
28        let result = self.run(&mut state);
29        if result.is_ok() {
30            state.add_eof()
31        }
32        state.finish_with_cache(result, cache)
33    }
34}
35
36impl<'config> ElixirLexer<'config> {
37    pub fn new(config: &'config ElixirLanguage) -> Self {
38        Self { config }
39    }
40
41    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
42        while state.not_at_end() {
43            let safe_point = state.get_position();
44
45            if self.skip_whitespace(state) {
46                continue;
47            }
48
49            if self.skip_comment(state) {
50                continue;
51            }
52
53            if self.lex_string_literal(state) {
54                continue;
55            }
56
57            if self.lex_char_literal(state) {
58                continue;
59            }
60
61            if self.lex_sigil(state) {
62                continue;
63            }
64
65            if self.lex_number_literal(state) {
66                continue;
67            }
68
69            if self.lex_identifier_or_keyword(state) {
70                continue;
71            }
72
73            if self.lex_atom(state) {
74                continue;
75            }
76
77            if self.lex_operators(state) {
78                continue;
79            }
80
81            state.advance_if_dead_lock(safe_point);
82        }
83
84        Ok(())
85    }
86
87    /// Skips whitespace.
88    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        ELIXIR_WHITESPACE.scan(state, ElixirTokenType::Whitespace)
90    }
91
92    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93        ELIXIR_COMMENT.scan(state, ElixirTokenType::Comment, ElixirTokenType::Comment)
94    }
95
96    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97        ELIXIR_STRING.scan(state, ElixirTokenType::String)
98    }
99
100    fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
101        ELIXIR_CHAR.scan(state, ElixirTokenType::Character)
102    }
103
104    fn lex_sigil<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
105        let start = state.get_position();
106        if state.peek() == Some('~') {
107            state.advance(1);
108            state.take_while(|c| c.is_alphabetic());
109            if let Some(ch) = state.peek() {
110                if "\"\'([{<".contains(ch) {
111                    // Simplified sigil lexing
112                    state.advance(ch.len_utf8());
113                    let closer = match ch {
114                        '(' => ')',
115                        '[' => ']',
116                        '{' => '}',
117                        '<' => '>',
118                        c => c,
119                    };
120                    state.take_while(|c| c != closer);
121                    if state.peek() == Some(closer) {
122                        state.advance(closer.len_utf8());
123                    }
124                }
125            }
126            state.add_token(ElixirTokenType::Sigil, start, state.get_position());
127            return true;
128        }
129        false
130    }
131
132    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
133        let start = state.get_position();
134        if let Some(ch) = state.peek() {
135            if ch.is_ascii_digit() {
136                state.take_while(|c| c.is_ascii_digit() || c == '_');
137                if state.peek() == Some('.') {
138                    state.advance(1);
139                    state.take_while(|c| c.is_ascii_digit() || c == '_');
140                    state.add_token(ElixirTokenType::Float, start, state.get_position());
141                }
142                else {
143                    state.add_token(ElixirTokenType::Number, start, state.get_position());
144                }
145                return true;
146            }
147        }
148        false
149    }
150
151    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
152        let start = state.get_position();
153        if let Some(ch) = state.peek() {
154            if ch.is_alphabetic() || ch == '_' {
155                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
156                let text = state.get_text_in(oak_core::Range { start, end: state.get_position() });
157                let token_type = match text.as_ref() {
158                    "after" => ElixirTokenType::After,
159                    "and" => ElixirTokenType::And,
160                    "case" => ElixirTokenType::Case,
161                    "catch" => ElixirTokenType::Catch,
162                    "cond" => ElixirTokenType::Cond,
163                    "def" => ElixirTokenType::Def,
164                    "defp" => ElixirTokenType::Defp,
165                    "defmodule" => ElixirTokenType::Defmodule,
166                    "defstruct" => ElixirTokenType::Defstruct,
167                    "defprotocol" => ElixirTokenType::Defprotocol,
168                    "defimpl" => ElixirTokenType::Defimpl,
169                    "defmacro" => ElixirTokenType::Defmacro,
170                    "defmacrop" => ElixirTokenType::Defmacrop,
171                    "do" => ElixirTokenType::Do,
172                    "else" => ElixirTokenType::Else,
173                    "elsif" => ElixirTokenType::Elsif,
174                    "end" => ElixirTokenType::End,
175                    "false" => ElixirTokenType::False,
176                    "fn" => ElixirTokenType::Fn,
177                    "if" => ElixirTokenType::If,
178                    "in" => ElixirTokenType::In,
179                    "not" => ElixirTokenType::Not,
180                    "or" => ElixirTokenType::Or,
181                    "receive" => ElixirTokenType::Receive,
182                    "rescue" => ElixirTokenType::Rescue,
183                    "true" => ElixirTokenType::True,
184                    "try" => ElixirTokenType::Try,
185                    "unless" => ElixirTokenType::Unless,
186                    "when" => ElixirTokenType::When,
187                    "with" => ElixirTokenType::With,
188                    _ if text.chars().next().map_or(false, |c| c.is_uppercase()) => ElixirTokenType::Variable,
189                    _ => ElixirTokenType::Identifier,
190                };
191                state.add_token(token_type, start, state.get_position());
192                return true;
193            }
194        }
195        false
196    }
197
198    fn lex_atom<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
199        let start = state.get_position();
200        if state.peek() == Some(':') {
201            state.advance(1);
202            if let Some(ch) = state.peek() {
203                if ch.is_alphabetic() || ch == '_' {
204                    state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
205                }
206                else if ch == '"' {
207                    state.advance(1);
208                    state.take_while(|c| c != '"');
209                    if state.peek() == Some('"') {
210                        state.advance(1);
211                    }
212                }
213            }
214            state.add_token(ElixirTokenType::Atom, start, state.get_position());
215            return true;
216        }
217        false
218    }
219
220    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
221        let start = state.get_position();
222        let operators = [
223            ("|>", ElixirTokenType::Pipeline),
224            ("++", ElixirTokenType::PlusPlus),
225            ("--", ElixirTokenType::MinusMinus),
226            ("<>", ElixirTokenType::Concat),
227            ("==", ElixirTokenType::EqEq),
228            ("!=", ElixirTokenType::Ne),
229            ("<=", ElixirTokenType::Le),
230            (">=", ElixirTokenType::Ge),
231            ("&&", ElixirTokenType::AndAnd),
232            ("||", ElixirTokenType::OrOr),
233            ("<<", ElixirTokenType::LeftDoubleBracket),
234            (">>", ElixirTokenType::RightDoubleBracket),
235            ("->", ElixirTokenType::Arrow),
236            ("+", ElixirTokenType::Plus),
237            ("-", ElixirTokenType::Minus),
238            ("*", ElixirTokenType::Mul),
239            ("/", ElixirTokenType::Div),
240            (".", ElixirTokenType::Dot),
241            (",", ElixirTokenType::Comma),
242            (";", ElixirTokenType::Semicolon),
243            (":", ElixirTokenType::Colon),
244            ("(", ElixirTokenType::LeftParen),
245            (")", ElixirTokenType::RightParen),
246            ("{", ElixirTokenType::LeftBrace),
247            ("}", ElixirTokenType::RightBrace),
248            ("[", ElixirTokenType::LeftBracket),
249            ("]", ElixirTokenType::RightBracket),
250            ("|", ElixirTokenType::Pipe),
251            ("=", ElixirTokenType::Eq),
252            ("<", ElixirTokenType::Lt),
253            (">", ElixirTokenType::Gt),
254            ("!", ElixirTokenType::Bang),
255            ("@", ElixirTokenType::At),
256            ("%", ElixirTokenType::Percent),
257        ];
258
259        for (op, token_type) in operators {
260            if state.starts_with(op) {
261                state.advance(op.len());
262                state.add_token(token_type, start, state.get_position());
263                return true;
264            }
265        }
266        false
267    }
268}