1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::ElixirLanguage, lexer::token_type::ElixirTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState,
7 errors::OakError,
8 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9 source::Source,
10};
11use std::sync::LazyLock;
12
/// Shorthand for the oak_core lexer state specialised to Elixir.
type State<'s, S> = LexerState<'s, S, ElixirLanguage>;

/// Whitespace scanner; `unicode_whitespace: true` so Unicode space characters are consumed too.
static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line comments start with `#`. Elixir has no block comments, hence the empty markers.
/// NOTE(review): assumes `CommentConfig` treats empty `block_start`/`block_end` as
/// "no block comments" rather than matching the empty string — confirm in oak_core.
static ELIXIR_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
/// Double-quoted strings with backslash escapes.
static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
/// Single-quoted charlists; no escape character configured (TODO confirm whether `\'` should be honored).
static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
19
/// Lexer for Elixir source code, borrowing a language configuration for its lifetime.
#[derive(Clone, Debug)]
pub struct ElixirLexer<'config> {
    // Language configuration; not read by the scanning routines visible in this file.
    config: &'config ElixirLanguage,
}
24
impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
    /// Tokenises `source` from position 0, threading `cache` through state
    /// creation and finalisation.
    ///
    /// NOTE(review): `_edits` is ignored, so every call re-lexes the whole
    /// source — confirm whether incremental re-lexing is intended here.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElixirLanguage>) -> LexOutput<ElixirLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only append the EOF token on success; errors travel in `result`.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
35
36impl<'config> ElixirLexer<'config> {
37 pub fn new(config: &'config ElixirLanguage) -> Self {
39 Self { config }
40 }
41
42 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
43 while state.not_at_end() {
44 let safe_point = state.get_position();
45
46 if self.skip_whitespace(state) {
47 continue;
48 }
49
50 if self.skip_comment(state) {
51 continue;
52 }
53
54 if self.lex_string_literal(state) {
55 continue;
56 }
57
58 if self.lex_char_literal(state) {
59 continue;
60 }
61
62 if self.lex_sigil(state) {
63 continue;
64 }
65
66 if self.lex_number_literal(state) {
67 continue;
68 }
69
70 if self.lex_identifier_or_keyword(state) {
71 continue;
72 }
73
74 if self.lex_atom(state) {
75 continue;
76 }
77
78 if self.lex_operators(state) {
79 continue;
80 }
81
82 state.advance_if_dead_lock(safe_point);
83 }
84
85 Ok(())
86 }
87
    /// Consumes a run of whitespace as a single token; returns true if any was consumed.
    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_WHITESPACE.scan(state, ElixirTokenType::Whitespace)
    }
92
    /// Consumes a comment; both the line and block token-type slots use
    /// `Comment` since Elixir only has `#` line comments.
    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_COMMENT.scan(state, ElixirTokenType::Comment, ElixirTokenType::Comment)
    }
96
    /// Consumes a double-quoted string literal via the shared string config.
    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_STRING.scan(state, ElixirTokenType::String)
    }
100
    /// Consumes a single-quoted literal (an Elixir charlist) as `Character`.
    /// NOTE(review): `?c` character literals are not handled anywhere visible — confirm.
    fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        ELIXIR_CHAR.scan(state, ElixirTokenType::Character)
    }
104
105 fn lex_sigil<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
106 let start = state.get_position();
107 if state.peek() == Some('~') {
108 state.advance(1);
109 state.take_while(|c| c.is_alphabetic());
110 if let Some(ch) = state.peek() {
111 if "\"\'([{<".contains(ch) {
112 state.advance(ch.len_utf8());
114 let closer = match ch {
115 '(' => ')',
116 '[' => ']',
117 '{' => '}',
118 '<' => '>',
119 c => c,
120 };
121 state.take_while(|c| c != closer);
122 if state.peek() == Some(closer) {
123 state.advance(closer.len_utf8());
124 }
125 }
126 }
127 state.add_token(ElixirTokenType::Sigil, start, state.get_position());
128 return true;
129 }
130 false
131 }
132
    /// Lexes a decimal integer or float literal; underscore digit separators
    /// (`1_000`) are accepted anywhere among the digits.
    ///
    /// NOTE(review): only one character of lookahead is used, so `1.foo` is
    /// tokenised as the float `1.` even though Elixir requires a digit after
    /// the dot — fixing this needs two-char lookahead or position rollback,
    /// neither visible in the `State` API. Hex/octal/binary (`0x…`) and
    /// exponent (`1.0e3`) forms are not recognised either; confirm whether
    /// the parser compensates.
    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        let start = state.get_position();
        if let Some(ch) = state.peek() {
            if ch.is_ascii_digit() {
                state.take_while(|c| c.is_ascii_digit() || c == '_');
                if state.peek() == Some('.') {
                    // Dot seen: commit to a float and consume the fractional digits.
                    state.advance(1);
                    state.take_while(|c| c.is_ascii_digit() || c == '_');
                    state.add_token(ElixirTokenType::Float, start, state.get_position());
                }
                else {
                    state.add_token(ElixirTokenType::Number, start, state.get_position());
                }
                return true;
            }
        }
        false
    }
151
    /// Lexes an identifier, classifying it as a keyword token when the text
    /// matches one of Elixir's reserved/definition words.
    ///
    /// Identifiers start with a letter or `_` and may contain alphanumerics,
    /// `_`, `!`, and `?` (the latter two are legal only at the end in Elixir;
    /// `take_while` approximates by allowing them throughout).
    ///
    /// NOTE(review): capitalised names (`String`, `MyMod`) map to `Variable`,
    /// but in Elixir an uppercase-initial name is a module alias and
    /// lowercase names are the variables — confirm `Variable` is the intended
    /// token type for aliases. `elsif` is matched although Elixir has no such
    /// keyword — presumably deliberate, verify against the token-type design.
    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        let start = state.get_position();
        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() || ch == '_' {
                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
                let text = state.get_text_in(oak_core::Range { start, end: state.get_position() });
                let token_type = match text.as_ref() {
                    "after" => ElixirTokenType::After,
                    "and" => ElixirTokenType::And,
                    "case" => ElixirTokenType::Case,
                    "catch" => ElixirTokenType::Catch,
                    "cond" => ElixirTokenType::Cond,
                    "def" => ElixirTokenType::Def,
                    "defp" => ElixirTokenType::Defp,
                    "defmodule" => ElixirTokenType::Defmodule,
                    "defstruct" => ElixirTokenType::Defstruct,
                    "defprotocol" => ElixirTokenType::Defprotocol,
                    "defimpl" => ElixirTokenType::Defimpl,
                    "defmacro" => ElixirTokenType::Defmacro,
                    "defmacrop" => ElixirTokenType::Defmacrop,
                    "do" => ElixirTokenType::Do,
                    "else" => ElixirTokenType::Else,
                    "elsif" => ElixirTokenType::Elsif,
                    "end" => ElixirTokenType::End,
                    "false" => ElixirTokenType::False,
                    "fn" => ElixirTokenType::Fn,
                    "if" => ElixirTokenType::If,
                    "in" => ElixirTokenType::In,
                    "not" => ElixirTokenType::Not,
                    "or" => ElixirTokenType::Or,
                    "receive" => ElixirTokenType::Receive,
                    "rescue" => ElixirTokenType::Rescue,
                    "true" => ElixirTokenType::True,
                    "try" => ElixirTokenType::Try,
                    "unless" => ElixirTokenType::Unless,
                    "when" => ElixirTokenType::When,
                    "with" => ElixirTokenType::With,
                    _ if text.chars().next().map_or(false, |c| c.is_uppercase()) => ElixirTokenType::Variable,
                    _ => ElixirTokenType::Identifier,
                };
                state.add_token(token_type, start, state.get_position());
                return true;
            }
        }
        false
    }
198
199 fn lex_atom<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
200 let start = state.get_position();
201 if state.peek() == Some(':') {
202 state.advance(1);
203 if let Some(ch) = state.peek() {
204 if ch.is_alphabetic() || ch == '_' {
205 state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
206 }
207 else if ch == '"' {
208 state.advance(1);
209 state.take_while(|c| c != '"');
210 if state.peek() == Some('"') {
211 state.advance(1);
212 }
213 }
214 }
215 state.add_token(ElixirTokenType::Atom, start, state.get_position());
216 return true;
217 }
218 false
219 }
220
221 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
222 let start = state.get_position();
223 let operators = [
224 ("|>", ElixirTokenType::Pipeline),
225 ("++", ElixirTokenType::PlusPlus),
226 ("--", ElixirTokenType::MinusMinus),
227 ("<>", ElixirTokenType::Concat),
228 ("==", ElixirTokenType::EqEq),
229 ("!=", ElixirTokenType::Ne),
230 ("<=", ElixirTokenType::Le),
231 (">=", ElixirTokenType::Ge),
232 ("&&", ElixirTokenType::AndAnd),
233 ("||", ElixirTokenType::OrOr),
234 ("<<", ElixirTokenType::LeftDoubleBracket),
235 (">>", ElixirTokenType::RightDoubleBracket),
236 ("->", ElixirTokenType::Arrow),
237 ("+", ElixirTokenType::Plus),
238 ("-", ElixirTokenType::Minus),
239 ("*", ElixirTokenType::Mul),
240 ("/", ElixirTokenType::Div),
241 (".", ElixirTokenType::Dot),
242 (",", ElixirTokenType::Comma),
243 (";", ElixirTokenType::Semicolon),
244 (":", ElixirTokenType::Colon),
245 ("(", ElixirTokenType::LeftParen),
246 (")", ElixirTokenType::RightParen),
247 ("{", ElixirTokenType::LeftBrace),
248 ("}", ElixirTokenType::RightBrace),
249 ("[", ElixirTokenType::LeftBracket),
250 ("]", ElixirTokenType::RightBracket),
251 ("|", ElixirTokenType::Pipe),
252 ("=", ElixirTokenType::Eq),
253 ("<", ElixirTokenType::Lt),
254 (">", ElixirTokenType::Gt),
255 ("!", ElixirTokenType::Bang),
256 ("@", ElixirTokenType::At),
257 ("%", ElixirTokenType::Percent),
258 ];
259
260 for (op, token_type) in operators {
261 if state.starts_with(op) {
262 state.advance(op.len());
263 state.add_token(token_type, start, state.get_position());
264 return true;
265 }
266 }
267 false
268 }
269}