1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::ElixirLanguage, lexer::token_type::ElixirTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState,
7 errors::OakError,
8 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9 source::Source,
10};
11use std::sync::LazyLock;
12
13type State<'s, S> = LexerState<'s, S, ElixirLanguage>;
14
15static ELIXIR_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static ELIXIR_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
17static ELIXIR_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18static ELIXIR_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: None });
19
20#[derive(Clone, Debug)]
21pub struct ElixirLexer<'config> {
22 config: &'config ElixirLanguage,
23}
24
25impl<'config> Lexer<ElixirLanguage> for ElixirLexer<'config> {
26 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElixirLanguage>) -> LexOutput<ElixirLanguage> {
27 let mut state = State::new_with_cache(source, 0, cache);
28 let result = self.run(&mut state);
29 if result.is_ok() {
30 state.add_eof()
31 }
32 state.finish_with_cache(result, cache)
33 }
34}
35
36impl<'config> ElixirLexer<'config> {
37 pub fn new(config: &'config ElixirLanguage) -> Self {
38 Self { config }
39 }
40
41 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
42 while state.not_at_end() {
43 let safe_point = state.get_position();
44
45 if self.skip_whitespace(state) {
46 continue;
47 }
48
49 if self.skip_comment(state) {
50 continue;
51 }
52
53 if self.lex_string_literal(state) {
54 continue;
55 }
56
57 if self.lex_char_literal(state) {
58 continue;
59 }
60
61 if self.lex_sigil(state) {
62 continue;
63 }
64
65 if self.lex_number_literal(state) {
66 continue;
67 }
68
69 if self.lex_identifier_or_keyword(state) {
70 continue;
71 }
72
73 if self.lex_atom(state) {
74 continue;
75 }
76
77 if self.lex_operators(state) {
78 continue;
79 }
80
81 state.advance_if_dead_lock(safe_point);
82 }
83
84 Ok(())
85 }
86
87 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89 ELIXIR_WHITESPACE.scan(state, ElixirTokenType::Whitespace)
90 }
91
92 fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93 ELIXIR_COMMENT.scan(state, ElixirTokenType::Comment, ElixirTokenType::Comment)
94 }
95
96 fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97 ELIXIR_STRING.scan(state, ElixirTokenType::String)
98 }
99
100 fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
101 ELIXIR_CHAR.scan(state, ElixirTokenType::Character)
102 }
103
104 fn lex_sigil<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
105 let start = state.get_position();
106 if state.peek() == Some('~') {
107 state.advance(1);
108 state.take_while(|c| c.is_alphabetic());
109 if let Some(ch) = state.peek() {
110 if "\"\'([{<".contains(ch) {
111 state.advance(ch.len_utf8());
113 let closer = match ch {
114 '(' => ')',
115 '[' => ']',
116 '{' => '}',
117 '<' => '>',
118 c => c,
119 };
120 state.take_while(|c| c != closer);
121 if state.peek() == Some(closer) {
122 state.advance(closer.len_utf8());
123 }
124 }
125 }
126 state.add_token(ElixirTokenType::Sigil, start, state.get_position());
127 return true;
128 }
129 false
130 }
131
132 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
133 let start = state.get_position();
134 if let Some(ch) = state.peek() {
135 if ch.is_ascii_digit() {
136 state.take_while(|c| c.is_ascii_digit() || c == '_');
137 if state.peek() == Some('.') {
138 state.advance(1);
139 state.take_while(|c| c.is_ascii_digit() || c == '_');
140 state.add_token(ElixirTokenType::Float, start, state.get_position());
141 }
142 else {
143 state.add_token(ElixirTokenType::Number, start, state.get_position());
144 }
145 return true;
146 }
147 }
148 false
149 }
150
151 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
152 let start = state.get_position();
153 if let Some(ch) = state.peek() {
154 if ch.is_alphabetic() || ch == '_' {
155 state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
156 let text = state.get_text_in(oak_core::Range { start, end: state.get_position() });
157 let token_type = match text.as_ref() {
158 "after" => ElixirTokenType::After,
159 "and" => ElixirTokenType::And,
160 "case" => ElixirTokenType::Case,
161 "catch" => ElixirTokenType::Catch,
162 "cond" => ElixirTokenType::Cond,
163 "def" => ElixirTokenType::Def,
164 "defp" => ElixirTokenType::Defp,
165 "defmodule" => ElixirTokenType::Defmodule,
166 "defstruct" => ElixirTokenType::Defstruct,
167 "defprotocol" => ElixirTokenType::Defprotocol,
168 "defimpl" => ElixirTokenType::Defimpl,
169 "defmacro" => ElixirTokenType::Defmacro,
170 "defmacrop" => ElixirTokenType::Defmacrop,
171 "do" => ElixirTokenType::Do,
172 "else" => ElixirTokenType::Else,
173 "elsif" => ElixirTokenType::Elsif,
174 "end" => ElixirTokenType::End,
175 "false" => ElixirTokenType::False,
176 "fn" => ElixirTokenType::Fn,
177 "if" => ElixirTokenType::If,
178 "in" => ElixirTokenType::In,
179 "not" => ElixirTokenType::Not,
180 "or" => ElixirTokenType::Or,
181 "receive" => ElixirTokenType::Receive,
182 "rescue" => ElixirTokenType::Rescue,
183 "true" => ElixirTokenType::True,
184 "try" => ElixirTokenType::Try,
185 "unless" => ElixirTokenType::Unless,
186 "when" => ElixirTokenType::When,
187 "with" => ElixirTokenType::With,
188 _ if text.chars().next().map_or(false, |c| c.is_uppercase()) => ElixirTokenType::Variable,
189 _ => ElixirTokenType::Identifier,
190 };
191 state.add_token(token_type, start, state.get_position());
192 return true;
193 }
194 }
195 false
196 }
197
198 fn lex_atom<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
199 let start = state.get_position();
200 if state.peek() == Some(':') {
201 state.advance(1);
202 if let Some(ch) = state.peek() {
203 if ch.is_alphabetic() || ch == '_' {
204 state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '!' || c == '?');
205 }
206 else if ch == '"' {
207 state.advance(1);
208 state.take_while(|c| c != '"');
209 if state.peek() == Some('"') {
210 state.advance(1);
211 }
212 }
213 }
214 state.add_token(ElixirTokenType::Atom, start, state.get_position());
215 return true;
216 }
217 false
218 }
219
220 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
221 let start = state.get_position();
222 let operators = [
223 ("|>", ElixirTokenType::Pipeline),
224 ("++", ElixirTokenType::PlusPlus),
225 ("--", ElixirTokenType::MinusMinus),
226 ("<>", ElixirTokenType::Concat),
227 ("==", ElixirTokenType::EqEq),
228 ("!=", ElixirTokenType::Ne),
229 ("<=", ElixirTokenType::Le),
230 (">=", ElixirTokenType::Ge),
231 ("&&", ElixirTokenType::AndAnd),
232 ("||", ElixirTokenType::OrOr),
233 ("<<", ElixirTokenType::LeftDoubleBracket),
234 (">>", ElixirTokenType::RightDoubleBracket),
235 ("->", ElixirTokenType::Arrow),
236 ("+", ElixirTokenType::Plus),
237 ("-", ElixirTokenType::Minus),
238 ("*", ElixirTokenType::Mul),
239 ("/", ElixirTokenType::Div),
240 (".", ElixirTokenType::Dot),
241 (",", ElixirTokenType::Comma),
242 (";", ElixirTokenType::Semicolon),
243 (":", ElixirTokenType::Colon),
244 ("(", ElixirTokenType::LeftParen),
245 (")", ElixirTokenType::RightParen),
246 ("{", ElixirTokenType::LeftBrace),
247 ("}", ElixirTokenType::RightBrace),
248 ("[", ElixirTokenType::LeftBracket),
249 ("]", ElixirTokenType::RightBracket),
250 ("|", ElixirTokenType::Pipe),
251 ("=", ElixirTokenType::Eq),
252 ("<", ElixirTokenType::Lt),
253 (">", ElixirTokenType::Gt),
254 ("!", ElixirTokenType::Bang),
255 ("@", ElixirTokenType::At),
256 ("%", ElixirTokenType::Percent),
257 ];
258
259 for (op, token_type) in operators {
260 if state.starts_with(op) {
261 state.advance(op.len());
262 state.add_token(token_type, start, state.get_position());
263 return true;
264 }
265 }
266 false
267 }
268}