1#![doc = include_str!("readme.md")]
2
3use oak_core::Source;
4pub mod token_type;
6
7use crate::{language::ElmLanguage, lexer::token_type::ElmTokenType};
8use oak_core::{
9 Lexer, LexerCache, LexerState,
10 errors::OakError,
11 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
12};
13use std::sync::LazyLock;
14
/// Lexer state specialised to [`ElmLanguage`]; the lifetime `'s` and source type `S`
/// are passed straight through to [`LexerState`].
type State<'s, S> = LexerState<'s, S, ElmLanguage>;

/// Whitespace scanner configuration, built on first use, with `unicode_whitespace` enabled.
static ELM_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: `--` line comments and `{- ... -}` block comments,
/// with `nested_blocks` enabled.
static ELM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "--", block_start: "{-", block_end: "-}", nested_blocks: true });
/// String scanner configuration: `"` as the quote character and `\` as the escape character.
static ELM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
/// Character-literal scanner configuration: `'` as the quote character and `\` as the escape character.
static ELM_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
21
/// Lexer that turns Elm source text into a stream of [`ElmTokenType`] tokens.
///
/// The lexer borrows its [`ElmLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct ElmLexer<'config> {
    /// Language configuration handed to [`ElmLexer::new`].
    config: &'config ElmLanguage,
}
27
28impl<'config> Lexer<ElmLanguage> for ElmLexer<'config> {
29 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<ElmLanguage>) -> LexOutput<ElmLanguage> {
30 let mut state = State::new_with_cache(source, 0, cache);
31 let result = self.run(&mut state);
32 if result.is_ok() {
33 state.add_eof();
34 }
35 state.finish_with_cache(result, cache)
36 }
37}
38
39impl<'config> ElmLexer<'config> {
40 pub fn new(config: &'config ElmLanguage) -> Self {
42 Self { config }
43 }
44
45 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
46 while state.not_at_end() {
47 let safe_point = state.get_position();
48
49 if self.skip_whitespace(state) {
50 continue;
51 }
52
53 if self.skip_comment(state) {
54 continue;
55 }
56
57 if self.lex_string_literal(state) {
58 continue;
59 }
60
61 if self.lex_char_literal(state) {
62 continue;
63 }
64
65 if self.lex_number_literal(state) {
66 continue;
67 }
68
69 if self.lex_identifier_or_keyword(state) {
70 continue;
71 }
72
73 if self.lex_operators(state) {
74 continue;
75 }
76
77 state.advance_if_dead_lock(safe_point);
78 }
79
80 Ok(())
81 }
82
83 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
85 ELM_WHITESPACE.scan(state, ElmTokenType::Whitespace)
86 }
87
88 fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89 ELM_COMMENT.scan(state, ElmTokenType::Comment, ElmTokenType::Comment)
90 }
91
92 fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93 ELM_STRING.scan(state, ElmTokenType::String)
94 }
95
96 fn lex_char_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
97 ELM_CHAR.scan(state, ElmTokenType::Char)
98 }
99
100 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
101 let start = state.get_position();
102 let first = match state.peek() {
103 Some(c) => c,
104 None => return false,
105 };
106 if !first.is_ascii_digit() {
107 return false;
108 }
109 let mut is_float = false;
110
111 state.advance(1);
112 state.take_while(|c| c.is_ascii_digit() || c == '_');
113
114 if state.peek() == Some('.') {
116 let n1 = state.peek_next_n(1);
117 if n1.map(|c| c.is_ascii_digit()).unwrap_or(false) {
118 is_float = true;
119 state.advance(1); state.take_while(|c| c.is_ascii_digit() || c == '_');
121 }
122 }
123 if let Some(c) = state.peek() {
125 if c == 'e' || c == 'E' {
126 let n1 = state.peek_next_n(1);
127 if n1 == Some('+') || n1 == Some('-') || n1.map(|d| d.is_ascii_digit()).unwrap_or(false) {
128 is_float = true;
129 state.advance(1);
130 if let Some(sign) = state.peek() {
131 if sign == '+' || sign == '-' {
132 state.advance(1);
133 }
134 }
135 state.take_while(|d| d.is_ascii_digit() || d == '_');
136 }
137 }
138 }
139
140 let end = state.get_position();
141 state.add_token(if is_float { ElmTokenType::Float } else { ElmTokenType::Number }, start, end);
142 true
143 }
144
145 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
146 let start = state.get_position();
147
148 if let Some(ch) = state.peek() {
149 if ch.is_alphabetic() || ch == '_' {
150 state.advance(ch.len_utf8());
151 state.take_while(|next_ch| next_ch.is_alphanumeric() || next_ch == '_');
152
153 let text = state.get_text_in((start..state.get_position()).into());
154 let kind = match text.as_ref() {
155 "if" => ElmTokenType::If,
156 "then" => ElmTokenType::Then,
157 "else" => ElmTokenType::Else,
158 "case" => ElmTokenType::Case,
159 "of" => ElmTokenType::Of,
160 "let" => ElmTokenType::Let,
161 "in" => ElmTokenType::In,
162 "type" => ElmTokenType::Type,
163 "alias" => ElmTokenType::Alias,
164 "module" => ElmTokenType::Module,
165 "where" => ElmTokenType::Where,
166 "import" => ElmTokenType::Import,
167 "exposing" => ElmTokenType::Exposing,
168 "as" => ElmTokenType::As,
169 "port" => ElmTokenType::Port,
170 _ => ElmTokenType::Identifier,
171 };
172
173 state.add_token(kind, start, state.get_position());
174 return true;
175 }
176 }
177 false
178 }
179
180 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
181 let start = state.get_position();
182
183 let ops = [
185 ("==", ElmTokenType::EqualEqual),
186 ("/=", ElmTokenType::NotEqual),
187 ("<=", ElmTokenType::LessEqual),
188 (">=", ElmTokenType::GreaterEqual),
189 ("&&", ElmTokenType::DoubleAmpersand),
190 ("||", ElmTokenType::DoublePipe),
191 ("++", ElmTokenType::DoublePlus),
192 ("<<", ElmTokenType::DoubleLess),
193 (">>", ElmTokenType::DoubleGreater),
194 ("|>", ElmTokenType::PipeGreater),
195 ("->", ElmTokenType::Arrow),
196 ("...", ElmTokenType::TripleDot),
197 ("..", ElmTokenType::DoubleDot),
198 ("//", ElmTokenType::DoubleSlash),
199 ];
200
201 for (pattern, kind) in ops {
202 if state.consume_if_starts_with(pattern) {
203 state.add_token(kind, start, state.get_position());
204 return true;
205 }
206 }
207
208 if let Some(ch) = state.peek() {
210 let kind = match ch {
211 '+' => ElmTokenType::Plus,
212 '-' => ElmTokenType::Minus,
213 '*' => ElmTokenType::Star,
214 '/' => ElmTokenType::Slash,
215 '=' => ElmTokenType::Equal,
216 '<' => ElmTokenType::Less,
217 '>' => ElmTokenType::Greater,
218 '^' => ElmTokenType::Caret,
219 '|' => ElmTokenType::Pipe,
220 '(' => ElmTokenType::LeftParen,
221 ')' => ElmTokenType::RightParen,
222 '{' => ElmTokenType::LeftBrace,
223 '}' => ElmTokenType::RightBrace,
224 '[' => ElmTokenType::LeftBracket,
225 ']' => ElmTokenType::RightBracket,
226 ',' => ElmTokenType::Comma,
227 ';' => ElmTokenType::Semicolon,
228 '.' => ElmTokenType::Dot,
229 ':' => ElmTokenType::Colon,
230 '\\' => ElmTokenType::Backslash,
231 '%' => ElmTokenType::Percent,
232 '\n' => ElmTokenType::Newline,
233 _ => return false,
234 };
235
236 state.advance(ch.len_utf8());
237 state.add_token(kind, start, state.get_position());
238 return true;
239 }
240
241 false
242 }
243}