1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::HaskellLanguage, lexer::token_type::HaskellTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
/// Shorthand for the core lexer state specialised to [`HaskellLanguage`].
pub(crate) type State<'a, S> = LexerState<'a, S, HaskellLanguage>;
8
/// A hand-written, character-at-a-time lexer for Haskell source text.
#[derive(Clone)]
pub struct HaskellLexer<'config> {
    // Borrowed language configuration. NOTE(review): not read by any of the
    // scanning routines visible in this file — presumably reserved for
    // configuration-driven lexing; confirm before removing.
    config: &'config HaskellLanguage,
}
15
16impl<'config> HaskellLexer<'config> {
17 pub fn new(config: &'config HaskellLanguage) -> Self {
19 Self { config }
20 }
21
22 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
23 let start_pos = state.get_position();
24 while let Some(ch) = state.peek() {
25 if ch == ' ' || ch == '\t' {
26 state.bump();
27 }
28 else {
29 break;
30 }
31 }
32
33 if state.get_position() > start_pos {
34 state.add_token(HaskellTokenType::Whitespace, start_pos, state.get_position());
35 true
36 }
37 else {
38 false
39 }
40 }
41
42 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43 let start_pos = state.get_position();
44
45 if let Some('\n') = state.peek() {
46 state.bump();
47 state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
48 true
49 }
50 else if let Some('\r') = state.peek() {
51 state.bump();
52 if let Some('\n') = state.peek() {
53 state.bump();
54 }
55 state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
56 true
57 }
58 else {
59 false
60 }
61 }
62
63 fn lex_single_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
64 let start_pos = state.get_position();
65
66 if let Some('-') = state.peek() {
67 if let Some('-') = state.peek_next_n(1) {
68 state.advance(2);
69 while let Some(ch) = state.peek() {
70 if ch == '\n' || ch == '\r' {
71 break;
72 }
73 state.bump();
74 }
75 state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
76 true
77 }
78 else {
79 false
80 }
81 }
82 else {
83 false
84 }
85 }
86
87 fn lex_multi_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88 let start_pos = state.get_position();
89
90 if let Some('{') = state.peek() {
91 if let Some('-') = state.peek_next_n(1) {
92 state.advance(2);
93 let mut depth = 1;
94 while let Some(ch) = state.peek() {
95 if ch == '{' && state.peek_next_n(1) == Some('-') {
96 depth += 1;
97 state.advance(2)
98 }
99 else if ch == '-' && state.peek_next_n(1) == Some('}') {
100 depth -= 1;
101 state.advance(2);
102 if depth == 0 {
103 break;
104 }
105 }
106 else {
107 state.bump();
108 }
109 }
110 state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
111 true
112 }
113 else {
114 false
115 }
116 }
117 else {
118 false
119 }
120 }
121
122 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
123 let start_pos = state.get_position();
124
125 if let Some(ch) = state.peek() {
126 if ch.is_ascii_alphabetic() || ch == '_' {
127 state.bump();
128
129 while let Some(ch) = state.peek() {
130 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
131 state.bump();
132 }
133 else {
134 break;
135 }
136 }
137
138 let end_pos = state.get_position();
139 let text = state.get_text_in((start_pos..end_pos).into());
140 let kind = self.keyword_or_identifier(text.as_ref());
141
142 state.add_token(kind, start_pos, end_pos);
143 true
144 }
145 else {
146 false
147 }
148 }
149 else {
150 false
151 }
152 }
153
154 fn keyword_or_identifier(&self, text: &str) -> HaskellTokenType {
155 match text {
156 "case" => HaskellTokenType::Case,
157 "class" => HaskellTokenType::Class,
158 "data" => HaskellTokenType::Data,
159 "default" => HaskellTokenType::Default,
160 "deriving" => HaskellTokenType::Deriving,
161 "do" => HaskellTokenType::Do,
162 "else" => HaskellTokenType::Else,
163 "if" => HaskellTokenType::If,
164 "import" => HaskellTokenType::Import,
165 "in" => HaskellTokenType::In,
166 "infix" => HaskellTokenType::Infix,
167 "infixl" => HaskellTokenType::Infixl,
168 "infixr" => HaskellTokenType::Infixr,
169 "instance" => HaskellTokenType::Instance,
170 "let" => HaskellTokenType::Let,
171 "module" => HaskellTokenType::Module,
172 "newtype" => HaskellTokenType::Newtype,
173 "of" => HaskellTokenType::Of,
174 "then" => HaskellTokenType::Then,
175 "type" => HaskellTokenType::Type,
176 "where" => HaskellTokenType::Where,
177 _ => HaskellTokenType::Identifier,
178 }
179 }
180
181 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
182 let start_pos = state.get_position();
183
184 if let Some(ch) = state.peek() {
185 if ch.is_ascii_digit() {
186 state.bump();
187
188 while let Some(ch) = state.peek() {
189 if ch.is_ascii_digit() {
190 state.bump();
191 }
192 else if ch == '.' {
193 state.bump();
194 while let Some(ch) = state.peek() {
195 if ch.is_ascii_digit() {
196 state.bump();
197 }
198 else {
199 break;
200 }
201 }
202 break;
203 }
204 else {
205 break;
206 }
207 }
208
209 state.add_token(HaskellTokenType::Number, start_pos, state.get_position());
210 true
211 }
212 else {
213 false
214 }
215 }
216 else {
217 false
218 }
219 }
220
221 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222 let start_pos = state.get_position();
223
224 if let Some('"') = state.peek() {
225 state.bump();
226
227 while let Some(ch) = state.peek() {
228 if ch == '"' {
229 state.bump();
230 state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
231 return true;
232 }
233 else if ch == '\\' {
234 state.bump();
235 if let Some(_) = state.peek() {
236 state.bump();
237 }
238 }
239 else {
240 state.bump();
241 }
242 }
243
244 state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
245 true
246 }
247 else {
248 false
249 }
250 }
251
252 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
253 let start_pos = state.get_position();
254
255 if let Some('\'') = state.peek() {
256 state.bump();
257
258 if let Some(ch) = state.peek() {
259 if ch == '\\' {
260 state.bump();
261 if let Some(_) = state.peek() {
262 state.bump();
263 }
264 }
265 else if ch != '\'' {
266 state.bump();
267 }
268 }
269
270 if let Some('\'') = state.peek() {
271 state.bump();
272 state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
273 true
274 }
275 else {
276 state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
277 true
278 }
279 }
280 else {
281 false
282 }
283 }
284
285 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
286 let start_pos = state.get_position();
287
288 if let Some(ch) = state.peek() {
289 let token_kind = match ch {
290 '+' => {
291 state.bump();
292 if let Some('+') = state.peek() {
293 state.bump();
294 HaskellTokenType::Append
295 }
296 else {
297 HaskellTokenType::Plus
298 }
299 }
300 '-' => {
301 state.bump();
302 if let Some('>') = state.peek() {
303 state.bump();
304 HaskellTokenType::Arrow
305 }
306 else {
307 HaskellTokenType::Minus
308 }
309 }
310 '*' => {
311 state.bump();
312 HaskellTokenType::Star
313 }
314 '/' => {
315 state.bump();
316 HaskellTokenType::Slash
317 }
318 '=' => {
319 state.bump();
320 if let Some('=') = state.peek() {
321 state.bump();
322 HaskellTokenType::Equal
323 }
324 else {
325 HaskellTokenType::Assign
326 }
327 }
328 '<' => {
329 state.bump();
330 if let Some('=') = state.peek() {
331 state.bump();
332 HaskellTokenType::LessEqual
333 }
334 else if let Some('-') = state.peek() {
335 state.bump();
336 HaskellTokenType::LeftArrow
337 }
338 else {
339 HaskellTokenType::Less
340 }
341 }
342 '>' => {
343 state.bump();
344 if let Some('=') = state.peek() {
345 state.bump();
346 HaskellTokenType::GreaterEqual
347 }
348 else {
349 HaskellTokenType::Greater
350 }
351 }
352 ':' => {
353 state.bump();
354 if let Some(':') = state.peek() {
355 state.bump();
356 HaskellTokenType::DoubleColon
357 }
358 else {
359 HaskellTokenType::Colon
360 }
361 }
362 '|' => {
363 state.bump();
364 HaskellTokenType::Pipe
365 }
366 '&' => {
367 state.bump();
368 HaskellTokenType::Ampersand
369 }
370 '!' => {
371 state.bump();
372 HaskellTokenType::Bang
373 }
374 '?' => {
375 state.bump();
376 HaskellTokenType::Question
377 }
378 ';' => {
379 state.bump();
380 HaskellTokenType::Semicolon
381 }
382 ',' => {
383 state.bump();
384 HaskellTokenType::Comma
385 }
386 '.' => {
387 state.bump();
388 if let Some('.') = state.peek() {
389 state.bump();
390 HaskellTokenType::DoubleDot
391 }
392 else {
393 HaskellTokenType::Dot
394 }
395 }
396 '$' => {
397 state.bump();
398 HaskellTokenType::Dollar
399 }
400 '@' => {
401 state.bump();
402 HaskellTokenType::At
403 }
404 '~' => {
405 state.bump();
406 HaskellTokenType::Tilde
407 }
408 '\\' => {
409 state.bump();
410 HaskellTokenType::Backslash
411 }
412 '`' => {
413 state.bump();
414 HaskellTokenType::Backtick
415 }
416 _ => return false,
417 };
418
419 state.add_token(token_kind, start_pos, state.get_position());
420 true
421 }
422 else {
423 false
424 }
425 }
426
427 fn lex_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
428 let start_pos = state.get_position();
429
430 if let Some(ch) = state.peek() {
431 let token_kind = match ch {
432 '(' => {
433 state.bump();
434 HaskellTokenType::LeftParen
435 }
436 ')' => {
437 state.bump();
438 HaskellTokenType::RightParen
439 }
440 '[' => {
441 state.bump();
442 HaskellTokenType::LeftBracket
443 }
444 ']' => {
445 state.bump();
446 HaskellTokenType::RightBracket
447 }
448 '{' => {
449 state.bump();
450 HaskellTokenType::LeftBrace
451 }
452 '}' => {
453 state.bump();
454 HaskellTokenType::RightBrace
455 }
456 _ => return false,
457 };
458
459 state.add_token(token_kind, start_pos, state.get_position());
460 true
461 }
462 else {
463 false
464 }
465 }
466}
467
468impl<'config> Lexer<HaskellLanguage> for HaskellLexer<'config> {
469 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HaskellLanguage>) -> LexOutput<HaskellLanguage> {
470 let mut state = State::new(source);
471
472 while state.not_at_end() {
473 let safe_point = state.get_position();
474 if self.skip_whitespace(&mut state) {
475 continue;
476 }
477
478 if self.lex_newline(&mut state) {
479 continue;
480 }
481
482 if self.lex_single_line_comment(&mut state) {
483 continue;
484 }
485
486 if self.lex_multi_line_comment(&mut state) {
487 continue;
488 }
489
490 if self.lex_identifier_or_keyword(&mut state) {
491 continue;
492 }
493
494 if self.lex_number(&mut state) {
495 continue;
496 }
497
498 if self.lex_string(&mut state) {
499 continue;
500 }
501
502 if self.lex_char(&mut state) {
503 continue;
504 }
505
506 if self.lex_operators(&mut state) {
507 continue;
508 }
509
510 if self.lex_delimiters(&mut state) {
511 continue;
512 }
513
514 let start_pos = state.get_position();
516 if state.peek().is_some() {
517 state.advance(1);
518 state.add_token(HaskellTokenType::Error, start_pos, state.get_position())
519 }
520
521 state.advance_if_dead_lock(safe_point)
522 }
523
524 let pos = state.get_position();
526 state.add_token(HaskellTokenType::Eof, pos, pos);
527
528 state.finish_with_cache(Ok(()), cache)
529 }
530}