1use crate::{kind::PurescriptSyntaxKind, language::PurescriptLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, PurescriptLanguage>;
5
6#[derive(Clone)]
7pub struct PurescriptLexer<'config> {
8 config: &'config PurescriptLanguage,
9}
10
11impl<'config> PurescriptLexer<'config> {
12 pub fn new(config: &'config PurescriptLanguage) -> Self {
13 Self { config }
14 }
15
16 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17 while state.not_at_end() {
18 if self.skip_whitespace(state) {
19 continue;
20 }
21
22 if self.lex_newline(state) {
23 continue;
24 }
25
26 if self.lex_comment(state) {
27 continue;
28 }
29
30 if self.lex_identifier_or_keyword(state) {
31 continue;
32 }
33
34 if self.lex_number_literal(state) {
35 continue;
36 }
37
38 if self.lex_string_literal(state) {
39 continue;
40 }
41
42 if self.lex_char_literal(state) {
43 continue;
44 }
45
46 if self.lex_operator(state) {
47 continue;
48 }
49
50 if self.lex_delimiter(state) {
51 continue;
52 }
53
54 let start_pos = state.get_position();
56 if let Some(ch) = state.peek() {
57 state.advance(ch.len_utf8());
58 state.add_token(PurescriptSyntaxKind::Error, start_pos, state.get_position());
59 }
60 }
61
62 let eof_pos = state.get_position();
64 state.add_token(PurescriptSyntaxKind::Eof, eof_pos, eof_pos);
65
66 Ok(())
67 }
68
69 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
71 let start_pos = state.get_position();
72
73 while let Some(ch) = state.peek() {
74 if ch == ' ' || ch == '\t' {
75 state.advance(ch.len_utf8());
76 }
77 else {
78 break;
79 }
80 }
81
82 if state.get_position() > start_pos {
83 state.add_token(PurescriptSyntaxKind::Whitespace, start_pos, state.get_position());
84 true
85 }
86 else {
87 false
88 }
89 }
90
91 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
93 let start_pos = state.get_position();
94
95 if let Some('\n') = state.peek() {
96 state.advance(1);
97 state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
98 true
99 }
100 else if let Some('\r') = state.peek() {
101 state.advance(1);
102 if let Some('\n') = state.peek() {
103 state.advance(1);
104 }
105 state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
106 true
107 }
108 else {
109 false
110 }
111 }
112
113 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
115 let start_pos = state.get_position();
116
117 if let Some('-') = state.peek() {
118 state.advance(1);
119 if let Some('-') = state.peek() {
120 state.advance(1);
122 while let Some(ch) = state.peek() {
123 if ch == '\n' || ch == '\r' {
124 break;
125 }
126 state.advance(ch.len_utf8());
127 }
128 state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
129 true
130 }
131 else {
132 state.set_position(start_pos);
133 false
134 }
135 }
136 else if let Some('{') = state.peek() {
137 state.advance(1);
138 if let Some('-') = state.peek() {
139 state.advance(1);
141 let mut depth = 1;
142 while let Some(ch) = state.peek() {
143 if ch == '{' {
144 state.advance(1);
145 if let Some('-') = state.peek() {
146 depth += 1;
147 state.advance(1);
148 }
149 }
150 else if ch == '-' {
151 state.advance(1);
152 if let Some('}') = state.peek() {
153 depth -= 1;
154 state.advance(1);
155 if depth == 0 {
156 break;
157 }
158 }
159 }
160 else {
161 state.advance(ch.len_utf8());
162 }
163 }
164 state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
165 true
166 }
167 else {
168 state.set_position(start_pos);
169 false
170 }
171 }
172 else {
173 false
174 }
175 }
176
177 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
179 let start_pos = state.get_position();
180
181 if let Some(ch) = state.peek() {
182 if ch.is_ascii_alphabetic() || ch == '_' {
183 state.advance(ch.len_utf8());
184
185 while let Some(ch) = state.peek() {
186 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
187 state.advance(ch.len_utf8());
188 }
189 else {
190 break;
191 }
192 }
193
194 let text = state.get_text_in((start_pos..state.get_position()).into());
196
197 let token_kind = match text.as_ref() {
198 "ado" => PurescriptSyntaxKind::Ado,
199 "case" => PurescriptSyntaxKind::Case,
200 "class" => PurescriptSyntaxKind::Class,
201 "data" => PurescriptSyntaxKind::Data,
202 "derive" => PurescriptSyntaxKind::Derive,
203 "do" => PurescriptSyntaxKind::Do,
204 "else" => PurescriptSyntaxKind::Else,
205 "false" => PurescriptSyntaxKind::False,
206 "forall" => PurescriptSyntaxKind::Forall,
207 "foreign" => PurescriptSyntaxKind::Foreign,
208 "if" => PurescriptSyntaxKind::If,
209 "import" => PurescriptSyntaxKind::Import,
210 "in" => PurescriptSyntaxKind::In,
211 "infix" => PurescriptSyntaxKind::Infix,
212 "infixl" => PurescriptSyntaxKind::Infixl,
213 "infixr" => PurescriptSyntaxKind::Infixr,
214 "instance" => PurescriptSyntaxKind::Instance,
215 "let" => PurescriptSyntaxKind::Let,
216 "module" => PurescriptSyntaxKind::Module,
217 "newtype" => PurescriptSyntaxKind::Newtype,
218 "of" => PurescriptSyntaxKind::Of,
219 "then" => PurescriptSyntaxKind::Then,
220 "true" => PurescriptSyntaxKind::True,
221 "type" => PurescriptSyntaxKind::Type,
222 "where" => PurescriptSyntaxKind::Where,
223 _ => PurescriptSyntaxKind::Identifier,
224 };
225 state.add_token(token_kind, start_pos, state.get_position());
226 true
227 }
228 else {
229 false
230 }
231 }
232 else {
233 false
234 }
235 }
236
237 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
239 let start_pos = state.get_position();
240
241 if let Some(ch) = state.peek() {
242 if ch.is_ascii_digit() {
243 state.advance(1);
244
245 if ch == '0' {
247 if let Some('x') | Some('X') = state.peek() {
248 state.advance(1);
249 while let Some(ch) = state.peek() {
250 if ch.is_ascii_hexdigit() {
251 state.advance(1);
252 }
253 else {
254 break;
255 }
256 }
257 }
258 else {
259 while let Some(ch) = state.peek() {
261 if ch.is_ascii_digit() {
262 state.advance(1);
263 }
264 else {
265 break;
266 }
267 }
268 }
269 }
270 else {
271 while let Some(ch) = state.peek() {
273 if ch.is_ascii_digit() {
274 state.advance(1);
275 }
276 else {
277 break;
278 }
279 }
280 }
281
282 if let Some('.') = state.peek() {
284 state.advance(1);
285 while let Some(ch) = state.peek() {
286 if ch.is_ascii_digit() {
287 state.advance(1);
288 }
289 else {
290 break;
291 }
292 }
293 }
294
295 if let Some('e') | Some('E') = state.peek() {
297 state.advance(1);
298 if let Some('+') | Some('-') = state.peek() {
299 state.advance(1);
300 }
301 while let Some(ch) = state.peek() {
302 if ch.is_ascii_digit() {
303 state.advance(1);
304 }
305 else {
306 break;
307 }
308 }
309 }
310
311 state.add_token(PurescriptSyntaxKind::NumberLiteral, start_pos, state.get_position());
312 true
313 }
314 else {
315 false
316 }
317 }
318 else {
319 false
320 }
321 }
322
323 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
325 let start_pos = state.get_position();
326
327 if let Some('"') = state.peek() {
328 state.advance(1);
329
330 while let Some(ch) = state.peek() {
331 if ch == '"' {
332 state.advance(1);
333 break;
334 }
335 else if ch == '\\' {
336 state.advance(1);
337 if let Some(_) = state.peek() {
338 state.advance(1);
339 }
340 }
341 else if ch == '\n' || ch == '\r' {
342 break; }
344 else {
345 state.advance(ch.len_utf8());
346 }
347 }
348
349 state.add_token(PurescriptSyntaxKind::StringLiteral, start_pos, state.get_position());
350 true
351 }
352 else {
353 false
354 }
355 }
356
357 fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
359 let start_pos = state.get_position();
360
361 if let Some('\'') = state.peek() {
362 state.advance(1);
363
364 if let Some(ch) = state.peek() {
365 if ch == '\\' {
366 state.advance(1);
367 if let Some(_) = state.peek() {
368 state.advance(1);
369 }
370 }
371 else if ch != '\'' {
372 state.advance(ch.len_utf8());
373 }
374 }
375
376 if let Some('\'') = state.peek() {
377 state.advance(1);
378 state.add_token(PurescriptSyntaxKind::CharLiteral, start_pos, state.get_position());
379 true
380 }
381 else {
382 state.set_position(start_pos);
383 false
384 }
385 }
386 else {
387 false
388 }
389 }
390
391 fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
393 let start_pos = state.get_position();
394
395 if let Some(ch) = state.peek() {
396 let token_kind = match ch {
397 '+' => {
398 state.advance(1);
399 PurescriptSyntaxKind::Plus
400 }
401 '-' => {
402 state.advance(1);
403 if let Some('>') = state.peek() {
404 state.advance(1);
405 PurescriptSyntaxKind::Arrow
406 }
407 else {
408 PurescriptSyntaxKind::Minus
409 }
410 }
411 '*' => {
412 state.advance(1);
413 if let Some('*') = state.peek() {
414 state.advance(1);
415 PurescriptSyntaxKind::Caret }
417 else {
418 PurescriptSyntaxKind::Star
419 }
420 }
421 '/' => {
422 state.advance(1);
423 if let Some('=') = state.peek() {
424 state.advance(1);
425 PurescriptSyntaxKind::NotEqual
426 }
427 else {
428 PurescriptSyntaxKind::Slash
429 }
430 }
431 '%' => {
432 state.advance(1);
433 PurescriptSyntaxKind::Percent
434 }
435 '=' => {
436 state.advance(1);
437 match state.peek() {
438 Some('=') => {
439 state.advance(1);
440 PurescriptSyntaxKind::Equal
441 }
442 Some('>') => {
443 state.advance(1);
444 PurescriptSyntaxKind::FatArrow
445 }
446 _ => PurescriptSyntaxKind::Equal,
447 }
448 }
449 '<' => {
450 state.advance(1);
451 match state.peek() {
452 Some('=') => {
453 state.advance(1);
454 PurescriptSyntaxKind::LessEqual
455 }
456 Some('-') => {
457 state.advance(1);
458 PurescriptSyntaxKind::Bind
459 }
460 _ => PurescriptSyntaxKind::Less,
461 }
462 }
463 '>' => {
464 state.advance(1);
465 if let Some('=') = state.peek() {
466 state.advance(1);
467 PurescriptSyntaxKind::GreaterEqual
468 }
469 else {
470 PurescriptSyntaxKind::Greater
471 }
472 }
473 '&' => {
474 state.advance(1);
475 if let Some('&') = state.peek() {
476 state.advance(1);
477 PurescriptSyntaxKind::And
478 }
479 else {
480 return false;
481 }
482 }
483 '|' => {
484 state.advance(1);
485 if let Some('|') = state.peek() {
486 state.advance(1);
487 PurescriptSyntaxKind::Or
488 }
489 else {
490 PurescriptSyntaxKind::Pipe
491 }
492 }
493 '\\' => {
494 state.advance(1);
495 PurescriptSyntaxKind::Backslash
496 }
497 _ => return false,
498 };
499
500 state.add_token(token_kind, start_pos, state.get_position());
501 true
502 }
503 else {
504 false
505 }
506 }
507
508 fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
510 let start_pos = state.get_position();
511
512 if let Some(ch) = state.peek() {
513 let token_kind = match ch {
514 '(' => PurescriptSyntaxKind::LeftParen,
515 ')' => PurescriptSyntaxKind::RightParen,
516 '[' => PurescriptSyntaxKind::LeftBracket,
517 ']' => PurescriptSyntaxKind::RightBracket,
518 '{' => PurescriptSyntaxKind::LeftBrace,
519 '}' => PurescriptSyntaxKind::RightBrace,
520 ',' => PurescriptSyntaxKind::Comma,
521 ';' => PurescriptSyntaxKind::Semicolon,
522 '.' => PurescriptSyntaxKind::Dot,
523 ':' => {
524 state.advance(1);
525 if let Some(':') = state.peek() {
526 state.advance(1);
527 state.add_token(PurescriptSyntaxKind::ColonColon, start_pos, state.get_position());
528 return true;
529 }
530 else {
531 state.add_token(PurescriptSyntaxKind::Colon, start_pos, state.get_position());
532 return true;
533 }
534 }
535 '?' => PurescriptSyntaxKind::Question,
536 '_' => PurescriptSyntaxKind::Underscore,
537 '@' => PurescriptSyntaxKind::At,
538 _ => return false,
539 };
540
541 state.advance(ch.len_utf8());
542 state.add_token(token_kind, start_pos, state.get_position());
543 true
544 }
545 else {
546 false
547 }
548 }
549}
550
551impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
552 fn lex_incremental(
553 &self,
554 source: impl Source,
555 changed: usize,
556 cache: IncrementalCache<PurescriptLanguage>,
557 ) -> LexOutput<PurescriptLanguage> {
558 let mut state = LexerState::new_with_cache(source, changed, cache);
559 let result = self.run(&mut state);
560 state.finish(result)
561 }
562}