1use crate::{kind::PurescriptSyntaxKind, language::PurescriptLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the `oak_core` lexer state specialised to [`PurescriptLanguage`],
/// generic over the source type `S` it reads from.
type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
5
/// Lexer for PureScript source text.
///
/// Holds a borrowed reference to the language configuration for the lifetime
/// `'config`; the lexing routines visible in this file do not read it.
#[derive(Clone)]
pub struct PurescriptLexer<'config> {
    /// Borrowed language configuration (currently unused, hence the leading underscore).
    _config: &'config PurescriptLanguage,
}
10
11impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
12 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
13 let mut state = State::new_with_cache(source, 0, cache);
14 let result = self.run(&mut state);
15 if result.is_ok() {
16 state.add_eof();
17 }
18 state.finish_with_cache(result, cache)
19 }
20}
21
22impl<'config> PurescriptLexer<'config> {
23 pub fn new(config: &'config PurescriptLanguage) -> Self {
24 Self { _config: config }
25 }
26
27 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
28 while state.not_at_end() {
29 let safe_point = state.get_position();
30 if self.skip_whitespace(state) {
31 continue;
32 }
33
34 if self.lex_newline(state) {
35 continue;
36 }
37
38 if self.lex_comment(state) {
39 continue;
40 }
41
42 if self.lex_identifier_or_keyword(state) {
43 continue;
44 }
45
46 if self.lex_number_literal(state) {
47 continue;
48 }
49
50 if self.lex_string_literal(state) {
51 continue;
52 }
53
54 if self.lex_char_literal(state) {
55 continue;
56 }
57
58 if self.lex_operator(state) {
59 continue;
60 }
61
62 if self.lex_delimiter(state) {
63 continue;
64 }
65
66 let start_pos = state.get_position();
68 if let Some(ch) = state.peek() {
69 state.advance(ch.len_utf8());
70 state.add_token(PurescriptSyntaxKind::Error, start_pos, state.get_position());
71 }
72
73 state.advance_if_dead_lock(safe_point);
74 }
75
76 Ok(())
77 }
78
79 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81 let start_pos = state.get_position();
82
83 while let Some(ch) = state.peek() {
84 if ch == ' ' || ch == '\t' {
85 state.advance(ch.len_utf8());
86 }
87 else {
88 break;
89 }
90 }
91
92 if state.get_position() > start_pos {
93 state.add_token(PurescriptSyntaxKind::Whitespace, start_pos, state.get_position());
94 true
95 }
96 else {
97 false
98 }
99 }
100
101 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103 let start_pos = state.get_position();
104
105 if let Some('\n') = state.peek() {
106 state.advance(1);
107 state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
108 true
109 }
110 else if let Some('\r') = state.peek() {
111 state.advance(1);
112 if let Some('\n') = state.peek() {
113 state.advance(1);
114 }
115 state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
116 true
117 }
118 else {
119 false
120 }
121 }
122
123 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
125 let start_pos = state.get_position();
126
127 if let Some('-') = state.peek() {
128 state.advance(1);
129 if let Some('-') = state.peek() {
130 state.advance(1);
132 while let Some(ch) = state.peek() {
133 if ch == '\n' || ch == '\r' {
134 break;
135 }
136 state.advance(ch.len_utf8());
137 }
138 state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
139 true
140 }
141 else {
142 state.set_position(start_pos);
143 false
144 }
145 }
146 else if let Some('{') = state.peek() {
147 state.advance(1);
148 if let Some('-') = state.peek() {
149 state.advance(1);
151 let mut depth = 1;
152 while let Some(ch) = state.peek() {
153 if ch == '{' {
154 state.advance(1);
155 if let Some('-') = state.peek() {
156 depth += 1;
157 state.advance(1);
158 }
159 }
160 else if ch == '-' {
161 state.advance(1);
162 if let Some('}') = state.peek() {
163 depth -= 1;
164 state.advance(1);
165 if depth == 0 {
166 break;
167 }
168 }
169 }
170 else {
171 state.advance(ch.len_utf8());
172 }
173 }
174 state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
175 true
176 }
177 else {
178 state.set_position(start_pos);
179 false
180 }
181 }
182 else {
183 false
184 }
185 }
186
187 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189 let start_pos = state.get_position();
190
191 if let Some(ch) = state.peek() {
192 if ch.is_ascii_alphabetic() || ch == '_' {
193 state.advance(ch.len_utf8());
194
195 while let Some(ch) = state.peek() {
196 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
197 state.advance(ch.len_utf8());
198 }
199 else {
200 break;
201 }
202 }
203
204 let text = state.get_text_in((start_pos..state.get_position()).into());
206
207 let token_kind = match text.as_ref() {
208 "ado" => PurescriptSyntaxKind::Ado,
209 "case" => PurescriptSyntaxKind::Case,
210 "class" => PurescriptSyntaxKind::Class,
211 "data" => PurescriptSyntaxKind::Data,
212 "derive" => PurescriptSyntaxKind::Derive,
213 "do" => PurescriptSyntaxKind::Do,
214 "else" => PurescriptSyntaxKind::Else,
215 "false" => PurescriptSyntaxKind::False,
216 "forall" => PurescriptSyntaxKind::Forall,
217 "foreign" => PurescriptSyntaxKind::Foreign,
218 "if" => PurescriptSyntaxKind::If,
219 "import" => PurescriptSyntaxKind::Import,
220 "in" => PurescriptSyntaxKind::In,
221 "infix" => PurescriptSyntaxKind::Infix,
222 "infixl" => PurescriptSyntaxKind::Infixl,
223 "infixr" => PurescriptSyntaxKind::Infixr,
224 "instance" => PurescriptSyntaxKind::Instance,
225 "let" => PurescriptSyntaxKind::Let,
226 "module" => PurescriptSyntaxKind::Module,
227 "newtype" => PurescriptSyntaxKind::Newtype,
228 "of" => PurescriptSyntaxKind::Of,
229 "then" => PurescriptSyntaxKind::Then,
230 "true" => PurescriptSyntaxKind::True,
231 "type" => PurescriptSyntaxKind::Type,
232 "where" => PurescriptSyntaxKind::Where,
233 _ => PurescriptSyntaxKind::Identifier,
234 };
235 state.add_token(token_kind, start_pos, state.get_position());
236 true
237 }
238 else {
239 false
240 }
241 }
242 else {
243 false
244 }
245 }
246
247 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
249 let start_pos = state.get_position();
250
251 if let Some(ch) = state.peek() {
252 if ch.is_ascii_digit() {
253 state.advance(1);
254
255 if ch == '0' {
257 if let Some('x') | Some('X') = state.peek() {
258 state.advance(1);
259 while let Some(ch) = state.peek() {
260 if ch.is_ascii_hexdigit() {
261 state.advance(1);
262 }
263 else {
264 break;
265 }
266 }
267 }
268 else {
269 while let Some(ch) = state.peek() {
271 if ch.is_ascii_digit() {
272 state.advance(1);
273 }
274 else {
275 break;
276 }
277 }
278 }
279 }
280 else {
281 while let Some(ch) = state.peek() {
283 if ch.is_ascii_digit() {
284 state.advance(1);
285 }
286 else {
287 break;
288 }
289 }
290 }
291
292 if let Some('.') = state.peek() {
294 state.advance(1);
295 while let Some(ch) = state.peek() {
296 if ch.is_ascii_digit() {
297 state.advance(1);
298 }
299 else {
300 break;
301 }
302 }
303 }
304
305 if let Some('e') | Some('E') = state.peek() {
307 state.advance(1);
308 if let Some('+') | Some('-') = state.peek() {
309 state.advance(1);
310 }
311 while let Some(ch) = state.peek() {
312 if ch.is_ascii_digit() {
313 state.advance(1);
314 }
315 else {
316 break;
317 }
318 }
319 }
320
321 state.add_token(PurescriptSyntaxKind::NumberLiteral, start_pos, state.get_position());
322 true
323 }
324 else {
325 false
326 }
327 }
328 else {
329 false
330 }
331 }
332
333 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
335 let start_pos = state.get_position();
336
337 if let Some('"') = state.peek() {
338 state.advance(1);
339
340 while let Some(ch) = state.peek() {
341 if ch == '"' {
342 state.advance(1);
343 break;
344 }
345 else if ch == '\\' {
346 state.advance(1);
347 if let Some(_) = state.peek() {
348 state.advance(1);
349 }
350 }
351 else if ch == '\n' || ch == '\r' {
352 break; }
354 else {
355 state.advance(ch.len_utf8());
356 }
357 }
358
359 state.add_token(PurescriptSyntaxKind::StringLiteral, start_pos, state.get_position());
360 true
361 }
362 else {
363 false
364 }
365 }
366
367 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
369 let start_pos = state.get_position();
370
371 if let Some('\'') = state.peek() {
372 state.advance(1);
373
374 if let Some(ch) = state.peek() {
375 if ch == '\\' {
376 state.advance(1);
377 if let Some(_) = state.peek() {
378 state.advance(1);
379 }
380 }
381 else if ch != '\'' {
382 state.advance(ch.len_utf8());
383 }
384 }
385
386 if let Some('\'') = state.peek() {
387 state.advance(1);
388 state.add_token(PurescriptSyntaxKind::CharLiteral, start_pos, state.get_position());
389 true
390 }
391 else {
392 state.set_position(start_pos);
393 false
394 }
395 }
396 else {
397 false
398 }
399 }
400
401 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
403 let start_pos = state.get_position();
404
405 if let Some(ch) = state.peek() {
406 let token_kind = match ch {
407 '+' => {
408 state.advance(1);
409 PurescriptSyntaxKind::Plus
410 }
411 '-' => {
412 state.advance(1);
413 if let Some('>') = state.peek() {
414 state.advance(1);
415 PurescriptSyntaxKind::Arrow
416 }
417 else {
418 PurescriptSyntaxKind::Minus
419 }
420 }
421 '*' => {
422 state.advance(1);
423 if let Some('*') = state.peek() {
424 state.advance(1);
425 PurescriptSyntaxKind::Caret }
427 else {
428 PurescriptSyntaxKind::Star
429 }
430 }
431 '/' => {
432 state.advance(1);
433 if let Some('=') = state.peek() {
434 state.advance(1);
435 PurescriptSyntaxKind::NotEqual
436 }
437 else {
438 PurescriptSyntaxKind::Slash
439 }
440 }
441 '%' => {
442 state.advance(1);
443 PurescriptSyntaxKind::Percent
444 }
445 '=' => {
446 state.advance(1);
447 match state.peek() {
448 Some('=') => {
449 state.advance(1);
450 PurescriptSyntaxKind::Equal
451 }
452 Some('>') => {
453 state.advance(1);
454 PurescriptSyntaxKind::FatArrow
455 }
456 _ => PurescriptSyntaxKind::Equal,
457 }
458 }
459 '<' => {
460 state.advance(1);
461 match state.peek() {
462 Some('=') => {
463 state.advance(1);
464 PurescriptSyntaxKind::LessEqual
465 }
466 Some('-') => {
467 state.advance(1);
468 PurescriptSyntaxKind::Bind
469 }
470 _ => PurescriptSyntaxKind::Less,
471 }
472 }
473 '>' => {
474 state.advance(1);
475 if let Some('=') = state.peek() {
476 state.advance(1);
477 PurescriptSyntaxKind::GreaterEqual
478 }
479 else {
480 PurescriptSyntaxKind::Greater
481 }
482 }
483 '&' => {
484 state.advance(1);
485 if let Some('&') = state.peek() {
486 state.advance(1);
487 PurescriptSyntaxKind::And
488 }
489 else {
490 return false;
491 }
492 }
493 '|' => {
494 state.advance(1);
495 if let Some('|') = state.peek() {
496 state.advance(1);
497 PurescriptSyntaxKind::Or
498 }
499 else {
500 PurescriptSyntaxKind::Pipe
501 }
502 }
503 '\\' => {
504 state.advance(1);
505 PurescriptSyntaxKind::Backslash
506 }
507 _ => return false,
508 };
509
510 state.add_token(token_kind, start_pos, state.get_position());
511 true
512 }
513 else {
514 false
515 }
516 }
517
518 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
520 let start_pos = state.get_position();
521
522 if let Some(ch) = state.peek() {
523 let token_kind = match ch {
524 '(' => PurescriptSyntaxKind::LeftParen,
525 ')' => PurescriptSyntaxKind::RightParen,
526 '[' => PurescriptSyntaxKind::LeftBracket,
527 ']' => PurescriptSyntaxKind::RightBracket,
528 '{' => PurescriptSyntaxKind::LeftBrace,
529 '}' => PurescriptSyntaxKind::RightBrace,
530 ',' => PurescriptSyntaxKind::Comma,
531 ';' => PurescriptSyntaxKind::Semicolon,
532 '.' => PurescriptSyntaxKind::Dot,
533 ':' => {
534 state.advance(1);
535 if let Some(':') = state.peek() {
536 state.advance(1);
537 state.add_token(PurescriptSyntaxKind::ColonColon, start_pos, state.get_position());
538 return true;
539 }
540 else {
541 state.add_token(PurescriptSyntaxKind::Colon, start_pos, state.get_position());
542 return true;
543 }
544 }
545 '?' => PurescriptSyntaxKind::Question,
546 '_' => PurescriptSyntaxKind::Underscore,
547 '@' => PurescriptSyntaxKind::At,
548 _ => return false,
549 };
550
551 state.advance(ch.len_utf8());
552 state.add_token(token_kind, start_pos, state.get_position());
553 true
554 }
555 else {
556 false
557 }
558 }
559}