1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PurescriptLanguage, lexer::token_type::PurescriptTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
7pub(crate) type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
8
9#[derive(Clone)]
10pub struct PurescriptLexer<'config> {
12 config: &'config PurescriptLanguage,
13}
14
15impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
16 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
17 let mut state = State::new_with_cache(source, 0, cache);
18 let result = self.run(&mut state);
19 if result.is_ok() {
20 state.add_eof();
21 }
22 state.finish_with_cache(result, cache)
23 }
24}
25
26impl<'config> PurescriptLexer<'config> {
27 pub fn new(config: &'config PurescriptLanguage) -> Self {
30 Self { config }
31 }
32
33 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34 while state.not_at_end() {
35 let safe_point = state.get_position();
36 if self.skip_whitespace(state) {
37 continue;
38 }
39
40 if self.lex_newline(state) {
41 continue;
42 }
43
44 if self.lex_comment(state) {
45 continue;
46 }
47
48 if self.lex_identifier_or_keyword(state) {
49 continue;
50 }
51
52 if self.lex_number_literal(state) {
53 continue;
54 }
55
56 if self.lex_string_literal(state) {
57 continue;
58 }
59
60 if self.lex_char_literal(state) {
61 continue;
62 }
63
64 if self.lex_operator(state) {
65 continue;
66 }
67
68 if self.lex_delimiter(state) {
69 continue;
70 }
71
72 let start_pos = state.get_position();
74 if let Some(ch) = state.peek() {
75 state.advance(ch.len_utf8());
76 state.add_token(PurescriptTokenType::Error, start_pos, state.get_position())
77 }
78
79 state.advance_if_dead_lock(safe_point)
80 }
81
82 Ok(())
83 }
84
85 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87 let start_pos = state.get_position();
88
89 while let Some(ch) = state.peek() {
90 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
91 }
92
93 if state.get_position() > start_pos {
94 state.add_token(PurescriptTokenType::Whitespace, start_pos, state.get_position());
95 true
96 }
97 else {
98 false
99 }
100 }
101
102 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104 let start_pos = state.get_position();
105
106 if let Some('\n') = state.peek() {
107 state.advance(1);
108 state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
109 true
110 }
111 else if let Some('\r') = state.peek() {
112 state.advance(1);
113 if let Some('\n') = state.peek() {
114 state.advance(1)
115 }
116 state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
117 true
118 }
119 else {
120 false
121 }
122 }
123
124 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126 let start_pos = state.get_position();
127
128 if let Some('-') = state.peek() {
129 state.advance(1);
130 if let Some('-') = state.peek() {
131 state.advance(1);
133 while let Some(ch) = state.peek() {
134 if ch == '\n' || ch == '\r' {
135 break;
136 }
137 state.advance(ch.len_utf8())
138 }
139 state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
140 true
141 }
142 else {
143 state.set_position(start_pos);
144 false
145 }
146 }
147 else if let Some('{') = state.peek() {
148 state.advance(1);
149 if let Some('-') = state.peek() {
150 state.advance(1);
152 let mut depth = 1;
153 while let Some(ch) = state.peek() {
154 if ch == '{' {
155 state.advance(1);
156 if let Some('-') = state.peek() {
157 depth += 1;
158 state.advance(1)
159 }
160 }
161 else if ch == '-' {
162 state.advance(1);
163 if let Some('}') = state.peek() {
164 depth -= 1;
165 state.advance(1);
166 if depth == 0 {
167 break;
168 }
169 }
170 }
171 else {
172 state.advance(ch.len_utf8())
173 }
174 }
175 state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
176 true
177 }
178 else {
179 state.set_position(start_pos);
180 false
181 }
182 }
183 else {
184 false
185 }
186 }
187
188 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
190 let start_pos = state.get_position();
191
192 if let Some(ch) = state.peek() {
193 if ch.is_ascii_alphabetic() || ch == '_' {
194 state.advance(ch.len_utf8());
195
196 while let Some(ch) = state.peek() {
197 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
198 state.advance(ch.len_utf8());
199 }
200 else {
201 break;
202 }
203 }
204
205 let text = state.get_text_in((start_pos..state.get_position()).into());
207
208 let token_kind = match text.as_ref() {
209 "ado" => PurescriptTokenType::Ado,
210 "case" => PurescriptTokenType::Case,
211 "class" => PurescriptTokenType::Class,
212 "data" => PurescriptTokenType::Data,
213 "derive" => PurescriptTokenType::Derive,
214 "do" => PurescriptTokenType::Do,
215 "else" => PurescriptTokenType::Else,
216 "false" => PurescriptTokenType::False,
217 "forall" => PurescriptTokenType::Forall,
218 "foreign" => PurescriptTokenType::Foreign,
219 "if" => PurescriptTokenType::If,
220 "import" => PurescriptTokenType::Import,
221 "in" => PurescriptTokenType::In,
222 "infix" => PurescriptTokenType::Infix,
223 "infixl" => PurescriptTokenType::Infixl,
224 "infixr" => PurescriptTokenType::Infixr,
225 "instance" => PurescriptTokenType::Instance,
226 "let" => PurescriptTokenType::Let,
227 "module" => PurescriptTokenType::Module,
228 "newtype" => PurescriptTokenType::Newtype,
229 "of" => PurescriptTokenType::Of,
230 "then" => PurescriptTokenType::Then,
231 "true" => PurescriptTokenType::True,
232 "type" => PurescriptTokenType::Type,
233 "where" => PurescriptTokenType::Where,
234 _ => PurescriptTokenType::Identifier,
235 };
236 state.add_token(token_kind, start_pos, state.get_position());
237 true
238 }
239 else {
240 false
241 }
242 }
243 else {
244 false
245 }
246 }
247
248 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
250 let start_pos = state.get_position();
251
252 if let Some(ch) = state.peek() {
253 if ch.is_ascii_digit() {
254 state.advance(1);
255
256 if ch == '0' {
258 if let Some('x') | Some('X') = state.peek() {
259 state.advance(1);
260 while let Some(ch) = state.peek() {
261 if ch.is_ascii_hexdigit() {
262 state.advance(1);
263 }
264 else {
265 break;
266 }
267 }
268 }
269 else {
270 while let Some(ch) = state.peek() {
272 if ch.is_ascii_digit() { state.advance(1) } else { break }
273 }
274 }
275 }
276 else {
277 while let Some(ch) = state.peek() {
279 if ch.is_ascii_digit() { state.advance(1) } else { break }
280 }
281 }
282
283 if let Some('.') = state.peek() {
285 state.advance(1);
286 while let Some(ch) = state.peek() {
287 if ch.is_ascii_digit() { state.advance(1) } else { break }
288 }
289 }
290
291 if let Some('e') | Some('E') = state.peek() {
293 state.advance(1);
294 if let Some('+') | Some('-') = state.peek() {
295 state.advance(1)
296 }
297 while let Some(ch) = state.peek() {
298 if ch.is_ascii_digit() { state.advance(1) } else { break }
299 }
300 }
301
302 state.add_token(PurescriptTokenType::NumberLiteral, start_pos, state.get_position());
303 true
304 }
305 else {
306 false
307 }
308 }
309 else {
310 false
311 }
312 }
313
314 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
316 let start_pos = state.get_position();
317
318 if let Some('"') = state.peek() {
319 state.advance(1);
320
321 while let Some(ch) = state.peek() {
322 if ch == '"' {
323 state.advance(1);
324 break;
325 }
326 else if ch == '\\' {
327 state.advance(1);
328 if let Some(_) = state.peek() {
329 state.advance(1)
330 }
331 }
332 else if ch == '\n' || ch == '\r' {
333 break; }
335 else {
336 state.advance(ch.len_utf8())
337 }
338 }
339
340 state.add_token(PurescriptTokenType::StringLiteral, start_pos, state.get_position());
341 true
342 }
343 else {
344 false
345 }
346 }
347
348 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
350 let start_pos = state.get_position();
351
352 if let Some('\'') = state.peek() {
353 state.advance(1);
354
355 if let Some(ch) = state.peek() {
356 if ch == '\\' {
357 state.advance(1);
358 if let Some(_) = state.peek() {
359 state.advance(1)
360 }
361 }
362 else if ch != '\'' {
363 state.advance(ch.len_utf8())
364 }
365 }
366
367 if let Some('\'') = state.peek() {
368 state.advance(1);
369 state.add_token(PurescriptTokenType::CharLiteral, start_pos, state.get_position());
370 true
371 }
372 else {
373 state.set_position(start_pos);
374 false
375 }
376 }
377 else {
378 false
379 }
380 }
381
382 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
384 let start_pos = state.get_position();
385
386 if let Some(ch) = state.peek() {
387 let token_kind = match ch {
388 '+' => {
389 state.advance(1);
390 PurescriptTokenType::Plus
391 }
392 '-' => {
393 state.advance(1);
394 if let Some('>') = state.peek() {
395 state.advance(1);
396 PurescriptTokenType::Arrow
397 }
398 else {
399 PurescriptTokenType::Minus
400 }
401 }
402 '*' => {
403 state.advance(1);
404 if let Some('*') = state.peek() {
405 state.advance(1);
406 PurescriptTokenType::Caret }
408 else {
409 PurescriptTokenType::Star
410 }
411 }
412 '/' => {
413 state.advance(1);
414 if let Some('=') = state.peek() {
415 state.advance(1);
416 PurescriptTokenType::NotEqual
417 }
418 else {
419 PurescriptTokenType::Slash
420 }
421 }
422 '%' => {
423 state.advance(1);
424 PurescriptTokenType::Percent
425 }
426 '=' => {
427 state.advance(1);
428 match state.peek() {
429 Some('=') => {
430 state.advance(1);
431 PurescriptTokenType::Equal
432 }
433 Some('>') => {
434 state.advance(1);
435 PurescriptTokenType::FatArrow
436 }
437 _ => PurescriptTokenType::Equal,
438 }
439 }
440 '<' => {
441 state.advance(1);
442 match state.peek() {
443 Some('=') => {
444 state.advance(1);
445 PurescriptTokenType::LessEqual
446 }
447 Some('-') => {
448 state.advance(1);
449 PurescriptTokenType::Bind
450 }
451 _ => PurescriptTokenType::Less,
452 }
453 }
454 '>' => {
455 state.advance(1);
456 if let Some('=') = state.peek() {
457 state.advance(1);
458 PurescriptTokenType::GreaterEqual
459 }
460 else {
461 PurescriptTokenType::Greater
462 }
463 }
464 '&' => {
465 state.advance(1);
466 if let Some('&') = state.peek() {
467 state.advance(1);
468 PurescriptTokenType::And
469 }
470 else {
471 return false;
472 }
473 }
474 '|' => {
475 state.advance(1);
476 if let Some('|') = state.peek() {
477 state.advance(1);
478 PurescriptTokenType::Or
479 }
480 else {
481 PurescriptTokenType::Pipe
482 }
483 }
484 '\\' => {
485 state.advance(1);
486 PurescriptTokenType::Backslash
487 }
488 _ => return false,
489 };
490
491 state.add_token(token_kind, start_pos, state.get_position());
492 true
493 }
494 else {
495 false
496 }
497 }
498
499 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
501 let start_pos = state.get_position();
502
503 if let Some(ch) = state.peek() {
504 let token_kind = match ch {
505 '(' => PurescriptTokenType::LeftParen,
506 ')' => PurescriptTokenType::RightParen,
507 '[' => PurescriptTokenType::LeftBracket,
508 ']' => PurescriptTokenType::RightBracket,
509 '{' => PurescriptTokenType::LeftBrace,
510 '}' => PurescriptTokenType::RightBrace,
511 ',' => PurescriptTokenType::Comma,
512 ';' => PurescriptTokenType::Semicolon,
513 '.' => PurescriptTokenType::Dot,
514 ':' => {
515 state.advance(1);
516 if let Some(':') = state.peek() {
517 state.advance(1);
518 state.add_token(PurescriptTokenType::ColonColon, start_pos, state.get_position());
519 return true;
520 }
521 else {
522 state.add_token(PurescriptTokenType::Colon, start_pos, state.get_position());
523 return true;
524 }
525 }
526 '?' => PurescriptTokenType::Question,
527 '_' => PurescriptTokenType::Underscore,
528 '@' => PurescriptTokenType::At,
529 '`' => PurescriptTokenType::Tick,
530 _ => return false,
531 };
532
533 state.advance(ch.len_utf8());
534 state.add_token(token_kind, start_pos, state.get_position());
535 true
536 }
537 else {
538 false
539 }
540 }
541}