1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PurescriptLanguage, lexer::token_type::PurescriptTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
8
9#[derive(Clone)]
10pub struct PurescriptLexer<'config> {
11 _config: &'config PurescriptLanguage,
12}
13
14impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
15 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
16 let mut state = State::new_with_cache(source, 0, cache);
17 let result = self.run(&mut state);
18 if result.is_ok() {
19 state.add_eof();
20 }
21 state.finish_with_cache(result, cache)
22 }
23}
24
25impl<'config> PurescriptLexer<'config> {
26 pub fn new(config: &'config PurescriptLanguage) -> Self {
28 Self { _config: config }
29 }
30
31 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32 while state.not_at_end() {
33 let safe_point = state.get_position();
34 if self.skip_whitespace(state) {
35 continue;
36 }
37
38 if self.lex_newline(state) {
39 continue;
40 }
41
42 if self.lex_comment(state) {
43 continue;
44 }
45
46 if self.lex_identifier_or_keyword(state) {
47 continue;
48 }
49
50 if self.lex_number_literal(state) {
51 continue;
52 }
53
54 if self.lex_string_literal(state) {
55 continue;
56 }
57
58 if self.lex_char_literal(state) {
59 continue;
60 }
61
62 if self.lex_operator(state) {
63 continue;
64 }
65
66 if self.lex_delimiter(state) {
67 continue;
68 }
69
70 let start_pos = state.get_position();
72 if let Some(ch) = state.peek() {
73 state.advance(ch.len_utf8());
74 state.add_token(PurescriptTokenType::Error, start_pos, state.get_position())
75 }
76
77 state.advance_if_dead_lock(safe_point)
78 }
79
80 Ok(())
81 }
82
83 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85 let start_pos = state.get_position();
86
87 while let Some(ch) = state.peek() {
88 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
89 }
90
91 if state.get_position() > start_pos {
92 state.add_token(PurescriptTokenType::Whitespace, start_pos, state.get_position());
93 true
94 }
95 else {
96 false
97 }
98 }
99
100 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102 let start_pos = state.get_position();
103
104 if let Some('\n') = state.peek() {
105 state.advance(1);
106 state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
107 true
108 }
109 else if let Some('\r') = state.peek() {
110 state.advance(1);
111 if let Some('\n') = state.peek() {
112 state.advance(1)
113 }
114 state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
115 true
116 }
117 else {
118 false
119 }
120 }
121
122 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124 let start_pos = state.get_position();
125
126 if let Some('-') = state.peek() {
127 state.advance(1);
128 if let Some('-') = state.peek() {
129 state.advance(1);
131 while let Some(ch) = state.peek() {
132 if ch == '\n' || ch == '\r' {
133 break;
134 }
135 state.advance(ch.len_utf8())
136 }
137 state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
138 true
139 }
140 else {
141 state.set_position(start_pos);
142 false
143 }
144 }
145 else if let Some('{') = state.peek() {
146 state.advance(1);
147 if let Some('-') = state.peek() {
148 state.advance(1);
150 let mut depth = 1;
151 while let Some(ch) = state.peek() {
152 if ch == '{' {
153 state.advance(1);
154 if let Some('-') = state.peek() {
155 depth += 1;
156 state.advance(1)
157 }
158 }
159 else if ch == '-' {
160 state.advance(1);
161 if let Some('}') = state.peek() {
162 depth -= 1;
163 state.advance(1);
164 if depth == 0 {
165 break;
166 }
167 }
168 }
169 else {
170 state.advance(ch.len_utf8())
171 }
172 }
173 state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
174 true
175 }
176 else {
177 state.set_position(start_pos);
178 false
179 }
180 }
181 else {
182 false
183 }
184 }
185
186 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188 let start_pos = state.get_position();
189
190 if let Some(ch) = state.peek() {
191 if ch.is_ascii_alphabetic() || ch == '_' {
192 state.advance(ch.len_utf8());
193
194 while let Some(ch) = state.peek() {
195 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
196 state.advance(ch.len_utf8());
197 }
198 else {
199 break;
200 }
201 }
202
203 let text = state.get_text_in((start_pos..state.get_position()).into());
205
206 let token_kind = match text.as_ref() {
207 "ado" => PurescriptTokenType::Ado,
208 "case" => PurescriptTokenType::Case,
209 "class" => PurescriptTokenType::Class,
210 "data" => PurescriptTokenType::Data,
211 "derive" => PurescriptTokenType::Derive,
212 "do" => PurescriptTokenType::Do,
213 "else" => PurescriptTokenType::Else,
214 "false" => PurescriptTokenType::False,
215 "forall" => PurescriptTokenType::Forall,
216 "foreign" => PurescriptTokenType::Foreign,
217 "if" => PurescriptTokenType::If,
218 "import" => PurescriptTokenType::Import,
219 "in" => PurescriptTokenType::In,
220 "infix" => PurescriptTokenType::Infix,
221 "infixl" => PurescriptTokenType::Infixl,
222 "infixr" => PurescriptTokenType::Infixr,
223 "instance" => PurescriptTokenType::Instance,
224 "let" => PurescriptTokenType::Let,
225 "module" => PurescriptTokenType::Module,
226 "newtype" => PurescriptTokenType::Newtype,
227 "of" => PurescriptTokenType::Of,
228 "then" => PurescriptTokenType::Then,
229 "true" => PurescriptTokenType::True,
230 "type" => PurescriptTokenType::Type,
231 "where" => PurescriptTokenType::Where,
232 _ => PurescriptTokenType::Identifier,
233 };
234 state.add_token(token_kind, start_pos, state.get_position());
235 true
236 }
237 else {
238 false
239 }
240 }
241 else {
242 false
243 }
244 }
245
246 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
248 let start_pos = state.get_position();
249
250 if let Some(ch) = state.peek() {
251 if ch.is_ascii_digit() {
252 state.advance(1);
253
254 if ch == '0' {
256 if let Some('x') | Some('X') = state.peek() {
257 state.advance(1);
258 while let Some(ch) = state.peek() {
259 if ch.is_ascii_hexdigit() {
260 state.advance(1);
261 }
262 else {
263 break;
264 }
265 }
266 }
267 else {
268 while let Some(ch) = state.peek() {
270 if ch.is_ascii_digit() { state.advance(1) } else { break }
271 }
272 }
273 }
274 else {
275 while let Some(ch) = state.peek() {
277 if ch.is_ascii_digit() { state.advance(1) } else { break }
278 }
279 }
280
281 if let Some('.') = state.peek() {
283 state.advance(1);
284 while let Some(ch) = state.peek() {
285 if ch.is_ascii_digit() { state.advance(1) } else { break }
286 }
287 }
288
289 if let Some('e') | Some('E') = state.peek() {
291 state.advance(1);
292 if let Some('+') | Some('-') = state.peek() {
293 state.advance(1)
294 }
295 while let Some(ch) = state.peek() {
296 if ch.is_ascii_digit() { state.advance(1) } else { break }
297 }
298 }
299
300 state.add_token(PurescriptTokenType::NumberLiteral, start_pos, state.get_position());
301 true
302 }
303 else {
304 false
305 }
306 }
307 else {
308 false
309 }
310 }
311
312 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
314 let start_pos = state.get_position();
315
316 if let Some('"') = state.peek() {
317 state.advance(1);
318
319 while let Some(ch) = state.peek() {
320 if ch == '"' {
321 state.advance(1);
322 break;
323 }
324 else if ch == '\\' {
325 state.advance(1);
326 if let Some(_) = state.peek() {
327 state.advance(1)
328 }
329 }
330 else if ch == '\n' || ch == '\r' {
331 break; }
333 else {
334 state.advance(ch.len_utf8())
335 }
336 }
337
338 state.add_token(PurescriptTokenType::StringLiteral, start_pos, state.get_position());
339 true
340 }
341 else {
342 false
343 }
344 }
345
346 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
348 let start_pos = state.get_position();
349
350 if let Some('\'') = state.peek() {
351 state.advance(1);
352
353 if let Some(ch) = state.peek() {
354 if ch == '\\' {
355 state.advance(1);
356 if let Some(_) = state.peek() {
357 state.advance(1)
358 }
359 }
360 else if ch != '\'' {
361 state.advance(ch.len_utf8())
362 }
363 }
364
365 if let Some('\'') = state.peek() {
366 state.advance(1);
367 state.add_token(PurescriptTokenType::CharLiteral, start_pos, state.get_position());
368 true
369 }
370 else {
371 state.set_position(start_pos);
372 false
373 }
374 }
375 else {
376 false
377 }
378 }
379
380 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
382 let start_pos = state.get_position();
383
384 if let Some(ch) = state.peek() {
385 let token_kind = match ch {
386 '+' => {
387 state.advance(1);
388 PurescriptTokenType::Plus
389 }
390 '-' => {
391 state.advance(1);
392 if let Some('>') = state.peek() {
393 state.advance(1);
394 PurescriptTokenType::Arrow
395 }
396 else {
397 PurescriptTokenType::Minus
398 }
399 }
400 '*' => {
401 state.advance(1);
402 if let Some('*') = state.peek() {
403 state.advance(1);
404 PurescriptTokenType::Caret }
406 else {
407 PurescriptTokenType::Star
408 }
409 }
410 '/' => {
411 state.advance(1);
412 if let Some('=') = state.peek() {
413 state.advance(1);
414 PurescriptTokenType::NotEqual
415 }
416 else {
417 PurescriptTokenType::Slash
418 }
419 }
420 '%' => {
421 state.advance(1);
422 PurescriptTokenType::Percent
423 }
424 '=' => {
425 state.advance(1);
426 match state.peek() {
427 Some('=') => {
428 state.advance(1);
429 PurescriptTokenType::Equal
430 }
431 Some('>') => {
432 state.advance(1);
433 PurescriptTokenType::FatArrow
434 }
435 _ => PurescriptTokenType::Equal,
436 }
437 }
438 '<' => {
439 state.advance(1);
440 match state.peek() {
441 Some('=') => {
442 state.advance(1);
443 PurescriptTokenType::LessEqual
444 }
445 Some('-') => {
446 state.advance(1);
447 PurescriptTokenType::Bind
448 }
449 _ => PurescriptTokenType::Less,
450 }
451 }
452 '>' => {
453 state.advance(1);
454 if let Some('=') = state.peek() {
455 state.advance(1);
456 PurescriptTokenType::GreaterEqual
457 }
458 else {
459 PurescriptTokenType::Greater
460 }
461 }
462 '&' => {
463 state.advance(1);
464 if let Some('&') = state.peek() {
465 state.advance(1);
466 PurescriptTokenType::And
467 }
468 else {
469 return false;
470 }
471 }
472 '|' => {
473 state.advance(1);
474 if let Some('|') = state.peek() {
475 state.advance(1);
476 PurescriptTokenType::Or
477 }
478 else {
479 PurescriptTokenType::Pipe
480 }
481 }
482 '\\' => {
483 state.advance(1);
484 PurescriptTokenType::Backslash
485 }
486 _ => return false,
487 };
488
489 state.add_token(token_kind, start_pos, state.get_position());
490 true
491 }
492 else {
493 false
494 }
495 }
496
497 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
499 let start_pos = state.get_position();
500
501 if let Some(ch) = state.peek() {
502 let token_kind = match ch {
503 '(' => PurescriptTokenType::LeftParen,
504 ')' => PurescriptTokenType::RightParen,
505 '[' => PurescriptTokenType::LeftBracket,
506 ']' => PurescriptTokenType::RightBracket,
507 '{' => PurescriptTokenType::LeftBrace,
508 '}' => PurescriptTokenType::RightBrace,
509 ',' => PurescriptTokenType::Comma,
510 ';' => PurescriptTokenType::Semicolon,
511 '.' => PurescriptTokenType::Dot,
512 ':' => {
513 state.advance(1);
514 if let Some(':') = state.peek() {
515 state.advance(1);
516 state.add_token(PurescriptTokenType::ColonColon, start_pos, state.get_position());
517 return true;
518 }
519 else {
520 state.add_token(PurescriptTokenType::Colon, start_pos, state.get_position());
521 return true;
522 }
523 }
524 '?' => PurescriptTokenType::Question,
525 '_' => PurescriptTokenType::Underscore,
526 '@' => PurescriptTokenType::At,
527 '`' => PurescriptTokenType::Tick,
528 _ => return false,
529 };
530
531 state.advance(ch.len_utf8());
532 state.add_token(token_kind, start_pos, state.get_position());
533 true
534 }
535 else {
536 false
537 }
538 }
539}