1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::HaskellLanguage, lexer::token_type::HaskellTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, HaskellLanguage>;
8
/// A hand-written lexer for Haskell source code.
///
/// Borrows the [`HaskellLanguage`] configuration for its `'config` lifetime;
/// construct one per language instance via [`HaskellLexer::new`].
#[derive(Clone)]
pub struct HaskellLexer<'config> {
    // Language configuration; currently only stored, not consulted during lexing.
    _config: &'config HaskellLanguage,
}
13
14impl<'config> HaskellLexer<'config> {
15 pub fn new(config: &'config HaskellLanguage) -> Self {
16 Self { _config: config }
17 }
18
19 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
20 let start_pos = state.get_position();
21 while let Some(ch) = state.peek() {
22 if ch == ' ' || ch == '\t' {
23 state.bump();
24 }
25 else {
26 break;
27 }
28 }
29
30 if state.get_position() > start_pos {
31 state.add_token(HaskellTokenType::Whitespace, start_pos, state.get_position());
32 true
33 }
34 else {
35 false
36 }
37 }
38
39 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
40 let start_pos = state.get_position();
41
42 if let Some('\n') = state.peek() {
43 state.bump();
44 state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
45 true
46 }
47 else if let Some('\r') = state.peek() {
48 state.bump();
49 if let Some('\n') = state.peek() {
50 state.bump();
51 }
52 state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
53 true
54 }
55 else {
56 false
57 }
58 }
59
60 fn lex_single_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
61 let start_pos = state.get_position();
62
63 if let Some('-') = state.peek() {
64 if let Some('-') = state.peek_next_n(1) {
65 state.advance(2);
66 while let Some(ch) = state.peek() {
67 if ch == '\n' || ch == '\r' {
68 break;
69 }
70 state.bump();
71 }
72 state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
73 true
74 }
75 else {
76 false
77 }
78 }
79 else {
80 false
81 }
82 }
83
84 fn lex_multi_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85 let start_pos = state.get_position();
86
87 if let Some('{') = state.peek() {
88 if let Some('-') = state.peek_next_n(1) {
89 state.advance(2);
90 let mut depth = 1;
91 while let Some(ch) = state.peek() {
92 if ch == '{' && state.peek_next_n(1) == Some('-') {
93 depth += 1;
94 state.advance(2)
95 }
96 else if ch == '-' && state.peek_next_n(1) == Some('}') {
97 depth -= 1;
98 state.advance(2);
99 if depth == 0 {
100 break;
101 }
102 }
103 else {
104 state.bump();
105 }
106 }
107 state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
108 true
109 }
110 else {
111 false
112 }
113 }
114 else {
115 false
116 }
117 }
118
119 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120 let start_pos = state.get_position();
121
122 if let Some(ch) = state.peek() {
123 if ch.is_ascii_alphabetic() || ch == '_' {
124 state.bump();
125
126 while let Some(ch) = state.peek() {
127 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
128 state.bump();
129 }
130 else {
131 break;
132 }
133 }
134
135 let end_pos = state.get_position();
136 let text = state.get_text_in((start_pos..end_pos).into());
137 let kind = self.keyword_or_identifier(text.as_ref());
138
139 state.add_token(kind, start_pos, end_pos);
140 true
141 }
142 else {
143 false
144 }
145 }
146 else {
147 false
148 }
149 }
150
151 fn keyword_or_identifier(&self, text: &str) -> HaskellTokenType {
152 match text {
153 "case" => HaskellTokenType::Case,
154 "class" => HaskellTokenType::Class,
155 "data" => HaskellTokenType::Data,
156 "default" => HaskellTokenType::Default,
157 "deriving" => HaskellTokenType::Deriving,
158 "do" => HaskellTokenType::Do,
159 "else" => HaskellTokenType::Else,
160 "if" => HaskellTokenType::If,
161 "import" => HaskellTokenType::Import,
162 "in" => HaskellTokenType::In,
163 "infix" => HaskellTokenType::Infix,
164 "infixl" => HaskellTokenType::Infixl,
165 "infixr" => HaskellTokenType::Infixr,
166 "instance" => HaskellTokenType::Instance,
167 "let" => HaskellTokenType::Let,
168 "module" => HaskellTokenType::Module,
169 "newtype" => HaskellTokenType::Newtype,
170 "of" => HaskellTokenType::Of,
171 "then" => HaskellTokenType::Then,
172 "type" => HaskellTokenType::Type,
173 "where" => HaskellTokenType::Where,
174 _ => HaskellTokenType::Identifier,
175 }
176 }
177
178 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
179 let start_pos = state.get_position();
180
181 if let Some(ch) = state.peek() {
182 if ch.is_ascii_digit() {
183 state.bump();
184
185 while let Some(ch) = state.peek() {
186 if ch.is_ascii_digit() {
187 state.bump();
188 }
189 else if ch == '.' {
190 state.bump();
191 while let Some(ch) = state.peek() {
192 if ch.is_ascii_digit() {
193 state.bump();
194 }
195 else {
196 break;
197 }
198 }
199 break;
200 }
201 else {
202 break;
203 }
204 }
205
206 state.add_token(HaskellTokenType::Number, start_pos, state.get_position());
207 true
208 }
209 else {
210 false
211 }
212 }
213 else {
214 false
215 }
216 }
217
218 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219 let start_pos = state.get_position();
220
221 if let Some('"') = state.peek() {
222 state.bump();
223
224 while let Some(ch) = state.peek() {
225 if ch == '"' {
226 state.bump();
227 state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
228 return true;
229 }
230 else if ch == '\\' {
231 state.bump();
232 if let Some(_) = state.peek() {
233 state.bump();
234 }
235 }
236 else {
237 state.bump();
238 }
239 }
240
241 state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
242 true
243 }
244 else {
245 false
246 }
247 }
248
249 fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
250 let start_pos = state.get_position();
251
252 if let Some('\'') = state.peek() {
253 state.bump();
254
255 if let Some(ch) = state.peek() {
256 if ch == '\\' {
257 state.bump();
258 if let Some(_) = state.peek() {
259 state.bump();
260 }
261 }
262 else if ch != '\'' {
263 state.bump();
264 }
265 }
266
267 if let Some('\'') = state.peek() {
268 state.bump();
269 state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
270 true
271 }
272 else {
273 state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
274 true
275 }
276 }
277 else {
278 false
279 }
280 }
281
282 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
283 let start_pos = state.get_position();
284
285 if let Some(ch) = state.peek() {
286 let token_kind = match ch {
287 '+' => {
288 state.bump();
289 if let Some('+') = state.peek() {
290 state.bump();
291 HaskellTokenType::Append
292 }
293 else {
294 HaskellTokenType::Plus
295 }
296 }
297 '-' => {
298 state.bump();
299 if let Some('>') = state.peek() {
300 state.bump();
301 HaskellTokenType::Arrow
302 }
303 else {
304 HaskellTokenType::Minus
305 }
306 }
307 '*' => {
308 state.bump();
309 HaskellTokenType::Star
310 }
311 '/' => {
312 state.bump();
313 HaskellTokenType::Slash
314 }
315 '=' => {
316 state.bump();
317 if let Some('=') = state.peek() {
318 state.bump();
319 HaskellTokenType::Equal
320 }
321 else {
322 HaskellTokenType::Assign
323 }
324 }
325 '<' => {
326 state.bump();
327 if let Some('=') = state.peek() {
328 state.bump();
329 HaskellTokenType::LessEqual
330 }
331 else if let Some('-') = state.peek() {
332 state.bump();
333 HaskellTokenType::LeftArrow
334 }
335 else {
336 HaskellTokenType::Less
337 }
338 }
339 '>' => {
340 state.bump();
341 if let Some('=') = state.peek() {
342 state.bump();
343 HaskellTokenType::GreaterEqual
344 }
345 else {
346 HaskellTokenType::Greater
347 }
348 }
349 ':' => {
350 state.bump();
351 if let Some(':') = state.peek() {
352 state.bump();
353 HaskellTokenType::DoubleColon
354 }
355 else {
356 HaskellTokenType::Colon
357 }
358 }
359 '|' => {
360 state.bump();
361 HaskellTokenType::Pipe
362 }
363 '&' => {
364 state.bump();
365 HaskellTokenType::Ampersand
366 }
367 '!' => {
368 state.bump();
369 HaskellTokenType::Bang
370 }
371 '?' => {
372 state.bump();
373 HaskellTokenType::Question
374 }
375 ';' => {
376 state.bump();
377 HaskellTokenType::Semicolon
378 }
379 ',' => {
380 state.bump();
381 HaskellTokenType::Comma
382 }
383 '.' => {
384 state.bump();
385 if let Some('.') = state.peek() {
386 state.bump();
387 HaskellTokenType::DoubleDot
388 }
389 else {
390 HaskellTokenType::Dot
391 }
392 }
393 '$' => {
394 state.bump();
395 HaskellTokenType::Dollar
396 }
397 '@' => {
398 state.bump();
399 HaskellTokenType::At
400 }
401 '~' => {
402 state.bump();
403 HaskellTokenType::Tilde
404 }
405 '\\' => {
406 state.bump();
407 HaskellTokenType::Backslash
408 }
409 '`' => {
410 state.bump();
411 HaskellTokenType::Backtick
412 }
413 _ => return false,
414 };
415
416 state.add_token(token_kind, start_pos, state.get_position());
417 true
418 }
419 else {
420 false
421 }
422 }
423
424 fn lex_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
425 let start_pos = state.get_position();
426
427 if let Some(ch) = state.peek() {
428 let token_kind = match ch {
429 '(' => {
430 state.bump();
431 HaskellTokenType::LeftParen
432 }
433 ')' => {
434 state.bump();
435 HaskellTokenType::RightParen
436 }
437 '[' => {
438 state.bump();
439 HaskellTokenType::LeftBracket
440 }
441 ']' => {
442 state.bump();
443 HaskellTokenType::RightBracket
444 }
445 '{' => {
446 state.bump();
447 HaskellTokenType::LeftBrace
448 }
449 '}' => {
450 state.bump();
451 HaskellTokenType::RightBrace
452 }
453 _ => return false,
454 };
455
456 state.add_token(token_kind, start_pos, state.get_position());
457 true
458 }
459 else {
460 false
461 }
462 }
463}
464
465impl<'config> Lexer<HaskellLanguage> for HaskellLexer<'config> {
466 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HaskellLanguage>) -> LexOutput<HaskellLanguage> {
467 let mut state = State::new(source);
468
469 while state.not_at_end() {
470 let safe_point = state.get_position();
471 if self.skip_whitespace(&mut state) {
472 continue;
473 }
474
475 if self.lex_newline(&mut state) {
476 continue;
477 }
478
479 if self.lex_single_line_comment(&mut state) {
480 continue;
481 }
482
483 if self.lex_multi_line_comment(&mut state) {
484 continue;
485 }
486
487 if self.lex_identifier_or_keyword(&mut state) {
488 continue;
489 }
490
491 if self.lex_number(&mut state) {
492 continue;
493 }
494
495 if self.lex_string(&mut state) {
496 continue;
497 }
498
499 if self.lex_char(&mut state) {
500 continue;
501 }
502
503 if self.lex_operators(&mut state) {
504 continue;
505 }
506
507 if self.lex_delimiters(&mut state) {
508 continue;
509 }
510
511 let start_pos = state.get_position();
513 if state.peek().is_some() {
514 state.advance(1);
515 state.add_token(HaskellTokenType::Error, start_pos, state.get_position())
516 }
517
518 state.advance_if_dead_lock(safe_point)
519 }
520
521 let pos = state.get_position();
523 state.add_token(HaskellTokenType::Eof, pos, pos);
524
525 state.finish_with_cache(Ok(()), cache)
526 }
527}