1#![allow(clippy::upper_case_acronyms)]
2
3use logos::Logos;
4
5use miette::{Diagnostic, SourceSpan};
6use thiserror::Error;
7
8use claw_common::Source;
9
/// A single lexed token paired with the source span it was read from.
#[derive(Debug, PartialEq, Clone)]
pub struct TokenData {
    // The token kind, carrying its payload for literals and identifiers.
    pub token: Token,
    // Byte range of this token within the original source text.
    pub span: SourceSpan,
}
15
/// Diagnostic error produced when the input contains text that matches no
/// token rule; rendered by miette with the offending span highlighted.
#[derive(Error, Debug, Diagnostic)]
#[error("Unable to tokenize input")]
#[diagnostic()]
pub struct LexerError {
    // Full source being tokenized, so the diagnostic can show context.
    #[source_code]
    src: Source,
    // The span that failed to lex.
    #[label("Here")]
    span: SourceSpan,
}
25
26pub fn tokenize(src: Source, contents: &str) -> Result<Vec<TokenData>, LexerError> {
27 let lexer = Token::lexer(contents);
28
29 lexer
30 .spanned()
31 .map(|(token, span)| match token {
32 Ok(token) => Ok(TokenData {
33 token,
34 span: SourceSpan::from(span),
35 }),
36 Err(_error) => Err(LexerError {
37 src: src.clone(),
38 span: span.into(),
39 }),
40 })
41 .collect()
42}
43
/// Every lexical token in the language.
///
/// Whitespace and `//` line comments are skipped by the lexer. The `word`
/// subpattern is a run of all-lowercase or all-uppercase alphanumerics;
/// `id` chains words with `-` (kebab-case) and allows a leading `%`.
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(error = ())]
#[logos(skip r"[ \t\r\n\f]+")]
#[logos(skip r"//[^\n]*")]
#[logos(subpattern word = r"[a-z][a-z0-9]*|[A-Z][A-Z0-9]*")]
#[logos(subpattern id = r"%?(?&word)(-(?&word))*")]
pub enum Token {
    // --- Literals ---

    /// String literal: `"..."` with escapes, or a raw `r#"..."#` form
    /// (both handled by callbacks that consume past the trigger char).
    #[token("\"", parse_string_literal)]
    #[token("r", parse_raw_string_literal)]
    StringLiteral(String),

    /// Integer literal in decimal, binary (`0b`), or hex (`0x`);
    /// `_` is allowed as a digit separator after the first digit.
    #[regex(r"[0-9][_0-9]*", |lex| parse_decint_literal(lex.slice()))]
    #[regex(r"0b[01][_01]*", |lex| parse_bin_literal(lex.slice()))]
    #[regex(r"0x[0-9a-fA-F][_0-9a-fA-F]*", |lex| parse_hex_literal(lex.slice()))]
    IntLiteral(u64),

    /// Decimal float literal; digits are required on both sides of the dot.
    #[regex(r"[0-9][_0-9]*\.[0-9][_0-9]*", |lex| parse_decfloat_literal(lex.slice()))]
    FloatLiteral(f64),

    /// Identifier matching the kebab-case `id` subpattern above.
    #[regex(r"(?&id)", |lex| lex.slice().to_string())]
    Identifier(String),

    // --- Keywords ---

    #[token("export")]
    Export,

    #[token("import")]
    Import,

    #[token("from")]
    From,

    #[token("func")]
    Func,

    #[token("if")]
    If,

    #[token("for")]
    For,

    #[token("in")]
    In,

    #[token("loop")]
    Loop,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("result")]
    Result,

    // --- Primitive type names ---

    #[token("string")]
    String,

    #[token("u8")]
    U8,

    #[token("u16")]
    U16,

    #[token("u32")]
    U32,

    #[token("u64")]
    U64,

    #[token("s8")]
    S8,

    #[token("s16")]
    S16,

    #[token("s32")]
    S32,

    #[token("s64")]
    S64,

    #[token("f32")]
    F32,

    #[token("f64")]
    F64,

    // --- More keywords ---

    #[token("as")]
    As,

    #[token("at")]
    At,

    #[token("let")]
    Let,

    #[token("mut")]
    Mut,

    #[token("bool")]
    Bool,

    #[token("true")]
    True,

    #[token("false")]
    False,

    // --- Delimiters and punctuation ---

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token(",")]
    Comma,

    #[token(".")]
    Dot,

    #[token("..")]
    Range,

    #[token(":")]
    Colon,

    #[token(";")]
    Semicolon,

    #[token("=")]
    Assign,

    #[token("->")]
    Arrow,

    // --- Arithmetic operators ---

    #[token("+")]
    Add,

    #[token("-")]
    Sub,

    #[token("*")]
    Mult,

    #[token("/")]
    Div,

    #[token("%")]
    Mod,

    // --- Logical and bitwise operators ---

    #[token("!")]
    Invert,

    #[token("and")]
    LogicalAnd,

    #[token("or")]
    LogicalOr,

    #[token("|")]
    BitOr,

    #[token("&")]
    BitAnd,

    #[token("^")]
    BitXor,

    #[token("<<")]
    BitShiftL,

    #[token(">>")]
    BitShiftR,

    #[token(">>>")]
    ArithShiftR,

    // --- Compound assignment operators ---

    #[token("|=")]
    BitOrAssign,

    #[token("&=")]
    BitAndAssign,

    #[token("^=")]
    BitXorAssign,

    #[token("+=")]
    AddAssign,

    #[token("-=")]
    SubAssign,

    #[token("*=")]
    StarAssign,

    #[token("/=")]
    DivAssign,

    // --- Comparison operators ---

    #[token("<")]
    LT,

    #[token("<=")]
    LTE,

    #[token(">")]
    GT,

    #[token(">=")]
    GTE,

    #[token("==")]
    EQ,

    #[token("!=")]
    NEQ,
}
353
354impl std::fmt::Display for Token {
355 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
356 match self {
357 Token::StringLiteral(s) => write!(f, "\"{}\"", s),
358 Token::IntLiteral(i) => write!(f, "{}", i),
359 Token::FloatLiteral(float) => write!(f, "{:?}", float),
360 Token::Identifier(ident) => write!(f, "{}", ident),
361 Token::Export => write!(f, "export"),
362 Token::Import => write!(f, "import"),
363 Token::From => write!(f, "from"),
364 Token::Func => write!(f, "func"),
365 Token::If => write!(f, "if"),
366 Token::For => write!(f, "for"),
367 Token::In => write!(f, "in"),
368 Token::Loop => write!(f, "loop"),
369 Token::Break => write!(f, "break"),
370 Token::Continue => write!(f, "continue"),
371 Token::Return => write!(f, "return"),
372 Token::Result => write!(f, "result"),
373 Token::String => write!(f, "string"),
374 Token::U8 => write!(f, "u8"),
375 Token::U16 => write!(f, "u16"),
376 Token::U32 => write!(f, "u32"),
377 Token::U64 => write!(f, "u64"),
378 Token::S8 => write!(f, "S8"),
379 Token::S16 => write!(f, "S16"),
380 Token::S32 => write!(f, "S32"),
381 Token::S64 => write!(f, "s64"),
382 Token::F32 => write!(f, "f32"),
383 Token::F64 => write!(f, "f64"),
384 Token::As => write!(f, "as"),
385 Token::At => write!(f, "at"),
386 Token::Let => write!(f, "let"),
387 Token::Mut => write!(f, "mut"),
388 Token::Bool => write!(f, "bool"),
389 Token::True => write!(f, "true"),
390 Token::False => write!(f, "false"),
391 Token::LParen => write!(f, "("),
392 Token::RParen => write!(f, ")"),
393 Token::LBrace => write!(f, "{{"),
394 Token::RBrace => write!(f, "}}"),
395 Token::LBracket => write!(f, "["),
396 Token::RBracket => write!(f, "]"),
397 Token::Comma => write!(f, ","),
398 Token::Dot => write!(f, "."),
399 Token::Range => write!(f, ".."),
400 Token::Colon => write!(f, ":"),
401 Token::Semicolon => write!(f, ";"),
402 Token::Assign => write!(f, "="),
403 Token::Arrow => write!(f, "->"),
404 Token::Add => write!(f, "+"),
405 Token::Sub => write!(f, "-"),
406 Token::Mult => write!(f, "*"),
407 Token::Div => write!(f, "/"),
408 Token::Mod => write!(f, "%"),
409 Token::Invert => write!(f, "!"),
410 Token::LogicalAnd => write!(f, "and"),
411 Token::LogicalOr => write!(f, "or"),
412 Token::BitOr => write!(f, "|"),
413 Token::BitAnd => write!(f, "&"),
414 Token::BitXor => write!(f, "^"),
415 Token::BitShiftL => write!(f, "<<"),
416 Token::BitShiftR => write!(f, ">>"),
417 Token::ArithShiftR => write!(f, ">>>"),
418 Token::BitOrAssign => write!(f, "|="),
419 Token::BitAndAssign => write!(f, "&="),
420 Token::BitXorAssign => write!(f, "^="),
421 Token::AddAssign => write!(f, "+="),
422 Token::SubAssign => write!(f, "-="),
423 Token::StarAssign => write!(f, "*="),
424 Token::DivAssign => write!(f, "/="),
425 Token::LT => write!(f, "<"),
426 Token::LTE => write!(f, "<="),
427 Token::GT => write!(f, ">"),
428 Token::GTE => write!(f, ">="),
429 Token::EQ => write!(f, "=="),
430 Token::NEQ => write!(f, "!="),
431 }
432 }
433}
434
/// Logos callback for `"`-initiated string literals.
///
/// Walks the lexer's remainder char by char, translating backslash escapes
/// via `parse_escaped_char`, until the closing `"`. Returns `None` (a lexer
/// error) if the input ends before a closing quote is found.
fn parse_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let mut c_iter = lex.remainder().chars();
    let mut buf = String::new();

    while let Some(c) = c_iter.next() {
        // Closing quote: consume it and finish the literal.
        if c == '"' {
            lex.bump(1);
            return Some(buf);
        }

        if c == '\\' {
            // Consume the backslash, then however many bytes the escape
            // body reports having used (ASCII-only, so chars == bytes).
            lex.bump(1);
            if let Some((c_esc, c_len)) = parse_escaped_char(&mut c_iter) {
                lex.bump(c_len);
                buf.push(c_esc);
            }
            // NOTE(review): when the escape is invalid, parse_escaped_char
            // has already pulled chars off `c_iter` that are never bump()ed,
            // so the lexer span desynchronizes from the iterator — confirm
            // whether an invalid escape should abort the literal instead.
        } else {
            lex.bump(c.len_utf8());
            buf.push(c);
        }
    }

    // Ran out of input before a closing quote: tokenization error.
    None
}
462
/// Decodes the body of one escape sequence (the part after `\`).
///
/// Returns the decoded character together with the number of bytes consumed
/// from the iterator (all escape bodies are ASCII, so chars == bytes), or
/// `None` if the sequence is unrecognized or malformed.
fn parse_escaped_char(lex: &mut std::str::Chars) -> Option<(char, usize)> {
    match lex.next()? {
        '\"' => Some(('\"', 1)),
        '\\' => Some(('\\', 1)),
        '/' => Some(('/', 1)),
        'b' => Some(('\u{0008}', 1)),
        'f' => Some(('\u{000C}', 1)),
        'n' => Some(('\n', 1)),
        'r' => Some(('\r', 1)),
        't' => Some(('\t', 1)),
        'u' => {
            // Exactly four hex digits follow, e.g. `\u0041` -> 'A'.
            let mut digits = String::with_capacity(4);
            for _ in 0..4 {
                digits.push(lex.next()?);
            }

            let code_point = u32::from_str_radix(&digits, 16).ok()?;
            // 5 = the 'u' itself plus the four hex digits.
            std::char::from_u32(code_point).map(|c| (c, 5))
        }
        _ => None,
    }
}
492
493fn parse_raw_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
495 let mut c_iter = lex.remainder().chars();
496 let mut buf = String::new();
497
498 let mut starting_hashes = 0;
499 let mut starting_quote = false;
500
501 while let Some(c) = c_iter.next() {
502 lex.bump(c.len_utf8());
503 if c == '"' {
504 starting_quote = true;
505 break;
506 }
507 if c == '#' {
508 starting_hashes += 1;
509 } else {
510 return None;
511 }
512 }
513
514 if !starting_quote {
515 return None;
516 }
517
518 let mut seen_quote = false;
519 let mut hash_count = 0;
520
521 while let Some(c) = c_iter.next() {
522 lex.bump(c.len_utf8());
523 if seen_quote && c == '#' {
524 hash_count += 1;
525
526 if hash_count == starting_hashes {
527 return Some(buf);
528 }
529 continue;
530 }
531
532 if seen_quote {
534 buf.push('"');
535 }
536 seen_quote = false;
538
539 for _ in 0..hash_count {
541 buf.push('#');
542 }
543 hash_count = 0;
545
546 if c == '"' {
547 seen_quote = true;
548 } else {
549 buf.push(c);
550 }
551 }
552
553 None
554}
555
/// Parses a decimal integer literal, ignoring `_` digit separators.
fn parse_decint_literal(s: &str) -> Option<u64> {
    let digits: String = s.chars().filter(|&c| c != '_').collect();
    digits.parse().ok()
}
559
/// Parses a decimal float literal, ignoring `_` digit separators.
fn parse_decfloat_literal(s: &str) -> Option<f64> {
    let digits: String = s.chars().filter(|&c| c != '_').collect();
    digits.parse().ok()
}
563
/// Parses a binary integer literal, skipping the `0b` prefix (guaranteed
/// by the token regex) and ignoring `_` digit separators.
fn parse_bin_literal(s: &str) -> Option<u64> {
    let digits: String = s[2..].chars().filter(|&c| c != '_').collect();
    u64::from_str_radix(&digits, 2).ok()
}
567
/// Parses a hexadecimal integer literal, skipping the `0x` prefix
/// (guaranteed by the token regex) and ignoring `_` digit separators.
fn parse_hex_literal(s: &str) -> Option<u64> {
    let digits: String = s[2..].chars().filter(|&c| c != '_').collect();
    u64::from_str_radix(&digits, 16).ok()
}
571
#[cfg(test)]
mod test {
    use super::*;
    use claw_common::make_source;
    use pretty_assertions::assert_eq;

    // Lexes a function signature and checks every token and its byte span.
    #[test]
    fn tokenize_func_declaration() {
        let contents = "func test(a: u32) -> u32";
        let src = make_source("test", contents);
        let ident_test = Token::Identifier("test".to_owned());
        let ident_a = Token::Identifier("a".to_owned());
        let output = vec![
            (Token::Func, SourceSpan::from(0..4)),
            (ident_test, SourceSpan::from(5..9)),
            (Token::LParen, SourceSpan::from(9..10)),
            (ident_a, SourceSpan::from(10..11)),
            (Token::Colon, SourceSpan::from(11..12)),
            (Token::U32, SourceSpan::from(13..16)),
            (Token::RParen, SourceSpan::from(16..17)),
            (Token::Arrow, SourceSpan::from(18..20)),
            (Token::U32, SourceSpan::from(21..24)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }

    // Lexes a let-binding with an escaped-quote string literal; the escape
    // must be decoded and the span must cover the full quoted source text.
    #[test]
    fn tokenize_let() {
        let contents = r#"let a = "asdf\"";"#;
        let src = make_source("test", contents);
        let ident_a = Token::Identifier("a".to_owned());
        let string_asdf = Token::StringLiteral(String::from(r#"asdf""#));
        let output = vec![
            (Token::Let, SourceSpan::from(0..3)),
            (ident_a, SourceSpan::from(4..5)),
            (Token::Assign, SourceSpan::from(6..7)),
            (string_asdf, SourceSpan::from(8..16)),
            (Token::Semicolon, SourceSpan::from(16..17)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }

    // Helper: pairs a token with its span as TokenData for comparison.
    fn to_token_data(d: (Token, SourceSpan)) -> TokenData {
        TokenData {
            token: d.0,
            span: d.1,
        }
    }
}