claw_parser/
lexer.rs

#![allow(clippy::upper_case_acronyms)]

use logos::Logos;

use miette::{Diagnostic, SourceSpan};
use thiserror::Error;

use claw_common::Source;

#[derive(Debug, PartialEq, Clone)]
pub struct TokenData {
    pub token: Token,
    pub span: SourceSpan,
}

#[derive(Error, Debug, Diagnostic)]
#[error("Unable to tokenize input")]
#[diagnostic()]
pub struct LexerError {
    #[source_code]
    src: Source,
    #[label("Here")]
    span: SourceSpan,
}

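/// Tokenizes the full input, returning every token paired with its source
/// span, or a `LexerError` pointing at the first input that could not be
/// tokenized.
///
/// A minimal usage sketch (assuming a `Source` built with
/// `claw_common::make_source`, as the tests below do):
///
/// ```ignore
/// use claw_common::make_source;
///
/// let contents = "let x = 1;";
/// let src = make_source("example", contents);
/// let tokens = tokenize(src, contents).expect("input should tokenize");
/// assert_eq!(tokens[0].token, Token::Let);
/// ```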
pub fn tokenize(src: Source, contents: &str) -> Result<Vec<TokenData>, LexerError> {
    let lexer = Token::lexer(contents);

    lexer
        .spanned()
        .map(|(token, span)| match token {
            Ok(token) => Ok(TokenData {
                token,
                span: SourceSpan::from(span),
            }),
            Err(_error) => Err(LexerError {
                src: src.clone(),
                span: span.into(),
            }),
        })
        .collect()
}

/// The Token type for the language.
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(error = ())]
#[logos(skip r"[ \t\r\n\f]+")]
#[logos(skip r"//[^\n]*")]
#[logos(subpattern word = r"[a-z][a-z0-9]*|[A-Z][A-Z0-9]*")]
#[logos(subpattern id = r"%?(?&word)(-(?&word))*")]
pub enum Token {
    /// A string literal, either double-quoted or raw
    #[token("\"", parse_string_literal)]
    #[token("r", parse_raw_string_literal)]
    StringLiteral(String),

    /// An Integer literal in decimal, binary, or hexadecimal form
    #[regex(r"[0-9][_0-9]*", |lex| parse_decint_literal(lex.slice()))]
    #[regex(r"0b[01][_01]*", |lex| parse_bin_literal(lex.slice()))]
    #[regex(r"0x[0-9a-fA-F][_0-9a-fA-F]*", |lex| parse_hex_literal(lex.slice()))]
    IntLiteral(u64),

    /// A Decimal floating point literal
    #[regex(r"[0-9][_0-9]*\.[0-9][_0-9]*", |lex| parse_decfloat_literal(lex.slice()))]
    FloatLiteral(f64),

    /// An Identifier
    #[regex(r"(?&id)", |lex| lex.slice().to_string())]
    Identifier(String),

    // Keywords -----------------------------------------
    /// The Export Keyword
    #[token("export")]
    Export,

    /// The Import Keyword
    #[token("import")]
    Import,

    /// The From Keyword
    #[token("from")]
    From,

    /// The Function "func" Keyword
    #[token("func")]
    Func,

    /// The If Keyword
    #[token("if")]
    If,

    /// The For Keyword
    #[token("for")]
    For,

    /// The In Keyword
    #[token("in")]
    In,

    /// The Loop Keyword
    #[token("loop")]
    Loop,

    /// The Break Keyword
    #[token("break")]
    Break,

    /// The Continue Keyword
    #[token("continue")]
    Continue,

    /// The Return Keyword
    #[token("return")]
    Return,

    /// The Result Type Keyword
    #[token("result")]
    Result,

    /// The String Type Keyword
    #[token("string")]
    String,

    /// The Unsigned 8-bit Integer Type Keyword
    #[token("u8")]
    U8,

    /// The Unsigned 16-bit Integer Type Keyword
    #[token("u16")]
    U16,

    /// The Unsigned 32-bit Integer Type Keyword
    #[token("u32")]
    U32,

    /// The Unsigned 64-bit Integer Type Keyword
    #[token("u64")]
    U64,

    /// The Signed 8-bit Integer Type Keyword
    #[token("s8")]
    S8,

    /// The Signed 16-bit Integer Type Keyword
    #[token("s16")]
    S16,

    /// The Signed 32-bit Integer Type Keyword
    #[token("s32")]
    S32,

    /// The Signed 64-bit Integer Type Keyword
    #[token("s64")]
    S64,

    /// The 32-bit Floating-point Type Keyword
    #[token("f32")]
    F32,

    /// The 64-bit Floating-point Type Keyword
    #[token("f64")]
    F64,

    /// The As Keyword
    #[token("as")]
    As,

    /// The At Keyword
    #[token("at")]
    At,

    /// The Let Keyword
    #[token("let")]
    Let,

    /// The Mut Keyword
    #[token("mut")]
    Mut,

    /// The Bool Keyword
    #[token("bool")]
    Bool,

    /// The True Keyword
    #[token("true")]
    True,

    /// The False Keyword
    #[token("false")]
    False,

    // Symbols -----------------------------------------
    /// Left Parenthesis Symbol "("
    #[token("(")]
    LParen,

    /// Right Parenthesis Symbol ")"
    #[token(")")]
    RParen,

    /// Left Brace Symbol "{"
    #[token("{")]
    LBrace,

    /// Right Brace Symbol "}"
    #[token("}")]
    RBrace,

    /// Left Bracket Symbol "["
    #[token("[")]
    LBracket,

    /// Right Bracket Symbol "]"
    #[token("]")]
    RBracket,

    /// The Comma Delimiter ","
    #[token(",")]
    Comma,

    /// The Period or Dot Operator "."
    #[token(".")]
    Dot,

    /// The Range Operator ".."
    #[token("..")]
    Range,

    /// Colon Symbol ":"
    #[token(":")]
    Colon,

    /// Semicolon Symbol ";"
    #[token(";")]
    Semicolon,

    /// Assignment Operator "="
    #[token("=")]
    Assign,

    /// The Right Arrow Symbol "->"
    #[token("->")]
    Arrow,

    /// Addition Operator "+"
    #[token("+")]
    Add,

    /// Subtraction Operator "-"
    #[token("-")]
    Sub,

    /// Multiplication Operator "*"
    #[token("*")]
    Mult,

    /// Division Operator "/"
    #[token("/")]
    Div,

    /// Modulo Operator "%"
    #[token("%")]
    Mod,

    /// Invert Operator "!"
    #[token("!")]
    Invert,

    /// Logical And Operator
    #[token("and")]
    LogicalAnd,

    /// Logical Or Operator
    #[token("or")]
    LogicalOr,

    /// Bitwise Or "|"
    #[token("|")]
    BitOr,

    /// Bitwise And "&"
    #[token("&")]
    BitAnd,

    /// Bitwise XOR "^"
    #[token("^")]
    BitXor,

    /// Bit Shift Left Operator "<<"
    #[token("<<")]
    BitShiftL,

    /// Bit Shift Right Operator ">>"
    #[token(">>")]
    BitShiftR,

    /// Arithmetic Shift Right Operator ">>>"
    #[token(">>>")]
    ArithShiftR,

    /// Bitwise-Or and Assign Operator "|="
    #[token("|=")]
    BitOrAssign,

    /// Bitwise-And and Assign Operator "&="
    #[token("&=")]
    BitAndAssign,

    /// Bitwise-Xor and Assign Operator "^="
    #[token("^=")]
    BitXorAssign,

    /// Add and Assign Operator "+="
    #[token("+=")]
    AddAssign,

    /// Subtract and Assign Operator "-="
    #[token("-=")]
    SubAssign,

    /// Multiply and Assign Operator "*="
    #[token("*=")]
    StarAssign,

    /// Divide and Assign Operator "/="
    #[token("/=")]
    DivAssign,

    /// Less-than Operator "<"
    #[token("<")]
    LT,

    /// Less-than or Equal Operator "<="
    #[token("<=")]
    LTE,

    /// Greater-than Operator ">"
    #[token(">")]
    GT,

    /// Greater-than or Equal Operator ">="
    #[token(">=")]
    GTE,

    /// Equals Operator "=="
    #[token("==")]
    EQ,

    /// Not Equals Operator "!="
    #[token("!=")]
    NEQ,
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::StringLiteral(s) => write!(f, "\"{}\"", s),
            Token::IntLiteral(i) => write!(f, "{}", i),
            Token::FloatLiteral(float) => write!(f, "{:?}", float),
            Token::Identifier(ident) => write!(f, "{}", ident),
            Token::Export => write!(f, "export"),
            Token::Import => write!(f, "import"),
            Token::From => write!(f, "from"),
            Token::Func => write!(f, "func"),
            Token::If => write!(f, "if"),
            Token::For => write!(f, "for"),
            Token::In => write!(f, "in"),
            Token::Loop => write!(f, "loop"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Result => write!(f, "result"),
            Token::String => write!(f, "string"),
            Token::U8 => write!(f, "u8"),
            Token::U16 => write!(f, "u16"),
            Token::U32 => write!(f, "u32"),
            Token::U64 => write!(f, "u64"),
            Token::S8 => write!(f, "s8"),
            Token::S16 => write!(f, "s16"),
            Token::S32 => write!(f, "s32"),
            Token::S64 => write!(f, "s64"),
            Token::F32 => write!(f, "f32"),
            Token::F64 => write!(f, "f64"),
            Token::As => write!(f, "as"),
            Token::At => write!(f, "at"),
            Token::Let => write!(f, "let"),
            Token::Mut => write!(f, "mut"),
            Token::Bool => write!(f, "bool"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::Range => write!(f, ".."),
            Token::Colon => write!(f, ":"),
            Token::Semicolon => write!(f, ";"),
            Token::Assign => write!(f, "="),
            Token::Arrow => write!(f, "->"),
            Token::Add => write!(f, "+"),
            Token::Sub => write!(f, "-"),
            Token::Mult => write!(f, "*"),
            Token::Div => write!(f, "/"),
            Token::Mod => write!(f, "%"),
            Token::Invert => write!(f, "!"),
            Token::LogicalAnd => write!(f, "and"),
            Token::LogicalOr => write!(f, "or"),
            Token::BitOr => write!(f, "|"),
            Token::BitAnd => write!(f, "&"),
            Token::BitXor => write!(f, "^"),
            Token::BitShiftL => write!(f, "<<"),
            Token::BitShiftR => write!(f, ">>"),
            Token::ArithShiftR => write!(f, ">>>"),
            Token::BitOrAssign => write!(f, "|="),
            Token::BitAndAssign => write!(f, "&="),
            Token::BitXorAssign => write!(f, "^="),
            Token::AddAssign => write!(f, "+="),
            Token::SubAssign => write!(f, "-="),
            Token::StarAssign => write!(f, "*="),
            Token::DivAssign => write!(f, "/="),
            Token::LT => write!(f, "<"),
            Token::LTE => write!(f, "<="),
            Token::GT => write!(f, ">"),
            Token::GTE => write!(f, ">="),
            Token::EQ => write!(f, "=="),
            Token::NEQ => write!(f, "!="),
        }
    }
}

/// Parses a string according to the JSON string format in ECMA-404.
fn parse_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let mut c_iter = lex.remainder().chars();
    let mut buf = String::new();

    while let Some(c) = c_iter.next() {
        // End the parse when the closing quote is encountered
        if c == '"' {
            lex.bump(1);
            return Some(buf);
        }

        // If backslash, then parse an escaped character
        if c == '\\' {
            lex.bump(1);
            // Fail on an invalid escape instead of skipping it, which would
            // desynchronize the lexer's span from the characters consumed.
            let (c_esc, c_len) = parse_escaped_char(&mut c_iter)?;
            lex.bump(c_len);
            buf.push(c_esc);
        } else {
            lex.bump(c.len_utf8());
            buf.push(c);
        }
    }

    None
}

/// Parses an escaped character according to the JSON string format in ECMA-404.
/// Takes in an iterator which starts after the beginning slash.
/// If successful, returns the produced char and the length of input consumed.
fn parse_escaped_char(lex: &mut std::str::Chars) -> Option<(char, usize)> {
    let res = match lex.next()? {
        '\"' => ('\"', 1),
        '\\' => ('\\', 1),
        '/' => ('/', 1),
        'b' => ('\u{0008}', 1),
        'f' => ('\u{000C}', 1),
        'n' => ('\n', 1),
        'r' => ('\r', 1),
        't' => ('\t', 1),
        'u' => {
            // Combine the next four characters together, fail if they can't be found
            let next_4: [Option<char>; 4] = [lex.next(), lex.next(), lex.next(), lex.next()];
            let next_4: Option<Vec<char>> = next_4.iter().copied().collect();
            let next_4: String = next_4?.into_iter().collect();

            let code_point = u32::from_str_radix(&next_4, 16).ok()?;
            let new_c: char = std::char::from_u32(code_point)?;

            (new_c, 5)
        }
        _ => return None,
    };

    Some(res)
}

/// Parses a raw string literal of the form `r"..."`, `r#"..."#`, `r##"..."##`,
/// and so on. No escape sequences are processed; the literal ends at the first
/// quote followed by the same number of hashes that opened it.
fn parse_raw_string_literal(lex: &mut logos::Lexer<'_, Token>) -> Option<String> {
    let mut c_iter = lex.remainder().chars();
    let mut buf = String::new();

    let mut starting_hashes = 0;
    let mut starting_quote = false;

    while let Some(c) = c_iter.next() {
        lex.bump(c.len_utf8());
        if c == '"' {
            starting_quote = true;
            break;
        }
        if c == '#' {
            starting_hashes += 1;
        } else {
            return None;
        }
    }

    if !starting_quote {
        return None;
    }

    let mut seen_quote = false;
    let mut hash_count = 0;

    while let Some(c) = c_iter.next() {
        lex.bump(c.len_utf8());
        if seen_quote && c == '#' {
            hash_count += 1;

            if hash_count == starting_hashes {
                return Some(buf);
            }
            continue;
        }

        // Append the unused marker quote
        if seen_quote {
            buf.push('"');
        }
        // Reset the seen quote flag
        seen_quote = false;

        // Append the unused marker hashes
        for _ in 0..hash_count {
            buf.push('#');
        }
        // Reset the hash count
        hash_count = 0;

        if c == '"' {
            // With no opening hashes the first quote closes the literal;
            // otherwise, wait for the matching run of closing hashes.
            if starting_hashes == 0 {
                return Some(buf);
            }
            seen_quote = true;
        } else {
            buf.push(c);
        }
    }

    None
}

fn parse_decint_literal(s: &str) -> Option<u64> {
    s.replace('_', "").parse().ok()
}

fn parse_decfloat_literal(s: &str) -> Option<f64> {
    s.replace('_', "").parse().ok()
}

fn parse_bin_literal(s: &str) -> Option<u64> {
    u64::from_str_radix(&s[2..].replace('_', ""), 2).ok()
}

fn parse_hex_literal(s: &str) -> Option<u64> {
    u64::from_str_radix(&s[2..].replace('_', ""), 16).ok()
}

#[cfg(test)]
mod test {
    use super::*;
    use claw_common::make_source;
    use pretty_assertions::assert_eq;

    #[test]
    fn tokenize_func_declaration() {
        let contents = "func test(a: u32) -> u32";
        let src = make_source("test", contents);
        let ident_test = Token::Identifier("test".to_owned());
        let ident_a = Token::Identifier("a".to_owned());
        let output = vec![
            (Token::Func, SourceSpan::from(0..4)),
            (ident_test, SourceSpan::from(5..9)),
            (Token::LParen, SourceSpan::from(9..10)),
            (ident_a, SourceSpan::from(10..11)),
            (Token::Colon, SourceSpan::from(11..12)),
            (Token::U32, SourceSpan::from(13..16)),
            (Token::RParen, SourceSpan::from(16..17)),
            (Token::Arrow, SourceSpan::from(18..20)),
            (Token::U32, SourceSpan::from(21..24)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }

    #[test]
    fn tokenize_let() {
        let contents = r#"let a = "asdf\"";"#;
        let src = make_source("test", contents);
        let ident_a = Token::Identifier("a".to_owned());
        let string_asdf = Token::StringLiteral(String::from(r#"asdf""#));
        let output = vec![
            (Token::Let, SourceSpan::from(0..3)),
            (ident_a, SourceSpan::from(4..5)),
            (Token::Assign, SourceSpan::from(6..7)),
            (string_asdf, SourceSpan::from(8..16)),
            (Token::Semicolon, SourceSpan::from(16..17)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }

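    // A sketch exercising the numeric literal parsers above; the expected
    // spans assume the same byte-offset convention as the other tests here.
    #[test]
    fn tokenize_numeric_literals() {
        let contents = "0b1010 0xff 1_000 2.5";
        let src = make_source("test", contents);
        let output = vec![
            (Token::IntLiteral(0b1010), SourceSpan::from(0..6)),
            (Token::IntLiteral(0xff), SourceSpan::from(7..11)),
            (Token::IntLiteral(1_000), SourceSpan::from(12..17)),
            (Token::FloatLiteral(2.5), SourceSpan::from(18..21)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }
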
    fn to_token_data(d: (Token, SourceSpan)) -> TokenData {
        TokenData {
            token: d.0,
            span: d.1,
        }
    }
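
    // Illustrative sketches of the string literal parsers: a "\u" escape and
    // a hashed raw string. They assume the span convention used above and the
    // raw string syntax described on `parse_raw_string_literal`.
    #[test]
    fn tokenize_unicode_escape() {
        let contents = r#""\u0041""#;
        let src = make_source("test", contents);
        let output = vec![(
            Token::StringLiteral(String::from("A")),
            SourceSpan::from(0..8),
        )]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }

    #[test]
    fn tokenize_raw_string() {
        let contents = r##"let s = r#"raw "quoted" text"#;"##;
        let src = make_source("test", contents);
        let ident_s = Token::Identifier("s".to_owned());
        let string_raw = Token::StringLiteral(String::from(r#"raw "quoted" text"#));
        let output = vec![
            (Token::Let, SourceSpan::from(0..3)),
            (ident_s, SourceSpan::from(4..5)),
            (Token::Assign, SourceSpan::from(6..7)),
            (string_raw, SourceSpan::from(8..30)),
            (Token::Semicolon, SourceSpan::from(30..31)),
        ]
        .into_iter()
        .map(to_token_data)
        .collect::<Vec<TokenData>>();

        match tokenize(src, contents) {
            Ok(tokens) => assert_eq!(output, tokens),
            Err(_) => panic!("Should not have failed"),
        }
    }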
}