// spade_parser/lexer.rs

1use logos::Logos;
2
3use num::BigUint;
4use spade_common::name::Identifier;
5
/// The optional size suffix attached to an integer literal.
#[derive(Debug, PartialEq, Clone)]
pub enum LiteralKind {
    /// No suffix, e.g. `123`. The width is inferred elsewhere.
    Unsized,
    /// Signed literal with an explicit bit width, e.g. `123i8`.
    /// The payload is the width, not the value.
    Signed(BigUint),
    /// Unsigned literal with an explicit bit width, e.g. `123u8`.
    /// The payload is the width, not the value.
    Unsigned(BigUint),
}
12
13fn parse_int(slice: &str, radix: u32) -> (BigUint, LiteralKind) {
14    let lower = slice.to_ascii_lowercase().replace(['_'], "");
15
16    let (cleaned, kind) = if lower.contains("u") {
17        let split = lower.split("u").collect::<Vec<_>>();
18        let kind = LiteralKind::Unsigned(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
19        (split[0], kind)
20    } else if lower.contains("i") {
21        let split = lower.split("i").collect::<Vec<_>>();
22        let kind = LiteralKind::Signed(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
23        (split[0], kind)
24    } else {
25        (lower.as_str(), LiteralKind::Unsized)
26    };
27
28    (
29        BigUint::parse_bytes(cleaned.as_bytes(), radix).unwrap(),
30        kind,
31    )
32}
33
34fn process_ident(ident: &str) -> Identifier {
35    if ident.starts_with("r#") {
36        Identifier::intern(&ident[2..])
37    } else {
38        Identifier::intern(ident)
39    }
40}
41
/// Every token the Spade lexer can produce. Deriving `Logos` generates the
/// lexer from the `#[token]`/`#[regex]` attributes; callbacks build the
/// variant payloads from the matched slice.
#[derive(Logos, Debug, PartialEq, Clone)]
pub enum TokenKind {
    // Unholy regex for unicode identifiers. Stolen from Repnop who stole it from Evrey
    // Accepts an optional raw-identifier prefix `r#` and an optional trailing
    // `?`, `!`, `?!`, or `⁈` (the escaped \u codepoints in the regex).
    #[regex(r#"(r#)?(?x:
        [\p{XID_Start}_]
        \p{XID_Continue}*
        (\u{3F} | \u{21} | (\u{3F}\u{21}) | \u{2048})? # ? ! ?! ⁈
    )"#, |lex| process_ident(lex.slice()))]
    Identifier(Identifier),

    // Integer literals carry (value, size-suffix). The `0x`/`0b` prefix is
    // stripped before handing the slice to `parse_int`.
    #[regex(r"[0-9][0-9_]*([uUiI][0-9]+)?", |lex| {
        parse_int(lex.slice(), 10)
    })]
    Integer((BigUint, LiteralKind)),
    #[regex(r"0x[0-9A-Fa-f][0-9_A-Fa-f]*([uUiI][0-9]+)?", |lex| {
        parse_int(&lex.slice()[2..], 16)
    })]
    HexInteger((BigUint, LiteralKind)),
    #[regex(r"0b[0-1][0-1_]*([uUiI][0-9]+)?", |lex| {
        parse_int(&lex.slice()[2..], 2)
    })]
    BinInteger((BigUint, LiteralKind)),

    #[token("true")]
    True,
    #[token("false")]
    False,

    // Logic-level literals.
    #[token("LOW")]
    Low,
    #[token("HIGH")]
    High,
    #[token("HIGHIMP")]
    HighImp,

    // Keywords
    #[token("reg")]
    Reg,
    #[token("let")]
    Let,
    #[token("decl")]
    Decl,
    #[token("inst")]
    Instance,
    #[token("reset")]
    Reset,
    #[token("initial")]
    Initial,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("match")]
    Match,
    #[token("set")]
    Set,

    #[token("pipeline")]
    Pipeline,
    #[token("stage")]
    Stage,
    #[token("entity")]
    Entity,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("for")]
    For,
    #[token("fn")]
    Function,
    #[token("enum")]
    Enum,
    #[token("struct")]
    Struct,
    #[token("port")]
    Port,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("as")]
    As,
    #[token("assert")]
    Assert,
    #[token("mut")]
    Mut,
    #[token("inv")]
    Inv,
    #[token("pub")]
    Pub,
    #[token("where")]
    Where,

    #[token("gen")]
    Gen,

    #[token("extern")]
    Extern,
    #[token("unsafe")]
    Unsafe,

    // Math operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Asterisk,
    #[token("/")]
    Slash,
    #[token("%")]
    Percentage,
    #[token("==")]
    Equals,
    #[token("!=")]
    NotEquals,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token(">>>")]
    ArithmeticRightShift,
    #[token(">>")]
    RightShift,
    #[token("<<")]
    LeftShift,
    #[token("||")]
    DoublePipe,
    #[token("&&")]
    LogicalAnd,
    #[token("^^")]
    LogicalXor,
    #[token("&")]
    Ampersand,
    #[token("|")]
    Pipe,
    #[token("!")]
    Not,
    #[token("^")]
    BitwiseXor,
    #[token("~")]
    Tilde,
    #[token("`")]
    InfixOperatorSeparator,

    // Other operators
    #[token("=")]
    Assignment,

    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,

    #[token("{")]
    OpenBrace,
    #[token("}")]
    CloseBrace,

    #[token("[")]
    OpenBracket,
    #[token("]")]
    CloseBracket,

    #[token("=>")]
    FatArrow,
    #[token("->")]
    SlimArrow,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token(";")]
    Semi,
    // NOTE(review): this token renders identically to Semi's `;` above. In
    // the original source it is presumably U+037E GREEK QUESTION MARK (a
    // lookalike lexed separately so it can be diagnosed); two identical
    // `#[token]` patterns would conflict in logos — confirm the codepoint.
    #[token(";")]
    GreekQuestionMark,
    #[token(":")]
    Colon,
    #[token("::")]
    PathSeparator,
    #[token("#")]
    Hash,
    #[token("$")]
    Dollar,

    // Pipeline stage labels: `'label` defines, `@label` references.
    #[regex(r#"'[\p{XID_Start}_]\p{XID_Continue}*"#, |lex| Identifier::intern(&lex.slice()[1..]))]
    Label(Identifier),

    #[regex(r#"@[\p{XID_Start}_]\p{XID_Continue}*"#, |lex| Identifier::intern(&lex.slice()[1..]))]
    LabelRef(Identifier),

    // Byte char/string literals; the callback strips the `b'`/`b"` prefix
    // and the closing quote, keeping escape sequences unprocessed.
    #[regex(r#"b'(\\.|[^\\'])*'"#, |lex| lex.slice()[2..(lex.slice().len() - 1)].to_string())]
    AsciiCharLiteral(String),
    #[regex(r#"b"(\\.|[^\\"])*""#, |lex| lex.slice()[2..(lex.slice().len() - 1)].to_string())]
    AsciiStringLiteral(String),
    // Not actually used in the language at the moment, hence the lack of inner
    // content. It is just used to hint to the user to use b'...'
    #[regex(r#"'\w'"#)]
    Utf8CharLiteral,
    #[regex(r#""[^"]*""#, |lex| lex.slice().replace("\"", ""))]
    String(String),

    // Doc comments: the callback drops the 3-char marker, keeping the rest
    // of the line (including any leading space).
    // NOTE(review): `allow_greedy` is not an attribute in upstream logos
    // releases — presumably supported by the logos version/fork this crate
    // pins; confirm against the workspace's logos dependency.
    #[regex("///[^\n]*", |lex| lex.slice()[3..].to_string(), allow_greedy = true)]
    OutsideDocumentation(String),
    #[regex("//![^\n]*", |lex| lex.slice()[3..].to_string(), allow_greedy = true)]
    InsideDocumentation(String),

    /// Ignoring whitespace
    #[regex("[ \t\n\r]", logos::skip)]
    Whitespace,

    #[regex("//[^\n]*", allow_greedy = true)]
    Comment,

    // Block comment delimiters are emitted as tokens; nesting/matching is
    // handled by a later stage, not by the lexer.
    #[token("/*")]
    BlockCommentStart,
    #[token("*/")]
    BlockCommentEnd,

    // Synthesized by the parser when input is exhausted; never produced by
    // a logos rule.
    Eof,
}
270
impl TokenKind {
    /// Returns a human-readable name for this token kind, used in parser
    /// diagnostics ("expected `;`, found identifier"). Punctuation tokens map
    /// to their literal spelling; payload-carrying tokens to a description.
    pub fn as_str(&self) -> &'static str {
        match self {
            TokenKind::Identifier(_) => "identifier",
            TokenKind::Integer(_) => "integer",
            TokenKind::HexInteger(_) => "hexadecimal integer",
            TokenKind::BinInteger(_) => "binary integer",
            TokenKind::True => "true",
            TokenKind::False => "false",
            TokenKind::Low => "LOW",
            TokenKind::High => "HIGH",
            TokenKind::HighImp => "HIGHIMP",

            TokenKind::Let => "let",
            TokenKind::Reg => "reg",
            TokenKind::Decl => "decl",
            TokenKind::Entity => "entity",
            TokenKind::Pipeline => "pipeline",
            TokenKind::Stage => "stage",
            TokenKind::Instance => "inst",
            TokenKind::Reset => "reset",
            TokenKind::Initial => "initial",
            TokenKind::If => "if",
            TokenKind::Else => "else",
            TokenKind::Match => "match",
            TokenKind::Impl => "impl",
            TokenKind::Trait => "trait",
            TokenKind::For => "for",
            TokenKind::Function => "fn",
            TokenKind::Enum => "enum",
            TokenKind::Struct => "struct",
            TokenKind::Port => "port",
            TokenKind::Mod => "mod",
            TokenKind::As => "as",
            TokenKind::Use => "use",
            TokenKind::Assert => "assert",
            TokenKind::Set => "set",
            TokenKind::Mut => "mut",
            TokenKind::Inv => "inv",
            TokenKind::Pub => "pub",
            TokenKind::Where => "where",

            TokenKind::Gen => "gen",

            TokenKind::Extern => "extern",
            TokenKind::Unsafe => "unsafe",

            TokenKind::Assignment => "=",
            TokenKind::Plus => "+",
            TokenKind::Minus => "-",
            TokenKind::Asterisk => "*",
            TokenKind::Slash => "/",
            TokenKind::Percentage => "%",
            TokenKind::Equals => "==",
            TokenKind::NotEquals => "!=",
            TokenKind::Lt => "<",
            TokenKind::Gt => ">",
            TokenKind::Le => "<=",
            TokenKind::Ge => ">=",
            TokenKind::LeftShift => "<<",
            TokenKind::RightShift => ">>",
            TokenKind::ArithmeticRightShift => ">>>",
            TokenKind::DoublePipe => "||",
            TokenKind::LogicalAnd => "&&",
            TokenKind::LogicalXor => "^^",
            TokenKind::Ampersand => "&",
            TokenKind::Pipe => "|",
            TokenKind::Not => "!",
            TokenKind::Tilde => "~",
            TokenKind::BitwiseXor => "^",
            TokenKind::InfixOperatorSeparator => "`",

            TokenKind::OpenParen => "(",
            TokenKind::CloseParen => ")",
            TokenKind::OpenBrace => "{",
            TokenKind::CloseBrace => "}",
            TokenKind::OpenBracket => "[",
            TokenKind::CloseBracket => "]",

            TokenKind::FatArrow => "=>",
            TokenKind::SlimArrow => "->",
            TokenKind::Semi => ";",
            // Spelled out rather than the raw character so the lookalike is
            // obvious in error messages.
            TokenKind::GreekQuestionMark => "GreekQuestionMark(;)",
            TokenKind::Colon => ":",
            TokenKind::Comma => ",",
            TokenKind::Dot => ".",
            TokenKind::DotDot => "..",
            TokenKind::PathSeparator => "::",

            TokenKind::Hash => "#",
            TokenKind::Dollar => "$",

            TokenKind::Eof => "end of file",

            TokenKind::Label(_) => "label",
            TokenKind::LabelRef(_) => "label ref",

            TokenKind::AsciiCharLiteral(_) => "ASCII char literal",
            TokenKind::AsciiStringLiteral(_) => "ASCII string literal",
            TokenKind::Utf8CharLiteral => "Unicode char literal",
            TokenKind::String(_) => "string",

            TokenKind::OutsideDocumentation(_) => "///",
            TokenKind::InsideDocumentation(_) => "//!",

            TokenKind::Whitespace => "whitespace",
            TokenKind::Comment => "comment",

            TokenKind::BlockCommentStart => "/*",
            TokenKind::BlockCommentEnd => "*/",
        }
    }

    /// True for `Identifier` tokens, regardless of the interned name.
    pub fn is_identifier(&self) -> bool {
        matches!(self, TokenKind::Identifier(_))
    }

    /// True for `String` tokens (plain `"..."` literals only; byte-string
    /// literals are a separate variant).
    pub fn is_string(&self) -> bool {
        matches!(self, TokenKind::String(_))
    }

    /// True for any integer literal token, in any radix.
    pub fn is_integer(&self) -> bool {
        matches!(
            self,
            TokenKind::Integer(_) | TokenKind::HexInteger(_) | TokenKind::BinInteger(_)
        )
    }

    /// Returns the numeric value of an integer literal token (any radix),
    /// or `None` for every other token kind.
    pub fn as_biguint(&self) -> Option<BigUint> {
        match self {
            TokenKind::Integer((i, _))
            | TokenKind::HexInteger((i, _))
            | TokenKind::BinInteger((i, _)) => Some(i.clone()),
            _ => None,
        }
    }
}
408
#[cfg(test)]
mod tests {
    use spade_common::num_ext::InfallibleToBigUint;

    use super::*;

    // Identifiers may contain digits and trailing underscores.
    #[test]
    fn identifiers_work() {
        let mut lex = TokenKind::lexer("abc123_");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier(Identifier::intern("abc123_"))))
        );
    }

    // A bare decimal literal has no size suffix.
    #[test]
    fn integer_literals_work() {
        let mut lex = TokenKind::lexer("123");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    // `u<N>` suffix yields an Unsigned literal kind carrying the width.
    #[test]
    fn sized_uint_integer_literals_work() {
        let mut lex = TokenKind::lexer("123u3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsigned(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    // `i<N>` suffix yields a Signed literal kind carrying the width.
    #[test]
    fn sized_int_integer_literals_work() {
        let mut lex = TokenKind::lexer("123i3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Signed(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    // Hex literals tokenize correctly when surrounded by other tokens.
    #[test]
    fn hex_array() {
        let mut lex = TokenKind::lexer("[0x45]");
        assert_eq!(lex.next(), Some(Ok(TokenKind::OpenBracket)));
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::HexInteger((
                0x45_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), Some(Ok(TokenKind::CloseBracket)));
        assert_eq!(lex.next(), None);
    }

    // `0xg` is not a hex literal: it lexes as the integer `0` followed by
    // the identifier `xg`.
    #[test]
    fn invalid_hex_is_not_hex() {
        let mut lex = TokenKind::lexer("0xg");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                0_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier(Identifier::intern("xg"))))
        );
        assert_eq!(lex.next(), None);
    }

    // Doc-comment callbacks drop exactly the 3-char marker, keeping any
    // leading space in the payload.
    #[test]
    fn doc_comments_slice_correctly() {
        let mut lex = TokenKind::lexer("//! Hello\n///G'day");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::InsideDocumentation(" Hello".to_string())))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::OutsideDocumentation("G'day".to_string())))
        );
        assert_eq!(lex.next(), None);
    }
}