spade_parser/
lexer.rs

1use logos::Logos;
2
3use num::BigUint;
4
/// The optional size suffix attached to an integer literal, e.g. the `u8`
/// in `123u8` or the `i8` in `123i8`. The payload is the bit width written
/// after the suffix character.
#[derive(Debug, PartialEq, Clone)]
pub enum LiteralKind {
    /// No suffix was given, e.g. `123`
    Unsized,
    /// `i<width>` suffix, e.g. `123i8`; payload is the width
    Signed(BigUint),
    /// `u<width>` suffix, e.g. `123u8`; payload is the width
    Unsigned(BigUint),
}
11
12fn parse_int(slice: &str, radix: u32) -> (BigUint, LiteralKind) {
13    let lower = slice.to_ascii_lowercase().replace(['_'], "");
14
15    let (cleaned, kind) = if lower.contains("u") {
16        let split = lower.split("u").collect::<Vec<_>>();
17        let kind = LiteralKind::Unsigned(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
18        (split[0], kind)
19    } else if lower.contains("i") {
20        let split = lower.split("i").collect::<Vec<_>>();
21        let kind = LiteralKind::Signed(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
22        (split[0], kind)
23    } else {
24        (lower.as_str(), LiteralKind::Unsized)
25    };
26
27    (
28        BigUint::parse_bytes(cleaned.as_bytes(), radix).unwrap(),
29        kind,
30    )
31}
32
33#[derive(Logos, Debug, PartialEq, Clone)]
34pub enum TokenKind {
35    // Unholy regex for unicode identifiers. Stolen from Repnop who stole it from Evrey
36    #[regex(r#"(?x:
37        [\p{XID_Start}_]
38        \p{XID_Continue}*
39        (\u{3F} | \u{21} | (\u{3F}\u{21}) | \u{2048})? # ? ! ?! ⁈
40    )"#, |lex| lex.slice().to_string())]
41    Identifier(String),
42
43    #[regex(r"[0-9][0-9_]*([uUiI][0-9]+)?", |lex| {
44        parse_int(lex.slice(), 10)
45    })]
46    Integer((BigUint, LiteralKind)),
47    #[regex(r"0x[0-9A-Fa-f][0-9_A-Fa-f]*([uUiI][0-9]+)?", |lex| {
48        parse_int(&lex.slice()[2..], 16)
49    })]
50    HexInteger((BigUint, LiteralKind)),
51    #[regex(r"0b[0-1][0-1_]*([uUiI][0-9]+)?", |lex| {
52        parse_int(&lex.slice()[2..], 2)
53    })]
54    BinInteger((BigUint, LiteralKind)),
55
56    #[token("true")]
57    True,
58    #[token("false")]
59    False,
60
61    #[token("LOW")]
62    Low,
63    #[token("HIGH")]
64    High,
65    #[token("HIGHIMP")]
66    HighImp,
67
68    // Keywords
69    #[token("reg")]
70    Reg,
71    #[token("let")]
72    Let,
73    #[token("decl")]
74    Decl,
75    #[token("inst")]
76    Instance,
77    #[token("reset")]
78    Reset,
79    #[token("initial")]
80    Initial,
81    #[token("if")]
82    If,
83    #[token("else")]
84    Else,
85    #[token("match")]
86    Match,
87    #[token("set")]
88    Set,
89
90    #[token("pipeline")]
91    Pipeline,
92    #[token("stage")]
93    Stage,
94    #[token("entity")]
95    Entity,
96    #[token("trait")]
97    Trait,
98    #[token("impl")]
99    Impl,
100    #[token("for")]
101    For,
102    #[token("fn")]
103    Function,
104    #[token("enum")]
105    Enum,
106    #[token("struct")]
107    Struct,
108    #[token("port")]
109    Port,
110    #[token("mod")]
111    Mod,
112    #[token("use")]
113    Use,
114    #[token("as")]
115    As,
116    #[token("assert")]
117    Assert,
118    #[token("mut")]
119    Mut,
120    #[token("inv")]
121    Inv,
122    #[token("where")]
123    Where,
124
125    #[token("gen")]
126    Gen,
127
128    #[token("extern")]
129    Extern,
130
131    // Math operators
132    #[token("+")]
133    Plus,
134    #[token("-")]
135    Minus,
136    #[token("*")]
137    Asterisk,
138    #[token("/")]
139    Slash,
140    #[token("%")]
141    Percentage,
142    #[token("==")]
143    Equals,
144    #[token("!=")]
145    NotEquals,
146    #[token("<")]
147    Lt,
148    #[token(">")]
149    Gt,
150    #[token("<=")]
151    Le,
152    #[token(">=")]
153    Ge,
154    #[token(">>>")]
155    ArithmeticRightShift,
156    #[token(">>")]
157    RightShift,
158    #[token("<<")]
159    LeftShift,
160    #[token("||")]
161    LogicalOr,
162    #[token("&&")]
163    LogicalAnd,
164    #[token("^^")]
165    LogicalXor,
166    #[token("&")]
167    Ampersand,
168    #[token("|")]
169    BitwiseOr,
170    #[token("!")]
171    Not,
172    #[token("^")]
173    BitwiseXor,
174    #[token("~")]
175    Tilde,
176    #[token("`")]
177    InfixOperatorSeparator,
178    #[token("'")]
179    SingleQuote,
180
181    // Other operators
182    #[token("=")]
183    Assignment,
184
185    #[token("(")]
186    OpenParen,
187    #[token(")")]
188    CloseParen,
189
190    #[token("{")]
191    OpenBrace,
192    #[token("}")]
193    CloseBrace,
194
195    #[token("[")]
196    OpenBracket,
197    #[token("]")]
198    CloseBracket,
199
200    #[token("=>")]
201    FatArrow,
202    #[token("->")]
203    SlimArrow,
204    #[token(",")]
205    Comma,
206    #[token(".")]
207    Dot,
208    #[token(";")]
209    Semi,
210    #[token(";")]
211    GreekQuestionMark,
212    #[token(":")]
213    Colon,
214    #[token("::")]
215    PathSeparator,
216    #[token("#")]
217    Hash,
218    #[token("$")]
219    Dollar,
220
221    #[regex("///[^\n]*", |lex| lex.slice()[3..].to_string())]
222    OutsideDocumentation(String),
223    #[regex("//![^\n]*", |lex| lex.slice()[3..].to_string())]
224    InsideDocumentation(String),
225
226    /// Ignoring whitespace
227    #[regex("[ \t\n\r]", logos::skip)]
228    Whitespace,
229
230    #[regex("//[^\n]*", logos::skip)]
231    Comment,
232
233    #[token("/*")]
234    BlockCommentStart,
235    #[token("*/")]
236    BlockCommentEnd,
237
238    Eof,
239}
240
impl TokenKind {
    /// A human-readable, `'static` name for this token kind, for use in
    /// diagnostics. Fixed tokens return their exact source spelling;
    /// data-carrying variants return a description (e.g. "identifier").
    pub fn as_str(&self) -> &'static str {
        match self {
            TokenKind::Identifier(_) => "identifier",
            TokenKind::Integer(_) => "integer",
            TokenKind::HexInteger(_) => "hexadecimal integer",
            TokenKind::BinInteger(_) => "binary integer",
            TokenKind::True => "true",
            TokenKind::False => "false",
            TokenKind::Low => "LOW",
            TokenKind::High => "HIGH",
            TokenKind::HighImp => "HIGHIMP",

            TokenKind::Let => "let",
            TokenKind::Reg => "reg",
            TokenKind::Decl => "decl",
            TokenKind::Entity => "entity",
            TokenKind::Pipeline => "pipeline",
            TokenKind::Stage => "stage",
            TokenKind::Instance => "inst",
            TokenKind::Reset => "reset",
            TokenKind::Initial => "initial",
            TokenKind::If => "if",
            TokenKind::Else => "else",
            TokenKind::Match => "match",
            TokenKind::Impl => "impl",
            TokenKind::Trait => "trait",
            TokenKind::For => "for",
            TokenKind::Function => "fn",
            TokenKind::Enum => "enum",
            TokenKind::Struct => "struct",
            TokenKind::Port => "port",
            TokenKind::Mod => "mod",
            TokenKind::As => "as",
            TokenKind::Use => "use",
            TokenKind::Assert => "assert",
            TokenKind::Set => "set",
            TokenKind::Mut => "mut",
            TokenKind::Inv => "inv",
            TokenKind::Where => "where",

            TokenKind::Gen => "gen",

            TokenKind::Extern => "extern",

            TokenKind::Assignment => "=",
            TokenKind::Plus => "+",
            TokenKind::Minus => "-",
            TokenKind::Asterisk => "*",
            TokenKind::Slash => "/",
            TokenKind::Percentage => "%",
            TokenKind::Equals => "==",
            TokenKind::NotEquals => "!=",
            TokenKind::Lt => "<",
            TokenKind::Gt => ">",
            TokenKind::Le => "<=",
            TokenKind::Ge => ">=",
            TokenKind::LeftShift => "<<",
            TokenKind::RightShift => ">>",
            TokenKind::ArithmeticRightShift => ">>>",
            TokenKind::LogicalOr => "||",
            TokenKind::LogicalAnd => "&&",
            TokenKind::LogicalXor => "^^",
            TokenKind::Ampersand => "&",
            TokenKind::BitwiseOr => "|",
            TokenKind::Not => "!",
            TokenKind::Tilde => "~",
            TokenKind::BitwiseXor => "^",
            TokenKind::InfixOperatorSeparator => "`",

            TokenKind::OpenParen => "(",
            TokenKind::CloseParen => ")",
            TokenKind::OpenBrace => "{",
            TokenKind::CloseBrace => "}",
            TokenKind::OpenBracket => "[",
            TokenKind::CloseBracket => "]",

            TokenKind::FatArrow => "=>",
            TokenKind::SlimArrow => "->",
            TokenKind::Semi => ";",
            // Deliberately NOT the bare character: it would render
            // indistinguishably from `;` in error messages.
            TokenKind::GreekQuestionMark => "GreekQuestionMark(;)",
            TokenKind::Colon => ":",
            TokenKind::Comma => ",",
            TokenKind::Dot => ".",
            TokenKind::PathSeparator => "::",
            TokenKind::SingleQuote => "'",

            TokenKind::Hash => "#",
            TokenKind::Dollar => "$",

            TokenKind::Eof => "end of file",

            TokenKind::OutsideDocumentation(_) => "///",
            TokenKind::InsideDocumentation(_) => "//!",

            TokenKind::Whitespace => "whitespace",
            TokenKind::Comment => "comment",

            TokenKind::BlockCommentStart => "/*",
            TokenKind::BlockCommentEnd => "*/",
        }
    }

    /// True if this token is an [`TokenKind::Identifier`].
    pub fn is_identifier(&self) -> bool {
        matches!(self, TokenKind::Identifier(_))
    }
    /// True if this token is any integer literal: decimal, hexadecimal or binary.
    pub fn is_integer(&self) -> bool {
        matches!(
            self,
            TokenKind::Integer(_) | TokenKind::HexInteger(_) | TokenKind::BinInteger(_)
        )
    }

    /// The numeric value of an integer literal token of any radix, or `None`
    /// for non-integer tokens. The size suffix ([`LiteralKind`]) is discarded.
    pub fn as_biguint(&self) -> Option<BigUint> {
        match self {
            TokenKind::Integer((i, _))
            | TokenKind::HexInteger((i, _))
            | TokenKind::BinInteger((i, _)) => Some(i.clone()),
            _ => None,
        }
    }
}
363
#[cfg(test)]
mod tests {
    use spade_common::num_ext::InfallibleToBigUint;

    use super::*;

    #[test]
    fn identifiers_work() {
        let mut lex = TokenKind::lexer("abc123_");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier("abc123_".to_string())))
        );
        // Like the other tests: the whole input must be consumed as one token.
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn integer_literals_work() {
        let mut lex = TokenKind::lexer("123");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn sized_uint_integer_literals_work() {
        let mut lex = TokenKind::lexer("123u3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Unsigned(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn sized_int_integer_literals_work() {
        let mut lex = TokenKind::lexer("123i3");

        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                123_u32.to_biguint(),
                LiteralKind::Signed(3u32.to_biguint())
            ))))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn hex_array() {
        let mut lex = TokenKind::lexer("[0x45]");
        assert_eq!(lex.next(), Some(Ok(TokenKind::OpenBracket)));
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::HexInteger((
                0x45_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(lex.next(), Some(Ok(TokenKind::CloseBracket)));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn invalid_hex_is_not_hex() {
        // `0xg` must not lex as a hex literal: it falls apart into the
        // integer `0` followed by the identifier `xg`.
        let mut lex = TokenKind::lexer("0xg");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Integer((
                0_u32.to_biguint(),
                LiteralKind::Unsized
            ))))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::Identifier("xg".to_string())))
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn doc_comments_slice_correctly() {
        // Only the `///` / `//!` sigil is stripped; leading spaces are kept.
        let mut lex = TokenKind::lexer("//! Hello\n///G'day");
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::InsideDocumentation(" Hello".to_string())))
        );
        assert_eq!(
            lex.next(),
            Some(Ok(TokenKind::OutsideDocumentation("G'day".to_string())))
        );
        assert_eq!(lex.next(), None);
    }
}
467}