spade_parser/
lexer.rs

1use logos::Logos;
2
3use num::BigUint;
4
5#[derive(Debug, PartialEq, Clone)]
6pub enum LiteralKind {
7    Unsized,
8    Signed(BigUint),
9    Unsigned(BigUint),
10}
11
12fn parse_int(slice: &str, radix: u32) -> (BigUint, LiteralKind) {
13    let lower = slice.to_ascii_lowercase().replace(['_'], "");
14
15    let (cleaned, kind) = if lower.contains("u") {
16        let split = lower.split("u").collect::<Vec<_>>();
17        let kind = LiteralKind::Unsigned(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
18        (split[0], kind)
19    } else if lower.contains("i") {
20        let split = lower.split("i").collect::<Vec<_>>();
21        let kind = LiteralKind::Signed(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
22        (split[0], kind)
23    } else {
24        (lower.as_str(), LiteralKind::Unsized)
25    };
26
27    (
28        BigUint::parse_bytes(cleaned.as_bytes(), radix).unwrap(),
29        kind,
30    )
31}
32
33#[derive(Logos, Debug, PartialEq, Clone)]
34pub enum TokenKind {
35    // Unholy regex for unicode identifiers. Stolen from Repnop who stole it from Evrey
36    #[regex(r#"(?x:
37        [\p{XID_Start}_]
38        \p{XID_Continue}*
39        (\u{3F} | \u{21} | (\u{3F}\u{21}) | \u{2048})? # ? ! ?! ⁈
40    )"#, |lex| lex.slice().to_string())]
41    Identifier(String),
42
43    #[regex(r"[0-9][0-9_]*([uUiI][0-9]+)?", |lex| {
44        parse_int(lex.slice(), 10)
45    })]
46    Integer((BigUint, LiteralKind)),
47    #[regex(r"0x[0-9A-Fa-f][0-9_A-Fa-f]*([uUiI][0-9]+)?", |lex| {
48        parse_int(&lex.slice()[2..], 16)
49    })]
50    HexInteger((BigUint, LiteralKind)),
51    #[regex(r"0b[0-1][0-1_]*([uUiI][0-9]+)?", |lex| {
52        parse_int(&lex.slice()[2..], 2)
53    })]
54    BinInteger((BigUint, LiteralKind)),
55
56    #[token("true")]
57    True,
58    #[token("false")]
59    False,
60
61    #[token("LOW")]
62    Low,
63    #[token("HIGH")]
64    High,
65    #[token("HIGHIMP")]
66    HighImp,
67
68    // Keywords
69    #[token("reg")]
70    Reg,
71    #[token("let")]
72    Let,
73    #[token("decl")]
74    Decl,
75    #[token("inst")]
76    Instance,
77    #[token("reset")]
78    Reset,
79    #[token("initial")]
80    Initial,
81    #[token("if")]
82    If,
83    #[token("else")]
84    Else,
85    #[token("match")]
86    Match,
87    #[token("set")]
88    Set,
89
90    #[token("pipeline")]
91    Pipeline,
92    #[token("stage")]
93    Stage,
94    #[token("entity")]
95    Entity,
96    #[token("trait")]
97    Trait,
98    #[token("impl")]
99    Impl,
100    #[token("for")]
101    For,
102    #[token("fn")]
103    Function,
104    #[token("enum")]
105    Enum,
106    #[token("struct")]
107    Struct,
108    #[token("port")]
109    Port,
110    #[token("mod")]
111    Mod,
112    #[token("use")]
113    Use,
114    #[token("as")]
115    As,
116    #[token("assert")]
117    Assert,
118    #[token("mut")]
119    Mut,
120    #[token("inv")]
121    Inv,
122    #[token("where")]
123    Where,
124
125    #[token("gen")]
126    Gen,
127
128    #[token("extern")]
129    Extern,
130    #[token("unsafe")]
131    Unsafe,
132
133    // Math operators
134    #[token("+")]
135    Plus,
136    #[token("-")]
137    Minus,
138    #[token("*")]
139    Asterisk,
140    #[token("/")]
141    Slash,
142    #[token("%")]
143    Percentage,
144    #[token("==")]
145    Equals,
146    #[token("!=")]
147    NotEquals,
148    #[token("<")]
149    Lt,
150    #[token(">")]
151    Gt,
152    #[token("<=")]
153    Le,
154    #[token(">=")]
155    Ge,
156    #[token(">>>")]
157    ArithmeticRightShift,
158    #[token(">>")]
159    RightShift,
160    #[token("<<")]
161    LeftShift,
162    #[token("||")]
163    DoublePipe,
164    #[token("&&")]
165    LogicalAnd,
166    #[token("^^")]
167    LogicalXor,
168    #[token("&")]
169    Ampersand,
170    #[token("|")]
171    Pipe,
172    #[token("!")]
173    Not,
174    #[token("^")]
175    BitwiseXor,
176    #[token("~")]
177    Tilde,
178    #[token("`")]
179    InfixOperatorSeparator,
180    #[token("'")]
181    SingleQuote,
182
183    // Other operators
184    #[token("=")]
185    Assignment,
186
187    #[token("(")]
188    OpenParen,
189    #[token(")")]
190    CloseParen,
191
192    #[token("{")]
193    OpenBrace,
194    #[token("}")]
195    CloseBrace,
196
197    #[token("[")]
198    OpenBracket,
199    #[token("]")]
200    CloseBracket,
201
202    #[token("=>")]
203    FatArrow,
204    #[token("->")]
205    SlimArrow,
206    #[token(",")]
207    Comma,
208    #[token(".")]
209    Dot,
210    #[token("..")]
211    DotDot,
212    #[token(";")]
213    Semi,
214    #[token(";")]
215    GreekQuestionMark,
216    #[token(":")]
217    Colon,
218    #[token("::")]
219    PathSeparator,
220    #[token("#")]
221    Hash,
222    #[token("$")]
223    Dollar,
224
225    #[regex(r#""[^"]*""#, |lex| lex.slice().replace("\"", ""))]
226    String(String),
227
228    #[regex("///[^\n]*", |lex| lex.slice()[3..].to_string())]
229    OutsideDocumentation(String),
230    #[regex("//![^\n]*", |lex| lex.slice()[3..].to_string())]
231    InsideDocumentation(String),
232
233    /// Ignoring whitespace
234    #[regex("[ \t\n\r]", logos::skip)]
235    Whitespace,
236
237    #[regex("//[^\n]*")]
238    Comment,
239
240    #[token("/*")]
241    BlockCommentStart,
242    #[token("*/")]
243    BlockCommentEnd,
244
245    Eof,
246}
247
248impl TokenKind {
249    pub fn as_str(&self) -> &'static str {
250        match self {
251            TokenKind::Identifier(_) => "identifier",
252            TokenKind::Integer(_) => "integer",
253            TokenKind::HexInteger(_) => "hexadecimal integer",
254            TokenKind::BinInteger(_) => "binary integer",
255            TokenKind::True => "true",
256            TokenKind::False => "false",
257            TokenKind::Low => "LOW",
258            TokenKind::High => "HIGH",
259            TokenKind::HighImp => "HIGHIMP",
260
261            TokenKind::Let => "let",
262            TokenKind::Reg => "reg",
263            TokenKind::Decl => "decl",
264            TokenKind::Entity => "entity",
265            TokenKind::Pipeline => "pipeline",
266            TokenKind::Stage => "stage",
267            TokenKind::Instance => "inst",
268            TokenKind::Reset => "reset",
269            TokenKind::Initial => "initial",
270            TokenKind::If => "if",
271            TokenKind::Else => "else",
272            TokenKind::Match => "match",
273            TokenKind::Impl => "impl",
274            TokenKind::Trait => "trait",
275            TokenKind::For => "for",
276            TokenKind::Function => "fn",
277            TokenKind::Enum => "enum",
278            TokenKind::Struct => "struct",
279            TokenKind::Port => "port",
280            TokenKind::Mod => "mod",
281            TokenKind::As => "as",
282            TokenKind::Use => "use",
283            TokenKind::Assert => "assert",
284            TokenKind::Set => "set",
285            TokenKind::Mut => "mut",
286            TokenKind::Inv => "inv",
287            TokenKind::Where => "where",
288
289            TokenKind::Gen => "gen",
290
291            TokenKind::Extern => "extern",
292            TokenKind::Unsafe => "unsafe",
293
294            TokenKind::Assignment => "=",
295            TokenKind::Plus => "+",
296            TokenKind::Minus => "-",
297            TokenKind::Asterisk => "*",
298            TokenKind::Slash => "/",
299            TokenKind::Percentage => "%",
300            TokenKind::Equals => "==",
301            TokenKind::NotEquals => "!=",
302            TokenKind::Lt => "<",
303            TokenKind::Gt => ">",
304            TokenKind::Le => "<=",
305            TokenKind::Ge => ">=",
306            TokenKind::LeftShift => "<<",
307            TokenKind::RightShift => ">>",
308            TokenKind::ArithmeticRightShift => ">>>",
309            TokenKind::DoublePipe => "||",
310            TokenKind::LogicalAnd => "&&",
311            TokenKind::LogicalXor => "^^",
312            TokenKind::Ampersand => "&",
313            TokenKind::Pipe => "|",
314            TokenKind::Not => "!",
315            TokenKind::Tilde => "~",
316            TokenKind::BitwiseXor => "^",
317            TokenKind::InfixOperatorSeparator => "`",
318
319            TokenKind::OpenParen => "(",
320            TokenKind::CloseParen => ")",
321            TokenKind::OpenBrace => "{",
322            TokenKind::CloseBrace => "}",
323            TokenKind::OpenBracket => "[",
324            TokenKind::CloseBracket => "]",
325
326            TokenKind::FatArrow => "=>",
327            TokenKind::SlimArrow => "->",
328            TokenKind::Semi => ";",
329            TokenKind::GreekQuestionMark => "GreekQuestionMark(;)",
330            TokenKind::Colon => ":",
331            TokenKind::Comma => ",",
332            TokenKind::Dot => ".",
333            TokenKind::DotDot => "..",
334            TokenKind::PathSeparator => "::",
335            TokenKind::SingleQuote => "'",
336
337            TokenKind::Hash => "#",
338            TokenKind::Dollar => "$",
339
340            TokenKind::Eof => "end of file",
341
342            TokenKind::String(_) => "string",
343
344            TokenKind::OutsideDocumentation(_) => "///",
345            TokenKind::InsideDocumentation(_) => "//!",
346
347            TokenKind::Whitespace => "whitespace",
348            TokenKind::Comment => "comment",
349
350            TokenKind::BlockCommentStart => "/*",
351            TokenKind::BlockCommentEnd => "*/",
352        }
353    }
354
355    pub fn is_identifier(&self) -> bool {
356        matches!(self, TokenKind::Identifier(_))
357    }
358
359    pub fn is_string(&self) -> bool {
360        matches!(self, TokenKind::String(_))
361    }
362
363    pub fn is_integer(&self) -> bool {
364        matches!(
365            self,
366            TokenKind::Integer(_) | TokenKind::HexInteger(_) | TokenKind::BinInteger(_)
367        )
368    }
369
370    pub fn as_biguint(&self) -> Option<BigUint> {
371        match self {
372            TokenKind::Integer((i, _))
373            | TokenKind::HexInteger((i, _))
374            | TokenKind::BinInteger((i, _)) => Some(i.clone()),
375            _ => None,
376        }
377    }
378}
379
#[cfg(test)]
mod tests {
    use spade_common::num_ext::InfallibleToBigUint;

    use super::*;

    /// Builds the payload of an integer token from a plain value and a kind.
    fn int_payload(value: u32, kind: LiteralKind) -> (BigUint, LiteralKind) {
        (value.to_biguint(), kind)
    }

    #[test]
    fn identifiers_work() {
        let mut tokens = TokenKind::lexer("abc123_");
        let expected = TokenKind::Identifier("abc123_".to_string());

        assert_eq!(tokens.next(), Some(Ok(expected)));
    }

    #[test]
    fn integer_literals_work() {
        let mut tokens = TokenKind::lexer("123");
        let expected = TokenKind::Integer(int_payload(123, LiteralKind::Unsized));

        assert_eq!(tokens.next(), Some(Ok(expected)));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn sized_uint_integer_literals_work() {
        let mut tokens = TokenKind::lexer("123u3");
        let size = LiteralKind::Unsigned(3u32.to_biguint());
        let expected = TokenKind::Integer(int_payload(123, size));

        assert_eq!(tokens.next(), Some(Ok(expected)));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn sized_int_integer_literals_work() {
        let mut tokens = TokenKind::lexer("123i3");
        let size = LiteralKind::Signed(3u32.to_biguint());
        let expected = TokenKind::Integer(int_payload(123, size));

        assert_eq!(tokens.next(), Some(Ok(expected)));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn hex_array() {
        let mut tokens = TokenKind::lexer("[0x45]");
        let element = TokenKind::HexInteger(int_payload(0x45, LiteralKind::Unsized));

        assert_eq!(tokens.next(), Some(Ok(TokenKind::OpenBracket)));
        assert_eq!(tokens.next(), Some(Ok(element)));
        assert_eq!(tokens.next(), Some(Ok(TokenKind::CloseBracket)));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn invalid_hex_is_not_hex() {
        // `g` is not a hex digit, so the input falls apart into the decimal
        // literal `0` followed by the identifier `xg`.
        let mut tokens = TokenKind::lexer("0xg");
        let zero = TokenKind::Integer(int_payload(0, LiteralKind::Unsized));
        let rest = TokenKind::Identifier("xg".to_string());

        assert_eq!(tokens.next(), Some(Ok(zero)));
        assert_eq!(tokens.next(), Some(Ok(rest)));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn doc_comments_slice_correctly() {
        let mut tokens = TokenKind::lexer("//! Hello\n///G'day");
        let inner = TokenKind::InsideDocumentation(" Hello".to_string());
        let outer = TokenKind::OutsideDocumentation("G'day".to_string());

        assert_eq!(tokens.next(), Some(Ok(inner)));
        assert_eq!(tokens.next(), Some(Ok(outer)));
        assert_eq!(tokens.next(), None);
    }
}
483}