spade_parser/
lexer.rs

1use logos::Logos;
2
3use num::BigUint;
4
/// Size annotation carried by an integer literal, taken from its optional
/// `u<N>` / `i<N>` suffix.
#[derive(Debug, PartialEq, Clone)]
pub enum LiteralKind {
    /// The literal has no `u`/`i` suffix.
    Unsized,
    /// The literal has an `i<N>` suffix; the payload is the decimal number `<N>`
    /// (presumably a bit width -- confirm against the consumers of this type).
    Signed(BigUint),
    /// The literal has a `u<N>` suffix; the payload is the decimal number `<N>`
    /// (presumably a bit width -- confirm against the consumers of this type).
    Unsigned(BigUint),
}
11
12fn parse_int(slice: &str, radix: u32) -> (BigUint, LiteralKind) {
13    let lower = slice.to_ascii_lowercase().replace(['_'], "");
14
15    let (cleaned, kind) = if lower.contains("u") {
16        let split = lower.split("u").collect::<Vec<_>>();
17        let kind = LiteralKind::Unsigned(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
18        (split[0], kind)
19    } else if lower.contains("i") {
20        let split = lower.split("i").collect::<Vec<_>>();
21        let kind = LiteralKind::Signed(BigUint::parse_bytes(split[1].as_bytes(), 10).unwrap());
22        (split[0], kind)
23    } else {
24        (lower.as_str(), LiteralKind::Unsized)
25    };
26
27    (
28        BigUint::parse_bytes(cleaned.as_bytes(), radix).unwrap(),
29        kind,
30    )
31}
32
33#[derive(Logos, Debug, PartialEq, Clone)]
34pub enum TokenKind {
35    // Unholy regex for unicode identifiers. Stolen from Repnop who stole it from Evrey
36    #[regex(r#"(?x:
37        [\p{XID_Start}_]
38        \p{XID_Continue}*
39        (\u{3F} | \u{21} | (\u{3F}\u{21}) | \u{2048})? # ? ! ?! ⁈
40    )"#, |lex| lex.slice().to_string())]
41    Identifier(String),
42
43    #[regex(r"[0-9][0-9_]*([uUiI][0-9]+)?", |lex| {
44        parse_int(lex.slice(), 10)
45    })]
46    Integer((BigUint, LiteralKind)),
47    #[regex(r"0x[0-9A-Fa-f][0-9_A-Fa-f]*([uUiI][0-9]+)?", |lex| {
48        parse_int(&lex.slice()[2..], 16)
49    })]
50    HexInteger((BigUint, LiteralKind)),
51    #[regex(r"0b[0-1][0-1_]*([uUiI][0-9]+)?", |lex| {
52        parse_int(&lex.slice()[2..], 2)
53    })]
54    BinInteger((BigUint, LiteralKind)),
55
56    #[token("true")]
57    True,
58    #[token("false")]
59    False,
60
61    #[token("LOW")]
62    Low,
63    #[token("HIGH")]
64    High,
65    #[token("HIGHIMP")]
66    HighImp,
67
68    // Keywords
69    #[token("reg")]
70    Reg,
71    #[token("let")]
72    Let,
73    #[token("decl")]
74    Decl,
75    #[token("inst")]
76    Instance,
77    #[token("reset")]
78    Reset,
79    #[token("initial")]
80    Initial,
81    #[token("if")]
82    If,
83    #[token("else")]
84    Else,
85    #[token("match")]
86    Match,
87    #[token("set")]
88    Set,
89
90    #[token("pipeline")]
91    Pipeline,
92    #[token("stage")]
93    Stage,
94    #[token("entity")]
95    Entity,
96    #[token("trait")]
97    Trait,
98    #[token("impl")]
99    Impl,
100    #[token("for")]
101    For,
102    #[token("fn")]
103    Function,
104    #[token("enum")]
105    Enum,
106    #[token("struct")]
107    Struct,
108    #[token("port")]
109    Port,
110    #[token("mod")]
111    Mod,
112    #[token("use")]
113    Use,
114    #[token("as")]
115    As,
116    #[token("assert")]
117    Assert,
118    #[token("mut")]
119    Mut,
120    #[token("inv")]
121    Inv,
122    #[token("where")]
123    Where,
124
125    #[token("gen")]
126    Gen,
127
128    #[token("extern")]
129    Extern,
130
131    // Math operators
132    #[token("+")]
133    Plus,
134    #[token("-")]
135    Minus,
136    #[token("*")]
137    Asterisk,
138    #[token("/")]
139    Slash,
140    #[token("%")]
141    Percentage,
142    #[token("==")]
143    Equals,
144    #[token("!=")]
145    NotEquals,
146    #[token("<")]
147    Lt,
148    #[token(">")]
149    Gt,
150    #[token("<=")]
151    Le,
152    #[token(">=")]
153    Ge,
154    #[token(">>>")]
155    ArithmeticRightShift,
156    #[token(">>")]
157    RightShift,
158    #[token("<<")]
159    LeftShift,
160    #[token("||")]
161    LogicalOr,
162    #[token("&&")]
163    LogicalAnd,
164    #[token("^^")]
165    LogicalXor,
166    #[token("&")]
167    Ampersand,
168    #[token("|")]
169    BitwiseOr,
170    #[token("!")]
171    Not,
172    #[token("^")]
173    BitwiseXor,
174    #[token("~")]
175    Tilde,
176    #[token("`")]
177    InfixOperatorSeparator,
178    #[token("'")]
179    SingleQuote,
180
181    // Other operators
182    #[token("=")]
183    Assignment,
184
185    #[token("(")]
186    OpenParen,
187    #[token(")")]
188    CloseParen,
189
190    #[token("{")]
191    OpenBrace,
192    #[token("}")]
193    CloseBrace,
194
195    #[token("[")]
196    OpenBracket,
197    #[token("]")]
198    CloseBracket,
199
200    #[token("=>")]
201    FatArrow,
202    #[token("->")]
203    SlimArrow,
204    #[token(",")]
205    Comma,
206    #[token(".")]
207    Dot,
208    #[token("..")]
209    DotDot,
210    #[token(";")]
211    Semi,
212    #[token(";")]
213    GreekQuestionMark,
214    #[token(":")]
215    Colon,
216    #[token("::")]
217    PathSeparator,
218    #[token("#")]
219    Hash,
220    #[token("$")]
221    Dollar,
222
223    #[regex(r#""[^"]*""#, |lex| lex.slice().replace("\"", ""))]
224    String(String),
225
226    #[regex("///[^\n]*", |lex| lex.slice()[3..].to_string())]
227    OutsideDocumentation(String),
228    #[regex("//![^\n]*", |lex| lex.slice()[3..].to_string())]
229    InsideDocumentation(String),
230
231    /// Ignoring whitespace
232    #[regex("[ \t\n\r]", logos::skip)]
233    Whitespace,
234
235    #[regex("//[^\n]*", logos::skip)]
236    Comment,
237
238    #[token("/*")]
239    BlockCommentStart,
240    #[token("*/")]
241    BlockCommentEnd,
242
243    Eof,
244}
245
246impl TokenKind {
247    pub fn as_str(&self) -> &'static str {
248        match self {
249            TokenKind::Identifier(_) => "identifier",
250            TokenKind::Integer(_) => "integer",
251            TokenKind::HexInteger(_) => "hexadecimal integer",
252            TokenKind::BinInteger(_) => "binary integer",
253            TokenKind::True => "true",
254            TokenKind::False => "false",
255            TokenKind::Low => "LOW",
256            TokenKind::High => "HIGH",
257            TokenKind::HighImp => "HIGHIMP",
258
259            TokenKind::Let => "let",
260            TokenKind::Reg => "reg",
261            TokenKind::Decl => "decl",
262            TokenKind::Entity => "entity",
263            TokenKind::Pipeline => "pipeline",
264            TokenKind::Stage => "stage",
265            TokenKind::Instance => "inst",
266            TokenKind::Reset => "reset",
267            TokenKind::Initial => "initial",
268            TokenKind::If => "if",
269            TokenKind::Else => "else",
270            TokenKind::Match => "match",
271            TokenKind::Impl => "impl",
272            TokenKind::Trait => "trait",
273            TokenKind::For => "for",
274            TokenKind::Function => "fn",
275            TokenKind::Enum => "enum",
276            TokenKind::Struct => "struct",
277            TokenKind::Port => "port",
278            TokenKind::Mod => "mod",
279            TokenKind::As => "as",
280            TokenKind::Use => "use",
281            TokenKind::Assert => "assert",
282            TokenKind::Set => "set",
283            TokenKind::Mut => "mut",
284            TokenKind::Inv => "inv",
285            TokenKind::Where => "where",
286
287            TokenKind::Gen => "gen",
288
289            TokenKind::Extern => "extern",
290
291            TokenKind::Assignment => "=",
292            TokenKind::Plus => "+",
293            TokenKind::Minus => "-",
294            TokenKind::Asterisk => "*",
295            TokenKind::Slash => "/",
296            TokenKind::Percentage => "%",
297            TokenKind::Equals => "==",
298            TokenKind::NotEquals => "!=",
299            TokenKind::Lt => "<",
300            TokenKind::Gt => ">",
301            TokenKind::Le => "<=",
302            TokenKind::Ge => ">=",
303            TokenKind::LeftShift => "<<",
304            TokenKind::RightShift => ">>",
305            TokenKind::ArithmeticRightShift => ">>>",
306            TokenKind::LogicalOr => "||",
307            TokenKind::LogicalAnd => "&&",
308            TokenKind::LogicalXor => "^^",
309            TokenKind::Ampersand => "&",
310            TokenKind::BitwiseOr => "|",
311            TokenKind::Not => "!",
312            TokenKind::Tilde => "~",
313            TokenKind::BitwiseXor => "^",
314            TokenKind::InfixOperatorSeparator => "`",
315
316            TokenKind::OpenParen => "(",
317            TokenKind::CloseParen => ")",
318            TokenKind::OpenBrace => "{",
319            TokenKind::CloseBrace => "}",
320            TokenKind::OpenBracket => "[",
321            TokenKind::CloseBracket => "]",
322
323            TokenKind::FatArrow => "=>",
324            TokenKind::SlimArrow => "->",
325            TokenKind::Semi => ";",
326            TokenKind::GreekQuestionMark => "GreekQuestionMark(;)",
327            TokenKind::Colon => ":",
328            TokenKind::Comma => ",",
329            TokenKind::Dot => ".",
330            TokenKind::DotDot => "..",
331            TokenKind::PathSeparator => "::",
332            TokenKind::SingleQuote => "'",
333
334            TokenKind::Hash => "#",
335            TokenKind::Dollar => "$",
336
337            TokenKind::Eof => "end of file",
338
339            TokenKind::String(_) => "string",
340
341            TokenKind::OutsideDocumentation(_) => "///",
342            TokenKind::InsideDocumentation(_) => "//!",
343
344            TokenKind::Whitespace => "whitespace",
345            TokenKind::Comment => "comment",
346
347            TokenKind::BlockCommentStart => "/*",
348            TokenKind::BlockCommentEnd => "*/",
349        }
350    }
351
352    pub fn is_identifier(&self) -> bool {
353        matches!(self, TokenKind::Identifier(_))
354    }
355    pub fn is_integer(&self) -> bool {
356        matches!(
357            self,
358            TokenKind::Integer(_) | TokenKind::HexInteger(_) | TokenKind::BinInteger(_)
359        )
360    }
361
362    pub fn as_biguint(&self) -> Option<BigUint> {
363        match self {
364            TokenKind::Integer((i, _))
365            | TokenKind::HexInteger((i, _))
366            | TokenKind::BinInteger((i, _)) => Some(i.clone()),
367            _ => None,
368        }
369    }
370}
371
#[cfg(test)]
mod tests {
    use spade_common::num_ext::InfallibleToBigUint;

    use super::*;

    /// Lexes `source` and asserts that it yields exactly the tokens in
    /// `expected`, in order, followed by the end of the input.
    fn assert_lexes_to(source: &str, expected: Vec<TokenKind>) {
        let mut lexer = TokenKind::lexer(source);
        for token in expected {
            assert_eq!(lexer.next(), Some(Ok(token)));
        }
        assert_eq!(lexer.next(), None);
    }

    #[test]
    fn identifiers_work() {
        // Only the first token is inspected here.
        let first = TokenKind::lexer("abc123_").next();

        assert_eq!(
            first,
            Some(Ok(TokenKind::Identifier("abc123_".to_string())))
        );
    }

    #[test]
    fn integer_literals_work() {
        let value = 123_u32.to_biguint();

        assert_lexes_to(
            "123",
            vec![TokenKind::Integer((value, LiteralKind::Unsized))],
        );
    }

    #[test]
    fn sized_uint_integer_literals_work() {
        let value = 123_u32.to_biguint();
        let size = 3u32.to_biguint();

        assert_lexes_to(
            "123u3",
            vec![TokenKind::Integer((value, LiteralKind::Unsigned(size)))],
        );
    }

    #[test]
    fn sized_int_integer_literals_work() {
        let value = 123_u32.to_biguint();
        let size = 3u32.to_biguint();

        assert_lexes_to(
            "123i3",
            vec![TokenKind::Integer((value, LiteralKind::Signed(size)))],
        );
    }

    #[test]
    fn hex_array() {
        assert_lexes_to(
            "[0x45]",
            vec![
                TokenKind::OpenBracket,
                TokenKind::HexInteger((0x45_u32.to_biguint(), LiteralKind::Unsized)),
                TokenKind::CloseBracket,
            ],
        );
    }

    #[test]
    fn invalid_hex_is_not_hex() {
        // `g` is not a hex digit, so `0` lexes as a decimal integer and the
        // remainder as an identifier.
        assert_lexes_to(
            "0xg",
            vec![
                TokenKind::Integer((0_u32.to_biguint(), LiteralKind::Unsized)),
                TokenKind::Identifier("xg".to_string()),
            ],
        );
    }

    #[test]
    fn doc_comments_slice_correctly() {
        assert_lexes_to(
            "//! Hello\n///G'day",
            vec![
                TokenKind::InsideDocumentation(" Hello".to_string()),
                TokenKind::OutsideDocumentation("G'day".to_string()),
            ],
        );
    }
}
475}