qbe_parser/lexer/
tokens.rs

1use std::fmt::{Display, Formatter};
2use std::str::FromStr;
3use std::sync::OnceLock;
4
5use chumsky::prelude::*;
6
7use crate::ast::{
8    BlockName, FloatLiteral, GlobalName, Ident, NumericLiteral, Span, Spanned, StringLiteral,
9    TemporaryName, TypeName,
10};
11
/// A single token produced by the lexer.
///
/// Variants are grouped into identifiers, literals, "other" tokens
/// (keywords, short type specs, operators and delimiters) and "magic" tokens.
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub enum Token {
    //
    // identifiers
    //
    /// Wraps an [`Ident`].
    Ident(Ident),
    /// Wraps a [`TypeName`].
    TypeName(TypeName),
    /// Wraps a [`GlobalName`].
    GlobalName(GlobalName),
    /// Wraps a [`TemporaryName`].
    TemporaryName(TemporaryName),
    /// Wraps a [`BlockName`].
    BlockName(BlockName),
    //
    // literals
    //
    /// Wraps a [`StringLiteral`].
    StringLiteral(StringLiteral),
    /// An unsigned numeric literal, stored as a `u64`.
    Number(NumericLiteral<u64>),
    /// A numeric literal stored as an `i128`.
    Integer(NumericLiteral<i128>),
    /// Wraps a [`FloatLiteral`].
    Float(FloatLiteral),
    //
    // other
    //
    /// One of the [`Keyword`] values.
    Keyword(Keyword),
    /// One of the [`ShortTypeSpec`] values.
    ShortTypeSpec(ShortTypeSpec),
    /// One of the [`Operator`] values.
    Operator(Operator),
    // delimiters
    /// The `{` delimiter.
    OpenBrace,
    /// The `}` delimiter.
    CloseBrace,
    /// The `(` delimiter.
    OpenParen,
    /// The `)` delimiter.
    CloseParen,
    //
    // magic
    //
    /// Represents a series of one or more newlines.
    ///
    /// The QBE IR uses newlines to separate instructions,
    /// and restricts the usage of newlines in other locations.
    Newline,
}
49impl Token {
50    #[inline]
51    pub(crate) fn number<'a>() -> impl TokenParser<'a, NumericLiteral<u64>> {
52        select!(Token::Number(num) => num).labelled("number (unsigned)")
53    }
54}
// Generates the inherent helpers (`span`, `is_magic`, `text`) and the `Display` impl
// for `Token`, given every variant sorted into one of four categories:
//
// - `complex`: payload variants whose payload has `span()` and implements `Display`
// - `wraps_enum`: payload variants displayed through the payload, without a span
// - `simple`: unit variants displayed as a fixed literal
// - `magic`: unit variants with separate raw text and human-readable description
macro_rules! token_impls {
    (
        complex { $($complex:ident),+ $(,)? },
        wraps_enum { $($wrapper:ident),+ $(,)? },
        simple { $($simple:ident  => $shown:literal),+ $(,)? },
        magic { $($magic:ident => $raw:literal as $described:literal),+ $(,)? } $(,)?
    )
    => {
        impl Token {
            /// The span stored in the token's payload, if the payload carries one.
            ///
            /// Only the "complex" variants have a span; every other variant yields `None`.
            pub fn span(&self) -> Option<Span> {
                // Listing a variant in two categories must be a hard error.
                #[deny(unreachable_patterns)]
                match self {
                    $(Self::$complex(payload) => Some(payload.span()),)+
                    $(Self::$wrapper(_))|+ => None,
                    $(Self::$simple)|+ => None,
                    $(Self::$magic)|+ => None,
                }
            }
            /// If the token is "magic" and has special behavior.
            #[inline]
            pub fn is_magic(&self) -> bool {
                matches!(self, $(Self::$magic)|+)
            }
            /// The text of the token.
            ///
            /// This will differ from the [Display] impl only if the token is [magic](Self::is_magic).
            #[inline]
            pub fn text(&self) -> impl Display + '_ {
                // Adapter that prints the raw text of magic tokens
                // and defers to `Token`'s own `Display` for everything else.
                struct RawText<'a>(&'a Token);
                impl Display for RawText<'_> {
                    fn fmt(&self, out: &mut Formatter<'_>) -> std::fmt::Result {
                        match self.0 {
                            $(Token::$magic => out.write_str($raw),)+
                            ordinary => {
                                assert!(!ordinary.is_magic());
                                write!(out, "{}", ordinary)
                            }
                        }
                    }
                }
                RawText(self)
            }
        }
        /// Display a human-readable description of the token.
        ///
        /// This is either the text of the token or `<desc>` if the token is magic.
        /// Use [`Token::text`] if you want the actual text of the token.
        impl Display for Token {
            fn fmt(&self, out: &mut Formatter<'_>) -> std::fmt::Result {
                match self {
                    $(Self::$complex(payload) => Display::fmt(payload, out),)+
                    $(Self::$wrapper(payload) => Display::fmt(payload, out),)+
                    $(Self::$simple => Display::fmt(&$shown, out),)+
                    $(Self::$magic => out.write_str($described),)+
                }
            }
        }
    }
}
// Instantiates the `Token` helpers (`span`, `is_magic`, `text`) and its `Display` impl,
// sorting every variant of `Token` into one of four categories.
token_impls! {
    // Variants whose payload supplies both `span()` and `Display`.
    complex {
        Ident,
        TypeName,
        GlobalName,
        TemporaryName,
        BlockName,
        StringLiteral,
        Number,
        Integer,
        Float,
    },
    // Variants wrapping a plain enum: displayed through the payload, `span()` is `None`.
    wraps_enum {
        Keyword,
        Operator,
        ShortTypeSpec,
    },
    // Unit variants displayed as a fixed character.
    simple {
        OpenBrace => '{',
        CloseBrace => '}',
        OpenParen => '(',
        CloseParen => ')',
    },
    // Unit variants whose raw text (left) differs from the description used by `Display` (right).
    magic {
        Newline => "\n" as "<newline>",
    }
}
// For each listed type `T` (which must also be the name of a `Token` variant holding a `T`),
// implements `From<T>` for both `Token` and `Spanned<Token>`.
//
// The spanned conversion takes its span from `T::span()`.
macro_rules! token_wrapper_from {
    ($($wrapped:ident),+ $(,)?) => {
        $(
            impl From<$wrapped> for Token {
                #[inline]
                fn from(inner: $wrapped) -> Self {
                    Self::$wrapped(inner)
                }
            }
            impl From<$wrapped> for Spanned<Token> {
                #[inline]
                fn from(inner: $wrapped) -> Self {
                    Self {
                        // Read the span before the payload is moved into the token.
                        span: inner.span(),
                        value: Token::$wrapped(inner),
                    }
                }
            }
        )+
    };
}
// `From` conversions for every payload type whose name matches its `Token` variant.
token_wrapper_from! {
    Ident,
    TypeName,
    GlobalName,
    TemporaryName,
    BlockName,
    StringLiteral,
}
172impl From<FloatLiteral> for Token {
173    fn from(value: FloatLiteral) -> Self {
174        Token::Float(value)
175    }
176}
177impl From<FloatLiteral> for Spanned<Token> {
178    fn from(value: FloatLiteral) -> Self {
179        Spanned {
180            span: value.span(),
181            value: Token::Float(value),
182        }
183    }
184}
// Mentions `Token::span` from inside an otherwise unused constant.
// NOTE(review): presumably this keeps the compiler from reporting `Token::span`
// as dead code when nothing else in the crate calls it — confirm.
#[allow(unused)]
const _TOKEN_USED: () = {
    let _ = Token::span;
};
189
// Shorthand over `define_string_enum!` for enums whose text is simply the
// variant name converted to snake case (so `Align` is written `align`).
//
// Every `Variant` is forwarded as `Variant(variant)`.
macro_rules! define_keyword_enum {
    ($vis:vis enum $name:ident { $($variant:ident),+ $(,)? }) => {
        paste3::paste! {
            define_string_enum!($vis enum $name {
                $($variant([<$variant:snake>])),*
            });
        }
    };
}
// Generates a `Copy` enum whose variants each stand for one fixed piece of source text.
//
// The primary form takes `Variant(tokens)` entries. It declares a helper macro named after
// the enum in snake case (e.g. `operator!(=)`) that maps each token sequence to the path of
// the matching variant, then delegates to the `@nomacro` form using the `stringify!`-ed
// tokens as the text.
//
// The `@nomacro` form emits the enum itself (carrying any attributes that were supplied),
// the `ALL`/`COUNT` constants, `text`, `to_token`, `parser`, `text_parser`, and the `From`,
// `FromStr` and `Display` impls. `FromStr::Err` is `Invalid<Enum>Error`, which has to be
// declared separately.
macro_rules! define_string_enum {
    ($(#[$emeta:meta])* enum $target:ident {
        $($kw:ident ( $($inner:tt)* )),+ $(,)?
    }) => {
        paste3::paste! {
            macro_rules! [<$target:snake>] {
                $(($($inner)*) => ($crate::lexer::$target::$kw);)*
            }
        }
        define_string_enum!(@nomacro $(#[$emeta])* enum $target {
            $($kw => stringify!($($inner)*)),*
        });
    };
    (@nomacro $(#[$emeta:meta])* enum $target:ident {
        // TODO: The `=>` should be neither `+`, nor `?`, but exactly once
        $($kw:ident => $text:expr),+ $(,)?
    }) => {
        // Forward caller-supplied attributes (notably doc comments) onto the enum.
        $(#[$emeta])*
        #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
        pub enum $target {
            $($kw),*
        }
        impl $target {
            /// Every variant, in declaration order.
            pub const ALL: [Self; Self::COUNT] = [$(Self::$kw),*];
            /// The number of variants.
            pub const COUNT: usize = define_string_enum!(@count $($kw),*);
            /// The exact source text of this value.
            #[inline]
            pub fn text(self) -> &'static str {
                match self {
                    $(Self::$kw => $text),*
                }
            }
            /// Wraps this value in the [`Token`] variant named after the enum.
            #[inline]
            pub fn to_token(self) -> Token {
                Token::$target(self)
            }
            /// Parses this token, returning the [`Span`] of the value.
            ///
            /// Equivalent to calling [`just`] with [`Self::to_token`].
            /// This gives superior error messages to using [`select!`].
            #[inline]
            pub(crate) fn parser<'a>(self) -> impl TokenParser<'a, Span> {
                just(self.to_token()).to_span()
            }
            /// Parses the text of any variant, yielding the variant that matched.
            ///
            /// Alternatives are tried longest text first, so a text that is a prefix
            /// of another (such as `s` and `sb`) cannot shadow the longer one.
            pub(super) fn text_parser<'a>() -> impl StringParser<'a, $target> {
                // TODO: Would be nice to cache the result directly,
                // but that module is currently unstable
                //
                // A nested `static`/`const` cannot name `Self`, hence the explicit type.
                const COUNT: usize = $target::COUNT;
                static SORTED_TOKENS: OnceLock<[$target; COUNT]> = OnceLock::new();
                let sorted_tokens: [$target; COUNT] = *SORTED_TOKENS.get_or_init(|| {
                    let mut tokens: [$target; COUNT] = Self::ALL;
                    tokens.sort_by_key(|tk| {
                        let text = tk.text();
                        // long tokens must always come before short tokens,
                        // then sort alphabetically
                        (::std::cmp::Reverse(text.len()), text)
                    });
                    tokens
                });
                let parsers = sorted_tokens.map(|token| just(token.text()).to(token));
                choice(parsers)
            }
        }
        impl From<$target> for Token {
            #[inline]
            fn from(value: $target) -> Self {
                value.to_token()
            }
        }
        impl From<$target> for Spanned<Token> {
            /// Wraps the value in a token whose span is [`Span::MISSING`].
            #[inline]
            fn from(value: $target) -> Self {
                Spanned {
                    value: value.to_token(),
                    span: Span::MISSING,
                }
            }
        }
        impl FromStr for $target {
            type Err = paste3::paste!([<Invalid $target Error>]);
            /// Succeeds only if `s` is exactly the text of one of the variants.
            fn from_str(s: &str) -> Result<Self, Self::Err> {
                match s {
                    $($text => Ok(Self::$kw),)*
                    _ => Err(paste3::paste!([<Invalid $target Error>])),
                }
            }
        }
        impl Display for $target {
            fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
                f.write_str(self.text())
            }
        }
    };
    (@text $kw:ident) => (paste3::paste!(stringify!([<$kw:lower>])));
    (@text $kw:ident => $text:expr) => ($text);
    (@text $kw:ident( $($inner:tt)* )) => (stringify!($($inner)*));
    // Counts a comma-separated list of identifiers by recursion.
    (@count) => (0);
    (@count $kw:ident) => (1);
    (@count $first:ident , $($kw:ident),*) => (1 + define_string_enum!(@count $($kw),*));
}
// The keywords recognised by the lexer.
//
// Each variant's text is its name converted to snake case (`Align` is written `align`),
// and the generated `keyword!` macro maps that text back to the variant, e.g. `keyword!(align)`.
define_keyword_enum!(
    enum Keyword {
        Align,
        Call,
        Data,
        Env,
        Export,
        Function,
        Hlt,
        Jmp,
        Jnz,
        Phi,
        Ret,
        Section,
        Thread,
        Type,
    }
);
// The one- and two-letter type abbreviations.
//
// The text of each variant is the token in parentheses; the generated `short_type_spec!`
// macro maps that text back to the variant, e.g. `short_type_spec!(w)`.
// Note that `s` is a prefix of both `sb` and `sh`.
define_string_enum!(
    enum ShortTypeSpec {
        // base types (BASETY)
        Word(w),
        Long(l),
        Single(s),
        Double(d),
        // extended types (EXTTY)
        Byte(b),
        Half(h),
        // Sub-word types (SUBWTY)
        SignedByte(sb),
        UnsignedByte(ub),
        SignedHalf(sh),
        UnsignedHalf(uh),
    }
);
// Mentions `ShortTypeSpec::parser` and the `short_type_spec!` macro from inside
// an otherwise unused constant.
// NOTE(review): presumably this silences unused-item warnings for the generated
// method and macro when nothing else in the crate uses them — confirm.
#[allow(unused)]
const _SHORT_TYPE_SPEC_USED: () = {
    let _ = ShortTypeSpec::parser;
    let _ = short_type_spec!(w);
};
define_string_enum!(
    /// Defines operators, the third main class of tokens besides [`Keyword`] and [`ShortTypeSpec`].
    ///
    /// # Symbols
    /// Most operators are "symbols", which have two special behaviors when lexing.
    ///
    /// First, if exactly one of two consecutive tokens is a symbol,
    /// then the spacing between the tokens can be omitted.
    /// This means that `type=`, `=w`, `foo,` will lex as `type =`, `= w`, `foo ,`.
    /// This behavior is by design and is described in the [spacing] section of the QBE reference.
    ///
    /// The only operator that this rule clearly forces to be a non-symbol is
    /// [`z`](Operator::ZeroInitMarker), as we don't want `zero` to parse as `z ero`.
    ///
    /// The second property is that symbols have higher lexer priority than all other tokens.
    /// This is an implementation detail that would require a fair deal of work to resolve.
    /// It means that `:` cannot be a symbol, as it could be confused with a type name.
    ///
    /// [spacing]: https://c9x.me/compile/doc/il.html#Spacing
    enum Operator {
        SingleEquals(=),
        // Not a symbol; see the enum documentation and `Operator::is_symbol`.
        Colon(:),
        Comma(,),
        // The plus operator is not considered a symbol as it could be confused with numbers.
        // This limitation would be straightforward to remove by special-casing digits.
        Plus(+),
        // TODO: Should this be an `Ident`?
        // Not a symbol; see the enum documentation and `Operator::is_symbol`.
        ZeroInitMarker(z),
        Ellipsis(...),
    }
);
370impl Operator {
371    /// Determine if this operator is a symbol for the purposes of lexing.
372    #[inline]
373    pub fn is_symbol(&self) -> bool {
374        !matches!(
375            self,
376            Operator::ZeroInitMarker | Operator::Colon | Operator::Plus
377        )
378    }
379}
/// The error returned by `Keyword`'s [`FromStr`] impl when the input is not the text of any keyword.
#[derive(thiserror::Error, Debug, Copy, Clone)]
#[error("Keyword is not valid")]
pub struct InvalidKeywordError;
383
/// The error returned by `ShortTypeSpec`'s [`FromStr`] impl when the input is not the text of any short type spec.
#[derive(thiserror::Error, Debug, Copy, Clone)]
#[error("Short type spec is not valid")]
pub struct InvalidShortTypeSpecError;
387
/// The error returned by `Operator`'s [`FromStr`] impl when the input is not the text of any operator.
#[derive(thiserror::Error, Debug, Copy, Clone)]
#[error("Operator is not valid")]
pub struct InvalidOperatorError;
391
392// re-export macros (this works?)
393use crate::lexer::{StringParser, TokenParser};
394pub(crate) use keyword;
395pub(crate) use operator;
396#[allow(unused_imports)]
397pub(crate) use short_type_spec;
398
#[cfg(test)]
mod test {
    use crate::lexer::Token;

    /// The generated `operator!`/`keyword!` macros name variants with the expected text.
    #[test]
    fn token_macros() {
        let equals = operator!(=);
        let align = keyword!(align);
        assert_eq!(equals.text(), "=");
        assert_eq!(align.text(), "align");
    }

    /// `Display` shows the literal text, except for magic tokens which show a description.
    #[test]
    fn display() {
        let cases = [
            (operator!(=).to_token(), "="),
            (operator!(,).to_token(), ","),
            (short_type_spec!(w).to_token(), "w"),
            (Token::Newline, "<newline>"),
        ];
        for (token, expected) in cases {
            assert_eq!(token.to_string(), expected);
        }
    }
}