Skip to main content

qala_compiler/
token.rs

//! tokens: the units the lexer produces and the parser consumes. [`TokenKind`]
//! is the classification (with a payload for literals and identifiers), and
//! [`Token`] pairs a kind with a [`Span`]. the stream always ends in an explicit
//! [`TokenKind::Eof`].
//!
//! the operator set is deliberately one variant per operator, even where two
//! operators share a prefix, so the Pratt parser can match a kind and look up
//! its precedence without re-inspecting source text.
9
10use crate::span::Span;
11
/// what a token is. literals and identifiers carry their decoded value; every
/// keyword, every operator, every piece of punctuation, and end of file are
/// nullary variants.
///
/// `PartialEq` but not `Eq`, because [`TokenKind::Float`] holds an `f64` and
/// `f64` is not `Eq`. that is fine; nothing needs `TokenKind: Eq`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- literals (payload carried) ----
    /// an integer literal, already parsed (decimal, `0x`, or `0b`; underscores
    /// stripped). stored as the non-negative magnitude; a leading `-` is a
    /// separate token the parser folds in.
    Int(i64),
    /// a float literal, already parsed (underscores stripped, exponent applied).
    Float(f64),
    /// a byte literal `b'X'`, the single byte it denotes.
    Byte(u8),
    /// a string literal with no interpolation, escapes already decoded.
    Str(String),
    /// the decoded text before the first interpolation in a string with
    /// interpolations (may be empty). followed by an [`TokenKind::InterpStart`].
    StrStart(String),
    /// the decoded text between one interpolation's `}` and the next one's `{`
    /// (may be empty). sits between an [`TokenKind::InterpEnd`] and an
    /// [`TokenKind::InterpStart`].
    StrMid(String),
    /// the decoded text after the last interpolation in a string (may be empty).
    /// follows an [`TokenKind::InterpEnd`] and closes the string.
    StrEnd(String),
    /// opens an embedded interpolation expression. between this and the matching
    /// [`TokenKind::InterpEnd`] the lexer emits ordinary tokens, so any
    /// expression (including a nested string) works inside `{ ... }`.
    InterpStart,
    /// closes the interpolation expression opened by [`TokenKind::InterpStart`].
    InterpEnd,
    /// an identifier: ASCII `[A-Za-z_][A-Za-z0-9_]*` that is not a keyword.
    Ident(String),

    // ---- keywords (reserved words and primitive type names) ----
    /// the `fn` keyword.
    Fn,
    /// the `let` keyword.
    Let,
    /// the `mut` keyword.
    Mut,
    /// the `if` keyword.
    If,
    /// the `else` keyword.
    Else,
    /// the `while` keyword.
    While,
    /// the `for` keyword.
    For,
    /// the `in` keyword.
    In,
    /// the `return` keyword.
    Return,
    /// the `break` keyword.
    Break,
    /// the `continue` keyword.
    Continue,
    /// the `defer` keyword.
    Defer,
    /// the `match` keyword.
    Match,
    /// the `struct` keyword.
    Struct,
    /// the `enum` keyword.
    Enum,
    /// the `interface` keyword.
    Interface,
    /// the `comptime` keyword.
    Comptime,
    /// the `is` keyword.
    Is,
    /// the `pure` keyword.
    Pure,
    /// the `io` keyword.
    Io,
    /// the `alloc` keyword.
    Alloc,
    /// the `panic` keyword.
    Panic,
    /// the `or` keyword.
    Or,
    /// the `self` keyword. named `SelfKw` because `Self` and `self` are reserved
    /// in Rust and cannot be used as identifiers here.
    SelfKw,
    /// the boolean literal `true`. lexed as a keyword, not an identifier.
    True,
    /// the boolean literal `false`. lexed as a keyword, not an identifier.
    False,
    /// the primitive type name `i64`.
    I64Ty,
    /// the primitive type name `f64`.
    F64Ty,
    /// the primitive type name `bool`.
    BoolTy,
    /// the primitive type name `str`.
    StrTy,
    /// the primitive type name `byte`.
    ByteTy,
    /// the primitive type name `void`.
    VoidTy,

    // ---- operators and punctuation (one variant per operator) ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `==`
    EqEq,
    /// `!=`
    BangEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `&&`
    AmpAmp,
    /// `||`
    PipePipe,
    /// `!`
    Bang,
    /// `=`
    Eq,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semi,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `->` (return-type arrow)
    Arrow,
    /// `=>` (match-arm arrow)
    FatArrow,
    /// `|>` (pipeline)
    PipeGt,
    /// `?` (postfix error propagation)
    Question,
    /// `..` (exclusive range)
    DotDot,
    /// `..=` (inclusive range)
    DotDotEq,

    /// end of the token stream.
    Eof,
}
161
162/// a token: its kind and the source span it covers.
163#[derive(Debug, Clone, PartialEq)]
164pub struct Token {
165    /// the classification and payload.
166    pub kind: TokenKind,
167    /// the source region this token spans, opening to closing byte inclusive.
168    pub span: Span,
169}
170
171impl Token {
172    /// build a token from a kind and a span.
173    pub fn new(kind: TokenKind, span: Span) -> Self {
174        Token { kind, span }
175    }
176}
177
178/// the keyword for an identifier string, or `None` if it is an ordinary
179/// identifier.
180///
181/// one `match` in one place: easy to audit against the language's reserved-word
182/// list, and the compiler turns a small string match into efficient code. the
183/// lexer carries no standard-library knowledge: names like `Result`, `Option`,
184/// `Ok`, `Err`, `Some`, `None`, and the built-in functions `print`, `println`,
185/// `len`, `push`, `pop`, `sqrt`, `abs`, `assert`, `type_of`, `open`, `close`,
186/// `map`, `filter`, `reduce` are NOT keywords; the resolver and type-checker
187/// know them, the scanner does not.
188pub fn keyword(ident: &str) -> Option<TokenKind> {
189    use TokenKind::*;
190    Some(match ident {
191        "fn" => Fn,
192        "let" => Let,
193        "mut" => Mut,
194        "if" => If,
195        "else" => Else,
196        "while" => While,
197        "for" => For,
198        "in" => In,
199        "return" => Return,
200        "break" => Break,
201        "continue" => Continue,
202        "defer" => Defer,
203        "match" => Match,
204        "struct" => Struct,
205        "enum" => Enum,
206        "interface" => Interface,
207        "comptime" => Comptime,
208        "is" => Is,
209        "pure" => Pure,
210        "io" => Io,
211        "alloc" => Alloc,
212        "panic" => Panic,
213        "or" => Or,
214        "self" => SelfKw,
215        "true" => True,
216        "false" => False,
217        // primitive type names are reserved too.
218        "i64" => I64Ty,
219        "f64" => F64Ty,
220        "bool" => BoolTy,
221        "str" => StrTy,
222        "byte" => ByteTy,
223        "void" => VoidTy,
224        _ => return None,
225    })
226}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// every reserved word maps to its dedicated keyword variant.
    #[test]
    fn reserved_words_lex_to_their_keyword_kind() {
        let cases: &[(&str, TokenKind)] = &[
            ("fn", TokenKind::Fn),
            ("let", TokenKind::Let),
            ("mut", TokenKind::Mut),
            ("if", TokenKind::If),
            ("else", TokenKind::Else),
            ("while", TokenKind::While),
            ("for", TokenKind::For),
            ("in", TokenKind::In),
            ("return", TokenKind::Return),
            ("break", TokenKind::Break),
            ("continue", TokenKind::Continue),
            ("defer", TokenKind::Defer),
            ("match", TokenKind::Match),
            ("struct", TokenKind::Struct),
            ("enum", TokenKind::Enum),
            ("interface", TokenKind::Interface),
            ("comptime", TokenKind::Comptime),
            ("is", TokenKind::Is),
            ("pure", TokenKind::Pure),
            ("io", TokenKind::Io),
            ("alloc", TokenKind::Alloc),
            ("panic", TokenKind::Panic),
            ("or", TokenKind::Or),
            ("self", TokenKind::SelfKw),
        ];
        for (src, expected) in cases {
            // compare by reference so the expected kind need not be cloned.
            assert_eq!(keyword(src).as_ref(), Some(expected), "keyword({src:?})");
        }
    }

    #[test]
    fn true_and_false_are_boolean_keyword_kinds_not_idents() {
        assert_eq!(keyword("true"), Some(TokenKind::True));
        assert_eq!(keyword("false"), Some(TokenKind::False));
    }

    #[test]
    fn primitive_type_names_are_keyword_kinds() {
        assert_eq!(keyword("i64"), Some(TokenKind::I64Ty));
        assert_eq!(keyword("f64"), Some(TokenKind::F64Ty));
        assert_eq!(keyword("bool"), Some(TokenKind::BoolTy));
        assert_eq!(keyword("str"), Some(TokenKind::StrTy));
        assert_eq!(keyword("byte"), Some(TokenKind::ByteTy));
        assert_eq!(keyword("void"), Some(TokenKind::VoidTy));
    }

    #[test]
    fn stdlib_and_result_family_names_are_not_keywords() {
        // the lexer carries no stdlib knowledge: these are ordinary identifiers.
        let not_keywords = [
            "Result", "Option", "Ok", "Err", "Some", "None", "print", "println", "len", "push",
            "pop", "sqrt", "abs", "assert", "type_of", "open", "close", "map", "filter", "reduce",
        ];
        for name in not_keywords {
            assert_eq!(keyword(name), None, "{name:?} must not be a keyword");
        }
    }

    #[test]
    fn ordinary_identifiers_are_not_keywords() {
        for name in ["foo", "_x", "_", "x1", "__", "fooBar", "Fn", "LET"] {
            assert_eq!(keyword(name), None, "{name:?} must not be a keyword");
        }
    }

    #[test]
    fn keyword_lookup_is_exact_and_case_sensitive() {
        // near-misses: empty input, stray whitespace, a keyword as a prefix,
        // wrong case, and type names the language does not reserve.
        for name in ["", "fn ", " fn", "fnn", "i32", "Self", "TRUE", "String"] {
            assert_eq!(keyword(name), None, "{name:?} must not be a keyword");
        }
    }

    #[test]
    fn a_token_pairs_a_kind_with_a_span() {
        let tok = Token::new(TokenKind::Plus, Span::new(3, 1));
        assert_eq!(tok.kind, TokenKind::Plus);
        assert_eq!(tok.span, Span::new(3, 1));
    }

    #[test]
    fn there_is_an_eof_kind() {
        // the lexer emits this at the end of every token stream.
        let eof = Token::new(TokenKind::Eof, Span::new(0, 0));
        assert_eq!(eof.kind, TokenKind::Eof);
    }

    #[test]
    fn literal_kinds_carry_their_payload() {
        assert_eq!(TokenKind::Int(42), TokenKind::Int(42));
        assert_ne!(TokenKind::Int(42), TokenKind::Int(43));
        assert_eq!(TokenKind::Byte(b'A'), TokenKind::Byte(65));
        assert_eq!(
            TokenKind::Str("abc".to_string()),
            TokenKind::Str("abc".to_string())
        );
        assert_eq!(
            TokenKind::StrStart("hi ".to_string()),
            TokenKind::StrStart("hi ".to_string())
        );
        // the same text in different string-piece kinds is a different token.
        assert_ne!(
            TokenKind::StrMid("x".to_string()),
            TokenKind::StrEnd("x".to_string())
        );
        assert_eq!(
            TokenKind::Ident("x".to_string()),
            TokenKind::Ident("x".to_string())
        );
        // float carries an f64; PartialEq works, Eq is intentionally absent.
        assert_eq!(TokenKind::Float(1.5), TokenKind::Float(1.5));
        // NaN never equals itself, which is exactly why `Eq` cannot be derived.
        assert_ne!(TokenKind::Float(f64::NAN), TokenKind::Float(f64::NAN));
    }
}
330        );
331        // float carries an f64; PartialEq works, Eq is intentionally absent.
332        assert_eq!(TokenKind::Float(1.5), TokenKind::Float(1.5));
333    }
334}