qala_compiler/token.rs
1//! tokens: the units the lexer produces and the parser consumes. [`TokenKind`]
2//! is the classification (with a payload for literals and identifiers), [`Token`]
3//! pairs a kind with a [`Span`]. the stream always ends in an explicit
4//! [`TokenKind::Eof`].
5//!
6//! the operator set is deliberately one variant per operator, even where two
7//! operators share a prefix, so the Pratt parser can match a kind and look up
8//! its precedence without re-inspecting source text.
9
10use crate::span::Span;
11
/// the classification of a token. literal and identifier variants hold the
/// value the lexer already decoded; keywords, operators, punctuation, and the
/// end-of-file marker are plain unit variants.
///
/// derives `PartialEq` but not `Eq`: the `f64` inside [`TokenKind::Float`]
/// is not `Eq`, and nothing requires `TokenKind: Eq`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- literals (payload carried) ----
    /// integer literal, already parsed from decimal, `0x`, or `0b` notation
    /// with underscores removed. holds only the non-negative magnitude: a
    /// leading `-` is its own token, folded in later by the parser.
    Int(i64),
    /// float literal, already parsed: underscores removed and any exponent
    /// applied.
    Float(f64),
    /// byte literal `b'X'`; the payload is the one byte it stands for.
    Byte(u8),
    /// string literal that contains no interpolation; escapes are decoded.
    Str(String),
    /// for a string that has interpolations: the decoded text ahead of the
    /// first one (possibly empty). an [`TokenKind::InterpStart`] comes next.
    StrStart(String),
    /// decoded text lying between the `}` of one interpolation and the `{` of
    /// the next (possibly empty). appears after an [`TokenKind::InterpEnd`]
    /// and before an [`TokenKind::InterpStart`].
    StrMid(String),
    /// decoded text following the final interpolation (possibly empty). comes
    /// after an [`TokenKind::InterpEnd`] and terminates the string.
    StrEnd(String),
    /// start of an embedded interpolation expression. until the matching
    /// [`TokenKind::InterpEnd`] the lexer emits ordinary tokens, so `{ ... }`
    /// can hold any expression, a nested string included.
    InterpStart,
    /// end of the interpolation expression begun by
    /// [`TokenKind::InterpStart`].
    InterpEnd,
    /// identifier: ASCII `[A-Za-z_][A-Za-z0-9_]*` that is not a keyword.
    Ident(String),

    // ---- keywords (reserved words and primitive type names) ----
    /// `fn`
    Fn,
    /// `let`
    Let,
    /// `mut`
    Mut,
    /// `if`
    If,
    /// `else`
    Else,
    /// `while`
    While,
    /// `for`
    For,
    /// `in`
    In,
    /// `return`
    Return,
    /// `break`
    Break,
    /// `continue`
    Continue,
    /// `defer`
    Defer,
    /// `match`
    Match,
    /// `struct`
    Struct,
    /// `enum`
    Enum,
    /// `interface`
    Interface,
    /// `comptime`
    Comptime,
    /// `is`
    Is,
    /// `pure`
    Pure,
    /// `io`
    Io,
    /// `alloc`
    Alloc,
    /// `panic`
    Panic,
    /// `or`
    Or,
    /// `self`. spelled `SelfKw` since Rust reserves both `Self` and `self`,
    /// so neither can serve as the variant name.
    SelfKw,
    /// `true`, the boolean literal; a keyword rather than an identifier.
    True,
    /// `false`, the boolean literal; a keyword rather than an identifier.
    False,
    /// `i64`, a primitive type name.
    I64Ty,
    /// `f64`, a primitive type name.
    F64Ty,
    /// `bool`, a primitive type name.
    BoolTy,
    /// `str`, a primitive type name.
    StrTy,
    /// `byte`, a primitive type name.
    ByteTy,
    /// `void`, a primitive type name.
    VoidTy,

    // ---- operators and punctuation (one variant per operator) ----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `==`
    EqEq,
    /// `!=`
    BangEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `&&`
    AmpAmp,
    /// `||`
    PipePipe,
    /// `!`
    Bang,
    /// `=`
    Eq,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `;`
    Semi,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `->`, the return-type arrow.
    Arrow,
    /// `=>`, the match-arm arrow.
    FatArrow,
    /// `|>`, the pipeline operator.
    PipeGt,
    /// `?`, postfix error propagation.
    Question,
    /// `..`, the exclusive range.
    DotDot,
    /// `..=`, the inclusive range.
    DotDotEq,

    /// marks the end of the token stream.
    Eof,
}
161
162/// a token: its kind and the source span it covers.
163#[derive(Debug, Clone, PartialEq)]
164pub struct Token {
165 /// the classification and payload.
166 pub kind: TokenKind,
167 /// the source region this token spans, opening to closing byte inclusive.
168 pub span: Span,
169}
170
171impl Token {
172 /// build a token from a kind and a span.
173 pub fn new(kind: TokenKind, span: Span) -> Self {
174 Token { kind, span }
175 }
176}
177
178/// the keyword for an identifier string, or `None` if it is an ordinary
179/// identifier.
180///
181/// one `match` in one place: easy to audit against the language's reserved-word
182/// list, and the compiler turns a small string match into efficient code. the
183/// lexer carries no standard-library knowledge: names like `Result`, `Option`,
184/// `Ok`, `Err`, `Some`, `None`, and the built-in functions `print`, `println`,
185/// `len`, `push`, `pop`, `sqrt`, `abs`, `assert`, `type_of`, `open`, `close`,
186/// `map`, `filter`, `reduce` are NOT keywords; the resolver and type-checker
187/// know them, the scanner does not.
188pub fn keyword(ident: &str) -> Option<TokenKind> {
189 use TokenKind::*;
190 Some(match ident {
191 "fn" => Fn,
192 "let" => Let,
193 "mut" => Mut,
194 "if" => If,
195 "else" => Else,
196 "while" => While,
197 "for" => For,
198 "in" => In,
199 "return" => Return,
200 "break" => Break,
201 "continue" => Continue,
202 "defer" => Defer,
203 "match" => Match,
204 "struct" => Struct,
205 "enum" => Enum,
206 "interface" => Interface,
207 "comptime" => Comptime,
208 "is" => Is,
209 "pure" => Pure,
210 "io" => Io,
211 "alloc" => Alloc,
212 "panic" => Panic,
213 "or" => Or,
214 "self" => SelfKw,
215 "true" => True,
216 "false" => False,
217 // primitive type names are reserved too.
218 "i64" => I64Ty,
219 "f64" => F64Ty,
220 "bool" => BoolTy,
221 "str" => StrTy,
222 "byte" => ByteTy,
223 "void" => VoidTy,
224 _ => return None,
225 })
226}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// asserts that [`keyword`] yields `None` for each of `names`.
    fn assert_none_are_keywords(names: &[&str]) {
        for &name in names {
            assert_eq!(keyword(name), None, "{name:?} must not be a keyword");
        }
    }

    #[test]
    fn reserved_words_lex_to_their_keyword_kind() {
        // source spelling paired with the kind `keyword` must return for it.
        let cases = [
            ("fn", TokenKind::Fn),
            ("let", TokenKind::Let),
            ("mut", TokenKind::Mut),
            ("if", TokenKind::If),
            ("else", TokenKind::Else),
            ("while", TokenKind::While),
            ("for", TokenKind::For),
            ("in", TokenKind::In),
            ("return", TokenKind::Return),
            ("break", TokenKind::Break),
            ("continue", TokenKind::Continue),
            ("defer", TokenKind::Defer),
            ("match", TokenKind::Match),
            ("struct", TokenKind::Struct),
            ("enum", TokenKind::Enum),
            ("interface", TokenKind::Interface),
            ("comptime", TokenKind::Comptime),
            ("is", TokenKind::Is),
            ("pure", TokenKind::Pure),
            ("io", TokenKind::Io),
            ("alloc", TokenKind::Alloc),
            ("panic", TokenKind::Panic),
            ("or", TokenKind::Or),
            ("self", TokenKind::SelfKw),
        ];
        for (src, expected) in cases {
            assert_eq!(keyword(src), Some(expected), "keyword({src:?})");
        }
    }

    #[test]
    fn true_and_false_are_boolean_keyword_kinds_not_idents() {
        let truthy = keyword("true");
        let falsy = keyword("false");
        assert_eq!(truthy, Some(TokenKind::True));
        assert_eq!(falsy, Some(TokenKind::False));
    }

    #[test]
    fn primitive_type_names_are_keyword_kinds() {
        assert_eq!(keyword("i64"), Some(TokenKind::I64Ty));
        assert_eq!(keyword("f64"), Some(TokenKind::F64Ty));
        assert_eq!(keyword("bool"), Some(TokenKind::BoolTy));
        assert_eq!(keyword("str"), Some(TokenKind::StrTy));
        assert_eq!(keyword("byte"), Some(TokenKind::ByteTy));
        assert_eq!(keyword("void"), Some(TokenKind::VoidTy));
    }

    #[test]
    fn stdlib_and_result_family_names_are_not_keywords() {
        // no stdlib knowledge in the lexer: all of these are plain identifiers.
        assert_none_are_keywords(&[
            "Result", "Option", "Ok", "Err", "Some", "None", "print", "println", "len", "push",
            "pop", "sqrt", "abs", "assert", "type_of", "open", "close", "map", "filter", "reduce",
        ]);
    }

    #[test]
    fn ordinary_identifiers_are_not_keywords() {
        assert_none_are_keywords(&["foo", "_x", "_", "x1", "__", "fooBar", "Fn", "LET"]);
    }

    #[test]
    fn a_token_pairs_a_kind_with_a_span() {
        let plus = Token::new(TokenKind::Plus, Span::new(3, 1));
        assert_eq!(plus.kind, TokenKind::Plus);
        assert_eq!(plus.span, Span::new(3, 1));
    }

    #[test]
    fn there_is_an_eof_kind() {
        // every token stream the lexer produces ends with this kind.
        let end = Token::new(TokenKind::Eof, Span::new(0, 0));
        assert_eq!(end.kind, TokenKind::Eof);
    }

    #[test]
    fn literal_kinds_carry_their_payload() {
        // integers: equal payloads compare equal, different ones do not.
        assert_eq!(TokenKind::Int(42), TokenKind::Int(42));
        assert_ne!(TokenKind::Int(42), TokenKind::Int(43));

        // a byte literal is just its numeric value.
        assert_eq!(TokenKind::Byte(b'A'), TokenKind::Byte(65));

        // string-carrying kinds compare by their text.
        assert_eq!(
            TokenKind::Str("abc".to_string()),
            TokenKind::Str("abc".to_string())
        );
        assert_eq!(
            TokenKind::StrStart("hi ".to_string()),
            TokenKind::StrStart("hi ".to_string())
        );
        assert_eq!(
            TokenKind::Ident("x".to_string()),
            TokenKind::Ident("x".to_string())
        );

        // the float payload is an f64: PartialEq is enough, Eq is left out on
        // purpose.
        assert_eq!(TokenKind::Float(1.5), TokenKind::Float(1.5));
    }
}