kore/lexer.rs

//! KORE Lexer - Significant whitespace + Rust-like tokens
//!
//! Key features:
//! - Python-style indentation (INDENT/DEDENT tokens)
//! - Rust-style identifiers and literals
//! - JSX-style angle brackets for UI
//! - Effect annotations with `with` keyword
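//!
//! A minimal usage sketch (using only items defined in this module):
//!
//! ```ignore
//! let tokens = Lexer::new("let x = 1\n").tokenize()?;
//! ```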

use logos::Logos;
use crate::span::Span;
use crate::error::{KoreError, KoreResult};

#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r]+")]  // Skip horizontal whitespace and carriage returns
pub enum TokenKind {
    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("var")]
    Var,
    #[token("const")]
    Const,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("elif")]
    Elif,
    #[token("match")]
    Match,
    #[token("for")]
    For,
    #[token("while")]
    While,
    #[token("loop")]
    Loop,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("await")]
    Await,
    #[token("in")]
    In,
    #[token("with")]
    With,
    #[token("as")]
    As,
    #[token("type")]
    TypeKw,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("pub")]
    Pub,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("none")]
    None,

    // === Special Keywords (First-Class Citizens) ===
    #[token("component")]
    Component,
    #[token("shader")]
    Shader,
    #[token("actor")]
    Actor,
    #[token("spawn")]
    Spawn,
    #[token("send")]
    Send,
    #[token("receive")]
    Receive,
    #[token("emit")]
    Emit,
    #[token("comptime")]
    Comptime,
    #[token("macro")]
    Macro,
    #[token("vertex")]
    Vertex,
    #[token("fragment")]
    Fragment,

    // === Testing ===
    #[token("test")]
    Test,
    // Note: 'compute' is NOT a keyword - it's handled as an identifier in shader contexts

    // === Effect Keywords ===
    #[token("Pure")]
    Pure,
    #[token("IO")]
    Io,
    #[token("async")]  // lowercase for 'async fn' syntax
    AsyncKw,
    #[token("Async")]  // capital for 'with Async' effect syntax
    Async,
    #[token("GPU")]
    Gpu,
    #[token("Reactive")]
    Reactive,
    #[token("Unsafe")]
    Unsafe,

    // === Literals ===
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Int(i64),

    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Float(f64),

    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    String(String),
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        // F-string bodies are kept raw for now: unescaping here could turn an
        // escaped brace like `\{` into a literal `{` before interpolation is
        // parsed. Escape handling for f-strings happens in a later pass.
        Some(s[2..s.len()-1].to_string())
    })]
    FString(String),

    #[regex(r#"'([^'\\]|\\.)*'"#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    Char(String),

    // === Identifiers ===
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Operators ===
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    Power,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    #[token("!")]
    Not,
    #[token("&")]
    Amp,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("~")]
    Tilde,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,

    // === Assignment ===
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,

    // === Punctuation ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token("...")]
    DotDotDot,
    #[token(":")]
    Colon,
    #[token("::")]
    ColonColon,
    #[token(";")]
    Semi,
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("@")]
    At,
    #[token("?")]
    Question,

    // === JSX-like ===
    #[token("</")]
    LtSlash,

    // === Whitespace (significant!) ===
    #[regex(r"\n[ \t]*", |lex| lex.slice().to_string())]
    Newline(String),

    #[regex(r"//[^\n]*", priority = 3)]
    Comment,

    #[regex(r"#[^\n]*", priority = 2)]
    HashComment,

    // Synthetic tokens (inserted during indent processing)
    Indent,
    Dedent,
    Eof,
}

#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

impl Token {
    pub fn new(kind: TokenKind, span: Span) -> Self {
        Self { kind, span }
    }
}

pub struct Lexer<'a> {
    source: &'a str,
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self { source }
    }

    pub fn tokenize(&self) -> KoreResult<Vec<Token>> {
        let mut lex = TokenKind::lexer(self.source);
        let mut raw_tokens = Vec::new();

        while let Some(result) = lex.next() {
            let span = Span::new(lex.span().start, lex.span().end);
            match result {
                Ok(kind) => {
                    // Skip comments
                    if matches!(kind, TokenKind::Comment | TokenKind::HashComment) {
                        continue;
                    }
                    raw_tokens.push(Token::new(kind, span));
                }
                Err(_) => {
                    return Err(KoreError::lexer(
                        format!("Unexpected character: '{}'", &self.source[span.start..span.end]),
                        span,
                    ));
                }
            }
        }

        // Process indentation
        let tokens = self.process_indentation(raw_tokens)?;
        Ok(tokens)
    }

    /// Convert newlines with leading whitespace into INDENT/DEDENT tokens.
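    ///
    /// For example, `"if x:\n    y\nz"` lexes (roughly) to
    /// `If Ident Colon Newline INDENT Ident Newline DEDENT Ident EOF`.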
    fn process_indentation(&self, raw: Vec<Token>) -> KoreResult<Vec<Token>> {
        let mut result = Vec::new();
        let mut indent_stack: Vec<usize> = vec![0]; // Stack of indent levels
        let mut iter = raw.into_iter().peekable();

        while let Some(token) = iter.next() {
            match &token.kind {
                TokenKind::Newline(ws) => {
                    // Skip blank lines: if the next token is another Newline,
                    // this line holds only whitespace and its indent is meaningless.
                    if let Some(next) = iter.peek() {
                        if matches!(next.kind, TokenKind::Newline(_)) {
                            continue;
                        }
                    }

                    // Compute the indent width; ws[0] is the '\n' itself,
                    // and a tab counts as 4 spaces.
                    let indent: usize = ws[1..].chars().map(|c| if c == '\t' { 4 } else { 1 }).sum();
                    let current = *indent_stack.last().unwrap();

                    if indent > current {
                        // Increased indent
                        indent_stack.push(indent);
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        result.push(Token::new(TokenKind::Indent, token.span));
                    } else if indent < current {
                        // Decreased indent - may produce multiple DEDENTs.
                        // A dedent to a level never pushed settles at the
                        // nearest enclosing level rather than erroring.
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        while indent_stack.len() > 1 && *indent_stack.last().unwrap() > indent {
                            indent_stack.pop();
                            result.push(Token::new(TokenKind::Dedent, token.span));
                        }
                    } else {
                        // Same indent level
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                    }
                }
                _ => {
                    result.push(token);
                }
            }
        }

        // Close any indents still open at end of input
        let final_span = result.last().map(|t| t.span).unwrap_or(Span::new(0, 0));
        while indent_stack.len() > 1 {
            indent_stack.pop();
            result.push(Token::new(TokenKind::Dedent, final_span));
        }

        result.push(Token::new(TokenKind::Eof, final_span));
        Ok(result)
    }
}

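/// Resolve standard backslash escapes in a string/char literal body.
/// Unrecognized escapes are preserved verbatim (e.g. `\q` stays `\q`)
/// rather than rejected.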
fn unescape(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('n') => result.push('\n'),
                Some('r') => result.push('\r'),
                Some('t') => result.push('\t'),
                Some('0') => result.push('\0'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                Some('\'') => result.push('\''),
                Some(other) => {
                    result.push('\\');
                    result.push(other);
                }
                None => result.push('\\'),
            }
        } else {
            result.push(c);
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokens() {
        let source = "fn factorial(n: Int) -> Int";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Fn));
        assert!(matches!(tokens[1].kind, TokenKind::Ident(_)));
    }

    #[test]
    fn test_indentation() {
        let source = "fn foo():\n    let x = 1\n    let y = 2\n";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Indent)));
    }
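
    // Two extra checks, sketched against the lexer as written above
    // (no new APIs assumed; only items defined in this file).
    #[test]
    fn test_dedent_at_end() {
        let source = "if x:\n    y\nz";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Dedent)));
        assert!(matches!(tokens.last().unwrap().kind, TokenKind::Eof));
    }

    #[test]
    fn test_string_unescape() {
        let source = r#"let s = "a\nb""#;
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens
            .iter()
            .any(|t| matches!(&t.kind, TokenKind::String(v) if v == "a\nb")));
    }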
}