// kore/lexer.rs
1//! KORE Lexer - Significant whitespace + Rust-like tokens
2//!
3//! Key features:
4//! - Python-style indentation (INDENT/DEDENT tokens)
5//! - Rust-style identifiers and literals
6//! - JSX-style angle brackets for UI
7//! - Effect annotations with `with` keyword
8
9use logos::Logos;
10use crate::span::Span;
11use crate::error::{KoreError, KoreResult};
12
/// Every terminal symbol the KORE scanner can emit.
///
/// Most variants are matched directly by the `logos`-derived scanner.
/// `Indent`, `Dedent`, and `Eof` are synthetic: they are inserted by
/// `Lexer::process_indentation` and never matched from source text.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r]+")]  // Skip horizontal whitespace AND carriage returns; '\n' stays significant.
pub enum TokenKind {
    // === Keywords ===
    #[token("fn")]
    Fn,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("const")]
    Const,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("elif")]
    Elif,
    #[token("match")]
    Match,
    #[token("for")]
    For,
    #[token("while")]
    While,
    #[token("loop")]
    Loop,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("await")]
    Await,
    #[token("in")]
    In,
    #[token("with")]
    With,
    #[token("as")]
    As,
    #[token("type")]
    TypeKw,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("pub")]
    Pub,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("none")]
    None,

    // === Special Keywords (First-Class Citizens) ===
    #[token("component")]
    Component,
    #[token("shader")]
    Shader,
    #[token("actor")]
    Actor,
    #[token("spawn")]
    Spawn,
    #[token("send")]
    Send,
    #[token("receive")]
    Receive,
    #[token("emit")]
    Emit,
    #[token("comptime")]
    Comptime,
    #[token("macro")]
    Macro,
    #[token("vertex")]
    Vertex,
    #[token("fragment")]
    Fragment,

    // === Testing ===
    #[token("test")]
    Test,
    // Note: 'compute' is NOT a keyword - it's handled as an identifier in shader contexts

    // === Effect Keywords ===
    #[token("Pure")]
    Pure,
    #[token("IO")]
    Io,
    #[token("async")]  // lowercase for 'async fn' syntax
    AsyncKw,
    #[token("Async")]  // capital for 'with Async' effect syntax
    Async,
    #[token("GPU")]
    Gpu,
    #[token("Reactive")]
    Reactive,
    #[token("Unsafe")]
    Unsafe,

    // === Literals ===
    // Underscore digit separators are allowed (e.g. 1_000). If the cleaned
    // digits overflow i64, `parse().ok()` yields `None`, which logos reports
    // as a lexing error rather than silently truncating.
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Int(i64),

    // Requires digits on BOTH sides of the dot (so `1.` and `.5` are not floats).
    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Float(f64),

    // Quotes are stripped and escape sequences resolved via `unescape`.
    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    String(String),

    // F-string bodies are kept raw (only the `f"` prefix and closing quote are
    // stripped): escapes are NOT resolved here, because unescaping could turn
    // an escaped `\{` into a literal brace before interpolation is parsed.
    // NOTE(review): unlike `String`, `\"` inside an f-string therefore remains
    // two characters — confirm the f-string/interpolation parser handles this.
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(s[2..s.len()-1].to_string())
    })]
    FString(String),

    // Char literal body, escapes resolved. Stored as a `String` because the
    // regex admits multi-character bodies; presumably length validation
    // happens downstream — TODO confirm in the parser.
    #[regex(r#"'([^'\\]|\\.)*'"#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    Char(String),

    // === Identifiers ===
    // ASCII-only identifiers; exact keyword matches above take precedence.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

    // === Operators ===
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    Power,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    #[token("!")]
    Not,
    #[token("&")]
    Amp,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("~")]
    Tilde,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,

    // === Assignment ===
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,

    // === Punctuation ===
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token("...")]
    DotDotDot,
    #[token(":")]
    Colon,
    #[token("::")]
    ColonColon,
    #[token(";")]
    Semi,
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("@")]
    At,
    #[token("?")]
    Question,

    // === JSX-like ===
    // Closing-tag opener, e.g. `</div>`.
    #[token("</")]
    LtSlash,

    // === Whitespace (significant!) ===
    // Captures the '\n' PLUS the following run of spaces/tabs; the indent
    // pass measures `ws[1..]` to derive INDENT/DEDENT tokens.
    #[regex(r"\n[ \t]*", |lex| lex.slice().to_string())]
    Newline(String),

    // Priority lifts the line comment above the `/` token for `//...`.
    #[regex(r"//[^\n]*", priority = 3)]
    Comment,

    // Shell-style `#` comments.
    #[regex(r"#[^\n]*", priority = 2)]
    HashComment,

    // Synthetic tokens (inserted during indent processing)
    Indent,
    Dedent,
    Eof,
}
271
/// A single lexed token: a kind paired with the source span it came from.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// What was matched (or synthesized during indent processing).
    pub kind: TokenKind,
    /// Byte range in the original source, used for diagnostics.
    pub span: Span,
}
277
278impl Token {
279    pub fn new(kind: TokenKind, span: Span) -> Self {
280        Self { kind, span }
281    }
282}
283
/// Turns KORE source text into a token stream with significant indentation.
///
/// Borrows the source string for its lifetime; `tokenize` is the entry point.
pub struct Lexer<'a> {
    source: &'a str,
}
287
288impl<'a> Lexer<'a> {
289    pub fn new(source: &'a str) -> Self {
290        Self { source }
291    }
292
293    pub fn tokenize(&self) -> KoreResult<Vec<Token>> {
294        let mut lex = TokenKind::lexer(self.source);
295        let mut raw_tokens = Vec::new();
296
297        while let Some(result) = lex.next() {
298            let span = Span::new(lex.span().start, lex.span().end);
299            match result {
300                Ok(kind) => {
301                    // Skip comments
302                    if matches!(kind, TokenKind::Comment | TokenKind::HashComment) {
303                        continue;
304                    }
305                    raw_tokens.push(Token::new(kind, span));
306                }
307                Err(_) => {
308                    return Err(KoreError::lexer(
309                        format!("Unexpected character: '{}'", &self.source[span.start..span.end]),
310                        span,
311                    ));
312                }
313            }
314        }
315
316        // Process indentation
317        let tokens = self.process_indentation(raw_tokens)?;
318        Ok(tokens)
319    }
320
321    /// Convert newlines with leading whitespace into INDENT/DEDENT tokens
322    fn process_indentation(&self, raw: Vec<Token>) -> KoreResult<Vec<Token>> {
323        let mut result = Vec::new();
324        let mut indent_stack: Vec<usize> = vec![0]; // Stack of indent levels
325        let mut iter = raw.into_iter().peekable();
326
327        while let Some(token) = iter.next() {
328            match &token.kind {
329                TokenKind::Newline(ws) => {
330                    // Check if this is a blank line (followed by another newline)
331                    if let Some(next) = iter.peek() {
332                        if matches!(next.kind, TokenKind::Newline(_)) {
333                            continue;
334                        }
335                    }
336
337                    // Calculate indent level (count spaces, tabs = 4 spaces)
338                    let indent: usize = ws[1..].chars().map(|c| if c == '\t' { 4 } else { 1 }).sum();
339                    let current = *indent_stack.last().unwrap();
340
341                    if indent > current {
342                        // Increased indent
343                        indent_stack.push(indent);
344                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
345                        result.push(Token::new(TokenKind::Indent, token.span));
346                    } else if indent < current {
347                        // Decreased indent - may produce multiple DEDENTs
348                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
349                        while indent_stack.len() > 1 && *indent_stack.last().unwrap() > indent {
350                            indent_stack.pop();
351                            result.push(Token::new(TokenKind::Dedent, token.span));
352                        }
353                    } else {
354                        // Same indent level
355                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
356                    }
357                }
358                _ => {
359                    result.push(token);
360                }
361            }
362        }
363
364        // Close remaining indents
365        let final_span = result.last().map(|t| t.span).unwrap_or(Span::new(0, 0));
366        while indent_stack.len() > 1 {
367            indent_stack.pop();
368            result.push(Token::new(TokenKind::Dedent, final_span));
369        }
370
371        result.push(Token::new(TokenKind::Eof, final_span));
372        Ok(result)
373    }
374}
375
/// Resolve backslash escape sequences in a string/char literal body.
///
/// Recognized escapes: `\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'`.
/// An unrecognized escape is kept verbatim (backslash included), and a
/// lone trailing backslash is preserved as-is.
fn unescape(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut pending_escape = false;
    for c in s.chars() {
        if pending_escape {
            pending_escape = false;
            match c {
                'n' => out.push('\n'),
                'r' => out.push('\r'),
                't' => out.push('\t'),
                '0' => out.push('\0'),
                '\\' | '"' | '\'' => out.push(c),
                other => {
                    // Unknown escape: keep it exactly as written.
                    out.push('\\');
                    out.push(other);
                }
            }
        } else if c == '\\' {
            pending_escape = true;
        } else {
            out.push(c);
        }
    }
    // Input ended mid-escape: keep the dangling backslash.
    if pending_escape {
        out.push('\\');
    }
    out
}
401
#[cfg(test)]
mod tests {
    use super::*;

    /// Basic scan: keyword, identifier text, and EOF termination.
    #[test]
    fn test_basic_tokens() {
        let source = "fn factorial(n: Int) -> Int";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Fn));
        // Pin the captured identifier text, not just its variant.
        assert!(matches!(&tokens[1].kind, TokenKind::Ident(name) if name == "factorial"));
        // Every token stream is terminated by a synthetic EOF.
        assert!(matches!(tokens.last().unwrap().kind, TokenKind::Eof));
    }

    /// Indentation: one block opened, and closed again by end of input.
    #[test]
    fn test_indentation() {
        let source = "fn foo():\n    let x = 1\n    let y = 2\n";
        let tokens = Lexer::new(source).tokenize().unwrap();
        let indents = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Indent))
            .count();
        let dedents = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Dedent))
            .count();
        assert_eq!(indents, 1);
        assert_eq!(dedents, 1);
    }
}
421