use logos::Logos;

use crate::span::Span;
use crate::error::{KoreError, KoreResult};
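
/// Token kinds produced by the Kore lexer.
///
/// Keyword, literal, operator, and delimiter variants are matched directly by
/// `logos`; `Indent`, `Dedent`, and `Eof` are synthesized afterwards by
/// `Lexer::process_indentation`.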
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r]+")]
pub enum TokenKind {
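    // Keywords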
16 #[token("fn")]
18 Fn,
19 #[token("let")]
20 Let,
21 #[token("mut")]
22 Mut,
23 #[token("const")]
24 Const,
25 #[token("if")]
26 If,
27 #[token("else")]
28 Else,
29 #[token("elif")]
30 Elif,
31 #[token("match")]
32 Match,
33 #[token("for")]
34 For,
35 #[token("while")]
36 While,
37 #[token("loop")]
38 Loop,
39 #[token("break")]
40 Break,
41 #[token("continue")]
42 Continue,
43 #[token("return")]
44 Return,
45 #[token("await")]
46 Await,
47 #[token("in")]
48 In,
49 #[token("with")]
50 With,
51 #[token("as")]
52 As,
53 #[token("type")]
54 TypeKw,
55 #[token("struct")]
56 Struct,
57 #[token("enum")]
58 Enum,
59 #[token("trait")]
60 Trait,
61 #[token("impl")]
62 Impl,
63 #[token("pub")]
64 Pub,
65 #[token("mod")]
66 Mod,
67 #[token("use")]
68 Use,
69 #[token("self")]
70 SelfLower,
71 #[token("Self")]
72 SelfUpper,
73 #[token("true")]
74 True,
75 #[token("false")]
76 False,
77 #[token("none")]
78 None,
79
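    // Kore-specific keywords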
80 #[token("component")]
82 Component,
83 #[token("shader")]
84 Shader,
85 #[token("actor")]
86 Actor,
87 #[token("spawn")]
88 Spawn,
89 #[token("send")]
90 Send,
91 #[token("receive")]
92 Receive,
93 #[token("emit")]
94 Emit,
95 #[token("comptime")]
96 Comptime,
97 #[token("macro")]
98 Macro,
99 #[token("vertex")]
100 Vertex,
101 #[token("fragment")]
102 Fragment,
103
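    // Test and effect keywords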
104 #[token("test")]
106 Test,
107 #[token("Pure")]
111 Pure,
112 #[token("IO")]
113 Io,
114 #[token("async")] AsyncKw,
116 #[token("Async")] Async,
118 #[token("GPU")]
119 Gpu,
120 #[token("Reactive")]
121 Reactive,
122 #[token("Unsafe")]
123 Unsafe,
124
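    // Literals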
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Int(i64),

    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Float(f64),

    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    String(String),

    // Unlike `String`, the f-string body is stored raw (not unescaped here).
    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(s[2..s.len()-1].to_string())
    })]
    FString(String),

    // Char literals keep their unescaped text as a String.
    #[regex(r#"'([^'\\]|\\.)*'"#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    Char(String),

    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

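    // Operators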
160 #[token("+")]
162 Plus,
163 #[token("-")]
164 Minus,
165 #[token("*")]
166 Star,
167 #[token("/")]
168 Slash,
169 #[token("%")]
170 Percent,
171 #[token("**")]
172 Power,
173 #[token("==")]
174 EqEq,
175 #[token("!=")]
176 NotEq,
177 #[token("<")]
178 Lt,
179 #[token(">")]
180 Gt,
181 #[token("<=")]
182 LtEq,
183 #[token(">=")]
184 GtEq,
185 #[token("&&")]
186 And,
187 #[token("||")]
188 Or,
189 #[token("!")]
190 Not,
191 #[token("&")]
192 Amp,
193 #[token("|")]
194 Pipe,
195 #[token("^")]
196 Caret,
197 #[token("~")]
198 Tilde,
199 #[token("<<")]
200 Shl,
201 #[token(">>")]
202 Shr,
203
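    // Assignment operators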
204 #[token("=")]
206 Eq,
207 #[token("+=")]
208 PlusEq,
209 #[token("-=")]
210 MinusEq,
211 #[token("*=")]
212 StarEq,
213 #[token("/=")]
214 SlashEq,
215
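    // Delimiters and punctuation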
216 #[token("(")]
218 LParen,
219 #[token(")")]
220 RParen,
221 #[token("[")]
222 LBracket,
223 #[token("]")]
224 RBracket,
225 #[token("{")]
226 LBrace,
227 #[token("}")]
228 RBrace,
229 #[token(",")]
230 Comma,
231 #[token(".")]
232 Dot,
233 #[token("..")]
234 DotDot,
235 #[token("...")]
236 DotDotDot,
237 #[token(":")]
238 Colon,
239 #[token("::")]
240 ColonColon,
241 #[token(";")]
242 Semi,
243 #[token("->")]
244 Arrow,
245 #[token("=>")]
246 FatArrow,
247 #[token("@")]
248 At,
249 #[token("?")]
250 Question,
251
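    // "</" lexed as a single token (closing-tag start in markup-style syntax)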
252 #[token("</")]
254 LtSlash,
255
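    // Whitespace and comments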
    // A newline plus the next line's leading whitespace; indentation is
    // derived from the captured text in `process_indentation`.
    #[regex(r"\n[ \t]*", |lex| lex.slice().to_string())]
    Newline(String),

    #[regex(r"//[^\n]*", priority = 3)]
    Comment,

    #[regex(r"#[^\n]*", priority = 2)]
    HashComment,

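    // Synthesized by `process_indentation`; never produced by `logos` itself.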
    Indent,
    Dedent,
    Eof,
}

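/// A single token: a `TokenKind` plus its byte span in the source.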
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
277
278impl Token {
279 pub fn new(kind: TokenKind, span: Span) -> Self {
280 Self { kind, span }
281 }
282}
283
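/// Turns Kore source text into a flat token stream terminated by `Eof`.
///
/// A minimal usage sketch:
///
/// ```ignore
/// let tokens = Lexer::new("let x = 1").tokenize()?;
/// ```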
pub struct Lexer<'a> {
    source: &'a str,
}
287
288impl<'a> Lexer<'a> {
289 pub fn new(source: &'a str) -> Self {
290 Self { source }
291 }
292
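    /// Runs the raw `logos` lexer, drops `Comment`/`HashComment` tokens, and
    /// folds newline whitespace into `Indent`/`Dedent` tokens. Any
    /// unrecognized character produces a lexer error carrying its span.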
    pub fn tokenize(&self) -> KoreResult<Vec<Token>> {
        let mut lex = TokenKind::lexer(self.source);
        let mut raw_tokens = Vec::new();

        while let Some(result) = lex.next() {
            let span = Span::new(lex.span().start, lex.span().end);
            match result {
                Ok(kind) => {
                    // Comments are dropped here so the parser never sees them.
                    if matches!(kind, TokenKind::Comment | TokenKind::HashComment) {
                        continue;
                    }
                    raw_tokens.push(Token::new(kind, span));
                }
                Err(_) => {
                    return Err(KoreError::lexer(
                        format!(
                            "Unexpected character: '{}'",
                            &self.source[span.start..span.end]
                        ),
                        span,
                    ));
                }
            }
        }

        let tokens = self.process_indentation(raw_tokens)?;
        Ok(tokens)
    }

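    /// Post-processes the raw token stream into Python-style block structure.
    /// A stack holds the currently open indentation widths: indenting deeper
    /// pushes a width and emits `Indent`; returning to a shallower width pops
    /// the stack, emitting one `Dedent` per closed block.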
    fn process_indentation(&self, raw: Vec<Token>) -> KoreResult<Vec<Token>> {
        let mut result = Vec::new();
        // Stack of open indentation widths; 0 is the module level.
        let mut indent_stack: Vec<usize> = vec![0];
        let mut iter = raw.into_iter().peekable();

        while let Some(token) = iter.next() {
            match &token.kind {
                TokenKind::Newline(ws) => {
                    // Collapse runs of blank lines: only the newline directly
                    // before a real token decides the indentation level.
                    if let Some(next) = iter.peek() {
                        if matches!(next.kind, TokenKind::Newline(_)) {
                            continue;
                        }
                    }

                    // `ws` starts with '\n'; the rest is the line's leading
                    // whitespace. A tab counts as 4 columns.
                    let indent: usize = ws[1..]
                        .chars()
                        .map(|c| if c == '\t' { 4 } else { 1 })
                        .sum();
                    let current = *indent_stack.last().unwrap();

                    if indent > current {
                        indent_stack.push(indent);
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        result.push(Token::new(TokenKind::Indent, token.span));
                    } else if indent < current {
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        // One Dedent per closed block. A dedent to a width that
                        // was never pushed is accepted silently.
                        while indent_stack.len() > 1 && *indent_stack.last().unwrap() > indent {
                            indent_stack.pop();
                            result.push(Token::new(TokenKind::Dedent, token.span));
                        }
                    } else {
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                    }
                }
                _ => {
                    result.push(token);
                }
            }
        }

        // Close any blocks still open at end of input.
        let final_span = result.last().map(|t| t.span).unwrap_or(Span::new(0, 0));
        while indent_stack.len() > 1 {
            indent_stack.pop();
            result.push(Token::new(TokenKind::Dedent, final_span));
        }

        result.push(Token::new(TokenKind::Eof, final_span));
        Ok(result)
    }
}

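/// Resolves the standard escapes (`\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'`).
/// Unrecognized escapes and a trailing lone backslash are kept verbatim.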
fn unescape(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('n') => result.push('\n'),
                Some('r') => result.push('\r'),
                Some('t') => result.push('\t'),
                Some('0') => result.push('\0'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                Some('\'') => result.push('\''),
                Some(other) => {
                    // Unknown escape: keep the backslash and the character.
                    result.push('\\');
                    result.push(other);
                }
                None => result.push('\\'),
            }
        } else {
            result.push(c);
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokens() {
        let source = "fn factorial(n: Int) -> Int";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Fn));
        assert!(matches!(tokens[1].kind, TokenKind::Ident(_)));
    }

    #[test]
    fn test_indentation() {
        let source = "fn foo():\n let x = 1\n let y = 2\n";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Indent)));
    }
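
    // A few extra smoke tests sketching the expected behavior of this lexer.
    #[test]
    fn test_dedents_balance() {
        let source = "fn foo():\n if x:\n  let y = 1\nlet z = 2\n";
        let tokens = Lexer::new(source).tokenize().unwrap();
        let indents = tokens.iter().filter(|t| matches!(t.kind, TokenKind::Indent)).count();
        let dedents = tokens.iter().filter(|t| matches!(t.kind, TokenKind::Dedent)).count();
        // Every Indent is closed by a Dedent, and the stream ends with Eof.
        assert_eq!(indents, dedents);
        assert!(matches!(tokens.last().unwrap().kind, TokenKind::Eof));
    }

    #[test]
    fn test_string_unescape() {
        // A literal `\n` in the source becomes a real newline in the token.
        let source = r#"let s = "a\nb""#;
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens
            .iter()
            .any(|t| matches!(&t.kind, TokenKind::String(s) if s == "a\nb")));
    }

    #[test]
    fn test_comments_are_dropped() {
        let source = "let x = 1 // trailing comment";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(!tokens
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Comment | TokenKind::HashComment)));
    }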
}