use logos::Logos;

use crate::span::Span;
use crate::error::{KoreError, KoreResult};

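/// All token kinds recognized by the Kore lexer, derived with `logos`.
/// Spaces, tabs, and carriage returns are skipped; newlines are kept so the
/// indentation pass can turn them into `Indent`/`Dedent` tokens.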
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r]+")]
pub enum TokenKind {
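    // Keywords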
    #[token("fn")]
    Fn,
    #[token("let")]
    Let,
    #[token("mut")]
    Mut,
    #[token("var")]
    Var,
    #[token("const")]
    Const,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("elif")]
    Elif,
    #[token("match")]
    Match,
    #[token("for")]
    For,
    #[token("while")]
    While,
    #[token("loop")]
    Loop,
    #[token("break")]
    Break,
    #[token("continue")]
    Continue,
    #[token("return")]
    Return,
    #[token("await")]
    Await,
    #[token("in")]
    In,
    #[token("with")]
    With,
    #[token("as")]
    As,
    #[token("type")]
    TypeKw,
    #[token("struct")]
    Struct,
    #[token("enum")]
    Enum,
    #[token("trait")]
    Trait,
    #[token("impl")]
    Impl,
    #[token("pub")]
    Pub,
    #[token("mod")]
    Mod,
    #[token("use")]
    Use,
    #[token("self")]
    SelfLower,
    #[token("Self")]
    SelfUpper,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("none")]
    None,

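    // Domain keywords: components, shaders, actors, and macros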
    #[token("component")]
    Component,
    #[token("shader")]
    Shader,
    #[token("actor")]
    Actor,
    #[token("spawn")]
    Spawn,
    #[token("send")]
    Send,
    #[token("receive")]
    Receive,
    #[token("emit")]
    Emit,
    #[token("comptime")]
    Comptime,
    #[token("macro")]
    Macro,
    #[token("vertex")]
    Vertex,
    #[token("fragment")]
    Fragment,

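    // `test` plus effect annotations (Pure, IO, Async, GPU, Reactive, Unsafe)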
    #[token("test")]
    Test,
    #[token("Pure")]
    Pure,
    #[token("IO")]
    Io,
    #[token("async")]
    AsyncKw,
    #[token("Async")]
    Async,
    #[token("GPU")]
    Gpu,
    #[token("Reactive")]
    Reactive,
    #[token("Unsafe")]
    Unsafe,

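    // Literals and identifiers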
    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Int(i64),

    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse().ok())]
    Float(f64),

    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    String(String),

    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(s[2..s.len()-1].to_string())
    })]
    FString(String),

    #[regex(r#"'([^'\\]|\\.)*'"#, |lex| {
        let s = lex.slice();
        Some(unescape(&s[1..s.len()-1]))
    })]
    Char(String),

    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),

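    // Arithmetic, comparison, logical, and bitwise operators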
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("**")]
    Power,
    #[token("==")]
    EqEq,
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    And,
    #[token("||")]
    Or,
    #[token("!")]
    Not,
    #[token("&")]
    Amp,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("~")]
    Tilde,
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,

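    // Assignment operators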
    #[token("=")]
    Eq,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("*=")]
    StarEq,
    #[token("/=")]
    SlashEq,

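    // Delimiters and punctuation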
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token("...")]
    DotDotDot,
    #[token(":")]
    Colon,
    #[token("::")]
    ColonColon,
    #[token(";")]
    Semi,
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("@")]
    At,
    #[token("?")]
    Question,

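    // "</" matched as a single token, presumably for closing tags in component markup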
    #[token("</")]
    LtSlash,

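    // A newline plus the indentation that follows it, measured by the indentation pass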
    #[regex(r"\n[ \t]*", |lex| lex.slice().to_string())]
    Newline(String),

    #[regex(r"//[^\n]*", priority = 3)]
    Comment,

    #[regex(r"#[^\n]*", priority = 2)]
    HashComment,

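    // Synthesized by the indentation pass and at end of input, never produced by logos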
    Indent,
    Dedent,
    Eof,
}

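/// A single token paired with its source span.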
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

impl Token {
    pub fn new(kind: TokenKind, span: Span) -> Self {
        Self { kind, span }
    }
}

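/// Wraps the generated logos lexer and layers significant-indentation
/// handling on top of the raw token stream.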
pub struct Lexer<'a> {
    source: &'a str,
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self { source }
    }

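    /// Runs the raw lexer over the whole source, drops comments, and then
    /// converts indentation changes into `Indent`/`Dedent` tokens.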
    pub fn tokenize(&self) -> KoreResult<Vec<Token>> {
        let mut lex = TokenKind::lexer(self.source);
        let mut raw_tokens = Vec::new();

        while let Some(result) = lex.next() {
            let span = Span::new(lex.span().start, lex.span().end);
            match result {
                Ok(kind) => {
                    if matches!(kind, TokenKind::Comment | TokenKind::HashComment) {
                        continue;
                    }
                    raw_tokens.push(Token::new(kind, span));
                }
                Err(_) => {
                    return Err(KoreError::lexer(
                        format!("Unexpected character: '{}'", &self.source[span.start..span.end]),
                        span,
                    ));
                }
            }
        }

        let tokens = self.process_indentation(raw_tokens)?;
        Ok(tokens)
    }

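    /// Converts the whitespace captured after each newline into `Indent`/`Dedent`
    /// tokens, using a stack of indentation widths (a tab counts as 4 columns).
    /// Runs of blank lines collapse to their final newline, so they never
    /// affect indentation.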
    fn process_indentation(&self, raw: Vec<Token>) -> KoreResult<Vec<Token>> {
        let mut result = Vec::new();
        let mut indent_stack: Vec<usize> = vec![0];
        let mut iter = raw.into_iter().peekable();

        while let Some(token) = iter.next() {
            match &token.kind {
                TokenKind::Newline(ws) => {
                    // Blank line: only the last newline in a run carries indentation.
                    if let Some(next) = iter.peek() {
                        if matches!(next.kind, TokenKind::Newline(_)) {
                            continue;
                        }
                    }

                    // Measure the indentation that follows the '\n'.
                    let indent: usize = ws[1..].chars().map(|c| if c == '\t' { 4 } else { 1 }).sum();
                    let current = *indent_stack.last().unwrap();

                    if indent > current {
                        indent_stack.push(indent);
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        result.push(Token::new(TokenKind::Indent, token.span));
                    } else if indent < current {
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                        while indent_stack.len() > 1 && *indent_stack.last().unwrap() > indent {
                            indent_stack.pop();
                            result.push(Token::new(TokenKind::Dedent, token.span));
                        }
                    } else {
                        result.push(Token::new(TokenKind::Newline(ws.clone()), token.span));
                    }
                }
                _ => {
                    result.push(token);
                }
            }
        }

        // Close any blocks still open at end of input.
        let final_span = result.last().map(|t| t.span).unwrap_or(Span::new(0, 0));
        while indent_stack.len() > 1 {
            indent_stack.pop();
            result.push(Token::new(TokenKind::Dedent, final_span));
        }

        result.push(Token::new(TokenKind::Eof, final_span));
        Ok(result)
    }
}

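/// Resolves the standard escape sequences (\n, \r, \t, \0, \\, \", \') in a
/// string or char literal body; unknown escapes are kept verbatim.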
fn unescape(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('n') => result.push('\n'),
                Some('r') => result.push('\r'),
                Some('t') => result.push('\t'),
                Some('0') => result.push('\0'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                Some('\'') => result.push('\''),
                Some(other) => {
                    result.push('\\');
                    result.push(other);
                }
                None => result.push('\\'),
            }
        } else {
            result.push(c);
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokens() {
        let source = "fn factorial(n: Int) -> Int";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(matches!(tokens[0].kind, TokenKind::Fn));
        assert!(matches!(tokens[1].kind, TokenKind::Ident(_)));
    }

    #[test]
    fn test_indentation() {
        let source = "fn foo():\n let x = 1\n let y = 2\n";
        let tokens = Lexer::new(source).tokenize().unwrap();
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Indent)));
    }
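
    // Every Indent pushed onto the stack is matched by a Dedent, either when
    // the indentation drops or when the remaining levels are drained at EOF.
    #[test]
    fn test_indents_and_dedents_balance() {
        let source = "fn foo():\n let x = 1\nlet y = 2";
        let tokens = Lexer::new(source).tokenize().unwrap();
        let indents = tokens.iter().filter(|t| matches!(t.kind, TokenKind::Indent)).count();
        let dedents = tokens.iter().filter(|t| matches!(t.kind, TokenKind::Dedent)).count();
        assert_eq!(indents, dedents);
    }

    #[test]
    fn test_string_unescape() {
        let source = r#"let s = "a\nb""#;
        let tokens = Lexer::new(source).tokenize().unwrap();
        // `let`, `s`, `=` precede the string literal; `\n` must be resolved.
        assert!(matches!(&tokens[3].kind, TokenKind::String(s) if s == "a\nb"));
    }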
}