1use anyhow::{Context, Result};
2
3use super::utils::{compute_indent, is_identifier_start, lex_identifier, lex_number};
4use crate::language::syntax::tokens::{Keyword, Token, TokenKind, keyword_from_ident};
5
/// Line-oriented lexer over an owned source string.
///
/// Construct with [`Lexer::new`] (or via `Default` plus
/// [`Lexer::with_source`]), then call [`Lexer::lex`] to produce the
/// token stream.
#[derive(Debug, Default)]
pub struct Lexer {
    // Full source text; consumed by value when `lex` is called.
    source: String,
}
10
11impl Lexer {
12 pub fn new(content: impl Into<String>) -> Self {
13 Self {
14 source: content.into(),
15 }
16 }
17
18 pub fn with_source(mut self, content: impl Into<String>) -> Self {
19 self.source = content.into();
20 self
21 }
22
23 pub fn lex(self) -> Result<Vec<Token>> {
24 lex_source(&self.source)
25 }
26}
27
28fn lex_source(source: &str) -> Result<Vec<Token>> {
29 let mut tokens = Vec::new();
30 let mut indent_stack = vec![0usize];
31
32 for (line_idx, raw_line) in source.lines().enumerate() {
33 let (indent_level, cursor_start) = compute_indent(raw_line);
34 let trimmed = raw_line[cursor_start..].trim_end();
35
36 let current_indent = *indent_stack.last().context("indent stack corruption")?;
37 let line_number = line_idx + 1;
38
39 if indent_level > current_indent {
40 indent_stack.push(indent_level);
41 tokens.push(Token::new(
42 TokenKind::Indent,
43 String::new(),
44 indent_level,
45 line_number,
46 1,
47 ));
48 } else {
49 while indent_level < *indent_stack.last().unwrap() {
50 indent_stack.pop();
51 tokens.push(Token::new(
52 TokenKind::Dedent,
53 String::new(),
54 indent_level,
55 line_number,
56 1,
57 ));
58 }
59 }
60
61 let mut cursor = cursor_start;
62 let bytes = raw_line.as_bytes();
63 let len = raw_line.len();
64
65 while cursor < len {
66 let ch = raw_line.as_bytes()[cursor];
67 let column = cursor + 1;
68
69 match ch {
70 b' ' | b'\t' => {
71 cursor += 1;
72 }
73 b'#' => {
74 if !trimmed.is_empty() {
76 tokens.push(Token::new(
77 TokenKind::Comment,
78 raw_line[cursor..].trim().to_string(),
79 indent_level,
80 line_number,
81 column,
82 ));
83 }
84 break;
85 }
86 b'"' | b'\'' => {
87 let quote = ch as char;
88 let mut end = cursor + 1;
89 let mut escaped = false;
90 while end < len {
91 let c = bytes[end] as char;
92 if c == quote && !escaped {
93 end += 1;
94 break;
95 }
96 escaped = !escaped && c == '\\';
97 end += 1;
98 }
99 let lexeme = &raw_line[cursor..end];
100 tokens.push(Token::new(
101 TokenKind::String,
102 lexeme,
103 indent_level,
104 line_number,
105 column,
106 ));
107 cursor = end;
108 }
109 b'0'..=b'9' => {
110 let (end, kind) = lex_number(raw_line, cursor);
111 let lexeme = &raw_line[cursor..end];
112 tokens.push(Token::new(kind, lexeme, indent_level, line_number, column));
113 cursor = end;
114 }
115 b'@' => {
116 tokens.push(Token::new(
117 TokenKind::Keyword(Keyword::At),
118 "@",
119 indent_level,
120 line_number,
121 column,
122 ));
123 cursor += 1;
124 }
125 b'-' => {
126 if cursor + 1 < len && bytes[cursor + 1] == b'>' {
127 tokens.push(Token::new(
128 TokenKind::Arrow,
129 "->",
130 indent_level,
131 line_number,
132 column,
133 ));
134 cursor += 2;
135 } else {
136 tokens.push(Token::new(
137 TokenKind::Minus,
138 "-",
139 indent_level,
140 line_number,
141 column,
142 ));
143 cursor += 1;
144 }
145 }
146 b'=' => {
147 if cursor + 1 < len && bytes[cursor + 1] == b'=' {
148 tokens.push(Token::new(
149 TokenKind::DoubleEquals,
150 "==",
151 indent_level,
152 line_number,
153 column,
154 ));
155 cursor += 2;
156 } else {
157 tokens.push(Token::new(
158 TokenKind::Equals,
159 "=",
160 indent_level,
161 line_number,
162 column,
163 ));
164 cursor += 1;
165 }
166 }
167 b'!' => {
168 if cursor + 1 < len && bytes[cursor + 1] == b'=' {
169 tokens.push(Token::new(
170 TokenKind::NotEquals,
171 "!=",
172 indent_level,
173 line_number,
174 column,
175 ));
176 cursor += 2;
177 } else {
178 tokens.push(Token::new(
179 TokenKind::Unknown,
180 "!",
181 indent_level,
182 line_number,
183 column,
184 ));
185 cursor += 1;
186 }
187 }
188 b'>' => {
189 if cursor + 1 < len && bytes[cursor + 1] == b'=' {
190 tokens.push(Token::new(
191 TokenKind::GreaterEqual,
192 ">=",
193 indent_level,
194 line_number,
195 column,
196 ));
197 cursor += 2;
198 } else {
199 tokens.push(Token::new(
200 TokenKind::Greater,
201 ">",
202 indent_level,
203 line_number,
204 column,
205 ));
206 cursor += 1;
207 }
208 }
209 b'<' => {
210 if cursor + 1 < len && bytes[cursor + 1] == b'=' {
211 tokens.push(Token::new(
212 TokenKind::LessEqual,
213 "<=",
214 indent_level,
215 line_number,
216 column,
217 ));
218 cursor += 2;
219 } else {
220 tokens.push(Token::new(
221 TokenKind::Less,
222 "<",
223 indent_level,
224 line_number,
225 column,
226 ));
227 cursor += 1;
228 }
229 }
230 b'{' => {
231 tokens.push(Token::new(
232 TokenKind::LBrace,
233 "{",
234 indent_level,
235 line_number,
236 column,
237 ));
238 cursor += 1;
239 }
240 b'}' => {
241 tokens.push(Token::new(
242 TokenKind::RBrace,
243 "}",
244 indent_level,
245 line_number,
246 column,
247 ));
248 cursor += 1;
249 }
250 b'[' => {
251 tokens.push(Token::new(
252 TokenKind::LBracket,
253 "[",
254 indent_level,
255 line_number,
256 column,
257 ));
258 cursor += 1;
259 }
260 b']' => {
261 tokens.push(Token::new(
262 TokenKind::RBracket,
263 "]",
264 indent_level,
265 line_number,
266 column,
267 ));
268 cursor += 1;
269 }
270 b'(' => {
271 tokens.push(Token::new(
272 TokenKind::LParen,
273 "(",
274 indent_level,
275 line_number,
276 column,
277 ));
278 cursor += 1;
279 }
280 b')' => {
281 tokens.push(Token::new(
282 TokenKind::RParen,
283 ")",
284 indent_level,
285 line_number,
286 column,
287 ));
288 cursor += 1;
289 }
290 b',' => {
291 tokens.push(Token::new(
292 TokenKind::Comma,
293 ",",
294 indent_level,
295 line_number,
296 column,
297 ));
298 cursor += 1;
299 }
300 b':' => {
301 tokens.push(Token::new(
302 TokenKind::Colon,
303 ":",
304 indent_level,
305 line_number,
306 column,
307 ));
308 cursor += 1;
309 }
310 b'+' => {
311 tokens.push(Token::new(
312 TokenKind::Plus,
313 "+",
314 indent_level,
315 line_number,
316 column,
317 ));
318 cursor += 1;
319 }
320 b'*' => {
321 tokens.push(Token::new(
322 TokenKind::Asterisk,
323 "*",
324 indent_level,
325 line_number,
326 column,
327 ));
328 cursor += 1;
329 }
330 b'/' => {
331 tokens.push(Token::new(
332 TokenKind::Slash,
333 "/",
334 indent_level,
335 line_number,
336 column,
337 ));
338 cursor += 1;
339 }
340 b'.' => {
341 tokens.push(Token::new(
342 TokenKind::Dot,
343 ".",
344 indent_level,
345 line_number,
346 column,
347 ));
348 cursor += 1;
349 }
350 _ => {
351 if is_identifier_start(ch as char) {
352 let end = lex_identifier(raw_line, cursor);
353 let ident = &raw_line[cursor..end];
354 let lower = ident.to_ascii_lowercase();
355 let kind = if let Some(keyword) = keyword_from_ident(&lower) {
356 TokenKind::Keyword(keyword)
357 } else if lower == "true" || lower == "false" {
358 TokenKind::Boolean
359 } else {
360 TokenKind::Identifier
361 };
362 tokens.push(Token::new(kind, ident, indent_level, line_number, column));
363 cursor = end;
364 } else {
365 tokens.push(Token::new(
366 TokenKind::Unknown,
367 (ch as char).to_string(),
368 indent_level,
369 line_number,
370 column,
371 ));
372 cursor += 1;
373 }
374 }
375 }
376 }
377
378 if !trimmed.is_empty() {
379 tokens.push(Token::new(
380 TokenKind::Newline,
381 "\\n",
382 indent_level,
383 line_number,
384 raw_line.len() + 1,
385 ));
386 }
387 }
388
389 while indent_stack.len() > 1 {
390 indent_stack.pop();
391 tokens.push(Token::new(
392 TokenKind::Dedent,
393 String::new(),
394 0,
395 source.lines().count() + 1,
396 1,
397 ));
398 }
399
400 tokens.push(Token::new(
401 TokenKind::Eof,
402 String::new(),
403 0,
404 source.lines().count() + 1,
405 1,
406 ));
407
408 Ok(tokens)
409}