devalang_wasm/language/syntax/lexer/
lex.rs

1use anyhow::{Context, Result};
2
3use super::utils::{compute_indent, is_identifier_start, lex_identifier, lex_number};
4use crate::language::syntax::tokens::{Keyword, Token, TokenKind, keyword_from_ident};
5
/// Lexer front-end that owns the source text it will tokenize.
///
/// `Default` yields a lexer over an empty source string.
#[derive(Default, Debug)]
pub struct Lexer {
    /// Complete source text that `lex` hands to the tokenizer.
    source: String,
}
10
11impl Lexer {
12    pub fn new(content: impl Into<String>) -> Self {
13        Self {
14            source: content.into(),
15        }
16    }
17
18    pub fn with_source(mut self, content: impl Into<String>) -> Self {
19        self.source = content.into();
20        self
21    }
22
23    pub fn lex(self) -> Result<Vec<Token>> {
24        lex_source(&self.source)
25    }
26}
27
28fn lex_source(source: &str) -> Result<Vec<Token>> {
29    let mut tokens = Vec::new();
30    let mut indent_stack = vec![0usize];
31
32    for (line_idx, raw_line) in source.lines().enumerate() {
33        let (indent_level, cursor_start) = compute_indent(raw_line);
34        let trimmed = raw_line[cursor_start..].trim_end();
35
36        let current_indent = *indent_stack.last().context("indent stack corruption")?;
37        let line_number = line_idx + 1;
38
39        if indent_level > current_indent {
40            indent_stack.push(indent_level);
41            tokens.push(Token::new(
42                TokenKind::Indent,
43                String::new(),
44                indent_level,
45                line_number,
46                1,
47            ));
48        } else {
49            while indent_level < *indent_stack.last().unwrap() {
50                indent_stack.pop();
51                tokens.push(Token::new(
52                    TokenKind::Dedent,
53                    String::new(),
54                    indent_level,
55                    line_number,
56                    1,
57                ));
58            }
59        }
60
61        let mut cursor = cursor_start;
62        let bytes = raw_line.as_bytes();
63        let len = raw_line.len();
64
65        while cursor < len {
66            let ch = raw_line.as_bytes()[cursor];
67            let column = cursor + 1;
68
69            match ch {
70                b' ' | b'\t' => {
71                    cursor += 1;
72                }
73                b'#' => {
74                    // Comment to end of line
75                    if !trimmed.is_empty() {
76                        tokens.push(Token::new(
77                            TokenKind::Comment,
78                            raw_line[cursor..].trim().to_string(),
79                            indent_level,
80                            line_number,
81                            column,
82                        ));
83                    }
84                    break;
85                }
86                b'"' | b'\'' => {
87                    let quote = ch as char;
88                    let mut end = cursor + 1;
89                    let mut escaped = false;
90                    while end < len {
91                        let c = bytes[end] as char;
92                        if c == quote && !escaped {
93                            end += 1;
94                            break;
95                        }
96                        escaped = !escaped && c == '\\';
97                        end += 1;
98                    }
99                    let lexeme = &raw_line[cursor..end];
100                    tokens.push(Token::new(
101                        TokenKind::String,
102                        lexeme,
103                        indent_level,
104                        line_number,
105                        column,
106                    ));
107                    cursor = end;
108                }
109                b'0'..=b'9' => {
110                    let (end, kind) = lex_number(raw_line, cursor);
111                    let lexeme = &raw_line[cursor..end];
112                    tokens.push(Token::new(kind, lexeme, indent_level, line_number, column));
113                    cursor = end;
114                }
115                b'@' => {
116                    tokens.push(Token::new(
117                        TokenKind::Keyword(Keyword::At),
118                        "@",
119                        indent_level,
120                        line_number,
121                        column,
122                    ));
123                    cursor += 1;
124                }
125                b'-' => {
126                    if cursor + 1 < len && bytes[cursor + 1] == b'>' {
127                        tokens.push(Token::new(
128                            TokenKind::Arrow,
129                            "->",
130                            indent_level,
131                            line_number,
132                            column,
133                        ));
134                        cursor += 2;
135                    } else {
136                        tokens.push(Token::new(
137                            TokenKind::Minus,
138                            "-",
139                            indent_level,
140                            line_number,
141                            column,
142                        ));
143                        cursor += 1;
144                    }
145                }
146                b'=' => {
147                    if cursor + 1 < len && bytes[cursor + 1] == b'=' {
148                        tokens.push(Token::new(
149                            TokenKind::DoubleEquals,
150                            "==",
151                            indent_level,
152                            line_number,
153                            column,
154                        ));
155                        cursor += 2;
156                    } else {
157                        tokens.push(Token::new(
158                            TokenKind::Equals,
159                            "=",
160                            indent_level,
161                            line_number,
162                            column,
163                        ));
164                        cursor += 1;
165                    }
166                }
167                b'!' => {
168                    if cursor + 1 < len && bytes[cursor + 1] == b'=' {
169                        tokens.push(Token::new(
170                            TokenKind::NotEquals,
171                            "!=",
172                            indent_level,
173                            line_number,
174                            column,
175                        ));
176                        cursor += 2;
177                    } else {
178                        tokens.push(Token::new(
179                            TokenKind::Unknown,
180                            "!",
181                            indent_level,
182                            line_number,
183                            column,
184                        ));
185                        cursor += 1;
186                    }
187                }
188                b'>' => {
189                    if cursor + 1 < len && bytes[cursor + 1] == b'=' {
190                        tokens.push(Token::new(
191                            TokenKind::GreaterEqual,
192                            ">=",
193                            indent_level,
194                            line_number,
195                            column,
196                        ));
197                        cursor += 2;
198                    } else {
199                        tokens.push(Token::new(
200                            TokenKind::Greater,
201                            ">",
202                            indent_level,
203                            line_number,
204                            column,
205                        ));
206                        cursor += 1;
207                    }
208                }
209                b'<' => {
210                    if cursor + 1 < len && bytes[cursor + 1] == b'=' {
211                        tokens.push(Token::new(
212                            TokenKind::LessEqual,
213                            "<=",
214                            indent_level,
215                            line_number,
216                            column,
217                        ));
218                        cursor += 2;
219                    } else {
220                        tokens.push(Token::new(
221                            TokenKind::Less,
222                            "<",
223                            indent_level,
224                            line_number,
225                            column,
226                        ));
227                        cursor += 1;
228                    }
229                }
230                b'{' => {
231                    tokens.push(Token::new(
232                        TokenKind::LBrace,
233                        "{",
234                        indent_level,
235                        line_number,
236                        column,
237                    ));
238                    cursor += 1;
239                }
240                b'}' => {
241                    tokens.push(Token::new(
242                        TokenKind::RBrace,
243                        "}",
244                        indent_level,
245                        line_number,
246                        column,
247                    ));
248                    cursor += 1;
249                }
250                b'[' => {
251                    tokens.push(Token::new(
252                        TokenKind::LBracket,
253                        "[",
254                        indent_level,
255                        line_number,
256                        column,
257                    ));
258                    cursor += 1;
259                }
260                b']' => {
261                    tokens.push(Token::new(
262                        TokenKind::RBracket,
263                        "]",
264                        indent_level,
265                        line_number,
266                        column,
267                    ));
268                    cursor += 1;
269                }
270                b'(' => {
271                    tokens.push(Token::new(
272                        TokenKind::LParen,
273                        "(",
274                        indent_level,
275                        line_number,
276                        column,
277                    ));
278                    cursor += 1;
279                }
280                b')' => {
281                    tokens.push(Token::new(
282                        TokenKind::RParen,
283                        ")",
284                        indent_level,
285                        line_number,
286                        column,
287                    ));
288                    cursor += 1;
289                }
290                b',' => {
291                    tokens.push(Token::new(
292                        TokenKind::Comma,
293                        ",",
294                        indent_level,
295                        line_number,
296                        column,
297                    ));
298                    cursor += 1;
299                }
300                b':' => {
301                    tokens.push(Token::new(
302                        TokenKind::Colon,
303                        ":",
304                        indent_level,
305                        line_number,
306                        column,
307                    ));
308                    cursor += 1;
309                }
310                b'+' => {
311                    tokens.push(Token::new(
312                        TokenKind::Plus,
313                        "+",
314                        indent_level,
315                        line_number,
316                        column,
317                    ));
318                    cursor += 1;
319                }
320                b'*' => {
321                    tokens.push(Token::new(
322                        TokenKind::Asterisk,
323                        "*",
324                        indent_level,
325                        line_number,
326                        column,
327                    ));
328                    cursor += 1;
329                }
330                b'/' => {
331                    tokens.push(Token::new(
332                        TokenKind::Slash,
333                        "/",
334                        indent_level,
335                        line_number,
336                        column,
337                    ));
338                    cursor += 1;
339                }
340                b'.' => {
341                    tokens.push(Token::new(
342                        TokenKind::Dot,
343                        ".",
344                        indent_level,
345                        line_number,
346                        column,
347                    ));
348                    cursor += 1;
349                }
350                _ => {
351                    if is_identifier_start(ch as char) {
352                        let end = lex_identifier(raw_line, cursor);
353                        let ident = &raw_line[cursor..end];
354                        let lower = ident.to_ascii_lowercase();
355                        let kind = if let Some(keyword) = keyword_from_ident(&lower) {
356                            TokenKind::Keyword(keyword)
357                        } else if lower == "true" || lower == "false" {
358                            TokenKind::Boolean
359                        } else {
360                            TokenKind::Identifier
361                        };
362                        tokens.push(Token::new(kind, ident, indent_level, line_number, column));
363                        cursor = end;
364                    } else {
365                        tokens.push(Token::new(
366                            TokenKind::Unknown,
367                            (ch as char).to_string(),
368                            indent_level,
369                            line_number,
370                            column,
371                        ));
372                        cursor += 1;
373                    }
374                }
375            }
376        }
377
378        if !trimmed.is_empty() {
379            tokens.push(Token::new(
380                TokenKind::Newline,
381                "\\n",
382                indent_level,
383                line_number,
384                raw_line.len() + 1,
385            ));
386        }
387    }
388
389    while indent_stack.len() > 1 {
390        indent_stack.pop();
391        tokens.push(Token::new(
392            TokenKind::Dedent,
393            String::new(),
394            0,
395            source.lines().count() + 1,
396            1,
397        ));
398    }
399
400    tokens.push(Token::new(
401        TokenKind::Eof,
402        String::new(),
403        0,
404        source.lines().count() + 1,
405        1,
406    ));
407
408    Ok(tokens)
409}