Skip to main content

virtual_rust/
lexer.rs

//! Lexical analyzer that transforms Rust source code into a stream of tokens.
//!
//! Handles the common Rust literal forms: strings and characters (with simple
//! backslash escapes), integers (decimal, hex, binary, octal with `_`
//! separators), and floats. It also distinguishes keywords from identifiers.
6
7use crate::token::Token;
8
/// Lexer that tokenizes Rust source code character by character.
///
/// The input is decoded into `char`s up front so that lookahead is plain
/// indexing, independent of UTF-8 byte widths.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// Source text, one element per Unicode scalar value.
    input: Vec<char>,
    /// Index into `input` of the next character to be consumed.
    pos: usize,
    /// Line number of the character at `pos`; starts at 1.
    line: usize,
    /// Column (counted in `char`s) of the character at `pos`; starts at 1.
    col: usize,
}
16
/// A lexical error with source location.
///
/// Implements [`std::error::Error`], so it can be propagated with `?` into
/// `Box<dyn std::error::Error>` or wrapped by other error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Line on which the error was reported.
    pub line: usize,
    /// Column at which the error was reported.
    pub col: usize,
}

impl LexError {
    /// Builds an error from any string-like `message` and a line/column pair.
    fn new(message: impl Into<String>, line: usize, col: usize) -> Self {
        Self {
            message: message.into(),
            line,
            col,
        }
    }
}

impl std::fmt::Display for LexError {
    /// Formats the error as `Lex error at <line>:<col>: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "Lex error at {}:{}: {}",
            self.line, self.col, self.message
        )
    }
}

// `Debug` + `Display` are all the trait requires; there is no underlying source error.
impl std::error::Error for LexError {}
44
45// ── Character navigation ─────────────────────────────────────────────
46
47impl Lexer {
48    pub fn new(input: &str) -> Self {
49        Lexer {
50            input: input.chars().collect(),
51            pos: 0,
52            line: 1,
53            col: 1,
54        }
55    }
56
57    /// Returns the current character without consuming it.
58    fn current(&self) -> Option<char> {
59        self.input.get(self.pos).copied()
60    }
61
62    /// Returns the next character (lookahead) without consuming.
63    fn peek(&self) -> Option<char> {
64        self.input.get(self.pos + 1).copied()
65    }
66
67    /// Consumes and returns the current character, tracking line/col.
68    fn advance(&mut self) -> Option<char> {
69        let ch = self.current();
70        if let Some(c) = ch {
71            self.pos += 1;
72            if c == '\n' {
73                self.line += 1;
74                self.col = 1;
75            } else {
76                self.col += 1;
77            }
78        }
79        ch
80    }
81
82    /// Creates a `LexError` at the lexer's current position.
83    fn error(&self, message: impl Into<String>) -> LexError {
84        LexError::new(message, self.line, self.col)
85    }
86
87    // ── Whitespace & comments ─────────────────────────────────────────
88
89    fn skip_whitespace(&mut self) {
90        while let Some(c) = self.current() {
91            if c.is_whitespace() {
92                self.advance();
93            } else {
94                break;
95            }
96        }
97    }
98
99    fn skip_line_comment(&mut self) {
100        while let Some(c) = self.current() {
101            if c == '\n' {
102                break;
103            }
104            self.advance();
105        }
106    }
107
108    fn skip_block_comment(&mut self) {
109        // skip /*
110        self.advance();
111        self.advance();
112        let mut depth = 1;
113        while depth > 0 {
114            match self.current() {
115                Some('/') if self.peek() == Some('*') => {
116                    self.advance();
117                    self.advance();
118                    depth += 1;
119                }
120                Some('*') if self.peek() == Some('/') => {
121                    self.advance();
122                    self.advance();
123                    depth -= 1;
124                }
125                Some(_) => {
126                    self.advance();
127                }
128                None => break,
129            }
130        }
131    }
132
133    // ── Literal readers ──────────────────────────────────────────────
134
135    /// Reads a `"..."`-delimited string literal with escape processing.
136    fn read_string(&mut self) -> Result<Token, LexError> {
137        self.advance(); // skip opening "
138        let mut s = String::new();
139        loop {
140            match self.current() {
141                Some('"') => {
142                    self.advance();
143                    return Ok(Token::StringLiteral(s));
144                }
145                Some('\\') => {
146                    self.advance();
147                    match self.current() {
148                        Some('n') => {
149                            s.push('\n');
150                            self.advance();
151                        }
152                        Some('t') => {
153                            s.push('\t');
154                            self.advance();
155                        }
156                        Some('r') => {
157                            s.push('\r');
158                            self.advance();
159                        }
160                        Some('\\') => {
161                            s.push('\\');
162                            self.advance();
163                        }
164                        Some('"') => {
165                            s.push('"');
166                            self.advance();
167                        }
168                        Some('0') => {
169                            s.push('\0');
170                            self.advance();
171                        }
172                        Some(c) => {
173                            return Err(self.error(format!("Unknown escape sequence: \\{c}")));
174                        }
175                        None => {
176                            return Err(self.error("Unterminated string"));
177                        }
178                    }
179                }
180                Some(c) => {
181                    s.push(c);
182                    self.advance();
183                }
184                None => {
185                    return Err(self.error("Unterminated string"));
186                }
187            }
188        }
189    }
190
191    /// Reads a `'c'`-delimited character literal with escape processing.
192    fn read_char(&mut self) -> Result<Token, LexError> {
193        self.advance(); // skip opening '
194        let ch = match self.current() {
195            Some('\\') => {
196                self.advance();
197                match self.current() {
198                    Some('n') => '\n',
199                    Some('t') => '\t',
200                    Some('r') => '\r',
201                    Some('\\') => '\\',
202                    Some('\'') => '\'',
203                    Some('0') => '\0',
204                    _ => return Err(self.error("Invalid char escape")),
205                }
206            }
207            Some(c) => c,
208            None => return Err(self.error("Unterminated char literal")),
209        };
210        self.advance();
211        if self.current() != Some('\'') {
212            return Err(self.error("Unterminated char literal"));
213        }
214        self.advance(); // skip closing '
215        Ok(Token::CharLiteral(ch))
216    }
217
218    /// Reads a numeric literal (decimal, hex `0x`, binary `0b`, octal `0o`).
219    /// Handles `_` separators and optional type suffixes (e.g. `42i32`).
220    fn read_number(&mut self) -> Result<Token, LexError> {
221        let mut num = String::new();
222        let mut is_float = false;
223
224        // Check for hex, octal, binary
225        if self.current() == Some('0') {
226            match self.peek() {
227                Some('x') | Some('X') => {
228                    num.push('0');
229                    self.advance();
230                    num.push('x');
231                    self.advance();
232                    while let Some(c) = self.current() {
233                        if c.is_ascii_hexdigit() || c == '_' {
234                            if c != '_' {
235                                num.push(c);
236                            }
237                            self.advance();
238                        } else {
239                            break;
240                        }
241                    }
242                    let val = i64::from_str_radix(&num[2..], 16).map_err(|_| {
243                        self.error(format!("Invalid hex literal: {num}"))
244                    })?;
245                    return Ok(Token::IntLiteral(val));
246                }
247                Some('b') | Some('B') => {
248                    self.advance();
249                    self.advance();
250                    while let Some(c) = self.current() {
251                        if c == '0' || c == '1' || c == '_' {
252                            if c != '_' {
253                                num.push(c);
254                            }
255                            self.advance();
256                        } else {
257                            break;
258                        }
259                    }
260                    let val = i64::from_str_radix(&num, 2).map_err(|_| {
261                        self.error(format!("Invalid binary literal: 0b{num}"))
262                    })?;
263                    return Ok(Token::IntLiteral(val));
264                }
265                Some('o') | Some('O') => {
266                    self.advance();
267                    self.advance();
268                    while let Some(c) = self.current() {
269                        if ('0'..='7').contains(&c) || c == '_' {
270                            if c != '_' {
271                                num.push(c);
272                            }
273                            self.advance();
274                        } else {
275                            break;
276                        }
277                    }
278                    let val = i64::from_str_radix(&num, 8).map_err(|_| {
279                        self.error(format!("Invalid octal literal: 0o{num}"))
280                    })?;
281                    return Ok(Token::IntLiteral(val));
282                }
283                _ => {}
284            }
285        }
286
287        while let Some(c) = self.current() {
288            if c.is_ascii_digit() || c == '_' {
289                if c != '_' {
290                    num.push(c);
291                }
292                self.advance();
293            } else if c == '.' && !is_float {
294                // Check if next char is a digit (to differentiate from method calls)
295                if let Some(next) = self.peek() {
296                    if next.is_ascii_digit() {
297                        is_float = true;
298                        num.push(c);
299                        self.advance();
300                    } else {
301                        break;
302                    }
303                } else {
304                    break;
305                }
306            } else {
307                break;
308            }
309        }
310
311        // Skip type suffix like i32, u64, f64 etc.
312        if let Some(c) = self.current() {
313            if c == 'i' || c == 'u' || c == 'f' {
314                let start = self.pos;
315                let mut suffix = String::new();
316                while let Some(sc) = self.current() {
317                    if sc.is_alphanumeric() {
318                        suffix.push(sc);
319                        self.advance();
320                    } else {
321                        break;
322                    }
323                }
324                // Check if it's a valid type suffix
325                match suffix.as_str() {
326                    "i8" | "i16" | "i32" | "i64" | "i128" | "isize" | "u8" | "u16" | "u32"
327                    | "u64" | "u128" | "usize" | "f32" | "f64" => {
328                        if suffix.starts_with('f') {
329                            is_float = true;
330                        }
331                    }
332                    _ => {
333                        // Not a valid suffix, rewind
334                        self.pos = start;
335                    }
336                }
337            }
338        }
339
340        if is_float {
341            let val: f64 = num.parse().map_err(|_| {
342                self.error(format!("Invalid float literal: {num}"))
343            })?;
344            Ok(Token::FloatLiteral(val))
345        } else {
346            let val: i64 = num.parse().map_err(|_| {
347                self.error(format!("Invalid integer literal: {num}"))
348            })?;
349            Ok(Token::IntLiteral(val))
350        }
351    }
352
353    /// Reads an identifier or keyword.
354    fn read_ident(&mut self) -> Token {
355        let mut ident = String::new();
356        while let Some(c) = self.current() {
357            if c.is_alphanumeric() || c == '_' {
358                ident.push(c);
359                self.advance();
360            } else {
361                break;
362            }
363        }
364        Token::keyword_from_str(&ident).unwrap_or(Token::Ident(ident))
365    }
366
367    // ── Main tokenizer loop ───────────────────────────────────────────
368
369    /// Tokenizes the entire input into a `Vec<Token>`, ending with `Token::Eof`.
370    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
371        let mut tokens = Vec::new();
372
373        loop {
374            self.skip_whitespace();
375
376            match self.current() {
377                None => {
378                    tokens.push(Token::Eof);
379                    return Ok(tokens);
380                }
381                Some('/') => match self.peek() {
382                    Some('/') => {
383                        self.skip_line_comment();
384                        continue;
385                    }
386                    Some('*') => {
387                        self.skip_block_comment();
388                        continue;
389                    }
390                    Some('=') => {
391                        self.advance();
392                        self.advance();
393                        tokens.push(Token::SlashEq);
394                    }
395                    _ => {
396                        self.advance();
397                        tokens.push(Token::Slash);
398                    }
399                },
400                Some('"') => {
401                    tokens.push(self.read_string()?);
402                }
403                Some('\'') => {
404                    // Could be char literal or lifetime - try char literal first
405                    tokens.push(self.read_char()?);
406                }
407                Some(c) if c.is_ascii_digit() => {
408                    tokens.push(self.read_number()?);
409                }
410                Some(c) if c.is_alphabetic() || c == '_' => {
411                    tokens.push(self.read_ident());
412                }
413                Some('+') => {
414                    self.advance();
415                    if self.current() == Some('=') {
416                        self.advance();
417                        tokens.push(Token::PlusEq);
418                    } else {
419                        tokens.push(Token::Plus);
420                    }
421                }
422                Some('-') => {
423                    self.advance();
424                    if self.current() == Some('>') {
425                        self.advance();
426                        tokens.push(Token::Arrow);
427                    } else if self.current() == Some('=') {
428                        self.advance();
429                        tokens.push(Token::MinusEq);
430                    } else {
431                        tokens.push(Token::Minus);
432                    }
433                }
434                Some('*') => {
435                    self.advance();
436                    if self.current() == Some('=') {
437                        self.advance();
438                        tokens.push(Token::StarEq);
439                    } else {
440                        tokens.push(Token::Star);
441                    }
442                }
443                Some('%') => {
444                    self.advance();
445                    if self.current() == Some('=') {
446                        self.advance();
447                        tokens.push(Token::PercentEq);
448                    } else {
449                        tokens.push(Token::Percent);
450                    }
451                }
452                Some('=') => {
453                    self.advance();
454                    if self.current() == Some('=') {
455                        self.advance();
456                        tokens.push(Token::EqEq);
457                    } else if self.current() == Some('>') {
458                        self.advance();
459                        tokens.push(Token::FatArrow);
460                    } else {
461                        tokens.push(Token::Eq);
462                    }
463                }
464                Some('!') => {
465                    self.advance();
466                    if self.current() == Some('=') {
467                        self.advance();
468                        tokens.push(Token::NotEq);
469                    } else {
470                        tokens.push(Token::Not);
471                    }
472                }
473                Some('<') => {
474                    self.advance();
475                    if self.current() == Some('=') {
476                        self.advance();
477                        tokens.push(Token::LtEq);
478                    } else if self.current() == Some('<') {
479                        self.advance();
480                        tokens.push(Token::Shl);
481                    } else {
482                        tokens.push(Token::Lt);
483                    }
484                }
485                Some('>') => {
486                    self.advance();
487                    if self.current() == Some('=') {
488                        self.advance();
489                        tokens.push(Token::GtEq);
490                    } else if self.current() == Some('>') {
491                        self.advance();
492                        tokens.push(Token::Shr);
493                    } else {
494                        tokens.push(Token::Gt);
495                    }
496                }
497                Some('&') => {
498                    self.advance();
499                    if self.current() == Some('&') {
500                        self.advance();
501                        tokens.push(Token::And);
502                    } else {
503                        tokens.push(Token::Ampersand);
504                    }
505                }
506                Some('|') => {
507                    self.advance();
508                    if self.current() == Some('|') {
509                        self.advance();
510                        tokens.push(Token::Or);
511                    } else {
512                        tokens.push(Token::Pipe);
513                    }
514                }
515                Some('^') => {
516                    self.advance();
517                    tokens.push(Token::Caret);
518                }
519                Some('~') => {
520                    self.advance();
521                    tokens.push(Token::Tilde);
522                }
523                Some('(') => {
524                    self.advance();
525                    tokens.push(Token::LParen);
526                }
527                Some(')') => {
528                    self.advance();
529                    tokens.push(Token::RParen);
530                }
531                Some('{') => {
532                    self.advance();
533                    tokens.push(Token::LBrace);
534                }
535                Some('}') => {
536                    self.advance();
537                    tokens.push(Token::RBrace);
538                }
539                Some('[') => {
540                    self.advance();
541                    tokens.push(Token::LBracket);
542                }
543                Some(']') => {
544                    self.advance();
545                    tokens.push(Token::RBracket);
546                }
547                Some(',') => {
548                    self.advance();
549                    tokens.push(Token::Comma);
550                }
551                Some(';') => {
552                    self.advance();
553                    tokens.push(Token::Semicolon);
554                }
555                Some(':') => {
556                    self.advance();
557                    if self.current() == Some(':') {
558                        self.advance();
559                        tokens.push(Token::ColonColon);
560                    } else {
561                        tokens.push(Token::Colon);
562                    }
563                }
564                Some('.') => {
565                    self.advance();
566                    if self.current() == Some('.') {
567                        self.advance();
568                        if self.current() == Some('=') {
569                            self.advance();
570                            tokens.push(Token::DotDotEq);
571                        } else {
572                            tokens.push(Token::DotDot);
573                        }
574                    } else {
575                        tokens.push(Token::Dot);
576                    }
577                }
578                Some('#') => {
579                    self.advance();
580                    // Skip attributes like #[...]
581                    if self.current() == Some('[') {
582                        let mut depth = 1;
583                        self.advance();
584                        while depth > 0 {
585                            match self.current() {
586                                Some('[') => {
587                                    depth += 1;
588                                    self.advance();
589                                }
590                                Some(']') => {
591                                    depth -= 1;
592                                    self.advance();
593                                }
594                                Some(_) => {
595                                    self.advance();
596                                }
597                                None => break,
598                            }
599                        }
600                        continue;
601                    }
602                    tokens.push(Token::Hash);
603                }
604                Some(c) => {
605                    return Err(self.error(format!("Unexpected character: '{c}'")));
606                }
607            }
608        }
609    }
610}