// qcl/token.rs

1use alloc::{format, string::String, sync::Arc, vec::Vec};
2use core::fmt::Debug;
3
4use crate::error::{Error, Result};
5
/// A single lexical token produced by [`Tokenizer`].
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `;`
    Semicolon,
    /// `nil`
    Nil,
    /// `==`
    Eq,
    /// `!=`
    Ne,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `>=`
    Ge,
    /// `<=`
    Le,
    /// `in`
    In,
    /// `&&`
    And,
    /// `||`
    Or,
    /// `!`
    Not,
    /// `+`
    Add,
    /// `-`
    Sub,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `@`
    At,
    /// `?`
    Question,
    /// `??`
    QuestionQuestion,
    /// String literal, e.g. `"abc"`
    Str(Arc<str>),
    /// Integer literal, e.g. `1`
    Int(i64),
    /// Float literal, e.g. `1.1`
    Float(f64),
    /// `true` or `false`
    Bool(bool),
    /// Identifier
    Id(Arc<str>),
}
43
44/// [chars] and [idx] can be used for syntax error reporting.
45pub struct Tokenizer {
46    chars: Vec<char>,
47    idx: usize,
48    len: usize,
49    pub tokens: Vec<Token>,
50}
51
52impl Tokenizer {
53    #[allow(clippy::new_ret_no_self)]
54    pub fn new(s: &str) -> Result<Vec<Token>> {
55        let chars: Vec<char> = s.chars().collect();
56        let len = chars.len();
57        let mut t = Tokenizer {
58            chars,
59            idx: 0,
60            len,
61            tokens: Vec::with_capacity(s.len() / 4), // Preallocate a reasonable size
62        };
63        t.parse()?;
64        Ok(t.tokens)
65    }
66
67    fn eof(&self) -> bool {
68        self.idx >= self.len
69    }
70
71    fn peek(&self, offset: usize) -> Option<char> {
72        self.chars.get(self.idx + offset).copied()
73    }
74
75    fn err<T: AsRef<str>>(&self, msg: T) -> String {
76        // Compute line:col from idx by scanning from start
77        let mut line = 1usize;
78        let mut col = 1usize;
79        for i in 0..self.idx.min(self.len) {
80            if self.chars[i] == '\n' {
81                line += 1;
82                col = 1;
83            } else {
84                col += 1;
85            }
86        }
87        // Collect near 10(max) chars around the error position
88        let r_idx = if self.idx + 5 < self.len {
89            self.idx + 5
90        } else {
91            self.len
92        };
93        let l_idx = self.idx.saturating_sub(5);
94        let r_idx = if r_idx > self.len { self.len } else { r_idx };
95        let chars = &self.chars[l_idx..r_idx];
96        let chars: String = chars.iter().collect();
97        let c = self.chars.get(self.idx);
98        let ctx = if let Some(&c) = c {
99            format!("'{}' at {}:{}, near '{}'", c, line, col, chars)
100        } else {
101            format!("at end ({}:{}), near '{}'", line, col, chars)
102        };
103        format!("Syntax error:\n{} ({})", msg.as_ref(), ctx)
104    }
105
106    fn skip_whitespace(&mut self) {
107        while self.idx < self.len && self.chars[self.idx].is_whitespace() {
108            self.idx += 1;
109        }
110    }
111
112    fn is_id_start(c: char) -> bool {
113        c.is_alphabetic() || c == '_'
114    }
115
116    fn is_id_continue(c: char) -> bool {
117        c.is_alphanumeric() || c == '_' || c == '-'
118    }
119
120    fn parse_str(&mut self) -> Result<()> {
121        let mut s = String::new();
122        let quote = self.chars[self.idx];
123        self.idx += 1;
124
125        while !self.eof() {
126            let c = self.chars[self.idx];
127            match c {
128                '\\' => {
129                    self.idx += 1;
130                    if self.eof() {
131                        return Err(Error::Tokenize(self.err("Invalid escape sequence")));
132                    }
133
134                    let escaped = match self.chars[self.idx] {
135                        '\\' => '\\',
136                        '"' => '"',
137                        '\'' => '\'',
138                        'n' => '\n',
139                        'r' => '\r',
140                        't' => '\t',
141                        '0' => '\0',
142                        'u' => {
143                            self.idx += 1;
144                            if self.idx + 4 > self.len {
145                                return Err(Error::Tokenize(self.err("Invalid \\uXXXX escape, need 4 hex digits")));
146                            }
147                            let hex: String = self.chars[self.idx..self.idx + 4].iter().collect();
148                            let code = u32::from_str_radix(&hex, 16)
149                                .map_err(|_| Error::Tokenize(self.err(format!("Invalid unicode escape: \\u{hex}"))))?;
150                            let ch = char::from_u32(code).ok_or_else(|| {
151                                Error::Tokenize(self.err(format!("Invalid unicode codepoint: \\u{hex}")))
152                            })?;
153                            self.idx += 4;
154                            s.push(ch);
155                            continue;
156                        }
157                        other => {
158                            return Err(Error::Tokenize(
159                                self.err(format!("Unsupported escape sequence: \\{other}")),
160                            ));
161                        }
162                    };
163                    s.push(escaped);
164                    self.idx += 1;
165                }
166                c if c == quote => {
167                    self.idx += 1;
168                    self.tokens.push(Token::Str(Arc::<str>::from(s)));
169                    return Ok(());
170                }
171                _ => {
172                    s.push(c);
173                    self.idx += 1;
174                }
175            }
176        }
177
178        Err(Error::Tokenize(self.err("String not closed")))
179    }
180
181    /// Check whether a sign (+/-) should be treated as the start of a numeric literal
182    /// rather than a binary operator. A sign is a prefix when there is no preceding
183    /// value-producing token.
184    fn sign_starts_number(&self) -> bool {
185        match self.tokens.last() {
186            None => true, // beginning of input
187            Some(tok) => !matches!(
188                tok,
189                Token::Int(_)
190                    | Token::Float(_)
191                    | Token::Str(_)
192                    | Token::Bool(_)
193                    | Token::Nil
194                    | Token::Id(_)
195                    | Token::RParen
196                    | Token::RBracket
197                    | Token::RBrace
198            ),
199        }
200    }
201
202    /// eg.:
203    /// - @a -> [At, Id("a")]
204    /// - @a.b -> [At, Id("a"), Dot, Id("b")]
205    /// - @a.0.1 -> [At, Id("a"), Dot, Int(0), Dot, Int(1)]
206    fn parse_num(&mut self) -> Result<()> {
207        let start_idx = self.idx;
208
209        // Handle optional sign prefix
210        if !self.eof() && (self.chars[self.idx] == '-' || self.chars[self.idx] == '+') {
211            self.idx += 1;
212        }
213
214        // Check for hex (0x) or octal (0o) prefix
215        if !self.eof()
216            && self.chars[self.idx] == '0'
217            && self.idx + 1 < self.len
218            && (self.chars[self.idx + 1] == 'x'
219                || self.chars[self.idx + 1] == 'X'
220                || self.chars[self.idx + 1] == 'o'
221                || self.chars[self.idx + 1] == 'O')
222        {
223            self.idx = start_idx;
224            return self.parse_int();
225        }
226
227        // Reset idx to after sign (or start) for normal decimal parsing
228        self.idx = start_idx;
229        let mut dot_count = 0;
230        while !self.eof() {
231            let c = self.chars[self.idx];
232            if c.is_ascii_digit() {
233                self.idx += 1;
234            } else if c == '.' {
235                if dot_count > 0 {
236                    return Err(Error::Tokenize(self.err("Invalid float, multiple '.'")));
237                }
238                self.idx += 1;
239                dot_count += 1;
240            } else if (c == '-' || c == '+') && self.idx == start_idx {
241                self.idx += 1;
242            } else {
243                break;
244            }
245        }
246
247        if self.idx > start_idx && self.chars[self.idx - 1] == '.' {
248            return Err(Error::Tokenize(self.err("Invalid float, ends with '.'")));
249        }
250
251        let num_str: String = self.chars[start_idx..self.idx].iter().collect();
252
253        let num = if dot_count > 0 {
254            match num_str.parse() {
255                Ok(f) => Token::Float(f),
256                Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid float"), num_str))),
257            }
258        } else {
259            match num_str.parse() {
260                Ok(i) => Token::Int(i),
261                Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid int"), num_str))),
262            }
263        };
264        self.tokens.push(num);
265        Ok(())
266    }
267
268    fn parse_id(&mut self) -> Result<()> {
269        if self.eof() || !Self::is_id_start(self.chars[self.idx]) {
270            return Err(Error::Tokenize(self.err("Invalid identifier start")));
271        }
272
273        let start_idx = self.idx;
274        self.idx = self.scan_id_end(start_idx);
275        self.push_id_token(start_idx, self.idx);
276        Ok(())
277    }
278
279    fn scan_id_end(&self, start_idx: usize) -> usize {
280        let mut end = start_idx + 1;
281        while end < self.len && Self::is_id_continue(self.chars[end]) {
282            end += 1;
283        }
284        end
285    }
286
287    fn push_id_token(&mut self, start_idx: usize, end_idx: usize) {
288        let id: String = self.chars[start_idx..end_idx].iter().collect();
289        self.tokens.push(Token::Id(Arc::<str>::from(id)));
290    }
291
292    fn parse_ident_or_keyword(&mut self) -> Result<()> {
293        if self.eof() || !Self::is_id_start(self.chars[self.idx]) {
294            return Err(Error::Tokenize(self.err("Invalid identifier start")));
295        }
296
297        let start_idx = self.idx;
298        let end_idx = self.scan_id_end(start_idx);
299        self.idx = end_idx;
300
301        let token = match &self.chars[start_idx..end_idx] {
302            ['t', 'r', 'u', 'e'] => Token::Bool(true),
303            ['f', 'a', 'l', 's', 'e'] => Token::Bool(false),
304            ['n', 'i', 'l'] => Token::Nil,
305            ['i', 'n'] => Token::In,
306            _ => {
307                self.push_id_token(start_idx, end_idx);
308                return Ok(());
309            }
310        };
311        self.tokens.push(token);
312        Ok(())
313    }
314
315    /// - `@a.(@b - 1)` -> [At, Id("a"), Dot, LParen, At, Id("b"), Sub, Int(1), RParen]
316    /// - `@a` -> [At, Id("a")]
317    fn parse_at_list(&mut self) -> Result<()> {
318        self.idx += 1;
319        self.tokens.push(Token::At);
320
321        while !self.eof() {
322            let c = self.chars[self.idx];
323            if Self::is_id_start(c) {
324                self.parse_id()?;
325                continue;
326            }
327            if c.is_ascii_digit() {
328                self.parse_int()?;
329                continue;
330            }
331            if matches!(c, '+' | '-') && self.tokens.last().is_some_and(|tok| tok == &Token::Dot) {
332                if self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
333                    self.parse_int()?;
334                    continue;
335                }
336            }
337
338            if c == '.' {
339                self.idx += 1;
340                self.tokens.push(Token::Dot);
341                continue;
342            }
343            break;
344        }
345        Ok(())
346    }
347
348    fn parse_int(&mut self) -> Result<()> {
349        let start_idx = self.idx;
350
351        if !self.eof() && (self.chars[self.idx] == '-' || self.chars[self.idx] == '+') {
352            self.idx += 1;
353        }
354
355        if !self.eof()
356            && self.chars[self.idx] == '0'
357            && self.idx + 1 < self.len
358            && (self.chars[self.idx + 1] == 'x'
359                || self.chars[self.idx + 1] == 'X'
360                || self.chars[self.idx + 1] == 'o'
361                || self.chars[self.idx + 1] == 'O')
362        {
363            let is_hex = self.chars[self.idx + 1] == 'x' || self.chars[self.idx + 1] == 'X';
364            let radix = if is_hex { 16 } else { 8 };
365            self.idx += 2; // skip '0x' or '0o'
366            let digits_start = self.idx;
367            while !self.eof() {
368                let c = self.chars[self.idx];
369                let valid = if is_hex {
370                    c.is_ascii_hexdigit()
371                } else {
372                    matches!(c, '0'..='7')
373                };
374                if valid {
375                    self.idx += 1;
376                } else {
377                    break;
378                }
379            }
380            if self.idx == digits_start {
381                let label = if is_hex { "hex" } else { "octal" };
382                return Err(Error::Tokenize(self.err(format!("Invalid {label} literal, no digits"))));
383            }
384            let digits: String = self.chars[digits_start..self.idx].iter().collect();
385            let val = i64::from_str_radix(&digits, radix).map_err(|_| {
386                Error::Tokenize(self.err(format!("Invalid int: 0{}{}", if is_hex { "x" } else { "o" }, digits)))
387            })?;
388            let val = if start_idx < self.chars.len() && self.chars[start_idx] == '-' {
389                val.checked_neg()
390                    .ok_or_else(|| Error::Tokenize(self.err("Integer overflow")))?
391            } else {
392                val
393            };
394            self.tokens.push(Token::Int(val));
395            return Ok(());
396        }
397
398        while !self.eof() {
399            let c = self.chars[self.idx];
400            if c.is_ascii_digit() {
401                self.idx += 1;
402            } else {
403                break;
404            }
405        }
406
407        let num_str: String = self.chars[start_idx..self.idx].iter().collect();
408        let num = match num_str.parse() {
409            Ok(i) => i,
410            Err(_) => return Err(Error::Tokenize(format!("{}: {}", self.err("Invalid int"), num_str))),
411        };
412        self.tokens.push(Token::Int(num));
413        Ok(())
414    }
415
416    fn parse_punctuations(&mut self) -> Result<()> {
417        let c = self.chars[self.idx];
418        match c {
419            '(' => {
420                self.idx += 1;
421                self.tokens.push(Token::LParen);
422                Ok(())
423            }
424            ')' => {
425                self.idx += 1;
426                self.tokens.push(Token::RParen);
427                Ok(())
428            }
429            '{' => {
430                self.idx += 1;
431                self.tokens.push(Token::LBrace);
432                Ok(())
433            }
434            '}' => {
435                self.idx += 1;
436                self.tokens.push(Token::RBrace);
437                Ok(())
438            }
439            '[' => {
440                self.idx += 1;
441                self.tokens.push(Token::LBracket);
442                Ok(())
443            }
444            ']' => {
445                self.idx += 1;
446                self.tokens.push(Token::RBracket);
447                Ok(())
448            }
449            ':' => {
450                self.idx += 1;
451                self.tokens.push(Token::Colon);
452                Ok(())
453            }
454            ',' => {
455                self.idx += 1;
456                self.tokens.push(Token::Comma);
457                Ok(())
458            }
459            ';' => {
460                self.idx += 1;
461                self.tokens.push(Token::Semicolon);
462                Ok(())
463            }
464            '.' => {
465                self.idx += 1;
466                self.tokens.push(Token::Dot);
467                if self.starts_int_literal() {
468                    return self.parse_int();
469                }
470                Ok(())
471            }
472            '&' => {
473                if self.peek(1) == Some('&') {
474                    self.idx += 2;
475                    self.tokens.push(Token::And);
476                    Ok(())
477                } else {
478                    Err(Error::Tokenize(self.err("Expect '&&'")))
479                }
480            }
481            '|' => {
482                if self.peek(1) == Some('|') {
483                    self.idx += 2;
484                    self.tokens.push(Token::Or);
485                    Ok(())
486                } else {
487                    Err(Error::Tokenize(self.err("Expect '||'")))
488                }
489            }
490            '+' => {
491                if self.sign_starts_number() && self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
492                    return self.parse_num();
493                }
494                self.idx += 1;
495                self.tokens.push(Token::Add);
496                Ok(())
497            }
498            '-' => {
499                if self.sign_starts_number() && self.peek(1).is_some_and(|next| next.is_ascii_digit()) {
500                    return self.parse_num();
501                }
502                self.idx += 1;
503                self.tokens.push(Token::Sub);
504                Ok(())
505            }
506            '*' => {
507                self.idx += 1;
508                self.tokens.push(Token::Mul);
509                Ok(())
510            }
511            '/' => {
512                if self.peek(1) == Some('*') {
513                    // Multi-line comment with nesting support
514                    self.idx += 2;
515                    let mut depth = 1usize;
516                    while !self.eof() && depth > 0 {
517                        if self.chars[self.idx] == '/' && self.peek(1) == Some('*') {
518                            depth += 1;
519                            self.idx += 2;
520                        } else if self.chars[self.idx] == '*' && self.peek(1) == Some('/') {
521                            depth -= 1;
522                            self.idx += 2;
523                        } else {
524                            self.idx += 1;
525                        }
526                    }
527                    if depth > 0 {
528                        return Err(Error::Tokenize(self.err("Unterminated block comment")));
529                    }
530                } else if self.peek(1) == Some('/') {
531                    self.idx += 2;
532                    // Skip single-line comment
533                    while !self.eof() {
534                        let c = self.chars[self.idx];
535                        if c == '\n' {
536                            self.idx += 1;
537                            break;
538                        }
539                        self.idx += 1;
540                    }
541                } else {
542                    self.idx += 1;
543                    self.tokens.push(Token::Div);
544                }
545                Ok(())
546            }
547            '%' => {
548                self.idx += 1;
549                self.tokens.push(Token::Mod);
550                Ok(())
551            }
552            '@' => self.parse_at_list(),
553            '=' => {
554                if self.peek(1) == Some('=') {
555                    self.idx += 2;
556                    self.tokens.push(Token::Eq);
557                    Ok(())
558                } else {
559                    Err(Error::Tokenize(self.err("Expect '=='")))
560                }
561            }
562            '!' => {
563                if self.peek(1) == Some('=') {
564                    self.idx += 2;
565                    self.tokens.push(Token::Ne);
566                    Ok(())
567                } else {
568                    self.idx += 1;
569                    self.tokens.push(Token::Not);
570                    Ok(())
571                }
572            }
573            '>' => {
574                if self.peek(1) == Some('=') {
575                    self.idx += 2;
576                    self.tokens.push(Token::Ge);
577                    Ok(())
578                } else {
579                    self.idx += 1;
580                    self.tokens.push(Token::Gt);
581                    Ok(())
582                }
583            }
584            '<' => {
585                if self.peek(1) == Some('=') {
586                    self.idx += 2;
587                    self.tokens.push(Token::Le);
588                    Ok(())
589                } else {
590                    self.idx += 1;
591                    self.tokens.push(Token::Lt);
592                    Ok(())
593                }
594            }
595            '?' => {
596                if self.peek(1) == Some('?') {
597                    self.idx += 2;
598                    self.tokens.push(Token::QuestionQuestion);
599                } else {
600                    self.idx += 1;
601                    self.tokens.push(Token::Question);
602                }
603                Ok(())
604            }
605            _ => Err(Error::Tokenize(self.err("Unknown punctuation"))),
606        }
607    }
608
609    fn parse(&mut self) -> Result<()> {
610        while !self.eof() {
611            self.skip_whitespace();
612            if self.eof() {
613                break;
614            }
615            let c = self.chars[self.idx];
616            match c {
617                '"' | '\'' => {
618                    self.parse_str()?;
619                }
620                '0'..='9' => {
621                    self.parse_num()?;
622                }
623                _ => {
624                    if self.is_punctuation(c) {
625                        self.parse_punctuations()?;
626                    } else if Self::is_id_start(c) {
627                        self.parse_ident_or_keyword()?;
628                    } else {
629                        return Err(Error::Tokenize(self.err("Invalid identifier start")));
630                    }
631                }
632            }
633        }
634        Ok(())
635    }
636
637    fn is_punctuation(&self, c: char) -> bool {
638        matches!(
639            c,
640            '(' | ')'
641                | '{'
642                | '}'
643                | '['
644                | ']'
645                | '.'
646                | ':'
647                | ','
648                | ';'
649                | '&'
650                | '|'
651                | '+'
652                | '-'
653                | '*'
654                | '/'
655                | '%'
656                | '@'
657                | '='
658                | '!'
659                | '>'
660                | '<'
661                | '?'
662        )
663    }
664
665    fn starts_int_literal(&self) -> bool {
666        if self.eof() {
667            return false;
668        }
669
670        let c = self.chars[self.idx];
671        if c.is_ascii_digit() {
672            return true;
673        }
674
675        if matches!(c, '+' | '-') {
676            return self.peek(1).is_some_and(|next| next.is_ascii_digit());
677        }
678
679        false
680    }
681}