// gracile_core/lexer.rs
//! Tokenizer that converts raw template source into a stream of tokens.

use crate::error::{Error, Result, Span};

/// All token variants produced by the lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // --- Template-level payload tokens ---------------------------------
    /// Verbatim text outside any tag.
    RawText(String),
    /// Content of a raw block (between `{#raw}` and `{/raw}`).
    RawBody(String),
    /// Content of a comment (between `{!` and `!}`).
    CommentBody(String),

    // --- Tag delimiters -------------------------------------------------
    BlockOpen,    // {#
    ContinueOpen, // {:
    BlockClose,   // {/
    SpecialOpen,  // {@
    CommentOpen,  // {!
    ExprOpen,     // {=  (escaped expression interpolation)
    ExprOpenRaw,  // {~  (raw/unescaped expression interpolation)

    Close,        // }
    CommentClose, // !}

    // --- Keywords (see `keyword_or_ident`) ------------------------------
    KwIf,
    KwElse,
    KwEach,
    KwAs,
    KwSnippet,
    KwRaw,
    KwRender,
    KwConst,
    KwInclude,
    KwDebug,
    KwIs,
    KwNot,
    KwIn,

    // --- Literals --------------------------------------------------------
    StringLit(String),
    IntLit(i64),
    FloatLit(f64),
    True,
    False,
    Null,

    /// A bare identifier that is not a keyword.
    Ident(String),

    // --- Operators -------------------------------------------------------
    Pipe,         // |
    Or,           // ||
    And,          // &&
    Question,     // ?
    NullCoalesce, // ??
    Colon,        // :
    Eq,           // ==
    Neq,          // !=
    Assign,       // =
    Lt,           // <
    Gt,           // >
    Lte,          // <=
    Gte,          // >=
    Add,          // +
    Sub,          // -
    Mul,          // *
    Div,          // /
    Mod,          // %
    Bang,         // ! (unary NOT)
    Dot,          // .

    // --- Punctuation ------------------------------------------------------
    LParen,   // (
    RParen,   // )
    LBracket, // [
    RBracket, // ]
    LBraceD,  // {  (destructuring open)
    RBraceD,  // }  (destructuring close)
    Comma,    // ,

    /// Synthetic end-of-input marker; always the last token in a stream.
    Eof,
}
80
81impl std::fmt::Display for TokenKind {
82    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
83        let s = match self {
84            TokenKind::RawText(_) => "raw text",
85            TokenKind::RawBody(_) => "raw block content",
86            TokenKind::CommentBody(_) => "comment content",
87
88            TokenKind::BlockOpen => "'{#'",
89            TokenKind::ContinueOpen => "'{:'",
90            TokenKind::BlockClose => "'{/'",
91            TokenKind::SpecialOpen => "'{@'",
92            TokenKind::CommentOpen => "'{!'",
93            TokenKind::ExprOpen => "'{='",
94            TokenKind::ExprOpenRaw => "'{~'",
95            TokenKind::Close => "'}'",
96            TokenKind::CommentClose => "'!}'",
97
98            TokenKind::KwIf => "keyword 'if'",
99            TokenKind::KwElse => "keyword 'else'",
100            TokenKind::KwEach => "keyword 'each'",
101            TokenKind::KwAs => "keyword 'as'",
102            TokenKind::KwSnippet => "keyword 'snippet'",
103            TokenKind::KwRaw => "keyword 'raw'",
104            TokenKind::KwRender => "keyword 'render'",
105            TokenKind::KwConst => "keyword 'const'",
106            TokenKind::KwInclude => "keyword 'include'",
107            TokenKind::KwDebug => "keyword 'debug'",
108            TokenKind::KwIs => "keyword 'is'",
109            TokenKind::KwNot => "keyword 'not'",
110            TokenKind::KwIn => "keyword 'in'",
111
112            TokenKind::True => "'true'",
113            TokenKind::False => "'false'",
114            TokenKind::Null => "'null'",
115
116            TokenKind::Pipe => "'|'",
117            TokenKind::Or => "'||'",
118            TokenKind::And => "'&&'",
119            TokenKind::Question => "'?'",
120            TokenKind::NullCoalesce => "'??'",
121            TokenKind::Colon => "':'",
122            TokenKind::Eq => "'=='",
123            TokenKind::Neq => "'!='",
124            TokenKind::Assign => "'='",
125            TokenKind::Lt => "'<'",
126            TokenKind::Gt => "'>'",
127            TokenKind::Lte => "'<='",
128            TokenKind::Gte => "'>='",
129            TokenKind::Add => "'+'",
130            TokenKind::Sub => "'-'",
131            TokenKind::Mul => "'*'",
132            TokenKind::Div => "'/'",
133            TokenKind::Mod => "'%'",
134            TokenKind::Bang => "'!'",
135            TokenKind::Dot => "'.'",
136
137            TokenKind::LParen => "'('",
138            TokenKind::RParen => "')'",
139            TokenKind::LBracket => "'['",
140            TokenKind::RBracket => "']'",
141            TokenKind::LBraceD => "'{'",
142            TokenKind::RBraceD => "'}'",
143            TokenKind::Comma => "','",
144
145            TokenKind::Eof => "end of template",
146
147            // Variants with data are handled below.
148            TokenKind::StringLit(s) => return write!(f, "string '{s}'"),
149            TokenKind::IntLit(n) => return write!(f, "integer '{n}'"),
150            TokenKind::FloatLit(n) => return write!(f, "float '{n}'"),
151            TokenKind::Ident(s) => return write!(f, "'{s}'"),
152        };
153        f.write_str(s)
154    }
155}
156
/// A lexed token with source position.
#[derive(Debug, Clone)]
pub struct Token {
    /// What was lexed.
    pub kind: TokenKind,
    /// Position where the token starts (line/column/byte offset).
    pub span: Span,
}
163
/// Character-by-character tokenizer.
pub struct Lexer {
    /// Source decoded into `char`s for O(1) indexed lookahead.
    chars: Vec<char>,
    /// Index of the next unread character in `chars`.
    pos: usize,
    /// Current line, 1-based; incremented on `'\n'`.
    line: u32,
    /// Current column, 1-based; reset to 1 after `'\n'`.
    col: u32,
    /// Byte offset into the original UTF-8 source (advances by `len_utf8`).
    offset: usize,
}
172
173impl Lexer {
174    pub fn new(src: &str) -> Self {
175        Lexer {
176            chars: src.chars().collect(),
177            pos: 0,
178            line: 1,
179            col: 1,
180            offset: 0,
181        }
182    }
183
184    /// Tokenise the entire source and return the token stream.
185    pub fn tokenize(mut self) -> Result<Vec<Token>> {
186        let mut tokens = Vec::new();
187        self.lex_template(&mut tokens)?;
188        tokens.push(Token {
189            kind: TokenKind::Eof,
190            span: self.span(),
191        });
192        Ok(tokens)
193    }
194
195    fn span(&self) -> Span {
196        Span::new(self.line, self.col, self.offset)
197    }
198
199    fn peek(&self) -> Option<char> {
200        self.chars.get(self.pos).copied()
201    }
202
203    fn peek_at(&self, offset: usize) -> Option<char> {
204        self.chars.get(self.pos + offset).copied()
205    }
206
207    fn matches_at(&self, offset: usize, s: &str) -> bool {
208        s.chars()
209            .enumerate()
210            .all(|(i, c)| self.peek_at(offset + i) == Some(c))
211    }
212
213    fn advance(&mut self) -> Option<char> {
214        let c = self.chars.get(self.pos).copied()?;
215        self.pos += 1;
216        self.offset += c.len_utf8();
217        if c == '\n' {
218            self.line += 1;
219            self.col = 1;
220        } else {
221            self.col += 1;
222        }
223        Some(c)
224    }
225
226    fn advance_if(&mut self, c: char) -> bool {
227        if self.peek() == Some(c) {
228            self.advance();
229            true
230        } else {
231            false
232        }
233    }
234
235    fn at_end(&self) -> bool {
236        self.pos >= self.chars.len()
237    }
238
239    fn lex_template(&mut self, tokens: &mut Vec<Token>) -> Result<()> {
240        while !self.at_end() {
241            if self.peek() != Some('{') {
242                self.lex_raw_text(tokens);
243                continue;
244            }
245
246            let sigil = self.peek_at(1);
247
248            match sigil {
249                Some('#') => {
250                    // Detect raw block: {#raw}
251                    if self.matches_at(2, "raw}") {
252                        self.lex_raw_block(tokens)?;
253                    } else {
254                        let span = self.span();
255                        self.advance(); // `{`
256                        self.advance(); // `#`
257                        tokens.push(mk(TokenKind::BlockOpen, span));
258                        self.lex_tag(tokens)?;
259                    }
260                }
261                Some(':') => {
262                    let span = self.span();
263                    self.advance(); // `{`
264                    self.advance(); // `:`
265                    tokens.push(mk(TokenKind::ContinueOpen, span));
266                    self.lex_tag(tokens)?;
267                }
268                Some('/') => {
269                    let span = self.span();
270                    self.advance(); // `{`
271                    self.advance(); // `/`
272                    tokens.push(mk(TokenKind::BlockClose, span));
273                    self.lex_tag(tokens)?;
274                }
275                Some('@') => {
276                    let span = self.span();
277                    self.advance(); // `{`
278                    self.advance(); // `@`
279                    tokens.push(mk(TokenKind::SpecialOpen, span));
280                    self.lex_tag(tokens)?;
281                }
282                Some('!') => {
283                    let span = self.span();
284                    self.advance(); // `{`
285                    self.advance(); // `!`
286                    tokens.push(mk(TokenKind::CommentOpen, span));
287                    self.lex_comment(tokens)?;
288                }
289                Some('=') => {
290                    // `{=` — escaped expression interpolation.
291                    let span = self.span();
292                    self.advance(); // `{`
293                    self.advance(); // `=`
294                    tokens.push(mk(TokenKind::ExprOpen, span));
295                    self.lex_tag(tokens)?;
296                }
297                Some('~') => {
298                    // `{~` — raw (unescaped) expression interpolation.
299                    let span = self.span();
300                    self.advance(); // `{`
301                    self.advance(); // `~`
302                    tokens.push(mk(TokenKind::ExprOpenRaw, span));
303                    self.lex_tag(tokens)?;
304                }
305                Some('\\') => {
306                    // `{\=` → literal `{=`; `{\~` → literal `{~`.
307                    // Any other `{\X` falls through to the bare-`{` branch.
308                    let escaped = self.peek_at(2);
309                    if matches!(escaped, Some('=') | Some('~')) {
310                        let span = self.span();
311                        self.advance(); // `{`
312                        self.advance(); // `\`
313                        let sigil = self.advance().unwrap(); // `=` or `~`
314                        let mut text = format!("{{{sigil}");
315                        // absorb further non-`{` characters into the same RawText token
316                        while !self.at_end() && self.peek() != Some('{') {
317                            text.push(self.advance().unwrap());
318                        }
319                        tokens.push(mk(TokenKind::RawText(text), span));
320                    } else {
321                        // Bare `{\` not followed by a recognised escape — literal text.
322                        let span = self.span();
323                        self.advance(); // consume `{`
324                        let mut text = String::from("{");
325                        while !self.at_end() && self.peek() != Some('{') {
326                            text.push(self.advance().unwrap());
327                        }
328                        tokens.push(mk(TokenKind::RawText(text), span));
329                    }
330                }
331                _ => {
332                    // Bare `{` not followed by a recognised sigil — always literal text.
333                    let span = self.span();
334                    self.advance(); // consume `{`
335                    let mut text = String::from("{");
336                    while !self.at_end() && self.peek() != Some('{') {
337                        text.push(self.advance().unwrap());
338                    }
339                    tokens.push(mk(TokenKind::RawText(text), span));
340                }
341            }
342        }
343        Ok(())
344    }
345
346    fn lex_raw_text(&mut self, tokens: &mut Vec<Token>) {
347        let span = self.span();
348        let mut text = String::new();
349        while !self.at_end() && self.peek() != Some('{') {
350            text.push(self.advance().unwrap());
351        }
352        if !text.is_empty() {
353            tokens.push(mk(TokenKind::RawText(text), span));
354        }
355    }
356
357    fn lex_raw_block(&mut self, tokens: &mut Vec<Token>) -> Result<()> {
358        let open_span = self.span();
359        // Consume `{#raw}`  (6 chars)
360        for _ in 0..6 {
361            self.advance();
362        }
363        tokens.push(mk(TokenKind::BlockOpen, open_span));
364        tokens.push(mk(TokenKind::KwRaw, self.span()));
365        tokens.push(mk(TokenKind::Close, self.span()));
366
367        // Scan body until `{/raw}`
368        let body_span = self.span();
369        let mut body = String::new();
370        loop {
371            if self.at_end() {
372                return Err(Error::LexError {
373                    message: "Unclosed {#raw} block — expected {/raw}".to_string(),
374                    span: self.span(),
375                });
376            }
377            if self.matches_at(0, "{/raw}") {
378                break;
379            }
380            body.push(self.advance().unwrap());
381        }
382        tokens.push(mk(TokenKind::RawBody(body), body_span));
383
384        // Consume `{/raw}` (6 chars)
385        let close_span = self.span();
386        for _ in 0..6 {
387            self.advance();
388        }
389        tokens.push(mk(TokenKind::BlockClose, close_span.clone()));
390        tokens.push(mk(TokenKind::KwRaw, close_span.clone()));
391        tokens.push(mk(TokenKind::Close, close_span));
392        Ok(())
393    }
394
395    /// Tokenise the contents of a tag until the matching `}`.
396    /// Tracks brace depth so that destructuring patterns like `{ name, age }`
397    /// inside `{#each ... as { name, age }}` don't prematurely close the tag.
398    fn lex_tag(&mut self, tokens: &mut Vec<Token>) -> Result<()> {
399        let mut brace_depth: usize = 0;
400        loop {
401            self.skip_ws();
402            if self.at_end() {
403                return Err(Error::LexError {
404                    message: "Unexpected end of input inside tag".to_string(),
405                    span: self.span(),
406                });
407            }
408
409            if brace_depth == 0 && self.peek() == Some('}') {
410                let span = self.span();
411                self.advance();
412                tokens.push(mk(TokenKind::Close, span));
413                return Ok(());
414            }
415
416            if self.peek() == Some('{') {
417                brace_depth += 1;
418                let span = self.span();
419                self.advance();
420                tokens.push(mk(TokenKind::LBraceD, span));
421                continue;
422            }
423
424            if self.peek() == Some('}') {
425                // brace_depth > 0 here
426                brace_depth -= 1;
427                let span = self.span();
428                self.advance();
429                tokens.push(mk(TokenKind::RBraceD, span));
430                continue;
431            }
432
433            let tok = self.next_tag_token()?;
434            tokens.push(tok);
435        }
436    }
437
438    fn lex_comment(&mut self, tokens: &mut Vec<Token>) -> Result<()> {
439        let body_span = self.span();
440        let mut body = String::new();
441        loop {
442            if self.at_end() {
443                return Err(Error::LexError {
444                    message: "Unclosed comment — expected !}".to_string(),
445                    span: self.span(),
446                });
447            }
448            if self.peek() == Some('!') && self.peek_at(1) == Some('}') {
449                let close_span = self.span();
450                self.advance(); // `!`
451                self.advance(); // `}`
452                tokens.push(mk(TokenKind::CommentBody(body), body_span));
453                tokens.push(mk(TokenKind::CommentClose, close_span));
454                return Ok(());
455            }
456            body.push(self.advance().unwrap());
457        }
458    }
459
460    fn skip_ws(&mut self) {
461        while matches!(
462            self.peek(),
463            Some(' ') | Some('\t') | Some('\n') | Some('\r')
464        ) {
465            self.advance();
466        }
467    }
468
469    fn next_tag_token(&mut self) -> Result<Token> {
470        let span = self.span();
471        let c = self.peek().unwrap();
472
473        match c {
474            '"' | '\'' => {
475                let s = self.lex_string(c)?;
476                Ok(mk(TokenKind::StringLit(s), span))
477            }
478            '0'..='9' => {
479                let kind = self.lex_number()?;
480                Ok(mk(kind, span))
481            }
482            'a'..='z' | 'A'..='Z' | '_' => {
483                let name = self.lex_ident();
484                let kind = keyword_or_ident(name);
485                Ok(mk(kind, span))
486            }
487            '|' => {
488                self.advance();
489                let kind = if self.advance_if('|') {
490                    TokenKind::Or
491                } else {
492                    TokenKind::Pipe
493                };
494                Ok(mk(kind, span))
495            }
496            '&' => {
497                self.advance();
498                if self.advance_if('&') {
499                    Ok(mk(TokenKind::And, span))
500                } else {
501                    Err(Error::LexError {
502                        message: "Expected '&&' — lone '&' is not valid".to_string(),
503                        span,
504                    })
505                }
506            }
507            '?' => {
508                self.advance();
509                let kind = if self.advance_if('?') {
510                    TokenKind::NullCoalesce
511                } else {
512                    TokenKind::Question
513                };
514                Ok(mk(kind, span))
515            }
516            ':' => {
517                self.advance();
518                Ok(mk(TokenKind::Colon, span))
519            }
520            '=' => {
521                self.advance();
522                let kind = if self.advance_if('=') {
523                    TokenKind::Eq
524                } else {
525                    TokenKind::Assign
526                };
527                Ok(mk(kind, span))
528            }
529            '!' => {
530                self.advance();
531                let kind = if self.advance_if('=') {
532                    TokenKind::Neq
533                } else {
534                    TokenKind::Bang
535                };
536                Ok(mk(kind, span))
537            }
538            '<' => {
539                self.advance();
540                let kind = if self.advance_if('=') {
541                    TokenKind::Lte
542                } else {
543                    TokenKind::Lt
544                };
545                Ok(mk(kind, span))
546            }
547            '>' => {
548                self.advance();
549                let kind = if self.advance_if('=') {
550                    TokenKind::Gte
551                } else {
552                    TokenKind::Gt
553                };
554                Ok(mk(kind, span))
555            }
556            '+' => {
557                self.advance();
558                Ok(mk(TokenKind::Add, span))
559            }
560            '-' => {
561                self.advance();
562                Ok(mk(TokenKind::Sub, span))
563            }
564            '*' => {
565                self.advance();
566                Ok(mk(TokenKind::Mul, span))
567            }
568            '/' => {
569                self.advance();
570                Ok(mk(TokenKind::Div, span))
571            }
572            '%' => {
573                self.advance();
574                Ok(mk(TokenKind::Mod, span))
575            }
576            '.' => {
577                self.advance();
578                Ok(mk(TokenKind::Dot, span))
579            }
580            '(' => {
581                self.advance();
582                Ok(mk(TokenKind::LParen, span))
583            }
584            ')' => {
585                self.advance();
586                Ok(mk(TokenKind::RParen, span))
587            }
588            '[' => {
589                self.advance();
590                Ok(mk(TokenKind::LBracket, span))
591            }
592            ']' => {
593                self.advance();
594                Ok(mk(TokenKind::RBracket, span))
595            }
596            ',' => {
597                self.advance();
598                Ok(mk(TokenKind::Comma, span))
599            }
600            other => Err(Error::LexError {
601                message: format!("Unexpected character '{}' inside tag", other),
602                span,
603            }),
604        }
605    }
606
607    fn lex_ident(&mut self) -> String {
608        let mut s = String::new();
609        while matches!(
610            self.peek(),
611            Some('a'..='z') | Some('A'..='Z') | Some('0'..='9') | Some('_')
612        ) {
613            s.push(self.advance().unwrap());
614        }
615        s
616    }
617
618    fn lex_string(&mut self, quote: char) -> Result<String> {
619        self.advance(); // opening quote
620        let mut s = String::new();
621        loop {
622            match self.advance() {
623                None => {
624                    return Err(Error::LexError {
625                        message: "Unterminated string literal".to_string(),
626                        span: self.span(),
627                    });
628                }
629                Some(c) if c == quote => break,
630                Some('\\') => {
631                    let esc_span = self.span();
632                    match self.advance() {
633                        Some('"') => s.push('"'),
634                        Some('\'') => s.push('\''),
635                        Some('\\') => s.push('\\'),
636                        Some('n') => s.push('\n'),
637                        Some('r') => s.push('\r'),
638                        Some('t') => s.push('\t'),
639                        Some('0') => s.push('\0'),
640                        Some('u') => {
641                            if !self.advance_if('{') {
642                                return Err(Error::LexError {
643                                    message: "Expected '{' after \\u".to_string(),
644                                    span: self.span(),
645                                });
646                            }
647                            let mut hex = String::new();
648                            while matches!(
649                                self.peek(),
650                                Some('0'..='9') | Some('a'..='f') | Some('A'..='F')
651                            ) {
652                                hex.push(self.advance().unwrap());
653                            }
654                            if !self.advance_if('}') {
655                                return Err(Error::LexError {
656                                    message: "Expected '}' after unicode escape".to_string(),
657                                    span: self.span(),
658                                });
659                            }
660                            let code =
661                                u32::from_str_radix(&hex, 16).map_err(|_| Error::LexError {
662                                    message: format!("Invalid unicode escape \\u{{{}}}", hex),
663                                    span: esc_span.clone(),
664                                })?;
665                            s.push(char::from_u32(code).ok_or(Error::LexError {
666                                message: format!("Invalid unicode codepoint U+{:04X}", code),
667                                span: esc_span,
668                            })?);
669                        }
670                        Some(c) => {
671                            return Err(Error::LexError {
672                                message: format!("Unknown escape sequence '\\{}'", c),
673                                span: esc_span,
674                            });
675                        }
676                        None => {
677                            return Err(Error::LexError {
678                                message: "Unterminated escape sequence".to_string(),
679                                span: esc_span,
680                            });
681                        }
682                    }
683                }
684                Some(c) => s.push(c),
685            }
686        }
687        Ok(s)
688    }
689
690    fn lex_number(&mut self) -> Result<TokenKind> {
691        let mut s = String::new();
692        while matches!(self.peek(), Some('0'..='9')) {
693            s.push(self.advance().unwrap());
694        }
695        if self.peek() == Some('.') && matches!(self.peek_at(1), Some('0'..='9')) {
696            s.push(self.advance().unwrap()); // '.'
697            while matches!(self.peek(), Some('0'..='9')) {
698                s.push(self.advance().unwrap());
699            }
700            if matches!(self.peek(), Some('e') | Some('E')) {
701                s.push(self.advance().unwrap());
702                if matches!(self.peek(), Some('+') | Some('-')) {
703                    s.push(self.advance().unwrap());
704                }
705                while matches!(self.peek(), Some('0'..='9')) {
706                    s.push(self.advance().unwrap());
707                }
708            }
709            let span = self.span();
710            let f: f64 = s.parse().map_err(|_| Error::LexError {
711                message: format!("Invalid float literal '{}'", s),
712                span,
713            })?;
714            Ok(TokenKind::FloatLit(f))
715        } else {
716            let span = self.span();
717            let i: i64 = s.parse().map_err(|_| Error::LexError {
718                message: format!("Invalid integer literal '{}'", s),
719                span,
720            })?;
721            Ok(TokenKind::IntLit(i))
722        }
723    }
724}
725
726fn mk(kind: TokenKind, span: Span) -> Token {
727    Token { kind, span }
728}
729
730fn keyword_or_ident(s: String) -> TokenKind {
731    match s.as_str() {
732        "if" => TokenKind::KwIf,
733        "else" => TokenKind::KwElse,
734        "each" => TokenKind::KwEach,
735        "as" => TokenKind::KwAs,
736        "snippet" => TokenKind::KwSnippet,
737        "raw" => TokenKind::KwRaw,
738        "render" => TokenKind::KwRender,
739        "const" => TokenKind::KwConst,
740        "include" => TokenKind::KwInclude,
741        "debug" => TokenKind::KwDebug,
742        "is" => TokenKind::KwIs,
743        "not" => TokenKind::KwNot,
744        "in" => TokenKind::KwIn,
745        "true" => TokenKind::True,
746        "false" => TokenKind::False,
747        "null" => TokenKind::Null,
748        _ => TokenKind::Ident(s),
749    }
750}
751
752/// Tokenise a template source string and return the flat token stream.
753pub fn tokenize(src: &str) -> Result<Vec<Token>> {
754    Lexer::new(src).tokenize()
755}