Skip to main content

forge_codegen/
parser.rs

//! Parser for Forge `.forge.html` files. Produces a flat token stream that the
//! lowering pass converts to Askama syntax.
//!
//! We use a simple linear scanner rather than `chumsky` here because Forge's
//! grammar is mostly token-by-token rewrites — a recursive parser would be
//! overkill and harder to debug for the POC.
7
8use std::fmt;
9
/// One lexical unit of a Forge template, as produced by [`tokenize`].
///
/// The stream is flat: component open/close tags and block directives are not
/// nested or paired here. The [`fmt::Display`] impl renders each token back in
/// Forge surface syntax.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Literal HTML/text content, copied verbatim from the input.
    Text(String),

    /// `{{ expr }}` — escaped output. Holds the expression with surrounding
    /// whitespace trimmed.
    EscapedExpr(String),

    /// `{!! expr !!}` — unescaped output. Holds the expression with
    /// surrounding whitespace trimmed.
    RawExpr(String),

    /// `@{{ expr }}` — Blade-style escape: output the literal text
    /// `{{ expr }}` without interpolation. Used to embed mustache syntax
    /// in client-side templates (Vue, Alpine, Handlebars) that share Blade's
    /// brace conventions. Holds the trimmed expression text.
    LiteralExpr(String),

    /// `@@name` or `@@name(args)` — Blade-style directive escape: output the
    /// literal `@name(args)` text without executing the directive.
    ///
    /// `args` is the raw text between the outermost parentheses (not
    /// trimmed), or `None` when no balanced argument list follows the name.
    LiteralDirective { name: String, args: Option<String> },

    /// `@directive(args)` — Blade-style directive.
    ///
    /// `args` is the raw text between the outermost parentheses (not
    /// trimmed), or `None` when no balanced argument list follows the name.
    Directive { name: String, args: Option<String> },

    /// `<x-name attr="val">` — component open.
    ComponentOpen {
        /// Component name: the text after `<x-` up to whitespace, `/` or `>`.
        name: String,
        /// Attribute `(name, value)` pairs in source order; an attribute
        /// written without `=` has an empty value.
        attrs: Vec<(String, String)>,
        /// `true` when the tag ends in `/>`.
        self_closing: bool,
    },

    /// `</x-name>` — component close. `name` is the trimmed text after `</x-`.
    ComponentClose { name: String },
}
44
45impl fmt::Display for Token {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        match self {
48            Token::Text(s) => write!(f, "{s}"),
49            Token::EscapedExpr(e) => write!(f, "{{{{ {e} }}}}"),
50            Token::RawExpr(e) => write!(f, "{{!! {e} !!}}"),
51            Token::LiteralExpr(e) => write!(f, "@{{{{ {e} }}}}"),
52            Token::LiteralDirective {
53                name,
54                args: Some(a),
55            } => write!(f, "@@{name}({a})"),
56            Token::LiteralDirective { name, args: None } => write!(f, "@@{name}"),
57            Token::Directive {
58                name,
59                args: Some(a),
60            } => write!(f, "@{name}({a})"),
61            Token::Directive { name, args: None } => write!(f, "@{name}"),
62            Token::ComponentOpen {
63                name,
64                attrs,
65                self_closing,
66            } => {
67                write!(f, "<x-{name}")?;
68                for (k, v) in attrs {
69                    write!(f, " {k}=\"{v}\"")?;
70                }
71                if *self_closing {
72                    write!(f, " />")
73                } else {
74                    write!(f, ">")
75                }
76            }
77            Token::ComponentClose { name } => write!(f, "</x-{name}>"),
78        }
79    }
80}
81
82pub fn tokenize(input: &str) -> Vec<Token> {
83    let mut tokens = Vec::new();
84    let bytes = input.as_bytes();
85    let mut i = 0;
86    let mut text_start = 0;
87
88    while i < bytes.len() {
89        // @{{ ... }} — Blade escape, output `{{ ... }}` literally.
90        // Must come before the regular `{{ ... }}` rule so the `@` consumes
91        // the brace pair instead of letting it kick into interpolation.
92        if i + 2 < bytes.len() && bytes[i] == b'@' && bytes[i + 1] == b'{' && bytes[i + 2] == b'{' {
93            flush_text(input, text_start, i, &mut tokens);
94            if let Some(end) = find_close(&input[i + 3..], "}}") {
95                let expr = input[i + 3..i + 3 + end].trim().to_string();
96                tokens.push(Token::LiteralExpr(expr));
97                i += 3 + end + 2;
98                text_start = i;
99                continue;
100            }
101        }
102
103        // @@directive — Blade escape, output `@directive` literally.
104        if i + 1 < bytes.len()
105            && bytes[i] == b'@'
106            && bytes[i + 1] == b'@'
107            && i + 2 < bytes.len()
108            && (bytes[i + 2].is_ascii_alphabetic() || bytes[i + 2] == b'_')
109        {
110            flush_text(input, text_start, i, &mut tokens);
111            let dir_start = i + 2;
112            let mut dir_end = dir_start;
113            while dir_end < bytes.len()
114                && (bytes[dir_end].is_ascii_alphanumeric() || bytes[dir_end] == b'_')
115            {
116                dir_end += 1;
117            }
118            let name = input[dir_start..dir_end].to_string();
119            let mut args = None;
120            let mut new_i = dir_end;
121            if dir_end < bytes.len() && bytes[dir_end] == b'(' {
122                if let Some(close_offset) = find_matching_paren(&input[dir_end..]) {
123                    args = Some(input[dir_end + 1..dir_end + close_offset].to_string());
124                    new_i = dir_end + close_offset + 1;
125                }
126            }
127            tokens.push(Token::LiteralDirective { name, args });
128            i = new_i;
129            text_start = i;
130            continue;
131        }
132
133        // {{ ... }}
134        if i + 1 < bytes.len()
135            && bytes[i] == b'{'
136            && bytes[i + 1] == b'{'
137            && !(i + 2 < bytes.len() && bytes[i + 2] == b'-'/* askama escape */)
138        {
139            flush_text(input, text_start, i, &mut tokens);
140            if let Some(end) = find_close(&input[i + 2..], "}}") {
141                let expr = input[i + 2..i + 2 + end].trim().to_string();
142                tokens.push(Token::EscapedExpr(expr));
143                i += 2 + end + 2;
144                text_start = i;
145                continue;
146            }
147        }
148
149        // {!! ... !!}
150        if i + 2 < bytes.len() && bytes[i] == b'{' && bytes[i + 1] == b'!' && bytes[i + 2] == b'!' {
151            flush_text(input, text_start, i, &mut tokens);
152            if let Some(end) = find_close(&input[i + 3..], "!!}") {
153                let expr = input[i + 3..i + 3 + end].trim().to_string();
154                tokens.push(Token::RawExpr(expr));
155                i += 3 + end + 3;
156                text_start = i;
157                continue;
158            }
159        }
160
161        // @directive
162        if bytes[i] == b'@'
163            && i + 1 < bytes.len()
164            && (bytes[i + 1].is_ascii_alphabetic() || bytes[i + 1] == b'_')
165        {
166            flush_text(input, text_start, i, &mut tokens);
167            let dir_start = i + 1;
168            let mut dir_end = dir_start;
169            while dir_end < bytes.len()
170                && (bytes[dir_end].is_ascii_alphanumeric() || bytes[dir_end] == b'_')
171            {
172                dir_end += 1;
173            }
174            let name = input[dir_start..dir_end].to_string();
175            let mut args = None;
176            let mut new_i = dir_end;
177            if dir_end < bytes.len() && bytes[dir_end] == b'(' {
178                if let Some(close_offset) = find_matching_paren(&input[dir_end..]) {
179                    args = Some(input[dir_end + 1..dir_end + close_offset].to_string());
180                    new_i = dir_end + close_offset + 1;
181                }
182            }
183            tokens.push(Token::Directive { name, args });
184            i = new_i;
185            text_start = i;
186            continue;
187        }
188
189        // <x-component ...>
190        if bytes[i] == b'<' && i + 2 < bytes.len() && bytes[i + 1] == b'x' && bytes[i + 2] == b'-' {
191            flush_text(input, text_start, i, &mut tokens);
192            let after = &input[i + 3..];
193            // Component name
194            let name_end = after
195                .find(|c: char| c.is_whitespace() || c == '>' || c == '/')
196                .unwrap_or(after.len());
197            let name = after[..name_end].to_string();
198            let rest_start = i + 3 + name_end;
199            let close_offset = input[rest_start..]
200                .find('>')
201                .unwrap_or(input.len() - rest_start);
202            let tag_inner = &input[rest_start..rest_start + close_offset];
203            let self_closing = tag_inner.ends_with('/');
204            let attrs = parse_attrs(tag_inner.trim_end_matches('/'));
205            tokens.push(Token::ComponentOpen {
206                name,
207                attrs,
208                self_closing,
209            });
210            i = rest_start + close_offset + 1;
211            text_start = i;
212            continue;
213        }
214
215        // </x-component>
216        if bytes[i] == b'<'
217            && i + 3 < bytes.len()
218            && bytes[i + 1] == b'/'
219            && bytes[i + 2] == b'x'
220            && bytes[i + 3] == b'-'
221        {
222            flush_text(input, text_start, i, &mut tokens);
223            let after = &input[i + 4..];
224            let name_end = after.find('>').unwrap_or(after.len());
225            let name = after[..name_end].trim().to_string();
226            tokens.push(Token::ComponentClose { name });
227            i += 4 + name_end + 1;
228            text_start = i;
229            continue;
230        }
231
232        i += 1;
233    }
234
235    flush_text(input, text_start, bytes.len(), &mut tokens);
236    tokens
237}
238
239fn flush_text(input: &str, start: usize, end: usize, tokens: &mut Vec<Token>) {
240    if end > start {
241        tokens.push(Token::Text(input[start..end].to_string()));
242    }
243}
244
/// Returns the byte offset of the first occurrence of `needle` in `s`, or
/// `None` when `needle` does not occur.
fn find_close(s: &str, needle: &str) -> Option<usize> {
    s.match_indices(needle).next().map(|(offset, _)| offset)
}
248
/// Finds the `)` that balances the `(` at the start of `s`.
///
/// Returns the byte offset of that `)` within `s`, or `None` when `s` does
/// not start with `(` or the parenthesis is never closed.
///
/// Parentheses inside single- or double-quoted strings are ignored. Inside a
/// string a backslash escapes the next byte, so `\"` does not end the string
/// while `\\"` (an escaped backslash followed by a quote) does.
fn find_matching_paren(s: &str) -> Option<usize> {
    let bytes = s.as_bytes();
    if bytes.first() != Some(&b'(') {
        return None;
    }
    let mut depth = 1usize;
    let mut in_string: Option<u8> = None;
    // True when the previous byte was an unescaped backslash inside a string.
    // Tracking this (rather than peeking at the previous byte) is what lets
    // `"\\"` terminate correctly.
    let mut escaped = false;
    for (i, &b) in bytes.iter().enumerate().skip(1) {
        if let Some(quote) = in_string {
            if escaped {
                escaped = false;
            } else if b == b'\\' {
                escaped = true;
            } else if b == quote {
                in_string = None;
            }
            continue;
        }
        match b {
            b'"' | b'\'' => in_string = Some(b),
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(i);
                }
            }
            _ => {}
        }
    }
    None
}
278
/// Parses the inside of a component tag into `(name, value)` pairs.
///
/// Attributes are separated by whitespace. A name runs up to the first `=`
/// or whitespace. A name not directly followed by `=` gets an empty value.
/// A value that starts with `"` or `'` runs to the matching quote (or to the
/// end of `s` if it is never closed); any other value runs to the next
/// whitespace. Pairs are returned in source order.
fn parse_attrs(s: &str) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    let mut it = s.chars().peekable();

    loop {
        // Drop whitespace between attributes; stop when nothing is left.
        while it.next_if(|c| c.is_whitespace()).is_some() {}
        if it.peek().is_none() {
            break;
        }

        // Attribute name: everything up to `=` or whitespace.
        let mut key = String::new();
        while let Some(c) = it.next_if(|c| *c != '=' && !c.is_whitespace()) {
            key.push(c);
        }

        // No `=` right after the name means a bare (valueless) attribute.
        if it.next_if_eq(&'=').is_none() {
            pairs.push((key, String::new()));
            continue;
        }

        let mut value = String::new();
        match it.next_if(|c| matches!(*c, '"' | '\'')) {
            // Quoted value: consume through the closing quote.
            Some(quote) => {
                for c in it.by_ref() {
                    if c == quote {
                        break;
                    }
                    value.push(c);
                }
            }
            // Unquoted value: read up to (not including) the next whitespace.
            None => {
                while let Some(c) = it.next_if(|c| !c.is_whitespace()) {
                    value.push(c);
                }
            }
        }
        pairs.push((key, value));
    }

    pairs
}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// True for `Text` tokens that contain nothing but whitespace.
    fn is_blank_text(tok: &Token) -> bool {
        matches!(tok, Token::Text(s) if s.trim().is_empty())
    }

    /// Returns the first token that is not whitespace-only text.
    fn first_non_text(tokens: &[Token]) -> &Token {
        for tok in tokens {
            if !is_blank_text(tok) {
                return tok;
            }
        }
        panic!("no non-empty token")
    }

    #[test]
    fn at_double_brace_emits_literal_expr_not_escaped() {
        let toks = tokenize("@{{ handle }}");
        let is_literal = matches!(first_non_text(&toks), Token::LiteralExpr(s) if s == "handle");
        assert!(is_literal, "got: {toks:?}");
    }

    #[test]
    fn double_at_directive_emits_literal_directive() {
        let toks = tokenize("@@if(user)");
        let ok = matches!(
            first_non_text(&toks),
            Token::LiteralDirective { name, args }
                if name == "if" && args.as_deref() == Some("user")
        );
        assert!(ok, "got: {toks:?}");
    }

    #[test]
    fn double_at_directive_without_args() {
        let toks = tokenize("@@verbatim");
        let ok = matches!(
            first_non_text(&toks),
            Token::LiteralDirective { name, args }
                if name == "verbatim" && args.is_none()
        );
        assert!(ok, "got: {toks:?}");
    }

    #[test]
    fn at_double_brace_does_not_consume_following_real_interpolation() {
        // The user's reported case: `@{{ '' }}{{ handle }}` should produce
        // a literal `{{ '' }}` followed by interpolation of `handle`.
        let toks = tokenize("@{{ '' }}{{ handle }}");
        let mut significant = toks.iter().filter(|t| !is_blank_text(t));
        let first = significant.next().expect("first");
        let second = significant.next().expect("second");

        let first_ok = matches!(first, Token::LiteralExpr(s) if s == "''");
        assert!(first_ok, "first: {first:?}");

        let second_ok = matches!(second, Token::EscapedExpr(s) if s == "handle");
        assert!(second_ok, "second: {second:?}");
    }

    #[test]
    fn regular_directive_still_works() {
        // Sanity: `@if(x)` shouldn't be mis-categorized by the @@ rule.
        let toks = tokenize("@if(x)");
        let ok = matches!(
            first_non_text(&toks),
            Token::Directive { name, args }
                if name == "if" && args.as_deref() == Some("x")
        );
        assert!(ok, "got: {toks:?}");
    }

    #[test]
    fn regular_interpolation_still_works() {
        // Sanity: `{{ name }}` (no leading @) is still escaped output.
        let toks = tokenize("{{ name }}");
        let ok = matches!(first_non_text(&toks), Token::EscapedExpr(s) if s == "name");
        assert!(ok, "got: {toks:?}");
    }
}