Skip to main content

bubbles/compiler/
markup.rs

1//! Scanner for inline markup (`[name]…[/name]`, `[name /]`) combined with
2//! `{expr}` interpolation. Both are parsed in one left-to-right pass.
3
/// A token produced by scanning text that may contain `{expr}` and `[markup]` syntax.
///
/// Every variant borrows its string data from the scanned input (`'a`), so a
/// clone copies slice references only (plus the property `Vec` on tag variants).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextToken<'a> {
    /// A literal run of text with no substitution or markup.
    Literal(&'a str),
    /// The source text between `{` and `}` (the braces themselves are excluded).
    Expr(&'a str),
    /// An opening markup tag: `[name]` or `[name key=val …]`.
    MarkupOpen {
        /// Tag name, e.g. `wave` in `[wave]`.
        name: &'a str,
        /// Zero or more `key=value` pairs, in source order.
        properties: Vec<(&'a str, &'a str)>,
    },
    /// A closing markup tag: `[/name]`.
    MarkupClose {
        /// Tag name, e.g. `wave` in `[/wave]`.
        name: &'a str,
    },
    /// A self-closing markup tag: `[name /]` or `[name key=val … /]`.
    MarkupSelfClose {
        /// Tag name, e.g. `pause` in `[pause /]`.
        name: &'a str,
        /// Zero or more `key=value` pairs, in source order.
        properties: Vec<(&'a str, &'a str)>,
    },
}
31
/// Errors returned by [`scan_text_segments`].
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so the error can
/// be shown to users and propagated with `?` into `Box<dyn Error>`-style
/// results. The payload of each variant is a byte offset into the scanned text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MarkupScanError {
    /// An unclosed `{` at the given byte offset.
    UnclosedBrace(usize),
    /// An unclosed `[` at the given byte offset.
    UnclosedBracket(usize),
}

impl std::fmt::Display for MarkupScanError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // `{{` is the format-string escape for a literal `{`.
            Self::UnclosedBrace(offset) => write!(f, "unclosed `{{` at byte offset {offset}"),
            Self::UnclosedBracket(offset) => write!(f, "unclosed `[` at byte offset {offset}"),
        }
    }
}

impl std::error::Error for MarkupScanError {}
40
41/// Scans `text` for `{expr}` and `[markup]` syntax, yielding tokens in order.
42///
43/// **Markup rules:**
44/// - `[identifier]` or `[identifier key=val …]` → [`TextToken::MarkupOpen`]
45/// - `[/identifier]` → [`TextToken::MarkupClose`]
46/// - `[identifier /]` or `[identifier key=val … /]` → [`TextToken::MarkupSelfClose`]
47/// - `[…]` whose content does not match any of the above → emitted verbatim
48///   as part of a [`TextToken::Literal`]
49///
50/// An unclosed `{` or `[` (no matching `}` / `]` before end of input) is
51/// always an error regardless of the content inside.
52///
53/// # Errors
54///
55/// Returns [`MarkupScanError::UnclosedBrace`] or [`MarkupScanError::UnclosedBracket`]
56/// with the byte offset of the unmatched delimiter.
57pub fn scan_text_segments(text: &str) -> Result<Vec<TextToken<'_>>, MarkupScanError> {
58    let mut tokens = Vec::new();
59    let bytes = text.as_bytes();
60    let mut i = 0usize;
61    let mut lit_start = 0usize;
62
63    macro_rules! flush_literal {
64        () => {
65            if lit_start < i {
66                tokens.push(TextToken::Literal(&text[lit_start..i]));
67            }
68        };
69    }
70
71    while i < bytes.len() {
72        match bytes[i] {
73            b'{' => {
74                let brace_start = i;
75                let rest = &text[i + 1..];
76                let close = rest
77                    .find('}')
78                    .ok_or(MarkupScanError::UnclosedBrace(brace_start))?;
79                flush_literal!();
80                tokens.push(TextToken::Expr(&rest[..close]));
81                i = i + 1 + close + 1;
82                lit_start = i;
83            }
84            b'[' => {
85                let bracket_start = i;
86                let rest = &text[i + 1..];
87                let close_rel = rest
88                    .find(']')
89                    .ok_or(MarkupScanError::UnclosedBracket(bracket_start))?;
90                let inner = &rest[..close_rel];
91                if let Some(tok) = try_parse_markup(inner) {
92                    flush_literal!();
93                    tokens.push(tok);
94                    i = i + 1 + close_rel + 1;
95                    lit_start = i;
96                } else {
97                    // Not markup – include the `[` in the current literal run
98                    // and let the scanner continue character-by-character.
99                    i += 1;
100                }
101            }
102            _ => {
103                i += 1;
104            }
105        }
106    }
107
108    if lit_start < text.len() {
109        tokens.push(TextToken::Literal(&text[lit_start..]));
110    }
111
112    Ok(tokens)
113}
114
/// Reports whether `s` is a valid markup identifier, i.e. matches
/// `[a-zA-Z_][a-zA-Z0-9_-]*`. The empty string is not an identifier.
fn is_identifier(s: &str) -> bool {
    // Only ASCII is accepted, so checking bytes is equivalent to checking
    // chars: every byte of a multi-byte UTF-8 sequence is non-ASCII and fails.
    match s.as_bytes() {
        [] => false,
        [first, rest @ ..] => {
            let head_ok = first.is_ascii_alphabetic() || *first == b'_';
            head_ok
                && rest
                    .iter()
                    .all(|&b| b.is_ascii_alphanumeric() || b == b'_' || b == b'-')
        }
    }
}
123
124/// Parses zero or more `key=value` pairs separated by whitespace.
125///
126/// Returns `None` if any pair is malformed (missing `=` or non-identifier key).
127fn parse_properties(s: &str) -> Option<Vec<(&str, &str)>> {
128    if s.is_empty() {
129        return Some(Vec::new());
130    }
131    let mut props = Vec::new();
132    for part in s.split_whitespace() {
133        let eq = part.find('=')?;
134        let key = &part[..eq];
135        let val = &part[eq + 1..];
136        if !is_identifier(key) {
137            return None;
138        }
139        props.push((key, val));
140    }
141    Some(props)
142}
143
144/// Attempts to parse the content between `[` and `]` as a markup token.
145///
146/// Returns `None` if the content does not match the markup grammar, in which
147/// case the caller should treat the entire `[…]` as literal text.
148fn try_parse_markup(inner: &str) -> Option<TextToken<'_>> {
149    // Close tag: `/identifier`
150    if let Some(name_part) = inner.strip_prefix('/') {
151        let name = name_part.trim_start();
152        if is_identifier(name) && name.len() == name_part.len() {
153            return Some(TextToken::MarkupClose { name });
154        }
155        return None;
156    }
157
158    // Self-closing: content ends with ` /`
159    let (content, self_close) = inner
160        .strip_suffix(" /")
161        .map_or((inner, false), |rest| (rest, true));
162
163    // Split into name and optional property string on the first space
164    let (name, props_src) = content
165        .find(' ')
166        .map_or((content, ""), |sp| (&content[..sp], &content[sp + 1..]));
167
168    if !is_identifier(name) {
169        return None;
170    }
171
172    let properties = parse_properties(props_src)?;
173
174    if self_close {
175        Some(TextToken::MarkupSelfClose { name, properties })
176    } else {
177        Some(TextToken::MarkupOpen { name, properties })
178    }
179}
180
181#[cfg(test)]
182#[path = "markup_tests.rs"]
183mod tests;