bubbles/compiler/markup.rs
1//! Scanner for inline markup (`[name]…[/name]`, `[name /]`) combined with
2//! `{expr}` interpolation. Both are parsed in one left-to-right pass.
3
/// A token produced by scanning text that may contain `{expr}` and `[markup]` syntax.
///
/// Every variant only holds string slices with lifetime `'a`, so cloning a
/// token copies the slices (and the property vector) but never the text itself.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextToken<'a> {
    /// A literal run of text with no substitution or markup.
    Literal(&'a str),
    /// The source text between `{` and `}` (delimiters excluded).
    Expr(&'a str),
    /// An opening markup tag: `[name]` or `[name key=val …]`.
    MarkupOpen {
        /// Tag name, e.g. `wave` in `[wave]`.
        name: &'a str,
        /// Zero or more `key=value` pairs, stored as `(key, value)` tuples.
        properties: Vec<(&'a str, &'a str)>,
    },
    /// A closing markup tag: `[/name]`.
    MarkupClose {
        /// Tag name, e.g. `wave` in `[/wave]`.
        name: &'a str,
    },
    /// A self-closing markup tag: `[name /]` or `[name key=val … /]`.
    MarkupSelfClose {
        /// Tag name, e.g. `pause` in `[pause /]`.
        name: &'a str,
        /// Zero or more `key=value` pairs, stored as `(key, value)` tuples.
        properties: Vec<(&'a str, &'a str)>,
    },
}
31
/// Errors returned by [`scan_text_segments`].
///
/// Each variant carries the byte offset (into the scanned text) of the
/// opening delimiter that was never closed. The type is `Copy` because it
/// only wraps a `usize`, and it implements [`std::error::Error`] so it can be
/// propagated with `?` into boxed or wrapped error types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MarkupScanError {
    /// An unclosed `{` at the given byte offset.
    UnclosedBrace(usize),
    /// An unclosed `[` at the given byte offset.
    UnclosedBracket(usize),
}

impl std::fmt::Display for MarkupScanError {
    /// Formats a short human-readable description including the byte offset.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnclosedBrace(offset) => write!(f, "unclosed `{{` at byte offset {offset}"),
            Self::UnclosedBracket(offset) => write!(f, "unclosed `[` at byte offset {offset}"),
        }
    }
}

impl std::error::Error for MarkupScanError {}
40
41/// Scans `text` for `{expr}` and `[markup]` syntax, yielding tokens in order.
42///
43/// **Markup rules:**
44/// - `[identifier]` or `[identifier key=val …]` → [`TextToken::MarkupOpen`]
45/// - `[/identifier]` → [`TextToken::MarkupClose`]
46/// - `[identifier /]` or `[identifier key=val … /]` → [`TextToken::MarkupSelfClose`]
47/// - `[…]` whose content does not match any of the above → emitted verbatim
48/// as part of a [`TextToken::Literal`]
49///
50/// An unclosed `{` or `[` (no matching `}` / `]` before end of input) is
51/// always an error regardless of the content inside.
52///
53/// # Errors
54///
55/// Returns [`MarkupScanError::UnclosedBrace`] or [`MarkupScanError::UnclosedBracket`]
56/// with the byte offset of the unmatched delimiter.
57pub fn scan_text_segments(text: &str) -> Result<Vec<TextToken<'_>>, MarkupScanError> {
58 let mut tokens = Vec::new();
59 let bytes = text.as_bytes();
60 let mut i = 0usize;
61 let mut lit_start = 0usize;
62
63 macro_rules! flush_literal {
64 () => {
65 if lit_start < i {
66 tokens.push(TextToken::Literal(&text[lit_start..i]));
67 }
68 };
69 }
70
71 while i < bytes.len() {
72 match bytes[i] {
73 b'{' => {
74 let brace_start = i;
75 let rest = &text[i + 1..];
76 let close = rest
77 .find('}')
78 .ok_or(MarkupScanError::UnclosedBrace(brace_start))?;
79 flush_literal!();
80 tokens.push(TextToken::Expr(&rest[..close]));
81 i = i + 1 + close + 1;
82 lit_start = i;
83 }
84 b'[' => {
85 let bracket_start = i;
86 let rest = &text[i + 1..];
87 let close_rel = rest
88 .find(']')
89 .ok_or(MarkupScanError::UnclosedBracket(bracket_start))?;
90 let inner = &rest[..close_rel];
91 if let Some(tok) = try_parse_markup(inner) {
92 flush_literal!();
93 tokens.push(tok);
94 i = i + 1 + close_rel + 1;
95 lit_start = i;
96 } else {
97 // Not markup – include the `[` in the current literal run
98 // and let the scanner continue character-by-character.
99 i += 1;
100 }
101 }
102 _ => {
103 i += 1;
104 }
105 }
106 }
107
108 if lit_start < text.len() {
109 tokens.push(TextToken::Literal(&text[lit_start..]));
110 }
111
112 Ok(tokens)
113}
114
/// Returns `true` if `s` is a valid markup identifier (`[a-zA-Z_][a-zA-Z0-9_-]*`).
///
/// The empty string is not an identifier. Only ASCII is accepted: every byte
/// of a multi-byte UTF-8 character fails the ASCII checks below, so working
/// on bytes gives the same answer as working on chars.
fn is_identifier(s: &str) -> bool {
    match s.as_bytes().split_first() {
        None => false,
        Some((&head, tail)) => {
            let head_ok = head == b'_' || head.is_ascii_alphabetic();
            head_ok
                && tail
                    .iter()
                    .all(|&b| matches!(b, b'_' | b'-') || b.is_ascii_alphanumeric())
        }
    }
}
123
124/// Parses zero or more `key=value` pairs separated by whitespace.
125///
126/// Returns `None` if any pair is malformed (missing `=` or non-identifier key).
127fn parse_properties(s: &str) -> Option<Vec<(&str, &str)>> {
128 if s.is_empty() {
129 return Some(Vec::new());
130 }
131 let mut props = Vec::new();
132 for part in s.split_whitespace() {
133 let eq = part.find('=')?;
134 let key = &part[..eq];
135 let val = &part[eq + 1..];
136 if !is_identifier(key) {
137 return None;
138 }
139 props.push((key, val));
140 }
141 Some(props)
142}
143
144/// Attempts to parse the content between `[` and `]` as a markup token.
145///
146/// Returns `None` if the content does not match the markup grammar, in which
147/// case the caller should treat the entire `[…]` as literal text.
148fn try_parse_markup(inner: &str) -> Option<TextToken<'_>> {
149 // Close tag: `/identifier`
150 if let Some(name_part) = inner.strip_prefix('/') {
151 let name = name_part.trim_start();
152 if is_identifier(name) && name.len() == name_part.len() {
153 return Some(TextToken::MarkupClose { name });
154 }
155 return None;
156 }
157
158 // Self-closing: content ends with ` /`
159 let (content, self_close) = inner
160 .strip_suffix(" /")
161 .map_or((inner, false), |rest| (rest, true));
162
163 // Split into name and optional property string on the first space
164 let (name, props_src) = content
165 .find(' ')
166 .map_or((content, ""), |sp| (&content[..sp], &content[sp + 1..]));
167
168 if !is_identifier(name) {
169 return None;
170 }
171
172 let properties = parse_properties(props_src)?;
173
174 if self_close {
175 Some(TextToken::MarkupSelfClose { name, properties })
176 } else {
177 Some(TextToken::MarkupOpen { name, properties })
178 }
179}
180
181#[cfg(test)]
182#[path = "markup_tests.rs"]
183mod tests;