// panache_parser/parser/inlines/inline_html.rs

//! Inline raw HTML recognizer per CommonMark §6.6 / Pandoc `raw_html`.
//!
//! Matches a single HTML tag (open/close), comment, processing instruction,
//! declaration, or CDATA section starting at byte 0 of `text`. Returns the
//! length in bytes of the matched span, or `None` if the prefix doesn't
//! parse.
//!
//! The recognizer is intentionally byte-level and conservative: when a span
//! looks plausible but doesn't fully close (e.g. unterminated comment or
//! quoted attribute), it returns `None` so the dispatcher falls back to
//! emitting plain text.
//!
//! Backslash escapes and entity references inside the span are *not*
//! decoded — callers are expected to emit the bytes verbatim into the CST,
//! and the renderer must skip the standard text-token escaping for
//! `INLINE_HTML` nodes.
17
18use crate::syntax::SyntaxKind;
19use rowan::GreenNodeBuilder;
20
21/// Try to match an inline raw HTML span starting at `text[0]`.
22/// Returns the length in bytes consumed, or `None` if no match.
23pub fn try_parse_inline_html(text: &str) -> Option<usize> {
24    if !text.starts_with('<') {
25        return None;
26    }
27    parse_html_comment(text)
28        .or_else(|| parse_cdata(text))
29        .or_else(|| parse_declaration(text))
30        .or_else(|| parse_processing_instruction(text))
31        .or_else(|| parse_close_tag(text))
32        .or_else(|| parse_open_tag(text))
33}
34
35/// Emit a single `INLINE_HTML` node holding the verbatim span.
36pub fn emit_inline_html(builder: &mut GreenNodeBuilder, raw: &str) {
37    builder.start_node(SyntaxKind::INLINE_HTML.into());
38    builder.token(SyntaxKind::INLINE_HTML_CONTENT.into(), raw);
39    builder.finish_node();
40}
41
/// Match an HTML comment at the start of `text`.
///
/// Accepts the degenerate forms `<!-->` and `<!--->`, and otherwise `<!--`
/// followed by anything up to and including the first `-->`.
///
/// Returns the byte length of the comment, or `None` if `text` does not start
/// with `<!--` or the comment is never closed.
fn parse_html_comment(text: &str) -> Option<usize> {
    const OPEN: &str = "<!--";
    const CLOSE: &str = "-->";

    let body = text.strip_prefix(OPEN)?;

    // Special degenerate forms: `<!-->` and `<!--->`.
    if body.starts_with('>') {
        return Some(OPEN.len() + 1);
    }
    if body.starts_with("->") {
        return Some(OPEN.len() + 2);
    }

    body.find(CLOSE)
        .map(|close_at| OPEN.len() + close_at + CLOSE.len())
}
57
/// Match a processing instruction (`<?` … `?>`) at the start of `text`.
///
/// Returns the byte length up to and including the first `?>` found after the
/// opening `<?`, or `None` if the prefix is missing or no terminator exists.
fn parse_processing_instruction(text: &str) -> Option<usize> {
    const OPEN: &str = "<?";
    const CLOSE: &str = "?>";

    let body = text.strip_prefix(OPEN)?;
    body.find(CLOSE)
        .map(|close_at| OPEN.len() + close_at + CLOSE.len())
}
66
/// Match a CDATA section (`<![CDATA[` … `]]>`) at the start of `text`.
///
/// Returns the byte length up to and including the first `]]>` after the
/// prefix, or `None` if the prefix is missing or the section is unterminated.
fn parse_cdata(text: &str) -> Option<usize> {
    const PREFIX: &str = "<![CDATA[";
    const SUFFIX: &str = "]]>";

    let body = text.strip_prefix(PREFIX)?;
    body.find(SUFFIX)
        .map(|close_at| PREFIX.len() + close_at + SUFFIX.len())
}
76
/// Match a declaration at the start of `text`: `<!`, one ASCII letter, then
/// any bytes up to and including the first `>`.
///
/// Returns the byte length of the declaration, or `None` if the prefix is
/// missing, the byte after `<!` is not an ASCII letter, or no `>` follows.
fn parse_declaration(text: &str) -> Option<usize> {
    const PREFIX_LEN: usize = 2; // "<!"

    let rest = text.strip_prefix("<!")?.as_bytes();
    if !rest.first()?.is_ascii_alphabetic() {
        return None;
    }

    // Scan everything after the leading letter for the closing `>`.
    rest[1..]
        .iter()
        .position(|&b| b == b'>')
        .map(|offset| PREFIX_LEN + 1 + offset + 1)
}
94
95pub(crate) fn parse_close_tag(text: &str) -> Option<usize> {
96    let bytes = text.as_bytes();
97    if !text.starts_with("</") {
98        return None;
99    }
100    let mut i = 2;
101    if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
102        return None;
103    }
104    i += 1;
105    while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
106        i += 1;
107    }
108    i = skip_ws_with_optional_lf(bytes, i);
109    if bytes.get(i) == Some(&b'>') {
110        Some(i + 1)
111    } else {
112        None
113    }
114}
115
116pub(crate) fn parse_open_tag(text: &str) -> Option<usize> {
117    let bytes = text.as_bytes();
118    if !text.starts_with('<') {
119        return None;
120    }
121    let mut i = 1;
122    if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
123        return None;
124    }
125    i += 1;
126    while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
127        i += 1;
128    }
129    while let Some(after) = parse_attribute(bytes, i) {
130        i = after;
131    }
132    i = skip_ws_with_optional_lf(bytes, i);
133    if bytes.get(i) == Some(&b'/') {
134        i += 1;
135    }
136    if bytes.get(i) == Some(&b'>') {
137        Some(i + 1)
138    } else {
139        None
140    }
141}
142
143fn parse_attribute(bytes: &[u8], start: usize) -> Option<usize> {
144    let after_ws = skip_ws_required_with_optional_lf(bytes, start)?;
145    let mut i = after_ws;
146    let first = *bytes.get(i)?;
147    if !is_attr_name_start(first) {
148        return None;
149    }
150    i += 1;
151    while i < bytes.len() && is_attr_name_cont(bytes[i]) {
152        i += 1;
153    }
154    if let Some(after_value) = parse_attr_value_spec(bytes, i) {
155        i = after_value;
156    }
157    Some(i)
158}
159
160fn parse_attr_value_spec(bytes: &[u8], start: usize) -> Option<usize> {
161    let i_after_ws1 = skip_ws_with_optional_lf(bytes, start);
162    if bytes.get(i_after_ws1) != Some(&b'=') {
163        return None;
164    }
165    let mut i = i_after_ws1 + 1;
166    i = skip_ws_with_optional_lf(bytes, i);
167    parse_attr_value(bytes, i)
168}
169
/// Match an attribute value starting at `bytes[start]`.
///
/// * Quoted (`"…"` or `'…'`): everything up to the next occurrence of the
///   same quote byte; an unterminated quote yields `None`.
/// * Unquoted: a non-empty run of bytes that are none of space, tab, `\n`,
///   `\r`, `"`, `'`, `=`, `<`, `>`, or a backtick.
///
/// Returns the index just past the value, or `None` if `start` is out of
/// bounds or no value matches.
fn parse_attr_value(bytes: &[u8], start: usize) -> Option<usize> {
    let first = *bytes.get(start)?;

    if first == b'"' || first == b'\'' {
        // Look for the matching closing quote after the opening one.
        let body_start = start + 1;
        let close_offset = bytes[body_start..].iter().position(|&b| b == first)?;
        return Some(body_start + close_offset + 1);
    }

    let unquoted_len = bytes[start..]
        .iter()
        .take_while(|&&b| {
            !matches!(
                b,
                b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b'=' | b'<' | b'>' | b'`'
            )
        })
        .count();
    if unquoted_len == 0 {
        None
    } else {
        Some(start + unquoted_len)
    }
}
199
/// Whether `b` may begin an attribute name: an ASCII letter, `_`, or `:`.
fn is_attr_name_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':')
}
203
/// Whether `b` may continue an attribute name: an ASCII letter or digit, `_`,
/// `.`, `:`, or `-`.
fn is_attr_name_cont(b: u8) -> bool {
    matches!(
        b,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'.' | b':' | b'-'
    )
}
207
/// Skip "spaces, tabs, and up to one line ending". Returns the new index.
///
/// A line ending is `\n`, `\r`, or `\r\n` (the pair counts once). Scanning
/// stops before a second line ending or any other byte. Always succeeds: the
/// result is at least `start`, and equals `start` when nothing is skipped
/// (including when `start` is past the end of `bytes`).
fn skip_ws_with_optional_lf(bytes: &[u8], start: usize) -> usize {
    let mut pos = start;
    let mut line_ending_seen = false;

    while let Some(&b) = bytes.get(pos) {
        match b {
            b' ' | b'\t' => pos += 1,
            b'\n' | b'\r' => {
                // Only one line ending is allowed within the run.
                if line_ending_seen {
                    break;
                }
                line_ending_seen = true;
                pos += 1;
                // Treat CRLF as a single line ending.
                if b == b'\r' && bytes.get(pos) == Some(&b'\n') {
                    pos += 1;
                }
            }
            _ => break,
        }
    }
    pos
}
238
239/// Like `skip_ws_with_optional_lf`, but requires consuming at least one
240/// whitespace character (or one line ending).
241fn skip_ws_required_with_optional_lf(bytes: &[u8], start: usize) -> Option<usize> {
242    let after = skip_ws_with_optional_lf(bytes, start);
243    if after == start { None } else { Some(after) }
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `input` is recognized and consumes exactly `expected_len` bytes.
    fn matches(input: &str, expected_len: usize) {
        assert_eq!(
            try_parse_inline_html(input),
            Some(expected_len),
            "expected {input:?} to match {expected_len}",
        );
    }

    /// Assert that `input` is not recognized as inline HTML.
    fn no_match(input: &str) {
        assert_eq!(
            try_parse_inline_html(input),
            None,
            "expected no match for {input:?}"
        );
    }

    #[test]
    fn simple_open_tag() {
        matches("<a>", 3);
        matches("<bab>", 5);
        matches("<c2c>", 5);
    }

    #[test]
    fn empty_element() {
        matches("<a/>", 4);
        matches("<b2/>", 5);
        matches("<a  />", 6);
    }

    #[test]
    fn open_tag_with_attrs() {
        matches(r#"<a href="x">"#, r#"<a href="x">"#.len());
        matches(
            r#"<a foo="bar" baz='qux'>"#,
            r#"<a foo="bar" baz='qux'>"#.len(),
        );
        matches(r#"<a foo=bar>"#, r#"<a foo=bar>"#.len());
    }

    #[test]
    fn open_tag_attr_value_spans_lines() {
        matches("<a href=\"foo\nbar\">", "<a href=\"foo\nbar\">".len());
    }

    #[test]
    fn close_tag() {
        matches("</a>", 4);
        matches("</foo >", 7);
    }

    #[test]
    fn comment_forms() {
        matches("<!-->", 5);
        matches("<!--->", 6);
        matches("<!---->", 7);
        matches("<!-- hi -->", 11);
        matches("<!-- a\nb -->", 12);
    }

    #[test]
    fn processing_instruction() {
        matches("<?php $x; ?>", 12);
    }

    #[test]
    fn cdata() {
        matches("<![CDATA[a]]>", 13);
    }

    #[test]
    fn declaration() {
        matches("<!ELEMENT br EMPTY>", 19);
    }

    #[test]
    fn rejects_illegal() {
        no_match("<33>");
        no_match("<__>");
        no_match("<a h*#ref=\"hi\">");
        no_match(r#"<a href="hi'>"#);
        no_match("< a>");
        no_match("<bar/ >");
        no_match("<a href='bar'title=title>");
        no_match("<");
        no_match("<a");
        no_match("<!--");
        no_match("<![CDATA[abc");
    }

    #[test]
    fn rejects_unclosed_quoted_value() {
        no_match("<a href=\"foo");
    }

    #[test]
    fn ignores_non_lt_prefix() {
        no_match("foo");
        no_match("a<b>");
    }
}
350}