Skip to main content

panache_parser/parser/inlines/
inline_html.rs

1//! Inline raw HTML recognizer per CommonMark §6.6 / Pandoc `raw_html`.
2//!
3//! Matches a single HTML tag (open/close), comment, processing instruction,
4//! declaration, or CDATA section starting at byte 0 of `text`. Returns the
5//! length in bytes of the matched span, or `None` if the prefix doesn't
6//! parse.
7//!
8//! The recognizer is intentionally byte-level and conservative: when a span
9//! looks plausible but doesn't fully close (e.g. unterminated comment or
10//! quoted attribute), it returns `None` so the dispatcher falls back to
11//! emitting plain text.
12//!
13//! Backslash escapes and entity references inside the span are *not*
14//! decoded — callers are expected to emit the bytes verbatim into the CST,
15//! and the renderer must skip the standard text-token escaping for
16//! `INLINE_HTML` nodes.
17
18use crate::options::Dialect;
19use crate::syntax::SyntaxKind;
20use rowan::GreenNodeBuilder;
21
22/// Try to match an inline raw HTML span starting at `text[0]`.
23/// Returns the length in bytes consumed, or `None` if no match.
24///
25/// `dialect` controls whether bare HTML declarations (`<!DOCTYPE …>`,
26/// `<!ENTITY …>`) and CDATA sections (`<![CDATA[…]]>`) are recognized
27/// as raw HTML. Pandoc-markdown does not treat these as raw inline
28/// HTML — the bytes fall through to plain text. CommonMark dialect
29/// recognizes them per CommonMark §6.6.
30pub fn try_parse_inline_html(text: &str, dialect: Dialect) -> Option<usize> {
31    if !text.starts_with('<') {
32        return None;
33    }
34    let cdata_decl_allowed = dialect == Dialect::CommonMark;
35    parse_html_comment(text)
36        .or_else(|| {
37            if cdata_decl_allowed {
38                parse_cdata(text)
39            } else {
40                None
41            }
42        })
43        .or_else(|| {
44            if cdata_decl_allowed {
45                parse_declaration(text)
46            } else {
47                None
48            }
49        })
50        .or_else(|| parse_processing_instruction(text))
51        .or_else(|| parse_close_tag(text))
52        .or_else(|| parse_open_tag(text))
53}
54
55/// Emit a single `INLINE_HTML` node holding the verbatim span.
56pub fn emit_inline_html(builder: &mut GreenNodeBuilder, raw: &str) {
57    builder.start_node(SyntaxKind::INLINE_HTML.into());
58    builder.token(SyntaxKind::INLINE_HTML_CONTENT.into(), raw);
59    builder.finish_node();
60}
61
/// Match an HTML comment at the start of `text`.
///
/// Returns the byte length of the comment including both delimiters, or
/// `None` if `text` does not start with `<!--` or the comment is never
/// closed.
fn parse_html_comment(text: &str) -> Option<usize> {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    let body = text.strip_prefix(OPENER)?;

    // The two degenerate comments `<!-->` and `<!--->` are complete even
    // though they contain no full `-->` after the opener.
    if body.starts_with('>') {
        return Some(OPENER.len() + 1);
    }
    if body.starts_with("->") {
        return Some(OPENER.len() + 2);
    }

    // General form: everything up to and including the first `-->`.
    body.find(CLOSER)
        .map(|at| OPENER.len() + at + CLOSER.len())
}
77
/// Match a processing instruction (`<? … ?>`) at the start of `text`.
///
/// Returns the byte length through the first `?>` that follows the
/// opening `<?`, or `None` if either delimiter is missing.
fn parse_processing_instruction(text: &str) -> Option<usize> {
    const OPENER: &str = "<?";
    const CLOSER: &str = "?>";

    let body = text.strip_prefix(OPENER)?;
    let closer_at = body.find(CLOSER)?;
    Some(OPENER.len() + closer_at + CLOSER.len())
}
86
/// Match a CDATA section (`<![CDATA[ … ]]>`) at the start of `text`.
///
/// Returns the byte length through the first `]]>` after the opener, or
/// `None` if the opener is absent or the section is unterminated. The
/// opener is compared case-sensitively.
fn parse_cdata(text: &str) -> Option<usize> {
    const OPENER: &str = "<![CDATA[";
    const CLOSER: &str = "]]>";

    text.strip_prefix(OPENER)
        .and_then(|body| body.find(CLOSER))
        .map(|closer_at| OPENER.len() + closer_at + CLOSER.len())
}
96
/// Match a declaration (`<!` + ASCII letter + … + `>`) at the start of
/// `text`.
///
/// Returns the byte length through the first `>` found after the leading
/// letter, or `None` if the prefix is not `<!` followed by an ASCII
/// letter, or no `>` follows.
fn parse_declaration(text: &str) -> Option<usize> {
    // Bytes consumed before the scan for `>` begins: `<`, `!`, one letter.
    const HEAD_LEN: usize = 3;

    let after_bang = text.strip_prefix("<!")?.as_bytes();
    let (first, tail) = after_bang.split_first()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }

    tail.iter()
        .position(|&b| b == b'>')
        .map(|offset| HEAD_LEN + offset + 1)
}
114
115pub(crate) fn parse_close_tag(text: &str) -> Option<usize> {
116    let bytes = text.as_bytes();
117    if !text.starts_with("</") {
118        return None;
119    }
120    let mut i = 2;
121    if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
122        return None;
123    }
124    i += 1;
125    while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
126        i += 1;
127    }
128    i = skip_ws_with_optional_lf(bytes, i);
129    if bytes.get(i) == Some(&b'>') {
130        Some(i + 1)
131    } else {
132        None
133    }
134}
135
136pub(crate) fn parse_open_tag(text: &str) -> Option<usize> {
137    let bytes = text.as_bytes();
138    if !text.starts_with('<') {
139        return None;
140    }
141    let mut i = 1;
142    if i >= bytes.len() || !bytes[i].is_ascii_alphabetic() {
143        return None;
144    }
145    i += 1;
146    while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
147        i += 1;
148    }
149    while let Some(after) = parse_attribute(bytes, i) {
150        i = after;
151    }
152    i = skip_ws_with_optional_lf(bytes, i);
153    if bytes.get(i) == Some(&b'/') {
154        i += 1;
155    }
156    if bytes.get(i) == Some(&b'>') {
157        Some(i + 1)
158    } else {
159        None
160    }
161}
162
163fn parse_attribute(bytes: &[u8], start: usize) -> Option<usize> {
164    let after_ws = skip_ws_required_with_optional_lf(bytes, start)?;
165    let mut i = after_ws;
166    let first = *bytes.get(i)?;
167    if !is_attr_name_start(first) {
168        return None;
169    }
170    i += 1;
171    while i < bytes.len() && is_attr_name_cont(bytes[i]) {
172        i += 1;
173    }
174    if let Some(after_value) = parse_attr_value_spec(bytes, i) {
175        i = after_value;
176    }
177    Some(i)
178}
179
180fn parse_attr_value_spec(bytes: &[u8], start: usize) -> Option<usize> {
181    let i_after_ws1 = skip_ws_with_optional_lf(bytes, start);
182    if bytes.get(i_after_ws1) != Some(&b'=') {
183        return None;
184    }
185    let mut i = i_after_ws1 + 1;
186    i = skip_ws_with_optional_lf(bytes, i);
187    parse_attr_value(bytes, i)
188}
189
/// Match an attribute value beginning at index `start` of `bytes`.
///
/// Two forms are accepted:
/// * quoted — opens with `"` or `'` and runs to the next occurrence of
///   the same quote byte; every other byte, line endings included, is
///   allowed inside;
/// * unquoted — a non-empty run of bytes that stops before a space, tab,
///   line ending, `"`, `'`, `=`, `<`, `>`, or a backtick.
///
/// Returns the index just past the value. Returns `None` if `start` is
/// at or beyond the end of input, a quoted value is never closed, or an
/// unquoted value would be empty.
fn parse_attr_value(bytes: &[u8], start: usize) -> Option<usize> {
    let rest = bytes.get(start..)?;
    let (&lead, tail) = rest.split_first()?;

    if lead == b'"' || lead == b'\'' {
        // Quoted: find the matching closing quote after the opener.
        let close_offset = tail.iter().position(|&b| b == lead)?;
        return Some(start + 1 + close_offset + 1);
    }

    // Unquoted: take bytes until the first terminator.
    let value_len = rest
        .iter()
        .take_while(|&&b| {
            !matches!(
                b,
                b' ' | b'\t' | b'\n' | b'\r' | b'"' | b'\'' | b'=' | b'<' | b'>' | b'`'
            )
        })
        .count();

    if value_len == 0 {
        None
    } else {
        Some(start + value_len)
    }
}
219
/// Whether `b` may begin an attribute name: an ASCII letter, `_`, or `:`.
fn is_attr_name_start(b: u8) -> bool {
    matches!(b, b'_' | b':') || b.is_ascii_alphabetic()
}
223
/// Whether `b` may continue an attribute name: an ASCII letter or digit,
/// `_`, `.`, `:`, or `-`.
fn is_attr_name_cont(b: u8) -> bool {
    matches!(b, b'_' | b'.' | b':' | b'-') || b.is_ascii_alphanumeric()
}
227
/// Advance past "spaces, tabs, and up to one line ending" beginning at
/// index `start`, and return the index where the run stops.
///
/// A line ending is `\n`, `\r`, or `\r\n` (counted as one). Scanning
/// stops at a second line ending, at any other byte, or at the end of
/// input. Never fails: if nothing is skipped, `start` is returned as-is,
/// even when it lies beyond the end of `bytes`.
fn skip_ws_with_optional_lf(bytes: &[u8], start: usize) -> usize {
    let mut pos = start;
    let mut line_ending_used = false;

    while let Some(&b) = bytes.get(pos) {
        let step = match b {
            b' ' | b'\t' => 1,
            // Only one line ending is allowed per whitespace run.
            b'\n' | b'\r' if line_ending_used => break,
            // A CRLF pair is a single line ending.
            b'\r' if bytes.get(pos + 1) == Some(&b'\n') => 2,
            b'\n' | b'\r' => 1,
            _ => break,
        };
        if b == b'\n' || b == b'\r' {
            line_ending_used = true;
        }
        pos += step;
    }

    pos
}
258
259/// Like `skip_ws_with_optional_lf`, but requires consuming at least one
260/// whitespace character (or one line ending).
261fn skip_ws_required_with_optional_lf(bytes: &[u8], start: usize) -> Option<usize> {
262    let after = skip_ws_with_optional_lf(bytes, start);
263    if after == start { None } else { Some(after) }
264}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is recognized with length `expected_len`
    /// under both the CommonMark and the Pandoc dialect.
    ///
    /// Use this for constructs the two dialects agree on. CDATA sections
    /// and declarations are CommonMark-only and are checked through
    /// `matches_cm` / `no_match_pandoc` instead.
    fn matches(input: &str, expected_len: usize) {
        matches_cm(input, expected_len);
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            Some(expected_len),
            "expected {input:?} to match {expected_len} under Pandoc",
        );
    }

    /// Asserts that `input` is recognized with length `expected_len`
    /// under the CommonMark dialect only.
    fn matches_cm(input: &str, expected_len: usize) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::CommonMark),
            Some(expected_len),
            "expected {input:?} to match {expected_len} under CommonMark",
        );
    }

    /// Asserts that `input` is rejected under both dialects.
    fn no_match(input: &str) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::CommonMark),
            None,
            "expected no match for {input:?} under CommonMark",
        );
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            None,
            "expected no match for {input:?} under Pandoc",
        );
    }

    /// Asserts that `input` is rejected under the Pandoc dialect only.
    fn no_match_pandoc(input: &str) {
        assert_eq!(
            try_parse_inline_html(input, Dialect::Pandoc),
            None,
            "expected no match for {input:?} under Pandoc dialect",
        );
    }

    /// Asserts that every input is recognized in full (length equals the
    /// input's byte length) under both dialects.
    fn matches_whole(inputs: &[&str]) {
        for input in inputs {
            matches(input, input.len());
        }
    }

    #[test]
    fn simple_open_tag() {
        for (input, expected_len) in [("<a>", 3), ("<bab>", 5), ("<c2c>", 5)] {
            matches(input, expected_len);
        }
    }

    #[test]
    fn empty_element() {
        for (input, expected_len) in [("<a/>", 4), ("<b2/>", 5), ("<a  />", 6)] {
            matches(input, expected_len);
        }
    }

    #[test]
    fn open_tag_with_attrs() {
        matches_whole(&[
            r#"<a href="x">"#,
            r#"<a foo="bar" baz='qux'>"#,
            r#"<a foo=bar>"#,
        ]);
    }

    #[test]
    fn open_tag_attr_value_spans_lines() {
        matches_whole(&["<a href=\"foo\nbar\">"]);
    }

    #[test]
    fn close_tag() {
        for (input, expected_len) in [("</a>", 4), ("</foo >", 7)] {
            matches(input, expected_len);
        }
    }

    #[test]
    fn comment_forms() {
        let cases = [
            ("<!-->", 5),
            ("<!--->", 6),
            ("<!---->", 7),
            ("<!-- hi -->", 11),
            ("<!-- a\nb -->", 12),
        ];
        for (input, expected_len) in cases {
            matches(input, expected_len);
        }
    }

    #[test]
    fn processing_instruction() {
        matches("<?php $x; ?>", 12);
    }

    #[test]
    fn cdata() {
        let section = "<![CDATA[a]]>";
        matches_cm(section, 13);
        // Pandoc-markdown does not recognize bare CDATA as inline raw HTML.
        no_match_pandoc(section);
    }

    #[test]
    fn declaration() {
        let cases = [("<!ELEMENT br EMPTY>", 19), ("<!DOCTYPE html>", 15)];
        for (input, expected_len) in cases {
            matches_cm(input, expected_len);
        }
        // Pandoc-markdown does not recognize bare declarations as inline
        // raw HTML — the bytes fall through to plain text.
        for (input, _) in cases {
            no_match_pandoc(input);
        }
    }

    #[test]
    fn rejects_illegal() {
        let rejected = [
            "<33>",
            "<__>",
            "<a h*#ref=\"hi\">",
            r#"<a href="hi'>"#,
            "< a>",
            "<bar/ >",
            "<a href='bar'title=title>",
            "<",
            "<a",
            "<!--",
            "<![CDATA[abc",
        ];
        for input in rejected {
            no_match(input);
        }
    }

    #[test]
    fn rejects_unclosed_quoted_value() {
        no_match("<a href=\"foo");
    }

    #[test]
    fn ignores_non_lt_prefix() {
        for input in ["foo", "a<b>"] {
            no_match(input);
        }
    }
}