Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
8use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
9
/// HTML block-level tags as defined by CommonMark spec.
/// These tags start an HTML block when found at the start of a line.
///
/// Entries are lowercase and kept in alphabetical order; callers lowercase
/// the candidate tag name before looking it up here.
///
/// NOTE(review): the list contains `source` and has no `search` entry, which
/// looks like the pre-0.31 CommonMark type-6 list — confirm which spec
/// revision this parser is meant to track.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
76
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// Within this file the list is used in two ways: an opening tag whose
/// lowercased name appears here starts a `BlockTag` with `is_verbatim: true`,
/// and a tag whose name appears here is never classified as `Type7`.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
79
/// Classification of a detected HTML block opening.
///
/// Produced by `try_parse_html_block_start` and consumed by
/// `parse_html_block`, which uses it to decide how the block ends.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Block-level tag (CommonMark types 6/1 — `tag_name` is one of
    /// `BLOCK_TAGS` or `VERBATIM_TAGS`). Set `closed_by_blank_line` to use
    /// CommonMark §4.6 type-6 end semantics (block ends at blank line);
    /// otherwise the legacy "ends at matching `</tag>`" semantics apply.
    BlockTag {
        // Lowercased tag name, without angle brackets or a leading slash.
        tag_name: String,
        // True when `tag_name` is one of `VERBATIM_TAGS`.
        is_verbatim: bool,
        // True when the block ends at the next blank line instead of at a
        // matching closing tag.
        closed_by_blank_line: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
105
106/// Try to detect an HTML block opening from content.
107/// Returns block type if this is a valid HTML block start.
108///
109/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
110/// accept closing tags (`</div>`), type-6 blocks end at the next blank
111/// line (rather than a matching close tag), and type 7 is recognized.
112pub(crate) fn try_parse_html_block_start(
113    content: &str,
114    is_commonmark: bool,
115) -> Option<HtmlBlockType> {
116    let trimmed = strip_leading_spaces(content);
117
118    // Must start with <
119    if !trimmed.starts_with('<') {
120        return None;
121    }
122
123    // HTML comment
124    if trimmed.starts_with("<!--") {
125        return Some(HtmlBlockType::Comment);
126    }
127
128    // Processing instruction
129    if trimmed.starts_with("<?") {
130        return Some(HtmlBlockType::ProcessingInstruction);
131    }
132
133    // CDATA section
134    if trimmed.starts_with("<![CDATA[") {
135        return Some(HtmlBlockType::CData);
136    }
137
138    // Declaration (DOCTYPE, etc.)
139    if trimmed.starts_with("<!") && trimmed.len() > 2 {
140        let after_bang = &trimmed[2..];
141        if after_bang.chars().next()?.is_ascii_uppercase() {
142            return Some(HtmlBlockType::Declaration);
143        }
144    }
145
146    // Try to parse as opening tag (or closing tag, under CommonMark)
147    if let Some(tag_name) = extract_block_tag_name(trimmed, is_commonmark) {
148        let tag_lower = tag_name.to_lowercase();
149        let is_closing = trimmed.starts_with("</");
150
151        // Check if it's a block-level tag
152        if BLOCK_TAGS.contains(&tag_lower.as_str()) {
153            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
154            return Some(HtmlBlockType::BlockTag {
155                tag_name: tag_lower,
156                is_verbatim,
157                closed_by_blank_line: is_commonmark && !is_verbatim,
158            });
159        }
160
161        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
162        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
163        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
164        // do not start a type-1 block. Letting `</pre>` through here would
165        // wrongly interrupt a paragraph.
166        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
167            return Some(HtmlBlockType::BlockTag {
168                tag_name: tag_lower,
169                is_verbatim: true,
170                closed_by_blank_line: false,
171            });
172        }
173    }
174
175    // Type 7 (CommonMark only): complete open or close tag on a line by
176    // itself, tag name not in the type-1 verbatim list.
177    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
178    {
179        let rest = &trimmed[end..];
180        let only_ws = rest
181            .bytes()
182            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
183        if only_ws {
184            // Reject if the tag name belongs to the type-1 verbatim set
185            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
186            // type-1 starts above, so seeing one here means the opener
187            // had a different shape (e.g. `<pre/>` self-closing) that
188            // shouldn't trigger type 7 either. Conservatively skip.
189            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
190            let name_end = leading
191                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
192                .unwrap_or(leading.len());
193            let name = leading[..name_end].to_ascii_lowercase();
194            if !VERBATIM_TAGS.contains(&name.as_str()) {
195                return Some(HtmlBlockType::Type7);
196            }
197        }
198    }
199
200    None
201}
202
/// Pull the tag name out of text that begins with an HTML tag.
///
/// `text` must start with `<`. A closing form (`</tag`) is only accepted
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form); otherwise it yields `None`. The name runs up to the first
/// whitespace character, `>`, or `/` (or to the end of `text`), must be
/// non-empty, must start with an ASCII letter, and may contain only ASCII
/// letters and digits.
///
/// Returns the name exactly as written (case is not changed).
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let body = text.strip_prefix('<')?;

    // Step over the slash of a closing tag, if closing tags are allowed.
    let candidate = match body.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(rest) => rest,
        None => body,
    };

    // Everything before the first delimiter is the would-be tag name.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or(candidate);

    // An empty name has no first character and is rejected here as well.
    let mut chars = name.chars();
    match chars.next() {
        Some(first) if first.is_ascii_alphabetic() => {}
        _ => return None,
    }

    if chars.all(|c| c.is_ascii_alphanumeric()) {
        Some(name.to_string())
    } else {
        None
    }
}
247
248/// Whether this block type ends at a blank line (CommonMark types 6 & 7
249/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
250/// marker — only at end of input or the next blank line.
251fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
252    matches!(
253        block_type,
254        HtmlBlockType::Type7
255            | HtmlBlockType::BlockTag {
256                closed_by_blank_line: true,
257                ..
258            }
259    )
260}
261
262/// Check if a line contains the closing marker for the given HTML block type.
263/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
264/// blank-line-terminated types (6 in CommonMark, 7) never match here.
265fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
266    match block_type {
267        HtmlBlockType::Comment => line.contains("-->"),
268        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
269        HtmlBlockType::Declaration => line.contains('>'),
270        HtmlBlockType::CData => line.contains("]]>"),
271        HtmlBlockType::BlockTag {
272            tag_name,
273            closed_by_blank_line: false,
274            ..
275        } => {
276            // Look for closing tag </tagname>
277            let closing_tag = format!("</{}>", tag_name);
278            line.to_lowercase().contains(&closing_tag)
279        }
280        HtmlBlockType::BlockTag {
281            closed_by_blank_line: true,
282            ..
283        }
284        | HtmlBlockType::Type7 => false,
285    }
286}
287
288/// Parse an HTML block, consuming lines from the parser.
289/// Returns the new position after the HTML block.
290pub(crate) fn parse_html_block(
291    builder: &mut GreenNodeBuilder<'static>,
292    lines: &[&str],
293    start_pos: usize,
294    block_type: HtmlBlockType,
295    bq_depth: usize,
296) -> usize {
297    // Start HTML block
298    builder.start_node(SyntaxKind::HTML_BLOCK.into());
299
300    let first_line = lines[start_pos];
301    let blank_terminated = ends_at_blank_line(&block_type);
302
303    // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
304    // tokens for the first line's blockquote prefix; emit only the inner
305    // content as TEXT to keep the CST byte-equal to the source.
306    let first_inner = if bq_depth > 0 {
307        strip_n_blockquote_markers(first_line, bq_depth)
308    } else {
309        first_line
310    };
311
312    // Emit opening line
313    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
314
315    let (line_without_newline, newline_str) = strip_newline(first_inner);
316    if !line_without_newline.is_empty() {
317        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
318    }
319    if !newline_str.is_empty() {
320        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
321    }
322
323    builder.finish_node(); // HtmlBlockTag
324
325    // Check if opening line also contains closing marker. Blank-line-terminated
326    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
327    // end at a blank line or end of input.
328    if !blank_terminated && is_closing_marker(first_inner, &block_type) {
329        log::trace!(
330            "HTML block at line {} opens and closes on same line",
331            start_pos + 1
332        );
333        builder.finish_node(); // HtmlBlock
334        return start_pos + 1;
335    }
336
337    let mut current_pos = start_pos + 1;
338    let mut content_lines: Vec<&str> = Vec::new();
339    let mut found_closing = false;
340
341    // Parse content until we find the closing marker
342    while current_pos < lines.len() {
343        let line = lines[current_pos];
344        let (line_bq_depth, inner) = count_blockquote_markers(line);
345
346        // Only process lines at the same or deeper blockquote depth
347        if line_bq_depth < bq_depth {
348            break;
349        }
350
351        // Blank-line-terminated blocks (types 6/7) end before the blank line.
352        // The blank line itself is not part of the block.
353        if blank_terminated && inner.trim().is_empty() {
354            break;
355        }
356
357        // Check for closing marker. Match against the inner content so a `>`-
358        // prefixed continuation line still recognises e.g. `</div>`.
359        if is_closing_marker(inner, &block_type) {
360            log::trace!("Found HTML block closing at line {}", current_pos + 1);
361            found_closing = true;
362
363            if !content_lines.is_empty() {
364                builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
365                for content_line in &content_lines {
366                    emit_html_block_line(builder, content_line, bq_depth);
367                }
368                builder.finish_node();
369            }
370
371            builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
372            emit_html_block_line(builder, line, bq_depth);
373            builder.finish_node();
374
375            current_pos += 1;
376            break;
377        }
378
379        // Regular content line
380        content_lines.push(line);
381        current_pos += 1;
382    }
383
384    // If we didn't find a closing marker, emit what we collected
385    if !found_closing {
386        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
387        if !content_lines.is_empty() {
388            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
389            for content_line in &content_lines {
390                emit_html_block_line(builder, content_line, bq_depth);
391            }
392            builder.finish_node();
393        }
394    }
395
396    builder.finish_node(); // HtmlBlock
397    current_pos
398}
399
400/// Emit one continuation line of an HTML block, preserving any blockquote
401/// markers as structural tokens (so the CST stays byte-equal to the source
402/// and downstream consumers can strip them per-context).
403fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
404    let inner = if bq_depth > 0 {
405        let stripped = strip_n_blockquote_markers(line, bq_depth);
406        let prefix_len = line.len() - stripped.len();
407        if prefix_len > 0 {
408            for ch in line[..prefix_len].chars() {
409                if ch == '>' {
410                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
411                } else {
412                    let mut buf = [0u8; 4];
413                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
414                }
415            }
416        }
417        stripped
418    } else {
419        line
420    };
421
422    let (line_without_newline, newline_str) = strip_newline(inner);
423    if !line_without_newline.is_empty() {
424        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
425    }
426    if !newline_str.is_empty() {
427        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
428    }
429}
430
// Unit tests for HTML block start detection, tag-name extraction, closing
// marker detection, and the line-consumption count of `parse_html_block`.
// The `parse_html_block` tests only assert the returned position; they do
// not inspect the tree that was built.
#[cfg(test)]
mod tests {
    use super::*;

    // Comments are detected with and without leading spaces.
    #[test]
    fn test_try_parse_html_comment() {
        assert_eq!(
            try_parse_html_block_start("<!-- comment -->", false),
            Some(HtmlBlockType::Comment)
        );
        assert_eq!(
            try_parse_html_block_start("  <!-- comment -->", false),
            Some(HtmlBlockType::Comment)
        );
    }

    // Legacy dialect: `<div>` is a tag-terminated block, attributes allowed.
    #[test]
    fn test_try_parse_div_tag() {
        assert_eq!(
            try_parse_html_block_start("<div>", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: false,
            })
        );
        assert_eq!(
            try_parse_html_block_start("<div class=\"test\">", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: false,
            })
        );
    }

    // `<script>` is not in BLOCK_TAGS but is accepted as a verbatim tag.
    #[test]
    fn test_try_parse_script_tag() {
        assert_eq!(
            try_parse_html_block_start("<script>", false),
            Some(HtmlBlockType::BlockTag {
                tag_name: "script".to_string(),
                is_verbatim: true,
                closed_by_blank_line: false,
            })
        );
    }

    #[test]
    fn test_try_parse_processing_instruction() {
        assert_eq!(
            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
            Some(HtmlBlockType::ProcessingInstruction)
        );
    }

    #[test]
    fn test_try_parse_declaration() {
        assert_eq!(
            try_parse_html_block_start("<!DOCTYPE html>", false),
            Some(HtmlBlockType::Declaration)
        );
    }

    // CDATA must win over the generic `<!` declaration check.
    #[test]
    fn test_try_parse_cdata() {
        assert_eq!(
            try_parse_html_block_start("<![CDATA[content]]>", false),
            Some(HtmlBlockType::CData)
        );
    }

    // With `accept_closing = false`, closing tags and malformed names are
    // rejected.
    #[test]
    fn test_extract_block_tag_name_open_only() {
        assert_eq!(
            extract_block_tag_name("<div>", false),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("<div class=\"test\">", false),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("<div/>", false),
            Some("div".to_string())
        );
        assert_eq!(extract_block_tag_name("</div>", false), None);
        assert_eq!(extract_block_tag_name("<>", false), None);
        assert_eq!(extract_block_tag_name("< div>", false), None);
    }

    #[test]
    fn test_extract_block_tag_name_with_closing() {
        // CommonMark §4.6 type-6 starts also accept closing tags.
        assert_eq!(
            extract_block_tag_name("</div>", true),
            Some("div".to_string())
        );
        assert_eq!(
            extract_block_tag_name("</div >", true),
            Some("div".to_string())
        );
    }

    // Under CommonMark a bare closing tag starts a blank-line-terminated
    // type-6 block.
    #[test]
    fn test_commonmark_type6_closing_tag_start() {
        assert_eq!(
            try_parse_html_block_start("</div>", true),
            Some(HtmlBlockType::BlockTag {
                tag_name: "div".to_string(),
                is_verbatim: false,
                closed_by_blank_line: true,
            })
        );
    }

    #[test]
    fn test_commonmark_type7_open_tag() {
        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
        // CommonMark; rejected under non-CommonMark.
        assert_eq!(
            try_parse_html_block_start("<a href=\"foo\">", true),
            Some(HtmlBlockType::Type7)
        );
        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
    }

    #[test]
    fn test_commonmark_type7_close_tag() {
        assert_eq!(
            try_parse_html_block_start("</ins>", true),
            Some(HtmlBlockType::Type7)
        );
    }

    #[test]
    fn test_commonmark_type7_rejects_with_trailing_text() {
        // A complete tag must be followed only by whitespace.
        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
    }

    // The comment end marker may appear anywhere in the line.
    #[test]
    fn test_is_closing_marker_comment() {
        let block_type = HtmlBlockType::Comment;
        assert!(is_closing_marker("-->", &block_type));
        assert!(is_closing_marker("end -->", &block_type));
        assert!(!is_closing_marker("<!--", &block_type));
    }

    #[test]
    fn test_is_closing_marker_tag() {
        let block_type = HtmlBlockType::BlockTag {
            tag_name: "div".to_string(),
            is_verbatim: false,
            closed_by_blank_line: false,
        };
        assert!(is_closing_marker("</div>", &block_type));
        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
        assert!(is_closing_marker("content</div>", &block_type));
        assert!(!is_closing_marker("<div>", &block_type));
    }

    // A comment that opens and closes on one line consumes exactly that line.
    // Note: `str::lines` strips line terminators, so the lines passed to the
    // parser in these tests carry no trailing newline.
    #[test]
    fn test_parse_html_comment_block() {
        let input = "<!-- comment -->\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 1);
    }

    // Opening tag, one content line, closing tag: all three lines consumed.
    #[test]
    fn test_parse_div_block() {
        let input = "<div>\ncontent\n</div>\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        assert_eq!(new_pos, 3);
    }

    #[test]
    fn test_parse_html_block_no_closing() {
        let input = "<div>\ncontent\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        // Should consume all lines even without closing tag
        assert_eq!(new_pos, 2);
    }

    #[test]
    fn test_commonmark_type6_blank_line_terminates() {
        let input = "<div>\nfoo\n\nbar\n";
        let lines: Vec<&str> = input.lines().collect();
        let mut builder = GreenNodeBuilder::new();

        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
        let new_pos = parse_html_block(&mut builder, &lines, 0, block_type, 0);

        // Block contains <div>\nfoo\n; stops at blank line (line 2).
        assert_eq!(new_pos, 2);
    }
}