Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
4use crate::syntax::SyntaxKind;
5use rowan::GreenNodeBuilder;
6
7use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
8use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
9
/// Tag names that open a block-level HTML block (the CommonMark "type 6"
/// start condition): a line beginning with one of these tags starts an
/// HTML block.
///
/// Entries are lowercase and kept in alphabetical order, grouped by initial
/// letter for readability.
#[rustfmt::skip]
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
76
/// Tags whose content is raw/verbatim: nothing inside them is processed as
/// Markdown. Entries are lowercase.
const VERBATIM_TAGS: &[&str] = &[
    "script",
    "style",
    "pre",
    "textarea",
];
79
80/// Whether `name` (case-insensitive) is one of the HTML block-level tags
81/// recognized by CommonMark §4.6 type-6 (and pandoc's `markdown_in_html_blocks`
82/// splitter). Used by the pandoc-native projector to decide whether a complete
83/// HTML tag inside an `HTML_BLOCK` should split the block — block-level tags
84/// emit as separate `RawBlock` entries; inline tags (e.g. `<em>`, `<a>`,
85/// `<input>`, `<br>`) stay inline in the surrounding `Plain` content.
86pub fn is_html_block_tag_name(name: &str) -> bool {
87    let lower = name.to_ascii_lowercase();
88    BLOCK_TAGS.contains(&lower.as_str())
89}
90
/// The kind of HTML block recognized at a block start.
///
/// The kind determines how the block is terminated: by a kind-specific
/// closing marker, by a matching close tag, or by a blank line.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// An HTML comment, `<!-- ... -->`.
    Comment,
    /// A processing instruction, `<? ... ?>`.
    ProcessingInstruction,
    /// A declaration, `<!...>` (for example a doctype).
    Declaration,
    /// A CDATA section, `<![CDATA[ ... ]]>`.
    CData,
    /// A block opened by a tag from `BLOCK_TAGS` or `VERBATIM_TAGS`
    /// (CommonMark types 6 and 1 respectively).
    ///
    /// * `tag_name` — the tag's name.
    /// * `is_verbatim` — whether the tag is one of `VERBATIM_TAGS`.
    /// * `closed_by_blank_line` — when true, CommonMark §4.6 type-6 end
    ///   semantics apply (the block ends at a blank line); when false, the
    ///   legacy "ends at the matching `</tag>`" semantics apply.
    /// * `depth_aware` — extends the matching-tag close path with balanced
    ///   open/close tracking of the same tag name (mirroring pandoc's
    ///   `htmlInBalanced`), so nested `<div>...<div>...</div>...</div>`
    ///   shapes close correctly under the Pandoc dialect. Has no effect when
    ///   `closed_by_blank_line` is true.
    BlockTag {
        tag_name: String,
        is_verbatim: bool,
        closed_by_blank_line: bool,
        depth_aware: bool,
    },
    /// CommonMark §4.6 type 7: a complete open or close tag alone on its
    /// line, whose name is not in the type-1 verbatim list. The block ends
    /// at a blank line and cannot interrupt a paragraph.
    Type7,
}
122
123/// Try to detect an HTML block opening from content.
124/// Returns block type if this is a valid HTML block start.
125///
126/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
127/// accept closing tags (`</div>`), type-6 blocks end at the next blank
128/// line (rather than a matching close tag), and type 7 is recognized.
129pub(crate) fn try_parse_html_block_start(
130    content: &str,
131    is_commonmark: bool,
132) -> Option<HtmlBlockType> {
133    let trimmed = strip_leading_spaces(content);
134
135    // Must start with <
136    if !trimmed.starts_with('<') {
137        return None;
138    }
139
140    // HTML comment
141    if trimmed.starts_with("<!--") {
142        return Some(HtmlBlockType::Comment);
143    }
144
145    // Processing instruction
146    if trimmed.starts_with("<?") {
147        return Some(HtmlBlockType::ProcessingInstruction);
148    }
149
150    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
151    // recognize bare CDATA as a raw HTML block; the literal bytes fall
152    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
153    // text is parsed as inline markdown, etc).
154    if is_commonmark && trimmed.starts_with("<![CDATA[") {
155        return Some(HtmlBlockType::CData);
156    }
157
158    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
159    // does not recognize bare declarations as raw HTML blocks (its
160    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
161    // tag-shaped blocks); the bytes fall through to paragraph parsing.
162    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
163        let after_bang = &trimmed[2..];
164        if after_bang.chars().next()?.is_ascii_alphabetic() {
165            return Some(HtmlBlockType::Declaration);
166        }
167    }
168
169    // Try to parse as opening tag (or closing tag, under CommonMark)
170    if let Some(tag_name) = extract_block_tag_name(trimmed, is_commonmark) {
171        let tag_lower = tag_name.to_lowercase();
172        let is_closing = trimmed.starts_with("</");
173
174        // Check if it's a block-level tag
175        if BLOCK_TAGS.contains(&tag_lower.as_str()) {
176            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
177            return Some(HtmlBlockType::BlockTag {
178                tag_name: tag_lower,
179                is_verbatim,
180                closed_by_blank_line: is_commonmark && !is_verbatim,
181                depth_aware: !is_commonmark,
182            });
183        }
184
185        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
186        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
187        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
188        // do not start a type-1 block. Letting `</pre>` through here would
189        // wrongly interrupt a paragraph.
190        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
191            return Some(HtmlBlockType::BlockTag {
192                tag_name: tag_lower,
193                is_verbatim: true,
194                closed_by_blank_line: false,
195                depth_aware: !is_commonmark,
196            });
197        }
198    }
199
200    // Type 7 (CommonMark only): complete open or close tag on a line by
201    // itself, tag name not in the type-1 verbatim list.
202    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
203    {
204        let rest = &trimmed[end..];
205        let only_ws = rest
206            .bytes()
207            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
208        if only_ws {
209            // Reject if the tag name belongs to the type-1 verbatim set
210            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
211            // type-1 starts above, so seeing one here means the opener
212            // had a different shape (e.g. `<pre/>` self-closing) that
213            // shouldn't trigger type 7 either. Conservatively skip.
214            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
215            let name_end = leading
216                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
217                .unwrap_or(leading.len());
218            let name = leading[..name_end].to_ascii_lowercase();
219            if !VERBATIM_TAGS.contains(&name.as_str()) {
220                return Some(HtmlBlockType::Type7);
221            }
222        }
223    }
224
225    None
226}
227
/// Pull the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. An opening form (`<tag ...`) is always
/// accepted; a closing form (`</tag ...`) is accepted only when
/// `accept_closing` is true (CommonMark §4.6 type 6 allows either).
///
/// The name runs up to the first whitespace character, `>`, or `/` (or the
/// end of `text`) — an approximation of the spec's "followed by a space,
/// tab, line ending, `>`, or `/>`" rule. It must start with an ASCII letter
/// and contain only ASCII alphanumerics; otherwise `None` is returned. The
/// name is returned with its original casing.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    // Everything before the first delimiter is the would-be tag name.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    // An empty name has no first character and is rejected here.
    let mut chars = name.chars();
    let first = chars.next()?;
    if first.is_ascii_alphabetic() && chars.all(|c| c.is_ascii_alphanumeric()) {
        Some(name.to_owned())
    } else {
        None
    }
}
272
273/// Whether this block type ends at a blank line (CommonMark types 6 & 7
274/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
275/// marker — only at end of input or the next blank line.
276fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
277    matches!(
278        block_type,
279        HtmlBlockType::Type7
280            | HtmlBlockType::BlockTag {
281                closed_by_blank_line: true,
282                ..
283            }
284    )
285}
286
287/// Check if a line contains the closing marker for the given HTML block type.
288/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
289/// blank-line-terminated types (6 in CommonMark, 7) never match here.
290fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
291    match block_type {
292        HtmlBlockType::Comment => line.contains("-->"),
293        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
294        HtmlBlockType::Declaration => line.contains('>'),
295        HtmlBlockType::CData => line.contains("]]>"),
296        HtmlBlockType::BlockTag {
297            tag_name,
298            closed_by_blank_line: false,
299            ..
300        } => {
301            // Look for closing tag </tagname>
302            let closing_tag = format!("</{}>", tag_name);
303            line.to_lowercase().contains(&closing_tag)
304        }
305        HtmlBlockType::BlockTag {
306            closed_by_blank_line: true,
307            ..
308        }
309        | HtmlBlockType::Type7 => false,
310    }
311}
312
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`. The name is matched ASCII
/// case-insensitively and must be followed by a space, tab, line ending,
/// `>`, `/`, or the end of `line`.
///
/// Self-closing forms (`<tag .../>`) and tags whose name appears inside a
/// quoted attribute value are NOT counted — the scanner walks `<...>`
/// brackets and respects `"`/`'` quoting. A `<` that cannot begin markup
/// (for example the comparison in `a < b`) is treated as literal text, so it
/// does not hide a real tag later on the same line.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`).
fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let tag_bytes = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;
        let is_close = bytes.get(after) == Some(&b'/');
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        // `get` yields `None` when the candidate name would run past the end
        // of the line, so no explicit length check is needed.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|name| name.eq_ignore_ascii_case(tag_bytes));

        // Only walk a bracket when the `<` can actually start markup: a tag
        // name, a close tag, or a `<!...>` / `<?...>` construct. Any other
        // `<` is plain text; walking it to the next `>` would swallow a
        // genuine tag that follows it.
        let opens_bracket = matched
            || matches!(
                bytes.get(after).copied(),
                Some(b) if b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?')
            );
        if !opens_bracket {
            i = after;
            continue;
        }

        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this bracket, skipping over
        // quoted attribute values. A self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if !found_gt {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to count.
            break;
        }
        i = j + 1;
    }

    (opens, closes)
}
390
391/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
392/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
393/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
394/// identical to the source either way; only the wrapper retag changes.
395pub(crate) fn parse_html_block_with_wrapper(
396    builder: &mut GreenNodeBuilder<'static>,
397    lines: &[&str],
398    start_pos: usize,
399    block_type: HtmlBlockType,
400    bq_depth: usize,
401    wrapper_kind: SyntaxKind,
402) -> usize {
403    // Start HTML block
404    builder.start_node(wrapper_kind.into());
405
406    let first_line = lines[start_pos];
407    let blank_terminated = ends_at_blank_line(&block_type);
408
409    // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
410    // tokens for the first line's blockquote prefix; emit only the inner
411    // content as TEXT to keep the CST byte-equal to the source.
412    let first_inner = if bq_depth > 0 {
413        strip_n_blockquote_markers(first_line, bq_depth)
414    } else {
415        first_line
416    };
417
418    // Emit opening line
419    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
420
421    let (line_without_newline, newline_str) = strip_newline(first_inner);
422    if !line_without_newline.is_empty() {
423        // For HTML_BLOCK_DIV, expose the open tag's attributes
424        // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
425        // via the same descendants walk that handles fenced-div /
426        // heading attrs. CST bytes stay byte-equal to source — we only
427        // tokenize at finer granularity for matched div opens.
428        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
429            emit_div_open_tag_tokens(builder, line_without_newline);
430        } else {
431            builder.token(SyntaxKind::TEXT.into(), line_without_newline);
432        }
433    }
434    if !newline_str.is_empty() {
435        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
436    }
437
438    builder.finish_node(); // HtmlBlockTag
439
440    // Set up depth-aware close tracking when the block type asks for it
441    // (Pandoc dialect, balanced same-name tag matching). A `None` means
442    // we fall back to the legacy "first matching close" path via
443    // `is_closing_marker`.
444    let depth_aware_tag: Option<String> = match &block_type {
445        HtmlBlockType::BlockTag {
446            tag_name,
447            closed_by_blank_line: false,
448            depth_aware: true,
449            ..
450        } => Some(tag_name.clone()),
451        _ => None,
452    };
453    let mut depth: i64 = 1;
454    if let Some(tag_name) = &depth_aware_tag {
455        let (opens, closes) = count_tag_balance(first_inner, tag_name);
456        depth = opens as i64 - closes as i64;
457    }
458
459    // Check if opening line also contains closing marker. Blank-line-terminated
460    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
461    // end at a blank line or end of input.
462    let same_line_closed = !blank_terminated
463        && match &depth_aware_tag {
464            Some(_) => depth <= 0,
465            None => is_closing_marker(first_inner, &block_type),
466        };
467    if same_line_closed {
468        log::trace!(
469            "HTML block at line {} opens and closes on same line",
470            start_pos + 1
471        );
472        builder.finish_node(); // HtmlBlock
473        return start_pos + 1;
474    }
475
476    let mut current_pos = start_pos + 1;
477    let mut content_lines: Vec<&str> = Vec::new();
478    let mut found_closing = false;
479
480    // Parse content until we find the closing marker
481    while current_pos < lines.len() {
482        let line = lines[current_pos];
483        let (line_bq_depth, inner) = count_blockquote_markers(line);
484
485        // Only process lines at the same or deeper blockquote depth
486        if line_bq_depth < bq_depth {
487            break;
488        }
489
490        // Blank-line-terminated blocks (types 6/7) end before the blank line.
491        // The blank line itself is not part of the block.
492        if blank_terminated && inner.trim().is_empty() {
493            break;
494        }
495
496        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
497        // count opens/closes of the same tag name and only close when depth
498        // returns to 0; otherwise fall back to substring-match on the line.
499        let line_closes = match &depth_aware_tag {
500            Some(tag_name) => {
501                let (opens, closes) = count_tag_balance(inner, tag_name);
502                depth += opens as i64;
503                depth -= closes as i64;
504                depth <= 0
505            }
506            None => is_closing_marker(inner, &block_type),
507        };
508
509        if line_closes {
510            log::trace!("Found HTML block closing at line {}", current_pos + 1);
511            found_closing = true;
512
513            if !content_lines.is_empty() {
514                builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
515                for content_line in &content_lines {
516                    emit_html_block_line(builder, content_line, bq_depth);
517                }
518                builder.finish_node();
519            }
520
521            builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
522            emit_html_block_line(builder, line, bq_depth);
523            builder.finish_node();
524
525            current_pos += 1;
526            break;
527        }
528
529        // Regular content line
530        content_lines.push(line);
531        current_pos += 1;
532    }
533
534    // If we didn't find a closing marker, emit what we collected
535    if !found_closing {
536        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
537        if !content_lines.is_empty() {
538            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
539            for content_line in &content_lines {
540                emit_html_block_line(builder, content_line, bq_depth);
541            }
542            builder.finish_node();
543        }
544    }
545
546    builder.finish_node(); // HtmlBlock
547    current_pos
548}
549
550/// Emit the open-tag line of an `HTML_BLOCK_DIV`, splitting the bytes
551/// `[ws]<div[ ws ATTRS]>[trailing]` into
552/// `WHITESPACE? + TEXT("<div") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
553/// + TEXT(">") + TEXT(trailing)?`.
554///
555/// Bytes are byte-identical to the source — this only tokenizes at finer
556/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
557/// region structurally. Falls back to a single TEXT token if the line
558/// doesn't fit the expected `<div ...>` shape (defensive — the parser
559/// only retags as `HTML_BLOCK_DIV` when this shape was matched).
560fn emit_div_open_tag_tokens(builder: &mut GreenNodeBuilder<'static>, line: &str) {
561    let bytes = line.as_bytes();
562    // Leading indent (CommonMark allows up to 3 spaces).
563    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
564    if indent_end > 0 {
565        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
566    }
567    let rest = &line[indent_end..];
568    // Match the literal `<div` prefix (ASCII case-insensitive on `div`).
569    if !rest.starts_with('<') || rest.len() < 4 || !rest[1..4].eq_ignore_ascii_case("div") {
570        builder.token(SyntaxKind::TEXT.into(), rest);
571        return;
572    }
573    let after_name = &rest[4..];
574    let after_name_bytes = after_name.as_bytes();
575    // Find the closing `>` of the open tag, respecting quoted attribute values.
576    let mut i = 0usize;
577    let mut quote: Option<u8> = None;
578    let mut tag_close: Option<usize> = None;
579    while i < after_name_bytes.len() {
580        let b = after_name_bytes[i];
581        match (quote, b) {
582            (None, b'"') | (None, b'\'') => quote = Some(b),
583            (Some(q), b2) if b2 == q => quote = None,
584            (None, b'>') => {
585                tag_close = Some(i);
586                break;
587            }
588            _ => {}
589        }
590        i += 1;
591    }
592    let Some(tag_close) = tag_close else {
593        // Open tag has no closing `>` on this line — defensive fallback.
594        builder.token(SyntaxKind::TEXT.into(), rest);
595        return;
596    };
597    // Whitespace between the tag name and the attribute region.
598    let attrs_inner = &after_name[..tag_close];
599    let ws_end = attrs_inner
600        .as_bytes()
601        .iter()
602        .position(|&b| !matches!(b, b' ' | b'\t'))
603        .unwrap_or(attrs_inner.len());
604    let leading_ws = &attrs_inner[..ws_end];
605    // Strip a trailing self-closing slash and the whitespace before it
606    // from the attribute region; emit them as TEXT outside the
607    // HTML_ATTRS node so the structural region only holds attribute
608    // bytes (not formatting punctuation).
609    let attrs_after_ws = &attrs_inner[ws_end..];
610    let mut attr_end = attrs_after_ws.len();
611    let attr_bytes = attrs_after_ws.as_bytes();
612    let mut self_close_start = attr_end;
613    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
614        self_close_start = attr_end - 1;
615        attr_end = self_close_start;
616        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
617            attr_end -= 1;
618        }
619    }
620    let attrs_text = &attrs_after_ws[..attr_end];
621    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
622    let after_self_close = &attrs_after_ws[self_close_start..];
623
624    builder.token(SyntaxKind::TEXT.into(), "<div");
625    if !leading_ws.is_empty() {
626        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
627    }
628    if !attrs_text.is_empty() {
629        builder.start_node(SyntaxKind::HTML_ATTRS.into());
630        builder.token(SyntaxKind::TEXT.into(), attrs_text);
631        builder.finish_node();
632    }
633    if !trailing_text.is_empty() {
634        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
635    }
636    if !after_self_close.is_empty() {
637        builder.token(SyntaxKind::TEXT.into(), after_self_close);
638    }
639    builder.token(SyntaxKind::TEXT.into(), ">");
640    let after_gt = &after_name[tag_close + 1..];
641    if !after_gt.is_empty() {
642        builder.token(SyntaxKind::TEXT.into(), after_gt);
643    }
644}
645
646/// Emit one continuation line of an HTML block, preserving any blockquote
647/// markers as structural tokens (so the CST stays byte-equal to the source
648/// and downstream consumers can strip them per-context).
649fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
650    let inner = if bq_depth > 0 {
651        let stripped = strip_n_blockquote_markers(line, bq_depth);
652        let prefix_len = line.len() - stripped.len();
653        if prefix_len > 0 {
654            for ch in line[..prefix_len].chars() {
655                if ch == '>' {
656                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
657                } else {
658                    let mut buf = [0u8; 4];
659                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
660                }
661            }
662        }
663        stripped
664    } else {
665        line
666    };
667
668    let (line_without_newline, newline_str) = strip_newline(inner);
669    if !line_without_newline.is_empty() {
670        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
671    }
672    if !newline_str.is_empty() {
673        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
674    }
675}
676
677#[cfg(test)]
678mod tests {
679    use super::*;
680
681    #[test]
682    fn test_try_parse_html_comment() {
683        assert_eq!(
684            try_parse_html_block_start("<!-- comment -->", false),
685            Some(HtmlBlockType::Comment)
686        );
687        assert_eq!(
688            try_parse_html_block_start("  <!-- comment -->", false),
689            Some(HtmlBlockType::Comment)
690        );
691    }
692
693    #[test]
694    fn test_try_parse_div_tag() {
695        assert_eq!(
696            try_parse_html_block_start("<div>", false),
697            Some(HtmlBlockType::BlockTag {
698                tag_name: "div".to_string(),
699                is_verbatim: false,
700                closed_by_blank_line: false,
701                depth_aware: true,
702            })
703        );
704        assert_eq!(
705            try_parse_html_block_start("<div class=\"test\">", false),
706            Some(HtmlBlockType::BlockTag {
707                tag_name: "div".to_string(),
708                is_verbatim: false,
709                closed_by_blank_line: false,
710                depth_aware: true,
711            })
712        );
713    }
714
715    #[test]
716    fn test_try_parse_script_tag() {
717        assert_eq!(
718            try_parse_html_block_start("<script>", false),
719            Some(HtmlBlockType::BlockTag {
720                tag_name: "script".to_string(),
721                is_verbatim: true,
722                closed_by_blank_line: false,
723                depth_aware: true,
724            })
725        );
726    }
727
728    #[test]
729    fn test_try_parse_processing_instruction() {
730        assert_eq!(
731            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
732            Some(HtmlBlockType::ProcessingInstruction)
733        );
734    }
735
736    #[test]
737    fn test_try_parse_declaration() {
738        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
739        assert_eq!(
740            try_parse_html_block_start("<!DOCTYPE html>", true),
741            Some(HtmlBlockType::Declaration)
742        );
743        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
744        // just uppercase. Lowercase doctype must match too.
745        assert_eq!(
746            try_parse_html_block_start("<!doctype html>", true),
747            Some(HtmlBlockType::Declaration)
748        );
749        // Pandoc dialect does not — bare declarations fall through to
750        // paragraph parsing.
751        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
752        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
753    }
754
755    #[test]
756    fn test_try_parse_cdata() {
757        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
758        assert_eq!(
759            try_parse_html_block_start("<![CDATA[content]]>", true),
760            Some(HtmlBlockType::CData)
761        );
762        // Pandoc dialect does not.
763        assert_eq!(
764            try_parse_html_block_start("<![CDATA[content]]>", false),
765            None
766        );
767    }
768
769    #[test]
770    fn test_extract_block_tag_name_open_only() {
771        assert_eq!(
772            extract_block_tag_name("<div>", false),
773            Some("div".to_string())
774        );
775        assert_eq!(
776            extract_block_tag_name("<div class=\"test\">", false),
777            Some("div".to_string())
778        );
779        assert_eq!(
780            extract_block_tag_name("<div/>", false),
781            Some("div".to_string())
782        );
783        assert_eq!(extract_block_tag_name("</div>", false), None);
784        assert_eq!(extract_block_tag_name("<>", false), None);
785        assert_eq!(extract_block_tag_name("< div>", false), None);
786    }
787
788    #[test]
789    fn test_extract_block_tag_name_with_closing() {
790        // CommonMark §4.6 type-6 starts also accept closing tags.
791        assert_eq!(
792            extract_block_tag_name("</div>", true),
793            Some("div".to_string())
794        );
795        assert_eq!(
796            extract_block_tag_name("</div >", true),
797            Some("div".to_string())
798        );
799    }
800
801    #[test]
802    fn test_commonmark_type6_closing_tag_start() {
803        assert_eq!(
804            try_parse_html_block_start("</div>", true),
805            Some(HtmlBlockType::BlockTag {
806                tag_name: "div".to_string(),
807                is_verbatim: false,
808                closed_by_blank_line: true,
809                depth_aware: false,
810            })
811        );
812    }
813
814    #[test]
815    fn test_commonmark_type7_open_tag() {
816        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
817        // CommonMark; rejected under non-CommonMark.
818        assert_eq!(
819            try_parse_html_block_start("<a href=\"foo\">", true),
820            Some(HtmlBlockType::Type7)
821        );
822        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
823    }
824
825    #[test]
826    fn test_commonmark_type7_close_tag() {
827        assert_eq!(
828            try_parse_html_block_start("</ins>", true),
829            Some(HtmlBlockType::Type7)
830        );
831    }
832
833    #[test]
834    fn test_commonmark_type7_rejects_with_trailing_text() {
835        // A complete tag must be followed only by whitespace.
836        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
837    }
838
839    #[test]
840    fn test_is_closing_marker_comment() {
841        let block_type = HtmlBlockType::Comment;
842        assert!(is_closing_marker("-->", &block_type));
843        assert!(is_closing_marker("end -->", &block_type));
844        assert!(!is_closing_marker("<!--", &block_type));
845    }
846
847    #[test]
848    fn test_is_closing_marker_tag() {
849        let block_type = HtmlBlockType::BlockTag {
850            tag_name: "div".to_string(),
851            is_verbatim: false,
852            closed_by_blank_line: false,
853            depth_aware: false,
854        };
855        assert!(is_closing_marker("</div>", &block_type));
856        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
857        assert!(is_closing_marker("content</div>", &block_type));
858        assert!(!is_closing_marker("<div>", &block_type));
859    }
860
861    #[test]
862    fn test_parse_html_comment_block() {
863        let input = "<!-- comment -->\n";
864        let lines: Vec<&str> = input.lines().collect();
865        let mut builder = GreenNodeBuilder::new();
866
867        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
868        let new_pos = parse_html_block_with_wrapper(
869            &mut builder,
870            &lines,
871            0,
872            block_type,
873            0,
874            SyntaxKind::HTML_BLOCK,
875        );
876
877        assert_eq!(new_pos, 1);
878    }
879
880    #[test]
881    fn test_parse_div_block() {
882        let input = "<div>\ncontent\n</div>\n";
883        let lines: Vec<&str> = input.lines().collect();
884        let mut builder = GreenNodeBuilder::new();
885
886        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
887        let new_pos = parse_html_block_with_wrapper(
888            &mut builder,
889            &lines,
890            0,
891            block_type,
892            0,
893            SyntaxKind::HTML_BLOCK,
894        );
895
896        assert_eq!(new_pos, 3);
897    }
898
899    #[test]
900    fn test_parse_html_block_no_closing() {
901        let input = "<div>\ncontent\n";
902        let lines: Vec<&str> = input.lines().collect();
903        let mut builder = GreenNodeBuilder::new();
904
905        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
906        let new_pos = parse_html_block_with_wrapper(
907            &mut builder,
908            &lines,
909            0,
910            block_type,
911            0,
912            SyntaxKind::HTML_BLOCK,
913        );
914
915        // Should consume all lines even without closing tag
916        assert_eq!(new_pos, 2);
917    }
918
919    #[test]
920    fn test_parse_div_block_nested_pandoc() {
921        // Pandoc dialect: a nested `<div>...<div>...</div>...</div>` must
922        // close on the OUTER `</div>`, not the first `</div>` seen. The
923        // CommonMark-style "first close" scanner is wrong here; Pandoc's
924        // div parser is depth-aware (mirrors `htmlInBalanced`).
925        let input =
926            "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
927        let lines: Vec<&str> = input.lines().collect();
928        let mut builder = GreenNodeBuilder::new();
929
930        // is_commonmark = false → Pandoc dialect.
931        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
932        let new_pos = parse_html_block_with_wrapper(
933            &mut builder,
934            &lines,
935            0,
936            block_type,
937            0,
938            SyntaxKind::HTML_BLOCK_DIV,
939        );
940
941        // 9 lines: outer-open, blank, inner-open, blank, content, blank,
942        // inner-close, blank, outer-close. All consumed.
943        assert_eq!(new_pos, 9);
944    }
945
946    #[test]
947    fn test_parse_div_block_same_line_pandoc() {
948        // <div>foo</div> on a single line: opens=1, closes=1, depth=0 →
949        // close on first line. Depth-aware tracking must not regress this.
950        let input = "<div>foo</div>\n";
951        let lines: Vec<&str> = input.lines().collect();
952        let mut builder = GreenNodeBuilder::new();
953
954        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
955        let new_pos = parse_html_block_with_wrapper(
956            &mut builder,
957            &lines,
958            0,
959            block_type,
960            0,
961            SyntaxKind::HTML_BLOCK_DIV,
962        );
963        assert_eq!(new_pos, 1);
964    }
965
966    #[test]
967    fn test_commonmark_verbatim_first_close() {
968        // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
969        // ends at the first matching close — not depth-aware. Stash a
970        // bogus inner `<script>` inside a JS string; the outer block
971        // still closes at the first `</script>`.
972        let input = "<script>\nlet x = '<script>';\n</script>\n";
973        let lines: Vec<&str> = input.lines().collect();
974        let mut builder = GreenNodeBuilder::new();
975
976        // is_commonmark = true.
977        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
978        let new_pos = parse_html_block_with_wrapper(
979            &mut builder,
980            &lines,
981            0,
982            block_type,
983            0,
984            SyntaxKind::HTML_BLOCK,
985        );
986        // Three lines, closed at first `</script>` (line 2). new_pos = 3.
987        assert_eq!(new_pos, 3);
988    }
989
990    #[test]
991    fn test_commonmark_type6_blank_line_terminates() {
992        let input = "<div>\nfoo\n\nbar\n";
993        let lines: Vec<&str> = input.lines().collect();
994        let mut builder = GreenNodeBuilder::new();
995
996        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
997        let new_pos = parse_html_block_with_wrapper(
998            &mut builder,
999            &lines,
1000            0,
1001            block_type,
1002            0,
1003            SyntaxKind::HTML_BLOCK,
1004        );
1005
1006        // Block contains <div>\nfoo\n; stops at blank line (line 2).
1007        assert_eq!(new_pos, 2);
1008    }
1009}