Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10    ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::attributes::emit_html_attrs_node;
13use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
14
/// HTML block-level tag names as defined by the CommonMark spec (§4.6,
/// type 6). A tag from this list starts an HTML block when found at the
/// start of a line.
///
/// Entries are lowercase ASCII; lookups lowercase the candidate name
/// first (see [`is_html_block_tag_name`]). The type-1 verbatim tags
/// (`pre`, `script`, `style`, `textarea`) are kept in [`VERBATIM_TAGS`]
/// and are intentionally absent here.
///
/// NOTE(review): the list contains `source` and has no `search` entry,
/// which looks like the CommonMark 0.30 tag set — confirm which spec
/// revision the parser targets before adding or removing names.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
81
/// Tags that contain raw/verbatim content (no Markdown processing inside).
/// These are the tag names of CommonMark §4.6 type-1 starts (`<pre`,
/// `<script`, `<style`, `<textarea`). Entries are lowercase ASCII and are
/// compared against an already-lowercased tag name.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
84
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`).
///
/// Unlike [`BLOCK_TAGS`], this list also carries the verbatim tags
/// (`pre`, `script`, `style`, `textarea`) that [`VERBATIM_TAGS`] names.
///
/// Pandoc's `eitherBlockOrInline` set is tracked separately, split in two:
/// the non-void members (`audio`, `button`, `iframe`, `noscript`,
/// `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`, `applet`)
/// live in [`PANDOC_INLINE_BLOCK_TAGS`], and the void members (`embed`,
/// `area`, `source`, `track`) live in [`PANDOC_VOID_BLOCK_TAGS`]. The
/// verbatim `script` is listed here instead. Those either-block-or-inline
/// tags act as block starters at fresh-block positions but stay inline
/// inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries are lowercase ASCII; lookups lowercase the candidate first.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
162
163/// Whether `name` (case-insensitive) is one of the HTML block-level tags
164/// recognized by CommonMark §4.6 type-6.
165pub fn is_html_block_tag_name(name: &str) -> bool {
166    let lower = name.to_ascii_lowercase();
167    BLOCK_TAGS.contains(&lower.as_str())
168}
169
170/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
171/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
172/// Used by the pandoc-native projector's `split_html_block_by_tags` to
173/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
174/// the block — block-level tags emit as separate `RawBlock` entries;
175/// inline tags stay inline in the surrounding `Plain` content.
176pub fn is_pandoc_block_tag_name(name: &str) -> bool {
177    let lower = name.to_ascii_lowercase();
178    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
179}
180
/// Non-void, non-verbatim members of pandoc's `eitherBlockOrInline` set
/// (mirrors `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags
/// that `isBlockTag` accepts as block starters but `isInlineTag` ALSO
/// accepts (because `name ∉ blockTags`). At top level (or after a blank
/// line) pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock,
/// but inside an existing HTML block once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Two groups of `eitherBlockOrInline` tags are deliberately kept out of
/// this list:
///
/// * Void elements (`area`, `embed`, `source`, `track`) live in
///   [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
///   rule as non-void inline-block tags but emit a single RawBlock per
///   instance instead of a matched-pair lift.
/// * `script` is omitted because it is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and the strict-block check
///   fires first regardless.
///
/// Entries are lowercase ASCII; lookups lowercase the candidate first.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
205
206/// Whether `name` (case-insensitive) is one of pandoc's
207/// `eitherBlockOrInline` tags (excluding void elements and `script`;
208/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
209pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
210    let lower = name.to_ascii_lowercase();
211    PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
212}
213
/// The void elements that pandoc classifies as `eitherBlockOrInline`
/// (mirrors `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`):
/// `area`, `embed`, `source` and `track`. The remaining HTML void
/// elements are handled elsewhere and are not listed: `br`, `wbr`, `img`
/// and `input` are inline-only.
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Entries are lowercase ASCII; lookups lowercase the candidate first.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
229
230/// Whether `name` (case-insensitive) is one of pandoc's void
231/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
232pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
233    let lower = name.to_ascii_lowercase();
234    PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
235}
236
237/// Whether the given tag name is eligible for the Phase 6 / Fix #4
238/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
239/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
240/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
241/// and NOT void. These are the tags where pandoc parses the body as
242/// fresh markdown between RawBlock emissions of the open/close tags —
243/// exactly the shape we can lift into structural CST children.
244///
245/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
246/// additional gate at the lift-gate site: the lift is abandoned when
247/// the body's first non-blank content is a void block tag at a
248/// fresh-block position (`<video>\n<source ...>\n</video>` projects
249/// per-tag rather than matched-pair, mirroring pandoc).
250///
251/// `<div>` is intentionally excluded — it has its own lift path
252/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
253/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
254pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
255    let lower = name.to_ascii_lowercase();
256    if VERBATIM_TAGS.contains(&lower.as_str()) {
257        return false;
258    }
259    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
260        return false;
261    }
262    if lower == "div" {
263        return false;
264    }
265    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
266        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
267}
268
269/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
270/// — anything that has an opening and a matching closing form whose
271/// `</tag>` would be recognized by the dispatcher as a separate block
272/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
273/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
274/// Void tags are excluded — they have no close form.
275///
276/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
277/// an open inside the buffer whose close would otherwise interrupt the
278/// list item mid-construct.
279pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
280    let lower = name.to_ascii_lowercase();
281    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
282        return false;
283    }
284    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
285        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
286        || VERBATIM_TAGS.contains(&lower.as_str())
287}
288
289/// Open-tag-attribute tokenization gate for non-div strict-block tags
290/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
291/// open tag is eligible for finer-grained tokenization
292/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
293/// driving the full body lift — that's the `bq_clean_lift` path. The
294/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
295/// the salsa anchor index.
296///
297/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
298/// wrapper) regardless of bq depth, so this gate skips it.
299fn bq_strict_attr_emit_tag_name(
300    wrapper_kind: SyntaxKind,
301    block_type: &HtmlBlockType,
302    bq_depth: usize,
303) -> Option<&str> {
304    if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
305        return None;
306    }
307    match block_type {
308        HtmlBlockType::BlockTag {
309            tag_name,
310            is_verbatim: false,
311            closed_by_blank_line: false,
312            depth_aware: true,
313            closes_at_open_tag: false,
314            is_closing: false,
315        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
316        _ => None,
317    }
318}
319
/// Information about a detected HTML block opening, as produced by
/// [`try_parse_html_block_start`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Tag-shaped block start (CommonMark types 6/1 and their Pandoc
    /// counterparts). Under CommonMark the tag comes from `BLOCK_TAGS` or
    /// `VERBATIM_TAGS`; under Pandoc it comes from `PANDOC_BLOCK_TAGS`,
    /// `PANDOC_INLINE_BLOCK_TAGS`, `PANDOC_VOID_BLOCK_TAGS` or
    /// `VERBATIM_TAGS`. The flags below select how the end of the block
    /// is found.
    BlockTag {
        /// Lowercased tag name, without `<`, `/` or attributes.
        tag_name: String,
        /// Whether the tag is one of `VERBATIM_TAGS` (raw content, no
        /// Markdown processing inside).
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics (block ends at blank
        /// line); otherwise the legacy "ends at matching `</tag>`"
        /// semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored
        /// when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to end-of-input
        /// — and for standalone closing forms under Pandoc.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror
        /// pandoc's `isInlineTag` special cases (e.g. `</script>` is
        /// inline even when `<script>` is not — pandoc treats the
        /// close-form as always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
364
365/// Try to detect an HTML block opening from content.
366/// Returns block type if this is a valid HTML block start.
367///
368/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
369/// accept closing tags (`</div>`), type-6 blocks end at the next blank
370/// line (rather than a matching close tag), and type 7 is recognized.
371pub(crate) fn try_parse_html_block_start(
372    content: &str,
373    is_commonmark: bool,
374) -> Option<HtmlBlockType> {
375    let trimmed = strip_leading_spaces(content);
376
377    // Must start with <
378    if !trimmed.starts_with('<') {
379        return None;
380    }
381
382    // HTML comment
383    if trimmed.starts_with("<!--") {
384        return Some(HtmlBlockType::Comment);
385    }
386
387    // Processing instruction
388    if trimmed.starts_with("<?") {
389        return Some(HtmlBlockType::ProcessingInstruction);
390    }
391
392    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
393    // recognize bare CDATA as a raw HTML block; the literal bytes fall
394    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
395    // text is parsed as inline markdown, etc).
396    if is_commonmark && trimmed.starts_with("<![CDATA[") {
397        return Some(HtmlBlockType::CData);
398    }
399
400    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
401    // does not recognize bare declarations as raw HTML blocks (its
402    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
403    // tag-shaped blocks); the bytes fall through to paragraph parsing.
404    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
405        let after_bang = &trimmed[2..];
406        if after_bang.chars().next()?.is_ascii_alphabetic() {
407            return Some(HtmlBlockType::Declaration);
408        }
409    }
410
411    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
412    // Pandoc-native recognizes standalone closing forms of strict-block tags
413    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
414    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
415    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
416    // the open-tag line via `closes_at_open_tag: true`.
417    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
418        let tag_lower = tag_name.to_lowercase();
419        let is_closing = trimmed.starts_with("</");
420
421        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
422        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
423        // Unlike inline-block / void closes, these CAN interrupt a running
424        // paragraph (the dispatcher's `cannot_interrupt` only covers the
425        // inline-block / void categories). Inline-block / void closes are
426        // handled by their own branches further below.
427        if !is_commonmark
428            && is_closing
429            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
430                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
431            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
432            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
433        {
434            return Some(HtmlBlockType::BlockTag {
435                tag_name: tag_lower,
436                is_verbatim: false,
437                closed_by_blank_line: false,
438                depth_aware: false,
439                closes_at_open_tag: true,
440                is_closing: true,
441            });
442        }
443
444        // Under Pandoc, remaining closing forms (truly inline-only tags like
445        // `</em>`, `</span>`) are not block starts — fall through to the
446        // existing inline-html path. Inline-block + void closes are caught
447        // by the dedicated branches further below.
448        if !is_commonmark
449            && is_closing
450            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
451            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
452        {
453            return None;
454        }
455
456        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
457        // membership: pandoc's `blockHtmlTags` (see
458        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
459        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
460        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
461        let is_block_tag = if is_commonmark {
462            BLOCK_TAGS.contains(&tag_lower.as_str())
463        } else {
464            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
465        };
466        if is_block_tag {
467            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
468            return Some(HtmlBlockType::BlockTag {
469                tag_name: tag_lower,
470                is_verbatim,
471                closed_by_blank_line: is_commonmark && !is_verbatim,
472                depth_aware: !is_commonmark,
473                closes_at_open_tag: false,
474                is_closing,
475            });
476        }
477
478        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
479        // starters at fresh-block positions. The block dispatcher caller
480        // gates these as `cannot_interrupt` (mirrors pandoc — they never
481        // interrupt a running paragraph; only start a fresh block when
482        // following a blank line or at document start). Closing forms
483        // (`</video>`) emit as a single-line `RawBlock` with no balanced
484        // match — pandoc-native pins this for standalone closes.
485        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
486            return Some(HtmlBlockType::BlockTag {
487                tag_name: tag_lower,
488                is_verbatim: false,
489                closed_by_blank_line: false,
490                depth_aware: !is_closing,
491                closes_at_open_tag: is_closing,
492                is_closing,
493            });
494        }
495
496        // Pandoc dialect also recognizes the void subset of
497        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
498        // These have no closing tag, so the parser closes the block
499        // immediately on the open-tag line; the projector's
500        // `split_html_block_by_tags` handles the same-line splitting
501        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
502        // non-void inline-block tags, void tags never interrupt a
503        // running paragraph (gated as `cannot_interrupt` in the
504        // dispatcher). Closing forms (`</embed>`) — semantically
505        // nonsensical for void elements — pandoc still emits as a
506        // single-line `RawBlock`; mirror that.
507        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
508            return Some(HtmlBlockType::BlockTag {
509                tag_name: tag_lower,
510                is_verbatim: false,
511                closed_by_blank_line: false,
512                depth_aware: false,
513                closes_at_open_tag: true,
514                is_closing,
515            });
516        }
517
518        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
519        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
520        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
521        // do not start a type-1 block. Letting `</pre>` through here would
522        // wrongly interrupt a paragraph.
523        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
524            return Some(HtmlBlockType::BlockTag {
525                tag_name: tag_lower,
526                is_verbatim: true,
527                closed_by_blank_line: false,
528                depth_aware: !is_commonmark,
529                closes_at_open_tag: false,
530                is_closing: false,
531            });
532        }
533    }
534
535    // Type 7 (CommonMark only): complete open or close tag on a line by
536    // itself, tag name not in the type-1 verbatim list.
537    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
538    {
539        let rest = &trimmed[end..];
540        let only_ws = rest
541            .bytes()
542            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
543        if only_ws {
544            // Reject if the tag name belongs to the type-1 verbatim set
545            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
546            // type-1 starts above, so seeing one here means the opener
547            // had a different shape (e.g. `<pre/>` self-closing) that
548            // shouldn't trigger type 7 either. Conservatively skip.
549            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
550            let name_end = leading
551                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
552                .unwrap_or(leading.len());
553            let name = leading[..name_end].to_ascii_lowercase();
554            if !VERBATIM_TAGS.contains(&name.as_str()) {
555                return Some(HtmlBlockType::Type7);
556            }
557        }
558    }
559
560    None
561}
562
/// Pulls the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. Opening forms (`<tag>`) are always
/// considered; closing forms (`</tag>`) only when `accept_closing` is
/// true (CommonMark §4.6 type 6 allows either). Per the spec the name
/// must be followed by a space, tab, line ending, `>`, or `/>`; that is
/// approximated by ending the name at the first whitespace, `>` or `/`
/// (or at end of input).
///
/// Returns the name exactly as written (case preserved), or `None` when
/// `text` does not start with `<`, a closing form is not accepted, the
/// name is empty, its first character is not an ASCII letter, or any
/// character is not ASCII alphanumeric.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    // The name runs up to the first delimiter, or to the end of input.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    // An empty name fails the first-letter check below.
    let mut chars = name.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    if starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric()) {
        Some(name.to_string())
    } else {
        None
    }
}
607
608/// Whether this block type ends at a blank line (CommonMark types 6 & 7
609/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
610/// marker — only at end of input or the next blank line.
611fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
612    matches!(
613        block_type,
614        HtmlBlockType::Type7
615            | HtmlBlockType::BlockTag {
616                closed_by_blank_line: true,
617                ..
618            }
619    )
620}
621
622/// Check if a line contains the closing marker for the given HTML block type.
623/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
624/// blank-line-terminated types (6 in CommonMark, 7) never match here.
625fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
626    match block_type {
627        HtmlBlockType::Comment => line.contains("-->"),
628        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
629        HtmlBlockType::Declaration => line.contains('>'),
630        HtmlBlockType::CData => line.contains("]]>"),
631        HtmlBlockType::BlockTag {
632            tag_name,
633            closed_by_blank_line: false,
634            ..
635        } => {
636            // Look for closing tag </tagname>
637            let closing_tag = format!("</{}>", tag_name);
638            line.to_lowercase().contains(&closing_tag)
639        }
640        HtmlBlockType::BlockTag {
641            closed_by_blank_line: true,
642            ..
643        }
644        | HtmlBlockType::Type7 => false,
645    }
646}
647
/// Counts occurrences of `<tag_name ...>` (open) and `</tag_name>` (close)
/// in `line`, returned as `(opens, closes)`.
///
/// The tag name is compared ASCII-case-insensitively and must be followed
/// by a space, tab, line ending, `>`, `/`, or the end of `line`.
/// Self-closing forms (`<tag .../>`) and tags whose name appears inside a
/// quoted attribute value are NOT counted — the scanner walks `<...>`
/// brackets and respects `"`/`'` quoting.
///
/// A `<` that cannot begin markup — one not followed by an ASCII letter,
/// `/`, `!` or `?`, as in `a < b` — is treated as literal text. Without
/// this, the scan to the next `>` would swallow the real tag that follows
/// (`<div>a < b</div>` would report no close).
///
/// A matching open tag whose `>` is not on this line still counts (the
/// tag continues on a later line); scanning stops there.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let tag_bytes = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;

        // Only `<` followed by a letter, `/`, `!` or `?` opens markup;
        // anything else (including end of line) is a literal `<`.
        let starts_markup = matches!(
            bytes.get(after).copied(),
            Some(b) if b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?')
        );
        if !starts_markup {
            i = after;
            continue;
        }

        // `starts_markup` guarantees `after` is in bounds.
        let is_close = bytes[after] == b'/';
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag_bytes));
        // The name must end here, otherwise `<divx>` would match `div`.
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if matched && is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to count.
            break;
        }
    }

    (opens, closes)
}
728
729/// Pandoc-dialect lift for HTML comments / processing instructions
730/// whose close marker is followed by additional bytes (same-line
731/// trailing or following lines). Pandoc-native emits a `RawBlock` for
732/// the marker bytes only, then parses the remainder as fresh blocks.
733///
734/// Returns `Some(consumed_lines)` when the split fires (caller must
735/// NOT enter the legacy emission); `None` to fall back to the legacy
736/// path (no close marker found, or no trailing content to split).
737///
738/// CST shape on success:
739/// ```text
740/// HTML_BLOCK
741///   HTML_BLOCK_TAG (open)        // line[0] up to and incl close marker
742///     TEXT  "<!-- hi -->"        // or with HTML_BLOCK_CONTENT in between
743///     ...                        // for multi-line `<!--\n…\n-->` shape
744/// <sibling blocks>               // recursive parse of trailing + lines[M+1..]
745/// ```
746fn try_parse_comment_pi_with_trailing_split(
747    builder: &mut GreenNodeBuilder<'static>,
748    lines: &[&str],
749    start_pos: usize,
750    block_type: &HtmlBlockType,
751    wrapper_kind: SyntaxKind,
752    bq_depth: usize,
753    config: &ParserOptions,
754) -> Option<usize> {
755    let marker: &str = match block_type {
756        HtmlBlockType::Comment => "-->",
757        HtmlBlockType::ProcessingInstruction => "?>",
758        _ => return None,
759    };
760
761    // Find the close marker in the bq-stripped line content. For
762    // bq_depth == 0 the inner content equals the raw line; for
763    // bq_depth > 0 we look past the `>` markers stripped by the
764    // outer dispatcher (line 0) and emitted as bq prefix below
765    // (lines > 0). `marker_end_in_inner` is the byte offset of the
766    // first byte AFTER the close marker, measured from the start
767    // of the inner (post-strip) content.
768    let mut close_line_idx: Option<usize> = None;
769    let mut marker_end_in_inner: usize = 0;
770    for (offset, line) in lines[start_pos..].iter().enumerate() {
771        let inner = if bq_depth > 0 {
772            strip_n_blockquote_markers(line, bq_depth)
773        } else {
774            line
775        };
776        if let Some(pos) = inner.find(marker) {
777            close_line_idx = Some(start_pos + offset);
778            marker_end_in_inner = pos + marker.len();
779            break;
780        }
781    }
782    let close_line_idx = close_line_idx?;
783    let close_line = lines[close_line_idx];
784    let close_inner = if bq_depth > 0 {
785        strip_n_blockquote_markers(close_line, bq_depth)
786    } else {
787        close_line
788    };
789    let close_prefix_len = close_line.len() - close_inner.len();
790    let trailing = &close_inner[marker_end_in_inner..];
791
792    // Only fire when there is non-whitespace content AFTER the close
793    // marker on the close line. The legacy path correctly handles
794    // the close-line-ends-at-close-marker shapes (`-->\n` followed
795    // by separate blocks); only the same-line-trailing case needs
796    // structural splitting. Trailing-whitespace-only handling
797    // (`-->   \n`) is a projector-side trim — separate concern.
798    let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
799    if !has_non_ws_trailing {
800        return None;
801    }
802
803    builder.start_node(wrapper_kind.into());
804
805    // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
806    // middle `HTML_BLOCK_CONTENT` lines between open and close. The
807    // close `HTML_BLOCK_TAG` carries only the bytes up to and
808    // including the close marker — trailing bytes go to the sibling.
809    if close_line_idx == start_pos {
810        // Same-line shape: one HTML_BLOCK_TAG containing the close
811        // marker's bytes. The newline lives on the trailing sibling.
812        // Line 0's bq prefix (if any) was already emitted by the
813        // outer dispatcher; emit only the inner marker bytes.
814        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
815        let close_part = &close_inner[..marker_end_in_inner];
816        if !close_part.is_empty() {
817            builder.token(SyntaxKind::TEXT.into(), close_part);
818        }
819        builder.finish_node();
820    } else {
821        // Multi-line shape: open tag covers lines[start_pos..close],
822        // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
823        // only the marker bytes. Line 0's bq prefix was emitted by
824        // the outer dispatcher; subsequent lines (middle + close)
825        // need bq prefix re-emission inside the wrapper.
826        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
827        let first_line = lines[start_pos];
828        let first_inner = if bq_depth > 0 {
829            strip_n_blockquote_markers(first_line, bq_depth)
830        } else {
831            first_line
832        };
833        let (line_no_nl, nl) = strip_newline(first_inner);
834        if !line_no_nl.is_empty() {
835            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
836        }
837        if !nl.is_empty() {
838            builder.token(SyntaxKind::NEWLINE.into(), nl);
839        }
840        builder.finish_node();
841
842        if close_line_idx > start_pos + 1 {
843            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
844            for content_line in &lines[start_pos + 1..close_line_idx] {
845                emit_html_block_line(builder, content_line, bq_depth);
846            }
847            builder.finish_node();
848        }
849
850        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
851        if bq_depth > 0 && close_prefix_len > 0 {
852            emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
853        }
854        let close_part = &close_inner[..marker_end_in_inner];
855        if !close_part.is_empty() {
856            builder.token(SyntaxKind::TEXT.into(), close_part);
857        }
858        builder.finish_node();
859    }
860
861    builder.finish_node(); // HTML_BLOCK
862
863    // Recursively parse JUST the trailing bytes on the close line
864    // and graft top-level children as siblings of the HTML_BLOCK we
865    // just closed. We do NOT consume subsequent lines here — the
866    // outer dispatcher continues from `close_line_idx + 1` and
867    // handles container-boundary lines (`:::` div closes, blockquote
868    // markers, list-marker continuations) correctly. Multi-line
869    // softbreak continuation (`<!-- --> trailing\nmore\n` →
870    // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
871    // outer dispatcher sees `more` after the close line and starts
872    // a fresh paragraph. Refdefs flow through from the outer config
873    // (same pattern as `emit_html_block_body_lifted_inner`).
874    if !trailing.is_empty() {
875        let mut inner_options = config.clone();
876        let refdefs = config.refdef_labels.clone().unwrap_or_default();
877        inner_options.refdef_labels = Some(refdefs.clone());
878        let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
879        let mut bq = None;
880        graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
881    }
882
883    Some(close_line_idx + 1)
884}
885
886/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
887/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
888/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
889/// identical to the source either way; only the wrapper retag changes.
890pub(crate) fn parse_html_block_with_wrapper(
891    builder: &mut GreenNodeBuilder<'static>,
892    lines: &[&str],
893    start_pos: usize,
894    block_type: HtmlBlockType,
895    prefix: &ContainerPrefix,
896    wrapper_kind: SyntaxKind,
897    config: &ParserOptions,
898) -> usize {
899    let bq_depth = prefix.bq_depth();
900    // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
901    // closes the RawBlock at the close marker (`-->` / `?>`) and parses
902    // any subsequent bytes (same-line trailing or following lines) as
903    // fresh blocks. The legacy path absorbs them into the HTML block
904    // wrapper, producing one oversized RawBlock. Handle the split here
905    // before entering the legacy emission so the CST encodes the
906    // sibling structure.
907    if config.dialect == crate::options::Dialect::Pandoc
908        && matches!(
909            block_type,
910            HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
911        )
912        && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
913            builder,
914            lines,
915            start_pos,
916            &block_type,
917            wrapper_kind,
918            bq_depth,
919            config,
920        )
921    {
922        return consumed;
923    }
924
925    // Start HTML block
926    builder.start_node(wrapper_kind.into());
927
928    let first_line = lines[start_pos];
929    let blank_terminated = ends_at_blank_line(&block_type);
930
931    // The block dispatcher has already emitted the bq prefix tokens for
932    // the first line; emit only the inner content as TEXT to keep the
933    // CST byte-equal to the source. List-marker bytes are stripped only
934    // when this dispatch fires on a list-marker line — for
935    // continuation-line dispatches (the much more common case) the
936    // leading indent is inner content, not upstream-emitted prefix.
937    let first_inner = prefix.strip_line_0_for_emission(first_line);
938
939    // Detect a multi-line open tag.
940    // - `<div>` (Pandoc lift): we tokenize each line structurally so the
941    //   salsa anchor walk picks up `id` from the HTML_ATTRS region.
942    // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
943    //   `<section>`, `<header>`, …): same structural emission, exposing
944    //   `id` to the salsa anchor walk and enabling the body lift below.
945    // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
946    //   without this, the parser closes the block after line 0 and the
947    //   remainder of the open tag falls into following paragraphs;
948    //   pandoc-native treats the whole multi-line open tag as a single
949    //   `RawBlock`. Emission for void tags uses simple per-line
950    //   TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
951    //   from void tags).
952    let multiline_open_end = match (wrapper_kind, &block_type) {
953        (SyntaxKind::HTML_BLOCK_DIV, _) => {
954            find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
955        }
956        (
957            _,
958            HtmlBlockType::BlockTag {
959                tag_name,
960                closes_at_open_tag: true,
961                ..
962            },
963        ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
964        (
965            _,
966            HtmlBlockType::BlockTag {
967                tag_name,
968                is_verbatim: false,
969                closed_by_blank_line: false,
970                depth_aware: true,
971                closes_at_open_tag: false,
972                is_closing: false,
973            },
974        ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
975            find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
976        }
977        _ => None,
978    };
979
980    // Set up depth-aware close tracking when the block type asks for it
981    // (Pandoc dialect, balanced same-name tag matching). A `None` means
982    // we fall back to the legacy "first matching close" path via
983    // `is_closing_marker`. Computed up front so the lift-mode gate
984    // below can decide whether the open line already balances the
985    // block (same-line `<div>...</div>`).
986    let depth_aware_tag: Option<String> = match &block_type {
987        HtmlBlockType::BlockTag {
988            tag_name,
989            closed_by_blank_line: false,
990            depth_aware: true,
991            ..
992        } => Some(tag_name.clone()),
993        _ => None,
994    };
995    let mut depth: i64 = 1;
996    if let Some(tag_name) = &depth_aware_tag {
997        // Sum opens/closes across all open-tag lines (single-line: just
998        // line 0; multi-line: lines 0..=end_line_idx).
999        let last_open_line = multiline_open_end.unwrap_or(start_pos);
1000        let mut opens = 0usize;
1001        let mut closes = 0usize;
1002        for line in &lines[start_pos..=last_open_line] {
1003            let inner = prefix.strip(line);
1004            let (o, c) = count_tag_balance(inner, tag_name);
1005            opens += o;
1006            closes += c;
1007        }
1008        depth = opens as i64 - closes as i64;
1009    }
1010
1011    // Same-line `<div>foo</div>` shape: the open line balances the
1012    // block under depth-aware tracking. We can lift this structurally
1013    // only when the open-tag trailing has exactly one `</div>` close,
1014    // zero `<div>` opens, and no non-whitespace content after the
1015    // close. Other same-line shapes (nested, trailing text, malformed)
1016    // fall through to the byte-reparse path.
1017    let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1018        && multiline_open_end.is_none()
1019        && depth_aware_tag.is_some()
1020        && depth <= 0;
1021    let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1022        let (line_without_newline, _) = strip_newline(first_inner);
1023        probe_same_line_lift(line_without_newline, "div")
1024    };
1025
1026    // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1027    // `<nav>`, …): the body parses as fresh markdown between RawBlock
1028    // emissions of the open/close tags. Covers the clean multi-line
1029    // shape (open tag stands alone on its line), open-trailing
1030    // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1031    // and same-line (`<form>foo</form>`). Multi-line open and
1032    // blockquote-wrapped non-div shapes still fall through to the
1033    // byte-walker path.
1034    let strict_block_tag_name: Option<&str> =
1035        if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1036            match &block_type {
1037                HtmlBlockType::BlockTag {
1038                    tag_name,
1039                    is_verbatim: false,
1040                    closed_by_blank_line: false,
1041                    depth_aware: true,
1042                    closes_at_open_tag: false,
1043                    is_closing: false,
1044                } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1045                _ => None,
1046            }
1047        } else {
1048            None
1049        };
1050    // Same-line `<form>foo</form>` shape: the open line already
1051    // balances the block (`depth <= 0`). Lift only when the trailing
1052    // bytes after the open `>` end with `</tag>` and contain exactly
1053    // one close + zero nested opens.
1054    let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1055        multiline_open_end.is_none() && depth <= 0 && {
1056            let (line_no_nl, _) = strip_newline(first_inner);
1057            probe_same_line_lift(line_no_nl, name)
1058        }
1059    });
1060    // Strict-block lift gate: accept (a) a multi-line open tag spanning
1061    // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1062    // trailing single-line open (depth > 0, open `>` is present with
1063    // quote-aware matching), or (c) a safe same-line shape. For
1064    // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1065    // …) the lift additionally abandons when the body starts at a
1066    // fresh-block position with a void block tag — pandoc-native pins
1067    // per-tag emission rather than a matched-pair lift in that case.
1068    let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1069        let (line_no_nl, _) = strip_newline(first_inner);
1070        let shape_ok = if multiline_open_end.is_some() {
1071            // `find_multiline_open_end` already verified the open tag
1072            // closes with a quote-aware `>` somewhere in lines
1073            // `start_pos+1..=end`. No same-line trailing content to
1074            // probe; defer trailing-on-close-`>`-line handling to a
1075            // future session (rare in practice).
1076            true
1077        } else if depth > 0 {
1078            probe_open_tag_line_has_close_gt(line_no_nl, name)
1079        } else {
1080            same_line_strict_lift_safe
1081        };
1082        if !shape_ok {
1083            return false;
1084        }
1085        if !is_pandoc_inline_block_tag_name(name) {
1086            return true;
1087        }
1088        !inline_block_void_interior_abandons(
1089            first_inner,
1090            lines,
1091            start_pos,
1092            multiline_open_end,
1093            bq_depth,
1094            name,
1095        )
1096    });
1097
1098    // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1099    // are byte-equal to the non-bq same-line shape minus the leading
1100    // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1101    // The body has no inner newlines, so no bq prefix re-injection is
1102    // needed when grafting — `emit_html_block_body_lifted` (passing
1103    // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1104    // open-trailing) still fall through to the projector's byte
1105    // walker — they need per-line prefix injection.
1106    let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1107        && multiline_open_end.is_none()
1108        && depth_aware_tag.is_some()
1109        && depth <= 0
1110    {
1111        let (line_no_nl, _) = strip_newline(first_inner);
1112        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1113            if probe_same_line_lift(line_no_nl, "div") {
1114                Some("div")
1115            } else {
1116                None
1117            }
1118        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1119            match &block_type {
1120                HtmlBlockType::BlockTag {
1121                    tag_name,
1122                    is_verbatim: false,
1123                    closed_by_blank_line: false,
1124                    depth_aware: true,
1125                    closes_at_open_tag: false,
1126                    is_closing: false,
1127                } if is_pandoc_lift_eligible_block_tag(tag_name)
1128                    && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1129                {
1130                    // Inline-block tags (`<video>`, `<iframe>`, …) skip
1131                    // the void-interior check at same-line — the shape
1132                    // has no inner block content to interfere with.
1133                    Some(tag_name.as_str())
1134                }
1135                _ => None,
1136            }
1137        } else {
1138            None
1139        }
1140    } else {
1141        None
1142    };
1143
1144    // Messy-shape lift inside a blockquote — covers open-trailing
1145    // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1146    // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1147    // including the multi-line-open variants (`> <div\n>   id="x">foo\n>
1148    // body\n> </div>`) where the trailing is captured into `pre_content`
1149    // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1150    // The open line does NOT balance the block (depth > 0 after the
1151    // open line, distinguishing this from `same_line_bq_lift_tag` which
1152    // requires depth <= 0). The close line — possibly with leading body
1153    // text — closes the block when depth returns to 0. Body lines (incl.
1154    // open trailing and close leading) graft via prefix re-injection.
1155    let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1156    {
1157        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1158            Some("div")
1159        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1160            match &block_type {
1161                HtmlBlockType::BlockTag {
1162                    tag_name,
1163                    is_verbatim: false,
1164                    closed_by_blank_line: false,
1165                    depth_aware: true,
1166                    closes_at_open_tag: false,
1167                    is_closing: false,
1168                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1169                    // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1170                    // …) abandon the lift when the body starts at a
1171                    // fresh-block position with a void block tag. Same gate
1172                    // as the non-bq matched-pair lift (`strict_block_lift`).
1173                    if is_pandoc_inline_block_tag_name(tag_name)
1174                        && inline_block_void_interior_abandons(
1175                            first_inner,
1176                            lines,
1177                            start_pos,
1178                            multiline_open_end,
1179                            bq_depth,
1180                            tag_name,
1181                        )
1182                    {
1183                        None
1184                    } else {
1185                        Some(tag_name.as_str())
1186                    }
1187                }
1188                _ => None,
1189            }
1190        } else {
1191            None
1192        }
1193    } else {
1194        None
1195    };
1196
1197    // Multi-line open + matched close-on-the-open's-last-line shape inside
1198    // a blockquote (`> <div\n>   id="x">foo</div>` and depth-aware variants:
1199    // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1200    // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1201    // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1202    // close is on the open's last line (`depth <= 0` after the open lines)
1203    // AND `multiline_open_end.is_some()`. The trailing bytes after the
1204    // last `>` get lifted into `pre_content` via
1205    // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1206    // new branch below splits `pre_content` at the matched close marker
1207    // and grafts body + close + any trailing siblings.
1208    let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1209        && multiline_open_end.is_some()
1210        && depth_aware_tag.is_some()
1211        && depth <= 0
1212    {
1213        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1214            Some("div")
1215        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1216            match &block_type {
1217                HtmlBlockType::BlockTag {
1218                    tag_name,
1219                    is_verbatim: false,
1220                    closed_by_blank_line: false,
1221                    depth_aware: true,
1222                    closes_at_open_tag: false,
1223                    is_closing: false,
1224                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1225                    if is_pandoc_inline_block_tag_name(tag_name)
1226                        && inline_block_void_interior_abandons(
1227                            first_inner,
1228                            lines,
1229                            start_pos,
1230                            multiline_open_end,
1231                            bq_depth,
1232                            tag_name,
1233                        )
1234                    {
1235                        None
1236                    } else {
1237                        Some(tag_name.as_str())
1238                    }
1239                }
1240                _ => None,
1241            }
1242        } else {
1243            None
1244        }
1245    } else {
1246        None
1247    };
1248
1249    // Whether this block participates in the Phase 6 structural lift
1250    // (recursively parse body as Pandoc markdown and graft children).
1251    // Covers `<div>` outside blockquote context. For same-line shapes
1252    // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1253    // keep the legacy single-HTML_BLOCK_TAG shape and let the
1254    // byte-reparse path handle projection.
1255    let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1256        && bq_depth == 0
1257        && (!is_same_line_div || same_line_div_lift_safe))
1258        || strict_block_lift
1259        || same_line_bq_lift_tag.is_some()
1260        || bq_messy_lift_tag.is_some()
1261        || bq_multiline_close_lift_tag.is_some();
1262
1263    // Trailing content from the open tag (after `>`). When the lift is
1264    // active and the open line is `<div ATTRS>foo\n`, this captures
1265    // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1266    // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1267    // shapes (same-line / blockquote-wrapped).
1268    let mut pre_content = String::new();
1269
1270    // Emit opening line(s)
1271    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1272
1273    if let Some(end_line_idx) = multiline_open_end {
1274        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1275            emit_multiline_open_tag_with_attrs(
1276                builder,
1277                lines,
1278                start_pos,
1279                end_line_idx,
1280                "div",
1281                bq_depth,
1282                lift_mode,
1283                &mut pre_content,
1284            );
1285        } else if let Some(name) = strict_block_tag_name
1286            && strict_block_lift
1287        {
1288            emit_multiline_open_tag_with_attrs(
1289                builder,
1290                lines,
1291                start_pos,
1292                end_line_idx,
1293                name,
1294                bq_depth,
1295                lift_mode,
1296                &mut pre_content,
1297            );
1298        } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1299        {
1300            // Multi-line open of a lift-eligible strict-block tag inside a
1301            // blockquote (`> <section\n>   id=...>`). The non-bq
1302            // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1303            // covers the bq side so the open tag emits HTML_ATTRS regions
1304            // for `AttributeNode::cast` and the projector's canonicalizer.
1305            //
1306            // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1307            // call below: only push trailing bytes into `pre_content` when
1308            // the structural lift will consume them (bq messy lift). The
1309            // bq clean-lift requires `pre_content.is_empty()`, so for clean
1310            // multi-line opens the trailing is empty anyway and this is
1311            // a no-op.
1312            let lift_trailing =
1313                bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1314            emit_multiline_open_tag_with_attrs(
1315                builder,
1316                lines,
1317                start_pos,
1318                end_line_idx,
1319                name,
1320                bq_depth,
1321                lift_trailing,
1322                &mut pre_content,
1323            );
1324        } else {
1325            emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1326        }
1327    } else {
1328        let (line_without_newline, newline_str) = strip_newline(first_inner);
1329        if !line_without_newline.is_empty() {
1330            // For HTML_BLOCK_DIV, expose the open tag's attributes
1331            // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1332            // via the same descendants walk that handles fenced-div /
1333            // heading attrs. CST bytes stay byte-equal to source — we only
1334            // tokenize at finer granularity for matched div opens.
1335            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1336                let trailing =
1337                    emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1338                if !trailing.is_empty() {
1339                    pre_content.push_str(trailing);
1340                    pre_content.push_str(newline_str);
1341                }
1342            } else if let Some(name) = strict_block_tag_name
1343                && strict_block_lift
1344            {
1345                let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1346                if !trailing.is_empty() {
1347                    pre_content.push_str(trailing);
1348                    pre_content.push_str(newline_str);
1349                }
1350            } else if let Some(name) =
1351                bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1352            {
1353                // Inside a blockquote, lift trailing bytes into
1354                // `pre_content` when either the same-line bq gate fires
1355                // (`> <tag>body</tag>` — handled by `same_line_closed`)
1356                // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1357                // </tag>` and butted-close — handled at the close-marker
1358                // site below). For the clean-shape bq lift the open has
1359                // no trailing bytes regardless, so `lift_trailing=true`
1360                // is a no-op there.
1361                let lift_trailing =
1362                    same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1363                let trailing =
1364                    emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1365                if lift_trailing && !trailing.is_empty() {
1366                    pre_content.push_str(trailing);
1367                    pre_content.push_str(newline_str);
1368                }
1369            } else {
1370                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1371            }
1372        }
1373        // When the open tag has trailing content under lift mode, the
1374        // newline belongs to that trailing line (it terminates the
1375        // synthetic body line, not the open tag). Don't double-emit.
1376        if pre_content.is_empty() && !newline_str.is_empty() {
1377            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1378        }
1379    }
1380
1381    builder.finish_node(); // HtmlBlockTag
1382
1383    // Check if opening line also contains closing marker. Blank-line-terminated
1384    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1385    // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1386    // (`closes_at_open_tag: true`) close immediately — the block always
1387    // ends on the open-tag line since there is no closing tag to find.
1388    let void_block = matches!(
1389        &block_type,
1390        HtmlBlockType::BlockTag {
1391            closes_at_open_tag: true,
1392            ..
1393        }
1394    );
1395    // Void tags with a multi-line open close immediately after the open
1396    // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1397    // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1398    // a single RawBlock for the whole multi-line tag, with no following
1399    // content.
1400    if void_block && let Some(end_line_idx) = multiline_open_end {
1401        log::trace!(
1402            "HTML void block at line {} closes after multi-line open ending at line {}",
1403            start_pos + 1,
1404            end_line_idx + 1
1405        );
1406        builder.finish_node(); // HtmlBlock
1407        return end_line_idx + 1;
1408    }
1409    // Multi-line open with all matched closes on the open's last line:
1410    // `pre_content` holds the bytes after the last open `>` (lifted there
1411    // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1412    // When `depth <= 0` after the multi-line open and the trailing bytes
1413    // contain the depth-zero matched close, do the same-line lift on
1414    // `pre_content` directly. Mirrors the single-line `same_line_closed`
1415    // lift below — same body / close-marker / trailing-graft shape, just
1416    // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1417    //
1418    // The body bytes of `pre_content` come from the open's last line,
1419    // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1420    // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1421    // tag thus inherit the bq context without per-line prefix injection,
1422    // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1423    // both the non-bq and bq variants of this shape.
1424    if let Some(end_line_idx) = multiline_open_end
1425        && !blank_terminated
1426        && depth_aware_tag.is_some()
1427        && depth <= 0
1428        && lift_mode
1429        && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1430        && !pre_content.is_empty()
1431    {
1432        let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1433            Some("div")
1434        } else if strict_block_lift {
1435            strict_block_tag_name
1436        } else if let Some(name) = bq_multiline_close_lift_tag {
1437            Some(name)
1438        } else {
1439            None
1440        };
1441        if let Some(tag_name) = tag_name_opt {
1442            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1443            if let Some((leading, close_part)) =
1444                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1445            {
1446                let close_marker_end =
1447                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1448                let close_marker = &close_part[..close_marker_end];
1449                let same_line_trailing = &close_part[close_marker_end..];
1450                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1451                    LastParaDemote::SkipTrailingBlanks
1452                } else {
1453                    LastParaDemote::OnlyIfLast
1454                };
1455                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1456                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1457                if same_line_trailing.is_empty() {
1458                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1459                    close_line.push_str(close_marker);
1460                    close_line.push_str(post_nl);
1461                    emit_html_block_line(builder, &close_line, 0);
1462                    builder.finish_node();
1463                    builder.finish_node(); // HtmlBlock
1464                } else {
1465                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1466                    builder.finish_node(); // HTML_BLOCK_TAG
1467                    builder.finish_node(); // HtmlBlock
1468
1469                    let mut trailing_text =
1470                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1471                    trailing_text.push_str(same_line_trailing);
1472                    trailing_text.push_str(post_nl);
1473                    let mut inner_options = config.clone();
1474                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1475                    inner_options.refdef_labels = Some(refdefs.clone());
1476                    let inner_root = crate::parser::parse_with_refdefs(
1477                        &trailing_text,
1478                        Some(inner_options),
1479                        refdefs,
1480                    );
1481                    let mut bq = None;
1482                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1483                }
1484                return end_line_idx + 1;
1485            }
1486        }
1487    }
1488
1489    let same_line_closed = !blank_terminated
1490        && multiline_open_end.is_none()
1491        && (void_block
1492            || match &depth_aware_tag {
1493                Some(_) => depth <= 0,
1494                None => is_closing_marker(first_inner, &block_type),
1495            });
1496    if same_line_closed {
1497        log::trace!(
1498            "HTML block at line {} opens and closes on same line",
1499            start_pos + 1
1500        );
1501        // Same-line structural lift (div or non-div strict-block):
1502        // pre_content holds the bytes after the open `>` (including
1503        // the close `</tag>` and the trailing newline). Split into
1504        // body + close tag, emit body via recursive parse, emit close
1505        // tag as a sibling `HTML_BLOCK_TAG`.
1506        let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1507            None
1508        } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1509            Some("div")
1510        } else if same_line_strict_lift_safe {
1511            strict_block_tag_name
1512        } else if let Some(name) = same_line_bq_lift_tag {
1513            // Bq same-line: body has no inner newlines so the standard
1514            // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1515            // sufficient. The bq prefix `> ` lives on the outer
1516            // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1517            Some(name)
1518        } else {
1519            None
1520        };
1521        if let Some(tag_name) = same_line_lift_tag {
1522            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1523            // Depth-aware split: handles `<tag>foo</tag>bar` (single
1524            // close, trailing text), `<tag>foo</tag></tag>` (matched
1525            // close + unmatched trailing close → sibling RawBlock),
1526            // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1527            // recursive body parse).
1528            if let Some((leading, close_part)) =
1529                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1530            {
1531                // `close_part` starts with `</tag` and contains the close
1532                // marker followed by any same-line trailing text. Split
1533                // off the close marker bytes (`</tag>`) so the close
1534                // `HTML_BLOCK_TAG` carries only those bytes; trailing
1535                // text is parsed and grafted as a sibling block at the
1536                // parent level (matches pandoc-native shape:
1537                // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1538                let close_marker_end =
1539                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1540                let close_marker = &close_part[..close_marker_end];
1541                let same_line_trailing = &close_part[close_marker_end..];
1542
1543                // Same-line is always close-butted; div demotes the
1544                // trailing Para→Plain via `SkipTrailingBlanks`.
1545                // Non-div strict-block uses `OnlyIfLast` (consistent
1546                // with butted-close — no trailing BLANK_LINE before
1547                // the close means the trailing Para demotes).
1548                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1549                    LastParaDemote::SkipTrailingBlanks
1550                } else {
1551                    LastParaDemote::OnlyIfLast
1552                };
1553                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1554                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1555                if same_line_trailing.is_empty() {
1556                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1557                    close_line.push_str(close_marker);
1558                    close_line.push_str(post_nl);
1559                    emit_html_block_line(builder, &close_line, 0);
1560                    builder.finish_node();
1561                    builder.finish_node(); // HtmlBlock
1562                } else {
1563                    // Close tag holds only the close-marker bytes;
1564                    // trailing + newline graft as siblings of the
1565                    // wrapper (matches pandoc's per-tag block split).
1566                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1567                    builder.finish_node(); // HTML_BLOCK_TAG
1568                    builder.finish_node(); // HtmlBlock
1569
1570                    let mut trailing_text =
1571                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1572                    trailing_text.push_str(same_line_trailing);
1573                    trailing_text.push_str(post_nl);
1574                    let mut inner_options = config.clone();
1575                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1576                    inner_options.refdef_labels = Some(refdefs.clone());
1577                    let inner_root = crate::parser::parse_with_refdefs(
1578                        &trailing_text,
1579                        Some(inner_options),
1580                        refdefs,
1581                    );
1582                    let mut bq = None;
1583                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1584                }
1585                return start_pos + 1;
1586            }
1587        }
1588        builder.finish_node(); // HtmlBlock
1589        return start_pos + 1;
1590    }
1591
1592    let mut current_pos = multiline_open_end
1593        .map(|end| end + 1)
1594        .unwrap_or(start_pos + 1);
1595    let mut content_lines: Vec<&str> = Vec::new();
1596    let mut found_closing = false;
1597
1598    // Parse content until we find the closing marker
1599    while current_pos < lines.len() {
1600        let line = lines[current_pos];
1601        let (line_bq_depth, inner) = count_blockquote_markers(line);
1602
1603        // Only process lines at the same or deeper blockquote depth
1604        if line_bq_depth < bq_depth {
1605            break;
1606        }
1607
1608        // Blank-line-terminated blocks (types 6/7) end before the blank line.
1609        // The blank line itself is not part of the block.
1610        if blank_terminated && inner.trim().is_empty() {
1611            break;
1612        }
1613
1614        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1615        // count opens/closes of the same tag name and only close when depth
1616        // returns to 0; otherwise fall back to substring-match on the line.
1617        let line_closes = match &depth_aware_tag {
1618            Some(tag_name) => {
1619                let (opens, closes) = count_tag_balance(inner, tag_name);
1620                depth += opens as i64;
1621                depth -= closes as i64;
1622                depth <= 0
1623            }
1624            None => is_closing_marker(inner, &block_type),
1625        };
1626
1627        if line_closes {
1628            log::trace!("Found HTML block closing at line {}", current_pos + 1);
1629            found_closing = true;
1630
1631            // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1632            // the open and close tags stand alone on their source lines
1633            // (no trailing on open, no body content on close after
1634            // stripping bq markers), lift the body lines structurally
1635            // so the projector walks CST children instead of
1636            // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1637            //
1638            // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1639            // grafted, Para preserved), non-div strict-block tags
1640            // (`<form>`, `<section>`, …) and inline-block matched-pair
1641            // tags (`<video>`, `<iframe>`, …) — the latter two under
1642            // HTML_BLOCK with the structural lift hitting pandoc's
1643            // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1644            // demotion. Inline-block additionally bails if the body
1645            // starts at a fresh-block position with a void block tag
1646            // (mirrors the non-bq matched-pair gate).
1647            //
1648            // Other bq-wrapped shapes (butted-close / open-trailing /
1649            // same-line) still fall through to the opaque path.
1650            // Multi-line opens are allowed here as of 2026-05-12: the
1651            // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1652            // per attr line and per-line bq prefix tokens) by the bq-aware
1653            // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1654            // empty for multi-line opens (the emitter writes any trailing
1655            // bytes on the last open line directly as TEXT inside
1656            // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1657            // trailing falls through to the opaque path, matching the non-
1658            // bq deferral.
1659            let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1660                if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1661                    Some("div")
1662                } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1663                    match &block_type {
1664                        HtmlBlockType::BlockTag {
1665                            tag_name,
1666                            is_verbatim: false,
1667                            closed_by_blank_line: false,
1668                            depth_aware: true,
1669                            closes_at_open_tag: false,
1670                            is_closing: false,
1671                        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1672                        _ => None,
1673                    }
1674                } else {
1675                    None
1676                }
1677            } else {
1678                None
1679            };
1680
1681            let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1682                // Open-shape: last open line must end with `>` (clean
1683                // close-of-open). For single-line, that's `first_inner`
1684                // (already bq-stripped); for multi-line, strip bq markers
1685                // from `lines[end_line_idx]` and check the same.
1686                let last_open_line: &str = match multiline_open_end {
1687                    None => first_inner,
1688                    Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1689                        prefix.strip(lines[end])
1690                    }
1691                    Some(end) => lines[end],
1692                };
1693                let (open_no_nl, _) = strip_newline(last_open_line);
1694                if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1695                    return false;
1696                }
1697                let close_stripped = prefix.strip(line);
1698                let (close_no_nl, _) = strip_newline(close_stripped);
1699                if !close_no_nl
1700                    .trim_start_matches([' ', '\t'])
1701                    .starts_with("</")
1702                {
1703                    return false;
1704                }
1705                if is_pandoc_inline_block_tag_name(tag_name)
1706                    && inline_block_void_interior_abandons(
1707                        first_inner,
1708                        lines,
1709                        start_pos,
1710                        multiline_open_end,
1711                        bq_depth,
1712                        tag_name,
1713                    )
1714                {
1715                    return false;
1716                }
1717                true
1718            });
1719
1720            if bq_clean_lift {
1721                let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1722                    LastParaDemote::Never
1723                } else {
1724                    LastParaDemote::OnlyIfLast
1725                };
1726                emit_html_block_body_lifted_bq(
1727                    builder,
1728                    &content_lines,
1729                    prefix,
1730                    demote_policy,
1731                    config,
1732                );
1733                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1734                emit_html_block_line(builder, line, bq_depth);
1735                builder.finish_node();
1736                current_pos += 1;
1737                break;
1738            }
1739
1740            // Bq messy-shape lift — single-line open with trailing or
1741            // butted-close (or both). `pre_content` already captures any
1742            // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1743            // strip the close line's bq markers before splitting so
1744            // `leading` and `close_part` are bq-prefix-free. Body parses
1745            // recursively from `pre_content + stripped(content_lines) +
1746            // leading`, with per-line bq prefixes re-injected so the CST
1747            // stays byte-equal to the source. Demote: div is keyed on
1748            // close-butted-ness (Plain when leading non-empty, Para
1749            // otherwise); non-div uses OnlyIfLast either way.
1750            if let Some(tag_name) = bq_messy_lift_tag {
1751                let close_stripped = prefix.strip(line);
1752                let close_prefix_len = line.len() - close_stripped.len();
1753                let close_prefix = &line[..close_prefix_len];
1754                if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1755                {
1756                    let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1757                        if leading.is_empty() {
1758                            LastParaDemote::Never
1759                        } else {
1760                            LastParaDemote::SkipTrailingBlanks
1761                        }
1762                    } else {
1763                        LastParaDemote::OnlyIfLast
1764                    };
1765                    emit_html_block_body_lifted_bq_messy(
1766                        builder,
1767                        &pre_content,
1768                        &content_lines,
1769                        leading,
1770                        close_prefix,
1771                        prefix,
1772                        policy,
1773                        config,
1774                    );
1775                    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1776                    // When `leading` is empty, no recursive-parse output carries
1777                    // the close line's bq prefix, so emit it here before the
1778                    // close tag. When `leading` is non-empty,
1779                    // `emit_html_block_body_lifted_bq_messy` already injected
1780                    // the prefix at the start of the leading bytes (via the
1781                    // BqPrefixState entry); emitting again would double the
1782                    // prefix bytes and break losslessness.
1783                    if leading.is_empty() {
1784                        emit_bq_prefix_tokens(builder, close_prefix);
1785                    }
1786                    emit_html_block_line(builder, close_part, 0);
1787                    builder.finish_node();
1788                    current_pos += 1;
1789                    break;
1790                }
1791            }
1792
1793            // Under lift mode, try to split the close line into a
1794            // leading "body content" prefix and the close-marker
1795            // remainder using depth-aware matching. Walks at depth 1
1796            // (we're inside the open tag) so nested same-tag opens
1797            // (e.g. `<inner></inner></tag>` style with a nested div)
1798            // are absorbed into the body and parsed recursively, and
1799            // multi-close shapes (`foo</div></div>` on the close line)
1800            // peel off the matched-pair close — the unmatched
1801            // trailing close projects as a sibling `RawBlock` per
1802            // pandoc-native. For `<div>`, non-empty `leading`
1803            // propagates pandoc's `markdown_in_html_blocks` Plain
1804            // demotion rule. For non-div strict-block tags, demotion
1805            // follows pandoc's `OnlyIfLast` rule (demote the trailing
1806            // Para only when no blank line precedes the close).
1807            let close_split_tag = if lift_mode {
1808                if strict_block_lift {
1809                    strict_block_tag_name
1810                } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1811                    Some("div")
1812                } else {
1813                    None
1814                }
1815            } else {
1816                None
1817            };
1818            let (close_no_nl, close_post_nl) = strip_newline(line);
1819            let close_split = close_split_tag
1820                .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1821
1822            if let Some((leading, close_part)) = close_split {
1823                // Close-line leading that is whitespace-only is close-tag
1824                // indentation, not body content (pandoc-native strips it
1825                // from the close RawBlock and treats the close as butted —
1826                // see `   </tag>` shapes). Route those bytes into the
1827                // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1828                // projector strips them; keep the demote policy keyed on
1829                // the original leading so butted-close detection (Plain
1830                // demotion for div, OnlyIfLast for non-div) still fires.
1831                let leading_is_ws_only =
1832                    !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1833                let body_leading = if leading_is_ws_only { "" } else { leading };
1834                let policy = if strict_block_lift {
1835                    LastParaDemote::OnlyIfLast
1836                } else if !leading.is_empty() {
1837                    LastParaDemote::SkipTrailingBlanks
1838                } else {
1839                    LastParaDemote::Never
1840                };
1841                // Split close_part into close-marker bytes (`</tag>`)
1842                // and trailing bytes (e.g. an extra `</div>` for the
1843                // double-close case, or `bar` for trailing text after
1844                // a normal close). Trailing bytes are recursively
1845                // parsed and grafted as siblings of the HTML_BLOCK_DIV
1846                // wrapper.
1847                let close_tag_name = close_split_tag.expect("close_split_tag present");
1848                let close_marker_end =
1849                    split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1850                let close_marker = &close_part[..close_marker_end];
1851                let close_trailing = &close_part[close_marker_end..];
1852
1853                emit_html_block_body_lifted(
1854                    builder,
1855                    &pre_content,
1856                    &content_lines,
1857                    body_leading,
1858                    policy,
1859                    config,
1860                );
1861                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1862                if leading_is_ws_only {
1863                    builder.token(SyntaxKind::WHITESPACE.into(), leading);
1864                }
1865                if close_trailing.is_empty() {
1866                    let mut close_line =
1867                        String::with_capacity(close_marker.len() + close_post_nl.len());
1868                    close_line.push_str(close_marker);
1869                    close_line.push_str(close_post_nl);
1870                    emit_html_block_line(builder, &close_line, 0);
1871                    builder.finish_node();
1872                } else {
1873                    // Close tag holds only the close-marker bytes;
1874                    // trailing + newline graft as siblings.
1875                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1876                    builder.finish_node(); // HTML_BLOCK_TAG
1877                    builder.finish_node(); // HtmlBlock
1878
1879                    let mut trailing_text =
1880                        String::with_capacity(close_trailing.len() + close_post_nl.len());
1881                    trailing_text.push_str(close_trailing);
1882                    trailing_text.push_str(close_post_nl);
1883                    let mut inner_options = config.clone();
1884                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1885                    inner_options.refdef_labels = Some(refdefs.clone());
1886                    let inner_root = crate::parser::parse_with_refdefs(
1887                        &trailing_text,
1888                        Some(inner_options),
1889                        refdefs,
1890                    );
1891                    let mut bq = None;
1892                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1893                    current_pos += 1;
1894                    return current_pos;
1895                }
1896            } else {
1897                emit_html_block_body(
1898                    builder,
1899                    &pre_content,
1900                    &content_lines,
1901                    bq_depth,
1902                    wrapper_kind,
1903                    lift_mode,
1904                    config,
1905                );
1906                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1907                emit_html_block_line(builder, line, bq_depth);
1908                builder.finish_node();
1909            }
1910
1911            current_pos += 1;
1912            break;
1913        }
1914
1915        // Regular content line
1916        content_lines.push(line);
1917        current_pos += 1;
1918    }
1919
1920    // If we didn't find a closing marker, emit what we collected
1921    if !found_closing {
1922        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1923        emit_html_block_body(
1924            builder,
1925            &pre_content,
1926            &content_lines,
1927            bq_depth,
1928            wrapper_kind,
1929            lift_mode,
1930            config,
1931        );
1932    }
1933
1934    builder.finish_node(); // HtmlBlock
1935    current_pos
1936}
1937
1938/// Emit the collected inner content lines for an HTML block.
1939///
1940/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1941/// line `<div>` open outside blockquote), recursively parse the inner
1942/// content (including any open-tag trailing) as Pandoc-flavored
1943/// markdown and graft the resulting top-level blocks as direct children
1944/// of the wrapper. This is the Phase 6 structural lift — the projector
1945/// and downstream consumers (linter, salsa, LSP) can walk the
1946/// structural children instead of re-tokenizing the body bytes.
1947///
1948/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1949/// blockquote, multi-line open, or no content at all — fall through to
1950/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1951///
1952/// CST bytes remain byte-identical to source: the recursive parser is
1953/// lossless on the same byte slice the legacy path would have captured
1954/// as TEXT.
1955fn emit_html_block_body(
1956    builder: &mut GreenNodeBuilder<'static>,
1957    pre_content: &str,
1958    content_lines: &[&str],
1959    bq_depth: usize,
1960    wrapper_kind: SyntaxKind,
1961    lift_mode: bool,
1962    config: &ParserOptions,
1963) {
1964    if pre_content.is_empty() && content_lines.is_empty() {
1965        return;
1966    }
1967    if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1968        // Reached when the parser walked to end-of-input without finding
1969        // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1970        emit_html_block_body_lifted(
1971            builder,
1972            pre_content,
1973            content_lines,
1974            "",
1975            LastParaDemote::Never,
1976            config,
1977        );
1978        return;
1979    }
1980    // Legacy path: opaque TEXT capture. `pre_content` is always empty
1981    // here (lift_mode is the only path that populates it), but be
1982    // defensive — if a trailing prefix snuck in, emit it as TEXT so
1983    // bytes are preserved.
1984    builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1985    if !pre_content.is_empty() {
1986        builder.token(SyntaxKind::TEXT.into(), pre_content);
1987    }
1988    for content_line in content_lines {
1989        emit_html_block_line(builder, content_line, bq_depth);
1990    }
1991    builder.finish_node();
1992}
1993
/// Policy deciding whether the trailing `PARAGRAPH` of a lifted
/// HTML-block body is retagged as `PLAIN` while its children are grafted
/// into the structural CST.
#[derive(Debug, Clone, Copy)]
enum LastParaDemote {
    /// Leave the trailing paragraph alone; it stays a `Para`.
    Never,
    /// Retag the final `PARAGRAPH` child, looking past any `BLANK_LINE`
    /// children that follow it. Chosen for `<div>` shapes whose close tag
    /// sits on the same source line as the paragraph text (pandoc's
    /// `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Retag only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` separates it from the close tag.
    /// Chosen for non-div strict-block tags, whose body is emitted next
    /// to the close-tag `RawBlock`; pandoc keeps the `Para` there when a
    /// blank line intervenes.
    OnlyIfLast,
}
2013
2014/// Lift the HTML-block body into structural CST children: build the
2015/// inner text from `pre_content` + `content_lines` + `post_content`
2016/// (in order), recursively parse it as Pandoc-flavored markdown, and
2017/// graft the resulting top-level blocks into `builder`. `demote_policy`
2018/// controls whether the trailing paragraph is retagged as `PLAIN` to
2019/// encode pandoc's Plain/Para adjacency rules structurally.
2020fn emit_html_block_body_lifted(
2021    builder: &mut GreenNodeBuilder<'static>,
2022    pre_content: &str,
2023    content_lines: &[&str],
2024    post_content: &str,
2025    demote_policy: LastParaDemote,
2026    config: &ParserOptions,
2027) {
2028    emit_html_block_body_lifted_inner(
2029        builder,
2030        pre_content,
2031        content_lines,
2032        post_content,
2033        demote_policy,
2034        config,
2035        &mut None,
2036    )
2037}
2038
2039/// Body-lift variant for `<div>` inside a blockquote. Strips
2040/// `bq_depth` levels of blockquote markers from each `content_line`,
2041/// captures the per-line prefix bytes, and grafts the recursive parse
2042/// with prefix injection so the output CST stays byte-equal to the
2043/// source. `pre_content` and `post_content` must be empty (the bq
2044/// clean lift only handles the shape where the open and close tags
2045/// stand alone on their source lines).
2046fn emit_html_block_body_lifted_bq(
2047    builder: &mut GreenNodeBuilder<'static>,
2048    content_lines: &[&str],
2049    prefix: &ContainerPrefix,
2050    demote_policy: LastParaDemote,
2051    config: &ParserOptions,
2052) {
2053    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2054    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2055    for cl in content_lines {
2056        let (li, bq, inner) = prefix.split(cl);
2057        prefix_lines.push(ContainerPrefixLine {
2058            list_indent: li.to_string(),
2059            bq_prefix: bq.to_string(),
2060        });
2061        stripped_lines.push(inner);
2062    }
2063    let mut state = ContainerPrefixState::new(prefix_lines);
2064    emit_html_block_body_lifted_inner(
2065        builder,
2066        "",
2067        &stripped_lines,
2068        "",
2069        demote_policy,
2070        config,
2071        &mut state,
2072    )
2073}
2074
2075/// Body-lift variant for the bq messy-shape lift — open-trailing,
2076/// butted-close, or both. The open-trailing bytes (if any) sit in
2077/// `pre_content` (line 0 of the body — no bq prefix in source because
2078/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2079/// each carry their own bq prefix. The close line's `leading` (body
2080/// bytes before `</tag>`) sits on the close line, prefixed in source
2081/// by `close_line_prefix` (the bq prefix captured from `line`).
2082///
2083/// Builds `prefixes` so each emitted line in the recursive parse
2084/// output gets the right per-line bq prefix re-injected at line start:
2085/// `pre_content` → empty prefix (no source `> ` precedes it); each
2086/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2087/// Result CST stays byte-equal to source.
2088#[allow(clippy::too_many_arguments)]
2089fn emit_html_block_body_lifted_bq_messy(
2090    builder: &mut GreenNodeBuilder<'static>,
2091    pre_content: &str,
2092    content_lines: &[&str],
2093    leading: &str,
2094    close_line_prefix: &str,
2095    prefix: &ContainerPrefix,
2096    demote_policy: LastParaDemote,
2097    config: &ParserOptions,
2098) {
2099    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2100    if !pre_content.is_empty() {
2101        prefix_lines.push(ContainerPrefixLine::default());
2102    }
2103    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2104    for cl in content_lines {
2105        let (li, bq, inner) = prefix.split(cl);
2106        prefix_lines.push(ContainerPrefixLine {
2107            list_indent: li.to_string(),
2108            bq_prefix: bq.to_string(),
2109        });
2110        stripped_lines.push(inner);
2111    }
2112    if !leading.is_empty() {
2113        // The close line carries its own captured prefix bytes; treat
2114        // them as bq-prefix only (no list-indent split applied) to keep
2115        // the legacy bq-only re-injection behavior for messy-shape
2116        // close-line lifts.
2117        prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2118    }
2119    let mut state = ContainerPrefixState::new(prefix_lines);
2120    emit_html_block_body_lifted_inner(
2121        builder,
2122        pre_content,
2123        &stripped_lines,
2124        leading,
2125        demote_policy,
2126        config,
2127        &mut state,
2128    )
2129}
2130
2131fn emit_html_block_body_lifted_inner(
2132    builder: &mut GreenNodeBuilder<'static>,
2133    pre_content: &str,
2134    content_lines: &[&str],
2135    post_content: &str,
2136    demote_policy: LastParaDemote,
2137    config: &ParserOptions,
2138    bq: &mut Option<ContainerPrefixState>,
2139) {
2140    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2141        return;
2142    }
2143    let mut inner_text = String::with_capacity(
2144        pre_content.len()
2145            + content_lines.iter().map(|s| s.len()).sum::<usize>()
2146            + post_content.len(),
2147    );
2148    inner_text.push_str(pre_content);
2149    for line in content_lines {
2150        inner_text.push_str(line);
2151    }
2152    inner_text.push_str(post_content);
2153
2154    let mut inner_options = config.clone();
2155    let refdefs = config.refdef_labels.clone().unwrap_or_default();
2156    inner_options.refdef_labels = Some(refdefs.clone());
2157    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2158    graft_document_children(builder, &inner_root, demote_policy, bq);
2159}
2160
2161/// Walk a parsed inner document's top-level children and re-emit them
2162/// into `builder`. The document's wrapper node is skipped — only its
2163/// children are grafted.
2164///
2165/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2166/// as `PLAIN` — see [`LastParaDemote`].
2167///
2168/// `bq` is `Some` when grafting a body that lived inside an outer
2169/// container (blockquote, list-item, or both) — token emission then
2170/// injects the captured per-line prefix tokens at line starts so the
2171/// CST stays byte-equal to source. See
2172/// [`super::container_prefix::ContainerPrefixState`].
2173fn graft_document_children(
2174    builder: &mut GreenNodeBuilder<'static>,
2175    doc: &SyntaxNode,
2176    demote_policy: LastParaDemote,
2177    bq: &mut Option<ContainerPrefixState>,
2178) {
2179    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2180
2181    let mut demote_idx: Option<usize> = None;
2182    match demote_policy {
2183        LastParaDemote::Never => {}
2184        LastParaDemote::SkipTrailingBlanks => {
2185            for (i, c) in children.iter().enumerate().rev() {
2186                if let rowan::NodeOrToken::Node(n) = c {
2187                    if n.kind() == SyntaxKind::BLANK_LINE {
2188                        continue;
2189                    }
2190                    if n.kind() == SyntaxKind::PARAGRAPH {
2191                        demote_idx = Some(i);
2192                    }
2193                    break;
2194                }
2195            }
2196        }
2197        LastParaDemote::OnlyIfLast => {
2198            for (i, c) in children.iter().enumerate().rev() {
2199                if let rowan::NodeOrToken::Node(n) = c {
2200                    if n.kind() == SyntaxKind::PARAGRAPH {
2201                        demote_idx = Some(i);
2202                    }
2203                    break;
2204                }
2205            }
2206        }
2207    }
2208
2209    for (i, child) in children.into_iter().enumerate() {
2210        match child {
2211            rowan::NodeOrToken::Node(n) => {
2212                if Some(i) == demote_idx {
2213                    graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2214                } else {
2215                    graft_subtree(builder, &n, bq);
2216                }
2217            }
2218            rowan::NodeOrToken::Token(t) => {
2219                emit_grafted_token(builder, t.kind(), t.text(), bq);
2220            }
2221        }
2222    }
2223}
2224
2225/// Recursively re-emit `node` and its descendants into `builder`.
2226/// Token text is copied verbatim so the result is byte-identical to
2227/// the input span (modulo bq prefix tokens injected at line starts
2228/// when `bq` is `Some`).
2229fn graft_subtree(
2230    builder: &mut GreenNodeBuilder<'static>,
2231    node: &SyntaxNode,
2232    bq: &mut Option<ContainerPrefixState>,
2233) {
2234    graft_subtree_as(builder, node, node.kind(), bq);
2235}
2236
2237/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2238/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2239/// the close-butted demotion rule.
2240fn graft_subtree_as(
2241    builder: &mut GreenNodeBuilder<'static>,
2242    node: &SyntaxNode,
2243    kind: SyntaxKind,
2244    bq: &mut Option<ContainerPrefixState>,
2245) {
2246    builder.start_node(kind.into());
2247    for child in node.children_with_tokens() {
2248        match child {
2249            rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2250            rowan::NodeOrToken::Token(t) => {
2251                emit_grafted_token(builder, t.kind(), t.text(), bq);
2252            }
2253        }
2254    }
2255    builder.finish_node();
2256}
2257
2258/// Emit a single token while optionally injecting blockquote prefix
2259/// tokens at line starts. When `bq` is `None`, this is a plain
2260/// `builder.token()` passthrough.
2261fn emit_grafted_token(
2262    builder: &mut GreenNodeBuilder<'static>,
2263    kind: SyntaxKind,
2264    text: &str,
2265    bq: &mut Option<ContainerPrefixState>,
2266) {
2267    if let Some(state) = bq.as_mut() {
2268        if state.at_line_start {
2269            if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2270                emit_container_prefix_tokens(builder, line_prefix);
2271            }
2272            state.at_line_start = false;
2273        }
2274        builder.token(kind.into(), text);
2275        // `BLANK_LINE` token represents an entirely blank source line —
2276        // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2277        // token as line-ending so the per-line prefix index advances
2278        // correctly.
2279        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2280            state.line_idx += 1;
2281            state.at_line_start = true;
2282        }
2283    } else {
2284        builder.token(kind.into(), text);
2285    }
2286}
2287
2288/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2289/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2290fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2291    for ch in prefix.chars() {
2292        if ch == '>' {
2293            builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2294        } else {
2295            let mut buf = [0u8; 4];
2296            builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2297        }
2298    }
2299}
2300
/// Find the byte index, within `line`, of the `>` that closes the open
/// tag `<tag_name ATTRS>`.
///
/// Leading spaces and tabs are skipped, the tag name is matched ASCII
/// case-insensitively, and a `>` inside a quoted attribute value
/// (`"..."` or `'...'`) does not count. Returns `None` when the line
/// does not start with `<tag_name` or has no unquoted `>` after it.
///
/// Same scan as `probe_open_tag_line_has_close_gt`, but the position is
/// exposed so the caller can slice off the trailing bytes.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let tag = &line.as_bytes()[indent_len..];
    let name_end = 1 + tag_name.len();

    // `<name` must be followed by at least one more byte.
    if tag.len() <= name_end {
        return None;
    }
    if tag[0] != b'<' || !tag[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in tag[name_end..].iter().enumerate() {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
2336
2337/// Whether `slice` begins (after leading ASCII whitespace) with an
2338/// open tag whose name is a Pandoc void block tag (`<source>`,
2339/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2340/// open tags return false.
2341///
2342/// Used by the inline-block matched-pair lift gate: pandoc-native
2343/// abandons the lift when the body's first non-blank content is a
2344/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2345/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2346/// matched-pair lift).
2347fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2348    let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2349    if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2350        return false;
2351    }
2352    let Some(tag_end) = parse_open_tag(trimmed) else {
2353        return false;
2354    };
2355    let bytes = trimmed.as_bytes();
2356    let mut name_end = 1usize;
2357    while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2358    {
2359        name_end += 1;
2360    }
2361    if name_end == 1 {
2362        return false;
2363    }
2364    is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2365}
2366
2367/// Whether the body of an inline-block matched-pair (`<video>...`,
2368/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2369/// a void block tag — the condition under which pandoc-native abandons
2370/// the matched-pair lift. Probes three shapes:
2371///
2372/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2373///   after the open `>` on `first_inner` start with `<source`.
2374/// - **Single-line open + multi-line body**: open-trailing on the open
2375///   line is empty/whitespace AND the first non-blank body line
2376///   (`lines[start_pos+1..]`) starts with a void tag.
2377/// - **Multi-line open**: same body-line scan starting at
2378///   `lines[multiline_open_end+1..]`.
2379///
2380/// Returns `false` when the body begins with text, with a close tag,
2381/// or with a non-void block tag — those cases all proceed with the
2382/// matched-pair lift.
2383fn inline_block_void_interior_abandons(
2384    first_inner: &str,
2385    lines: &[&str],
2386    start_pos: usize,
2387    multiline_open_end: Option<usize>,
2388    bq_depth: usize,
2389    tag_name: &str,
2390) -> bool {
2391    let (line_no_nl, _) = strip_newline(first_inner);
2392    let (body_start_line_idx, open_trailing) = match multiline_open_end {
2393        Some(end) => (end + 1, ""),
2394        None => {
2395            let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2396            let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2397            (start_pos + 1, trailing)
2398        }
2399    };
2400    let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2401    if !trimmed.is_empty() {
2402        return slice_starts_with_void_block_tag(trimmed);
2403    }
2404    for line in &lines[body_start_line_idx..] {
2405        let inner = if bq_depth > 0 {
2406            strip_n_blockquote_markers(line, bq_depth)
2407        } else {
2408            line
2409        };
2410        let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2411        if trimmed.is_empty() {
2412            continue;
2413        }
2414        return slice_starts_with_void_block_tag(trimmed);
2415    }
2416    false
2417}
2418
/// Probe whether the open-tag line carries a valid closing `>` after
/// `<tag_name`.
///
/// Leading spaces and tabs are ignored, the tag name is compared ASCII
/// case-insensitively, and a `>` inside a quoted attribute value
/// (`"..."` or `'...'`) is not accepted. Content after the `>` is
/// admitted (the open-trailing shape `<form>foo`) — the caller is
/// expected to capture that trailing into the structural lift's
/// `pre_content`.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let tag = line.trim_start_matches([' ', '\t']).as_bytes();
    let name_end = 1 + tag_name.len();

    // `<name` plus at least one following byte.
    let starts_with_tag = tag.len() > name_end
        && tag[0] == b'<'
        && tag[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !starts_with_tag {
        return false;
    }

    let mut open_quote: Option<u8> = None;
    tag[name_end..].iter().any(|&byte| match open_quote {
        Some(q) => {
            if byte == q {
                open_quote = None;
            }
            false
        }
        None if byte == b'"' || byte == b'\'' => {
            open_quote = Some(byte);
            false
        }
        None => byte == b'>',
    })
}
2453
2454/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2455/// be lifted structurally. Returns `true` only when:
2456/// - The line starts with `<tag_name` (modulo leading whitespace).
2457/// - The open tag's `>` exists with proper quote handling.
2458/// - The bytes after the open `>` contain a depth-zero matched
2459///   `</tag_name>` close (depth-aware: nested `<tag>` opens
2460///   increment depth; matching is case-insensitive, quote-aware).
2461///
2462/// Trailing bytes after the matched close are accepted and grafted
2463/// as a sibling block by the caller. Examples:
2464/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2465/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2466///   recursively parses to a `RawBlock`).
2467/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2468///   parsed recursively), trailing=`bar`.
2469fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2470    let bytes = line.as_bytes();
2471    let indent_end = bytes
2472        .iter()
2473        .position(|&b| b != b' ' && b != b'\t')
2474        .unwrap_or(bytes.len());
2475    let rest = &line[indent_end..];
2476    let rest_bytes = rest.as_bytes();
2477    let prefix_len = 1 + tag_name.len();
2478    if rest_bytes.len() < prefix_len
2479        || rest_bytes[0] != b'<'
2480        || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2481    {
2482        return false;
2483    }
2484    let after_name = &rest[prefix_len..];
2485    let after_name_bytes = after_name.as_bytes();
2486    let mut i = 0usize;
2487    let mut quote: Option<u8> = None;
2488    let mut gt_idx: Option<usize> = None;
2489    while i < after_name_bytes.len() {
2490        match (quote, after_name_bytes[i]) {
2491            (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2492            (Some(q), b2) if b2 == q => quote = None,
2493            (None, b'>') => {
2494                gt_idx = Some(i);
2495                break;
2496            }
2497            _ => {}
2498        }
2499        i += 1;
2500    }
2501    let Some(gt_idx) = gt_idx else {
2502        return false;
2503    };
2504    let trailing = &after_name[gt_idx + 1..];
2505    // Depth-aware: walk `trailing` (we begin inside the open tag at
2506    // depth 1). Return true iff a matched `</tag>` exists where depth
2507    // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2508    matched_close_offset(trailing, tag_name).is_some()
2509}
2510
/// Walk `trailing` (the bytes after an open `<tag ...>`'s closing `>`)
/// looking for the depth-zero matched `</tag>` close.
///
/// Every `<...>` bracket is scanned to its own `>` with quoted
/// attribute values (`"..."`, `'...'`) respected. A bracket counts as
/// an open/close of `tag_name` when the name matches ASCII
/// case-insensitively and is followed by whitespace, `>`, `/` or the
/// end of input. Depth starts at 1 (we begin inside the open tag);
/// opens increment it unless they are self-closing (`<tag/>`), closes
/// decrement it.
///
/// Returns `Some((close_start, close_end))` where:
/// - `close_start` is the byte offset of `<` in the matched `</tag>`.
/// - `close_end` is one past the matched `>`.
///
/// Returns `None` when no matched close is present (unclosed tag,
/// depth never returns to 0) or when a `<` is never terminated by `>`.
///
/// The name comparison is done in place on the input bytes; no
/// lowercase copies of `trailing` or `tag_name` are allocated.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let bytes = trailing.as_bytes();
    let tag_bytes = tag_name.as_bytes();

    let mut depth: usize = 1;
    let mut cursor = 0usize;

    while let Some(rel) = bytes[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let is_close = bytes.get(lt + 1) == Some(&b'/');
        let name_start = if is_close { lt + 2 } else { lt + 1 };
        let name_end = name_start + tag_bytes.len();

        let name_matches = matches!(
            bytes.get(name_start..name_end),
            Some(name) if name.eq_ignore_ascii_case(tag_bytes)
        );
        let is_our_tag = name_matches
            && matches!(
                bytes.get(name_end).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Scan forward to this bracket's `>`, respecting quoted
        // attribute values.
        let scan_from = if name_matches { name_end } else { lt + 1 };
        let mut quote: Option<u8> = None;
        let mut gt: Option<usize> = None;
        for (pos, &b) in bytes.iter().enumerate().skip(scan_from) {
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    gt = Some(pos);
                    break;
                }
                _ => {}
            }
        }
        // Unterminated `<...` — give up.
        let gt = gt?;

        if is_our_tag {
            if is_close {
                // Depth is at least 1 here: reaching 0 returns below.
                depth -= 1;
                if depth == 0 {
                    return Some((lt, gt + 1));
                }
            } else {
                // Self-closing form (`/>`) does not open a new level.
                let self_close = gt > lt + 1 && bytes[gt - 1] == b'/';
                if !self_close {
                    depth += 1;
                }
            }
        }

        cursor = gt + 1;
    }
    None
}
2593
/// Find where the close marker ends in `close_part`, which is expected
/// to start with `</tag_name` (name matched ASCII case-insensitively).
///
/// Returns `Some(end)` with `end` one past the first `>` that is not
/// inside a quoted run (`"..."` or `'...'`), so the caller can split
/// `close_part` into the close-marker bytes (`</tag>`) and any
/// same-line trailing text. Returns `None` if the `</tag_name` prefix
/// is missing or no such `>` exists — the caller then treats the whole
/// slice as the close marker (no trailing).
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let bytes = close_part.as_bytes();
    let name_end = 2 + tag_name.len();
    let has_close_prefix = bytes.len() >= name_end
        && bytes.starts_with(b"</")
        && bytes[2..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !has_close_prefix {
        return None;
    }

    // Look for the first unquoted `>` after `</tag`.
    let mut open_quote: Option<u8> = None;
    bytes[name_end..]
        .iter()
        .position(|&byte| match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
                false
            }
            None if byte == b'"' || byte == b'\'' => {
                open_quote = Some(byte);
                false
            }
            None => byte == b'>',
        })
        .map(|offset| name_end + offset + 1)
}
2624
2625/// Try to split the close line of an HTML_BLOCK_DIV body into a
2626/// leading content prefix and a clean `</tag>...` remainder. Returns
2627/// `Some((leading, close_part))` only when the line contains exactly
2628/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2629/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2630/// for missing close tags, or for compound shapes the parser
2631/// shouldn't attempt to lift in this pass.
2632///
2633/// `leading` may be empty (close starts at column 0) or pure
2634/// whitespace (close on an indented line). Both count as "butted" per
2635/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2636/// the trailing paragraph inside the div demotes Para→Plain.
2637fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2638    let (opens, closes) = count_tag_balance(line, tag_name);
2639    if opens != 0 || closes != 1 {
2640        return None;
2641    }
2642    // Locate the close tag's opening `<` by lowercased substring search.
2643    // Safe because we've already established (above) that the line has
2644    // exactly one `</tag>` and no `<tag>` opens, so the first match is
2645    // THE close.
2646    let needle = format!("</{}", tag_name);
2647    let lower = line.to_ascii_lowercase();
2648    let close_lt = lower.find(&needle)?;
2649    Some((&line[..close_lt], &line[close_lt..]))
2650}
2651
2652/// Depth-aware variant of `try_split_close_line` used by the same-line
2653/// lift path. Walks `line` starting at depth 1 (we begin inside the
2654/// open `<tag>`) and splits at the byte position where the matched
2655/// `</tag>` close brings depth to 0. Returns `Some((body,
2656/// close_part))` where `body` is the bytes before the matched-close
2657/// start and `close_part` is the bytes from the matched close onward.
2658///
2659/// Unlike `try_split_close_line` this accepts nested same-tag opens
2660/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2661/// body=`<div>x</div>` (a nested div the body lift parses
2662/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2663/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2664/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2665fn try_split_close_line_depth_aware<'a>(
2666    line: &'a str,
2667    tag_name: &str,
2668) -> Option<(&'a str, &'a str)> {
2669    let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2670    Some((&line[..close_start], &line[close_start..]))
2671}
2672
2673/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2674/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2675/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2676/// + TEXT(">") + TEXT(trailing)?`.
2677///
2678/// Bytes are byte-identical to the source — this only tokenizes at finer
2679/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2680/// region structurally. Falls back to a single TEXT token if the line
2681/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2682/// only retags as the lift kind when this shape was matched).
2683///
2684/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2685/// returned as `&str` instead so the caller can splice them into the
2686/// recursive-parse input for the structural body lift. When false
2687/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2688/// empty slice is returned.
2689fn emit_open_tag_tokens<'a>(
2690    builder: &mut GreenNodeBuilder<'static>,
2691    line: &'a str,
2692    tag_name: &str,
2693    lift_trailing: bool,
2694) -> &'a str {
2695    let bytes = line.as_bytes();
2696    // Leading indent (CommonMark allows up to 3 spaces).
2697    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2698    if indent_end > 0 {
2699        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2700    }
2701    let rest = &line[indent_end..];
2702    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2703    let prefix_len = 1 + tag_name.len();
2704    if !rest.starts_with('<')
2705        || rest.len() < prefix_len
2706        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2707    {
2708        builder.token(SyntaxKind::TEXT.into(), rest);
2709        return "";
2710    }
2711    let after_name = &rest[prefix_len..];
2712    let after_name_bytes = after_name.as_bytes();
2713    // Find the closing `>` of the open tag, respecting quoted attribute values.
2714    let mut i = 0usize;
2715    let mut quote: Option<u8> = None;
2716    let mut tag_close: Option<usize> = None;
2717    while i < after_name_bytes.len() {
2718        let b = after_name_bytes[i];
2719        match (quote, b) {
2720            (None, b'"') | (None, b'\'') => quote = Some(b),
2721            (Some(q), b2) if b2 == q => quote = None,
2722            (None, b'>') => {
2723                tag_close = Some(i);
2724                break;
2725            }
2726            _ => {}
2727        }
2728        i += 1;
2729    }
2730    let Some(tag_close) = tag_close else {
2731        // Open tag has no closing `>` on this line — defensive fallback.
2732        builder.token(SyntaxKind::TEXT.into(), rest);
2733        return "";
2734    };
2735    // Whitespace between the tag name and the attribute region.
2736    let attrs_inner = &after_name[..tag_close];
2737    let ws_end = attrs_inner
2738        .as_bytes()
2739        .iter()
2740        .position(|&b| !matches!(b, b' ' | b'\t'))
2741        .unwrap_or(attrs_inner.len());
2742    let leading_ws = &attrs_inner[..ws_end];
2743    // Strip a trailing self-closing slash and the whitespace before it
2744    // from the attribute region; emit them as TEXT outside the
2745    // HTML_ATTRS node so the structural region only holds attribute
2746    // bytes (not formatting punctuation).
2747    let attrs_after_ws = &attrs_inner[ws_end..];
2748    let mut attr_end = attrs_after_ws.len();
2749    let attr_bytes = attrs_after_ws.as_bytes();
2750    let mut self_close_start = attr_end;
2751    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2752        self_close_start = attr_end - 1;
2753        attr_end = self_close_start;
2754        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2755            attr_end -= 1;
2756        }
2757    }
2758    let attrs_text = &attrs_after_ws[..attr_end];
2759    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2760    let after_self_close = &attrs_after_ws[self_close_start..];
2761
2762    // Use the original source bytes for the `<tag` prefix (preserves
2763    // source casing — losslessness).
2764    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2765    if !leading_ws.is_empty() {
2766        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2767    }
2768    if !attrs_text.is_empty() {
2769        emit_html_attrs_node(builder, attrs_text);
2770    }
2771    if !trailing_text.is_empty() {
2772        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2773    }
2774    if !after_self_close.is_empty() {
2775        builder.token(SyntaxKind::TEXT.into(), after_self_close);
2776    }
2777    builder.token(SyntaxKind::TEXT.into(), ">");
2778    let after_gt = &after_name[tag_close + 1..];
2779    if lift_trailing {
2780        // Return trailing bytes to the caller (will be spliced into the
2781        // recursive-parse input for the body lift).
2782        return after_gt;
2783    }
2784    if !after_gt.is_empty() {
2785        builder.token(SyntaxKind::TEXT.into(), after_gt);
2786    }
2787    ""
2788}
2789
2790/// Detect a multi-line HTML open tag for `tag_name`. Returns
2791/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2792/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2793/// the existing path) or when the `>` is missing entirely.
2794///
2795/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2796/// attribute value doesn't terminate the open tag. Quote state carries
2797/// across line boundaries.
2798fn find_multiline_open_end(
2799    lines: &[&str],
2800    start_pos: usize,
2801    first_inner: &str,
2802    tag_name: &str,
2803    prefix: &ContainerPrefix,
2804) -> Option<usize> {
2805    // Locate the `<tag_name` literal in `first_inner` to start scanning past
2806    // it. Match is ASCII case-insensitive; the parser preserves source casing.
2807    // `first_inner` is already bq-stripped by the caller; subsequent lines are
2808    // stripped inline below via `strip_n_blockquote_markers`.
2809    let trimmed = strip_leading_spaces(first_inner);
2810    let prefix_len = 1 + tag_name.len();
2811    if !trimmed.starts_with('<')
2812        || trimmed.len() < prefix_len
2813        || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2814    {
2815        return None;
2816    }
2817    let leading_indent = first_inner.len() - trimmed.len();
2818    let mut i = leading_indent + prefix_len; // past `<tag_name`
2819    let mut quote: Option<u8> = None;
2820
2821    // Scan first line for an unquoted `>`.
2822    let line0_bytes = first_inner.as_bytes();
2823    while i < line0_bytes.len() {
2824        match (quote, line0_bytes[i]) {
2825            (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2826            (Some(q), x) if x == q => quote = None,
2827            (None, b'>') => return None, // single-line case
2828            _ => {}
2829        }
2830        i += 1;
2831    }
2832
2833    // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2834    // blockquote markers per line so `> ` prefixes don't count toward the
2835    // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2836    let mut line_idx = start_pos + 1;
2837    while line_idx < lines.len() {
2838        let raw = lines[line_idx];
2839        let inner = prefix.strip(raw);
2840        for &b in inner.as_bytes() {
2841            match (quote, b) {
2842                (None, b'"') | (None, b'\'') => quote = Some(b),
2843                (Some(q), x) if x == q => quote = None,
2844                (None, b'>') => return Some(line_idx),
2845                _ => {}
2846            }
2847        }
2848        line_idx += 1;
2849    }
2850
2851    None
2852}
2853
2854/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2855/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2856/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2857/// open tag (no `>` in the remaining input) as paragraph text rather than
2858/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2859/// reparse the same content recursively, causing a stack overflow.
2860///
2861/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2862/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2863/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2864/// the tag eventually closes far away.
2865pub(crate) fn pandoc_html_open_tag_closes(
2866    lines: &[&str],
2867    start_pos: usize,
2868    prefix: &ContainerPrefix,
2869) -> bool {
2870    if start_pos >= lines.len() {
2871        return false;
2872    }
2873    let mut quote: Option<u8> = None;
2874    for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2875        let inner = prefix.strip(line);
2876        let bytes = inner.as_bytes();
2877        let mut i = 0usize;
2878        if offset == start_pos {
2879            while i < bytes.len() && bytes[i] == b' ' {
2880                i += 1;
2881            }
2882            if bytes.get(i) != Some(&b'<') {
2883                return false;
2884            }
2885            i += 1;
2886        }
2887        while i < bytes.len() {
2888            match (quote, bytes[i]) {
2889                (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2890                (Some(q), x) if x == q => quote = None,
2891                (None, b'>') => return true,
2892                _ => {}
2893            }
2894            i += 1;
2895        }
2896    }
2897    false
2898}
2899
2900/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2901/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2902/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2903/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2904/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2905/// Phase 6 structural lift.
2906///
2907/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2908/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2909/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2910/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2911///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
2912///
2913/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2914/// newline; `parse_html_attribute_list` tolerates whitespace.
2915#[allow(clippy::too_many_arguments)]
2916fn emit_multiline_open_tag_with_attrs(
2917    builder: &mut GreenNodeBuilder<'static>,
2918    lines: &[&str],
2919    start_pos: usize,
2920    end_line_idx: usize,
2921    tag_name: &str,
2922    bq_depth: usize,
2923    lift_trailing: bool,
2924    pre_content: &mut String,
2925) {
2926    let prefix_len = 1 + tag_name.len();
2927    for (line_idx, raw) in lines
2928        .iter()
2929        .enumerate()
2930        .take(end_line_idx + 1)
2931        .skip(start_pos)
2932    {
2933        // Strip `bq_depth` blockquote markers from the source line so
2934        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2935        // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2936        // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2937        // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2938        // before this parser runs; re-emitting it here would double
2939        // the bytes and break losslessness.
2940        let stripped = if bq_depth > 0 {
2941            strip_n_blockquote_markers(raw, bq_depth)
2942        } else {
2943            raw
2944        };
2945        let bq_prefix_len = raw.len() - stripped.len();
2946        if bq_prefix_len > 0 && line_idx != start_pos {
2947            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2948        }
2949        let line = stripped;
2950        let (line_no_nl, newline_str) = strip_newline(line);
2951
2952        if line_idx == start_pos {
2953            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2954            // + attrs)?. The closing `>` is on a later line, so any
2955            // remaining bytes after "<{tag_name}" on this line are the
2956            // start of the attribute region.
2957            let bytes = line_no_nl.as_bytes();
2958            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2959            if indent_end > 0 {
2960                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2961            }
2962            // Defensive: caller verified the line starts with `<{tag_name}`.
2963            let after_indent = &line_no_nl[indent_end..];
2964            if after_indent.len() >= prefix_len {
2965                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2966                let rest = &after_indent[prefix_len..];
2967                emit_attr_region(builder, rest);
2968            } else {
2969                builder.token(SyntaxKind::TEXT.into(), after_indent);
2970            }
2971        } else if line_idx < end_line_idx {
2972            // Pure attribute line.
2973            let bytes = line_no_nl.as_bytes();
2974            let indent_end = bytes
2975                .iter()
2976                .position(|&b| !matches!(b, b' ' | b'\t'))
2977                .unwrap_or(bytes.len());
2978            if indent_end > 0 {
2979                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2980            }
2981            let attrs_text = &line_no_nl[indent_end..];
2982            if !attrs_text.is_empty() {
2983                emit_html_attrs_node(builder, attrs_text);
2984            }
2985        } else {
2986            // Last line: indent + attrs + ">" + trailing.
2987            let bytes = line_no_nl.as_bytes();
2988            let indent_end = bytes
2989                .iter()
2990                .position(|&b| !matches!(b, b' ' | b'\t'))
2991                .unwrap_or(bytes.len());
2992            if indent_end > 0 {
2993                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2994            }
2995            // Find the unquoted `>` byte position in this line.
2996            let mut quote: Option<u8> = None;
2997            let mut gt_pos: Option<usize> = None;
2998            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
2999                let actual_j = indent_end + j;
3000                match (quote, b) {
3001                    (None, b'"') | (None, b'\'') => quote = Some(b),
3002                    (Some(q), x) if x == q => quote = None,
3003                    (None, b'>') => {
3004                        gt_pos = Some(actual_j);
3005                        break;
3006                    }
3007                    _ => {}
3008                }
3009            }
3010            let Some(gt) = gt_pos else {
3011                // Defensive — caller said `>` is on this line.
3012                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3013                if !newline_str.is_empty() {
3014                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3015                }
3016                continue;
3017            };
3018            // Attribute region: between indent_end and gt, with possibly
3019            // trailing whitespace before `>`.
3020            let attrs_region = &line_no_nl[indent_end..gt];
3021            let region_bytes = attrs_region.as_bytes();
3022            // Strip trailing whitespace from attrs region; emit as
3023            // separate WHITESPACE so HTML_ATTRS only contains attribute
3024            // bytes.
3025            let mut attr_end = region_bytes.len();
3026            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3027                attr_end -= 1;
3028            }
3029            let attrs_text = &attrs_region[..attr_end];
3030            let trailing_ws = &attrs_region[attr_end..];
3031            if !attrs_text.is_empty() {
3032                emit_html_attrs_node(builder, attrs_text);
3033            }
3034            if !trailing_ws.is_empty() {
3035                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3036            }
3037            builder.token(SyntaxKind::TEXT.into(), ">");
3038            let after_gt = &line_no_nl[gt + 1..];
3039            if lift_trailing && !after_gt.is_empty() {
3040                // Lift trailing bytes (and the trailing newline) into
3041                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3042                // with `TEXT(">")`. The recursive parse at the close-marker
3043                // site treats `pre_content` as the leading bytes of the
3044                // structural body — same shape produced by `emit_open_tag_tokens`
3045                // for single-line opens.
3046                pre_content.push_str(after_gt);
3047                pre_content.push_str(newline_str);
3048                continue;
3049            }
3050            if !after_gt.is_empty() {
3051                builder.token(SyntaxKind::TEXT.into(), after_gt);
3052            }
3053        }
3054
3055        if !newline_str.is_empty() {
3056            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3057        }
3058    }
3059}
3060
3061/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3062/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3063/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3064/// node is added. Pandoc's projector reads attributes only for `<div>` /
3065/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3066fn emit_multiline_open_tag_simple(
3067    builder: &mut GreenNodeBuilder<'static>,
3068    lines: &[&str],
3069    start_pos: usize,
3070    end_line_idx: usize,
3071    bq_depth: usize,
3072) {
3073    for (line_idx, raw) in lines
3074        .iter()
3075        .enumerate()
3076        .take(end_line_idx + 1)
3077        .skip(start_pos)
3078    {
3079        let stripped = if bq_depth > 0 {
3080            strip_n_blockquote_markers(raw, bq_depth)
3081        } else {
3082            raw
3083        };
3084        let bq_prefix_len = raw.len() - stripped.len();
3085        // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3086        // re-emit prefixes only for subsequent lines.
3087        if bq_prefix_len > 0 && line_idx != start_pos {
3088            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3089        }
3090        let (line_no_nl, newline_str) = strip_newline(stripped);
3091        if !line_no_nl.is_empty() {
3092            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3093        }
3094        if !newline_str.is_empty() {
3095            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3096        }
3097    }
3098}
3099
3100/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3101/// `<div` literal up to end-of-line. Called only from
3102/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3103/// pure attribute (and possibly inter-attribute whitespace).
3104fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3105    if region.is_empty() {
3106        return;
3107    }
3108    let bytes = region.as_bytes();
3109    // Split a leading run of whitespace into a WHITESPACE token so the
3110    // HTML_ATTRS node holds only attribute bytes.
3111    let ws_end = bytes
3112        .iter()
3113        .position(|&b| !matches!(b, b' ' | b'\t'))
3114        .unwrap_or(bytes.len());
3115    if ws_end > 0 {
3116        builder.token(SyntaxKind::WHITESPACE.into(), &region[..ws_end]);
3117    }
3118    let attrs_text = &region[ws_end..];
3119    if !attrs_text.is_empty() {
3120        emit_html_attrs_node(builder, attrs_text);
3121    }
3122}
3123
3124/// Emit one continuation line of an HTML block, preserving any blockquote
3125/// markers as structural tokens (so the CST stays byte-equal to the source
3126/// and downstream consumers can strip them per-context).
3127fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3128    let inner = if bq_depth > 0 {
3129        let stripped = strip_n_blockquote_markers(line, bq_depth);
3130        let prefix_len = line.len() - stripped.len();
3131        if prefix_len > 0 {
3132            for ch in line[..prefix_len].chars() {
3133                if ch == '>' {
3134                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3135                } else {
3136                    let mut buf = [0u8; 4];
3137                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3138                }
3139            }
3140        }
3141        stripped
3142    } else {
3143        line
3144    };
3145
3146    let (line_without_newline, newline_str) = strip_newline(inner);
3147    if !line_without_newline.is_empty() {
3148        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3149    }
3150    if !newline_str.is_empty() {
3151        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3152    }
3153}
3154
3155#[cfg(test)]
3156mod tests {
3157    use super::*;
3158
3159    #[test]
3160    fn test_try_parse_html_comment() {
3161        assert_eq!(
3162            try_parse_html_block_start("<!-- comment -->", false),
3163            Some(HtmlBlockType::Comment)
3164        );
3165        assert_eq!(
3166            try_parse_html_block_start("  <!-- comment -->", false),
3167            Some(HtmlBlockType::Comment)
3168        );
3169    }
3170
3171    #[test]
3172    fn test_try_parse_div_tag() {
3173        assert_eq!(
3174            try_parse_html_block_start("<div>", false),
3175            Some(HtmlBlockType::BlockTag {
3176                tag_name: "div".to_string(),
3177                is_verbatim: false,
3178                closed_by_blank_line: false,
3179                depth_aware: true,
3180                closes_at_open_tag: false,
3181                is_closing: false,
3182            })
3183        );
3184        assert_eq!(
3185            try_parse_html_block_start("<div class=\"test\">", false),
3186            Some(HtmlBlockType::BlockTag {
3187                tag_name: "div".to_string(),
3188                is_verbatim: false,
3189                closed_by_blank_line: false,
3190                depth_aware: true,
3191                closes_at_open_tag: false,
3192                is_closing: false,
3193            })
3194        );
3195    }
3196
3197    #[test]
3198    fn test_try_parse_script_tag() {
3199        assert_eq!(
3200            try_parse_html_block_start("<script>", false),
3201            Some(HtmlBlockType::BlockTag {
3202                tag_name: "script".to_string(),
3203                is_verbatim: true,
3204                closed_by_blank_line: false,
3205                depth_aware: true,
3206                closes_at_open_tag: false,
3207                is_closing: false,
3208            })
3209        );
3210    }
3211
3212    #[test]
3213    fn test_try_parse_processing_instruction() {
3214        assert_eq!(
3215            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3216            Some(HtmlBlockType::ProcessingInstruction)
3217        );
3218    }
3219
3220    #[test]
3221    fn test_try_parse_declaration() {
3222        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3223        assert_eq!(
3224            try_parse_html_block_start("<!DOCTYPE html>", true),
3225            Some(HtmlBlockType::Declaration)
3226        );
3227        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3228        // just uppercase. Lowercase doctype must match too.
3229        assert_eq!(
3230            try_parse_html_block_start("<!doctype html>", true),
3231            Some(HtmlBlockType::Declaration)
3232        );
3233        // Pandoc dialect does not — bare declarations fall through to
3234        // paragraph parsing.
3235        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3236        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3237    }
3238
3239    #[test]
3240    fn test_dialect_specific_block_tag_membership() {
3241        // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3242        // CommonMark §4.6 type-6 plus a few additions. These tags
3243        // diverge between dialects:
3244        //   CM-only block tags (Pandoc treats as inline raw HTML):
3245        //     dialog, legend, menuitem, optgroup, option, frame,
3246        //     base, basefont, link, param
3247        //   Pandoc-only block tags (CM doesn't recognize):
3248        //     canvas, hgroup, isindex, meta, output
3249        for cm_only in [
3250            "<dialog>",
3251            "<legend>",
3252            "<menuitem>",
3253            "<optgroup>",
3254            "<option>",
3255            "<frame>",
3256            "<base>",
3257            "<basefont>",
3258            "<link>",
3259            "<param>",
3260        ] {
3261            assert!(
3262                matches!(
3263                    try_parse_html_block_start(cm_only, true),
3264                    Some(HtmlBlockType::BlockTag { .. })
3265                ),
3266                "{cm_only} should be a block-tag start under CommonMark",
3267            );
3268            assert_eq!(
3269                try_parse_html_block_start(cm_only, false),
3270                None,
3271                "{cm_only} should NOT be a block-tag start under Pandoc",
3272            );
3273        }
3274        for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3275            // Under CM these are not type-6 BlockTags; they may still match
3276            // type-7 (complete tag on a line) which has different semantics.
3277            assert!(
3278                !matches!(
3279                    try_parse_html_block_start(pandoc_only, true),
3280                    Some(HtmlBlockType::BlockTag { .. })
3281                ),
3282                "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3283            );
3284            assert!(
3285                matches!(
3286                    try_parse_html_block_start(pandoc_only, false),
3287                    Some(HtmlBlockType::BlockTag { .. })
3288                ),
3289                "{pandoc_only} should be a block-tag start under Pandoc",
3290            );
3291        }
3292    }
3293
3294    #[test]
3295    fn test_pandoc_inline_block_tag_membership() {
3296        // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3297        // fresh-block positions under Pandoc dialect. We list the
3298        // non-void, non-script subset (verbatim `script` is handled
3299        // via the verbatim path; void elements are deferred — see
3300        // PANDOC_INLINE_BLOCK_TAGS docs).
3301        for tag in [
3302            "<button>",
3303            "<iframe>",
3304            "<video>",
3305            "<audio>",
3306            "<noscript>",
3307            "<object>",
3308            "<map>",
3309            "<progress>",
3310            "<del>",
3311            "<ins>",
3312            "<svg>",
3313            "<applet>",
3314        ] {
3315            assert!(
3316                matches!(
3317                    try_parse_html_block_start(tag, false),
3318                    Some(HtmlBlockType::BlockTag {
3319                        depth_aware: true,
3320                        ..
3321                    })
3322                ),
3323                "{tag} should be a depth-aware block-tag start under Pandoc",
3324            );
3325        }
3326        // Closing forms of inline-block tags also start a block under
3327        // Pandoc — pandoc-native pins `</button>` standalone as a
3328        // single-line `RawBlock`. These use `closes_at_open_tag: true`
3329        // (no balanced match — the close emits as a one-line block on
3330        // its own).
3331        for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3332            assert!(
3333                matches!(
3334                    try_parse_html_block_start(closing, false),
3335                    Some(HtmlBlockType::BlockTag {
3336                        depth_aware: false,
3337                        closes_at_open_tag: true,
3338                        ..
3339                    })
3340                ),
3341                "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3342            );
3343        }
3344    }
3345
3346    #[test]
3347    fn test_pandoc_void_block_tag_membership() {
3348        // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3349        // at fresh-block positions under Pandoc dialect, with
3350        // `closes_at_open_tag: true` — the block always ends on the
3351        // open-tag line (no closing tag to match).
3352        for tag in [
3353            "<area>",
3354            "<embed>",
3355            "<source>",
3356            "<track>",
3357            "<embed src=\"foo.swf\">",
3358            "<source src=\"foo.mp4\" type=\"video/mp4\">",
3359        ] {
3360            assert!(
3361                matches!(
3362                    try_parse_html_block_start(tag, false),
3363                    Some(HtmlBlockType::BlockTag {
3364                        depth_aware: false,
3365                        closes_at_open_tag: true,
3366                        ..
3367                    })
3368                ),
3369                "{tag} should be a void block-tag start under Pandoc",
3370            );
3371        }
3372        // Closing forms of void tags also start a single-line block
3373        // under Pandoc. Void elements have no closing tag in HTML, but
3374        // `</embed>` etc. can appear in the wild — pandoc-native still
3375        // emits them as `RawBlock`s at fresh-block positions; mirror
3376        // that with the same `closes_at_open_tag: true` shape.
3377        for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3378            assert!(
3379                matches!(
3380                    try_parse_html_block_start(closing, false),
3381                    Some(HtmlBlockType::BlockTag {
3382                        depth_aware: false,
3383                        closes_at_open_tag: true,
3384                        ..
3385                    })
3386                ),
3387                "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3388            );
3389        }
3390        // Under CommonMark dialect, the void-tag block-start path is
3391        // skipped. `<source>` and `<track>` are in the CM type-6
3392        // BLOCK_TAGS set so they DO start a block, but with CM type-6
3393        // semantics (`closed_by_blank_line: true`,
3394        // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3395        // `<embed>` and `<area>` aren't in the CM type-6 list — they
3396        // fall through to type 7 (complete tag on a line by itself).
3397        assert_eq!(
3398            try_parse_html_block_start("<embed>", true),
3399            Some(HtmlBlockType::Type7)
3400        );
3401        assert_eq!(
3402            try_parse_html_block_start("<area>", true),
3403            Some(HtmlBlockType::Type7)
3404        );
3405        assert!(matches!(
3406            try_parse_html_block_start("<source src=\"x\">", true),
3407            Some(HtmlBlockType::BlockTag {
3408                closed_by_blank_line: true,
3409                closes_at_open_tag: false,
3410                ..
3411            })
3412        ));
3413        assert!(matches!(
3414            try_parse_html_block_start("<track src=\"x\">", true),
3415            Some(HtmlBlockType::BlockTag {
3416                closed_by_blank_line: true,
3417                closes_at_open_tag: false,
3418                ..
3419            })
3420        ));
3421    }
3422
3423    #[test]
3424    fn test_find_multiline_open_end() {
3425        // Single-line opens return None (caller takes the regular path).
3426        assert_eq!(
3427            find_multiline_open_end(
3428                &["<div id=\"x\">"],
3429                0,
3430                "<div id=\"x\">",
3431                "div",
3432                &ContainerPrefix::default()
3433            ),
3434            None
3435        );
3436        assert_eq!(
3437            find_multiline_open_end(
3438                &["<embed src=\"x\">"],
3439                0,
3440                "<embed src=\"x\">",
3441                "embed",
3442                &ContainerPrefix::default()
3443            ),
3444            None
3445        );
3446        // Multi-line opens return the line index of the closing `>`.
3447        assert_eq!(
3448            find_multiline_open_end(
3449                &["<embed", "  src=\"x\">"],
3450                0,
3451                "<embed",
3452                "embed",
3453                &ContainerPrefix::default()
3454            ),
3455            Some(1)
3456        );
3457        assert_eq!(
3458            find_multiline_open_end(
3459                &["<embed", "  src=\"x\"", "  type=\"video\">"],
3460                0,
3461                "<embed",
3462                "embed",
3463                &ContainerPrefix::default()
3464            ),
3465            Some(2)
3466        );
3467        // Tag-name mismatch returns None (case-insensitive on the tag name).
3468        assert_eq!(
3469            find_multiline_open_end(
3470                &["<embed", "  src=\"x\">"],
3471                0,
3472                "<embed",
3473                "div",
3474                &ContainerPrefix::default()
3475            ),
3476            None
3477        );
3478        assert_eq!(
3479            find_multiline_open_end(
3480                &["<EMBED", "  src=\"x\">"],
3481                0,
3482                "<EMBED",
3483                "embed",
3484                &ContainerPrefix::default()
3485            ),
3486            Some(1)
3487        );
3488        // Quoted `>` does not terminate the open tag; quote state threads
3489        // across line boundaries.
3490        assert_eq!(
3491            find_multiline_open_end(
3492                &["<embed title=\"a>b", "  c\">"],
3493                0,
3494                "<embed title=\"a>b",
3495                "embed",
3496                &ContainerPrefix::default()
3497            ),
3498            Some(1)
3499        );
3500        // No `>` anywhere returns None.
3501        assert_eq!(
3502            find_multiline_open_end(
3503                &["<embed", "  src=\"x\""],
3504                0,
3505                "<embed",
3506                "embed",
3507                &ContainerPrefix::default()
3508            ),
3509            None
3510        );
3511        // Subsequent lines inside a blockquote: bq markers stripped before
3512        // scanning so `> ` prefixes don't count.
3513        assert_eq!(
3514            find_multiline_open_end(
3515                &["<div", ">   id=\"x\">"],
3516                0,
3517                "<div",
3518                "div",
3519                &ContainerPrefix::bq_only(1)
3520            ),
3521            Some(1)
3522        );
3523        // Nested bq: strips two `> ` per line.
3524        assert_eq!(
3525            find_multiline_open_end(
3526                &["<section", "> >   id=\"x\">"],
3527                0,
3528                "<section",
3529                "section",
3530                &ContainerPrefix::bq_only(2)
3531            ),
3532            Some(1)
3533        );
3534    }
3535
3536    #[test]
3537    fn test_pandoc_html_open_tag_closes() {
3538        // Single-line complete: scanner finds `>` on the first line.
3539        assert!(pandoc_html_open_tag_closes(
3540            &["<div>"],
3541            0,
3542            &ContainerPrefix::default()
3543        ));
3544        assert!(pandoc_html_open_tag_closes(
3545            &["<embed src=\"x\">"],
3546            0,
3547            &ContainerPrefix::default()
3548        ));
3549        // Multi-line complete: scanner finds `>` on a later line.
3550        assert!(pandoc_html_open_tag_closes(
3551            &["<div", "  id=\"x\">", "body", "</div>"],
3552            0,
3553            &ContainerPrefix::default()
3554        ));
3555        assert!(pandoc_html_open_tag_closes(
3556            &["<embed", "  src=\"x.png\" alt=\"y\">"],
3557            0,
3558            &ContainerPrefix::default()
3559        ));
3560        // Quoted `>` does not close: scanner threads quote state.
3561        assert!(!pandoc_html_open_tag_closes(
3562            &["<div title=\"a>b", "  c\""],
3563            0,
3564            &ContainerPrefix::default()
3565        ));
3566        assert!(pandoc_html_open_tag_closes(
3567            &["<div title=\"a>b", "  c\">"],
3568            0,
3569            &ContainerPrefix::default()
3570        ));
3571        // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
3572        assert!(!pandoc_html_open_tag_closes(
3573            &["<embed"],
3574            0,
3575            &ContainerPrefix::default()
3576        ));
3577        assert!(!pandoc_html_open_tag_closes(
3578            &["<div", "foo", "bar"],
3579            0,
3580            &ContainerPrefix::default()
3581        ));
3582        // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
3583        // across them); the scan continues until EOF or `>`.
3584        assert!(pandoc_html_open_tag_closes(
3585            &["<div", "", "id=\"x\">"],
3586            0,
3587            &ContainerPrefix::default()
3588        ));
3589    }
3590
3591    #[test]
3592    fn test_try_parse_cdata() {
3593        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
3594        assert_eq!(
3595            try_parse_html_block_start("<![CDATA[content]]>", true),
3596            Some(HtmlBlockType::CData)
3597        );
3598        // Pandoc dialect does not.
3599        assert_eq!(
3600            try_parse_html_block_start("<![CDATA[content]]>", false),
3601            None
3602        );
3603    }
3604
3605    #[test]
3606    fn test_extract_block_tag_name_open_only() {
3607        assert_eq!(
3608            extract_block_tag_name("<div>", false),
3609            Some("div".to_string())
3610        );
3611        assert_eq!(
3612            extract_block_tag_name("<div class=\"test\">", false),
3613            Some("div".to_string())
3614        );
3615        assert_eq!(
3616            extract_block_tag_name("<div/>", false),
3617            Some("div".to_string())
3618        );
3619        assert_eq!(extract_block_tag_name("</div>", false), None);
3620        assert_eq!(extract_block_tag_name("<>", false), None);
3621        assert_eq!(extract_block_tag_name("< div>", false), None);
3622    }
3623
3624    #[test]
3625    fn test_extract_block_tag_name_with_closing() {
3626        // CommonMark §4.6 type-6 starts also accept closing tags.
3627        assert_eq!(
3628            extract_block_tag_name("</div>", true),
3629            Some("div".to_string())
3630        );
3631        assert_eq!(
3632            extract_block_tag_name("</div >", true),
3633            Some("div".to_string())
3634        );
3635    }
3636
3637    #[test]
3638    fn test_commonmark_type6_closing_tag_start() {
3639        assert_eq!(
3640            try_parse_html_block_start("</div>", true),
3641            Some(HtmlBlockType::BlockTag {
3642                tag_name: "div".to_string(),
3643                is_verbatim: false,
3644                closed_by_blank_line: true,
3645                depth_aware: false,
3646                closes_at_open_tag: false,
3647                is_closing: true,
3648            })
3649        );
3650    }
3651
3652    #[test]
3653    fn test_commonmark_type7_open_tag() {
3654        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
3655        // CommonMark; rejected under non-CommonMark.
3656        assert_eq!(
3657            try_parse_html_block_start("<a href=\"foo\">", true),
3658            Some(HtmlBlockType::Type7)
3659        );
3660        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
3661    }
3662
3663    #[test]
3664    fn test_commonmark_type7_close_tag() {
3665        assert_eq!(
3666            try_parse_html_block_start("</ins>", true),
3667            Some(HtmlBlockType::Type7)
3668        );
3669    }
3670
3671    #[test]
3672    fn test_commonmark_type7_rejects_with_trailing_text() {
3673        // A complete tag must be followed only by whitespace.
3674        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
3675    }
3676
3677    #[test]
3678    fn test_is_closing_marker_comment() {
3679        let block_type = HtmlBlockType::Comment;
3680        assert!(is_closing_marker("-->", &block_type));
3681        assert!(is_closing_marker("end -->", &block_type));
3682        assert!(!is_closing_marker("<!--", &block_type));
3683    }
3684
3685    #[test]
3686    fn test_is_closing_marker_tag() {
3687        let block_type = HtmlBlockType::BlockTag {
3688            tag_name: "div".to_string(),
3689            is_verbatim: false,
3690            closed_by_blank_line: false,
3691            depth_aware: false,
3692            closes_at_open_tag: false,
3693            is_closing: false,
3694        };
3695        assert!(is_closing_marker("</div>", &block_type));
3696        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
3697        assert!(is_closing_marker("content</div>", &block_type));
3698        assert!(!is_closing_marker("<div>", &block_type));
3699    }
3700
3701    #[test]
3702    fn test_parse_html_comment_block() {
3703        let input = "<!-- comment -->\n";
3704        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3705        let mut builder = GreenNodeBuilder::new();
3706
3707        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3708        let opts = ParserOptions::default();
3709        let new_pos = parse_html_block_with_wrapper(
3710            &mut builder,
3711            &lines,
3712            0,
3713            block_type,
3714            &ContainerPrefix::default(),
3715            SyntaxKind::HTML_BLOCK,
3716            &opts,
3717        );
3718
3719        assert_eq!(new_pos, 1);
3720    }
3721
3722    #[test]
3723    fn test_parse_div_block() {
3724        let input = "<div>\ncontent\n</div>\n";
3725        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3726        let mut builder = GreenNodeBuilder::new();
3727
3728        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3729        let opts = ParserOptions::default();
3730        let new_pos = parse_html_block_with_wrapper(
3731            &mut builder,
3732            &lines,
3733            0,
3734            block_type,
3735            &ContainerPrefix::default(),
3736            SyntaxKind::HTML_BLOCK,
3737            &opts,
3738        );
3739
3740        assert_eq!(new_pos, 3);
3741    }
3742
3743    #[test]
3744    fn test_parse_html_block_no_closing() {
3745        let input = "<div>\ncontent\n";
3746        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3747        let mut builder = GreenNodeBuilder::new();
3748
3749        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3750        let opts = ParserOptions::default();
3751        let new_pos = parse_html_block_with_wrapper(
3752            &mut builder,
3753            &lines,
3754            0,
3755            block_type,
3756            &ContainerPrefix::default(),
3757            SyntaxKind::HTML_BLOCK,
3758            &opts,
3759        );
3760
3761        // Should consume all lines even without closing tag
3762        assert_eq!(new_pos, 2);
3763    }
3764
3765    #[test]
3766    fn test_parse_div_block_nested_pandoc() {
3767        // Pandoc dialect: a nested `<div>...<div>...</div>...</div>` must
3768        // close on the OUTER `</div>`, not the first `</div>` seen. The
3769        // CommonMark-style "first close" scanner is wrong here; Pandoc's
3770        // div parser is depth-aware (mirrors `htmlInBalanced`).
3771        let input =
3772            "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
3773        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3774        let mut builder = GreenNodeBuilder::new();
3775
3776        // is_commonmark = false → Pandoc dialect.
3777        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3778        let opts = ParserOptions::default();
3779        let new_pos = parse_html_block_with_wrapper(
3780            &mut builder,
3781            &lines,
3782            0,
3783            block_type,
3784            &ContainerPrefix::default(),
3785            SyntaxKind::HTML_BLOCK_DIV,
3786            &opts,
3787        );
3788
3789        // 9 lines: outer-open, blank, inner-open, blank, content, blank,
3790        // inner-close, blank, outer-close. All consumed.
3791        assert_eq!(new_pos, 9);
3792    }
3793
3794    #[test]
3795    fn test_parse_div_block_same_line_pandoc() {
3796        // <div>foo</div> on a single line: opens=1, closes=1, depth=0 →
3797        // close on first line. Depth-aware tracking must not regress this.
3798        let input = "<div>foo</div>\n";
3799        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3800        let mut builder = GreenNodeBuilder::new();
3801
3802        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3803        let opts = ParserOptions::default();
3804        let new_pos = parse_html_block_with_wrapper(
3805            &mut builder,
3806            &lines,
3807            0,
3808            block_type,
3809            &ContainerPrefix::default(),
3810            SyntaxKind::HTML_BLOCK_DIV,
3811            &opts,
3812        );
3813        assert_eq!(new_pos, 1);
3814    }
3815
3816    #[test]
3817    fn test_commonmark_verbatim_first_close() {
3818        // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
3819        // ends at the first matching close — not depth-aware. Stash a
3820        // bogus inner `<script>` inside a JS string; the outer block
3821        // still closes at the first `</script>`.
3822        let input = "<script>\nlet x = '<script>';\n</script>\n";
3823        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3824        let mut builder = GreenNodeBuilder::new();
3825
3826        // is_commonmark = true.
3827        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3828        let opts = ParserOptions::default();
3829        let new_pos = parse_html_block_with_wrapper(
3830            &mut builder,
3831            &lines,
3832            0,
3833            block_type,
3834            &ContainerPrefix::default(),
3835            SyntaxKind::HTML_BLOCK,
3836            &opts,
3837        );
3838        // Three lines, closed at first `</script>` (line 2). new_pos = 3.
3839        assert_eq!(new_pos, 3);
3840    }
3841
3842    #[test]
3843    fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
3844        // Multi-line open tag with the closing `>` on its own line:
3845        //
3846        //   <div
3847        //     id="x"
3848        //     class="y"
3849        //   >
3850        //
3851        //   foo
3852        //
3853        //   </div>
3854        //
3855        // Open tag spans lines 0..=3. Content starts at line 4.
3856        let input = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
3857        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3858        let mut builder = GreenNodeBuilder::new();
3859
3860        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3861        let opts = ParserOptions::default();
3862        let new_pos = parse_html_block_with_wrapper(
3863            &mut builder,
3864            &lines,
3865            0,
3866            block_type,
3867            &ContainerPrefix::default(),
3868            SyntaxKind::HTML_BLOCK_DIV,
3869            &opts,
3870        );
3871
3872        // 8 lines: open-line 0, open-line 1 (`  id="x"`), open-line 2
3873        // (`  class="y"`), open-line 3 (`>`), blank, foo, blank, </div>.
3874        assert_eq!(new_pos, 8);
3875
3876        // CST must contain a structural HTML_ATTRS region holding the
3877        // attribute bytes (so the salsa anchor walk picks up `id="x"`).
3878        let green = builder.finish();
3879        let root = crate::syntax::SyntaxNode::new_root(green);
3880        let attrs_count = root
3881            .descendants()
3882            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3883            .count();
3884        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3885
3886        // Byte-identical losslessness check.
3887        let collected: String = root
3888            .descendants_with_tokens()
3889            .filter_map(|n| n.into_token())
3890            .map(|t| t.text().to_string())
3891            .collect();
3892        assert_eq!(collected, input);
3893    }
3894
3895    #[test]
3896    fn test_parse_div_block_multiline_open_close_inline_pandoc() {
3897        // Multi-line open tag with the closing `>` on the last attribute
3898        // line (case 0262 already covers this pattern; pin behavior to
3899        // also ensure HTML_ATTRS structural exposure).
3900        let input = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
3901        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3902        let mut builder = GreenNodeBuilder::new();
3903
3904        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3905        let opts = ParserOptions::default();
3906        let new_pos = parse_html_block_with_wrapper(
3907            &mut builder,
3908            &lines,
3909            0,
3910            block_type,
3911            &ContainerPrefix::default(),
3912            SyntaxKind::HTML_BLOCK_DIV,
3913            &opts,
3914        );
3915
3916        // 5 lines: open-line 0, open-line 1, open-line 2 (with `>`), foo,
3917        // </div>.
3918        assert_eq!(new_pos, 5);
3919
3920        let green = builder.finish();
3921        let root = crate::syntax::SyntaxNode::new_root(green);
3922        let attrs_count = root
3923            .descendants()
3924            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3925            .count();
3926        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3927
3928        let collected: String = root
3929            .descendants_with_tokens()
3930            .filter_map(|n| n.into_token())
3931            .map(|t| t.text().to_string())
3932            .collect();
3933        assert_eq!(collected, input);
3934    }
3935
3936    #[test]
3937    fn test_commonmark_type6_blank_line_terminates() {
3938        let input = "<div>\nfoo\n\nbar\n";
3939        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3940        let mut builder = GreenNodeBuilder::new();
3941
3942        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3943        let opts = ParserOptions::default();
3944        let new_pos = parse_html_block_with_wrapper(
3945            &mut builder,
3946            &lines,
3947            0,
3948            block_type,
3949            &ContainerPrefix::default(),
3950            SyntaxKind::HTML_BLOCK,
3951            &opts,
3952        );
3953
3954        // Block contains <div>\nfoo\n; stops at blank line (line 2).
3955        assert_eq!(new_pos, 2);
3956    }
3957}