Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10    ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::attributes::emit_html_attrs_node;
13use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
14
/// HTML block-level tags as defined by CommonMark spec.
/// These tags start an HTML block when found at the start of a line.
///
/// Entries are lowercase ASCII and kept in alphabetical order; callers
/// lowercase the candidate name before looking it up. The four raw-text
/// tags (`pre`, `script`, `style`, `textarea`) are deliberately absent —
/// they live in `VERBATIM_TAGS`.
///
/// NOTE(review): this matches the type-6 list of CommonMark 0.30. Spec
/// 0.31 appears to add `search` and drop `source` — confirm which spec
/// revision this table is meant to track before changing it.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
81
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// These are the CommonMark §4.6 type-1 start tags. None of them appears
/// in `BLOCK_TAGS`; all four also appear in `PANDOC_BLOCK_TAGS`.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
84
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`). Unlike `BLOCK_TAGS`,
/// this list also carries the verbatim tags (`pre`, `script`, `style`,
/// `textarea`).
///
/// Pandoc's `eitherBlockOrInline` set (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`,
/// `applet`, plus the void elements `embed`, `area`, `source`, `track`
/// and the verbatim `script`) is tracked separately: the non-void tags
/// live in [`PANDOC_INLINE_BLOCK_TAGS`], the void elements live in
/// [`PANDOC_VOID_BLOCK_TAGS`], and `script` is listed here as a strict
/// block tag. Those tags act as block starters at
/// fresh-block positions but stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries are lowercase ASCII and kept in alphabetical order.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
162
163/// Whether `name` (case-insensitive) is one of the HTML block-level tags
164/// recognized by CommonMark §4.6 type-6.
165pub fn is_html_block_tag_name(name: &str) -> bool {
166    let lower = name.to_ascii_lowercase();
167    BLOCK_TAGS.contains(&lower.as_str())
168}
169
170/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
171/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
172/// Used by the pandoc-native projector's `split_html_block_by_tags` to
173/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
174/// the block — block-level tags emit as separate `RawBlock` entries;
175/// inline tags stay inline in the surrounding `Plain` content.
176pub fn is_pandoc_block_tag_name(name: &str) -> bool {
177    let lower = name.to_ascii_lowercase();
178    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
179}
180
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but that `isInlineTag` ALSO
/// accepts (because `name ∉ blockTags`). At top level (or after a blank
/// line) pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock;
/// inside an existing HTML block, once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag: strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split, whereas inline-block tags split only when no inline
/// content has been buffered since the last splitter.
///
/// Two groups of `eitherBlockOrInline` tags are intentionally missing:
///
/// * The void elements (`area`, `embed`, `source`, `track`) live in
///   [`PANDOC_VOID_BLOCK_TAGS`]. They follow the same `inline_pending`
///   rule but emit a single RawBlock per instance instead of a
///   matched-pair lift.
/// * `script` is omitted because it is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and the strict-block check
///   fires first regardless.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];

/// Reports whether `name` is one of pandoc's `eitherBlockOrInline` tags,
/// excluding the void elements and `script` (see
/// [`PANDOC_INLINE_BLOCK_TAGS`]). The comparison ignores ASCII case.
pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
    // Table entries are lowercase ASCII, so a case-insensitive compare is
    // equivalent to lowercasing `name` first.
    PANDOC_INLINE_BLOCK_TAGS
        .iter()
        .any(|tag| tag.eq_ignore_ascii_case(name))
}
213
/// Void-element subset of pandoc's `eitherBlockOrInline` set (mirrors the
/// void list in `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`,
/// minus the entries handled elsewhere: `br`, `wbr`, `img` and `input`
/// are inline-only). What remains — the HTML void elements pandoc
/// classifies as `eitherBlockOrInline` — is `area`, `embed`, `source`
/// and `track`.
///
/// At fresh-block positions (or after a blank line) pandoc emits each of
/// these as a single `RawBlock`; inside a running paragraph they stay
/// inline as `RawInline`. The parser opens a depth-zero HTML block that
/// closes immediately on the open-tag line (there is no closing tag to
/// match), so subsequent lines start fresh blocks. Same-line splitting is
/// left to the projector's `split_html_block_by_tags`, which uses
/// `inline_pending` and emits one `RawBlock` per void-tag instance.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];

/// Reports whether `name` is one of pandoc's void `eitherBlockOrInline`
/// tags (`area`, `embed`, `source`, `track`). The comparison ignores
/// ASCII case.
pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
    // Table entries are lowercase ASCII, so a case-insensitive compare is
    // equivalent to lowercasing `name` first.
    PANDOC_VOID_BLOCK_TAGS
        .iter()
        .any(|tag| tag.eq_ignore_ascii_case(name))
}
236
237/// Whether the given tag name is eligible for the Phase 6 / Fix #4
238/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
239/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
240/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
241/// and NOT void. These are the tags where pandoc parses the body as
242/// fresh markdown between RawBlock emissions of the open/close tags —
243/// exactly the shape we can lift into structural CST children.
244///
245/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
246/// additional gate at the lift-gate site: the lift is abandoned when
247/// the body's first non-blank content is a void block tag at a
248/// fresh-block position (`<video>\n<source ...>\n</video>` projects
249/// per-tag rather than matched-pair, mirroring pandoc).
250///
251/// `<div>` is intentionally excluded — it has its own lift path
252/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
253/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
254pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
255    let lower = name.to_ascii_lowercase();
256    if VERBATIM_TAGS.contains(&lower.as_str()) {
257        return false;
258    }
259    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
260        return false;
261    }
262    if lower == "div" {
263        return false;
264    }
265    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
266        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
267}
268
269/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
270/// — anything that has an opening and a matching closing form whose
271/// `</tag>` would be recognized by the dispatcher as a separate block
272/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
273/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
274/// Void tags are excluded — they have no close form.
275///
276/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
277/// an open inside the buffer whose close would otherwise interrupt the
278/// list item mid-construct.
279pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
280    let lower = name.to_ascii_lowercase();
281    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
282        return false;
283    }
284    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
285        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
286        || VERBATIM_TAGS.contains(&lower.as_str())
287}
288
289/// Open-tag-attribute tokenization gate for non-div strict-block tags
290/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
291/// open tag is eligible for finer-grained tokenization
292/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
293/// driving the full body lift — that's the `bq_clean_lift` path. The
294/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
295/// the salsa anchor index.
296///
297/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
298/// wrapper) regardless of bq depth, so this gate skips it.
299fn bq_strict_attr_emit_tag_name(
300    wrapper_kind: SyntaxKind,
301    block_type: &HtmlBlockType,
302    bq_depth: usize,
303) -> Option<&str> {
304    if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
305        return None;
306    }
307    match block_type {
308        HtmlBlockType::BlockTag {
309            tag_name,
310            is_verbatim: false,
311            closed_by_blank_line: false,
312            depth_aware: true,
313            closes_at_open_tag: false,
314            is_closing: false,
315        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
316        _ => None,
317    }
318}
319
/// Information about a detected HTML block opening.
///
/// Produced by `try_parse_html_block_start`; the variant (and, for
/// `BlockTag`, its flags) determines how the end of the block is found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Block-level tag (CommonMark types 6/1 — `tag_name` is one of
    /// `BLOCK_TAGS` or `VERBATIM_TAGS`). Under the Pandoc dialect the
    /// name comes from `PANDOC_BLOCK_TAGS`, `PANDOC_INLINE_BLOCK_TAGS` or
    /// `PANDOC_VOID_BLOCK_TAGS` instead. See the per-field docs for how
    /// each flag affects where the block ends.
    BlockTag {
        /// Tag name without `<`, `/` or `>`; `try_parse_html_block_start`
        /// always stores it lowercased.
        tag_name: String,
        /// Whether the tag is one of `VERBATIM_TAGS` (raw content, no
        /// Markdown processing inside).
        is_verbatim: bool,
        /// Set `closed_by_blank_line` to use CommonMark §4.6 type-6 end
        /// semantics (block ends at blank line); otherwise the legacy
        /// "ends at matching `</tag>`" semantics apply.
        closed_by_blank_line: bool,
        /// `depth_aware` extends the matching-tag close path with
        /// balanced open/close tracking of the same tag name (mirrors
        /// pandoc's `htmlInBalanced`); used under Pandoc dialect to
        /// handle nested `<div>...<div>...</div>...</div>` shapes
        /// correctly. Ignored when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// `closes_at_open_tag` short-circuits the close search: the
        /// block always ends after the open-tag line. Used for void
        /// `eitherBlockOrInline` tags (`<embed>`, `<area>`, `<source>`,
        /// `<track>`) which have no closing tag — depth-aware matching
        /// would walk to end-of-input. Standalone closing forms detected
        /// under the Pandoc dialect set it as well.
        closes_at_open_tag: bool,
        /// `is_closing` records whether the tag at the start position is
        /// a closing form (`</tag>`) rather than an opening form
        /// (`<tag>`). The dispatcher's `cannot_interrupt` consults this
        /// to mirror pandoc's `isInlineTag` special cases (e.g.
        /// `</script>` is inline even when `<script>` is not — pandoc
        /// treats the close-form as always-inline regardless of
        /// attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
364
/// Try to detect an HTML block opening from content.
/// Returns block type if this is a valid HTML block start.
///
/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
/// accept closing tags (`</div>`), type-6 blocks end at the next blank
/// line (rather than a matching close tag), and type 7 is recognized.
/// When it is false the Pandoc-dialect tag sets and rules apply.
///
/// `content` is the candidate line; leading spaces are removed with
/// `strip_leading_spaces` before any check runs.
///
/// Detection order (first match wins):
/// 1. `<!--` → [`HtmlBlockType::Comment`]
/// 2. `<?` → [`HtmlBlockType::ProcessingInstruction`]
/// 3. `<![CDATA[` → [`HtmlBlockType::CData`] (CommonMark only)
/// 4. `<!` + ASCII letter → [`HtmlBlockType::Declaration`] (CommonMark only)
/// 5. a tag whose name `extract_block_tag_name` accepts, classified
///    against the dialect's tag tables → [`HtmlBlockType::BlockTag`]
/// 6. a complete open/close tag alone on the line →
///    [`HtmlBlockType::Type7`] (CommonMark only)
///
/// Returns `None` when none of the above applies.
pub(crate) fn try_parse_html_block_start(
    content: &str,
    is_commonmark: bool,
) -> Option<HtmlBlockType> {
    let trimmed = strip_leading_spaces(content);

    // Must start with <
    if !trimmed.starts_with('<') {
        return None;
    }

    // HTML comment
    if trimmed.starts_with("<!--") {
        return Some(HtmlBlockType::Comment);
    }

    // Processing instruction
    if trimmed.starts_with("<?") {
        return Some(HtmlBlockType::ProcessingInstruction);
    }

    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
    // recognize bare CDATA as a raw HTML block; the literal bytes fall
    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
    // text is parsed as inline markdown, etc).
    if is_commonmark && trimmed.starts_with("<![CDATA[") {
        return Some(HtmlBlockType::CData);
    }

    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
    // does not recognize bare declarations as raw HTML blocks (its
    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
    // tag-shaped blocks); the bytes fall through to paragraph parsing.
    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
        // `<!` is two ASCII bytes, so index 2 is a char boundary.
        let after_bang = &trimmed[2..];
        // The `?` cannot fire: `trimmed.len() > 2` makes `after_bang`
        // non-empty.
        if after_bang.chars().next()?.is_ascii_alphabetic() {
            return Some(HtmlBlockType::Declaration);
        }
    }

    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
    // Pandoc-native recognizes standalone closing forms of strict-block tags
    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
    // the open-tag line via `closes_at_open_tag: true`.
    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
        // All tag tables are lowercase, so normalize once up front.
        let tag_lower = tag_name.to_lowercase();
        let is_closing = trimmed.starts_with("</");

        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
        // Unlike inline-block / void closes, these CAN interrupt a running
        // paragraph (the dispatcher's `cannot_interrupt` only covers the
        // inline-block / void categories). Inline-block / void closes are
        // handled by their own branches further below.
        if !is_commonmark
            && is_closing
            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
        {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: false,
                closes_at_open_tag: true,
                is_closing: true,
            });
        }

        // Under Pandoc, remaining closing forms (truly inline-only tags like
        // `</em>`, `</span>`) are not block starts — fall through to the
        // existing inline-html path. Inline-block + void closes are caught
        // by the dedicated branches further below.
        if !is_commonmark
            && is_closing
            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
        {
            return None;
        }

        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
        // membership: pandoc's `blockHtmlTags` (see
        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
        let is_block_tag = if is_commonmark {
            BLOCK_TAGS.contains(&tag_lower.as_str())
        } else {
            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
        };
        if is_block_tag {
            // Only the Pandoc table contains verbatim tags, so
            // `is_verbatim` can be true here only under the Pandoc dialect.
            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim,
                closed_by_blank_line: is_commonmark && !is_verbatim,
                depth_aware: !is_commonmark,
                closes_at_open_tag: false,
                is_closing,
            });
        }

        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
        // starters at fresh-block positions. The block dispatcher caller
        // gates these as `cannot_interrupt` (mirrors pandoc — they never
        // interrupt a running paragraph; only start a fresh block when
        // following a blank line or at document start). Closing forms
        // (`</video>`) emit as a single-line `RawBlock` with no balanced
        // match — pandoc-native pins this for standalone closes.
        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: !is_closing,
                closes_at_open_tag: is_closing,
                is_closing,
            });
        }

        // Pandoc dialect also recognizes the void subset of
        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
        // These have no closing tag, so the parser closes the block
        // immediately on the open-tag line; the projector's
        // `split_html_block_by_tags` handles the same-line splitting
        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
        // non-void inline-block tags, void tags never interrupt a
        // running paragraph (gated as `cannot_interrupt` in the
        // dispatcher). Closing forms (`</embed>`) — semantically
        // nonsensical for void elements — pandoc still emits as a
        // single-line `RawBlock`; mirror that.
        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: false,
                closes_at_open_tag: true,
                is_closing,
            });
        }

        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
        // do not start a type-1 block. Letting `</pre>` through here would
        // wrongly interrupt a paragraph.
        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: true,
                closed_by_blank_line: false,
                depth_aware: !is_commonmark,
                closes_at_open_tag: false,
                is_closing: false,
            });
        }
    }

    // Type 7 (CommonMark only): complete open or close tag on a line by
    // itself, tag name not in the type-1 verbatim list.
    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
    {
        // `end` is used as the byte offset just past the parsed tag;
        // whatever follows must be whitespace only.
        let rest = &trimmed[end..];
        let only_ws = rest
            .bytes()
            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
        if only_ws {
            // Reject if the tag name belongs to the type-1 verbatim set
            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
            // type-1 starts above, so seeing one here means the opener
            // had a different shape (e.g. `<pre/>` self-closing) that
            // shouldn't trigger type 7 either. Conservatively skip.
            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
            let name_end = leading
                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
                .unwrap_or(leading.len());
            let name = leading[..name_end].to_ascii_lowercase();
            if !VERBATIM_TAGS.contains(&name.as_str()) {
                return Some(HtmlBlockType::Type7);
            }
        }
    }

    None
}
562
/// Pulls the tag name out of a line that begins with `<tag` or `</tag`,
/// for use in HTML-block-start detection.
///
/// Closing forms (`</tag>`) are only accepted when `accept_closing` is
/// true (CommonMark §4.6 type 6 allows either form). Per the spec the
/// name must be followed by a space, tab, line ending, `>`, or `/>`; that
/// is approximated here by cutting the name at the first whitespace, `>`
/// or `/` (or at end of input).
///
/// Returns `None` when `text` does not start with `<`, when a closing
/// form is seen but not accepted, or when the name is empty, does not
/// start with an ASCII letter, or contains anything other than ASCII
/// letters and digits. The name is returned with its original casing.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_lt = text.strip_prefix('<')?;

    let candidate = match after_lt.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_lt,
    };

    // The name runs up to the first delimiter, or to the end of the text.
    let name = candidate
        .find(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .map_or(candidate, |end| &candidate[..end]);

    // An empty name fails here because there is no first character.
    let mut chars = name.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    let is_valid = starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric());

    is_valid.then(|| name.to_owned())
}
607
608/// Whether this block type ends at a blank line (CommonMark types 6 & 7
609/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
610/// marker — only at end of input or the next blank line.
611fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
612    matches!(
613        block_type,
614        HtmlBlockType::Type7
615            | HtmlBlockType::BlockTag {
616                closed_by_blank_line: true,
617                ..
618            }
619    )
620}
621
622/// Check if a line contains the closing marker for the given HTML block type.
623/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
624/// blank-line-terminated types (6 in CommonMark, 7) never match here.
625fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
626    match block_type {
627        HtmlBlockType::Comment => line.contains("-->"),
628        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
629        HtmlBlockType::Declaration => line.contains('>'),
630        HtmlBlockType::CData => line.contains("]]>"),
631        HtmlBlockType::BlockTag {
632            tag_name,
633            closed_by_blank_line: false,
634            ..
635        } => {
636            // Look for closing tag </tagname>
637            let closing_tag = format!("</{}>", tag_name);
638            line.to_lowercase().contains(&closing_tag)
639        }
640        HtmlBlockType::BlockTag {
641            closed_by_blank_line: true,
642            ..
643        }
644        | HtmlBlockType::Type7 => false,
645    }
646}
647
/// Counts how many `<tag_name ...>` openers and `</tag_name>` closers
/// occur in `line`, returned as `(opens, closes)`.
///
/// Tag names are compared ASCII case-insensitively. Self-closing forms
/// (`<tag .../>`) are NOT counted as opens, and a tag name that appears
/// inside a quoted attribute value is NOT counted either — the scanner
/// walks each `<...>` bracket and respects `"`/`'` quoting. A name only
/// counts when it is followed by a space, tab, line ending, `>`, `/`, or
/// the end of the line.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let src = line.as_bytes();
    let needle = tag_name.as_bytes();

    let (mut opens, mut closes) = (0usize, 0usize);
    let mut cursor = 0usize;

    // Hop from one `<` to the next; everything in between is plain text.
    while let Some(rel) = src[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let closing = src.get(lt + 1) == Some(&b'/');
        let name_start = lt + 1 + usize::from(closing);
        let name_end = name_start + needle.len();

        let name_matches = src
            .get(name_start..name_end)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(needle));
        // Require a delimiter after the name so `<divider>` is not a `<div>`.
        let counts = name_matches
            && matches!(
                src.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find the `>` that ends this bracket, ignoring any `>` that sits
        // inside a quoted attribute value.
        let scan_from = if name_matches { name_end } else { lt + 1 };
        let mut open_quote: Option<u8> = None;
        let mut gt_at: Option<usize> = None;
        for (idx, &byte) in src.iter().enumerate().skip(scan_from) {
            match open_quote {
                Some(q) => {
                    if byte == q {
                        open_quote = None;
                    }
                }
                None => match byte {
                    b'"' | b'\'' => open_quote = Some(byte),
                    b'>' => {
                        gt_at = Some(idx);
                        break;
                    }
                    _ => {}
                },
            }
        }

        // Self-closing form: the byte right before `>` is `/`.
        let self_closing = gt_at.is_some_and(|gt| gt > lt + 1 && src[gt - 1] == b'/');

        if counts {
            if closing {
                closes += 1;
            } else if !self_closing {
                opens += 1;
            }
        }

        match gt_at {
            Some(gt) => cursor = gt + 1,
            // Unterminated `<...`: the remaining bytes do not form a
            // complete tag, so stop scanning.
            None => break,
        }
    }

    (opens, closes)
}
728
729/// Pandoc-dialect lift for HTML comments / processing instructions
730/// whose close marker is followed by additional bytes (same-line
731/// trailing or following lines). Pandoc-native emits a `RawBlock` for
732/// the marker bytes only, then parses the remainder as fresh blocks.
733///
734/// Returns `Some(consumed_lines)` when the split fires (caller must
735/// NOT enter the legacy emission); `None` to fall back to the legacy
736/// path (no close marker found, or no trailing content to split).
737///
738/// CST shape on success:
739/// ```text
740/// HTML_BLOCK
741///   HTML_BLOCK_TAG (open)        // line[0] up to and incl close marker
742///     TEXT  "<!-- hi -->"        // or with HTML_BLOCK_CONTENT in between
743///     ...                        // for multi-line `<!--\n…\n-->` shape
744/// <sibling blocks>               // recursive parse of trailing + lines[M+1..]
745/// ```
746/// The CST node kind to emit for an opaque single-construct HTML block.
747/// Under `Dialect::Pandoc`, comments, processing instructions, and
748/// verbatim raw-text elements (`<pre>`/`<script>`/`<style>`/`<textarea>`)
749/// each project to exactly one `RawBlock "html"`; tagging the wrapper
750/// `HTML_BLOCK_RAW` lets the pandoc-native projector route by kind instead
751/// of re-sniffing the leading bytes. This changes only the wrapper `u16` —
752/// the child tokens are emitted byte-for-byte identically, so the CST stays
753/// lossless (the `HTML_BLOCK_DIV` precedent). The behavioral `wrapper_kind`
754/// stays `HTML_BLOCK` everywhere else in `parse_html_block_with_wrapper`, so
755/// no lift gate changes. CommonMark dialect keeps the opaque `HTML_BLOCK`
756/// shape.
757fn html_block_node_kind(
758    wrapper_kind: SyntaxKind,
759    block_type: &HtmlBlockType,
760    dialect: crate::options::Dialect,
761) -> SyntaxKind {
762    if wrapper_kind == SyntaxKind::HTML_BLOCK
763        && dialect == crate::options::Dialect::Pandoc
764        && matches!(
765            block_type,
766            HtmlBlockType::Comment
767                | HtmlBlockType::ProcessingInstruction
768                | HtmlBlockType::BlockTag {
769                    is_verbatim: true,
770                    ..
771                }
772        )
773    {
774        SyntaxKind::HTML_BLOCK_RAW
775    } else {
776        wrapper_kind
777    }
778}
779
780fn try_parse_comment_pi_with_trailing_split(
781    builder: &mut GreenNodeBuilder<'static>,
782    lines: &[&str],
783    start_pos: usize,
784    block_type: &HtmlBlockType,
785    wrapper_kind: SyntaxKind,
786    bq_depth: usize,
787    config: &ParserOptions,
788) -> Option<usize> {
789    let marker: &str = match block_type {
790        HtmlBlockType::Comment => "-->",
791        HtmlBlockType::ProcessingInstruction => "?>",
792        _ => return None,
793    };
794
795    // Find the close marker in the bq-stripped line content. For
796    // bq_depth == 0 the inner content equals the raw line; for
797    // bq_depth > 0 we look past the `>` markers stripped by the
798    // outer dispatcher (line 0) and emitted as bq prefix below
799    // (lines > 0). `marker_end_in_inner` is the byte offset of the
800    // first byte AFTER the close marker, measured from the start
801    // of the inner (post-strip) content.
802    let mut close_line_idx: Option<usize> = None;
803    let mut marker_end_in_inner: usize = 0;
804    for (offset, line) in lines[start_pos..].iter().enumerate() {
805        let inner = if bq_depth > 0 {
806            strip_n_blockquote_markers(line, bq_depth)
807        } else {
808            line
809        };
810        if let Some(pos) = inner.find(marker) {
811            close_line_idx = Some(start_pos + offset);
812            marker_end_in_inner = pos + marker.len();
813            break;
814        }
815    }
816    let close_line_idx = close_line_idx?;
817    let close_line = lines[close_line_idx];
818    let close_inner = if bq_depth > 0 {
819        strip_n_blockquote_markers(close_line, bq_depth)
820    } else {
821        close_line
822    };
823    let close_prefix_len = close_line.len() - close_inner.len();
824    let trailing = &close_inner[marker_end_in_inner..];
825
826    // Only fire when there is non-whitespace content AFTER the close
827    // marker on the close line. The legacy path correctly handles
828    // the close-line-ends-at-close-marker shapes (`-->\n` followed
829    // by separate blocks); only the same-line-trailing case needs
830    // structural splitting. Trailing-whitespace-only handling
831    // (`-->   \n`) is a projector-side trim — separate concern.
832    let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
833    if !has_non_ws_trailing {
834        return None;
835    }
836
837    builder.start_node(html_block_node_kind(wrapper_kind, block_type, config.dialect).into());
838
839    // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
840    // middle `HTML_BLOCK_CONTENT` lines between open and close. The
841    // close `HTML_BLOCK_TAG` carries only the bytes up to and
842    // including the close marker — trailing bytes go to the sibling.
843    if close_line_idx == start_pos {
844        // Same-line shape: one HTML_BLOCK_TAG containing the close
845        // marker's bytes. The newline lives on the trailing sibling.
846        // Line 0's bq prefix (if any) was already emitted by the
847        // outer dispatcher; emit only the inner marker bytes.
848        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
849        let close_part = &close_inner[..marker_end_in_inner];
850        if !close_part.is_empty() {
851            builder.token(SyntaxKind::TEXT.into(), close_part);
852        }
853        builder.finish_node();
854    } else {
855        // Multi-line shape: open tag covers lines[start_pos..close],
856        // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
857        // only the marker bytes. Line 0's bq prefix was emitted by
858        // the outer dispatcher; subsequent lines (middle + close)
859        // need bq prefix re-emission inside the wrapper.
860        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
861        let first_line = lines[start_pos];
862        let first_inner = if bq_depth > 0 {
863            strip_n_blockquote_markers(first_line, bq_depth)
864        } else {
865            first_line
866        };
867        let (line_no_nl, nl) = strip_newline(first_inner);
868        if !line_no_nl.is_empty() {
869            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
870        }
871        if !nl.is_empty() {
872            builder.token(SyntaxKind::NEWLINE.into(), nl);
873        }
874        builder.finish_node();
875
876        if close_line_idx > start_pos + 1 {
877            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
878            for content_line in &lines[start_pos + 1..close_line_idx] {
879                emit_html_block_line(builder, content_line, bq_depth);
880            }
881            builder.finish_node();
882        }
883
884        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
885        if bq_depth > 0 && close_prefix_len > 0 {
886            emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
887        }
888        let close_part = &close_inner[..marker_end_in_inner];
889        if !close_part.is_empty() {
890            builder.token(SyntaxKind::TEXT.into(), close_part);
891        }
892        builder.finish_node();
893    }
894
895    builder.finish_node(); // HTML_BLOCK
896
897    // Recursively parse JUST the trailing bytes on the close line
898    // and graft top-level children as siblings of the HTML_BLOCK we
899    // just closed. We do NOT consume subsequent lines here — the
900    // outer dispatcher continues from `close_line_idx + 1` and
901    // handles container-boundary lines (`:::` div closes, blockquote
902    // markers, list-marker continuations) correctly. Multi-line
903    // softbreak continuation (`<!-- --> trailing\nmore\n` →
904    // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
905    // outer dispatcher sees `more` after the close line and starts
906    // a fresh paragraph. Refdefs flow through from the outer config
907    // (same pattern as `emit_html_block_body_lifted_inner`).
908    if !trailing.is_empty() {
909        let mut inner_options = config.clone();
910        let refdefs = config.refdef_labels.clone().unwrap_or_default();
911        inner_options.refdef_labels = Some(refdefs.clone());
912        let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
913        let mut bq = None;
914        graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
915    }
916
917    Some(close_line_idx + 1)
918}
919
920/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
921/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
922/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
923/// identical to the source either way; only the wrapper retag changes.
924pub(crate) fn parse_html_block_with_wrapper(
925    builder: &mut GreenNodeBuilder<'static>,
926    lines: &[&str],
927    start_pos: usize,
928    block_type: HtmlBlockType,
929    prefix: &ContainerPrefix,
930    wrapper_kind: SyntaxKind,
931    config: &ParserOptions,
932) -> usize {
933    let bq_depth = prefix.bq_depth();
934    // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
935    // closes the RawBlock at the close marker (`-->` / `?>`) and parses
936    // any subsequent bytes (same-line trailing or following lines) as
937    // fresh blocks. The legacy path absorbs them into the HTML block
938    // wrapper, producing one oversized RawBlock. Handle the split here
939    // before entering the legacy emission so the CST encodes the
940    // sibling structure.
941    if config.dialect == crate::options::Dialect::Pandoc
942        && matches!(
943            block_type,
944            HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
945        )
946        && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
947            builder,
948            lines,
949            start_pos,
950            &block_type,
951            wrapper_kind,
952            bq_depth,
953            config,
954        )
955    {
956        return consumed;
957    }
958
959    // Start HTML block. The node kind may retag to `HTML_BLOCK_RAW` for
960    // single-construct opaque shapes (comment / PI / verbatim) under
961    // Pandoc; `wrapper_kind` itself stays the behavioral gate below so no
962    // lift logic changes and the child tokens stay byte-identical.
963    builder.start_node(html_block_node_kind(wrapper_kind, &block_type, config.dialect).into());
964
965    let first_line = lines[start_pos];
966    let blank_terminated = ends_at_blank_line(&block_type);
967
968    // The block dispatcher has already emitted the bq prefix tokens for
969    // the first line; emit only the inner content as TEXT to keep the
970    // CST byte-equal to the source. List-marker bytes are stripped only
971    // when this dispatch fires on a list-marker line — for
972    // continuation-line dispatches (the much more common case) the
973    // leading indent is inner content, not upstream-emitted prefix.
974    let first_inner = prefix.strip_line_0_for_emission(first_line);
975
976    // Detect a multi-line open tag.
977    // - `<div>` (Pandoc lift): we tokenize each line structurally so the
978    //   salsa anchor walk picks up `id` from the HTML_ATTRS region.
979    // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
980    //   `<section>`, `<header>`, …): same structural emission, exposing
981    //   `id` to the salsa anchor walk and enabling the body lift below.
982    // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
983    //   without this, the parser closes the block after line 0 and the
984    //   remainder of the open tag falls into following paragraphs;
985    //   pandoc-native treats the whole multi-line open tag as a single
986    //   `RawBlock`. Emission for void tags uses simple per-line
987    //   TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
988    //   from void tags).
989    let multiline_open_end = match (wrapper_kind, &block_type) {
990        (SyntaxKind::HTML_BLOCK_DIV, _) => {
991            find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
992        }
993        (
994            _,
995            HtmlBlockType::BlockTag {
996                tag_name,
997                closes_at_open_tag: true,
998                ..
999            },
1000        ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
1001        (
1002            _,
1003            HtmlBlockType::BlockTag {
1004                tag_name,
1005                is_verbatim: false,
1006                closed_by_blank_line: false,
1007                depth_aware: true,
1008                closes_at_open_tag: false,
1009                is_closing: false,
1010            },
1011        ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
1012            find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
1013        }
1014        _ => None,
1015    };
1016
1017    // Set up depth-aware close tracking when the block type asks for it
1018    // (Pandoc dialect, balanced same-name tag matching). A `None` means
1019    // we fall back to the legacy "first matching close" path via
1020    // `is_closing_marker`. Computed up front so the lift-mode gate
1021    // below can decide whether the open line already balances the
1022    // block (same-line `<div>...</div>`).
1023    let depth_aware_tag: Option<String> = match &block_type {
1024        HtmlBlockType::BlockTag {
1025            tag_name,
1026            closed_by_blank_line: false,
1027            depth_aware: true,
1028            ..
1029        } => Some(tag_name.clone()),
1030        _ => None,
1031    };
1032    let mut depth: i64 = 1;
1033    if let Some(tag_name) = &depth_aware_tag {
1034        // Sum opens/closes across all open-tag lines (single-line: just
1035        // line 0; multi-line: lines 0..=end_line_idx).
1036        let last_open_line = multiline_open_end.unwrap_or(start_pos);
1037        let mut opens = 0usize;
1038        let mut closes = 0usize;
1039        for line in &lines[start_pos..=last_open_line] {
1040            let inner = prefix.strip(line);
1041            let (o, c) = count_tag_balance(inner, tag_name);
1042            opens += o;
1043            closes += c;
1044        }
1045        depth = opens as i64 - closes as i64;
1046    }
1047
1048    // Same-line `<div>foo</div>` shape: the open line balances the
1049    // block under depth-aware tracking. We can lift this structurally
1050    // only when the open-tag trailing has exactly one `</div>` close,
1051    // zero `<div>` opens, and no non-whitespace content after the
1052    // close. Other same-line shapes (nested, trailing text, malformed)
1053    // fall through to the byte-reparse path.
1054    let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1055        && multiline_open_end.is_none()
1056        && depth_aware_tag.is_some()
1057        && depth <= 0;
1058    let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1059        let (line_without_newline, _) = strip_newline(first_inner);
1060        probe_same_line_lift(line_without_newline, "div")
1061    };
1062
1063    // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1064    // `<nav>`, …): the body parses as fresh markdown between RawBlock
1065    // emissions of the open/close tags. Covers the clean multi-line
1066    // shape (open tag stands alone on its line), open-trailing
1067    // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1068    // and same-line (`<form>foo</form>`). Multi-line open and
1069    // blockquote-wrapped non-div shapes still fall through to the
1070    // byte-walker path.
1071    let strict_block_tag_name: Option<&str> =
1072        if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1073            match &block_type {
1074                HtmlBlockType::BlockTag {
1075                    tag_name,
1076                    is_verbatim: false,
1077                    closed_by_blank_line: false,
1078                    depth_aware: true,
1079                    closes_at_open_tag: false,
1080                    is_closing: false,
1081                } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1082                _ => None,
1083            }
1084        } else {
1085            None
1086        };
1087    // Same-line `<form>foo</form>` shape: the open line already
1088    // balances the block (`depth <= 0`). Lift only when the trailing
1089    // bytes after the open `>` end with `</tag>` and contain exactly
1090    // one close + zero nested opens.
1091    let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1092        multiline_open_end.is_none() && depth <= 0 && {
1093            let (line_no_nl, _) = strip_newline(first_inner);
1094            probe_same_line_lift(line_no_nl, name)
1095        }
1096    });
1097    // Strict-block lift gate: accept (a) a multi-line open tag spanning
1098    // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1099    // trailing single-line open (depth > 0, open `>` is present with
1100    // quote-aware matching), or (c) a safe same-line shape. For
1101    // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1102    // …) the lift additionally abandons when the body starts at a
1103    // fresh-block position with a void block tag — pandoc-native pins
1104    // per-tag emission rather than a matched-pair lift in that case.
1105    let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1106        let (line_no_nl, _) = strip_newline(first_inner);
1107        let shape_ok = if multiline_open_end.is_some() {
1108            // `find_multiline_open_end` already verified the open tag
1109            // closes with a quote-aware `>` somewhere in lines
1110            // `start_pos+1..=end`. No same-line trailing content to
1111            // probe; defer trailing-on-close-`>`-line handling to a
1112            // future session (rare in practice).
1113            true
1114        } else if depth > 0 {
1115            probe_open_tag_line_has_close_gt(line_no_nl, name)
1116        } else {
1117            same_line_strict_lift_safe
1118        };
1119        if !shape_ok {
1120            return false;
1121        }
1122        if !is_pandoc_inline_block_tag_name(name) {
1123            return true;
1124        }
1125        !inline_block_void_interior_abandons(
1126            first_inner,
1127            lines,
1128            start_pos,
1129            multiline_open_end,
1130            bq_depth,
1131            name,
1132        )
1133    });
1134
1135    // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1136    // are byte-equal to the non-bq same-line shape minus the leading
1137    // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1138    // The body has no inner newlines, so no bq prefix re-injection is
1139    // needed when grafting — `emit_html_block_body_lifted` (passing
1140    // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1141    // open-trailing) still fall through to the projector's byte
1142    // walker — they need per-line prefix injection.
1143    let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1144        && multiline_open_end.is_none()
1145        && depth_aware_tag.is_some()
1146        && depth <= 0
1147    {
1148        let (line_no_nl, _) = strip_newline(first_inner);
1149        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1150            if probe_same_line_lift(line_no_nl, "div") {
1151                Some("div")
1152            } else {
1153                None
1154            }
1155        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1156            match &block_type {
1157                HtmlBlockType::BlockTag {
1158                    tag_name,
1159                    is_verbatim: false,
1160                    closed_by_blank_line: false,
1161                    depth_aware: true,
1162                    closes_at_open_tag: false,
1163                    is_closing: false,
1164                } if is_pandoc_lift_eligible_block_tag(tag_name)
1165                    && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1166                {
1167                    // Inline-block tags (`<video>`, `<iframe>`, …) skip
1168                    // the void-interior check at same-line — the shape
1169                    // has no inner block content to interfere with.
1170                    Some(tag_name.as_str())
1171                }
1172                _ => None,
1173            }
1174        } else {
1175            None
1176        }
1177    } else {
1178        None
1179    };
1180
1181    // Messy-shape lift inside a blockquote — covers open-trailing
1182    // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1183    // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1184    // including the multi-line-open variants (`> <div\n>   id="x">foo\n>
1185    // body\n> </div>`) where the trailing is captured into `pre_content`
1186    // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1187    // The open line does NOT balance the block (depth > 0 after the
1188    // open line, distinguishing this from `same_line_bq_lift_tag` which
1189    // requires depth <= 0). The close line — possibly with leading body
1190    // text — closes the block when depth returns to 0. Body lines (incl.
1191    // open trailing and close leading) graft via prefix re-injection.
1192    let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1193    {
1194        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1195            Some("div")
1196        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1197            match &block_type {
1198                HtmlBlockType::BlockTag {
1199                    tag_name,
1200                    is_verbatim: false,
1201                    closed_by_blank_line: false,
1202                    depth_aware: true,
1203                    closes_at_open_tag: false,
1204                    is_closing: false,
1205                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1206                    // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1207                    // …) abandon the lift when the body starts at a
1208                    // fresh-block position with a void block tag. Same gate
1209                    // as the non-bq matched-pair lift (`strict_block_lift`).
1210                    if is_pandoc_inline_block_tag_name(tag_name)
1211                        && inline_block_void_interior_abandons(
1212                            first_inner,
1213                            lines,
1214                            start_pos,
1215                            multiline_open_end,
1216                            bq_depth,
1217                            tag_name,
1218                        )
1219                    {
1220                        None
1221                    } else {
1222                        Some(tag_name.as_str())
1223                    }
1224                }
1225                _ => None,
1226            }
1227        } else {
1228            None
1229        }
1230    } else {
1231        None
1232    };
1233
1234    // Multi-line open + matched close-on-the-open's-last-line shape inside
1235    // a blockquote (`> <div\n>   id="x">foo</div>` and depth-aware variants:
1236    // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1237    // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1238    // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1239    // close is on the open's last line (`depth <= 0` after the open lines)
1240    // AND `multiline_open_end.is_some()`. The trailing bytes after the
1241    // last `>` get lifted into `pre_content` via
1242    // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1243    // new branch below splits `pre_content` at the matched close marker
1244    // and grafts body + close + any trailing siblings.
1245    let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1246        && multiline_open_end.is_some()
1247        && depth_aware_tag.is_some()
1248        && depth <= 0
1249    {
1250        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1251            Some("div")
1252        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1253            match &block_type {
1254                HtmlBlockType::BlockTag {
1255                    tag_name,
1256                    is_verbatim: false,
1257                    closed_by_blank_line: false,
1258                    depth_aware: true,
1259                    closes_at_open_tag: false,
1260                    is_closing: false,
1261                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1262                    if is_pandoc_inline_block_tag_name(tag_name)
1263                        && inline_block_void_interior_abandons(
1264                            first_inner,
1265                            lines,
1266                            start_pos,
1267                            multiline_open_end,
1268                            bq_depth,
1269                            tag_name,
1270                        )
1271                    {
1272                        None
1273                    } else {
1274                        Some(tag_name.as_str())
1275                    }
1276                }
1277                _ => None,
1278            }
1279        } else {
1280            None
1281        }
1282    } else {
1283        None
1284    };
1285
1286    // Whether this block participates in the Phase 6 structural lift
1287    // (recursively parse body as Pandoc markdown and graft children).
1288    // Covers `<div>` outside blockquote context. For same-line shapes
1289    // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1290    // keep the legacy single-HTML_BLOCK_TAG shape and let the
1291    // byte-reparse path handle projection.
1292    let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1293        && bq_depth == 0
1294        && (!is_same_line_div || same_line_div_lift_safe))
1295        || strict_block_lift
1296        || same_line_bq_lift_tag.is_some()
1297        || bq_messy_lift_tag.is_some()
1298        || bq_multiline_close_lift_tag.is_some();
1299
1300    // Trailing content from the open tag (after `>`). When the lift is
1301    // active and the open line is `<div ATTRS>foo\n`, this captures
1302    // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1303    // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1304    // shapes (same-line / blockquote-wrapped).
1305    let mut pre_content = String::new();
1306
1307    // Emit opening line(s)
1308    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1309
1310    if let Some(end_line_idx) = multiline_open_end {
1311        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1312            emit_multiline_open_tag_with_attrs(
1313                builder,
1314                lines,
1315                start_pos,
1316                end_line_idx,
1317                "div",
1318                bq_depth,
1319                lift_mode,
1320                &mut pre_content,
1321            );
1322        } else if let Some(name) = strict_block_tag_name
1323            && strict_block_lift
1324        {
1325            emit_multiline_open_tag_with_attrs(
1326                builder,
1327                lines,
1328                start_pos,
1329                end_line_idx,
1330                name,
1331                bq_depth,
1332                lift_mode,
1333                &mut pre_content,
1334            );
1335        } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1336        {
1337            // Multi-line open of a lift-eligible strict-block tag inside a
1338            // blockquote (`> <section\n>   id=...>`). The non-bq
1339            // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1340            // covers the bq side so the open tag emits HTML_ATTRS regions
1341            // for `AttributeNode::cast` and the projector's canonicalizer.
1342            //
1343            // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1344            // call below: only push trailing bytes into `pre_content` when
1345            // the structural lift will consume them (bq messy lift). The
1346            // bq clean-lift requires `pre_content.is_empty()`, so for clean
1347            // multi-line opens the trailing is empty anyway and this is
1348            // a no-op.
1349            let lift_trailing =
1350                bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1351            emit_multiline_open_tag_with_attrs(
1352                builder,
1353                lines,
1354                start_pos,
1355                end_line_idx,
1356                name,
1357                bq_depth,
1358                lift_trailing,
1359                &mut pre_content,
1360            );
1361        } else {
1362            emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1363        }
1364    } else {
1365        let (line_without_newline, newline_str) = strip_newline(first_inner);
1366        if !line_without_newline.is_empty() {
1367            // For HTML_BLOCK_DIV, expose the open tag's attributes
1368            // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1369            // via the same descendants walk that handles fenced-div /
1370            // heading attrs. CST bytes stay byte-equal to source — we only
1371            // tokenize at finer granularity for matched div opens.
1372            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1373                let trailing =
1374                    emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1375                if !trailing.is_empty() {
1376                    pre_content.push_str(trailing);
1377                    pre_content.push_str(newline_str);
1378                }
1379            } else if let Some(name) = strict_block_tag_name
1380                && strict_block_lift
1381            {
1382                let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1383                if !trailing.is_empty() {
1384                    pre_content.push_str(trailing);
1385                    pre_content.push_str(newline_str);
1386                }
1387            } else if let Some(name) =
1388                bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1389            {
1390                // Inside a blockquote, lift trailing bytes into
1391                // `pre_content` when either the same-line bq gate fires
1392                // (`> <tag>body</tag>` — handled by `same_line_closed`)
1393                // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1394                // </tag>` and butted-close — handled at the close-marker
1395                // site below). For the clean-shape bq lift the open has
1396                // no trailing bytes regardless, so `lift_trailing=true`
1397                // is a no-op there.
1398                let lift_trailing =
1399                    same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1400                let trailing =
1401                    emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1402                if lift_trailing && !trailing.is_empty() {
1403                    pre_content.push_str(trailing);
1404                    pre_content.push_str(newline_str);
1405                }
1406            } else {
1407                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1408            }
1409        }
1410        // When the open tag has trailing content under lift mode, the
1411        // newline belongs to that trailing line (it terminates the
1412        // synthetic body line, not the open tag). Don't double-emit.
1413        if pre_content.is_empty() && !newline_str.is_empty() {
1414            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1415        }
1416    }
1417
1418    builder.finish_node(); // HtmlBlockTag
1419
1420    // Check if opening line also contains closing marker. Blank-line-terminated
1421    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1422    // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1423    // (`closes_at_open_tag: true`) close immediately — the block always
1424    // ends on the open-tag line since there is no closing tag to find.
1425    let void_block = matches!(
1426        &block_type,
1427        HtmlBlockType::BlockTag {
1428            closes_at_open_tag: true,
1429            ..
1430        }
1431    );
1432    // Void tags with a multi-line open close immediately after the open
1433    // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1434    // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1435    // a single RawBlock for the whole multi-line tag, with no following
1436    // content.
1437    if void_block && let Some(end_line_idx) = multiline_open_end {
1438        log::trace!(
1439            "HTML void block at line {} closes after multi-line open ending at line {}",
1440            start_pos + 1,
1441            end_line_idx + 1
1442        );
1443        builder.finish_node(); // HtmlBlock
1444        return end_line_idx + 1;
1445    }
1446    // Multi-line open with all matched closes on the open's last line:
1447    // `pre_content` holds the bytes after the last open `>` (lifted there
1448    // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1449    // When `depth <= 0` after the multi-line open and the trailing bytes
1450    // contain the depth-zero matched close, do the same-line lift on
1451    // `pre_content` directly. Mirrors the single-line `same_line_closed`
1452    // lift below — same body / close-marker / trailing-graft shape, just
1453    // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1454    //
1455    // The body bytes of `pre_content` come from the open's last line,
1456    // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1457    // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1458    // tag thus inherit the bq context without per-line prefix injection,
1459    // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1460    // both the non-bq and bq variants of this shape.
1461    if let Some(end_line_idx) = multiline_open_end
1462        && !blank_terminated
1463        && depth_aware_tag.is_some()
1464        && depth <= 0
1465        && lift_mode
1466        && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1467        && !pre_content.is_empty()
1468    {
1469        let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1470            Some("div")
1471        } else if strict_block_lift {
1472            strict_block_tag_name
1473        } else if let Some(name) = bq_multiline_close_lift_tag {
1474            Some(name)
1475        } else {
1476            None
1477        };
1478        if let Some(tag_name) = tag_name_opt {
1479            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1480            if let Some((leading, close_part)) =
1481                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1482            {
1483                let close_marker_end =
1484                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1485                let close_marker = &close_part[..close_marker_end];
1486                let same_line_trailing = &close_part[close_marker_end..];
1487                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1488                    LastParaDemote::SkipTrailingBlanks
1489                } else {
1490                    LastParaDemote::OnlyIfLast
1491                };
1492                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1493                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1494                if same_line_trailing.is_empty() {
1495                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1496                    close_line.push_str(close_marker);
1497                    close_line.push_str(post_nl);
1498                    emit_html_block_line(builder, &close_line, 0);
1499                    builder.finish_node();
1500                    builder.finish_node(); // HtmlBlock
1501                } else {
1502                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1503                    builder.finish_node(); // HTML_BLOCK_TAG
1504                    builder.finish_node(); // HtmlBlock
1505
1506                    let mut trailing_text =
1507                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1508                    trailing_text.push_str(same_line_trailing);
1509                    trailing_text.push_str(post_nl);
1510                    let mut inner_options = config.clone();
1511                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1512                    inner_options.refdef_labels = Some(refdefs.clone());
1513                    let inner_root = crate::parser::parse_with_refdefs(
1514                        &trailing_text,
1515                        Some(inner_options),
1516                        refdefs,
1517                    );
1518                    let mut bq = None;
1519                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1520                }
1521                return end_line_idx + 1;
1522            }
1523        }
1524    }
1525
1526    let same_line_closed = !blank_terminated
1527        && multiline_open_end.is_none()
1528        && (void_block
1529            || match &depth_aware_tag {
1530                Some(_) => depth <= 0,
1531                None => is_closing_marker(first_inner, &block_type),
1532            });
1533    if same_line_closed {
1534        log::trace!(
1535            "HTML block at line {} opens and closes on same line",
1536            start_pos + 1
1537        );
1538        // Same-line structural lift (div or non-div strict-block):
1539        // pre_content holds the bytes after the open `>` (including
1540        // the close `</tag>` and the trailing newline). Split into
1541        // body + close tag, emit body via recursive parse, emit close
1542        // tag as a sibling `HTML_BLOCK_TAG`.
1543        let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1544            None
1545        } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1546            Some("div")
1547        } else if same_line_strict_lift_safe {
1548            strict_block_tag_name
1549        } else if let Some(name) = same_line_bq_lift_tag {
1550            // Bq same-line: body has no inner newlines so the standard
1551            // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1552            // sufficient. The bq prefix `> ` lives on the outer
1553            // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1554            Some(name)
1555        } else {
1556            None
1557        };
1558        if let Some(tag_name) = same_line_lift_tag {
1559            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1560            // Depth-aware split: handles `<tag>foo</tag>bar` (single
1561            // close, trailing text), `<tag>foo</tag></tag>` (matched
1562            // close + unmatched trailing close → sibling RawBlock),
1563            // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1564            // recursive body parse).
1565            if let Some((leading, close_part)) =
1566                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1567            {
1568                // `close_part` starts with `</tag` and contains the close
1569                // marker followed by any same-line trailing text. Split
1570                // off the close marker bytes (`</tag>`) so the close
1571                // `HTML_BLOCK_TAG` carries only those bytes; trailing
1572                // text is parsed and grafted as a sibling block at the
1573                // parent level (matches pandoc-native shape:
1574                // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1575                let close_marker_end =
1576                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1577                let close_marker = &close_part[..close_marker_end];
1578                let same_line_trailing = &close_part[close_marker_end..];
1579
1580                // Same-line is always close-butted; div demotes the
1581                // trailing Para→Plain via `SkipTrailingBlanks`.
1582                // Non-div strict-block uses `OnlyIfLast` (consistent
1583                // with butted-close — no trailing BLANK_LINE before
1584                // the close means the trailing Para demotes).
1585                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1586                    LastParaDemote::SkipTrailingBlanks
1587                } else {
1588                    LastParaDemote::OnlyIfLast
1589                };
1590                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1591                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1592                if same_line_trailing.is_empty() {
1593                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1594                    close_line.push_str(close_marker);
1595                    close_line.push_str(post_nl);
1596                    emit_html_block_line(builder, &close_line, 0);
1597                    builder.finish_node();
1598                    builder.finish_node(); // HtmlBlock
1599                } else {
1600                    // Close tag holds only the close-marker bytes;
1601                    // trailing + newline graft as siblings of the
1602                    // wrapper (matches pandoc's per-tag block split).
1603                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1604                    builder.finish_node(); // HTML_BLOCK_TAG
1605                    builder.finish_node(); // HtmlBlock
1606
1607                    let mut trailing_text =
1608                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1609                    trailing_text.push_str(same_line_trailing);
1610                    trailing_text.push_str(post_nl);
1611                    let mut inner_options = config.clone();
1612                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1613                    inner_options.refdef_labels = Some(refdefs.clone());
1614                    let inner_root = crate::parser::parse_with_refdefs(
1615                        &trailing_text,
1616                        Some(inner_options),
1617                        refdefs,
1618                    );
1619                    let mut bq = None;
1620                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1621                }
1622                return start_pos + 1;
1623            }
1624        }
1625        builder.finish_node(); // HtmlBlock
1626        return start_pos + 1;
1627    }
1628
1629    let mut current_pos = multiline_open_end
1630        .map(|end| end + 1)
1631        .unwrap_or(start_pos + 1);
1632    let mut content_lines: Vec<&str> = Vec::new();
1633    let mut found_closing = false;
1634
1635    // Parse content until we find the closing marker
1636    while current_pos < lines.len() {
1637        let line = lines[current_pos];
1638        let (line_bq_depth, inner) = count_blockquote_markers(line);
1639
1640        // Only process lines at the same or deeper blockquote depth
1641        if line_bq_depth < bq_depth {
1642            break;
1643        }
1644
1645        // Blank-line-terminated blocks (types 6/7) end before the blank line.
1646        // The blank line itself is not part of the block.
1647        if blank_terminated && inner.trim().is_empty() {
1648            break;
1649        }
1650
1651        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1652        // count opens/closes of the same tag name and only close when depth
1653        // returns to 0; otherwise fall back to substring-match on the line.
1654        let line_closes = match &depth_aware_tag {
1655            Some(tag_name) => {
1656                let (opens, closes) = count_tag_balance(inner, tag_name);
1657                depth += opens as i64;
1658                depth -= closes as i64;
1659                depth <= 0
1660            }
1661            None => is_closing_marker(inner, &block_type),
1662        };
1663
1664        if line_closes {
1665            log::trace!("Found HTML block closing at line {}", current_pos + 1);
1666            found_closing = true;
1667
1668            // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1669            // the open and close tags stand alone on their source lines
1670            // (no trailing on open, no body content on close after
1671            // stripping bq markers), lift the body lines structurally
1672            // so the projector walks CST children instead of
1673            // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1674            //
1675            // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1676            // grafted, Para preserved), non-div strict-block tags
1677            // (`<form>`, `<section>`, …) and inline-block matched-pair
1678            // tags (`<video>`, `<iframe>`, …) — the latter two under
1679            // HTML_BLOCK with the structural lift hitting pandoc's
1680            // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1681            // demotion. Inline-block additionally bails if the body
1682            // starts at a fresh-block position with a void block tag
1683            // (mirrors the non-bq matched-pair gate).
1684            //
1685            // Other bq-wrapped shapes (butted-close / open-trailing /
1686            // same-line) still fall through to the opaque path.
1687            // Multi-line opens are allowed here as of 2026-05-12: the
1688            // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1689            // per attr line and per-line bq prefix tokens) by the bq-aware
1690            // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1691            // empty for multi-line opens (the emitter writes any trailing
1692            // bytes on the last open line directly as TEXT inside
1693            // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1694            // trailing falls through to the opaque path, matching the non-
1695            // bq deferral.
1696            let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1697                if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1698                    Some("div")
1699                } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1700                    match &block_type {
1701                        HtmlBlockType::BlockTag {
1702                            tag_name,
1703                            is_verbatim: false,
1704                            closed_by_blank_line: false,
1705                            depth_aware: true,
1706                            closes_at_open_tag: false,
1707                            is_closing: false,
1708                        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1709                        _ => None,
1710                    }
1711                } else {
1712                    None
1713                }
1714            } else {
1715                None
1716            };
1717
1718            let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1719                // Open-shape: last open line must end with `>` (clean
1720                // close-of-open). For single-line, that's `first_inner`
1721                // (already bq-stripped); for multi-line, strip bq markers
1722                // from `lines[end_line_idx]` and check the same.
1723                let last_open_line: &str = match multiline_open_end {
1724                    None => first_inner,
1725                    Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1726                        prefix.strip(lines[end])
1727                    }
1728                    Some(end) => lines[end],
1729                };
1730                let (open_no_nl, _) = strip_newline(last_open_line);
1731                if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1732                    return false;
1733                }
1734                let close_stripped = prefix.strip(line);
1735                let (close_no_nl, _) = strip_newline(close_stripped);
1736                if !close_no_nl
1737                    .trim_start_matches([' ', '\t'])
1738                    .starts_with("</")
1739                {
1740                    return false;
1741                }
1742                if is_pandoc_inline_block_tag_name(tag_name)
1743                    && inline_block_void_interior_abandons(
1744                        first_inner,
1745                        lines,
1746                        start_pos,
1747                        multiline_open_end,
1748                        bq_depth,
1749                        tag_name,
1750                    )
1751                {
1752                    return false;
1753                }
1754                true
1755            });
1756
1757            if bq_clean_lift {
1758                let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1759                    LastParaDemote::Never
1760                } else {
1761                    LastParaDemote::OnlyIfLast
1762                };
1763                emit_html_block_body_lifted_bq(
1764                    builder,
1765                    &content_lines,
1766                    prefix,
1767                    demote_policy,
1768                    config,
1769                );
1770                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1771                emit_html_block_line(builder, line, bq_depth);
1772                builder.finish_node();
1773                current_pos += 1;
1774                break;
1775            }
1776
1777            // Bq messy-shape lift — single-line open with trailing or
1778            // butted-close (or both). `pre_content` already captures any
1779            // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1780            // strip the close line's bq markers before splitting so
1781            // `leading` and `close_part` are bq-prefix-free. Body parses
1782            // recursively from `pre_content + stripped(content_lines) +
1783            // leading`, with per-line bq prefixes re-injected so the CST
1784            // stays byte-equal to the source. Demote: div is keyed on
1785            // close-butted-ness (Plain when leading non-empty, Para
1786            // otherwise); non-div uses OnlyIfLast either way.
1787            if let Some(tag_name) = bq_messy_lift_tag {
1788                let close_stripped = prefix.strip(line);
1789                let close_prefix_len = line.len() - close_stripped.len();
1790                let close_prefix = &line[..close_prefix_len];
1791                if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1792                {
1793                    let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1794                        if leading.is_empty() {
1795                            LastParaDemote::Never
1796                        } else {
1797                            LastParaDemote::SkipTrailingBlanks
1798                        }
1799                    } else {
1800                        LastParaDemote::OnlyIfLast
1801                    };
1802                    emit_html_block_body_lifted_bq_messy(
1803                        builder,
1804                        &pre_content,
1805                        &content_lines,
1806                        leading,
1807                        close_prefix,
1808                        prefix,
1809                        policy,
1810                        config,
1811                    );
1812                    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1813                    // When `leading` is empty, no recursive-parse output carries
1814                    // the close line's bq prefix, so emit it here before the
1815                    // close tag. When `leading` is non-empty,
1816                    // `emit_html_block_body_lifted_bq_messy` already injected
1817                    // the prefix at the start of the leading bytes (via the
1818                    // BqPrefixState entry); emitting again would double the
1819                    // prefix bytes and break losslessness.
1820                    if leading.is_empty() {
1821                        emit_bq_prefix_tokens(builder, close_prefix);
1822                    }
1823                    emit_html_block_line(builder, close_part, 0);
1824                    builder.finish_node();
1825                    current_pos += 1;
1826                    break;
1827                }
1828            }
1829
1830            // Under lift mode, try to split the close line into a
1831            // leading "body content" prefix and the close-marker
1832            // remainder using depth-aware matching. Walks at depth 1
1833            // (we're inside the open tag) so nested same-tag opens
1834            // (e.g. `<inner></inner></tag>` style with a nested div)
1835            // are absorbed into the body and parsed recursively, and
1836            // multi-close shapes (`foo</div></div>` on the close line)
1837            // peel off the matched-pair close — the unmatched
1838            // trailing close projects as a sibling `RawBlock` per
1839            // pandoc-native. For `<div>`, non-empty `leading`
1840            // propagates pandoc's `markdown_in_html_blocks` Plain
1841            // demotion rule. For non-div strict-block tags, demotion
1842            // follows pandoc's `OnlyIfLast` rule (demote the trailing
1843            // Para only when no blank line precedes the close).
1844            let close_split_tag = if lift_mode {
1845                if strict_block_lift {
1846                    strict_block_tag_name
1847                } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1848                    Some("div")
1849                } else {
1850                    None
1851                }
1852            } else {
1853                None
1854            };
1855            let (close_no_nl, close_post_nl) = strip_newline(line);
1856            let close_split = close_split_tag
1857                .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1858
1859            if let Some((leading, close_part)) = close_split {
1860                // Close-line leading that is whitespace-only is close-tag
1861                // indentation, not body content (pandoc-native strips it
1862                // from the close RawBlock and treats the close as butted —
1863                // see `   </tag>` shapes). Route those bytes into the
1864                // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1865                // projector strips them; keep the demote policy keyed on
1866                // the original leading so butted-close detection (Plain
1867                // demotion for div, OnlyIfLast for non-div) still fires.
1868                let leading_is_ws_only =
1869                    !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1870                let body_leading = if leading_is_ws_only { "" } else { leading };
1871                let policy = if strict_block_lift {
1872                    LastParaDemote::OnlyIfLast
1873                } else if !leading.is_empty() {
1874                    LastParaDemote::SkipTrailingBlanks
1875                } else {
1876                    LastParaDemote::Never
1877                };
1878                // Split close_part into close-marker bytes (`</tag>`)
1879                // and trailing bytes (e.g. an extra `</div>` for the
1880                // double-close case, or `bar` for trailing text after
1881                // a normal close). Trailing bytes are recursively
1882                // parsed and grafted as siblings of the HTML_BLOCK_DIV
1883                // wrapper.
1884                let close_tag_name = close_split_tag.expect("close_split_tag present");
1885                let close_marker_end =
1886                    split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1887                let close_marker = &close_part[..close_marker_end];
1888                let close_trailing = &close_part[close_marker_end..];
1889
1890                emit_html_block_body_lifted(
1891                    builder,
1892                    &pre_content,
1893                    &content_lines,
1894                    body_leading,
1895                    policy,
1896                    config,
1897                );
1898                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1899                if leading_is_ws_only {
1900                    builder.token(SyntaxKind::WHITESPACE.into(), leading);
1901                }
1902                if close_trailing.is_empty() {
1903                    let mut close_line =
1904                        String::with_capacity(close_marker.len() + close_post_nl.len());
1905                    close_line.push_str(close_marker);
1906                    close_line.push_str(close_post_nl);
1907                    emit_html_block_line(builder, &close_line, 0);
1908                    builder.finish_node();
1909                } else {
1910                    // Close tag holds only the close-marker bytes;
1911                    // trailing + newline graft as siblings.
1912                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1913                    builder.finish_node(); // HTML_BLOCK_TAG
1914                    builder.finish_node(); // HtmlBlock
1915
1916                    let mut trailing_text =
1917                        String::with_capacity(close_trailing.len() + close_post_nl.len());
1918                    trailing_text.push_str(close_trailing);
1919                    trailing_text.push_str(close_post_nl);
1920                    let mut inner_options = config.clone();
1921                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1922                    inner_options.refdef_labels = Some(refdefs.clone());
1923                    let inner_root = crate::parser::parse_with_refdefs(
1924                        &trailing_text,
1925                        Some(inner_options),
1926                        refdefs,
1927                    );
1928                    let mut bq = None;
1929                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1930                    current_pos += 1;
1931                    return current_pos;
1932                }
1933            } else {
1934                emit_html_block_body(
1935                    builder,
1936                    &pre_content,
1937                    &content_lines,
1938                    bq_depth,
1939                    wrapper_kind,
1940                    lift_mode,
1941                    config,
1942                );
1943                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1944                emit_html_block_line(builder, line, bq_depth);
1945                builder.finish_node();
1946            }
1947
1948            current_pos += 1;
1949            break;
1950        }
1951
1952        // Regular content line
1953        content_lines.push(line);
1954        current_pos += 1;
1955    }
1956
1957    // If we didn't find a closing marker, emit what we collected
1958    if !found_closing {
1959        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1960        emit_html_block_body(
1961            builder,
1962            &pre_content,
1963            &content_lines,
1964            bq_depth,
1965            wrapper_kind,
1966            lift_mode,
1967            config,
1968        );
1969    }
1970
1971    builder.finish_node(); // HtmlBlock
1972    current_pos
1973}
1974
1975/// Emit the collected inner content lines for an HTML block.
1976///
1977/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1978/// line `<div>` open outside blockquote), recursively parse the inner
1979/// content (including any open-tag trailing) as Pandoc-flavored
1980/// markdown and graft the resulting top-level blocks as direct children
1981/// of the wrapper. This is the Phase 6 structural lift — the projector
1982/// and downstream consumers (linter, salsa, LSP) can walk the
1983/// structural children instead of re-tokenizing the body bytes.
1984///
1985/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1986/// blockquote, multi-line open, or no content at all — fall through to
1987/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1988///
1989/// CST bytes remain byte-identical to source: the recursive parser is
1990/// lossless on the same byte slice the legacy path would have captured
1991/// as TEXT.
1992fn emit_html_block_body(
1993    builder: &mut GreenNodeBuilder<'static>,
1994    pre_content: &str,
1995    content_lines: &[&str],
1996    bq_depth: usize,
1997    wrapper_kind: SyntaxKind,
1998    lift_mode: bool,
1999    config: &ParserOptions,
2000) {
2001    if pre_content.is_empty() && content_lines.is_empty() {
2002        return;
2003    }
2004    if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
2005        // Reached when the parser walked to end-of-input without finding
2006        // `</div>` (unbalanced div) — no close tag, no Plain demotion.
2007        emit_html_block_body_lifted(
2008            builder,
2009            pre_content,
2010            content_lines,
2011            "",
2012            LastParaDemote::Never,
2013            config,
2014        );
2015        return;
2016    }
2017    // Legacy path: opaque TEXT capture. `pre_content` is always empty
2018    // here (lift_mode is the only path that populates it), but be
2019    // defensive — if a trailing prefix snuck in, emit it as TEXT so
2020    // bytes are preserved.
2021    builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
2022    if !pre_content.is_empty() {
2023        builder.token(SyntaxKind::TEXT.into(), pre_content);
2024    }
2025    for content_line in content_lines {
2026        emit_html_block_line(builder, content_line, bq_depth);
2027    }
2028    builder.finish_node();
2029}
2030
/// Policy deciding whether the trailing `PARAGRAPH` of a lifted
/// HTML-block body is retagged as `PLAIN` while its children are
/// grafted into the structural CST.
#[derive(Debug, Clone, Copy)]
enum LastParaDemote {
    /// Leave the body as parsed; the trailing `Para` is preserved, as
    /// pandoc does for this shape.
    Never,
    /// Retag the last `PARAGRAPH` child, looking past any trailing
    /// `BLANK_LINE` children to find it. Applies to `<div>` shapes
    /// whose close tag sits butted against the paragraph text on the
    /// same source line (pandoc's `markdown_in_html_blocks` Plain
    /// demotion).
    SkipTrailingBlanks,
    /// Retag only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` sits between it and the close
    /// tag. Applies to non-div strict-block tags, whose body is
    /// emitted at top level right next to the close-tag `RawBlock`;
    /// pandoc turns that trailing `Para` into `Plain` unless a blank
    /// line separates the two.
    OnlyIfLast,
}
2050
2051/// Lift the HTML-block body into structural CST children: build the
2052/// inner text from `pre_content` + `content_lines` + `post_content`
2053/// (in order), recursively parse it as Pandoc-flavored markdown, and
2054/// graft the resulting top-level blocks into `builder`. `demote_policy`
2055/// controls whether the trailing paragraph is retagged as `PLAIN` to
2056/// encode pandoc's Plain/Para adjacency rules structurally.
2057fn emit_html_block_body_lifted(
2058    builder: &mut GreenNodeBuilder<'static>,
2059    pre_content: &str,
2060    content_lines: &[&str],
2061    post_content: &str,
2062    demote_policy: LastParaDemote,
2063    config: &ParserOptions,
2064) {
2065    emit_html_block_body_lifted_inner(
2066        builder,
2067        pre_content,
2068        content_lines,
2069        post_content,
2070        demote_policy,
2071        config,
2072        &mut None,
2073    )
2074}
2075
2076/// Body-lift variant for `<div>` inside a blockquote. Strips
2077/// `bq_depth` levels of blockquote markers from each `content_line`,
2078/// captures the per-line prefix bytes, and grafts the recursive parse
2079/// with prefix injection so the output CST stays byte-equal to the
2080/// source. `pre_content` and `post_content` must be empty (the bq
2081/// clean lift only handles the shape where the open and close tags
2082/// stand alone on their source lines).
2083fn emit_html_block_body_lifted_bq(
2084    builder: &mut GreenNodeBuilder<'static>,
2085    content_lines: &[&str],
2086    prefix: &ContainerPrefix,
2087    demote_policy: LastParaDemote,
2088    config: &ParserOptions,
2089) {
2090    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2091    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2092    for cl in content_lines {
2093        let (li, bq, inner) = prefix.split(cl);
2094        prefix_lines.push(ContainerPrefixLine {
2095            list_indent: li.to_string(),
2096            bq_prefix: bq.to_string(),
2097        });
2098        stripped_lines.push(inner);
2099    }
2100    let mut state = ContainerPrefixState::new(prefix_lines);
2101    emit_html_block_body_lifted_inner(
2102        builder,
2103        "",
2104        &stripped_lines,
2105        "",
2106        demote_policy,
2107        config,
2108        &mut state,
2109    )
2110}
2111
2112/// Body-lift variant for the bq messy-shape lift — open-trailing,
2113/// butted-close, or both. The open-trailing bytes (if any) sit in
2114/// `pre_content` (line 0 of the body — no bq prefix in source because
2115/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2116/// each carry their own bq prefix. The close line's `leading` (body
2117/// bytes before `</tag>`) sits on the close line, prefixed in source
2118/// by `close_line_prefix` (the bq prefix captured from `line`).
2119///
2120/// Builds `prefixes` so each emitted line in the recursive parse
2121/// output gets the right per-line bq prefix re-injected at line start:
2122/// `pre_content` → empty prefix (no source `> ` precedes it); each
2123/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2124/// Result CST stays byte-equal to source.
2125#[allow(clippy::too_many_arguments)]
2126fn emit_html_block_body_lifted_bq_messy(
2127    builder: &mut GreenNodeBuilder<'static>,
2128    pre_content: &str,
2129    content_lines: &[&str],
2130    leading: &str,
2131    close_line_prefix: &str,
2132    prefix: &ContainerPrefix,
2133    demote_policy: LastParaDemote,
2134    config: &ParserOptions,
2135) {
2136    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2137    if !pre_content.is_empty() {
2138        prefix_lines.push(ContainerPrefixLine::default());
2139    }
2140    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2141    for cl in content_lines {
2142        let (li, bq, inner) = prefix.split(cl);
2143        prefix_lines.push(ContainerPrefixLine {
2144            list_indent: li.to_string(),
2145            bq_prefix: bq.to_string(),
2146        });
2147        stripped_lines.push(inner);
2148    }
2149    if !leading.is_empty() {
2150        // The close line carries its own captured prefix bytes; treat
2151        // them as bq-prefix only (no list-indent split applied) to keep
2152        // the legacy bq-only re-injection behavior for messy-shape
2153        // close-line lifts.
2154        prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2155    }
2156    let mut state = ContainerPrefixState::new(prefix_lines);
2157    emit_html_block_body_lifted_inner(
2158        builder,
2159        pre_content,
2160        &stripped_lines,
2161        leading,
2162        demote_policy,
2163        config,
2164        &mut state,
2165    )
2166}
2167
2168fn emit_html_block_body_lifted_inner(
2169    builder: &mut GreenNodeBuilder<'static>,
2170    pre_content: &str,
2171    content_lines: &[&str],
2172    post_content: &str,
2173    demote_policy: LastParaDemote,
2174    config: &ParserOptions,
2175    bq: &mut Option<ContainerPrefixState>,
2176) {
2177    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2178        return;
2179    }
2180    let mut inner_text = String::with_capacity(
2181        pre_content.len()
2182            + content_lines.iter().map(|s| s.len()).sum::<usize>()
2183            + post_content.len(),
2184    );
2185    inner_text.push_str(pre_content);
2186    for line in content_lines {
2187        inner_text.push_str(line);
2188    }
2189    inner_text.push_str(post_content);
2190
2191    let mut inner_options = config.clone();
2192    let refdefs = config.refdef_labels.clone().unwrap_or_default();
2193    inner_options.refdef_labels = Some(refdefs.clone());
2194    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2195    graft_document_children(builder, &inner_root, demote_policy, bq);
2196}
2197
2198/// Walk a parsed inner document's top-level children and re-emit them
2199/// into `builder`. The document's wrapper node is skipped — only its
2200/// children are grafted.
2201///
2202/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2203/// as `PLAIN` — see [`LastParaDemote`].
2204///
2205/// `bq` is `Some` when grafting a body that lived inside an outer
2206/// container (blockquote, list-item, or both) — token emission then
2207/// injects the captured per-line prefix tokens at line starts so the
2208/// CST stays byte-equal to source. See
2209/// [`super::container_prefix::ContainerPrefixState`].
2210fn graft_document_children(
2211    builder: &mut GreenNodeBuilder<'static>,
2212    doc: &SyntaxNode,
2213    demote_policy: LastParaDemote,
2214    bq: &mut Option<ContainerPrefixState>,
2215) {
2216    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2217
2218    let mut demote_idx: Option<usize> = None;
2219    match demote_policy {
2220        LastParaDemote::Never => {}
2221        LastParaDemote::SkipTrailingBlanks => {
2222            for (i, c) in children.iter().enumerate().rev() {
2223                if let rowan::NodeOrToken::Node(n) = c {
2224                    if n.kind() == SyntaxKind::BLANK_LINE {
2225                        continue;
2226                    }
2227                    if n.kind() == SyntaxKind::PARAGRAPH {
2228                        demote_idx = Some(i);
2229                    }
2230                    break;
2231                }
2232            }
2233        }
2234        LastParaDemote::OnlyIfLast => {
2235            for (i, c) in children.iter().enumerate().rev() {
2236                if let rowan::NodeOrToken::Node(n) = c {
2237                    if n.kind() == SyntaxKind::PARAGRAPH {
2238                        demote_idx = Some(i);
2239                    }
2240                    break;
2241                }
2242            }
2243        }
2244    }
2245
2246    for (i, child) in children.into_iter().enumerate() {
2247        match child {
2248            rowan::NodeOrToken::Node(n) => {
2249                if Some(i) == demote_idx {
2250                    graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2251                } else {
2252                    graft_subtree(builder, &n, bq);
2253                }
2254            }
2255            rowan::NodeOrToken::Token(t) => {
2256                emit_grafted_token(builder, t.kind(), t.text(), bq);
2257            }
2258        }
2259    }
2260}
2261
2262/// Recursively re-emit `node` and its descendants into `builder`.
2263/// Token text is copied verbatim so the result is byte-identical to
2264/// the input span (modulo bq prefix tokens injected at line starts
2265/// when `bq` is `Some`).
2266fn graft_subtree(
2267    builder: &mut GreenNodeBuilder<'static>,
2268    node: &SyntaxNode,
2269    bq: &mut Option<ContainerPrefixState>,
2270) {
2271    graft_subtree_as(builder, node, node.kind(), bq);
2272}
2273
2274/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2275/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2276/// the close-butted demotion rule.
2277fn graft_subtree_as(
2278    builder: &mut GreenNodeBuilder<'static>,
2279    node: &SyntaxNode,
2280    kind: SyntaxKind,
2281    bq: &mut Option<ContainerPrefixState>,
2282) {
2283    builder.start_node(kind.into());
2284    for child in node.children_with_tokens() {
2285        match child {
2286            rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2287            rowan::NodeOrToken::Token(t) => {
2288                emit_grafted_token(builder, t.kind(), t.text(), bq);
2289            }
2290        }
2291    }
2292    builder.finish_node();
2293}
2294
2295/// Emit a single token while optionally injecting blockquote prefix
2296/// tokens at line starts. When `bq` is `None`, this is a plain
2297/// `builder.token()` passthrough.
2298fn emit_grafted_token(
2299    builder: &mut GreenNodeBuilder<'static>,
2300    kind: SyntaxKind,
2301    text: &str,
2302    bq: &mut Option<ContainerPrefixState>,
2303) {
2304    if let Some(state) = bq.as_mut() {
2305        if state.at_line_start {
2306            if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2307                emit_container_prefix_tokens(builder, line_prefix);
2308            }
2309            state.at_line_start = false;
2310        }
2311        builder.token(kind.into(), text);
2312        // `BLANK_LINE` token represents an entirely blank source line —
2313        // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2314        // token as line-ending so the per-line prefix index advances
2315        // correctly.
2316        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2317            state.line_idx += 1;
2318            state.at_line_start = true;
2319        }
2320    } else {
2321        builder.token(kind.into(), text);
2322    }
2323}
2324
2325/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2326/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2327fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2328    for ch in prefix.chars() {
2329        if ch == '>' {
2330            builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2331        } else {
2332            let mut buf = [0u8; 4];
2333            builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2334        }
2335    }
2336}
2337
/// Find the byte index, within `line`, of the `>` that closes the open
/// tag `<tag_name ...>` at the start of the line.
///
/// Leading spaces and tabs are skipped. The tag name is compared ASCII
/// case-insensitively. A `>` inside a single- or double-quoted
/// attribute value is ignored. Returns `None` when the line does not
/// start with `<tag_name` or when no unquoted `>` follows it.
///
/// Same scan as `probe_open_tag_line_has_close_gt`, but it reports the
/// position so the caller can slice off what follows the open tag.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let tag_src = line.trim_start_matches([' ', '\t']);
    let indent_len = line.len() - tag_src.len();
    let src = tag_src.as_bytes();

    // `<` + tag name, and at least one more byte after it.
    let head_len = tag_name.len() + 1;
    if src.len() <= head_len {
        return None;
    }
    if src[0] != b'<' || !src[1..head_len].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in src.iter().enumerate().skip(head_len) {
        match open_quote {
            // Inside a quoted value: only the matching quote matters.
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + offset),
                _ => {}
            },
        }
    }
    None
}
2373
2374/// Whether `slice` begins (after leading ASCII whitespace) with an
2375/// open tag whose name is a Pandoc void block tag (`<source>`,
2376/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2377/// open tags return false.
2378///
2379/// Used by the inline-block matched-pair lift gate: pandoc-native
2380/// abandons the lift when the body's first non-blank content is a
2381/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2382/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2383/// matched-pair lift).
2384fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2385    let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2386    if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2387        return false;
2388    }
2389    let Some(tag_end) = parse_open_tag(trimmed) else {
2390        return false;
2391    };
2392    let bytes = trimmed.as_bytes();
2393    let mut name_end = 1usize;
2394    while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2395    {
2396        name_end += 1;
2397    }
2398    if name_end == 1 {
2399        return false;
2400    }
2401    is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2402}
2403
2404/// Whether the body of an inline-block matched-pair (`<video>...`,
2405/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2406/// a void block tag — the condition under which pandoc-native abandons
2407/// the matched-pair lift. Probes three shapes:
2408///
2409/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2410///   after the open `>` on `first_inner` start with `<source`.
2411/// - **Single-line open + multi-line body**: open-trailing on the open
2412///   line is empty/whitespace AND the first non-blank body line
2413///   (`lines[start_pos+1..]`) starts with a void tag.
2414/// - **Multi-line open**: same body-line scan starting at
2415///   `lines[multiline_open_end+1..]`.
2416///
2417/// Returns `false` when the body begins with text, with a close tag,
2418/// or with a non-void block tag — those cases all proceed with the
2419/// matched-pair lift.
2420fn inline_block_void_interior_abandons(
2421    first_inner: &str,
2422    lines: &[&str],
2423    start_pos: usize,
2424    multiline_open_end: Option<usize>,
2425    bq_depth: usize,
2426    tag_name: &str,
2427) -> bool {
2428    let (line_no_nl, _) = strip_newline(first_inner);
2429    let (body_start_line_idx, open_trailing) = match multiline_open_end {
2430        Some(end) => (end + 1, ""),
2431        None => {
2432            let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2433            let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2434            (start_pos + 1, trailing)
2435        }
2436    };
2437    let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2438    if !trimmed.is_empty() {
2439        return slice_starts_with_void_block_tag(trimmed);
2440    }
2441    for line in &lines[body_start_line_idx..] {
2442        let inner = if bq_depth > 0 {
2443            strip_n_blockquote_markers(line, bq_depth)
2444        } else {
2445            line
2446        };
2447        let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2448        if trimmed.is_empty() {
2449            continue;
2450        }
2451        return slice_starts_with_void_block_tag(trimmed);
2452    }
2453    false
2454}
2455
/// Check that `line` starts (after spaces/tabs) with `<tag_name` and
/// that an unquoted `>` follows the tag name on the same line.
///
/// The tag name is compared ASCII case-insensitively, and a `>` inside
/// a single- or double-quoted attribute value does not count. Content
/// after the closing `>` is allowed (the open-trailing shape
/// `<form>foo`); capturing it is the caller's job.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let src = line.trim_start_matches([' ', '\t']).as_bytes();

    // Need `<`, the tag name, and at least one byte after the name.
    let name_end = tag_name.len() + 1;
    if src.len() <= name_end {
        return false;
    }
    let (head, tail) = src.split_at(name_end);
    if head[0] != b'<' || !head[1..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return false;
    }

    let mut open_quote: Option<u8> = None;
    for &byte in tail {
        match open_quote {
            Some(q) if byte == q => open_quote = None,
            // Anything else inside a quoted value is skipped.
            Some(_) => {}
            None if byte == b'>' => return true,
            None if byte == b'"' || byte == b'\'' => open_quote = Some(byte),
            None => {}
        }
    }
    false
}
2490
2491/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2492/// be lifted structurally. Returns `true` only when:
2493/// - The line starts with `<tag_name` (modulo leading whitespace).
2494/// - The open tag's `>` exists with proper quote handling.
2495/// - The bytes after the open `>` contain a depth-zero matched
2496///   `</tag_name>` close (depth-aware: nested `<tag>` opens
2497///   increment depth; matching is case-insensitive, quote-aware).
2498///
2499/// Trailing bytes after the matched close are accepted and grafted
2500/// as a sibling block by the caller. Examples:
2501/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2502/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2503///   recursively parses to a `RawBlock`).
2504/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2505///   parsed recursively), trailing=`bar`.
2506fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2507    let bytes = line.as_bytes();
2508    let indent_end = bytes
2509        .iter()
2510        .position(|&b| b != b' ' && b != b'\t')
2511        .unwrap_or(bytes.len());
2512    let rest = &line[indent_end..];
2513    let rest_bytes = rest.as_bytes();
2514    let prefix_len = 1 + tag_name.len();
2515    if rest_bytes.len() < prefix_len
2516        || rest_bytes[0] != b'<'
2517        || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2518    {
2519        return false;
2520    }
2521    let after_name = &rest[prefix_len..];
2522    let after_name_bytes = after_name.as_bytes();
2523    let mut i = 0usize;
2524    let mut quote: Option<u8> = None;
2525    let mut gt_idx: Option<usize> = None;
2526    while i < after_name_bytes.len() {
2527        match (quote, after_name_bytes[i]) {
2528            (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2529            (Some(q), b2) if b2 == q => quote = None,
2530            (None, b'>') => {
2531                gt_idx = Some(i);
2532                break;
2533            }
2534            _ => {}
2535        }
2536        i += 1;
2537    }
2538    let Some(gt_idx) = gt_idx else {
2539        return false;
2540    };
2541    let trailing = &after_name[gt_idx + 1..];
2542    // Depth-aware: walk `trailing` (we begin inside the open tag at
2543    // depth 1). Return true iff a matched `</tag>` exists where depth
2544    // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2545    matched_close_offset(trailing, tag_name).is_some()
2546}
2547
/// Search `trailing` (the bytes following an open `<tag ...>`'s `>`)
/// for the `</tag>` that closes that open tag.
///
/// Depth starts at 1. Every `<tag ...>` open raises it unless it is
/// self-closing (`/>`); every `</tag ...>` lowers it. Tag names are
/// matched ASCII case-insensitively and must be followed by
/// whitespace, `>`, `/` or end of input, so `<divider>` is not a
/// `<div>`. Each `<...>` bracket is scanned to its `>` with quoted
/// attribute values respected, whether or not its name matches.
///
/// Returns `Some((close_start, close_end))`: the offset of the `<` of
/// the matching close tag and the offset one past its `>`.
///
/// Returns `None` when the depth never returns to 0 or when a `<` is
/// found whose bracket has no terminating `>`.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let src = trailing.as_bytes();
    let name = tag_name.as_bytes();

    let mut depth: i32 = 1;
    let mut cursor = 0usize;

    // Jump from one `<` to the next; text in between is irrelevant.
    while let Some(rel) = src[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let is_close = src.get(lt + 1).copied() == Some(b'/');
        let name_start = if is_close { lt + 2 } else { lt + 1 };
        let name_end = name_start + name.len();

        let name_matches = src
            .get(name_start..name_end)
            .map_or(false, |candidate| candidate.eq_ignore_ascii_case(name));
        let is_same_tag = name_matches
            && matches!(
                src.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find this bracket's `>`, skipping quoted attribute values.
        let scan_from = if name_matches { name_end } else { lt + 1 };
        let mut open_quote: Option<u8> = None;
        let mut bracket_end: Option<usize> = None;
        for (pos, &byte) in src.iter().enumerate().skip(scan_from) {
            match open_quote {
                Some(q) => {
                    if byte == q {
                        open_quote = None;
                    }
                }
                None => match byte {
                    b'"' | b'\'' => open_quote = Some(byte),
                    b'>' => {
                        bracket_end = Some(pos);
                        break;
                    }
                    _ => {}
                },
            }
        }

        // Unterminated `<...`: no match can be reported from here on.
        let Some(gt) = bracket_end else {
            return None;
        };

        if is_same_tag {
            if is_close {
                depth -= 1;
                if depth == 0 {
                    return Some((lt, gt + 1));
                }
            } else {
                let self_closing = gt > lt + 1 && src[gt - 1] == b'/';
                if !self_closing {
                    depth += 1;
                }
            }
        }

        cursor = gt + 1;
    }
    None
}
2630
/// Given `close_part` starting with `</tag_name`, return the offset
/// one past the first unquoted `>` that follows, so the caller can
/// split the close marker (`</tag>`) from any same-line trailing text.
///
/// The tag name is compared ASCII case-insensitively. Returns `None`
/// when `close_part` does not start with `</tag_name` or when no
/// unquoted `>` follows; the caller then treats the whole slice as
/// the close marker.
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let src = close_part.as_bytes();

    // `</` + tag name.
    let name_end = tag_name.len() + 2;
    let head = src.get(..name_end)?;
    if !head.starts_with(b"</") || !head[2..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (pos, &byte) in src.iter().enumerate().skip(name_end) {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(pos + 1),
                _ => {}
            },
        }
    }
    None
}
2661
2662/// Try to split the close line of an HTML_BLOCK_DIV body into a
2663/// leading content prefix and a clean `</tag>...` remainder. Returns
2664/// `Some((leading, close_part))` only when the line contains exactly
2665/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2666/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2667/// for missing close tags, or for compound shapes the parser
2668/// shouldn't attempt to lift in this pass.
2669///
2670/// `leading` may be empty (close starts at column 0) or pure
2671/// whitespace (close on an indented line). Both count as "butted" per
2672/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2673/// the trailing paragraph inside the div demotes Para→Plain.
2674fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2675    let (opens, closes) = count_tag_balance(line, tag_name);
2676    if opens != 0 || closes != 1 {
2677        return None;
2678    }
2679    // Locate the close tag's opening `<` by lowercased substring search.
2680    // Safe because we've already established (above) that the line has
2681    // exactly one `</tag>` and no `<tag>` opens, so the first match is
2682    // THE close.
2683    let needle = format!("</{}", tag_name);
2684    let lower = line.to_ascii_lowercase();
2685    let close_lt = lower.find(&needle)?;
2686    Some((&line[..close_lt], &line[close_lt..]))
2687}
2688
2689/// Depth-aware variant of `try_split_close_line` used by the same-line
2690/// lift path. Walks `line` starting at depth 1 (we begin inside the
2691/// open `<tag>`) and splits at the byte position where the matched
2692/// `</tag>` close brings depth to 0. Returns `Some((body,
2693/// close_part))` where `body` is the bytes before the matched-close
2694/// start and `close_part` is the bytes from the matched close onward.
2695///
2696/// Unlike `try_split_close_line` this accepts nested same-tag opens
2697/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2698/// body=`<div>x</div>` (a nested div the body lift parses
2699/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2700/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2701/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2702fn try_split_close_line_depth_aware<'a>(
2703    line: &'a str,
2704    tag_name: &str,
2705) -> Option<(&'a str, &'a str)> {
2706    let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2707    Some((&line[..close_start], &line[close_start..]))
2708}
2709
2710/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2711/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2712/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2713/// + TEXT(">") + TEXT(trailing)?`.
2714///
2715/// Bytes are byte-identical to the source — this only tokenizes at finer
2716/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2717/// region structurally. Falls back to a single TEXT token if the line
2718/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2719/// only retags as the lift kind when this shape was matched).
2720///
2721/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2722/// returned as `&str` instead so the caller can splice them into the
2723/// recursive-parse input for the structural body lift. When false
2724/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2725/// empty slice is returned.
2726fn emit_open_tag_tokens<'a>(
2727    builder: &mut GreenNodeBuilder<'static>,
2728    line: &'a str,
2729    tag_name: &str,
2730    lift_trailing: bool,
2731) -> &'a str {
2732    let bytes = line.as_bytes();
2733    // Leading indent (CommonMark allows up to 3 spaces).
2734    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2735    if indent_end > 0 {
2736        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2737    }
2738    let rest = &line[indent_end..];
2739    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2740    let prefix_len = 1 + tag_name.len();
2741    if !rest.starts_with('<')
2742        || rest.len() < prefix_len
2743        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2744    {
2745        builder.token(SyntaxKind::TEXT.into(), rest);
2746        return "";
2747    }
2748    let after_name = &rest[prefix_len..];
2749    let after_name_bytes = after_name.as_bytes();
2750    // Find the closing `>` of the open tag, respecting quoted attribute values.
2751    let mut i = 0usize;
2752    let mut quote: Option<u8> = None;
2753    let mut tag_close: Option<usize> = None;
2754    while i < after_name_bytes.len() {
2755        let b = after_name_bytes[i];
2756        match (quote, b) {
2757            (None, b'"') | (None, b'\'') => quote = Some(b),
2758            (Some(q), b2) if b2 == q => quote = None,
2759            (None, b'>') => {
2760                tag_close = Some(i);
2761                break;
2762            }
2763            _ => {}
2764        }
2765        i += 1;
2766    }
2767    let Some(tag_close) = tag_close else {
2768        // Open tag has no closing `>` on this line — defensive fallback.
2769        builder.token(SyntaxKind::TEXT.into(), rest);
2770        return "";
2771    };
2772    // Whitespace between the tag name and the attribute region.
2773    let attrs_inner = &after_name[..tag_close];
2774    let ws_end = attrs_inner
2775        .as_bytes()
2776        .iter()
2777        .position(|&b| !matches!(b, b' ' | b'\t'))
2778        .unwrap_or(attrs_inner.len());
2779    let leading_ws = &attrs_inner[..ws_end];
2780    // Strip a trailing self-closing slash and the whitespace before it
2781    // from the attribute region; emit them as TEXT outside the
2782    // HTML_ATTRS node so the structural region only holds attribute
2783    // bytes (not formatting punctuation).
2784    let attrs_after_ws = &attrs_inner[ws_end..];
2785    let mut attr_end = attrs_after_ws.len();
2786    let attr_bytes = attrs_after_ws.as_bytes();
2787    let mut self_close_start = attr_end;
2788    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2789        self_close_start = attr_end - 1;
2790        attr_end = self_close_start;
2791        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2792            attr_end -= 1;
2793        }
2794    }
2795    let attrs_text = &attrs_after_ws[..attr_end];
2796    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2797    let after_self_close = &attrs_after_ws[self_close_start..];
2798
2799    // Use the original source bytes for the `<tag` prefix (preserves
2800    // source casing — losslessness).
2801    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2802    if !leading_ws.is_empty() {
2803        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2804    }
2805    if !attrs_text.is_empty() {
2806        emit_html_attrs_node(builder, attrs_text);
2807    }
2808    if !trailing_text.is_empty() {
2809        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2810    }
2811    if !after_self_close.is_empty() {
2812        builder.token(SyntaxKind::TEXT.into(), after_self_close);
2813    }
2814    builder.token(SyntaxKind::TEXT.into(), ">");
2815    let after_gt = &after_name[tag_close + 1..];
2816    if lift_trailing {
2817        // Return trailing bytes to the caller (will be spliced into the
2818        // recursive-parse input for the body lift).
2819        return after_gt;
2820    }
2821    if !after_gt.is_empty() {
2822        builder.token(SyntaxKind::TEXT.into(), after_gt);
2823    }
2824    ""
2825}
2826
2827/// Detect a multi-line HTML open tag for `tag_name`. Returns
2828/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2829/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2830/// the existing path) or when the `>` is missing entirely.
2831///
2832/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2833/// attribute value doesn't terminate the open tag. Quote state carries
2834/// across line boundaries.
2835fn find_multiline_open_end(
2836    lines: &[&str],
2837    start_pos: usize,
2838    first_inner: &str,
2839    tag_name: &str,
2840    prefix: &ContainerPrefix,
2841) -> Option<usize> {
2842    // Locate the `<tag_name` literal in `first_inner` to start scanning past
2843    // it. Match is ASCII case-insensitive; the parser preserves source casing.
2844    // `first_inner` is already bq-stripped by the caller; subsequent lines are
2845    // stripped inline below via `strip_n_blockquote_markers`.
2846    let trimmed = strip_leading_spaces(first_inner);
2847    let prefix_len = 1 + tag_name.len();
2848    if !trimmed.starts_with('<')
2849        || trimmed.len() < prefix_len
2850        || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2851    {
2852        return None;
2853    }
2854    let leading_indent = first_inner.len() - trimmed.len();
2855    let mut i = leading_indent + prefix_len; // past `<tag_name`
2856    let mut quote: Option<u8> = None;
2857
2858    // Scan first line for an unquoted `>`.
2859    let line0_bytes = first_inner.as_bytes();
2860    while i < line0_bytes.len() {
2861        match (quote, line0_bytes[i]) {
2862            (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2863            (Some(q), x) if x == q => quote = None,
2864            (None, b'>') => return None, // single-line case
2865            _ => {}
2866        }
2867        i += 1;
2868    }
2869
2870    // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2871    // blockquote markers per line so `> ` prefixes don't count toward the
2872    // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2873    let mut line_idx = start_pos + 1;
2874    while line_idx < lines.len() {
2875        let raw = lines[line_idx];
2876        let inner = prefix.strip(raw);
2877        for &b in inner.as_bytes() {
2878            match (quote, b) {
2879                (None, b'"') | (None, b'\'') => quote = Some(b),
2880                (Some(q), x) if x == q => quote = None,
2881                (None, b'>') => return Some(line_idx),
2882                _ => {}
2883            }
2884        }
2885        line_idx += 1;
2886    }
2887
2888    None
2889}
2890
2891/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2892/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2893/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2894/// open tag (no `>` in the remaining input) as paragraph text rather than
2895/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2896/// reparse the same content recursively, causing a stack overflow.
2897///
2898/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2899/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2900/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2901/// the tag eventually closes far away.
2902pub(crate) fn pandoc_html_open_tag_closes(
2903    lines: &[&str],
2904    start_pos: usize,
2905    prefix: &ContainerPrefix,
2906) -> bool {
2907    if start_pos >= lines.len() {
2908        return false;
2909    }
2910    let mut quote: Option<u8> = None;
2911    for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2912        let inner = prefix.strip(line);
2913        let bytes = inner.as_bytes();
2914        let mut i = 0usize;
2915        if offset == start_pos {
2916            while i < bytes.len() && bytes[i] == b' ' {
2917                i += 1;
2918            }
2919            if bytes.get(i) != Some(&b'<') {
2920                return false;
2921            }
2922            i += 1;
2923        }
2924        while i < bytes.len() {
2925            match (quote, bytes[i]) {
2926                (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2927                (Some(q), x) if x == q => quote = None,
2928                (None, b'>') => return true,
2929                _ => {}
2930            }
2931            i += 1;
2932        }
2933    }
2934    false
2935}
2936
2937/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2938/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2939/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2940/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2941/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2942/// Phase 6 structural lift.
2943///
2944/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2945/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2946/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2947/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2948///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
2949///
2950/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2951/// newline; `parse_html_attribute_list` tolerates whitespace.
2952#[allow(clippy::too_many_arguments)]
2953fn emit_multiline_open_tag_with_attrs(
2954    builder: &mut GreenNodeBuilder<'static>,
2955    lines: &[&str],
2956    start_pos: usize,
2957    end_line_idx: usize,
2958    tag_name: &str,
2959    bq_depth: usize,
2960    lift_trailing: bool,
2961    pre_content: &mut String,
2962) {
2963    let prefix_len = 1 + tag_name.len();
2964    for (line_idx, raw) in lines
2965        .iter()
2966        .enumerate()
2967        .take(end_line_idx + 1)
2968        .skip(start_pos)
2969    {
2970        // Strip `bq_depth` blockquote markers from the source line so
2971        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2972        // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2973        // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2974        // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2975        // before this parser runs; re-emitting it here would double
2976        // the bytes and break losslessness.
2977        let stripped = if bq_depth > 0 {
2978            strip_n_blockquote_markers(raw, bq_depth)
2979        } else {
2980            raw
2981        };
2982        let bq_prefix_len = raw.len() - stripped.len();
2983        if bq_prefix_len > 0 && line_idx != start_pos {
2984            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2985        }
2986        let line = stripped;
2987        let (line_no_nl, newline_str) = strip_newline(line);
2988
2989        if line_idx == start_pos {
2990            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2991            // + attrs)?. The closing `>` is on a later line, so any
2992            // remaining bytes after "<{tag_name}" on this line are the
2993            // start of the attribute region.
2994            let bytes = line_no_nl.as_bytes();
2995            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2996            if indent_end > 0 {
2997                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2998            }
2999            // Defensive: caller verified the line starts with `<{tag_name}`.
3000            let after_indent = &line_no_nl[indent_end..];
3001            if after_indent.len() >= prefix_len {
3002                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
3003                let rest = &after_indent[prefix_len..];
3004                emit_attr_region(builder, rest);
3005            } else {
3006                builder.token(SyntaxKind::TEXT.into(), after_indent);
3007            }
3008        } else if line_idx < end_line_idx {
3009            // Pure attribute line.
3010            let bytes = line_no_nl.as_bytes();
3011            let indent_end = bytes
3012                .iter()
3013                .position(|&b| !matches!(b, b' ' | b'\t'))
3014                .unwrap_or(bytes.len());
3015            if indent_end > 0 {
3016                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3017            }
3018            let attrs_text = &line_no_nl[indent_end..];
3019            if !attrs_text.is_empty() {
3020                emit_html_attrs_node(builder, attrs_text);
3021            }
3022        } else {
3023            // Last line: indent + attrs + ">" + trailing.
3024            let bytes = line_no_nl.as_bytes();
3025            let indent_end = bytes
3026                .iter()
3027                .position(|&b| !matches!(b, b' ' | b'\t'))
3028                .unwrap_or(bytes.len());
3029            if indent_end > 0 {
3030                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3031            }
3032            // Find the unquoted `>` byte position in this line.
3033            let mut quote: Option<u8> = None;
3034            let mut gt_pos: Option<usize> = None;
3035            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
3036                let actual_j = indent_end + j;
3037                match (quote, b) {
3038                    (None, b'"') | (None, b'\'') => quote = Some(b),
3039                    (Some(q), x) if x == q => quote = None,
3040                    (None, b'>') => {
3041                        gt_pos = Some(actual_j);
3042                        break;
3043                    }
3044                    _ => {}
3045                }
3046            }
3047            let Some(gt) = gt_pos else {
3048                // Defensive — caller said `>` is on this line.
3049                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3050                if !newline_str.is_empty() {
3051                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3052                }
3053                continue;
3054            };
3055            // Attribute region: between indent_end and gt, with possibly
3056            // trailing whitespace before `>`.
3057            let attrs_region = &line_no_nl[indent_end..gt];
3058            let region_bytes = attrs_region.as_bytes();
3059            // Strip trailing whitespace from attrs region; emit as
3060            // separate WHITESPACE so HTML_ATTRS only contains attribute
3061            // bytes.
3062            let mut attr_end = region_bytes.len();
3063            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3064                attr_end -= 1;
3065            }
3066            let attrs_text = &attrs_region[..attr_end];
3067            let trailing_ws = &attrs_region[attr_end..];
3068            if !attrs_text.is_empty() {
3069                emit_html_attrs_node(builder, attrs_text);
3070            }
3071            if !trailing_ws.is_empty() {
3072                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3073            }
3074            builder.token(SyntaxKind::TEXT.into(), ">");
3075            let after_gt = &line_no_nl[gt + 1..];
3076            if lift_trailing && !after_gt.is_empty() {
3077                // Lift trailing bytes (and the trailing newline) into
3078                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3079                // with `TEXT(">")`. The recursive parse at the close-marker
3080                // site treats `pre_content` as the leading bytes of the
3081                // structural body — same shape produced by `emit_open_tag_tokens`
3082                // for single-line opens.
3083                pre_content.push_str(after_gt);
3084                pre_content.push_str(newline_str);
3085                continue;
3086            }
3087            if !after_gt.is_empty() {
3088                builder.token(SyntaxKind::TEXT.into(), after_gt);
3089            }
3090        }
3091
3092        if !newline_str.is_empty() {
3093            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3094        }
3095    }
3096}
3097
3098/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3099/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3100/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3101/// node is added. Pandoc's projector reads attributes only for `<div>` /
3102/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3103fn emit_multiline_open_tag_simple(
3104    builder: &mut GreenNodeBuilder<'static>,
3105    lines: &[&str],
3106    start_pos: usize,
3107    end_line_idx: usize,
3108    bq_depth: usize,
3109) {
3110    for (line_idx, raw) in lines
3111        .iter()
3112        .enumerate()
3113        .take(end_line_idx + 1)
3114        .skip(start_pos)
3115    {
3116        let stripped = if bq_depth > 0 {
3117            strip_n_blockquote_markers(raw, bq_depth)
3118        } else {
3119            raw
3120        };
3121        let bq_prefix_len = raw.len() - stripped.len();
3122        // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3123        // re-emit prefixes only for subsequent lines.
3124        if bq_prefix_len > 0 && line_idx != start_pos {
3125            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3126        }
3127        let (line_no_nl, newline_str) = strip_newline(stripped);
3128        if !line_no_nl.is_empty() {
3129            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3130        }
3131        if !newline_str.is_empty() {
3132            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3133        }
3134    }
3135}
3136
3137/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3138/// `<div` literal up to end-of-line. Called only from
3139/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3140/// pure attribute (and possibly inter-attribute whitespace).
3141fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3142    if region.is_empty() {
3143        return;
3144    }
3145    let bytes = region.as_bytes();
3146    // Split a leading run of whitespace into a WHITESPACE token so the
3147    // HTML_ATTRS node holds only attribute bytes.
3148    let ws_end = bytes
3149        .iter()
3150        .position(|&b| !matches!(b, b' ' | b'\t'))
3151        .unwrap_or(bytes.len());
3152    if ws_end > 0 {
3153        builder.token(SyntaxKind::WHITESPACE.into(), &region[..ws_end]);
3154    }
3155    let attrs_text = &region[ws_end..];
3156    if !attrs_text.is_empty() {
3157        emit_html_attrs_node(builder, attrs_text);
3158    }
3159}
3160
3161/// Emit one continuation line of an HTML block, preserving any blockquote
3162/// markers as structural tokens (so the CST stays byte-equal to the source
3163/// and downstream consumers can strip them per-context).
3164fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3165    let inner = if bq_depth > 0 {
3166        let stripped = strip_n_blockquote_markers(line, bq_depth);
3167        let prefix_len = line.len() - stripped.len();
3168        if prefix_len > 0 {
3169            for ch in line[..prefix_len].chars() {
3170                if ch == '>' {
3171                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3172                } else {
3173                    let mut buf = [0u8; 4];
3174                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3175                }
3176            }
3177        }
3178        stripped
3179    } else {
3180        line
3181    };
3182
3183    let (line_without_newline, newline_str) = strip_newline(inner);
3184    if !line_without_newline.is_empty() {
3185        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3186    }
3187    if !newline_str.is_empty() {
3188        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3189    }
3190}
3191
3192#[cfg(test)]
3193mod tests {
3194    use super::*;
3195
3196    #[test]
3197    fn test_try_parse_html_comment() {
3198        assert_eq!(
3199            try_parse_html_block_start("<!-- comment -->", false),
3200            Some(HtmlBlockType::Comment)
3201        );
3202        assert_eq!(
3203            try_parse_html_block_start("  <!-- comment -->", false),
3204            Some(HtmlBlockType::Comment)
3205        );
3206    }
3207
3208    #[test]
3209    fn test_try_parse_div_tag() {
3210        assert_eq!(
3211            try_parse_html_block_start("<div>", false),
3212            Some(HtmlBlockType::BlockTag {
3213                tag_name: "div".to_string(),
3214                is_verbatim: false,
3215                closed_by_blank_line: false,
3216                depth_aware: true,
3217                closes_at_open_tag: false,
3218                is_closing: false,
3219            })
3220        );
3221        assert_eq!(
3222            try_parse_html_block_start("<div class=\"test\">", false),
3223            Some(HtmlBlockType::BlockTag {
3224                tag_name: "div".to_string(),
3225                is_verbatim: false,
3226                closed_by_blank_line: false,
3227                depth_aware: true,
3228                closes_at_open_tag: false,
3229                is_closing: false,
3230            })
3231        );
3232    }
3233
3234    #[test]
3235    fn test_try_parse_script_tag() {
3236        assert_eq!(
3237            try_parse_html_block_start("<script>", false),
3238            Some(HtmlBlockType::BlockTag {
3239                tag_name: "script".to_string(),
3240                is_verbatim: true,
3241                closed_by_blank_line: false,
3242                depth_aware: true,
3243                closes_at_open_tag: false,
3244                is_closing: false,
3245            })
3246        );
3247    }
3248
3249    #[test]
3250    fn test_try_parse_processing_instruction() {
3251        assert_eq!(
3252            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3253            Some(HtmlBlockType::ProcessingInstruction)
3254        );
3255    }
3256
3257    #[test]
3258    fn test_try_parse_declaration() {
3259        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3260        assert_eq!(
3261            try_parse_html_block_start("<!DOCTYPE html>", true),
3262            Some(HtmlBlockType::Declaration)
3263        );
3264        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3265        // just uppercase. Lowercase doctype must match too.
3266        assert_eq!(
3267            try_parse_html_block_start("<!doctype html>", true),
3268            Some(HtmlBlockType::Declaration)
3269        );
3270        // Pandoc dialect does not — bare declarations fall through to
3271        // paragraph parsing.
3272        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3273        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3274    }
3275
3276    #[test]
3277    fn test_dialect_specific_block_tag_membership() {
3278        // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3279        // CommonMark §4.6 type-6 plus a few additions. These tags
3280        // diverge between dialects:
3281        //   CM-only block tags (Pandoc treats as inline raw HTML):
3282        //     dialog, legend, menuitem, optgroup, option, frame,
3283        //     base, basefont, link, param
3284        //   Pandoc-only block tags (CM doesn't recognize):
3285        //     canvas, hgroup, isindex, meta, output
3286        for cm_only in [
3287            "<dialog>",
3288            "<legend>",
3289            "<menuitem>",
3290            "<optgroup>",
3291            "<option>",
3292            "<frame>",
3293            "<base>",
3294            "<basefont>",
3295            "<link>",
3296            "<param>",
3297        ] {
3298            assert!(
3299                matches!(
3300                    try_parse_html_block_start(cm_only, true),
3301                    Some(HtmlBlockType::BlockTag { .. })
3302                ),
3303                "{cm_only} should be a block-tag start under CommonMark",
3304            );
3305            assert_eq!(
3306                try_parse_html_block_start(cm_only, false),
3307                None,
3308                "{cm_only} should NOT be a block-tag start under Pandoc",
3309            );
3310        }
3311        for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3312            // Under CM these are not type-6 BlockTags; they may still match
3313            // type-7 (complete tag on a line) which has different semantics.
3314            assert!(
3315                !matches!(
3316                    try_parse_html_block_start(pandoc_only, true),
3317                    Some(HtmlBlockType::BlockTag { .. })
3318                ),
3319                "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3320            );
3321            assert!(
3322                matches!(
3323                    try_parse_html_block_start(pandoc_only, false),
3324                    Some(HtmlBlockType::BlockTag { .. })
3325                ),
3326                "{pandoc_only} should be a block-tag start under Pandoc",
3327            );
3328        }
3329    }
3330
3331    #[test]
3332    fn test_pandoc_inline_block_tag_membership() {
3333        // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3334        // fresh-block positions under Pandoc dialect. We list the
3335        // non-void, non-script subset (verbatim `script` is handled
3336        // via the verbatim path; void elements are deferred — see
3337        // PANDOC_INLINE_BLOCK_TAGS docs).
3338        for tag in [
3339            "<button>",
3340            "<iframe>",
3341            "<video>",
3342            "<audio>",
3343            "<noscript>",
3344            "<object>",
3345            "<map>",
3346            "<progress>",
3347            "<del>",
3348            "<ins>",
3349            "<svg>",
3350            "<applet>",
3351        ] {
3352            assert!(
3353                matches!(
3354                    try_parse_html_block_start(tag, false),
3355                    Some(HtmlBlockType::BlockTag {
3356                        depth_aware: true,
3357                        ..
3358                    })
3359                ),
3360                "{tag} should be a depth-aware block-tag start under Pandoc",
3361            );
3362        }
3363        // Closing forms of inline-block tags also start a block under
3364        // Pandoc — pandoc-native pins `</button>` standalone as a
3365        // single-line `RawBlock`. These use `closes_at_open_tag: true`
3366        // (no balanced match — the close emits as a one-line block on
3367        // its own).
3368        for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3369            assert!(
3370                matches!(
3371                    try_parse_html_block_start(closing, false),
3372                    Some(HtmlBlockType::BlockTag {
3373                        depth_aware: false,
3374                        closes_at_open_tag: true,
3375                        ..
3376                    })
3377                ),
3378                "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3379            );
3380        }
3381    }
3382
3383    #[test]
3384    fn test_pandoc_void_block_tag_membership() {
3385        // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3386        // at fresh-block positions under Pandoc dialect, with
3387        // `closes_at_open_tag: true` — the block always ends on the
3388        // open-tag line (no closing tag to match).
3389        for tag in [
3390            "<area>",
3391            "<embed>",
3392            "<source>",
3393            "<track>",
3394            "<embed src=\"foo.swf\">",
3395            "<source src=\"foo.mp4\" type=\"video/mp4\">",
3396        ] {
3397            assert!(
3398                matches!(
3399                    try_parse_html_block_start(tag, false),
3400                    Some(HtmlBlockType::BlockTag {
3401                        depth_aware: false,
3402                        closes_at_open_tag: true,
3403                        ..
3404                    })
3405                ),
3406                "{tag} should be a void block-tag start under Pandoc",
3407            );
3408        }
3409        // Closing forms of void tags also start a single-line block
3410        // under Pandoc. Void elements have no closing tag in HTML, but
3411        // `</embed>` etc. can appear in the wild — pandoc-native still
3412        // emits them as `RawBlock`s at fresh-block positions; mirror
3413        // that with the same `closes_at_open_tag: true` shape.
3414        for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3415            assert!(
3416                matches!(
3417                    try_parse_html_block_start(closing, false),
3418                    Some(HtmlBlockType::BlockTag {
3419                        depth_aware: false,
3420                        closes_at_open_tag: true,
3421                        ..
3422                    })
3423                ),
3424                "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3425            );
3426        }
3427        // Under CommonMark dialect, the void-tag block-start path is
3428        // skipped. `<source>` and `<track>` are in the CM type-6
3429        // BLOCK_TAGS set so they DO start a block, but with CM type-6
3430        // semantics (`closed_by_blank_line: true`,
3431        // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3432        // `<embed>` and `<area>` aren't in the CM type-6 list — they
3433        // fall through to type 7 (complete tag on a line by itself).
3434        assert_eq!(
3435            try_parse_html_block_start("<embed>", true),
3436            Some(HtmlBlockType::Type7)
3437        );
3438        assert_eq!(
3439            try_parse_html_block_start("<area>", true),
3440            Some(HtmlBlockType::Type7)
3441        );
3442        assert!(matches!(
3443            try_parse_html_block_start("<source src=\"x\">", true),
3444            Some(HtmlBlockType::BlockTag {
3445                closed_by_blank_line: true,
3446                closes_at_open_tag: false,
3447                ..
3448            })
3449        ));
3450        assert!(matches!(
3451            try_parse_html_block_start("<track src=\"x\">", true),
3452            Some(HtmlBlockType::BlockTag {
3453                closed_by_blank_line: true,
3454                closes_at_open_tag: false,
3455                ..
3456            })
3457        ));
3458    }
3459
3460    #[test]
3461    fn test_find_multiline_open_end() {
3462        // Single-line opens return None (caller takes the regular path).
3463        assert_eq!(
3464            find_multiline_open_end(
3465                &["<div id=\"x\">"],
3466                0,
3467                "<div id=\"x\">",
3468                "div",
3469                &ContainerPrefix::default()
3470            ),
3471            None
3472        );
3473        assert_eq!(
3474            find_multiline_open_end(
3475                &["<embed src=\"x\">"],
3476                0,
3477                "<embed src=\"x\">",
3478                "embed",
3479                &ContainerPrefix::default()
3480            ),
3481            None
3482        );
3483        // Multi-line opens return the line index of the closing `>`.
3484        assert_eq!(
3485            find_multiline_open_end(
3486                &["<embed", "  src=\"x\">"],
3487                0,
3488                "<embed",
3489                "embed",
3490                &ContainerPrefix::default()
3491            ),
3492            Some(1)
3493        );
3494        assert_eq!(
3495            find_multiline_open_end(
3496                &["<embed", "  src=\"x\"", "  type=\"video\">"],
3497                0,
3498                "<embed",
3499                "embed",
3500                &ContainerPrefix::default()
3501            ),
3502            Some(2)
3503        );
3504        // Tag-name mismatch returns None (case-insensitive on the tag name).
3505        assert_eq!(
3506            find_multiline_open_end(
3507                &["<embed", "  src=\"x\">"],
3508                0,
3509                "<embed",
3510                "div",
3511                &ContainerPrefix::default()
3512            ),
3513            None
3514        );
3515        assert_eq!(
3516            find_multiline_open_end(
3517                &["<EMBED", "  src=\"x\">"],
3518                0,
3519                "<EMBED",
3520                "embed",
3521                &ContainerPrefix::default()
3522            ),
3523            Some(1)
3524        );
3525        // Quoted `>` does not terminate the open tag; quote state threads
3526        // across line boundaries.
3527        assert_eq!(
3528            find_multiline_open_end(
3529                &["<embed title=\"a>b", "  c\">"],
3530                0,
3531                "<embed title=\"a>b",
3532                "embed",
3533                &ContainerPrefix::default()
3534            ),
3535            Some(1)
3536        );
3537        // No `>` anywhere returns None.
3538        assert_eq!(
3539            find_multiline_open_end(
3540                &["<embed", "  src=\"x\""],
3541                0,
3542                "<embed",
3543                "embed",
3544                &ContainerPrefix::default()
3545            ),
3546            None
3547        );
3548        // Subsequent lines inside a blockquote: bq markers stripped before
3549        // scanning so `> ` prefixes don't count.
3550        assert_eq!(
3551            find_multiline_open_end(
3552                &["<div", ">   id=\"x\">"],
3553                0,
3554                "<div",
3555                "div",
3556                &ContainerPrefix::bq_only(1)
3557            ),
3558            Some(1)
3559        );
3560        // Nested bq: strips two `> ` per line.
3561        assert_eq!(
3562            find_multiline_open_end(
3563                &["<section", "> >   id=\"x\">"],
3564                0,
3565                "<section",
3566                "section",
3567                &ContainerPrefix::bq_only(2)
3568            ),
3569            Some(1)
3570        );
3571    }
3572
3573    #[test]
3574    fn test_pandoc_html_open_tag_closes() {
3575        // Single-line complete: scanner finds `>` on the first line.
3576        assert!(pandoc_html_open_tag_closes(
3577            &["<div>"],
3578            0,
3579            &ContainerPrefix::default()
3580        ));
3581        assert!(pandoc_html_open_tag_closes(
3582            &["<embed src=\"x\">"],
3583            0,
3584            &ContainerPrefix::default()
3585        ));
3586        // Multi-line complete: scanner finds `>` on a later line.
3587        assert!(pandoc_html_open_tag_closes(
3588            &["<div", "  id=\"x\">", "body", "</div>"],
3589            0,
3590            &ContainerPrefix::default()
3591        ));
3592        assert!(pandoc_html_open_tag_closes(
3593            &["<embed", "  src=\"x.png\" alt=\"y\">"],
3594            0,
3595            &ContainerPrefix::default()
3596        ));
3597        // Quoted `>` does not close: scanner threads quote state.
3598        assert!(!pandoc_html_open_tag_closes(
3599            &["<div title=\"a>b", "  c\""],
3600            0,
3601            &ContainerPrefix::default()
3602        ));
3603        assert!(pandoc_html_open_tag_closes(
3604            &["<div title=\"a>b", "  c\">"],
3605            0,
3606            &ContainerPrefix::default()
3607        ));
3608        // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
3609        assert!(!pandoc_html_open_tag_closes(
3610            &["<embed"],
3611            0,
3612            &ContainerPrefix::default()
3613        ));
3614        assert!(!pandoc_html_open_tag_closes(
3615            &["<div", "foo", "bar"],
3616            0,
3617            &ContainerPrefix::default()
3618        ));
3619        // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
3620        // across them); the scan continues until EOF or `>`.
3621        assert!(pandoc_html_open_tag_closes(
3622            &["<div", "", "id=\"x\">"],
3623            0,
3624            &ContainerPrefix::default()
3625        ));
3626    }
3627
3628    #[test]
3629    fn test_try_parse_cdata() {
3630        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
3631        assert_eq!(
3632            try_parse_html_block_start("<![CDATA[content]]>", true),
3633            Some(HtmlBlockType::CData)
3634        );
3635        // Pandoc dialect does not.
3636        assert_eq!(
3637            try_parse_html_block_start("<![CDATA[content]]>", false),
3638            None
3639        );
3640    }
3641
3642    #[test]
3643    fn test_extract_block_tag_name_open_only() {
3644        assert_eq!(
3645            extract_block_tag_name("<div>", false),
3646            Some("div".to_string())
3647        );
3648        assert_eq!(
3649            extract_block_tag_name("<div class=\"test\">", false),
3650            Some("div".to_string())
3651        );
3652        assert_eq!(
3653            extract_block_tag_name("<div/>", false),
3654            Some("div".to_string())
3655        );
3656        assert_eq!(extract_block_tag_name("</div>", false), None);
3657        assert_eq!(extract_block_tag_name("<>", false), None);
3658        assert_eq!(extract_block_tag_name("< div>", false), None);
3659    }
3660
3661    #[test]
3662    fn test_extract_block_tag_name_with_closing() {
3663        // CommonMark §4.6 type-6 starts also accept closing tags.
3664        assert_eq!(
3665            extract_block_tag_name("</div>", true),
3666            Some("div".to_string())
3667        );
3668        assert_eq!(
3669            extract_block_tag_name("</div >", true),
3670            Some("div".to_string())
3671        );
3672    }
3673
3674    #[test]
3675    fn test_commonmark_type6_closing_tag_start() {
3676        assert_eq!(
3677            try_parse_html_block_start("</div>", true),
3678            Some(HtmlBlockType::BlockTag {
3679                tag_name: "div".to_string(),
3680                is_verbatim: false,
3681                closed_by_blank_line: true,
3682                depth_aware: false,
3683                closes_at_open_tag: false,
3684                is_closing: true,
3685            })
3686        );
3687    }
3688
3689    #[test]
3690    fn test_commonmark_type7_open_tag() {
3691        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
3692        // CommonMark; rejected under non-CommonMark.
3693        assert_eq!(
3694            try_parse_html_block_start("<a href=\"foo\">", true),
3695            Some(HtmlBlockType::Type7)
3696        );
3697        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
3698    }
3699
3700    #[test]
3701    fn test_commonmark_type7_close_tag() {
3702        assert_eq!(
3703            try_parse_html_block_start("</ins>", true),
3704            Some(HtmlBlockType::Type7)
3705        );
3706    }
3707
3708    #[test]
3709    fn test_commonmark_type7_rejects_with_trailing_text() {
3710        // A complete tag must be followed only by whitespace.
3711        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
3712    }
3713
3714    #[test]
3715    fn test_is_closing_marker_comment() {
3716        let block_type = HtmlBlockType::Comment;
3717        assert!(is_closing_marker("-->", &block_type));
3718        assert!(is_closing_marker("end -->", &block_type));
3719        assert!(!is_closing_marker("<!--", &block_type));
3720    }
3721
3722    #[test]
3723    fn test_is_closing_marker_tag() {
3724        let block_type = HtmlBlockType::BlockTag {
3725            tag_name: "div".to_string(),
3726            is_verbatim: false,
3727            closed_by_blank_line: false,
3728            depth_aware: false,
3729            closes_at_open_tag: false,
3730            is_closing: false,
3731        };
3732        assert!(is_closing_marker("</div>", &block_type));
3733        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
3734        assert!(is_closing_marker("content</div>", &block_type));
3735        assert!(!is_closing_marker("<div>", &block_type));
3736    }
3737
3738    #[test]
3739    fn test_parse_html_comment_block() {
3740        let input = "<!-- comment -->\n";
3741        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3742        let mut builder = GreenNodeBuilder::new();
3743
3744        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3745        let opts = ParserOptions::default();
3746        let new_pos = parse_html_block_with_wrapper(
3747            &mut builder,
3748            &lines,
3749            0,
3750            block_type,
3751            &ContainerPrefix::default(),
3752            SyntaxKind::HTML_BLOCK,
3753            &opts,
3754        );
3755
3756        assert_eq!(new_pos, 1);
3757    }
3758
3759    #[test]
3760    fn test_parse_div_block() {
3761        let input = "<div>\ncontent\n</div>\n";
3762        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3763        let mut builder = GreenNodeBuilder::new();
3764
3765        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3766        let opts = ParserOptions::default();
3767        let new_pos = parse_html_block_with_wrapper(
3768            &mut builder,
3769            &lines,
3770            0,
3771            block_type,
3772            &ContainerPrefix::default(),
3773            SyntaxKind::HTML_BLOCK,
3774            &opts,
3775        );
3776
3777        assert_eq!(new_pos, 3);
3778    }
3779
3780    #[test]
3781    fn test_parse_html_block_no_closing() {
3782        let input = "<div>\ncontent\n";
3783        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3784        let mut builder = GreenNodeBuilder::new();
3785
3786        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3787        let opts = ParserOptions::default();
3788        let new_pos = parse_html_block_with_wrapper(
3789            &mut builder,
3790            &lines,
3791            0,
3792            block_type,
3793            &ContainerPrefix::default(),
3794            SyntaxKind::HTML_BLOCK,
3795            &opts,
3796        );
3797
3798        // Should consume all lines even without closing tag
3799        assert_eq!(new_pos, 2);
3800    }
3801
3802    #[test]
3803    fn test_parse_div_block_nested_pandoc() {
3804        // Pandoc dialect: a nested `<div>...<div>...</div>...</div>` must
3805        // close on the OUTER `</div>`, not the first `</div>` seen. The
3806        // CommonMark-style "first close" scanner is wrong here; Pandoc's
3807        // div parser is depth-aware (mirrors `htmlInBalanced`).
3808        let input =
3809            "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
3810        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3811        let mut builder = GreenNodeBuilder::new();
3812
3813        // is_commonmark = false → Pandoc dialect.
3814        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3815        let opts = ParserOptions::default();
3816        let new_pos = parse_html_block_with_wrapper(
3817            &mut builder,
3818            &lines,
3819            0,
3820            block_type,
3821            &ContainerPrefix::default(),
3822            SyntaxKind::HTML_BLOCK_DIV,
3823            &opts,
3824        );
3825
3826        // 9 lines: outer-open, blank, inner-open, blank, content, blank,
3827        // inner-close, blank, outer-close. All consumed.
3828        assert_eq!(new_pos, 9);
3829    }
3830
3831    #[test]
3832    fn test_parse_div_block_same_line_pandoc() {
3833        // <div>foo</div> on a single line: opens=1, closes=1, depth=0 →
3834        // close on first line. Depth-aware tracking must not regress this.
3835        let input = "<div>foo</div>\n";
3836        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3837        let mut builder = GreenNodeBuilder::new();
3838
3839        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3840        let opts = ParserOptions::default();
3841        let new_pos = parse_html_block_with_wrapper(
3842            &mut builder,
3843            &lines,
3844            0,
3845            block_type,
3846            &ContainerPrefix::default(),
3847            SyntaxKind::HTML_BLOCK_DIV,
3848            &opts,
3849        );
3850        assert_eq!(new_pos, 1);
3851    }
3852
3853    #[test]
3854    fn test_commonmark_verbatim_first_close() {
3855        // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
3856        // ends at the first matching close — not depth-aware. Stash a
3857        // bogus inner `<script>` inside a JS string; the outer block
3858        // still closes at the first `</script>`.
3859        let input = "<script>\nlet x = '<script>';\n</script>\n";
3860        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3861        let mut builder = GreenNodeBuilder::new();
3862
3863        // is_commonmark = true.
3864        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3865        let opts = ParserOptions::default();
3866        let new_pos = parse_html_block_with_wrapper(
3867            &mut builder,
3868            &lines,
3869            0,
3870            block_type,
3871            &ContainerPrefix::default(),
3872            SyntaxKind::HTML_BLOCK,
3873            &opts,
3874        );
3875        // Three lines, closed at first `</script>` (line 2). new_pos = 3.
3876        assert_eq!(new_pos, 3);
3877    }
3878
3879    #[test]
3880    fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
3881        // Multi-line open tag with the closing `>` on its own line:
3882        //
3883        //   <div
3884        //     id="x"
3885        //     class="y"
3886        //   >
3887        //
3888        //   foo
3889        //
3890        //   </div>
3891        //
3892        // Open tag spans lines 0..=3. Content starts at line 4.
3893        let input = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
3894        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3895        let mut builder = GreenNodeBuilder::new();
3896
3897        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3898        let opts = ParserOptions::default();
3899        let new_pos = parse_html_block_with_wrapper(
3900            &mut builder,
3901            &lines,
3902            0,
3903            block_type,
3904            &ContainerPrefix::default(),
3905            SyntaxKind::HTML_BLOCK_DIV,
3906            &opts,
3907        );
3908
3909        // 8 lines: open-line 0, open-line 1 (`  id="x"`), open-line 2
3910        // (`  class="y"`), open-line 3 (`>`), blank, foo, blank, </div>.
3911        assert_eq!(new_pos, 8);
3912
3913        // CST must contain a structural HTML_ATTRS region holding the
3914        // attribute bytes (so the salsa anchor walk picks up `id="x"`).
3915        let green = builder.finish();
3916        let root = crate::syntax::SyntaxNode::new_root(green);
3917        let attrs_count = root
3918            .descendants()
3919            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3920            .count();
3921        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3922
3923        // Byte-identical losslessness check.
3924        let collected: String = root
3925            .descendants_with_tokens()
3926            .filter_map(|n| n.into_token())
3927            .map(|t| t.text().to_string())
3928            .collect();
3929        assert_eq!(collected, input);
3930    }
3931
3932    #[test]
3933    fn test_parse_div_block_multiline_open_close_inline_pandoc() {
3934        // Multi-line open tag with the closing `>` on the last attribute
3935        // line (case 0262 already covers this pattern; pin behavior to
3936        // also ensure HTML_ATTRS structural exposure).
3937        let input = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
3938        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3939        let mut builder = GreenNodeBuilder::new();
3940
3941        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3942        let opts = ParserOptions::default();
3943        let new_pos = parse_html_block_with_wrapper(
3944            &mut builder,
3945            &lines,
3946            0,
3947            block_type,
3948            &ContainerPrefix::default(),
3949            SyntaxKind::HTML_BLOCK_DIV,
3950            &opts,
3951        );
3952
3953        // 5 lines: open-line 0, open-line 1, open-line 2 (with `>`), foo,
3954        // </div>.
3955        assert_eq!(new_pos, 5);
3956
3957        let green = builder.finish();
3958        let root = crate::syntax::SyntaxNode::new_root(green);
3959        let attrs_count = root
3960            .descendants()
3961            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3962            .count();
3963        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3964
3965        let collected: String = root
3966            .descendants_with_tokens()
3967            .filter_map(|n| n.into_token())
3968            .map(|t| t.text().to_string())
3969            .collect();
3970        assert_eq!(collected, input);
3971    }
3972
3973    #[test]
3974    fn test_commonmark_type6_blank_line_terminates() {
3975        let input = "<div>\nfoo\n\nbar\n";
3976        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3977        let mut builder = GreenNodeBuilder::new();
3978
3979        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3980        let opts = ParserOptions::default();
3981        let new_pos = parse_html_block_with_wrapper(
3982            &mut builder,
3983            &lines,
3984            0,
3985            block_type,
3986            &ContainerPrefix::default(),
3987            SyntaxKind::HTML_BLOCK,
3988            &opts,
3989        );
3990
3991        // Block contains <div>\nfoo\n; stops at blank line (line 2).
3992        assert_eq!(new_pos, 2);
3993    }
3994}