Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10    ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
13
/// Tag names that start a CommonMark §4.6 type-6 HTML block when they
/// appear at the start of a line.
///
/// Entries must stay lowercase: every lookup in this file matches the
/// candidate name case-insensitively against this list. Rows are grouped
/// by initial letter and kept in alphabetical order.
#[rustfmt::skip]
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
80
/// Tags whose content is raw/verbatim: no Markdown processing happens
/// between the opening tag and its matching close.
///
/// Entries must stay lowercase; lookups match candidate names
/// case-insensitively against this list.
#[rustfmt::skip]
const VERBATIM_TAGS: &[&str] = &[
    "script",
    "style",
    "pre",
    "textarea",
];
83
/// Pandoc's strict block-level tag set (`blockHtmlTags`, mirroring
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`).
///
/// Pandoc-markdown uses this set instead of CommonMark §4.6 type-6:
///
/// - It leaves out several CommonMark type-6 tags that pandoc treats as
///   raw inline HTML (e.g. `dialog`, `legend`, `optgroup`, `option`,
///   `frame`, `link`, `param`, `base`, `basefont`, `menuitem`).
/// - It adds tags pandoc keeps block-level (`canvas`, `hgroup`,
///   `isindex`, `meta`, `output`).
///
/// Pandoc's `eitherBlockOrInline` tags (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`,
/// `applet`, the void elements `embed`, `area`, `source`, `track`, and
/// the verbatim `script`) are tracked separately — see
/// [`PANDOC_INLINE_BLOCK_TAGS`]. Those act as block starters at
/// fresh-block positions but stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries must stay lowercase; rows are grouped by initial letter and
/// kept in alphabetical order.
#[rustfmt::skip]
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "blockquote", "body",
    "canvas", "caption", "center", "col", "colgroup",
    "dd", "details", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "isindex",
    "li",
    "main", "menu", "meta",
    "nav", "noframes",
    "ol", "output",
    "p", "pre",
    "script", "section", "style", "summary",
    "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "tr",
    "ul",
];
161
162/// Whether `name` (case-insensitive) is one of the HTML block-level tags
163/// recognized by CommonMark §4.6 type-6.
164pub fn is_html_block_tag_name(name: &str) -> bool {
165    let lower = name.to_ascii_lowercase();
166    BLOCK_TAGS.contains(&lower.as_str())
167}
168
169/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
170/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
171/// Used by the pandoc-native projector's `split_html_block_by_tags` to
172/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
173/// the block — block-level tags emit as separate `RawBlock` entries;
174/// inline tags stay inline in the surrounding `Plain` content.
175pub fn is_pandoc_block_tag_name(name: &str) -> bool {
176    let lower = name.to_ascii_lowercase();
177    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
178}
179
/// Pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`). At top level (or after a blank line)
/// pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but
/// inside an existing HTML block once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Void elements (`area`, `embed`, `source`, `track`) live in
/// [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
/// rule as non-void inline-block tags but emit a single RawBlock per
/// instance instead of a matched-pair lift.
/// `script` is omitted because it is already verbatim (handled by the
/// `<script>...</script>` raw-text path) and the strict-block check
/// fires first regardless.
///
/// Entries must stay lowercase; lookups compare case-insensitively.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];

/// Whether `name` is one of pandoc's `eitherBlockOrInline` tags
/// (excluding void elements and `script`; see
/// [`PANDOC_INLINE_BLOCK_TAGS`]).
///
/// The comparison is ASCII-case-insensitive and does not allocate.
pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
    PANDOC_INLINE_BLOCK_TAGS
        .iter()
        .any(|tag| tag.eq_ignore_ascii_case(name))
}
212
/// Pandoc's void-element subset of `eitherBlockOrInline` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`'s void list
/// minus those handled elsewhere: `br` and `wbr` are inline-only;
/// `img` and `input` are inline-only; HTML void elements that pandoc
/// classifies as `eitherBlockOrInline` are `area`, `embed`, `source`,
/// `track`).
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Entries must stay lowercase; lookups compare case-insensitively.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];

/// Whether `name` is one of pandoc's void `eitherBlockOrInline` tags
/// (`area`, `embed`, `source`, `track`).
///
/// The comparison is ASCII-case-insensitive and does not allocate.
pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
    PANDOC_VOID_BLOCK_TAGS
        .iter()
        .any(|tag| tag.eq_ignore_ascii_case(name))
}
235
236/// Whether the given tag name is eligible for the Phase 6 / Fix #4
237/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
238/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
239/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
240/// and NOT void. These are the tags where pandoc parses the body as
241/// fresh markdown between RawBlock emissions of the open/close tags —
242/// exactly the shape we can lift into structural CST children.
243///
244/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
245/// additional gate at the lift-gate site: the lift is abandoned when
246/// the body's first non-blank content is a void block tag at a
247/// fresh-block position (`<video>\n<source ...>\n</video>` projects
248/// per-tag rather than matched-pair, mirroring pandoc).
249///
250/// `<div>` is intentionally excluded — it has its own lift path
251/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
252/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
253pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
254    let lower = name.to_ascii_lowercase();
255    if VERBATIM_TAGS.contains(&lower.as_str()) {
256        return false;
257    }
258    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
259        return false;
260    }
261    if lower == "div" {
262        return false;
263    }
264    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
265        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
266}
267
268/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
269/// — anything that has an opening and a matching closing form whose
270/// `</tag>` would be recognized by the dispatcher as a separate block
271/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
272/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
273/// Void tags are excluded — they have no close form.
274///
275/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
276/// an open inside the buffer whose close would otherwise interrupt the
277/// list item mid-construct.
278pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
279    let lower = name.to_ascii_lowercase();
280    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
281        return false;
282    }
283    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
284        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
285        || VERBATIM_TAGS.contains(&lower.as_str())
286}
287
288/// Open-tag-attribute tokenization gate for non-div strict-block tags
289/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
290/// open tag is eligible for finer-grained tokenization
291/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
292/// driving the full body lift — that's the `bq_clean_lift` path. The
293/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
294/// the salsa anchor index.
295///
296/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
297/// wrapper) regardless of bq depth, so this gate skips it.
298fn bq_strict_attr_emit_tag_name(
299    wrapper_kind: SyntaxKind,
300    block_type: &HtmlBlockType,
301    bq_depth: usize,
302) -> Option<&str> {
303    if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
304        return None;
305    }
306    match block_type {
307        HtmlBlockType::BlockTag {
308            tag_name,
309            is_verbatim: false,
310            closed_by_blank_line: false,
311            depth_aware: true,
312            closes_at_open_tag: false,
313            is_closing: false,
314        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
315        _ => None,
316    }
317}
318
/// Classification of a detected HTML block opening, as produced by
/// `try_parse_html_block_start`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: `<!-- ... -->`.
    Comment,
    /// Processing instruction: `<? ... ?>`.
    ProcessingInstruction,
    /// Declaration: `<!...>`.
    Declaration,
    /// CDATA section: `<![CDATA[ ... ]]>`.
    CData,
    /// A block opened by a recognized tag (CommonMark types 6/1, plus the
    /// Pandoc-dialect block, inline-block and void tag sets). The flags
    /// describe how the block is terminated.
    BlockTag {
        /// Name of the tag that opened the block.
        tag_name: String,
        /// The tag holds raw/verbatim content (no Markdown processing
        /// inside).
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics: the block ends at the
        /// next blank line. When false, the legacy "ends at matching
        /// `</tag>`" semantics apply.
        closed_by_blank_line: bool,
        /// Extend the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored
        /// when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuit the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`), which have no
        /// closing tag — depth-aware matching would walk to end-of-input.
        closes_at_open_tag: bool,
        /// The tag at the start position is a closing form (`</tag>`)
        /// rather than an opening form (`<tag>`). The dispatcher's
        /// `cannot_interrupt` consults this to mirror pandoc's
        /// `isInlineTag` special cases (e.g. `</script>` is inline even
        /// when `<script>` is not — pandoc treats the close-form as
        /// always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
363
364/// Try to detect an HTML block opening from content.
365/// Returns block type if this is a valid HTML block start.
366///
367/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
368/// accept closing tags (`</div>`), type-6 blocks end at the next blank
369/// line (rather than a matching close tag), and type 7 is recognized.
370pub(crate) fn try_parse_html_block_start(
371    content: &str,
372    is_commonmark: bool,
373) -> Option<HtmlBlockType> {
374    let trimmed = strip_leading_spaces(content);
375
376    // Must start with <
377    if !trimmed.starts_with('<') {
378        return None;
379    }
380
381    // HTML comment
382    if trimmed.starts_with("<!--") {
383        return Some(HtmlBlockType::Comment);
384    }
385
386    // Processing instruction
387    if trimmed.starts_with("<?") {
388        return Some(HtmlBlockType::ProcessingInstruction);
389    }
390
391    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
392    // recognize bare CDATA as a raw HTML block; the literal bytes fall
393    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
394    // text is parsed as inline markdown, etc).
395    if is_commonmark && trimmed.starts_with("<![CDATA[") {
396        return Some(HtmlBlockType::CData);
397    }
398
399    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
400    // does not recognize bare declarations as raw HTML blocks (its
401    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
402    // tag-shaped blocks); the bytes fall through to paragraph parsing.
403    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
404        let after_bang = &trimmed[2..];
405        if after_bang.chars().next()?.is_ascii_alphabetic() {
406            return Some(HtmlBlockType::Declaration);
407        }
408    }
409
410    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
411    // Pandoc-native recognizes standalone closing forms of strict-block tags
412    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
413    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
414    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
415    // the open-tag line via `closes_at_open_tag: true`.
416    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
417        let tag_lower = tag_name.to_lowercase();
418        let is_closing = trimmed.starts_with("</");
419
420        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
421        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
422        // Unlike inline-block / void closes, these CAN interrupt a running
423        // paragraph (the dispatcher's `cannot_interrupt` only covers the
424        // inline-block / void categories). Inline-block / void closes are
425        // handled by their own branches further below.
426        if !is_commonmark
427            && is_closing
428            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
429                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
430            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
431            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
432        {
433            return Some(HtmlBlockType::BlockTag {
434                tag_name: tag_lower,
435                is_verbatim: false,
436                closed_by_blank_line: false,
437                depth_aware: false,
438                closes_at_open_tag: true,
439                is_closing: true,
440            });
441        }
442
443        // Under Pandoc, remaining closing forms (truly inline-only tags like
444        // `</em>`, `</span>`) are not block starts — fall through to the
445        // existing inline-html path. Inline-block + void closes are caught
446        // by the dedicated branches further below.
447        if !is_commonmark
448            && is_closing
449            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
450            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
451        {
452            return None;
453        }
454
455        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
456        // membership: pandoc's `blockHtmlTags` (see
457        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
458        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
459        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
460        let is_block_tag = if is_commonmark {
461            BLOCK_TAGS.contains(&tag_lower.as_str())
462        } else {
463            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
464        };
465        if is_block_tag {
466            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
467            return Some(HtmlBlockType::BlockTag {
468                tag_name: tag_lower,
469                is_verbatim,
470                closed_by_blank_line: is_commonmark && !is_verbatim,
471                depth_aware: !is_commonmark,
472                closes_at_open_tag: false,
473                is_closing,
474            });
475        }
476
477        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
478        // starters at fresh-block positions. The block dispatcher caller
479        // gates these as `cannot_interrupt` (mirrors pandoc — they never
480        // interrupt a running paragraph; only start a fresh block when
481        // following a blank line or at document start). Closing forms
482        // (`</video>`) emit as a single-line `RawBlock` with no balanced
483        // match — pandoc-native pins this for standalone closes.
484        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
485            return Some(HtmlBlockType::BlockTag {
486                tag_name: tag_lower,
487                is_verbatim: false,
488                closed_by_blank_line: false,
489                depth_aware: !is_closing,
490                closes_at_open_tag: is_closing,
491                is_closing,
492            });
493        }
494
495        // Pandoc dialect also recognizes the void subset of
496        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
497        // These have no closing tag, so the parser closes the block
498        // immediately on the open-tag line; the projector's
499        // `split_html_block_by_tags` handles the same-line splitting
500        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
501        // non-void inline-block tags, void tags never interrupt a
502        // running paragraph (gated as `cannot_interrupt` in the
503        // dispatcher). Closing forms (`</embed>`) — semantically
504        // nonsensical for void elements — pandoc still emits as a
505        // single-line `RawBlock`; mirror that.
506        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
507            return Some(HtmlBlockType::BlockTag {
508                tag_name: tag_lower,
509                is_verbatim: false,
510                closed_by_blank_line: false,
511                depth_aware: false,
512                closes_at_open_tag: true,
513                is_closing,
514            });
515        }
516
517        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
518        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
519        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
520        // do not start a type-1 block. Letting `</pre>` through here would
521        // wrongly interrupt a paragraph.
522        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
523            return Some(HtmlBlockType::BlockTag {
524                tag_name: tag_lower,
525                is_verbatim: true,
526                closed_by_blank_line: false,
527                depth_aware: !is_commonmark,
528                closes_at_open_tag: false,
529                is_closing: false,
530            });
531        }
532    }
533
534    // Type 7 (CommonMark only): complete open or close tag on a line by
535    // itself, tag name not in the type-1 verbatim list.
536    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
537    {
538        let rest = &trimmed[end..];
539        let only_ws = rest
540            .bytes()
541            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
542        if only_ws {
543            // Reject if the tag name belongs to the type-1 verbatim set
544            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
545            // type-1 starts above, so seeing one here means the opener
546            // had a different shape (e.g. `<pre/>` self-closing) that
547            // shouldn't trigger type 7 either. Conservatively skip.
548            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
549            let name_end = leading
550                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
551                .unwrap_or(leading.len());
552            let name = leading[..name_end].to_ascii_lowercase();
553            if !VERBATIM_TAGS.contains(&name.as_str()) {
554                return Some(HtmlBlockType::Type7);
555            }
556        }
557    }
558
559    None
560}
561
/// Pull the tag name out of a candidate HTML-block-start line.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted only
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form); otherwise it yields `None`.
///
/// Per the spec the name must be followed by a space, tab, line ending,
/// `>`, or `/>`; this is approximated by ending the name at the first
/// whitespace character, `>`, or `/` (or at end of input).
///
/// Returns the name with its original casing, or `None` when the name is
/// empty, does not start with an ASCII letter, or contains anything other
/// than ASCII alphanumerics.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let inside = text.strip_prefix('<')?;

    let candidate = match inside.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => inside,
    };

    // Everything up to the first delimiter (whitespace, `>`, or `/`).
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()?;

    // An empty name bails out here via `?`.
    let mut chars = name.chars();
    let leading_letter = chars.next()?.is_ascii_alphabetic();
    let well_formed = leading_letter && chars.all(|c| c.is_ascii_alphanumeric());

    well_formed.then(|| name.to_string())
}
606
607/// Whether this block type ends at a blank line (CommonMark types 6 & 7
608/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
609/// marker — only at end of input or the next blank line.
610fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
611    matches!(
612        block_type,
613        HtmlBlockType::Type7
614            | HtmlBlockType::BlockTag {
615                closed_by_blank_line: true,
616                ..
617            }
618    )
619}
620
621/// Check if a line contains the closing marker for the given HTML block type.
622/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
623/// blank-line-terminated types (6 in CommonMark, 7) never match here.
624fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
625    match block_type {
626        HtmlBlockType::Comment => line.contains("-->"),
627        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
628        HtmlBlockType::Declaration => line.contains('>'),
629        HtmlBlockType::CData => line.contains("]]>"),
630        HtmlBlockType::BlockTag {
631            tag_name,
632            closed_by_blank_line: false,
633            ..
634        } => {
635            // Look for closing tag </tagname>
636            let closing_tag = format!("</{}>", tag_name);
637            line.to_lowercase().contains(&closing_tag)
638        }
639        HtmlBlockType::BlockTag {
640            closed_by_blank_line: true,
641            ..
642        }
643        | HtmlBlockType::Type7 => false,
644    }
645}
646
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`. Tag names are matched
/// ASCII-case-insensitively.
///
/// Self-closing forms (`<tag .../>`) and tags whose name appears
/// inside a quoted attribute value are NOT counted — the scanner walks
/// `<...>` brackets and respects `"`/`'` quoting.
///
/// A `<` only opens a bracket when it is immediately followed by an ASCII
/// letter, `/`, `!`, or `?`. Any other `<` (e.g. the comparison in
/// `1 < 2 <div>`) is literal text and is skipped on its own, so it cannot
/// swallow a real tag that follows it on the same line.
///
/// An opening tag whose `>` is missing from `line` (e.g. a tag that
/// continues on the next line) is still counted; scanning stops there
/// because the remaining bytes belong to that unterminated bracket.
///
/// An empty `tag_name` matches nothing and yields `(0, 0)`.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let tag_bytes = tag_name.as_bytes();
    if tag_bytes.is_empty() {
        return (0, 0);
    }
    let bytes = line.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;

        // A `<` that cannot begin a tag, comment, declaration or
        // processing instruction is plain text: step over just this byte.
        let opens_bracket = matches!(
            bytes.get(after).copied(),
            Some(b) if b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?')
        );
        if !opens_bracket {
            i = after;
            continue;
        }

        let is_close = bytes[after] == b'/';
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        // Compare in place, ignoring ASCII case; `get` yields `None` when
        // the line is too short to hold the name.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag_bytes));
        // The name must end at a delimiter so `<divider>` is not `<div>`.
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — bail out to avoid an infinite loop.
            // The remaining bytes don't form a complete tag.
            break;
        }
    }

    (opens, closes)
}
727
728/// Pandoc-dialect lift for HTML comments / processing instructions
729/// whose close marker is followed by additional bytes (same-line
730/// trailing or following lines). Pandoc-native emits a `RawBlock` for
731/// the marker bytes only, then parses the remainder as fresh blocks.
732///
733/// Returns `Some(consumed_lines)` when the split fires (caller must
734/// NOT enter the legacy emission); `None` to fall back to the legacy
735/// path (no close marker found, or no trailing content to split).
736///
737/// CST shape on success:
738/// ```text
739/// HTML_BLOCK
740///   HTML_BLOCK_TAG (open)        // line[0] up to and incl close marker
741///     TEXT  "<!-- hi -->"        // or with HTML_BLOCK_CONTENT in between
742///     ...                        // for multi-line `<!--\n…\n-->` shape
743/// <sibling blocks>               // recursive parse of trailing + lines[M+1..]
744/// ```
745fn try_parse_comment_pi_with_trailing_split(
746    builder: &mut GreenNodeBuilder<'static>,
747    lines: &[&str],
748    start_pos: usize,
749    block_type: &HtmlBlockType,
750    wrapper_kind: SyntaxKind,
751    bq_depth: usize,
752    config: &ParserOptions,
753) -> Option<usize> {
754    let marker: &str = match block_type {
755        HtmlBlockType::Comment => "-->",
756        HtmlBlockType::ProcessingInstruction => "?>",
757        _ => return None,
758    };
759
760    // Find the close marker in the bq-stripped line content. For
761    // bq_depth == 0 the inner content equals the raw line; for
762    // bq_depth > 0 we look past the `>` markers stripped by the
763    // outer dispatcher (line 0) and emitted as bq prefix below
764    // (lines > 0). `marker_end_in_inner` is the byte offset of the
765    // first byte AFTER the close marker, measured from the start
766    // of the inner (post-strip) content.
767    let mut close_line_idx: Option<usize> = None;
768    let mut marker_end_in_inner: usize = 0;
769    for (offset, line) in lines[start_pos..].iter().enumerate() {
770        let inner = if bq_depth > 0 {
771            strip_n_blockquote_markers(line, bq_depth)
772        } else {
773            line
774        };
775        if let Some(pos) = inner.find(marker) {
776            close_line_idx = Some(start_pos + offset);
777            marker_end_in_inner = pos + marker.len();
778            break;
779        }
780    }
781    let close_line_idx = close_line_idx?;
782    let close_line = lines[close_line_idx];
783    let close_inner = if bq_depth > 0 {
784        strip_n_blockquote_markers(close_line, bq_depth)
785    } else {
786        close_line
787    };
788    let close_prefix_len = close_line.len() - close_inner.len();
789    let trailing = &close_inner[marker_end_in_inner..];
790
791    // Only fire when there is non-whitespace content AFTER the close
792    // marker on the close line. The legacy path correctly handles
793    // the close-line-ends-at-close-marker shapes (`-->\n` followed
794    // by separate blocks); only the same-line-trailing case needs
795    // structural splitting. Trailing-whitespace-only handling
796    // (`-->   \n`) is a projector-side trim — separate concern.
797    let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
798    if !has_non_ws_trailing {
799        return None;
800    }
801
802    builder.start_node(wrapper_kind.into());
803
804    // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
805    // middle `HTML_BLOCK_CONTENT` lines between open and close. The
806    // close `HTML_BLOCK_TAG` carries only the bytes up to and
807    // including the close marker — trailing bytes go to the sibling.
808    if close_line_idx == start_pos {
809        // Same-line shape: one HTML_BLOCK_TAG containing the close
810        // marker's bytes. The newline lives on the trailing sibling.
811        // Line 0's bq prefix (if any) was already emitted by the
812        // outer dispatcher; emit only the inner marker bytes.
813        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
814        let close_part = &close_inner[..marker_end_in_inner];
815        if !close_part.is_empty() {
816            builder.token(SyntaxKind::TEXT.into(), close_part);
817        }
818        builder.finish_node();
819    } else {
820        // Multi-line shape: open tag covers lines[start_pos..close],
821        // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
822        // only the marker bytes. Line 0's bq prefix was emitted by
823        // the outer dispatcher; subsequent lines (middle + close)
824        // need bq prefix re-emission inside the wrapper.
825        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
826        let first_line = lines[start_pos];
827        let first_inner = if bq_depth > 0 {
828            strip_n_blockquote_markers(first_line, bq_depth)
829        } else {
830            first_line
831        };
832        let (line_no_nl, nl) = strip_newline(first_inner);
833        if !line_no_nl.is_empty() {
834            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
835        }
836        if !nl.is_empty() {
837            builder.token(SyntaxKind::NEWLINE.into(), nl);
838        }
839        builder.finish_node();
840
841        if close_line_idx > start_pos + 1 {
842            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
843            for content_line in &lines[start_pos + 1..close_line_idx] {
844                emit_html_block_line(builder, content_line, bq_depth);
845            }
846            builder.finish_node();
847        }
848
849        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
850        if bq_depth > 0 && close_prefix_len > 0 {
851            emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
852        }
853        let close_part = &close_inner[..marker_end_in_inner];
854        if !close_part.is_empty() {
855            builder.token(SyntaxKind::TEXT.into(), close_part);
856        }
857        builder.finish_node();
858    }
859
860    builder.finish_node(); // HTML_BLOCK
861
862    // Recursively parse JUST the trailing bytes on the close line
863    // and graft top-level children as siblings of the HTML_BLOCK we
864    // just closed. We do NOT consume subsequent lines here — the
865    // outer dispatcher continues from `close_line_idx + 1` and
866    // handles container-boundary lines (`:::` div closes, blockquote
867    // markers, list-marker continuations) correctly. Multi-line
868    // softbreak continuation (`<!-- --> trailing\nmore\n` →
869    // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
870    // outer dispatcher sees `more` after the close line and starts
871    // a fresh paragraph. Refdefs flow through from the outer config
872    // (same pattern as `emit_html_block_body_lifted_inner`).
873    if !trailing.is_empty() {
874        let mut inner_options = config.clone();
875        let refdefs = config.refdef_labels.clone().unwrap_or_default();
876        inner_options.refdef_labels = Some(refdefs.clone());
877        let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
878        let mut bq = None;
879        graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
880    }
881
882    Some(close_line_idx + 1)
883}
884
885/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
886/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
887/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
888/// identical to the source either way; only the wrapper retag changes.
889pub(crate) fn parse_html_block_with_wrapper(
890    builder: &mut GreenNodeBuilder<'static>,
891    lines: &[&str],
892    start_pos: usize,
893    block_type: HtmlBlockType,
894    prefix: &ContainerPrefix,
895    wrapper_kind: SyntaxKind,
896    config: &ParserOptions,
897) -> usize {
898    let bq_depth = prefix.bq_depth();
899    // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
900    // closes the RawBlock at the close marker (`-->` / `?>`) and parses
901    // any subsequent bytes (same-line trailing or following lines) as
902    // fresh blocks. The legacy path absorbs them into the HTML block
903    // wrapper, producing one oversized RawBlock. Handle the split here
904    // before entering the legacy emission so the CST encodes the
905    // sibling structure.
906    if config.dialect == crate::options::Dialect::Pandoc
907        && matches!(
908            block_type,
909            HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
910        )
911        && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
912            builder,
913            lines,
914            start_pos,
915            &block_type,
916            wrapper_kind,
917            bq_depth,
918            config,
919        )
920    {
921        return consumed;
922    }
923
924    // Start HTML block
925    builder.start_node(wrapper_kind.into());
926
927    let first_line = lines[start_pos];
928    let blank_terminated = ends_at_blank_line(&block_type);
929
930    // The block dispatcher has already emitted the bq prefix tokens for
931    // the first line; emit only the inner content as TEXT to keep the
932    // CST byte-equal to the source. List-marker bytes are stripped only
933    // when this dispatch fires on a list-marker line — for
934    // continuation-line dispatches (the much more common case) the
935    // leading indent is inner content, not upstream-emitted prefix.
936    let first_inner = prefix.strip_line_0_for_emission(first_line);
937
938    // Detect a multi-line open tag.
939    // - `<div>` (Pandoc lift): we tokenize each line structurally so the
940    //   salsa anchor walk picks up `id` from the HTML_ATTRS region.
941    // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
942    //   `<section>`, `<header>`, …): same structural emission, exposing
943    //   `id` to the salsa anchor walk and enabling the body lift below.
944    // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
945    //   without this, the parser closes the block after line 0 and the
946    //   remainder of the open tag falls into following paragraphs;
947    //   pandoc-native treats the whole multi-line open tag as a single
948    //   `RawBlock`. Emission for void tags uses simple per-line
949    //   TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
950    //   from void tags).
951    let multiline_open_end = match (wrapper_kind, &block_type) {
952        (SyntaxKind::HTML_BLOCK_DIV, _) => {
953            find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
954        }
955        (
956            _,
957            HtmlBlockType::BlockTag {
958                tag_name,
959                closes_at_open_tag: true,
960                ..
961            },
962        ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
963        (
964            _,
965            HtmlBlockType::BlockTag {
966                tag_name,
967                is_verbatim: false,
968                closed_by_blank_line: false,
969                depth_aware: true,
970                closes_at_open_tag: false,
971                is_closing: false,
972            },
973        ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
974            find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
975        }
976        _ => None,
977    };
978
979    // Set up depth-aware close tracking when the block type asks for it
980    // (Pandoc dialect, balanced same-name tag matching). A `None` means
981    // we fall back to the legacy "first matching close" path via
982    // `is_closing_marker`. Computed up front so the lift-mode gate
983    // below can decide whether the open line already balances the
984    // block (same-line `<div>...</div>`).
985    let depth_aware_tag: Option<String> = match &block_type {
986        HtmlBlockType::BlockTag {
987            tag_name,
988            closed_by_blank_line: false,
989            depth_aware: true,
990            ..
991        } => Some(tag_name.clone()),
992        _ => None,
993    };
994    let mut depth: i64 = 1;
995    if let Some(tag_name) = &depth_aware_tag {
996        // Sum opens/closes across all open-tag lines (single-line: just
997        // line 0; multi-line: lines 0..=end_line_idx).
998        let last_open_line = multiline_open_end.unwrap_or(start_pos);
999        let mut opens = 0usize;
1000        let mut closes = 0usize;
1001        for line in &lines[start_pos..=last_open_line] {
1002            let inner = prefix.strip(line);
1003            let (o, c) = count_tag_balance(inner, tag_name);
1004            opens += o;
1005            closes += c;
1006        }
1007        depth = opens as i64 - closes as i64;
1008    }
1009
1010    // Same-line `<div>foo</div>` shape: the open line balances the
1011    // block under depth-aware tracking. We can lift this structurally
1012    // only when the open-tag trailing has exactly one `</div>` close,
1013    // zero `<div>` opens, and no non-whitespace content after the
1014    // close. Other same-line shapes (nested, trailing text, malformed)
1015    // fall through to the byte-reparse path.
1016    let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1017        && multiline_open_end.is_none()
1018        && depth_aware_tag.is_some()
1019        && depth <= 0;
1020    let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1021        let (line_without_newline, _) = strip_newline(first_inner);
1022        probe_same_line_lift(line_without_newline, "div")
1023    };
1024
1025    // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1026    // `<nav>`, …): the body parses as fresh markdown between RawBlock
1027    // emissions of the open/close tags. Covers the clean multi-line
1028    // shape (open tag stands alone on its line), open-trailing
1029    // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1030    // and same-line (`<form>foo</form>`). Multi-line open and
1031    // blockquote-wrapped non-div shapes still fall through to the
1032    // byte-walker path.
1033    let strict_block_tag_name: Option<&str> =
1034        if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1035            match &block_type {
1036                HtmlBlockType::BlockTag {
1037                    tag_name,
1038                    is_verbatim: false,
1039                    closed_by_blank_line: false,
1040                    depth_aware: true,
1041                    closes_at_open_tag: false,
1042                    is_closing: false,
1043                } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1044                _ => None,
1045            }
1046        } else {
1047            None
1048        };
1049    // Same-line `<form>foo</form>` shape: the open line already
1050    // balances the block (`depth <= 0`). Lift only when the trailing
1051    // bytes after the open `>` end with `</tag>` and contain exactly
1052    // one close + zero nested opens.
1053    let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1054        multiline_open_end.is_none() && depth <= 0 && {
1055            let (line_no_nl, _) = strip_newline(first_inner);
1056            probe_same_line_lift(line_no_nl, name)
1057        }
1058    });
1059    // Strict-block lift gate: accept (a) a multi-line open tag spanning
1060    // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1061    // trailing single-line open (depth > 0, open `>` is present with
1062    // quote-aware matching), or (c) a safe same-line shape. For
1063    // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1064    // …) the lift additionally abandons when the body starts at a
1065    // fresh-block position with a void block tag — pandoc-native pins
1066    // per-tag emission rather than a matched-pair lift in that case.
1067    let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1068        let (line_no_nl, _) = strip_newline(first_inner);
1069        let shape_ok = if multiline_open_end.is_some() {
1070            // `find_multiline_open_end` already verified the open tag
1071            // closes with a quote-aware `>` somewhere in lines
1072            // `start_pos+1..=end`. No same-line trailing content to
1073            // probe; defer trailing-on-close-`>`-line handling to a
1074            // future session (rare in practice).
1075            true
1076        } else if depth > 0 {
1077            probe_open_tag_line_has_close_gt(line_no_nl, name)
1078        } else {
1079            same_line_strict_lift_safe
1080        };
1081        if !shape_ok {
1082            return false;
1083        }
1084        if !is_pandoc_inline_block_tag_name(name) {
1085            return true;
1086        }
1087        !inline_block_void_interior_abandons(
1088            first_inner,
1089            lines,
1090            start_pos,
1091            multiline_open_end,
1092            bq_depth,
1093            name,
1094        )
1095    });
1096
1097    // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1098    // are byte-equal to the non-bq same-line shape minus the leading
1099    // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1100    // The body has no inner newlines, so no bq prefix re-injection is
1101    // needed when grafting — `emit_html_block_body_lifted` (passing
1102    // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1103    // open-trailing) still fall through to the projector's byte
1104    // walker — they need per-line prefix injection.
1105    let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1106        && multiline_open_end.is_none()
1107        && depth_aware_tag.is_some()
1108        && depth <= 0
1109    {
1110        let (line_no_nl, _) = strip_newline(first_inner);
1111        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1112            if probe_same_line_lift(line_no_nl, "div") {
1113                Some("div")
1114            } else {
1115                None
1116            }
1117        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1118            match &block_type {
1119                HtmlBlockType::BlockTag {
1120                    tag_name,
1121                    is_verbatim: false,
1122                    closed_by_blank_line: false,
1123                    depth_aware: true,
1124                    closes_at_open_tag: false,
1125                    is_closing: false,
1126                } if is_pandoc_lift_eligible_block_tag(tag_name)
1127                    && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1128                {
1129                    // Inline-block tags (`<video>`, `<iframe>`, …) skip
1130                    // the void-interior check at same-line — the shape
1131                    // has no inner block content to interfere with.
1132                    Some(tag_name.as_str())
1133                }
1134                _ => None,
1135            }
1136        } else {
1137            None
1138        }
1139    } else {
1140        None
1141    };
1142
1143    // Messy-shape lift inside a blockquote — covers open-trailing
1144    // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1145    // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1146    // including the multi-line-open variants (`> <div\n>   id="x">foo\n>
1147    // body\n> </div>`) where the trailing is captured into `pre_content`
1148    // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1149    // The open line does NOT balance the block (depth > 0 after the
1150    // open line, distinguishing this from `same_line_bq_lift_tag` which
1151    // requires depth <= 0). The close line — possibly with leading body
1152    // text — closes the block when depth returns to 0. Body lines (incl.
1153    // open trailing and close leading) graft via prefix re-injection.
1154    let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1155    {
1156        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1157            Some("div")
1158        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1159            match &block_type {
1160                HtmlBlockType::BlockTag {
1161                    tag_name,
1162                    is_verbatim: false,
1163                    closed_by_blank_line: false,
1164                    depth_aware: true,
1165                    closes_at_open_tag: false,
1166                    is_closing: false,
1167                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1168                    // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1169                    // …) abandon the lift when the body starts at a
1170                    // fresh-block position with a void block tag. Same gate
1171                    // as the non-bq matched-pair lift (`strict_block_lift`).
1172                    if is_pandoc_inline_block_tag_name(tag_name)
1173                        && inline_block_void_interior_abandons(
1174                            first_inner,
1175                            lines,
1176                            start_pos,
1177                            multiline_open_end,
1178                            bq_depth,
1179                            tag_name,
1180                        )
1181                    {
1182                        None
1183                    } else {
1184                        Some(tag_name.as_str())
1185                    }
1186                }
1187                _ => None,
1188            }
1189        } else {
1190            None
1191        }
1192    } else {
1193        None
1194    };
1195
1196    // Multi-line open + matched close-on-the-open's-last-line shape inside
1197    // a blockquote (`> <div\n>   id="x">foo</div>` and depth-aware variants:
1198    // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1199    // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1200    // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1201    // close is on the open's last line (`depth <= 0` after the open lines)
1202    // AND `multiline_open_end.is_some()`. The trailing bytes after the
1203    // last `>` get lifted into `pre_content` via
1204    // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1205    // new branch below splits `pre_content` at the matched close marker
1206    // and grafts body + close + any trailing siblings.
1207    let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1208        && multiline_open_end.is_some()
1209        && depth_aware_tag.is_some()
1210        && depth <= 0
1211    {
1212        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1213            Some("div")
1214        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1215            match &block_type {
1216                HtmlBlockType::BlockTag {
1217                    tag_name,
1218                    is_verbatim: false,
1219                    closed_by_blank_line: false,
1220                    depth_aware: true,
1221                    closes_at_open_tag: false,
1222                    is_closing: false,
1223                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1224                    if is_pandoc_inline_block_tag_name(tag_name)
1225                        && inline_block_void_interior_abandons(
1226                            first_inner,
1227                            lines,
1228                            start_pos,
1229                            multiline_open_end,
1230                            bq_depth,
1231                            tag_name,
1232                        )
1233                    {
1234                        None
1235                    } else {
1236                        Some(tag_name.as_str())
1237                    }
1238                }
1239                _ => None,
1240            }
1241        } else {
1242            None
1243        }
1244    } else {
1245        None
1246    };
1247
1248    // Whether this block participates in the Phase 6 structural lift
1249    // (recursively parse body as Pandoc markdown and graft children).
1250    // Covers `<div>` outside blockquote context. For same-line shapes
1251    // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1252    // keep the legacy single-HTML_BLOCK_TAG shape and let the
1253    // byte-reparse path handle projection.
1254    let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1255        && bq_depth == 0
1256        && (!is_same_line_div || same_line_div_lift_safe))
1257        || strict_block_lift
1258        || same_line_bq_lift_tag.is_some()
1259        || bq_messy_lift_tag.is_some()
1260        || bq_multiline_close_lift_tag.is_some();
1261
1262    // Trailing content from the open tag (after `>`). When the lift is
1263    // active and the open line is `<div ATTRS>foo\n`, this captures
1264    // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1265    // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1266    // shapes (same-line / blockquote-wrapped).
1267    let mut pre_content = String::new();
1268
1269    // Emit opening line(s)
1270    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1271
1272    if let Some(end_line_idx) = multiline_open_end {
1273        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1274            emit_multiline_open_tag_with_attrs(
1275                builder,
1276                lines,
1277                start_pos,
1278                end_line_idx,
1279                "div",
1280                bq_depth,
1281                lift_mode,
1282                &mut pre_content,
1283            );
1284        } else if let Some(name) = strict_block_tag_name
1285            && strict_block_lift
1286        {
1287            emit_multiline_open_tag_with_attrs(
1288                builder,
1289                lines,
1290                start_pos,
1291                end_line_idx,
1292                name,
1293                bq_depth,
1294                lift_mode,
1295                &mut pre_content,
1296            );
1297        } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1298        {
1299            // Multi-line open of a lift-eligible strict-block tag inside a
1300            // blockquote (`> <section\n>   id=...>`). The non-bq
1301            // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1302            // covers the bq side so the open tag emits HTML_ATTRS regions
1303            // for `AttributeNode::cast` and the projector's canonicalizer.
1304            //
1305            // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1306            // call below: only push trailing bytes into `pre_content` when
1307            // the structural lift will consume them (bq messy lift). The
1308            // bq clean-lift requires `pre_content.is_empty()`, so for clean
1309            // multi-line opens the trailing is empty anyway and this is
1310            // a no-op.
1311            let lift_trailing =
1312                bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1313            emit_multiline_open_tag_with_attrs(
1314                builder,
1315                lines,
1316                start_pos,
1317                end_line_idx,
1318                name,
1319                bq_depth,
1320                lift_trailing,
1321                &mut pre_content,
1322            );
1323        } else {
1324            emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1325        }
1326    } else {
1327        let (line_without_newline, newline_str) = strip_newline(first_inner);
1328        if !line_without_newline.is_empty() {
1329            // For HTML_BLOCK_DIV, expose the open tag's attributes
1330            // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1331            // via the same descendants walk that handles fenced-div /
1332            // heading attrs. CST bytes stay byte-equal to source — we only
1333            // tokenize at finer granularity for matched div opens.
1334            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1335                let trailing =
1336                    emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1337                if !trailing.is_empty() {
1338                    pre_content.push_str(trailing);
1339                    pre_content.push_str(newline_str);
1340                }
1341            } else if let Some(name) = strict_block_tag_name
1342                && strict_block_lift
1343            {
1344                let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1345                if !trailing.is_empty() {
1346                    pre_content.push_str(trailing);
1347                    pre_content.push_str(newline_str);
1348                }
1349            } else if let Some(name) =
1350                bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1351            {
1352                // Inside a blockquote, lift trailing bytes into
1353                // `pre_content` when either the same-line bq gate fires
1354                // (`> <tag>body</tag>` — handled by `same_line_closed`)
1355                // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1356                // </tag>` and butted-close — handled at the close-marker
1357                // site below). For the clean-shape bq lift the open has
1358                // no trailing bytes regardless, so `lift_trailing=true`
1359                // is a no-op there.
1360                let lift_trailing =
1361                    same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1362                let trailing =
1363                    emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1364                if lift_trailing && !trailing.is_empty() {
1365                    pre_content.push_str(trailing);
1366                    pre_content.push_str(newline_str);
1367                }
1368            } else {
1369                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1370            }
1371        }
1372        // When the open tag has trailing content under lift mode, the
1373        // newline belongs to that trailing line (it terminates the
1374        // synthetic body line, not the open tag). Don't double-emit.
1375        if pre_content.is_empty() && !newline_str.is_empty() {
1376            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1377        }
1378    }
1379
1380    builder.finish_node(); // HtmlBlockTag
1381
1382    // Check if opening line also contains closing marker. Blank-line-terminated
1383    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1384    // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1385    // (`closes_at_open_tag: true`) close immediately — the block always
1386    // ends on the open-tag line since there is no closing tag to find.
1387    let void_block = matches!(
1388        &block_type,
1389        HtmlBlockType::BlockTag {
1390            closes_at_open_tag: true,
1391            ..
1392        }
1393    );
1394    // Void tags with a multi-line open close immediately after the open
1395    // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1396    // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1397    // a single RawBlock for the whole multi-line tag, with no following
1398    // content.
1399    if void_block && let Some(end_line_idx) = multiline_open_end {
1400        log::trace!(
1401            "HTML void block at line {} closes after multi-line open ending at line {}",
1402            start_pos + 1,
1403            end_line_idx + 1
1404        );
1405        builder.finish_node(); // HtmlBlock
1406        return end_line_idx + 1;
1407    }
1408    // Multi-line open with all matched closes on the open's last line:
1409    // `pre_content` holds the bytes after the last open `>` (lifted there
1410    // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1411    // When `depth <= 0` after the multi-line open and the trailing bytes
1412    // contain the depth-zero matched close, do the same-line lift on
1413    // `pre_content` directly. Mirrors the single-line `same_line_closed`
1414    // lift below — same body / close-marker / trailing-graft shape, just
1415    // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1416    //
1417    // The body bytes of `pre_content` come from the open's last line,
1418    // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1419    // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1420    // tag thus inherit the bq context without per-line prefix injection,
1421    // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1422    // both the non-bq and bq variants of this shape.
1423    if let Some(end_line_idx) = multiline_open_end
1424        && !blank_terminated
1425        && depth_aware_tag.is_some()
1426        && depth <= 0
1427        && lift_mode
1428        && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1429        && !pre_content.is_empty()
1430    {
1431        let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1432            Some("div")
1433        } else if strict_block_lift {
1434            strict_block_tag_name
1435        } else if let Some(name) = bq_multiline_close_lift_tag {
1436            Some(name)
1437        } else {
1438            None
1439        };
1440        if let Some(tag_name) = tag_name_opt {
1441            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1442            if let Some((leading, close_part)) =
1443                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1444            {
1445                let close_marker_end =
1446                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1447                let close_marker = &close_part[..close_marker_end];
1448                let same_line_trailing = &close_part[close_marker_end..];
1449                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1450                    LastParaDemote::SkipTrailingBlanks
1451                } else {
1452                    LastParaDemote::OnlyIfLast
1453                };
1454                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1455                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1456                if same_line_trailing.is_empty() {
1457                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1458                    close_line.push_str(close_marker);
1459                    close_line.push_str(post_nl);
1460                    emit_html_block_line(builder, &close_line, 0);
1461                    builder.finish_node();
1462                    builder.finish_node(); // HtmlBlock
1463                } else {
1464                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1465                    builder.finish_node(); // HTML_BLOCK_TAG
1466                    builder.finish_node(); // HtmlBlock
1467
1468                    let mut trailing_text =
1469                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1470                    trailing_text.push_str(same_line_trailing);
1471                    trailing_text.push_str(post_nl);
1472                    let mut inner_options = config.clone();
1473                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1474                    inner_options.refdef_labels = Some(refdefs.clone());
1475                    let inner_root = crate::parser::parse_with_refdefs(
1476                        &trailing_text,
1477                        Some(inner_options),
1478                        refdefs,
1479                    );
1480                    let mut bq = None;
1481                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1482                }
1483                return end_line_idx + 1;
1484            }
1485        }
1486    }
1487
1488    let same_line_closed = !blank_terminated
1489        && multiline_open_end.is_none()
1490        && (void_block
1491            || match &depth_aware_tag {
1492                Some(_) => depth <= 0,
1493                None => is_closing_marker(first_inner, &block_type),
1494            });
1495    if same_line_closed {
1496        log::trace!(
1497            "HTML block at line {} opens and closes on same line",
1498            start_pos + 1
1499        );
1500        // Same-line structural lift (div or non-div strict-block):
1501        // pre_content holds the bytes after the open `>` (including
1502        // the close `</tag>` and the trailing newline). Split into
1503        // body + close tag, emit body via recursive parse, emit close
1504        // tag as a sibling `HTML_BLOCK_TAG`.
1505        let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1506            None
1507        } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1508            Some("div")
1509        } else if same_line_strict_lift_safe {
1510            strict_block_tag_name
1511        } else if let Some(name) = same_line_bq_lift_tag {
1512            // Bq same-line: body has no inner newlines so the standard
1513            // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1514            // sufficient. The bq prefix `> ` lives on the outer
1515            // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1516            Some(name)
1517        } else {
1518            None
1519        };
1520        if let Some(tag_name) = same_line_lift_tag {
1521            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1522            // Depth-aware split: handles `<tag>foo</tag>bar` (single
1523            // close, trailing text), `<tag>foo</tag></tag>` (matched
1524            // close + unmatched trailing close → sibling RawBlock),
1525            // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1526            // recursive body parse).
1527            if let Some((leading, close_part)) =
1528                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1529            {
1530                // `close_part` starts with `</tag` and contains the close
1531                // marker followed by any same-line trailing text. Split
1532                // off the close marker bytes (`</tag>`) so the close
1533                // `HTML_BLOCK_TAG` carries only those bytes; trailing
1534                // text is parsed and grafted as a sibling block at the
1535                // parent level (matches pandoc-native shape:
1536                // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1537                let close_marker_end =
1538                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1539                let close_marker = &close_part[..close_marker_end];
1540                let same_line_trailing = &close_part[close_marker_end..];
1541
1542                // Same-line is always close-butted; div demotes the
1543                // trailing Para→Plain via `SkipTrailingBlanks`.
1544                // Non-div strict-block uses `OnlyIfLast` (consistent
1545                // with butted-close — no trailing BLANK_LINE before
1546                // the close means the trailing Para demotes).
1547                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1548                    LastParaDemote::SkipTrailingBlanks
1549                } else {
1550                    LastParaDemote::OnlyIfLast
1551                };
1552                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1553                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1554                if same_line_trailing.is_empty() {
1555                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1556                    close_line.push_str(close_marker);
1557                    close_line.push_str(post_nl);
1558                    emit_html_block_line(builder, &close_line, 0);
1559                    builder.finish_node();
1560                    builder.finish_node(); // HtmlBlock
1561                } else {
1562                    // Close tag holds only the close-marker bytes;
1563                    // trailing + newline graft as siblings of the
1564                    // wrapper (matches pandoc's per-tag block split).
1565                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1566                    builder.finish_node(); // HTML_BLOCK_TAG
1567                    builder.finish_node(); // HtmlBlock
1568
1569                    let mut trailing_text =
1570                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1571                    trailing_text.push_str(same_line_trailing);
1572                    trailing_text.push_str(post_nl);
1573                    let mut inner_options = config.clone();
1574                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1575                    inner_options.refdef_labels = Some(refdefs.clone());
1576                    let inner_root = crate::parser::parse_with_refdefs(
1577                        &trailing_text,
1578                        Some(inner_options),
1579                        refdefs,
1580                    );
1581                    let mut bq = None;
1582                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1583                }
1584                return start_pos + 1;
1585            }
1586        }
1587        builder.finish_node(); // HtmlBlock
1588        return start_pos + 1;
1589    }
1590
1591    let mut current_pos = multiline_open_end
1592        .map(|end| end + 1)
1593        .unwrap_or(start_pos + 1);
1594    let mut content_lines: Vec<&str> = Vec::new();
1595    let mut found_closing = false;
1596
1597    // Parse content until we find the closing marker
1598    while current_pos < lines.len() {
1599        let line = lines[current_pos];
1600        let (line_bq_depth, inner) = count_blockquote_markers(line);
1601
1602        // Only process lines at the same or deeper blockquote depth
1603        if line_bq_depth < bq_depth {
1604            break;
1605        }
1606
1607        // Blank-line-terminated blocks (types 6/7) end before the blank line.
1608        // The blank line itself is not part of the block.
1609        if blank_terminated && inner.trim().is_empty() {
1610            break;
1611        }
1612
1613        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1614        // count opens/closes of the same tag name and only close when depth
1615        // returns to 0; otherwise fall back to substring-match on the line.
1616        let line_closes = match &depth_aware_tag {
1617            Some(tag_name) => {
1618                let (opens, closes) = count_tag_balance(inner, tag_name);
1619                depth += opens as i64;
1620                depth -= closes as i64;
1621                depth <= 0
1622            }
1623            None => is_closing_marker(inner, &block_type),
1624        };
1625
1626        if line_closes {
1627            log::trace!("Found HTML block closing at line {}", current_pos + 1);
1628            found_closing = true;
1629
1630            // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1631            // the open and close tags stand alone on their source lines
1632            // (no trailing on open, no body content on close after
1633            // stripping bq markers), lift the body lines structurally
1634            // so the projector walks CST children instead of
1635            // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1636            //
1637            // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1638            // grafted, Para preserved), non-div strict-block tags
1639            // (`<form>`, `<section>`, …) and inline-block matched-pair
1640            // tags (`<video>`, `<iframe>`, …) — the latter two under
1641            // HTML_BLOCK with the structural lift hitting pandoc's
1642            // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1643            // demotion. Inline-block additionally bails if the body
1644            // starts at a fresh-block position with a void block tag
1645            // (mirrors the non-bq matched-pair gate).
1646            //
1647            // Other bq-wrapped shapes (butted-close / open-trailing /
1648            // same-line) still fall through to the opaque path.
1649            // Multi-line opens are allowed here as of 2026-05-12: the
1650            // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1651            // per attr line and per-line bq prefix tokens) by the bq-aware
1652            // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1653            // empty for multi-line opens (the emitter writes any trailing
1654            // bytes on the last open line directly as TEXT inside
1655            // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1656            // trailing falls through to the opaque path, matching the non-
1657            // bq deferral.
1658            let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1659                if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1660                    Some("div")
1661                } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1662                    match &block_type {
1663                        HtmlBlockType::BlockTag {
1664                            tag_name,
1665                            is_verbatim: false,
1666                            closed_by_blank_line: false,
1667                            depth_aware: true,
1668                            closes_at_open_tag: false,
1669                            is_closing: false,
1670                        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1671                        _ => None,
1672                    }
1673                } else {
1674                    None
1675                }
1676            } else {
1677                None
1678            };
1679
1680            let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1681                // Open-shape: last open line must end with `>` (clean
1682                // close-of-open). For single-line, that's `first_inner`
1683                // (already bq-stripped); for multi-line, strip bq markers
1684                // from `lines[end_line_idx]` and check the same.
1685                let last_open_line: &str = match multiline_open_end {
1686                    None => first_inner,
1687                    Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1688                        prefix.strip(lines[end])
1689                    }
1690                    Some(end) => lines[end],
1691                };
1692                let (open_no_nl, _) = strip_newline(last_open_line);
1693                if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1694                    return false;
1695                }
1696                let close_stripped = prefix.strip(line);
1697                let (close_no_nl, _) = strip_newline(close_stripped);
1698                if !close_no_nl
1699                    .trim_start_matches([' ', '\t'])
1700                    .starts_with("</")
1701                {
1702                    return false;
1703                }
1704                if is_pandoc_inline_block_tag_name(tag_name)
1705                    && inline_block_void_interior_abandons(
1706                        first_inner,
1707                        lines,
1708                        start_pos,
1709                        multiline_open_end,
1710                        bq_depth,
1711                        tag_name,
1712                    )
1713                {
1714                    return false;
1715                }
1716                true
1717            });
1718
1719            if bq_clean_lift {
1720                let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1721                    LastParaDemote::Never
1722                } else {
1723                    LastParaDemote::OnlyIfLast
1724                };
1725                emit_html_block_body_lifted_bq(
1726                    builder,
1727                    &content_lines,
1728                    prefix,
1729                    demote_policy,
1730                    config,
1731                );
1732                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1733                emit_html_block_line(builder, line, bq_depth);
1734                builder.finish_node();
1735                current_pos += 1;
1736                break;
1737            }
1738
1739            // Bq messy-shape lift — single-line open with trailing or
1740            // butted-close (or both). `pre_content` already captures any
1741            // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1742            // strip the close line's bq markers before splitting so
1743            // `leading` and `close_part` are bq-prefix-free. Body parses
1744            // recursively from `pre_content + stripped(content_lines) +
1745            // leading`, with per-line bq prefixes re-injected so the CST
1746            // stays byte-equal to the source. Demote: div is keyed on
1747            // close-butted-ness (Plain when leading non-empty, Para
1748            // otherwise); non-div uses OnlyIfLast either way.
1749            if let Some(tag_name) = bq_messy_lift_tag {
1750                let close_stripped = prefix.strip(line);
1751                let close_prefix_len = line.len() - close_stripped.len();
1752                let close_prefix = &line[..close_prefix_len];
1753                if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1754                {
1755                    let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1756                        if leading.is_empty() {
1757                            LastParaDemote::Never
1758                        } else {
1759                            LastParaDemote::SkipTrailingBlanks
1760                        }
1761                    } else {
1762                        LastParaDemote::OnlyIfLast
1763                    };
1764                    emit_html_block_body_lifted_bq_messy(
1765                        builder,
1766                        &pre_content,
1767                        &content_lines,
1768                        leading,
1769                        close_prefix,
1770                        prefix,
1771                        policy,
1772                        config,
1773                    );
1774                    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1775                    // When `leading` is empty, no recursive-parse output carries
1776                    // the close line's bq prefix, so emit it here before the
1777                    // close tag. When `leading` is non-empty,
1778                    // `emit_html_block_body_lifted_bq_messy` already injected
1779                    // the prefix at the start of the leading bytes (via the
1780                    // BqPrefixState entry); emitting again would double the
1781                    // prefix bytes and break losslessness.
1782                    if leading.is_empty() {
1783                        emit_bq_prefix_tokens(builder, close_prefix);
1784                    }
1785                    emit_html_block_line(builder, close_part, 0);
1786                    builder.finish_node();
1787                    current_pos += 1;
1788                    break;
1789                }
1790            }
1791
1792            // Under lift mode, try to split the close line into a
1793            // leading "body content" prefix and the close-marker
1794            // remainder using depth-aware matching. Walks at depth 1
1795            // (we're inside the open tag) so nested same-tag opens
1796            // (e.g. `<inner></inner></tag>` style with a nested div)
1797            // are absorbed into the body and parsed recursively, and
1798            // multi-close shapes (`foo</div></div>` on the close line)
1799            // peel off the matched-pair close — the unmatched
1800            // trailing close projects as a sibling `RawBlock` per
1801            // pandoc-native. For `<div>`, non-empty `leading`
1802            // propagates pandoc's `markdown_in_html_blocks` Plain
1803            // demotion rule. For non-div strict-block tags, demotion
1804            // follows pandoc's `OnlyIfLast` rule (demote the trailing
1805            // Para only when no blank line precedes the close).
1806            let close_split_tag = if lift_mode {
1807                if strict_block_lift {
1808                    strict_block_tag_name
1809                } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1810                    Some("div")
1811                } else {
1812                    None
1813                }
1814            } else {
1815                None
1816            };
1817            let (close_no_nl, close_post_nl) = strip_newline(line);
1818            let close_split = close_split_tag
1819                .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1820
1821            if let Some((leading, close_part)) = close_split {
1822                // Close-line leading that is whitespace-only is close-tag
1823                // indentation, not body content (pandoc-native strips it
1824                // from the close RawBlock and treats the close as butted —
1825                // see `   </tag>` shapes). Route those bytes into the
1826                // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1827                // projector strips them; keep the demote policy keyed on
1828                // the original leading so butted-close detection (Plain
1829                // demotion for div, OnlyIfLast for non-div) still fires.
1830                let leading_is_ws_only =
1831                    !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1832                let body_leading = if leading_is_ws_only { "" } else { leading };
1833                let policy = if strict_block_lift {
1834                    LastParaDemote::OnlyIfLast
1835                } else if !leading.is_empty() {
1836                    LastParaDemote::SkipTrailingBlanks
1837                } else {
1838                    LastParaDemote::Never
1839                };
1840                // Split close_part into close-marker bytes (`</tag>`)
1841                // and trailing bytes (e.g. an extra `</div>` for the
1842                // double-close case, or `bar` for trailing text after
1843                // a normal close). Trailing bytes are recursively
1844                // parsed and grafted as siblings of the HTML_BLOCK_DIV
1845                // wrapper.
1846                let close_tag_name = close_split_tag.expect("close_split_tag present");
1847                let close_marker_end =
1848                    split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1849                let close_marker = &close_part[..close_marker_end];
1850                let close_trailing = &close_part[close_marker_end..];
1851
1852                emit_html_block_body_lifted(
1853                    builder,
1854                    &pre_content,
1855                    &content_lines,
1856                    body_leading,
1857                    policy,
1858                    config,
1859                );
1860                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1861                if leading_is_ws_only {
1862                    builder.token(SyntaxKind::WHITESPACE.into(), leading);
1863                }
1864                if close_trailing.is_empty() {
1865                    let mut close_line =
1866                        String::with_capacity(close_marker.len() + close_post_nl.len());
1867                    close_line.push_str(close_marker);
1868                    close_line.push_str(close_post_nl);
1869                    emit_html_block_line(builder, &close_line, 0);
1870                    builder.finish_node();
1871                } else {
1872                    // Close tag holds only the close-marker bytes;
1873                    // trailing + newline graft as siblings.
1874                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1875                    builder.finish_node(); // HTML_BLOCK_TAG
1876                    builder.finish_node(); // HtmlBlock
1877
1878                    let mut trailing_text =
1879                        String::with_capacity(close_trailing.len() + close_post_nl.len());
1880                    trailing_text.push_str(close_trailing);
1881                    trailing_text.push_str(close_post_nl);
1882                    let mut inner_options = config.clone();
1883                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1884                    inner_options.refdef_labels = Some(refdefs.clone());
1885                    let inner_root = crate::parser::parse_with_refdefs(
1886                        &trailing_text,
1887                        Some(inner_options),
1888                        refdefs,
1889                    );
1890                    let mut bq = None;
1891                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1892                    current_pos += 1;
1893                    return current_pos;
1894                }
1895            } else {
1896                emit_html_block_body(
1897                    builder,
1898                    &pre_content,
1899                    &content_lines,
1900                    bq_depth,
1901                    wrapper_kind,
1902                    lift_mode,
1903                    config,
1904                );
1905                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1906                emit_html_block_line(builder, line, bq_depth);
1907                builder.finish_node();
1908            }
1909
1910            current_pos += 1;
1911            break;
1912        }
1913
1914        // Regular content line
1915        content_lines.push(line);
1916        current_pos += 1;
1917    }
1918
1919    // If we didn't find a closing marker, emit what we collected
1920    if !found_closing {
1921        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1922        emit_html_block_body(
1923            builder,
1924            &pre_content,
1925            &content_lines,
1926            bq_depth,
1927            wrapper_kind,
1928            lift_mode,
1929            config,
1930        );
1931    }
1932
1933    builder.finish_node(); // HtmlBlock
1934    current_pos
1935}
1936
1937/// Emit the collected inner content lines for an HTML block.
1938///
1939/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1940/// line `<div>` open outside blockquote), recursively parse the inner
1941/// content (including any open-tag trailing) as Pandoc-flavored
1942/// markdown and graft the resulting top-level blocks as direct children
1943/// of the wrapper. This is the Phase 6 structural lift — the projector
1944/// and downstream consumers (linter, salsa, LSP) can walk the
1945/// structural children instead of re-tokenizing the body bytes.
1946///
1947/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1948/// blockquote, multi-line open, or no content at all — fall through to
1949/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1950///
1951/// CST bytes remain byte-identical to source: the recursive parser is
1952/// lossless on the same byte slice the legacy path would have captured
1953/// as TEXT.
1954fn emit_html_block_body(
1955    builder: &mut GreenNodeBuilder<'static>,
1956    pre_content: &str,
1957    content_lines: &[&str],
1958    bq_depth: usize,
1959    wrapper_kind: SyntaxKind,
1960    lift_mode: bool,
1961    config: &ParserOptions,
1962) {
1963    if pre_content.is_empty() && content_lines.is_empty() {
1964        return;
1965    }
1966    if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1967        // Reached when the parser walked to end-of-input without finding
1968        // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1969        emit_html_block_body_lifted(
1970            builder,
1971            pre_content,
1972            content_lines,
1973            "",
1974            LastParaDemote::Never,
1975            config,
1976        );
1977        return;
1978    }
1979    // Legacy path: opaque TEXT capture. `pre_content` is always empty
1980    // here (lift_mode is the only path that populates it), but be
1981    // defensive — if a trailing prefix snuck in, emit it as TEXT so
1982    // bytes are preserved.
1983    builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1984    if !pre_content.is_empty() {
1985        builder.token(SyntaxKind::TEXT.into(), pre_content);
1986    }
1987    for content_line in content_lines {
1988        emit_html_block_line(builder, content_line, bq_depth);
1989    }
1990    builder.finish_node();
1991}
1992
/// Policy for demoting the trailing `PARAGRAPH` of an HTML-block body to
/// `PLAIN` while its children are grafted into the structural CST.
#[derive(Clone, Copy, Debug)]
enum LastParaDemote {
    /// Leave the trailing paragraph untouched — pandoc keeps it as `Para`.
    Never,
    /// Demote the final `PARAGRAPH` child, looking past any `BLANK_LINE`
    /// children that follow it. Applies to `<div>` shapes whose close tag
    /// is butted against the paragraph text on the same source line
    /// (pandoc's `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Demote only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` sits between it and the close
    /// tag. Applies to non-div strict-block tags, whose body is emitted at
    /// top level next to the close-tag `RawBlock`; pandoc turns that
    /// trailing `Para` into `Plain` unless a blank line separates the two.
    OnlyIfLast,
}
2012
2013/// Lift the HTML-block body into structural CST children: build the
2014/// inner text from `pre_content` + `content_lines` + `post_content`
2015/// (in order), recursively parse it as Pandoc-flavored markdown, and
2016/// graft the resulting top-level blocks into `builder`. `demote_policy`
2017/// controls whether the trailing paragraph is retagged as `PLAIN` to
2018/// encode pandoc's Plain/Para adjacency rules structurally.
2019fn emit_html_block_body_lifted(
2020    builder: &mut GreenNodeBuilder<'static>,
2021    pre_content: &str,
2022    content_lines: &[&str],
2023    post_content: &str,
2024    demote_policy: LastParaDemote,
2025    config: &ParserOptions,
2026) {
2027    emit_html_block_body_lifted_inner(
2028        builder,
2029        pre_content,
2030        content_lines,
2031        post_content,
2032        demote_policy,
2033        config,
2034        &mut None,
2035    )
2036}
2037
2038/// Body-lift variant for `<div>` inside a blockquote. Strips
2039/// `bq_depth` levels of blockquote markers from each `content_line`,
2040/// captures the per-line prefix bytes, and grafts the recursive parse
2041/// with prefix injection so the output CST stays byte-equal to the
2042/// source. `pre_content` and `post_content` must be empty (the bq
2043/// clean lift only handles the shape where the open and close tags
2044/// stand alone on their source lines).
2045fn emit_html_block_body_lifted_bq(
2046    builder: &mut GreenNodeBuilder<'static>,
2047    content_lines: &[&str],
2048    prefix: &ContainerPrefix,
2049    demote_policy: LastParaDemote,
2050    config: &ParserOptions,
2051) {
2052    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2053    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2054    for cl in content_lines {
2055        let (li, bq, inner) = prefix.split(cl);
2056        prefix_lines.push(ContainerPrefixLine {
2057            list_indent: li.to_string(),
2058            bq_prefix: bq.to_string(),
2059        });
2060        stripped_lines.push(inner);
2061    }
2062    let mut state = ContainerPrefixState::new(prefix_lines);
2063    emit_html_block_body_lifted_inner(
2064        builder,
2065        "",
2066        &stripped_lines,
2067        "",
2068        demote_policy,
2069        config,
2070        &mut state,
2071    )
2072}
2073
2074/// Body-lift variant for the bq messy-shape lift — open-trailing,
2075/// butted-close, or both. The open-trailing bytes (if any) sit in
2076/// `pre_content` (line 0 of the body — no bq prefix in source because
2077/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2078/// each carry their own bq prefix. The close line's `leading` (body
2079/// bytes before `</tag>`) sits on the close line, prefixed in source
2080/// by `close_line_prefix` (the bq prefix captured from `line`).
2081///
2082/// Builds `prefixes` so each emitted line in the recursive parse
2083/// output gets the right per-line bq prefix re-injected at line start:
2084/// `pre_content` → empty prefix (no source `> ` precedes it); each
2085/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2086/// Result CST stays byte-equal to source.
2087#[allow(clippy::too_many_arguments)]
2088fn emit_html_block_body_lifted_bq_messy(
2089    builder: &mut GreenNodeBuilder<'static>,
2090    pre_content: &str,
2091    content_lines: &[&str],
2092    leading: &str,
2093    close_line_prefix: &str,
2094    prefix: &ContainerPrefix,
2095    demote_policy: LastParaDemote,
2096    config: &ParserOptions,
2097) {
2098    let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2099    if !pre_content.is_empty() {
2100        prefix_lines.push(ContainerPrefixLine::default());
2101    }
2102    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2103    for cl in content_lines {
2104        let (li, bq, inner) = prefix.split(cl);
2105        prefix_lines.push(ContainerPrefixLine {
2106            list_indent: li.to_string(),
2107            bq_prefix: bq.to_string(),
2108        });
2109        stripped_lines.push(inner);
2110    }
2111    if !leading.is_empty() {
2112        // The close line carries its own captured prefix bytes; treat
2113        // them as bq-prefix only (no list-indent split applied) to keep
2114        // the legacy bq-only re-injection behavior for messy-shape
2115        // close-line lifts.
2116        prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2117    }
2118    let mut state = ContainerPrefixState::new(prefix_lines);
2119    emit_html_block_body_lifted_inner(
2120        builder,
2121        pre_content,
2122        &stripped_lines,
2123        leading,
2124        demote_policy,
2125        config,
2126        &mut state,
2127    )
2128}
2129
2130fn emit_html_block_body_lifted_inner(
2131    builder: &mut GreenNodeBuilder<'static>,
2132    pre_content: &str,
2133    content_lines: &[&str],
2134    post_content: &str,
2135    demote_policy: LastParaDemote,
2136    config: &ParserOptions,
2137    bq: &mut Option<ContainerPrefixState>,
2138) {
2139    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2140        return;
2141    }
2142    let mut inner_text = String::with_capacity(
2143        pre_content.len()
2144            + content_lines.iter().map(|s| s.len()).sum::<usize>()
2145            + post_content.len(),
2146    );
2147    inner_text.push_str(pre_content);
2148    for line in content_lines {
2149        inner_text.push_str(line);
2150    }
2151    inner_text.push_str(post_content);
2152
2153    let mut inner_options = config.clone();
2154    let refdefs = config.refdef_labels.clone().unwrap_or_default();
2155    inner_options.refdef_labels = Some(refdefs.clone());
2156    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2157    graft_document_children(builder, &inner_root, demote_policy, bq);
2158}
2159
2160/// Walk a parsed inner document's top-level children and re-emit them
2161/// into `builder`. The document's wrapper node is skipped — only its
2162/// children are grafted.
2163///
2164/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2165/// as `PLAIN` — see [`LastParaDemote`].
2166///
2167/// `bq` is `Some` when grafting a body that lived inside an outer
2168/// container (blockquote, list-item, or both) — token emission then
2169/// injects the captured per-line prefix tokens at line starts so the
2170/// CST stays byte-equal to source. See
2171/// [`super::container_prefix::ContainerPrefixState`].
2172fn graft_document_children(
2173    builder: &mut GreenNodeBuilder<'static>,
2174    doc: &SyntaxNode,
2175    demote_policy: LastParaDemote,
2176    bq: &mut Option<ContainerPrefixState>,
2177) {
2178    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2179
2180    let mut demote_idx: Option<usize> = None;
2181    match demote_policy {
2182        LastParaDemote::Never => {}
2183        LastParaDemote::SkipTrailingBlanks => {
2184            for (i, c) in children.iter().enumerate().rev() {
2185                if let rowan::NodeOrToken::Node(n) = c {
2186                    if n.kind() == SyntaxKind::BLANK_LINE {
2187                        continue;
2188                    }
2189                    if n.kind() == SyntaxKind::PARAGRAPH {
2190                        demote_idx = Some(i);
2191                    }
2192                    break;
2193                }
2194            }
2195        }
2196        LastParaDemote::OnlyIfLast => {
2197            for (i, c) in children.iter().enumerate().rev() {
2198                if let rowan::NodeOrToken::Node(n) = c {
2199                    if n.kind() == SyntaxKind::PARAGRAPH {
2200                        demote_idx = Some(i);
2201                    }
2202                    break;
2203                }
2204            }
2205        }
2206    }
2207
2208    for (i, child) in children.into_iter().enumerate() {
2209        match child {
2210            rowan::NodeOrToken::Node(n) => {
2211                if Some(i) == demote_idx {
2212                    graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2213                } else {
2214                    graft_subtree(builder, &n, bq);
2215                }
2216            }
2217            rowan::NodeOrToken::Token(t) => {
2218                emit_grafted_token(builder, t.kind(), t.text(), bq);
2219            }
2220        }
2221    }
2222}
2223
2224/// Recursively re-emit `node` and its descendants into `builder`.
2225/// Token text is copied verbatim so the result is byte-identical to
2226/// the input span (modulo bq prefix tokens injected at line starts
2227/// when `bq` is `Some`).
2228fn graft_subtree(
2229    builder: &mut GreenNodeBuilder<'static>,
2230    node: &SyntaxNode,
2231    bq: &mut Option<ContainerPrefixState>,
2232) {
2233    graft_subtree_as(builder, node, node.kind(), bq);
2234}
2235
2236/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2237/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2238/// the close-butted demotion rule.
2239fn graft_subtree_as(
2240    builder: &mut GreenNodeBuilder<'static>,
2241    node: &SyntaxNode,
2242    kind: SyntaxKind,
2243    bq: &mut Option<ContainerPrefixState>,
2244) {
2245    builder.start_node(kind.into());
2246    for child in node.children_with_tokens() {
2247        match child {
2248            rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2249            rowan::NodeOrToken::Token(t) => {
2250                emit_grafted_token(builder, t.kind(), t.text(), bq);
2251            }
2252        }
2253    }
2254    builder.finish_node();
2255}
2256
2257/// Emit a single token while optionally injecting blockquote prefix
2258/// tokens at line starts. When `bq` is `None`, this is a plain
2259/// `builder.token()` passthrough.
2260fn emit_grafted_token(
2261    builder: &mut GreenNodeBuilder<'static>,
2262    kind: SyntaxKind,
2263    text: &str,
2264    bq: &mut Option<ContainerPrefixState>,
2265) {
2266    if let Some(state) = bq.as_mut() {
2267        if state.at_line_start {
2268            if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2269                emit_container_prefix_tokens(builder, line_prefix);
2270            }
2271            state.at_line_start = false;
2272        }
2273        builder.token(kind.into(), text);
2274        // `BLANK_LINE` token represents an entirely blank source line —
2275        // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2276        // token as line-ending so the per-line prefix index advances
2277        // correctly.
2278        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2279            state.line_idx += 1;
2280            state.at_line_start = true;
2281        }
2282    } else {
2283        builder.token(kind.into(), text);
2284    }
2285}
2286
2287/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2288/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2289fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2290    for ch in prefix.chars() {
2291        if ch == '>' {
2292            builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2293        } else {
2294            let mut buf = [0u8; 4];
2295            builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2296        }
2297    }
2298}
2299
/// Find the byte index, within `line`, of the `>` that ends the open
/// tag `<tag_name ...>`.
///
/// The line may start with spaces or tabs. After that it must begin
/// with `<` followed by `tag_name` (ASCII case-insensitive) and at
/// least one more byte. The scan for `>` skips anything inside
/// single- or double-quoted attribute values. Returns `None` when the
/// prefix does not match or no unquoted `>` follows.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let tag_bytes = &line.as_bytes()[indent_len..];
    let name_end = 1 + tag_name.len();

    // Need `<`, the full tag name, and at least one byte after it.
    if tag_bytes.len() <= name_end {
        return None;
    }
    let starts_with_tag = tag_bytes[0] == b'<'
        && tag_bytes[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !starts_with_tag {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in tag_bytes.iter().enumerate().skip(name_end) {
        match open_quote {
            // Inside a quoted value: only the matching quote matters.
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + offset),
                _ => {}
            },
        }
    }
    None
}
2335
2336/// Whether `slice` begins (after leading ASCII whitespace) with an
2337/// open tag whose name is a Pandoc void block tag (`<source>`,
2338/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2339/// open tags return false.
2340///
2341/// Used by the inline-block matched-pair lift gate: pandoc-native
2342/// abandons the lift when the body's first non-blank content is a
2343/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2344/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2345/// matched-pair lift).
2346fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2347    let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2348    if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2349        return false;
2350    }
2351    let Some(tag_end) = parse_open_tag(trimmed) else {
2352        return false;
2353    };
2354    let bytes = trimmed.as_bytes();
2355    let mut name_end = 1usize;
2356    while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2357    {
2358        name_end += 1;
2359    }
2360    if name_end == 1 {
2361        return false;
2362    }
2363    is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2364}
2365
2366/// Whether the body of an inline-block matched-pair (`<video>...`,
2367/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2368/// a void block tag — the condition under which pandoc-native abandons
2369/// the matched-pair lift. Probes three shapes:
2370///
2371/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2372///   after the open `>` on `first_inner` start with `<source`.
2373/// - **Single-line open + multi-line body**: open-trailing on the open
2374///   line is empty/whitespace AND the first non-blank body line
2375///   (`lines[start_pos+1..]`) starts with a void tag.
2376/// - **Multi-line open**: same body-line scan starting at
2377///   `lines[multiline_open_end+1..]`.
2378///
2379/// Returns `false` when the body begins with text, with a close tag,
2380/// or with a non-void block tag — those cases all proceed with the
2381/// matched-pair lift.
2382fn inline_block_void_interior_abandons(
2383    first_inner: &str,
2384    lines: &[&str],
2385    start_pos: usize,
2386    multiline_open_end: Option<usize>,
2387    bq_depth: usize,
2388    tag_name: &str,
2389) -> bool {
2390    let (line_no_nl, _) = strip_newline(first_inner);
2391    let (body_start_line_idx, open_trailing) = match multiline_open_end {
2392        Some(end) => (end + 1, ""),
2393        None => {
2394            let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2395            let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2396            (start_pos + 1, trailing)
2397        }
2398    };
2399    let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2400    if !trimmed.is_empty() {
2401        return slice_starts_with_void_block_tag(trimmed);
2402    }
2403    for line in &lines[body_start_line_idx..] {
2404        let inner = if bq_depth > 0 {
2405            strip_n_blockquote_markers(line, bq_depth)
2406        } else {
2407            line
2408        };
2409        let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2410        if trimmed.is_empty() {
2411            continue;
2412        }
2413        return slice_starts_with_void_block_tag(trimmed);
2414    }
2415    false
2416}
2417
/// Check that `line` holds an open tag for `tag_name` whose closing
/// `>` is present on the same line.
///
/// Leading spaces and tabs are ignored. The tag name comparison is
/// ASCII case-insensitive, at least one byte must follow the name, and
/// a `>` inside a single- or double-quoted attribute value does not
/// count. Content after the `>` (the open-trailing shape, e.g.
/// `<form>foo`) is allowed; the caller deals with it.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let unindented = line.trim_start_matches([' ', '\t']).as_bytes();
    let name = tag_name.as_bytes();

    let Some((&b'<', after_lt)) = unindented.split_first() else {
        return false;
    };
    // The name must be followed by at least one more byte.
    if after_lt.len() <= name.len() {
        return false;
    }
    let (candidate, attrs) = after_lt.split_at(name.len());
    if !candidate.eq_ignore_ascii_case(name) {
        return false;
    }

    let mut open_quote: Option<u8> = None;
    attrs.iter().any(|&byte| match open_quote {
        Some(q) => {
            if byte == q {
                open_quote = None;
            }
            false
        }
        None if byte == b'>' => true,
        None => {
            if byte == b'"' || byte == b'\'' {
                open_quote = Some(byte);
            }
            false
        }
    })
}
2452
2453/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2454/// be lifted structurally. Returns `true` only when:
2455/// - The line starts with `<tag_name` (modulo leading whitespace).
2456/// - The open tag's `>` exists with proper quote handling.
2457/// - The bytes after the open `>` contain a depth-zero matched
2458///   `</tag_name>` close (depth-aware: nested `<tag>` opens
2459///   increment depth; matching is case-insensitive, quote-aware).
2460///
2461/// Trailing bytes after the matched close are accepted and grafted
2462/// as a sibling block by the caller. Examples:
2463/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2464/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2465///   recursively parses to a `RawBlock`).
2466/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2467///   parsed recursively), trailing=`bar`.
2468fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2469    let bytes = line.as_bytes();
2470    let indent_end = bytes
2471        .iter()
2472        .position(|&b| b != b' ' && b != b'\t')
2473        .unwrap_or(bytes.len());
2474    let rest = &line[indent_end..];
2475    let rest_bytes = rest.as_bytes();
2476    let prefix_len = 1 + tag_name.len();
2477    if rest_bytes.len() < prefix_len
2478        || rest_bytes[0] != b'<'
2479        || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2480    {
2481        return false;
2482    }
2483    let after_name = &rest[prefix_len..];
2484    let after_name_bytes = after_name.as_bytes();
2485    let mut i = 0usize;
2486    let mut quote: Option<u8> = None;
2487    let mut gt_idx: Option<usize> = None;
2488    while i < after_name_bytes.len() {
2489        match (quote, after_name_bytes[i]) {
2490            (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2491            (Some(q), b2) if b2 == q => quote = None,
2492            (None, b'>') => {
2493                gt_idx = Some(i);
2494                break;
2495            }
2496            _ => {}
2497        }
2498        i += 1;
2499    }
2500    let Some(gt_idx) = gt_idx else {
2501        return false;
2502    };
2503    let trailing = &after_name[gt_idx + 1..];
2504    // Depth-aware: walk `trailing` (we begin inside the open tag at
2505    // depth 1). Return true iff a matched `</tag>` exists where depth
2506    // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2507    matched_close_offset(trailing, tag_name).is_some()
2508}
2509
/// Search `trailing` — the text that follows an already-consumed open
/// `<tag ...>` — for the `</tag>` that balances it.
///
/// The walk starts at depth 1. Every `<...>` bracket is scanned to its
/// `>` with quoted attribute values skipped. A bracket counts toward
/// depth only when its name equals `tag_name` (ASCII case-insensitive)
/// and is followed by whitespace, `>`, `/`, or end of input:
/// - an open form raises depth, unless it is self-closing (`/>`);
/// - a close form lowers depth, and the one that reaches 0 is the match.
///
/// Returns `Some((close_start, close_end))`: the offset of the matched
/// close's `<` and one past its `>`. Returns `None` when depth never
/// returns to 0 or a `<` is never terminated by `>`.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let src = trailing.as_bytes();
    let name = tag_name.as_bytes();

    let mut open_depth: i32 = 1;
    let mut cursor = 0usize;

    while let Some(rel) = src[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let closing = src.get(lt + 1) == Some(&b'/');
        let name_start = if closing { lt + 2 } else { lt + 1 };
        let name_end = name_start + name.len();

        let names_tag = src
            .get(name_start..name_end)
            .map_or(false, |candidate| candidate.eq_ignore_ascii_case(name));
        // Reject longer names that merely start with `tag_name`.
        let counts = names_tag
            && matches!(
                src.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find this bracket's `>`, skipping quoted attribute values.
        let scan_from = if names_tag { name_end } else { lt + 1 };
        let mut open_quote: Option<u8> = None;
        let gt = src[scan_from..]
            .iter()
            .position(|&byte| match open_quote {
                Some(q) => {
                    if byte == q {
                        open_quote = None;
                    }
                    false
                }
                None => match byte {
                    b'>' => true,
                    b'"' | b'\'' => {
                        open_quote = Some(byte);
                        false
                    }
                    _ => false,
                },
            })
            .map(|off| scan_from + off);

        // An unterminated `<...` ends the search with no match.
        let Some(gt) = gt else {
            return None;
        };

        if counts {
            if closing {
                open_depth -= 1;
                if open_depth == 0 {
                    return Some((lt, gt + 1));
                }
            } else {
                let self_closing = gt > lt + 1 && src[gt - 1] == b'/';
                if !self_closing {
                    open_depth += 1;
                }
            }
        }

        cursor = gt + 1;
    }
    None
}
2592
/// Given `close_part` beginning with `</tag_name` (ASCII
/// case-insensitive), return the offset one past the first `>` that is
/// not inside a quoted span. The caller can split there to separate
/// the close marker from any same-line trailing text.
///
/// Returns `None` when `close_part` does not start with `</tag_name`
/// or when no unquoted `>` follows; the caller then treats the whole
/// slice as the close marker.
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let name_len = tag_name.len();
    let after_slash = close_part.as_bytes().strip_prefix(b"</")?;
    let candidate = after_slash.get(..name_len)?;
    if !candidate.eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    after_slash[name_len..]
        .iter()
        .position(|&byte| match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
                false
            }
            None => match byte {
                b'>' => true,
                b'"' | b'\'' => {
                    open_quote = Some(byte);
                    false
                }
                _ => false,
            },
        })
        // Translate back to an offset in `close_part`, one past the `>`.
        .map(|rel| 2 + name_len + rel + 1)
}
2623
2624/// Try to split the close line of an HTML_BLOCK_DIV body into a
2625/// leading content prefix and a clean `</tag>...` remainder. Returns
2626/// `Some((leading, close_part))` only when the line contains exactly
2627/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2628/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2629/// for missing close tags, or for compound shapes the parser
2630/// shouldn't attempt to lift in this pass.
2631///
2632/// `leading` may be empty (close starts at column 0) or pure
2633/// whitespace (close on an indented line). Both count as "butted" per
2634/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2635/// the trailing paragraph inside the div demotes Para→Plain.
2636fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2637    let (opens, closes) = count_tag_balance(line, tag_name);
2638    if opens != 0 || closes != 1 {
2639        return None;
2640    }
2641    // Locate the close tag's opening `<` by lowercased substring search.
2642    // Safe because we've already established (above) that the line has
2643    // exactly one `</tag>` and no `<tag>` opens, so the first match is
2644    // THE close.
2645    let needle = format!("</{}", tag_name);
2646    let lower = line.to_ascii_lowercase();
2647    let close_lt = lower.find(&needle)?;
2648    Some((&line[..close_lt], &line[close_lt..]))
2649}
2650
2651/// Depth-aware variant of `try_split_close_line` used by the same-line
2652/// lift path. Walks `line` starting at depth 1 (we begin inside the
2653/// open `<tag>`) and splits at the byte position where the matched
2654/// `</tag>` close brings depth to 0. Returns `Some((body,
2655/// close_part))` where `body` is the bytes before the matched-close
2656/// start and `close_part` is the bytes from the matched close onward.
2657///
2658/// Unlike `try_split_close_line` this accepts nested same-tag opens
2659/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2660/// body=`<div>x</div>` (a nested div the body lift parses
2661/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2662/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2663/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2664fn try_split_close_line_depth_aware<'a>(
2665    line: &'a str,
2666    tag_name: &str,
2667) -> Option<(&'a str, &'a str)> {
2668    let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2669    Some((&line[..close_start], &line[close_start..]))
2670}
2671
2672/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2673/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2674/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2675/// + TEXT(">") + TEXT(trailing)?`.
2676///
2677/// Bytes are byte-identical to the source — this only tokenizes at finer
2678/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2679/// region structurally. Falls back to a single TEXT token if the line
2680/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2681/// only retags as the lift kind when this shape was matched).
2682///
2683/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2684/// returned as `&str` instead so the caller can splice them into the
2685/// recursive-parse input for the structural body lift. When false
2686/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2687/// empty slice is returned.
2688fn emit_open_tag_tokens<'a>(
2689    builder: &mut GreenNodeBuilder<'static>,
2690    line: &'a str,
2691    tag_name: &str,
2692    lift_trailing: bool,
2693) -> &'a str {
2694    let bytes = line.as_bytes();
2695    // Leading indent (CommonMark allows up to 3 spaces).
2696    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2697    if indent_end > 0 {
2698        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2699    }
2700    let rest = &line[indent_end..];
2701    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2702    let prefix_len = 1 + tag_name.len();
2703    if !rest.starts_with('<')
2704        || rest.len() < prefix_len
2705        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2706    {
2707        builder.token(SyntaxKind::TEXT.into(), rest);
2708        return "";
2709    }
2710    let after_name = &rest[prefix_len..];
2711    let after_name_bytes = after_name.as_bytes();
2712    // Find the closing `>` of the open tag, respecting quoted attribute values.
2713    let mut i = 0usize;
2714    let mut quote: Option<u8> = None;
2715    let mut tag_close: Option<usize> = None;
2716    while i < after_name_bytes.len() {
2717        let b = after_name_bytes[i];
2718        match (quote, b) {
2719            (None, b'"') | (None, b'\'') => quote = Some(b),
2720            (Some(q), b2) if b2 == q => quote = None,
2721            (None, b'>') => {
2722                tag_close = Some(i);
2723                break;
2724            }
2725            _ => {}
2726        }
2727        i += 1;
2728    }
2729    let Some(tag_close) = tag_close else {
2730        // Open tag has no closing `>` on this line — defensive fallback.
2731        builder.token(SyntaxKind::TEXT.into(), rest);
2732        return "";
2733    };
2734    // Whitespace between the tag name and the attribute region.
2735    let attrs_inner = &after_name[..tag_close];
2736    let ws_end = attrs_inner
2737        .as_bytes()
2738        .iter()
2739        .position(|&b| !matches!(b, b' ' | b'\t'))
2740        .unwrap_or(attrs_inner.len());
2741    let leading_ws = &attrs_inner[..ws_end];
2742    // Strip a trailing self-closing slash and the whitespace before it
2743    // from the attribute region; emit them as TEXT outside the
2744    // HTML_ATTRS node so the structural region only holds attribute
2745    // bytes (not formatting punctuation).
2746    let attrs_after_ws = &attrs_inner[ws_end..];
2747    let mut attr_end = attrs_after_ws.len();
2748    let attr_bytes = attrs_after_ws.as_bytes();
2749    let mut self_close_start = attr_end;
2750    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2751        self_close_start = attr_end - 1;
2752        attr_end = self_close_start;
2753        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2754            attr_end -= 1;
2755        }
2756    }
2757    let attrs_text = &attrs_after_ws[..attr_end];
2758    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2759    let after_self_close = &attrs_after_ws[self_close_start..];
2760
2761    // Use the original source bytes for the `<tag` prefix (preserves
2762    // source casing — losslessness).
2763    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2764    if !leading_ws.is_empty() {
2765        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2766    }
2767    if !attrs_text.is_empty() {
2768        builder.start_node(SyntaxKind::HTML_ATTRS.into());
2769        builder.token(SyntaxKind::TEXT.into(), attrs_text);
2770        builder.finish_node();
2771    }
2772    if !trailing_text.is_empty() {
2773        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2774    }
2775    if !after_self_close.is_empty() {
2776        builder.token(SyntaxKind::TEXT.into(), after_self_close);
2777    }
2778    builder.token(SyntaxKind::TEXT.into(), ">");
2779    let after_gt = &after_name[tag_close + 1..];
2780    if lift_trailing {
2781        // Return trailing bytes to the caller (will be spliced into the
2782        // recursive-parse input for the body lift).
2783        return after_gt;
2784    }
2785    if !after_gt.is_empty() {
2786        builder.token(SyntaxKind::TEXT.into(), after_gt);
2787    }
2788    ""
2789}
2790
2791/// Detect a multi-line HTML open tag for `tag_name`. Returns
2792/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2793/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2794/// the existing path) or when the `>` is missing entirely.
2795///
2796/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2797/// attribute value doesn't terminate the open tag. Quote state carries
2798/// across line boundaries.
2799fn find_multiline_open_end(
2800    lines: &[&str],
2801    start_pos: usize,
2802    first_inner: &str,
2803    tag_name: &str,
2804    prefix: &ContainerPrefix,
2805) -> Option<usize> {
2806    // Locate the `<tag_name` literal in `first_inner` to start scanning past
2807    // it. Match is ASCII case-insensitive; the parser preserves source casing.
2808    // `first_inner` is already bq-stripped by the caller; subsequent lines are
2809    // stripped inline below via `strip_n_blockquote_markers`.
2810    let trimmed = strip_leading_spaces(first_inner);
2811    let prefix_len = 1 + tag_name.len();
2812    if !trimmed.starts_with('<')
2813        || trimmed.len() < prefix_len
2814        || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2815    {
2816        return None;
2817    }
2818    let leading_indent = first_inner.len() - trimmed.len();
2819    let mut i = leading_indent + prefix_len; // past `<tag_name`
2820    let mut quote: Option<u8> = None;
2821
2822    // Scan first line for an unquoted `>`.
2823    let line0_bytes = first_inner.as_bytes();
2824    while i < line0_bytes.len() {
2825        match (quote, line0_bytes[i]) {
2826            (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2827            (Some(q), x) if x == q => quote = None,
2828            (None, b'>') => return None, // single-line case
2829            _ => {}
2830        }
2831        i += 1;
2832    }
2833
2834    // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2835    // blockquote markers per line so `> ` prefixes don't count toward the
2836    // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2837    let mut line_idx = start_pos + 1;
2838    while line_idx < lines.len() {
2839        let raw = lines[line_idx];
2840        let inner = prefix.strip(raw);
2841        for &b in inner.as_bytes() {
2842            match (quote, b) {
2843                (None, b'"') | (None, b'\'') => quote = Some(b),
2844                (Some(q), x) if x == q => quote = None,
2845                (None, b'>') => return Some(line_idx),
2846                _ => {}
2847            }
2848        }
2849        line_idx += 1;
2850    }
2851
2852    None
2853}
2854
2855/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2856/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2857/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2858/// open tag (no `>` in the remaining input) as paragraph text rather than
2859/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2860/// reparse the same content recursively, causing a stack overflow.
2861///
2862/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2863/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2864/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2865/// the tag eventually closes far away.
2866pub(crate) fn pandoc_html_open_tag_closes(
2867    lines: &[&str],
2868    start_pos: usize,
2869    prefix: &ContainerPrefix,
2870) -> bool {
2871    if start_pos >= lines.len() {
2872        return false;
2873    }
2874    let mut quote: Option<u8> = None;
2875    for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2876        let inner = prefix.strip(line);
2877        let bytes = inner.as_bytes();
2878        let mut i = 0usize;
2879        if offset == start_pos {
2880            while i < bytes.len() && bytes[i] == b' ' {
2881                i += 1;
2882            }
2883            if bytes.get(i) != Some(&b'<') {
2884                return false;
2885            }
2886            i += 1;
2887        }
2888        while i < bytes.len() {
2889            match (quote, bytes[i]) {
2890                (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2891                (Some(q), x) if x == q => quote = None,
2892                (None, b'>') => return true,
2893                _ => {}
2894            }
2895            i += 1;
2896        }
2897    }
2898    false
2899}
2900
2901/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2902/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2903/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2904/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2905/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2906/// Phase 6 structural lift.
2907///
2908/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2909/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2910/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2911/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2912///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
2913///
2914/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2915/// newline; `parse_html_attribute_list` tolerates whitespace.
2916#[allow(clippy::too_many_arguments)]
2917fn emit_multiline_open_tag_with_attrs(
2918    builder: &mut GreenNodeBuilder<'static>,
2919    lines: &[&str],
2920    start_pos: usize,
2921    end_line_idx: usize,
2922    tag_name: &str,
2923    bq_depth: usize,
2924    lift_trailing: bool,
2925    pre_content: &mut String,
2926) {
2927    let prefix_len = 1 + tag_name.len();
2928    for (line_idx, raw) in lines
2929        .iter()
2930        .enumerate()
2931        .take(end_line_idx + 1)
2932        .skip(start_pos)
2933    {
2934        // Strip `bq_depth` blockquote markers from the source line so
2935        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2936        // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2937        // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2938        // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2939        // before this parser runs; re-emitting it here would double
2940        // the bytes and break losslessness.
2941        let stripped = if bq_depth > 0 {
2942            strip_n_blockquote_markers(raw, bq_depth)
2943        } else {
2944            raw
2945        };
2946        let bq_prefix_len = raw.len() - stripped.len();
2947        if bq_prefix_len > 0 && line_idx != start_pos {
2948            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2949        }
2950        let line = stripped;
2951        let (line_no_nl, newline_str) = strip_newline(line);
2952
2953        if line_idx == start_pos {
2954            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2955            // + attrs)?. The closing `>` is on a later line, so any
2956            // remaining bytes after "<{tag_name}" on this line are the
2957            // start of the attribute region.
2958            let bytes = line_no_nl.as_bytes();
2959            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2960            if indent_end > 0 {
2961                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2962            }
2963            // Defensive: caller verified the line starts with `<{tag_name}`.
2964            let after_indent = &line_no_nl[indent_end..];
2965            if after_indent.len() >= prefix_len {
2966                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2967                let rest = &after_indent[prefix_len..];
2968                emit_attr_region(builder, rest);
2969            } else {
2970                builder.token(SyntaxKind::TEXT.into(), after_indent);
2971            }
2972        } else if line_idx < end_line_idx {
2973            // Pure attribute line.
2974            let bytes = line_no_nl.as_bytes();
2975            let indent_end = bytes
2976                .iter()
2977                .position(|&b| !matches!(b, b' ' | b'\t'))
2978                .unwrap_or(bytes.len());
2979            if indent_end > 0 {
2980                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2981            }
2982            let attrs_text = &line_no_nl[indent_end..];
2983            if !attrs_text.is_empty() {
2984                builder.start_node(SyntaxKind::HTML_ATTRS.into());
2985                builder.token(SyntaxKind::TEXT.into(), attrs_text);
2986                builder.finish_node();
2987            }
2988        } else {
2989            // Last line: indent + attrs + ">" + trailing.
2990            let bytes = line_no_nl.as_bytes();
2991            let indent_end = bytes
2992                .iter()
2993                .position(|&b| !matches!(b, b' ' | b'\t'))
2994                .unwrap_or(bytes.len());
2995            if indent_end > 0 {
2996                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2997            }
2998            // Find the unquoted `>` byte position in this line.
2999            let mut quote: Option<u8> = None;
3000            let mut gt_pos: Option<usize> = None;
3001            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
3002                let actual_j = indent_end + j;
3003                match (quote, b) {
3004                    (None, b'"') | (None, b'\'') => quote = Some(b),
3005                    (Some(q), x) if x == q => quote = None,
3006                    (None, b'>') => {
3007                        gt_pos = Some(actual_j);
3008                        break;
3009                    }
3010                    _ => {}
3011                }
3012            }
3013            let Some(gt) = gt_pos else {
3014                // Defensive — caller said `>` is on this line.
3015                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3016                if !newline_str.is_empty() {
3017                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3018                }
3019                continue;
3020            };
3021            // Attribute region: between indent_end and gt, with possibly
3022            // trailing whitespace before `>`.
3023            let attrs_region = &line_no_nl[indent_end..gt];
3024            let region_bytes = attrs_region.as_bytes();
3025            // Strip trailing whitespace from attrs region; emit as
3026            // separate WHITESPACE so HTML_ATTRS only contains attribute
3027            // bytes.
3028            let mut attr_end = region_bytes.len();
3029            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3030                attr_end -= 1;
3031            }
3032            let attrs_text = &attrs_region[..attr_end];
3033            let trailing_ws = &attrs_region[attr_end..];
3034            if !attrs_text.is_empty() {
3035                builder.start_node(SyntaxKind::HTML_ATTRS.into());
3036                builder.token(SyntaxKind::TEXT.into(), attrs_text);
3037                builder.finish_node();
3038            }
3039            if !trailing_ws.is_empty() {
3040                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3041            }
3042            builder.token(SyntaxKind::TEXT.into(), ">");
3043            let after_gt = &line_no_nl[gt + 1..];
3044            if lift_trailing && !after_gt.is_empty() {
3045                // Lift trailing bytes (and the trailing newline) into
3046                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3047                // with `TEXT(">")`. The recursive parse at the close-marker
3048                // site treats `pre_content` as the leading bytes of the
3049                // structural body — same shape produced by `emit_open_tag_tokens`
3050                // for single-line opens.
3051                pre_content.push_str(after_gt);
3052                pre_content.push_str(newline_str);
3053                continue;
3054            }
3055            if !after_gt.is_empty() {
3056                builder.token(SyntaxKind::TEXT.into(), after_gt);
3057            }
3058        }
3059
3060        if !newline_str.is_empty() {
3061            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3062        }
3063    }
3064}
3065
3066/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3067/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3068/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3069/// node is added. Pandoc's projector reads attributes only for `<div>` /
3070/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3071fn emit_multiline_open_tag_simple(
3072    builder: &mut GreenNodeBuilder<'static>,
3073    lines: &[&str],
3074    start_pos: usize,
3075    end_line_idx: usize,
3076    bq_depth: usize,
3077) {
3078    for (line_idx, raw) in lines
3079        .iter()
3080        .enumerate()
3081        .take(end_line_idx + 1)
3082        .skip(start_pos)
3083    {
3084        let stripped = if bq_depth > 0 {
3085            strip_n_blockquote_markers(raw, bq_depth)
3086        } else {
3087            raw
3088        };
3089        let bq_prefix_len = raw.len() - stripped.len();
3090        // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3091        // re-emit prefixes only for subsequent lines.
3092        if bq_prefix_len > 0 && line_idx != start_pos {
3093            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3094        }
3095        let (line_no_nl, newline_str) = strip_newline(stripped);
3096        if !line_no_nl.is_empty() {
3097            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3098        }
3099        if !newline_str.is_empty() {
3100            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3101        }
3102    }
3103}
3104
3105/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3106/// `<div` literal up to end-of-line. Called only from
3107/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3108/// pure attribute (and possibly inter-attribute whitespace).
3109fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3110    if region.is_empty() {
3111        return;
3112    }
3113    let bytes = region.as_bytes();
3114    // Split a leading run of whitespace into a WHITESPACE token so the
3115    // HTML_ATTRS node holds only attribute bytes.
3116    let ws_end = bytes
3117        .iter()
3118        .position(|&b| !matches!(b, b' ' | b'\t'))
3119        .unwrap_or(bytes.len());
3120    if ws_end > 0 {
3121        builder.token(SyntaxKind::WHITESPACE.into(), &region[..ws_end]);
3122    }
3123    let attrs_text = &region[ws_end..];
3124    if !attrs_text.is_empty() {
3125        builder.start_node(SyntaxKind::HTML_ATTRS.into());
3126        builder.token(SyntaxKind::TEXT.into(), attrs_text);
3127        builder.finish_node();
3128    }
3129}
3130
3131/// Emit one continuation line of an HTML block, preserving any blockquote
3132/// markers as structural tokens (so the CST stays byte-equal to the source
3133/// and downstream consumers can strip them per-context).
3134fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3135    let inner = if bq_depth > 0 {
3136        let stripped = strip_n_blockquote_markers(line, bq_depth);
3137        let prefix_len = line.len() - stripped.len();
3138        if prefix_len > 0 {
3139            for ch in line[..prefix_len].chars() {
3140                if ch == '>' {
3141                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3142                } else {
3143                    let mut buf = [0u8; 4];
3144                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3145                }
3146            }
3147        }
3148        stripped
3149    } else {
3150        line
3151    };
3152
3153    let (line_without_newline, newline_str) = strip_newline(inner);
3154    if !line_without_newline.is_empty() {
3155        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3156    }
3157    if !newline_str.is_empty() {
3158        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3159    }
3160}
3161
3162#[cfg(test)]
3163mod tests {
3164    use super::*;
3165
3166    #[test]
3167    fn test_try_parse_html_comment() {
3168        assert_eq!(
3169            try_parse_html_block_start("<!-- comment -->", false),
3170            Some(HtmlBlockType::Comment)
3171        );
3172        assert_eq!(
3173            try_parse_html_block_start("  <!-- comment -->", false),
3174            Some(HtmlBlockType::Comment)
3175        );
3176    }
3177
3178    #[test]
3179    fn test_try_parse_div_tag() {
3180        assert_eq!(
3181            try_parse_html_block_start("<div>", false),
3182            Some(HtmlBlockType::BlockTag {
3183                tag_name: "div".to_string(),
3184                is_verbatim: false,
3185                closed_by_blank_line: false,
3186                depth_aware: true,
3187                closes_at_open_tag: false,
3188                is_closing: false,
3189            })
3190        );
3191        assert_eq!(
3192            try_parse_html_block_start("<div class=\"test\">", false),
3193            Some(HtmlBlockType::BlockTag {
3194                tag_name: "div".to_string(),
3195                is_verbatim: false,
3196                closed_by_blank_line: false,
3197                depth_aware: true,
3198                closes_at_open_tag: false,
3199                is_closing: false,
3200            })
3201        );
3202    }
3203
3204    #[test]
3205    fn test_try_parse_script_tag() {
3206        assert_eq!(
3207            try_parse_html_block_start("<script>", false),
3208            Some(HtmlBlockType::BlockTag {
3209                tag_name: "script".to_string(),
3210                is_verbatim: true,
3211                closed_by_blank_line: false,
3212                depth_aware: true,
3213                closes_at_open_tag: false,
3214                is_closing: false,
3215            })
3216        );
3217    }
3218
3219    #[test]
3220    fn test_try_parse_processing_instruction() {
3221        assert_eq!(
3222            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3223            Some(HtmlBlockType::ProcessingInstruction)
3224        );
3225    }
3226
3227    #[test]
3228    fn test_try_parse_declaration() {
3229        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3230        assert_eq!(
3231            try_parse_html_block_start("<!DOCTYPE html>", true),
3232            Some(HtmlBlockType::Declaration)
3233        );
3234        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3235        // just uppercase. Lowercase doctype must match too.
3236        assert_eq!(
3237            try_parse_html_block_start("<!doctype html>", true),
3238            Some(HtmlBlockType::Declaration)
3239        );
3240        // Pandoc dialect does not — bare declarations fall through to
3241        // paragraph parsing.
3242        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3243        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3244    }
3245
3246    #[test]
3247    fn test_dialect_specific_block_tag_membership() {
3248        // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3249        // CommonMark §4.6 type-6 plus a few additions. These tags
3250        // diverge between dialects:
3251        //   CM-only block tags (Pandoc treats as inline raw HTML):
3252        //     dialog, legend, menuitem, optgroup, option, frame,
3253        //     base, basefont, link, param
3254        //   Pandoc-only block tags (CM doesn't recognize):
3255        //     canvas, hgroup, isindex, meta, output
3256        for cm_only in [
3257            "<dialog>",
3258            "<legend>",
3259            "<menuitem>",
3260            "<optgroup>",
3261            "<option>",
3262            "<frame>",
3263            "<base>",
3264            "<basefont>",
3265            "<link>",
3266            "<param>",
3267        ] {
3268            assert!(
3269                matches!(
3270                    try_parse_html_block_start(cm_only, true),
3271                    Some(HtmlBlockType::BlockTag { .. })
3272                ),
3273                "{cm_only} should be a block-tag start under CommonMark",
3274            );
3275            assert_eq!(
3276                try_parse_html_block_start(cm_only, false),
3277                None,
3278                "{cm_only} should NOT be a block-tag start under Pandoc",
3279            );
3280        }
3281        for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3282            // Under CM these are not type-6 BlockTags; they may still match
3283            // type-7 (complete tag on a line) which has different semantics.
3284            assert!(
3285                !matches!(
3286                    try_parse_html_block_start(pandoc_only, true),
3287                    Some(HtmlBlockType::BlockTag { .. })
3288                ),
3289                "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3290            );
3291            assert!(
3292                matches!(
3293                    try_parse_html_block_start(pandoc_only, false),
3294                    Some(HtmlBlockType::BlockTag { .. })
3295                ),
3296                "{pandoc_only} should be a block-tag start under Pandoc",
3297            );
3298        }
3299    }
3300
3301    #[test]
3302    fn test_pandoc_inline_block_tag_membership() {
3303        // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3304        // fresh-block positions under Pandoc dialect. We list the
3305        // non-void, non-script subset (verbatim `script` is handled
3306        // via the verbatim path; void elements are deferred — see
3307        // PANDOC_INLINE_BLOCK_TAGS docs).
3308        for tag in [
3309            "<button>",
3310            "<iframe>",
3311            "<video>",
3312            "<audio>",
3313            "<noscript>",
3314            "<object>",
3315            "<map>",
3316            "<progress>",
3317            "<del>",
3318            "<ins>",
3319            "<svg>",
3320            "<applet>",
3321        ] {
3322            assert!(
3323                matches!(
3324                    try_parse_html_block_start(tag, false),
3325                    Some(HtmlBlockType::BlockTag {
3326                        depth_aware: true,
3327                        ..
3328                    })
3329                ),
3330                "{tag} should be a depth-aware block-tag start under Pandoc",
3331            );
3332        }
3333        // Closing forms of inline-block tags also start a block under
3334        // Pandoc — pandoc-native pins `</button>` standalone as a
3335        // single-line `RawBlock`. These use `closes_at_open_tag: true`
3336        // (no balanced match — the close emits as a one-line block on
3337        // its own).
3338        for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3339            assert!(
3340                matches!(
3341                    try_parse_html_block_start(closing, false),
3342                    Some(HtmlBlockType::BlockTag {
3343                        depth_aware: false,
3344                        closes_at_open_tag: true,
3345                        ..
3346                    })
3347                ),
3348                "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3349            );
3350        }
3351    }
3352
3353    #[test]
3354    fn test_pandoc_void_block_tag_membership() {
3355        // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3356        // at fresh-block positions under Pandoc dialect, with
3357        // `closes_at_open_tag: true` — the block always ends on the
3358        // open-tag line (no closing tag to match).
3359        for tag in [
3360            "<area>",
3361            "<embed>",
3362            "<source>",
3363            "<track>",
3364            "<embed src=\"foo.swf\">",
3365            "<source src=\"foo.mp4\" type=\"video/mp4\">",
3366        ] {
3367            assert!(
3368                matches!(
3369                    try_parse_html_block_start(tag, false),
3370                    Some(HtmlBlockType::BlockTag {
3371                        depth_aware: false,
3372                        closes_at_open_tag: true,
3373                        ..
3374                    })
3375                ),
3376                "{tag} should be a void block-tag start under Pandoc",
3377            );
3378        }
3379        // Closing forms of void tags also start a single-line block
3380        // under Pandoc. Void elements have no closing tag in HTML, but
3381        // `</embed>` etc. can appear in the wild — pandoc-native still
3382        // emits them as `RawBlock`s at fresh-block positions; mirror
3383        // that with the same `closes_at_open_tag: true` shape.
3384        for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3385            assert!(
3386                matches!(
3387                    try_parse_html_block_start(closing, false),
3388                    Some(HtmlBlockType::BlockTag {
3389                        depth_aware: false,
3390                        closes_at_open_tag: true,
3391                        ..
3392                    })
3393                ),
3394                "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3395            );
3396        }
3397        // Under CommonMark dialect, the void-tag block-start path is
3398        // skipped. `<source>` and `<track>` are in the CM type-6
3399        // BLOCK_TAGS set so they DO start a block, but with CM type-6
3400        // semantics (`closed_by_blank_line: true`,
3401        // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3402        // `<embed>` and `<area>` aren't in the CM type-6 list — they
3403        // fall through to type 7 (complete tag on a line by itself).
3404        assert_eq!(
3405            try_parse_html_block_start("<embed>", true),
3406            Some(HtmlBlockType::Type7)
3407        );
3408        assert_eq!(
3409            try_parse_html_block_start("<area>", true),
3410            Some(HtmlBlockType::Type7)
3411        );
3412        assert!(matches!(
3413            try_parse_html_block_start("<source src=\"x\">", true),
3414            Some(HtmlBlockType::BlockTag {
3415                closed_by_blank_line: true,
3416                closes_at_open_tag: false,
3417                ..
3418            })
3419        ));
3420        assert!(matches!(
3421            try_parse_html_block_start("<track src=\"x\">", true),
3422            Some(HtmlBlockType::BlockTag {
3423                closed_by_blank_line: true,
3424                closes_at_open_tag: false,
3425                ..
3426            })
3427        ));
3428    }
3429
3430    #[test]
3431    fn test_find_multiline_open_end() {
3432        // Single-line opens return None (caller takes the regular path).
3433        assert_eq!(
3434            find_multiline_open_end(
3435                &["<div id=\"x\">"],
3436                0,
3437                "<div id=\"x\">",
3438                "div",
3439                &ContainerPrefix::default()
3440            ),
3441            None
3442        );
3443        assert_eq!(
3444            find_multiline_open_end(
3445                &["<embed src=\"x\">"],
3446                0,
3447                "<embed src=\"x\">",
3448                "embed",
3449                &ContainerPrefix::default()
3450            ),
3451            None
3452        );
3453        // Multi-line opens return the line index of the closing `>`.
3454        assert_eq!(
3455            find_multiline_open_end(
3456                &["<embed", "  src=\"x\">"],
3457                0,
3458                "<embed",
3459                "embed",
3460                &ContainerPrefix::default()
3461            ),
3462            Some(1)
3463        );
3464        assert_eq!(
3465            find_multiline_open_end(
3466                &["<embed", "  src=\"x\"", "  type=\"video\">"],
3467                0,
3468                "<embed",
3469                "embed",
3470                &ContainerPrefix::default()
3471            ),
3472            Some(2)
3473        );
3474        // Tag-name mismatch returns None (case-insensitive on the tag name).
3475        assert_eq!(
3476            find_multiline_open_end(
3477                &["<embed", "  src=\"x\">"],
3478                0,
3479                "<embed",
3480                "div",
3481                &ContainerPrefix::default()
3482            ),
3483            None
3484        );
3485        assert_eq!(
3486            find_multiline_open_end(
3487                &["<EMBED", "  src=\"x\">"],
3488                0,
3489                "<EMBED",
3490                "embed",
3491                &ContainerPrefix::default()
3492            ),
3493            Some(1)
3494        );
3495        // Quoted `>` does not terminate the open tag; quote state threads
3496        // across line boundaries.
3497        assert_eq!(
3498            find_multiline_open_end(
3499                &["<embed title=\"a>b", "  c\">"],
3500                0,
3501                "<embed title=\"a>b",
3502                "embed",
3503                &ContainerPrefix::default()
3504            ),
3505            Some(1)
3506        );
3507        // No `>` anywhere returns None.
3508        assert_eq!(
3509            find_multiline_open_end(
3510                &["<embed", "  src=\"x\""],
3511                0,
3512                "<embed",
3513                "embed",
3514                &ContainerPrefix::default()
3515            ),
3516            None
3517        );
3518        // Subsequent lines inside a blockquote: bq markers stripped before
3519        // scanning so `> ` prefixes don't count.
3520        assert_eq!(
3521            find_multiline_open_end(
3522                &["<div", ">   id=\"x\">"],
3523                0,
3524                "<div",
3525                "div",
3526                &ContainerPrefix::bq_only(1)
3527            ),
3528            Some(1)
3529        );
3530        // Nested bq: strips two `> ` per line.
3531        assert_eq!(
3532            find_multiline_open_end(
3533                &["<section", "> >   id=\"x\">"],
3534                0,
3535                "<section",
3536                "section",
3537                &ContainerPrefix::bq_only(2)
3538            ),
3539            Some(1)
3540        );
3541    }
3542
3543    #[test]
3544    fn test_pandoc_html_open_tag_closes() {
3545        // Single-line complete: scanner finds `>` on the first line.
3546        assert!(pandoc_html_open_tag_closes(
3547            &["<div>"],
3548            0,
3549            &ContainerPrefix::default()
3550        ));
3551        assert!(pandoc_html_open_tag_closes(
3552            &["<embed src=\"x\">"],
3553            0,
3554            &ContainerPrefix::default()
3555        ));
3556        // Multi-line complete: scanner finds `>` on a later line.
3557        assert!(pandoc_html_open_tag_closes(
3558            &["<div", "  id=\"x\">", "body", "</div>"],
3559            0,
3560            &ContainerPrefix::default()
3561        ));
3562        assert!(pandoc_html_open_tag_closes(
3563            &["<embed", "  src=\"x.png\" alt=\"y\">"],
3564            0,
3565            &ContainerPrefix::default()
3566        ));
3567        // Quoted `>` does not close: scanner threads quote state.
3568        assert!(!pandoc_html_open_tag_closes(
3569            &["<div title=\"a>b", "  c\""],
3570            0,
3571            &ContainerPrefix::default()
3572        ));
3573        assert!(pandoc_html_open_tag_closes(
3574            &["<div title=\"a>b", "  c\">"],
3575            0,
3576            &ContainerPrefix::default()
3577        ));
3578        // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
3579        assert!(!pandoc_html_open_tag_closes(
3580            &["<embed"],
3581            0,
3582            &ContainerPrefix::default()
3583        ));
3584        assert!(!pandoc_html_open_tag_closes(
3585            &["<div", "foo", "bar"],
3586            0,
3587            &ContainerPrefix::default()
3588        ));
3589        // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
3590        // across them); the scan continues until EOF or `>`.
3591        assert!(pandoc_html_open_tag_closes(
3592            &["<div", "", "id=\"x\">"],
3593            0,
3594            &ContainerPrefix::default()
3595        ));
3596    }
3597
3598    #[test]
3599    fn test_try_parse_cdata() {
3600        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
3601        assert_eq!(
3602            try_parse_html_block_start("<![CDATA[content]]>", true),
3603            Some(HtmlBlockType::CData)
3604        );
3605        // Pandoc dialect does not.
3606        assert_eq!(
3607            try_parse_html_block_start("<![CDATA[content]]>", false),
3608            None
3609        );
3610    }
3611
#[test]
fn test_extract_block_tag_name_open_only() {
    // With closing tags disallowed, each of these `div` openers (bare, with
    // an attribute, self-closing) yields the tag name.
    for opener in ["<div>", "<div class=\"test\">", "<div/>"] {
        assert_eq!(extract_block_tag_name(opener, false).as_deref(), Some("div"));
    }

    // A closing tag, an empty tag, and a space right after `<` yield no name.
    for rejected in ["</div>", "<>", "< div>"] {
        assert_eq!(extract_block_tag_name(rejected, false), None);
    }
}
3630
#[test]
fn test_extract_block_tag_name_with_closing() {
    // CommonMark §4.6 type-6 starts also accept closing tags, so with the
    // flag enabled `</div>` yields its name, with or without a space
    // before `>`.
    let tight = extract_block_tag_name("</div>", true);
    let spaced = extract_block_tag_name("</div >", true);

    assert_eq!(tight.as_deref(), Some("div"));
    assert_eq!(spaced.as_deref(), Some("div"));
}
3643
#[test]
fn test_commonmark_type6_closing_tag_start() {
    // Expected classification of a bare `</div>` line under CommonMark.
    let expected = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: true,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: true,
    };

    let actual = try_parse_html_block_start("</div>", true);
    assert_eq!(actual, Some(expected));
}
3658
#[test]
fn test_commonmark_type7_open_tag() {
    // `a` is not a type-6 tag: alone on a line, the complete open tag is a
    // type-7 start under CommonMark and is rejected otherwise.
    let anchor_line = "<a href=\"foo\">";

    let commonmark_start = try_parse_html_block_start(anchor_line, true);
    let other_start = try_parse_html_block_start(anchor_line, false);

    assert_eq!(commonmark_start, Some(HtmlBlockType::Type7));
    assert_eq!(other_start, None);
}
3669
#[test]
fn test_commonmark_type7_close_tag() {
    // A lone `</ins>` closing tag is classified as a type-7 start under
    // CommonMark.
    let expected = Some(HtmlBlockType::Type7);
    let actual = try_parse_html_block_start("</ins>", true);
    assert_eq!(actual, expected);
}
3677
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
    // Type 7 requires the complete tag to be followed only by whitespace,
    // so text after `<a>` means no block start.
    let start = try_parse_html_block_start("<a> hi", true);
    assert!(start.is_none());
}
3683
#[test]
fn test_is_closing_marker_comment() {
    let comment = HtmlBlockType::Comment;

    // `-->` is a closing marker for a comment block, alone or after text.
    for closing_line in ["-->", "end -->"] {
        assert!(is_closing_marker(closing_line, &comment));
    }

    // The opening marker is not a closing marker.
    assert!(!is_closing_marker("<!--", &comment));
}
3691
#[test]
fn test_is_closing_marker_tag() {
    let div_block = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: false,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: false,
    };

    // Exact match, upper-case (matching is case-insensitive), and a close
    // tag preceded by other content all count as closing markers.
    for closing_line in ["</div>", "</DIV>", "content</div>"] {
        assert!(is_closing_marker(closing_line, &div_block));
    }

    // The open tag itself does not.
    assert!(!is_closing_marker("<div>", &div_block));
}
3707
#[test]
fn test_parse_html_comment_block() {
    // A comment that opens and closes on its only line.
    let source = "<!-- comment -->\n";
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The parser reports line 1 as the next position.
    assert_eq!(next_line, 1);
}
3728
#[test]
fn test_parse_div_block() {
    // Open tag, one content line, close tag.
    let source = "<div>\ncontent\n</div>\n";
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // All three lines belong to the block.
    assert_eq!(next_line, 3);
}
3749
#[test]
fn test_parse_html_block_no_closing() {
    // The `<div>` is never closed in this input.
    let source = "<div>\ncontent\n";
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // With no closing tag the block still consumes every remaining line.
    assert_eq!(next_line, 2);
}
3771
#[test]
fn test_parse_div_block_nested_pandoc() {
    // Pandoc dialect: for nested `<div>...<div>...</div>...</div>` the block
    // must end at the OUTER `</div>`, not at the first `</div>` encountered.
    // A CommonMark-style "first close" scan would stop too early; Pandoc's
    // div parsing tracks nesting depth (mirrors `htmlInBalanced`).
    let source = concat!(
        "<div id=\"outer\">\n",
        "\n",
        "<div id=\"inner\">\n",
        "\n",
        "deep content\n",
        "\n",
        "</div>\n",
        "\n",
        "</div>\n",
    );
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    // `false` selects the non-CommonMark (Pandoc) dialect.
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Nine lines in total (outer open, blank, inner open, blank, content,
    // blank, inner close, blank, outer close), and all are consumed.
    assert_eq!(next_line, 9);
}
3800
#[test]
fn test_parse_div_block_same_line_pandoc() {
    // `<div>foo</div>` on one line has one open and one close, so depth
    // returns to zero and the block ends on that first line. Guards against
    // depth tracking regressing this simple case.
    let source = "<div>foo</div>\n";
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    assert_eq!(next_line, 1);
}
3822
#[test]
fn test_commonmark_verbatim_first_close() {
    // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1 the
    // block ends at the first line containing the matching close tag; it
    // is not depth-aware. A bogus inner `<script>` is stashed inside a JS
    // string as a decoy.
    //
    // The trailing `after` line is what makes the assertion meaningful.
    // If the scanner counted the decoy as a nested open, the single
    // `</script>` would leave the block unclosed and it would run to EOF
    // (position 4). Without a line after `</script>`, "closed at the first
    // close" and "ran to EOF" would both report 3 and the test could not
    // catch that regression.
    let input = "<script>\nlet x = '<script>';\n</script>\nafter\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
    let mut builder = GreenNodeBuilder::new();

    // is_commonmark = true.
    let block_type = try_parse_html_block_start(lines[0], true).unwrap();
    let opts = ParserOptions::default();
    let new_pos = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        block_type,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    // Closed at the first `</script>` (line 2), so the next position is 3
    // and the `after` line (line 3) is left for the caller.
    assert_eq!(new_pos, 3);
}
3848
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    // The open tag spans four lines, with its closing `>` alone on the
    // last of them (lines 0..=3); content starts at line 4:
    //
    //   <div
    //     id="x"
    //     class="y"
    //   >
    //
    //   foo
    //
    //   </div>
    let source = concat!(
        "<div\n",
        "  id=\"x\"\n",
        "  class=\"y\"\n",
        ">\n",
        "\n",
        "foo\n",
        "\n",
        "</div>\n",
    );
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Eight lines: four open-tag lines, blank, `foo`, blank, `</div>`.
    assert_eq!(next_line, 8);

    // The tree must expose the attribute region as an HTML_ATTRS node.
    let root = crate::syntax::SyntaxNode::new_root(green.finish());
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token's text reproduces the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3901
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    // Multi-line open tag whose closing `>` sits at the end of the last
    // attribute line. Also pins that the attributes are exposed as an
    // HTML_ATTRS node.
    let source = concat!(
        "<div\n",
        "  id=\"x\"\n",
        "  class=\"y\">\n",
        "foo\n",
        "</div>\n",
    );
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    let start = try_parse_html_block_start(rows[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Five lines: three open-tag lines (the last carrying `>`), `foo`,
    // `</div>`.
    assert_eq!(next_line, 5);

    let root = crate::syntax::SyntaxNode::new_root(green.finish());
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenated token text must equal the input exactly.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3942
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    // `<div>` and `foo`, then a blank line, then `bar`.
    let source = "<div>\nfoo\n\nbar\n";
    let rows: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    let options = ParserOptions::default();
    let prefix = ContainerPrefix::default();
    // `true` selects the CommonMark dialect.
    let start = try_parse_html_block_start(rows[0], true).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &rows,
        0,
        start,
        &prefix,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The block holds `<div>\nfoo\n` and stops at the blank line (line 2).
    assert_eq!(next_line, 2);
}
3964}