Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
10
/// HTML block-level tag names as defined by the CommonMark spec (§4.6,
/// type 6). These tags start an HTML block when found at the start of a
/// line under the CommonMark dialect.
///
/// Every entry is lowercase ASCII; the lookups in this file lowercase the
/// candidate name before testing membership.
///
/// NOTE(review): the type-6 tag list has changed between CommonMark
/// revisions — confirm this table against the spec version being targeted.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
77
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// These are the CommonMark §4.6 type-1 start tags (`<pre`, `<script`,
/// `<style`, `<textarea`). Block detection in this file marks a matching
/// opening tag as `is_verbatim`, and the type-7 check refuses these names.
/// Entries are lowercase ASCII.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
80
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`). The verbatim tags
/// (`pre`, `script`, `style`, `textarea`) are listed here as well.
///
/// Pandoc's `eitherBlockOrInline` set (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`,
/// `applet`, plus the void elements `embed`, `area`, `source`, `track`
/// and the verbatim `script`) is tracked separately: the non-void members
/// live in [`PANDOC_INLINE_BLOCK_TAGS`], the void members in
/// [`PANDOC_VOID_BLOCK_TAGS`], and `script` stays in this list. Those
/// tags act as block starters at fresh-block positions but stay inline
/// inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Every entry is lowercase ASCII; the lookups in this file lowercase the
/// candidate name before testing membership.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
158
159/// Whether `name` (case-insensitive) is one of the HTML block-level tags
160/// recognized by CommonMark §4.6 type-6.
161pub fn is_html_block_tag_name(name: &str) -> bool {
162    let lower = name.to_ascii_lowercase();
163    BLOCK_TAGS.contains(&lower.as_str())
164}
165
166/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
167/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
168/// Used by the pandoc-native projector's `split_html_block_by_tags` to
169/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
170/// the block — block-level tags emit as separate `RawBlock` entries;
171/// inline tags stay inline in the surrounding `Plain` content.
172pub fn is_pandoc_block_tag_name(name: &str) -> bool {
173    let lower = name.to_ascii_lowercase();
174    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
175}
176
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`). At top level (or after a blank line)
/// pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but
/// inside an existing HTML block once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Void elements (`area`, `embed`, `source`, `track`) live in
/// [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
/// rule as non-void inline-block tags but emit a single RawBlock per
/// instance instead of a matched-pair lift.
/// `script` is omitted because it is already verbatim (handled by the
/// `<script>...</script>` raw-text path) and the strict-block check
/// fires first regardless.
///
/// Every entry is lowercase ASCII.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
201
202/// Whether `name` (case-insensitive) is one of pandoc's
203/// `eitherBlockOrInline` tags (excluding void elements and `script`;
204/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
205pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
206    let lower = name.to_ascii_lowercase();
207    PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
208}
209
/// Void-element subset of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): `area`,
/// `embed`, `source` and `track`. The remaining HTML void elements are
/// handled elsewhere — `br`, `wbr`, `img` and `input` are inline-only.
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Every entry is lowercase ASCII.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
225
226/// Whether `name` (case-insensitive) is one of pandoc's void
227/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
228pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
229    let lower = name.to_ascii_lowercase();
230    PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
231}
232
233/// Whether the given tag name is eligible for the Phase 6 / Fix #4
234/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
235/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
236/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
237/// and NOT void. These are the tags where pandoc parses the body as
238/// fresh markdown between RawBlock emissions of the open/close tags —
239/// exactly the shape we can lift into structural CST children.
240///
241/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
242/// additional gate at the lift-gate site: the lift is abandoned when
243/// the body's first non-blank content is a void block tag at a
244/// fresh-block position (`<video>\n<source ...>\n</video>` projects
245/// per-tag rather than matched-pair, mirroring pandoc).
246///
247/// `<div>` is intentionally excluded — it has its own lift path
248/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
249/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
250pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
251    let lower = name.to_ascii_lowercase();
252    if VERBATIM_TAGS.contains(&lower.as_str()) {
253        return false;
254    }
255    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
256        return false;
257    }
258    if lower == "div" {
259        return false;
260    }
261    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
262        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
263}
264
265/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
266/// — anything that has an opening and a matching closing form whose
267/// `</tag>` would be recognized by the dispatcher as a separate block
268/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
269/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
270/// Void tags are excluded — they have no close form.
271///
272/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
273/// an open inside the buffer whose close would otherwise interrupt the
274/// list item mid-construct.
275pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
276    let lower = name.to_ascii_lowercase();
277    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
278        return false;
279    }
280    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
281        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
282        || VERBATIM_TAGS.contains(&lower.as_str())
283}
284
285/// Open-tag-attribute tokenization gate for non-div strict-block tags
286/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
287/// open tag is eligible for finer-grained tokenization
288/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
289/// driving the full body lift — that's the `bq_clean_lift` path. The
290/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
291/// the salsa anchor index.
292///
293/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
294/// wrapper) regardless of bq depth, so this gate skips it.
295fn bq_strict_attr_emit_tag_name(
296    wrapper_kind: SyntaxKind,
297    block_type: &HtmlBlockType,
298    bq_depth: usize,
299) -> Option<&str> {
300    if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
301        return None;
302    }
303    match block_type {
304        HtmlBlockType::BlockTag {
305            tag_name,
306            is_verbatim: false,
307            closed_by_blank_line: false,
308            depth_aware: true,
309            closes_at_open_tag: false,
310            is_closing: false,
311        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
312        _ => None,
313    }
314}
315
/// Information about a detected HTML block opening, as classified by
/// [`try_parse_html_block_start`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...> (only produced under the CommonMark dialect).
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]> (only produced under the
    /// CommonMark dialect).
    CData,
    /// Tag-shaped block start. Under CommonMark this covers types 6/1
    /// (`tag_name` is one of `BLOCK_TAGS` or `VERBATIM_TAGS`); under the
    /// Pandoc dialect `tag_name` comes from `PANDOC_BLOCK_TAGS`,
    /// `PANDOC_INLINE_BLOCK_TAGS` or `PANDOC_VOID_BLOCK_TAGS`.
    BlockTag {
        /// Tag name; `try_parse_html_block_start` stores it lowercased.
        tag_name: String,
        /// Whether the tag is one of `VERBATIM_TAGS` and was recognized as
        /// a verbatim opener.
        is_verbatim: bool,
        /// Set to use CommonMark §4.6 type-6 end semantics (block ends at
        /// blank line); otherwise the legacy "ends at matching `</tag>`"
        /// semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored when
        /// `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to end-of-input —
        /// and for standalone closing forms under the Pandoc dialect.
        closes_at_open_tag: bool,
        /// Records whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror pandoc's
        /// `isInlineTag` special cases (e.g. `</script>` is inline even
        /// when `<script>` is not — pandoc treats the close-form as
        /// always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
360
361/// Try to detect an HTML block opening from content.
362/// Returns block type if this is a valid HTML block start.
363///
364/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
365/// accept closing tags (`</div>`), type-6 blocks end at the next blank
366/// line (rather than a matching close tag), and type 7 is recognized.
367pub(crate) fn try_parse_html_block_start(
368    content: &str,
369    is_commonmark: bool,
370) -> Option<HtmlBlockType> {
371    let trimmed = strip_leading_spaces(content);
372
373    // Must start with <
374    if !trimmed.starts_with('<') {
375        return None;
376    }
377
378    // HTML comment
379    if trimmed.starts_with("<!--") {
380        return Some(HtmlBlockType::Comment);
381    }
382
383    // Processing instruction
384    if trimmed.starts_with("<?") {
385        return Some(HtmlBlockType::ProcessingInstruction);
386    }
387
388    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
389    // recognize bare CDATA as a raw HTML block; the literal bytes fall
390    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
391    // text is parsed as inline markdown, etc).
392    if is_commonmark && trimmed.starts_with("<![CDATA[") {
393        return Some(HtmlBlockType::CData);
394    }
395
396    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
397    // does not recognize bare declarations as raw HTML blocks (its
398    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
399    // tag-shaped blocks); the bytes fall through to paragraph parsing.
400    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
401        let after_bang = &trimmed[2..];
402        if after_bang.chars().next()?.is_ascii_alphabetic() {
403            return Some(HtmlBlockType::Declaration);
404        }
405    }
406
407    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
408    // Pandoc-native recognizes standalone closing forms of strict-block tags
409    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
410    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
411    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
412    // the open-tag line via `closes_at_open_tag: true`.
413    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
414        let tag_lower = tag_name.to_lowercase();
415        let is_closing = trimmed.starts_with("</");
416
417        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
418        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
419        // Unlike inline-block / void closes, these CAN interrupt a running
420        // paragraph (the dispatcher's `cannot_interrupt` only covers the
421        // inline-block / void categories). Inline-block / void closes are
422        // handled by their own branches further below.
423        if !is_commonmark
424            && is_closing
425            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
426                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
427            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
428            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
429        {
430            return Some(HtmlBlockType::BlockTag {
431                tag_name: tag_lower,
432                is_verbatim: false,
433                closed_by_blank_line: false,
434                depth_aware: false,
435                closes_at_open_tag: true,
436                is_closing: true,
437            });
438        }
439
440        // Under Pandoc, remaining closing forms (truly inline-only tags like
441        // `</em>`, `</span>`) are not block starts — fall through to the
442        // existing inline-html path. Inline-block + void closes are caught
443        // by the dedicated branches further below.
444        if !is_commonmark
445            && is_closing
446            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
447            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
448        {
449            return None;
450        }
451
452        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
453        // membership: pandoc's `blockHtmlTags` (see
454        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
455        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
456        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
457        let is_block_tag = if is_commonmark {
458            BLOCK_TAGS.contains(&tag_lower.as_str())
459        } else {
460            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
461        };
462        if is_block_tag {
463            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
464            return Some(HtmlBlockType::BlockTag {
465                tag_name: tag_lower,
466                is_verbatim,
467                closed_by_blank_line: is_commonmark && !is_verbatim,
468                depth_aware: !is_commonmark,
469                closes_at_open_tag: false,
470                is_closing,
471            });
472        }
473
474        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
475        // starters at fresh-block positions. The block dispatcher caller
476        // gates these as `cannot_interrupt` (mirrors pandoc — they never
477        // interrupt a running paragraph; only start a fresh block when
478        // following a blank line or at document start). Closing forms
479        // (`</video>`) emit as a single-line `RawBlock` with no balanced
480        // match — pandoc-native pins this for standalone closes.
481        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
482            return Some(HtmlBlockType::BlockTag {
483                tag_name: tag_lower,
484                is_verbatim: false,
485                closed_by_blank_line: false,
486                depth_aware: !is_closing,
487                closes_at_open_tag: is_closing,
488                is_closing,
489            });
490        }
491
492        // Pandoc dialect also recognizes the void subset of
493        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
494        // These have no closing tag, so the parser closes the block
495        // immediately on the open-tag line; the projector's
496        // `split_html_block_by_tags` handles the same-line splitting
497        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
498        // non-void inline-block tags, void tags never interrupt a
499        // running paragraph (gated as `cannot_interrupt` in the
500        // dispatcher). Closing forms (`</embed>`) — semantically
501        // nonsensical for void elements — pandoc still emits as a
502        // single-line `RawBlock`; mirror that.
503        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
504            return Some(HtmlBlockType::BlockTag {
505                tag_name: tag_lower,
506                is_verbatim: false,
507                closed_by_blank_line: false,
508                depth_aware: false,
509                closes_at_open_tag: true,
510                is_closing,
511            });
512        }
513
514        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
515        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
516        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
517        // do not start a type-1 block. Letting `</pre>` through here would
518        // wrongly interrupt a paragraph.
519        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
520            return Some(HtmlBlockType::BlockTag {
521                tag_name: tag_lower,
522                is_verbatim: true,
523                closed_by_blank_line: false,
524                depth_aware: !is_commonmark,
525                closes_at_open_tag: false,
526                is_closing: false,
527            });
528        }
529    }
530
531    // Type 7 (CommonMark only): complete open or close tag on a line by
532    // itself, tag name not in the type-1 verbatim list.
533    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
534    {
535        let rest = &trimmed[end..];
536        let only_ws = rest
537            .bytes()
538            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
539        if only_ws {
540            // Reject if the tag name belongs to the type-1 verbatim set
541            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
542            // type-1 starts above, so seeing one here means the opener
543            // had a different shape (e.g. `<pre/>` self-closing) that
544            // shouldn't trigger type 7 either. Conservatively skip.
545            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
546            let name_end = leading
547                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
548                .unwrap_or(leading.len());
549            let name = leading[..name_end].to_ascii_lowercase();
550            if !VERBATIM_TAGS.contains(&name.as_str()) {
551                return Some(HtmlBlockType::Type7);
552            }
553        }
554    }
555
556    None
557}
558
/// Pulls the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted only
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form); otherwise it yields `None`.
///
/// The name runs up to the first whitespace character, `>` or `/` — an
/// approximation of the spec's "followed by a space, tab, line ending,
/// `>`, or `/>`" rule — or to the end of `text`. It must be non-empty,
/// start with an ASCII letter and contain only ASCII alphanumerics.
///
/// The name is returned with its original casing.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    // Everything before the first delimiter; the whole remainder when
    // there is none.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    // An empty name fails on the `?`; a leading letter is itself
    // alphanumeric, so only the tail still needs checking.
    let mut chars = name.chars();
    let starts_with_letter = chars.next()?.is_ascii_alphabetic();
    let well_formed = starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric());

    well_formed.then(|| name.to_owned())
}
603
604/// Whether this block type ends at a blank line (CommonMark types 6 & 7
605/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
606/// marker — only at end of input or the next blank line.
607fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
608    matches!(
609        block_type,
610        HtmlBlockType::Type7
611            | HtmlBlockType::BlockTag {
612                closed_by_blank_line: true,
613                ..
614            }
615    )
616}
617
618/// Check if a line contains the closing marker for the given HTML block type.
619/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
620/// blank-line-terminated types (6 in CommonMark, 7) never match here.
621fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
622    match block_type {
623        HtmlBlockType::Comment => line.contains("-->"),
624        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
625        HtmlBlockType::Declaration => line.contains('>'),
626        HtmlBlockType::CData => line.contains("]]>"),
627        HtmlBlockType::BlockTag {
628            tag_name,
629            closed_by_blank_line: false,
630            ..
631        } => {
632            // Look for closing tag </tagname>
633            let closing_tag = format!("</{}>", tag_name);
634            line.to_lowercase().contains(&closing_tag)
635        }
636        HtmlBlockType::BlockTag {
637            closed_by_blank_line: true,
638            ..
639        }
640        | HtmlBlockType::Type7 => false,
641    }
642}
643
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`.
///
/// Matching is ASCII case-insensitive on both `line` and `tag_name`, and the
/// name must be followed by a space, tab, line ending, `>`, `/`, or the end
/// of `line`. Self-closing forms (`<tag .../>`) and tags whose name appears
/// inside a quoted attribute value are NOT counted — the scanner walks
/// `<...>` brackets and respects `"`/`'` quoting.
///
/// A `<` whose bracket is never closed on this line ends the scan; a
/// matching tag name at that position is still counted (this is what makes
/// an open tag whose attributes continue on the next line count as open).
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let needle = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }

        let after = i + 1;
        let is_close = bytes.get(after) == Some(&b'/');
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + needle.len();

        // Compare in place, ignoring ASCII case; `get` also covers the
        // "name would run past the end of the line" case.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(needle));
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Find the `>` that ends this bracket, skipping quoted attribute
        // values.
        let scan_from = if matched { after_name } else { after };
        let bracket_end = scan_tag_bracket_end(bytes, scan_from, i);
        let self_close = bracket_end.is_some_and(|(_, self_closing)| self_closing);

        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        match bracket_end {
            Some((gt_pos, _)) => i = gt_pos + 1,
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so stop scanning.
            None => break,
        }
    }

    (opens, closes)
}

/// Scans `bytes[start..]` for the unquoted `>` that closes the bracket
/// opened by the `<` at `lt_pos`. Returns the index of that `>` together
/// with whether the bracket is self-closing (the byte just before `>` is
/// `/`, and that byte is not the one right after `<`).
fn scan_tag_bracket_end(bytes: &[u8], start: usize, lt_pos: usize) -> Option<(usize, bool)> {
    let mut quote: Option<u8> = None;
    for (j, &b) in bytes.iter().enumerate().skip(start) {
        match quote {
            // Inside a quoted value only the matching quote is significant.
            Some(q) => {
                if b == q {
                    quote = None;
                }
            }
            None => match b {
                b'"' | b'\'' => quote = Some(b),
                b'>' => return Some((j, j > lt_pos + 1 && bytes[j - 1] == b'/')),
                _ => {}
            },
        }
    }
    None
}
724
725/// Pandoc-dialect lift for HTML comments / processing instructions
726/// whose close marker is followed by additional bytes (same-line
727/// trailing or following lines). Pandoc-native emits a `RawBlock` for
728/// the marker bytes only, then parses the remainder as fresh blocks.
729///
730/// Returns `Some(consumed_lines)` when the split fires (caller must
731/// NOT enter the legacy emission); `None` to fall back to the legacy
732/// path (no close marker found, or no trailing content to split).
733///
734/// CST shape on success:
735/// ```text
736/// HTML_BLOCK
737///   HTML_BLOCK_TAG (open)        // line[0] up to and incl close marker
738///     TEXT  "<!-- hi -->"        // or with HTML_BLOCK_CONTENT in between
739///     ...                        // for multi-line `<!--\n…\n-->` shape
740/// <sibling blocks>               // recursive parse of trailing + lines[M+1..]
741/// ```
742fn try_parse_comment_pi_with_trailing_split(
743    builder: &mut GreenNodeBuilder<'static>,
744    lines: &[&str],
745    start_pos: usize,
746    block_type: &HtmlBlockType,
747    wrapper_kind: SyntaxKind,
748    bq_depth: usize,
749    config: &ParserOptions,
750) -> Option<usize> {
751    let marker: &str = match block_type {
752        HtmlBlockType::Comment => "-->",
753        HtmlBlockType::ProcessingInstruction => "?>",
754        _ => return None,
755    };
756
757    // Find the close marker in the bq-stripped line content. For
758    // bq_depth == 0 the inner content equals the raw line; for
759    // bq_depth > 0 we look past the `>` markers stripped by the
760    // outer dispatcher (line 0) and emitted as bq prefix below
761    // (lines > 0). `marker_end_in_inner` is the byte offset of the
762    // first byte AFTER the close marker, measured from the start
763    // of the inner (post-strip) content.
764    let mut close_line_idx: Option<usize> = None;
765    let mut marker_end_in_inner: usize = 0;
766    for (offset, line) in lines[start_pos..].iter().enumerate() {
767        let inner = if bq_depth > 0 {
768            strip_n_blockquote_markers(line, bq_depth)
769        } else {
770            line
771        };
772        if let Some(pos) = inner.find(marker) {
773            close_line_idx = Some(start_pos + offset);
774            marker_end_in_inner = pos + marker.len();
775            break;
776        }
777    }
778    let close_line_idx = close_line_idx?;
779    let close_line = lines[close_line_idx];
780    let close_inner = if bq_depth > 0 {
781        strip_n_blockquote_markers(close_line, bq_depth)
782    } else {
783        close_line
784    };
785    let close_prefix_len = close_line.len() - close_inner.len();
786    let trailing = &close_inner[marker_end_in_inner..];
787
788    // Only fire when there is non-whitespace content AFTER the close
789    // marker on the close line. The legacy path correctly handles
790    // the close-line-ends-at-close-marker shapes (`-->\n` followed
791    // by separate blocks); only the same-line-trailing case needs
792    // structural splitting. Trailing-whitespace-only handling
793    // (`-->   \n`) is a projector-side trim — separate concern.
794    let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
795    if !has_non_ws_trailing {
796        return None;
797    }
798
799    builder.start_node(wrapper_kind.into());
800
801    // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
802    // middle `HTML_BLOCK_CONTENT` lines between open and close. The
803    // close `HTML_BLOCK_TAG` carries only the bytes up to and
804    // including the close marker — trailing bytes go to the sibling.
805    if close_line_idx == start_pos {
806        // Same-line shape: one HTML_BLOCK_TAG containing the close
807        // marker's bytes. The newline lives on the trailing sibling.
808        // Line 0's bq prefix (if any) was already emitted by the
809        // outer dispatcher; emit only the inner marker bytes.
810        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
811        let close_part = &close_inner[..marker_end_in_inner];
812        if !close_part.is_empty() {
813            builder.token(SyntaxKind::TEXT.into(), close_part);
814        }
815        builder.finish_node();
816    } else {
817        // Multi-line shape: open tag covers lines[start_pos..close],
818        // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
819        // only the marker bytes. Line 0's bq prefix was emitted by
820        // the outer dispatcher; subsequent lines (middle + close)
821        // need bq prefix re-emission inside the wrapper.
822        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
823        let first_line = lines[start_pos];
824        let first_inner = if bq_depth > 0 {
825            strip_n_blockquote_markers(first_line, bq_depth)
826        } else {
827            first_line
828        };
829        let (line_no_nl, nl) = strip_newline(first_inner);
830        if !line_no_nl.is_empty() {
831            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
832        }
833        if !nl.is_empty() {
834            builder.token(SyntaxKind::NEWLINE.into(), nl);
835        }
836        builder.finish_node();
837
838        if close_line_idx > start_pos + 1 {
839            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
840            for content_line in &lines[start_pos + 1..close_line_idx] {
841                emit_html_block_line(builder, content_line, bq_depth);
842            }
843            builder.finish_node();
844        }
845
846        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
847        if bq_depth > 0 && close_prefix_len > 0 {
848            emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
849        }
850        let close_part = &close_inner[..marker_end_in_inner];
851        if !close_part.is_empty() {
852            builder.token(SyntaxKind::TEXT.into(), close_part);
853        }
854        builder.finish_node();
855    }
856
857    builder.finish_node(); // HTML_BLOCK
858
859    // Recursively parse JUST the trailing bytes on the close line
860    // and graft top-level children as siblings of the HTML_BLOCK we
861    // just closed. We do NOT consume subsequent lines here — the
862    // outer dispatcher continues from `close_line_idx + 1` and
863    // handles container-boundary lines (`:::` div closes, blockquote
864    // markers, list-marker continuations) correctly. Multi-line
865    // softbreak continuation (`<!-- --> trailing\nmore\n` →
866    // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
867    // outer dispatcher sees `more` after the close line and starts
868    // a fresh paragraph. Refdefs flow through from the outer config
869    // (same pattern as `emit_html_block_body_lifted_inner`).
870    if !trailing.is_empty() {
871        let mut inner_options = config.clone();
872        let refdefs = config.refdef_labels.clone().unwrap_or_default();
873        inner_options.refdef_labels = Some(refdefs.clone());
874        let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
875        let mut bq = None;
876        graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
877    }
878
879    Some(close_line_idx + 1)
880}
881
882/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
883/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
884/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
885/// identical to the source either way; only the wrapper retag changes.
886pub(crate) fn parse_html_block_with_wrapper(
887    builder: &mut GreenNodeBuilder<'static>,
888    lines: &[&str],
889    start_pos: usize,
890    block_type: HtmlBlockType,
891    bq_depth: usize,
892    wrapper_kind: SyntaxKind,
893    config: &ParserOptions,
894) -> usize {
895    // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
896    // closes the RawBlock at the close marker (`-->` / `?>`) and parses
897    // any subsequent bytes (same-line trailing or following lines) as
898    // fresh blocks. The legacy path absorbs them into the HTML block
899    // wrapper, producing one oversized RawBlock. Handle the split here
900    // before entering the legacy emission so the CST encodes the
901    // sibling structure.
902    if config.dialect == crate::options::Dialect::Pandoc
903        && matches!(
904            block_type,
905            HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
906        )
907        && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
908            builder,
909            lines,
910            start_pos,
911            &block_type,
912            wrapper_kind,
913            bq_depth,
914            config,
915        )
916    {
917        return consumed;
918    }
919
920    // Start HTML block
921    builder.start_node(wrapper_kind.into());
922
923    let first_line = lines[start_pos];
924    let blank_terminated = ends_at_blank_line(&block_type);
925
926    // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
927    // tokens for the first line's blockquote prefix; emit only the inner
928    // content as TEXT to keep the CST byte-equal to the source.
929    let first_inner = if bq_depth > 0 {
930        strip_n_blockquote_markers(first_line, bq_depth)
931    } else {
932        first_line
933    };
934
935    // Detect a multi-line open tag.
936    // - `<div>` (Pandoc lift): we tokenize each line structurally so the
937    //   salsa anchor walk picks up `id` from the HTML_ATTRS region.
938    // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
939    //   `<section>`, `<header>`, …): same structural emission, exposing
940    //   `id` to the salsa anchor walk and enabling the body lift below.
941    // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
942    //   without this, the parser closes the block after line 0 and the
943    //   remainder of the open tag falls into following paragraphs;
944    //   pandoc-native treats the whole multi-line open tag as a single
945    //   `RawBlock`. Emission for void tags uses simple per-line
946    //   TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
947    //   from void tags).
948    let multiline_open_end = match (wrapper_kind, &block_type) {
949        (SyntaxKind::HTML_BLOCK_DIV, _) => {
950            find_multiline_open_end(lines, start_pos, first_inner, "div", bq_depth)
951        }
952        (
953            _,
954            HtmlBlockType::BlockTag {
955                tag_name,
956                closes_at_open_tag: true,
957                ..
958            },
959        ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, bq_depth),
960        (
961            _,
962            HtmlBlockType::BlockTag {
963                tag_name,
964                is_verbatim: false,
965                closed_by_blank_line: false,
966                depth_aware: true,
967                closes_at_open_tag: false,
968                is_closing: false,
969            },
970        ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
971            find_multiline_open_end(lines, start_pos, first_inner, tag_name, bq_depth)
972        }
973        _ => None,
974    };
975
976    // Set up depth-aware close tracking when the block type asks for it
977    // (Pandoc dialect, balanced same-name tag matching). A `None` means
978    // we fall back to the legacy "first matching close" path via
979    // `is_closing_marker`. Computed up front so the lift-mode gate
980    // below can decide whether the open line already balances the
981    // block (same-line `<div>...</div>`).
982    let depth_aware_tag: Option<String> = match &block_type {
983        HtmlBlockType::BlockTag {
984            tag_name,
985            closed_by_blank_line: false,
986            depth_aware: true,
987            ..
988        } => Some(tag_name.clone()),
989        _ => None,
990    };
991    let mut depth: i64 = 1;
992    if let Some(tag_name) = &depth_aware_tag {
993        // Sum opens/closes across all open-tag lines (single-line: just
994        // line 0; multi-line: lines 0..=end_line_idx).
995        let last_open_line = multiline_open_end.unwrap_or(start_pos);
996        let mut opens = 0usize;
997        let mut closes = 0usize;
998        for line in &lines[start_pos..=last_open_line] {
999            let inner = if bq_depth > 0 {
1000                strip_n_blockquote_markers(line, bq_depth)
1001            } else {
1002                line
1003            };
1004            let (o, c) = count_tag_balance(inner, tag_name);
1005            opens += o;
1006            closes += c;
1007        }
1008        depth = opens as i64 - closes as i64;
1009    }
1010
1011    // Same-line `<div>foo</div>` shape: the open line balances the
1012    // block under depth-aware tracking. We can lift this structurally
1013    // only when the open-tag trailing has exactly one `</div>` close,
1014    // zero `<div>` opens, and no non-whitespace content after the
1015    // close. Other same-line shapes (nested, trailing text, malformed)
1016    // fall through to the byte-reparse path.
1017    let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1018        && multiline_open_end.is_none()
1019        && depth_aware_tag.is_some()
1020        && depth <= 0;
1021    let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1022        let (line_without_newline, _) = strip_newline(first_inner);
1023        probe_same_line_lift(line_without_newline, "div")
1024    };
1025
1026    // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1027    // `<nav>`, …): the body parses as fresh markdown between RawBlock
1028    // emissions of the open/close tags. Covers the clean multi-line
1029    // shape (open tag stands alone on its line), open-trailing
1030    // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1031    // and same-line (`<form>foo</form>`). Multi-line open and
1032    // blockquote-wrapped non-div shapes still fall through to the
1033    // byte-walker path.
1034    let strict_block_tag_name: Option<&str> =
1035        if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1036            match &block_type {
1037                HtmlBlockType::BlockTag {
1038                    tag_name,
1039                    is_verbatim: false,
1040                    closed_by_blank_line: false,
1041                    depth_aware: true,
1042                    closes_at_open_tag: false,
1043                    is_closing: false,
1044                } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1045                _ => None,
1046            }
1047        } else {
1048            None
1049        };
1050    // Same-line `<form>foo</form>` shape: the open line already
1051    // balances the block (`depth <= 0`). Lift only when the trailing
1052    // bytes after the open `>` end with `</tag>` and contain exactly
1053    // one close + zero nested opens.
1054    let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1055        multiline_open_end.is_none() && depth <= 0 && {
1056            let (line_no_nl, _) = strip_newline(first_inner);
1057            probe_same_line_lift(line_no_nl, name)
1058        }
1059    });
1060    // Strict-block lift gate: accept (a) a multi-line open tag spanning
1061    // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1062    // trailing single-line open (depth > 0, open `>` is present with
1063    // quote-aware matching), or (c) a safe same-line shape. For
1064    // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1065    // …) the lift additionally abandons when the body starts at a
1066    // fresh-block position with a void block tag — pandoc-native pins
1067    // per-tag emission rather than a matched-pair lift in that case.
1068    let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1069        let (line_no_nl, _) = strip_newline(first_inner);
1070        let shape_ok = if multiline_open_end.is_some() {
1071            // `find_multiline_open_end` already verified the open tag
1072            // closes with a quote-aware `>` somewhere in lines
1073            // `start_pos+1..=end`. No same-line trailing content to
1074            // probe; defer trailing-on-close-`>`-line handling to a
1075            // future session (rare in practice).
1076            true
1077        } else if depth > 0 {
1078            probe_open_tag_line_has_close_gt(line_no_nl, name)
1079        } else {
1080            same_line_strict_lift_safe
1081        };
1082        if !shape_ok {
1083            return false;
1084        }
1085        if !is_pandoc_inline_block_tag_name(name) {
1086            return true;
1087        }
1088        !inline_block_void_interior_abandons(
1089            first_inner,
1090            lines,
1091            start_pos,
1092            multiline_open_end,
1093            bq_depth,
1094            name,
1095        )
1096    });
1097
1098    // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1099    // are byte-equal to the non-bq same-line shape minus the leading
1100    // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1101    // The body has no inner newlines, so no bq prefix re-injection is
1102    // needed when grafting — `emit_html_block_body_lifted` (passing
1103    // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1104    // open-trailing) still fall through to the projector's byte
1105    // walker — they need per-line prefix injection.
1106    let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1107        && multiline_open_end.is_none()
1108        && depth_aware_tag.is_some()
1109        && depth <= 0
1110    {
1111        let (line_no_nl, _) = strip_newline(first_inner);
1112        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1113            if probe_same_line_lift(line_no_nl, "div") {
1114                Some("div")
1115            } else {
1116                None
1117            }
1118        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1119            match &block_type {
1120                HtmlBlockType::BlockTag {
1121                    tag_name,
1122                    is_verbatim: false,
1123                    closed_by_blank_line: false,
1124                    depth_aware: true,
1125                    closes_at_open_tag: false,
1126                    is_closing: false,
1127                } if is_pandoc_lift_eligible_block_tag(tag_name)
1128                    && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1129                {
1130                    // Inline-block tags (`<video>`, `<iframe>`, …) skip
1131                    // the void-interior check at same-line — the shape
1132                    // has no inner block content to interfere with.
1133                    Some(tag_name.as_str())
1134                }
1135                _ => None,
1136            }
1137        } else {
1138            None
1139        }
1140    } else {
1141        None
1142    };
1143
1144    // Messy-shape lift inside a blockquote — covers open-trailing
1145    // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1146    // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1147    // including the multi-line-open variants (`> <div\n>   id="x">foo\n>
1148    // body\n> </div>`) where the trailing is captured into `pre_content`
1149    // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1150    // The open line does NOT balance the block (depth > 0 after the
1151    // open line, distinguishing this from `same_line_bq_lift_tag` which
1152    // requires depth <= 0). The close line — possibly with leading body
1153    // text — closes the block when depth returns to 0. Body lines (incl.
1154    // open trailing and close leading) graft via prefix re-injection.
1155    let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1156    {
1157        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1158            Some("div")
1159        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1160            match &block_type {
1161                HtmlBlockType::BlockTag {
1162                    tag_name,
1163                    is_verbatim: false,
1164                    closed_by_blank_line: false,
1165                    depth_aware: true,
1166                    closes_at_open_tag: false,
1167                    is_closing: false,
1168                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1169                    // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1170                    // …) abandon the lift when the body starts at a
1171                    // fresh-block position with a void block tag. Same gate
1172                    // as the non-bq matched-pair lift (`strict_block_lift`).
1173                    if is_pandoc_inline_block_tag_name(tag_name)
1174                        && inline_block_void_interior_abandons(
1175                            first_inner,
1176                            lines,
1177                            start_pos,
1178                            multiline_open_end,
1179                            bq_depth,
1180                            tag_name,
1181                        )
1182                    {
1183                        None
1184                    } else {
1185                        Some(tag_name.as_str())
1186                    }
1187                }
1188                _ => None,
1189            }
1190        } else {
1191            None
1192        }
1193    } else {
1194        None
1195    };
1196
1197    // Multi-line open + matched close-on-the-open's-last-line shape inside
1198    // a blockquote (`> <div\n>   id="x">foo</div>` and depth-aware variants:
1199    // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1200    // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1201    // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1202    // close is on the open's last line (`depth <= 0` after the open lines)
1203    // AND `multiline_open_end.is_some()`. The trailing bytes after the
1204    // last `>` get lifted into `pre_content` via
1205    // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1206    // new branch below splits `pre_content` at the matched close marker
1207    // and grafts body + close + any trailing siblings.
1208    let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1209        && multiline_open_end.is_some()
1210        && depth_aware_tag.is_some()
1211        && depth <= 0
1212    {
1213        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1214            Some("div")
1215        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1216            match &block_type {
1217                HtmlBlockType::BlockTag {
1218                    tag_name,
1219                    is_verbatim: false,
1220                    closed_by_blank_line: false,
1221                    depth_aware: true,
1222                    closes_at_open_tag: false,
1223                    is_closing: false,
1224                } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1225                    if is_pandoc_inline_block_tag_name(tag_name)
1226                        && inline_block_void_interior_abandons(
1227                            first_inner,
1228                            lines,
1229                            start_pos,
1230                            multiline_open_end,
1231                            bq_depth,
1232                            tag_name,
1233                        )
1234                    {
1235                        None
1236                    } else {
1237                        Some(tag_name.as_str())
1238                    }
1239                }
1240                _ => None,
1241            }
1242        } else {
1243            None
1244        }
1245    } else {
1246        None
1247    };
1248
1249    // Whether this block participates in the Phase 6 structural lift
1250    // (recursively parse body as Pandoc markdown and graft children).
1251    // Covers `<div>` outside blockquote context. For same-line shapes
1252    // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1253    // keep the legacy single-HTML_BLOCK_TAG shape and let the
1254    // byte-reparse path handle projection.
1255    let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1256        && bq_depth == 0
1257        && (!is_same_line_div || same_line_div_lift_safe))
1258        || strict_block_lift
1259        || same_line_bq_lift_tag.is_some()
1260        || bq_messy_lift_tag.is_some()
1261        || bq_multiline_close_lift_tag.is_some();
1262
1263    // Trailing content from the open tag (after `>`). When the lift is
1264    // active and the open line is `<div ATTRS>foo\n`, this captures
1265    // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1266    // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1267    // shapes (same-line / blockquote-wrapped).
1268    let mut pre_content = String::new();
1269
1270    // Emit opening line(s)
1271    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1272
1273    if let Some(end_line_idx) = multiline_open_end {
1274        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1275            emit_multiline_open_tag_with_attrs(
1276                builder,
1277                lines,
1278                start_pos,
1279                end_line_idx,
1280                "div",
1281                bq_depth,
1282                lift_mode,
1283                &mut pre_content,
1284            );
1285        } else if let Some(name) = strict_block_tag_name
1286            && strict_block_lift
1287        {
1288            emit_multiline_open_tag_with_attrs(
1289                builder,
1290                lines,
1291                start_pos,
1292                end_line_idx,
1293                name,
1294                bq_depth,
1295                lift_mode,
1296                &mut pre_content,
1297            );
1298        } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1299        {
1300            // Multi-line open of a lift-eligible strict-block tag inside a
1301            // blockquote (`> <section\n>   id=...>`). The non-bq
1302            // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1303            // covers the bq side so the open tag emits HTML_ATTRS regions
1304            // for `AttributeNode::cast` and the projector's canonicalizer.
1305            //
1306            // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1307            // call below: only push trailing bytes into `pre_content` when
1308            // the structural lift will consume them (bq messy lift). The
1309            // bq clean-lift requires `pre_content.is_empty()`, so for clean
1310            // multi-line opens the trailing is empty anyway and this is
1311            // a no-op.
1312            let lift_trailing =
1313                bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1314            emit_multiline_open_tag_with_attrs(
1315                builder,
1316                lines,
1317                start_pos,
1318                end_line_idx,
1319                name,
1320                bq_depth,
1321                lift_trailing,
1322                &mut pre_content,
1323            );
1324        } else {
1325            emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1326        }
1327    } else {
1328        let (line_without_newline, newline_str) = strip_newline(first_inner);
1329        if !line_without_newline.is_empty() {
1330            // For HTML_BLOCK_DIV, expose the open tag's attributes
1331            // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1332            // via the same descendants walk that handles fenced-div /
1333            // heading attrs. CST bytes stay byte-equal to source — we only
1334            // tokenize at finer granularity for matched div opens.
1335            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1336                let trailing =
1337                    emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1338                if !trailing.is_empty() {
1339                    pre_content.push_str(trailing);
1340                    pre_content.push_str(newline_str);
1341                }
1342            } else if let Some(name) = strict_block_tag_name
1343                && strict_block_lift
1344            {
1345                let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1346                if !trailing.is_empty() {
1347                    pre_content.push_str(trailing);
1348                    pre_content.push_str(newline_str);
1349                }
1350            } else if let Some(name) =
1351                bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1352            {
1353                // Inside a blockquote, lift trailing bytes into
1354                // `pre_content` when either the same-line bq gate fires
1355                // (`> <tag>body</tag>` — handled by `same_line_closed`)
1356                // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1357                // </tag>` and butted-close — handled at the close-marker
1358                // site below). For the clean-shape bq lift the open has
1359                // no trailing bytes regardless, so `lift_trailing=true`
1360                // is a no-op there.
1361                let lift_trailing =
1362                    same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1363                let trailing =
1364                    emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1365                if lift_trailing && !trailing.is_empty() {
1366                    pre_content.push_str(trailing);
1367                    pre_content.push_str(newline_str);
1368                }
1369            } else {
1370                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1371            }
1372        }
1373        // When the open tag has trailing content under lift mode, the
1374        // newline belongs to that trailing line (it terminates the
1375        // synthetic body line, not the open tag). Don't double-emit.
1376        if pre_content.is_empty() && !newline_str.is_empty() {
1377            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1378        }
1379    }
1380
1381    builder.finish_node(); // HtmlBlockTag
1382
1383    // Check if opening line also contains closing marker. Blank-line-terminated
1384    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1385    // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1386    // (`closes_at_open_tag: true`) close immediately — the block always
1387    // ends on the open-tag line since there is no closing tag to find.
1388    let void_block = matches!(
1389        &block_type,
1390        HtmlBlockType::BlockTag {
1391            closes_at_open_tag: true,
1392            ..
1393        }
1394    );
1395    // Void tags with a multi-line open close immediately after the open
1396    // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1397    // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1398    // a single RawBlock for the whole multi-line tag, with no following
1399    // content.
1400    if void_block && let Some(end_line_idx) = multiline_open_end {
1401        log::trace!(
1402            "HTML void block at line {} closes after multi-line open ending at line {}",
1403            start_pos + 1,
1404            end_line_idx + 1
1405        );
1406        builder.finish_node(); // HtmlBlock
1407        return end_line_idx + 1;
1408    }
1409    // Multi-line open with all matched closes on the open's last line:
1410    // `pre_content` holds the bytes after the last open `>` (lifted there
1411    // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1412    // When `depth <= 0` after the multi-line open and the trailing bytes
1413    // contain the depth-zero matched close, do the same-line lift on
1414    // `pre_content` directly. Mirrors the single-line `same_line_closed`
1415    // lift below — same body / close-marker / trailing-graft shape, just
1416    // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1417    //
1418    // The body bytes of `pre_content` come from the open's last line,
1419    // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1420    // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1421    // tag thus inherit the bq context without per-line prefix injection,
1422    // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1423    // both the non-bq and bq variants of this shape.
1424    if let Some(end_line_idx) = multiline_open_end
1425        && !blank_terminated
1426        && depth_aware_tag.is_some()
1427        && depth <= 0
1428        && lift_mode
1429        && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1430        && !pre_content.is_empty()
1431    {
1432        let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1433            Some("div")
1434        } else if strict_block_lift {
1435            strict_block_tag_name
1436        } else if let Some(name) = bq_multiline_close_lift_tag {
1437            Some(name)
1438        } else {
1439            None
1440        };
1441        if let Some(tag_name) = tag_name_opt {
1442            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1443            if let Some((leading, close_part)) =
1444                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1445            {
1446                let close_marker_end =
1447                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1448                let close_marker = &close_part[..close_marker_end];
1449                let same_line_trailing = &close_part[close_marker_end..];
1450                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1451                    LastParaDemote::SkipTrailingBlanks
1452                } else {
1453                    LastParaDemote::OnlyIfLast
1454                };
1455                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1456                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1457                if same_line_trailing.is_empty() {
1458                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1459                    close_line.push_str(close_marker);
1460                    close_line.push_str(post_nl);
1461                    emit_html_block_line(builder, &close_line, 0);
1462                    builder.finish_node();
1463                    builder.finish_node(); // HtmlBlock
1464                } else {
1465                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1466                    builder.finish_node(); // HTML_BLOCK_TAG
1467                    builder.finish_node(); // HtmlBlock
1468
1469                    let mut trailing_text =
1470                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1471                    trailing_text.push_str(same_line_trailing);
1472                    trailing_text.push_str(post_nl);
1473                    let mut inner_options = config.clone();
1474                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1475                    inner_options.refdef_labels = Some(refdefs.clone());
1476                    let inner_root = crate::parser::parse_with_refdefs(
1477                        &trailing_text,
1478                        Some(inner_options),
1479                        refdefs,
1480                    );
1481                    let mut bq = None;
1482                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1483                }
1484                return end_line_idx + 1;
1485            }
1486        }
1487    }
1488
1489    let same_line_closed = !blank_terminated
1490        && multiline_open_end.is_none()
1491        && (void_block
1492            || match &depth_aware_tag {
1493                Some(_) => depth <= 0,
1494                None => is_closing_marker(first_inner, &block_type),
1495            });
1496    if same_line_closed {
1497        log::trace!(
1498            "HTML block at line {} opens and closes on same line",
1499            start_pos + 1
1500        );
1501        // Same-line structural lift (div or non-div strict-block):
1502        // pre_content holds the bytes after the open `>` (including
1503        // the close `</tag>` and the trailing newline). Split into
1504        // body + close tag, emit body via recursive parse, emit close
1505        // tag as a sibling `HTML_BLOCK_TAG`.
1506        let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1507            None
1508        } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1509            Some("div")
1510        } else if same_line_strict_lift_safe {
1511            strict_block_tag_name
1512        } else if let Some(name) = same_line_bq_lift_tag {
1513            // Bq same-line: body has no inner newlines so the standard
1514            // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1515            // sufficient. The bq prefix `> ` lives on the outer
1516            // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1517            Some(name)
1518        } else {
1519            None
1520        };
1521        if let Some(tag_name) = same_line_lift_tag {
1522            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1523            // Depth-aware split: handles `<tag>foo</tag>bar` (single
1524            // close, trailing text), `<tag>foo</tag></tag>` (matched
1525            // close + unmatched trailing close → sibling RawBlock),
1526            // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1527            // recursive body parse).
1528            if let Some((leading, close_part)) =
1529                try_split_close_line_depth_aware(pre_no_nl, tag_name)
1530            {
1531                // `close_part` starts with `</tag` and contains the close
1532                // marker followed by any same-line trailing text. Split
1533                // off the close marker bytes (`</tag>`) so the close
1534                // `HTML_BLOCK_TAG` carries only those bytes; trailing
1535                // text is parsed and grafted as a sibling block at the
1536                // parent level (matches pandoc-native shape:
1537                // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1538                let close_marker_end =
1539                    split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1540                let close_marker = &close_part[..close_marker_end];
1541                let same_line_trailing = &close_part[close_marker_end..];
1542
1543                // Same-line is always close-butted; div demotes the
1544                // trailing Para→Plain via `SkipTrailingBlanks`.
1545                // Non-div strict-block uses `OnlyIfLast` (consistent
1546                // with butted-close — no trailing BLANK_LINE before
1547                // the close means the trailing Para demotes).
1548                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1549                    LastParaDemote::SkipTrailingBlanks
1550                } else {
1551                    LastParaDemote::OnlyIfLast
1552                };
1553                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1554                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1555                if same_line_trailing.is_empty() {
1556                    let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1557                    close_line.push_str(close_marker);
1558                    close_line.push_str(post_nl);
1559                    emit_html_block_line(builder, &close_line, 0);
1560                    builder.finish_node();
1561                    builder.finish_node(); // HtmlBlock
1562                } else {
1563                    // Close tag holds only the close-marker bytes;
1564                    // trailing + newline graft as siblings of the
1565                    // wrapper (matches pandoc's per-tag block split).
1566                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1567                    builder.finish_node(); // HTML_BLOCK_TAG
1568                    builder.finish_node(); // HtmlBlock
1569
1570                    let mut trailing_text =
1571                        String::with_capacity(same_line_trailing.len() + post_nl.len());
1572                    trailing_text.push_str(same_line_trailing);
1573                    trailing_text.push_str(post_nl);
1574                    let mut inner_options = config.clone();
1575                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1576                    inner_options.refdef_labels = Some(refdefs.clone());
1577                    let inner_root = crate::parser::parse_with_refdefs(
1578                        &trailing_text,
1579                        Some(inner_options),
1580                        refdefs,
1581                    );
1582                    let mut bq = None;
1583                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1584                }
1585                return start_pos + 1;
1586            }
1587        }
1588        builder.finish_node(); // HtmlBlock
1589        return start_pos + 1;
1590    }
1591
1592    let mut current_pos = multiline_open_end
1593        .map(|end| end + 1)
1594        .unwrap_or(start_pos + 1);
1595    let mut content_lines: Vec<&str> = Vec::new();
1596    let mut found_closing = false;
1597
1598    // Parse content until we find the closing marker
1599    while current_pos < lines.len() {
1600        let line = lines[current_pos];
1601        let (line_bq_depth, inner) = count_blockquote_markers(line);
1602
1603        // Only process lines at the same or deeper blockquote depth
1604        if line_bq_depth < bq_depth {
1605            break;
1606        }
1607
1608        // Blank-line-terminated blocks (types 6/7) end before the blank line.
1609        // The blank line itself is not part of the block.
1610        if blank_terminated && inner.trim().is_empty() {
1611            break;
1612        }
1613
1614        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1615        // count opens/closes of the same tag name and only close when depth
1616        // returns to 0; otherwise fall back to substring-match on the line.
1617        let line_closes = match &depth_aware_tag {
1618            Some(tag_name) => {
1619                let (opens, closes) = count_tag_balance(inner, tag_name);
1620                depth += opens as i64;
1621                depth -= closes as i64;
1622                depth <= 0
1623            }
1624            None => is_closing_marker(inner, &block_type),
1625        };
1626
1627        if line_closes {
1628            log::trace!("Found HTML block closing at line {}", current_pos + 1);
1629            found_closing = true;
1630
1631            // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1632            // the open and close tags stand alone on their source lines
1633            // (no trailing on open, no body content on close after
1634            // stripping bq markers), lift the body lines structurally
1635            // so the projector walks CST children instead of
1636            // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1637            //
1638            // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1639            // grafted, Para preserved), non-div strict-block tags
1640            // (`<form>`, `<section>`, …) and inline-block matched-pair
1641            // tags (`<video>`, `<iframe>`, …) — the latter two under
1642            // HTML_BLOCK with the structural lift hitting pandoc's
1643            // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1644            // demotion. Inline-block additionally bails if the body
1645            // starts at a fresh-block position with a void block tag
1646            // (mirrors the non-bq matched-pair gate).
1647            //
1648            // Other bq-wrapped shapes (butted-close / open-trailing /
1649            // same-line) still fall through to the opaque path.
1650            // Multi-line opens are allowed here as of 2026-05-12: the
1651            // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1652            // per attr line and per-line bq prefix tokens) by the bq-aware
1653            // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1654            // empty for multi-line opens (the emitter writes any trailing
1655            // bytes on the last open line directly as TEXT inside
1656            // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1657            // trailing falls through to the opaque path, matching the non-
1658            // bq deferral.
1659            let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1660                if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1661                    Some("div")
1662                } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1663                    match &block_type {
1664                        HtmlBlockType::BlockTag {
1665                            tag_name,
1666                            is_verbatim: false,
1667                            closed_by_blank_line: false,
1668                            depth_aware: true,
1669                            closes_at_open_tag: false,
1670                            is_closing: false,
1671                        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1672                        _ => None,
1673                    }
1674                } else {
1675                    None
1676                }
1677            } else {
1678                None
1679            };
1680
1681            let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1682                // Open-shape: last open line must end with `>` (clean
1683                // close-of-open). For single-line, that's `first_inner`
1684                // (already bq-stripped); for multi-line, strip bq markers
1685                // from `lines[end_line_idx]` and check the same.
1686                let last_open_line: &str = match multiline_open_end {
1687                    None => first_inner,
1688                    Some(end) if bq_depth > 0 => strip_n_blockquote_markers(lines[end], bq_depth),
1689                    Some(end) => lines[end],
1690                };
1691                let (open_no_nl, _) = strip_newline(last_open_line);
1692                if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1693                    return false;
1694                }
1695                let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1696                let (close_no_nl, _) = strip_newline(close_stripped);
1697                if !close_no_nl
1698                    .trim_start_matches([' ', '\t'])
1699                    .starts_with("</")
1700                {
1701                    return false;
1702                }
1703                if is_pandoc_inline_block_tag_name(tag_name)
1704                    && inline_block_void_interior_abandons(
1705                        first_inner,
1706                        lines,
1707                        start_pos,
1708                        multiline_open_end,
1709                        bq_depth,
1710                        tag_name,
1711                    )
1712                {
1713                    return false;
1714                }
1715                true
1716            });
1717
1718            if bq_clean_lift {
1719                let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1720                    LastParaDemote::Never
1721                } else {
1722                    LastParaDemote::OnlyIfLast
1723                };
1724                emit_html_block_body_lifted_bq(
1725                    builder,
1726                    &content_lines,
1727                    bq_depth,
1728                    demote_policy,
1729                    config,
1730                );
1731                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1732                emit_html_block_line(builder, line, bq_depth);
1733                builder.finish_node();
1734                current_pos += 1;
1735                break;
1736            }
1737
1738            // Bq messy-shape lift — single-line open with trailing or
1739            // butted-close (or both). `pre_content` already captures any
1740            // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1741            // strip the close line's bq markers before splitting so
1742            // `leading` and `close_part` are bq-prefix-free. Body parses
1743            // recursively from `pre_content + stripped(content_lines) +
1744            // leading`, with per-line bq prefixes re-injected so the CST
1745            // stays byte-equal to the source. Demote: div is keyed on
1746            // close-butted-ness (Plain when leading non-empty, Para
1747            // otherwise); non-div uses OnlyIfLast either way.
1748            if let Some(tag_name) = bq_messy_lift_tag {
1749                let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1750                let close_prefix_len = line.len() - close_stripped.len();
1751                let close_prefix = &line[..close_prefix_len];
1752                if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1753                {
1754                    let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1755                        if leading.is_empty() {
1756                            LastParaDemote::Never
1757                        } else {
1758                            LastParaDemote::SkipTrailingBlanks
1759                        }
1760                    } else {
1761                        LastParaDemote::OnlyIfLast
1762                    };
1763                    emit_html_block_body_lifted_bq_messy(
1764                        builder,
1765                        &pre_content,
1766                        &content_lines,
1767                        leading,
1768                        close_prefix,
1769                        bq_depth,
1770                        policy,
1771                        config,
1772                    );
1773                    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1774                    // When `leading` is empty, no recursive-parse output carries
1775                    // the close line's bq prefix, so emit it here before the
1776                    // close tag. When `leading` is non-empty,
1777                    // `emit_html_block_body_lifted_bq_messy` already injected
1778                    // the prefix at the start of the leading bytes (via the
1779                    // BqPrefixState entry); emitting again would double the
1780                    // prefix bytes and break losslessness.
1781                    if leading.is_empty() {
1782                        emit_bq_prefix_tokens(builder, close_prefix);
1783                    }
1784                    emit_html_block_line(builder, close_part, 0);
1785                    builder.finish_node();
1786                    current_pos += 1;
1787                    break;
1788                }
1789            }
1790
1791            // Under lift mode, try to split the close line into a
1792            // leading "body content" prefix and the close-marker
1793            // remainder using depth-aware matching. Walks at depth 1
1794            // (we're inside the open tag) so nested same-tag opens
1795            // (e.g. `<inner></inner></tag>` style with a nested div)
1796            // are absorbed into the body and parsed recursively, and
1797            // multi-close shapes (`foo</div></div>` on the close line)
1798            // peel off the matched-pair close — the unmatched
1799            // trailing close projects as a sibling `RawBlock` per
1800            // pandoc-native. For `<div>`, non-empty `leading`
1801            // propagates pandoc's `markdown_in_html_blocks` Plain
1802            // demotion rule. For non-div strict-block tags, demotion
1803            // follows pandoc's `OnlyIfLast` rule (demote the trailing
1804            // Para only when no blank line precedes the close).
1805            let close_split_tag = if lift_mode {
1806                if strict_block_lift {
1807                    strict_block_tag_name
1808                } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1809                    Some("div")
1810                } else {
1811                    None
1812                }
1813            } else {
1814                None
1815            };
1816            let (close_no_nl, close_post_nl) = strip_newline(line);
1817            let close_split = close_split_tag
1818                .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1819
1820            if let Some((leading, close_part)) = close_split {
1821                // Close-line leading that is whitespace-only is close-tag
1822                // indentation, not body content (pandoc-native strips it
1823                // from the close RawBlock and treats the close as butted —
1824                // see `   </tag>` shapes). Route those bytes into the
1825                // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1826                // projector strips them; keep the demote policy keyed on
1827                // the original leading so butted-close detection (Plain
1828                // demotion for div, OnlyIfLast for non-div) still fires.
1829                let leading_is_ws_only =
1830                    !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1831                let body_leading = if leading_is_ws_only { "" } else { leading };
1832                let policy = if strict_block_lift {
1833                    LastParaDemote::OnlyIfLast
1834                } else if !leading.is_empty() {
1835                    LastParaDemote::SkipTrailingBlanks
1836                } else {
1837                    LastParaDemote::Never
1838                };
1839                // Split close_part into close-marker bytes (`</tag>`)
1840                // and trailing bytes (e.g. an extra `</div>` for the
1841                // double-close case, or `bar` for trailing text after
1842                // a normal close). Trailing bytes are recursively
1843                // parsed and grafted as siblings of the HTML_BLOCK_DIV
1844                // wrapper.
1845                let close_tag_name = close_split_tag.expect("close_split_tag present");
1846                let close_marker_end =
1847                    split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1848                let close_marker = &close_part[..close_marker_end];
1849                let close_trailing = &close_part[close_marker_end..];
1850
1851                emit_html_block_body_lifted(
1852                    builder,
1853                    &pre_content,
1854                    &content_lines,
1855                    body_leading,
1856                    policy,
1857                    config,
1858                );
1859                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1860                if leading_is_ws_only {
1861                    builder.token(SyntaxKind::WHITESPACE.into(), leading);
1862                }
1863                if close_trailing.is_empty() {
1864                    let mut close_line =
1865                        String::with_capacity(close_marker.len() + close_post_nl.len());
1866                    close_line.push_str(close_marker);
1867                    close_line.push_str(close_post_nl);
1868                    emit_html_block_line(builder, &close_line, 0);
1869                    builder.finish_node();
1870                } else {
1871                    // Close tag holds only the close-marker bytes;
1872                    // trailing + newline graft as siblings.
1873                    builder.token(SyntaxKind::TEXT.into(), close_marker);
1874                    builder.finish_node(); // HTML_BLOCK_TAG
1875                    builder.finish_node(); // HtmlBlock
1876
1877                    let mut trailing_text =
1878                        String::with_capacity(close_trailing.len() + close_post_nl.len());
1879                    trailing_text.push_str(close_trailing);
1880                    trailing_text.push_str(close_post_nl);
1881                    let mut inner_options = config.clone();
1882                    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1883                    inner_options.refdef_labels = Some(refdefs.clone());
1884                    let inner_root = crate::parser::parse_with_refdefs(
1885                        &trailing_text,
1886                        Some(inner_options),
1887                        refdefs,
1888                    );
1889                    let mut bq = None;
1890                    graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1891                    current_pos += 1;
1892                    return current_pos;
1893                }
1894            } else {
1895                emit_html_block_body(
1896                    builder,
1897                    &pre_content,
1898                    &content_lines,
1899                    bq_depth,
1900                    wrapper_kind,
1901                    lift_mode,
1902                    config,
1903                );
1904                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1905                emit_html_block_line(builder, line, bq_depth);
1906                builder.finish_node();
1907            }
1908
1909            current_pos += 1;
1910            break;
1911        }
1912
1913        // Regular content line
1914        content_lines.push(line);
1915        current_pos += 1;
1916    }
1917
1918    // If we didn't find a closing marker, emit what we collected
1919    if !found_closing {
1920        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1921        emit_html_block_body(
1922            builder,
1923            &pre_content,
1924            &content_lines,
1925            bq_depth,
1926            wrapper_kind,
1927            lift_mode,
1928            config,
1929        );
1930    }
1931
1932    builder.finish_node(); // HtmlBlock
1933    current_pos
1934}
1935
1936/// Emit the collected inner content lines for an HTML block.
1937///
1938/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1939/// line `<div>` open outside blockquote), recursively parse the inner
1940/// content (including any open-tag trailing) as Pandoc-flavored
1941/// markdown and graft the resulting top-level blocks as direct children
1942/// of the wrapper. This is the Phase 6 structural lift — the projector
1943/// and downstream consumers (linter, salsa, LSP) can walk the
1944/// structural children instead of re-tokenizing the body bytes.
1945///
1946/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1947/// blockquote, multi-line open, or no content at all — fall through to
1948/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1949///
1950/// CST bytes remain byte-identical to source: the recursive parser is
1951/// lossless on the same byte slice the legacy path would have captured
1952/// as TEXT.
1953fn emit_html_block_body(
1954    builder: &mut GreenNodeBuilder<'static>,
1955    pre_content: &str,
1956    content_lines: &[&str],
1957    bq_depth: usize,
1958    wrapper_kind: SyntaxKind,
1959    lift_mode: bool,
1960    config: &ParserOptions,
1961) {
1962    if pre_content.is_empty() && content_lines.is_empty() {
1963        return;
1964    }
1965    if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1966        // Reached when the parser walked to end-of-input without finding
1967        // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1968        emit_html_block_body_lifted(
1969            builder,
1970            pre_content,
1971            content_lines,
1972            "",
1973            LastParaDemote::Never,
1974            config,
1975        );
1976        return;
1977    }
1978    // Legacy path: opaque TEXT capture. `pre_content` is always empty
1979    // here (lift_mode is the only path that populates it), but be
1980    // defensive — if a trailing prefix snuck in, emit it as TEXT so
1981    // bytes are preserved.
1982    builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1983    if !pre_content.is_empty() {
1984        builder.token(SyntaxKind::TEXT.into(), pre_content);
1985    }
1986    for content_line in content_lines {
1987        emit_html_block_line(builder, content_line, bq_depth);
1988    }
1989    builder.finish_node();
1990}
1991
/// Policy deciding whether the final `PARAGRAPH` of a lifted HTML-block
/// body is retagged as `PLAIN` while its children are grafted into the
/// structural CST.
#[derive(Clone, Copy, Debug)]
enum LastParaDemote {
    /// Leave every paragraph alone; the trailing `Para` is preserved as
    /// pandoc would keep it.
    Never,
    /// Retag the last `PARAGRAPH` child, looking past any `BLANK_LINE`
    /// children that follow it. Applies to `<div>` shapes whose close
    /// tag is butted against the paragraph text on the same source
    /// line (pandoc's `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Retag only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` sits between it and the close
    /// tag. Applies to non-div strict-block tags, whose body is emitted
    /// at top level next to the close-tag `RawBlock`; pandoc demotes
    /// the trailing `Para` to `Plain` there unless a blank line
    /// separates the two.
    OnlyIfLast,
}
2011
2012/// Lift the HTML-block body into structural CST children: build the
2013/// inner text from `pre_content` + `content_lines` + `post_content`
2014/// (in order), recursively parse it as Pandoc-flavored markdown, and
2015/// graft the resulting top-level blocks into `builder`. `demote_policy`
2016/// controls whether the trailing paragraph is retagged as `PLAIN` to
2017/// encode pandoc's Plain/Para adjacency rules structurally.
2018fn emit_html_block_body_lifted(
2019    builder: &mut GreenNodeBuilder<'static>,
2020    pre_content: &str,
2021    content_lines: &[&str],
2022    post_content: &str,
2023    demote_policy: LastParaDemote,
2024    config: &ParserOptions,
2025) {
2026    emit_html_block_body_lifted_inner(
2027        builder,
2028        pre_content,
2029        content_lines,
2030        post_content,
2031        demote_policy,
2032        config,
2033        &mut None,
2034    )
2035}
2036
2037/// Body-lift variant for `<div>` inside a blockquote. Strips
2038/// `bq_depth` levels of blockquote markers from each `content_line`,
2039/// captures the per-line prefix bytes, and grafts the recursive parse
2040/// with prefix injection so the output CST stays byte-equal to the
2041/// source. `pre_content` and `post_content` must be empty (the bq
2042/// clean lift only handles the shape where the open and close tags
2043/// stand alone on their source lines).
2044fn emit_html_block_body_lifted_bq(
2045    builder: &mut GreenNodeBuilder<'static>,
2046    content_lines: &[&str],
2047    bq_depth: usize,
2048    demote_policy: LastParaDemote,
2049    config: &ParserOptions,
2050) {
2051    let mut prefixes: Vec<String> = Vec::with_capacity(content_lines.len());
2052    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2053    for cl in content_lines {
2054        let stripped = strip_n_blockquote_markers(cl, bq_depth);
2055        let prefix_len = cl.len() - stripped.len();
2056        prefixes.push(cl[..prefix_len].to_string());
2057        stripped_lines.push(stripped);
2058    }
2059    let mut bq = Some(BqPrefixState {
2060        prefixes,
2061        line_idx: 0,
2062        at_line_start: true,
2063    });
2064    emit_html_block_body_lifted_inner(
2065        builder,
2066        "",
2067        &stripped_lines,
2068        "",
2069        demote_policy,
2070        config,
2071        &mut bq,
2072    )
2073}
2074
2075/// Body-lift variant for the bq messy-shape lift — open-trailing,
2076/// butted-close, or both. The open-trailing bytes (if any) sit in
2077/// `pre_content` (line 0 of the body — no bq prefix in source because
2078/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2079/// each carry their own bq prefix. The close line's `leading` (body
2080/// bytes before `</tag>`) sits on the close line, prefixed in source
2081/// by `close_line_prefix` (the bq prefix captured from `line`).
2082///
2083/// Builds `prefixes` so each emitted line in the recursive parse
2084/// output gets the right per-line bq prefix re-injected at line start:
2085/// `pre_content` → empty prefix (no source `> ` precedes it); each
2086/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2087/// Result CST stays byte-equal to source.
2088#[allow(clippy::too_many_arguments)]
2089fn emit_html_block_body_lifted_bq_messy(
2090    builder: &mut GreenNodeBuilder<'static>,
2091    pre_content: &str,
2092    content_lines: &[&str],
2093    leading: &str,
2094    close_line_prefix: &str,
2095    bq_depth: usize,
2096    demote_policy: LastParaDemote,
2097    config: &ParserOptions,
2098) {
2099    let mut prefixes: Vec<String> = Vec::new();
2100    if !pre_content.is_empty() {
2101        prefixes.push(String::new());
2102    }
2103    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2104    for cl in content_lines {
2105        let stripped = strip_n_blockquote_markers(cl, bq_depth);
2106        let prefix_len = cl.len() - stripped.len();
2107        prefixes.push(cl[..prefix_len].to_string());
2108        stripped_lines.push(stripped);
2109    }
2110    if !leading.is_empty() {
2111        prefixes.push(close_line_prefix.to_string());
2112    }
2113    let mut bq = Some(BqPrefixState {
2114        prefixes,
2115        line_idx: 0,
2116        at_line_start: true,
2117    });
2118    emit_html_block_body_lifted_inner(
2119        builder,
2120        pre_content,
2121        &stripped_lines,
2122        leading,
2123        demote_policy,
2124        config,
2125        &mut bq,
2126    )
2127}
2128
2129fn emit_html_block_body_lifted_inner(
2130    builder: &mut GreenNodeBuilder<'static>,
2131    pre_content: &str,
2132    content_lines: &[&str],
2133    post_content: &str,
2134    demote_policy: LastParaDemote,
2135    config: &ParserOptions,
2136    bq: &mut Option<BqPrefixState>,
2137) {
2138    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2139        return;
2140    }
2141    let mut inner_text = String::with_capacity(
2142        pre_content.len()
2143            + content_lines.iter().map(|s| s.len()).sum::<usize>()
2144            + post_content.len(),
2145    );
2146    inner_text.push_str(pre_content);
2147    for line in content_lines {
2148        inner_text.push_str(line);
2149    }
2150    inner_text.push_str(post_content);
2151
2152    let mut inner_options = config.clone();
2153    let refdefs = config.refdef_labels.clone().unwrap_or_default();
2154    inner_options.refdef_labels = Some(refdefs.clone());
2155    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2156    graft_document_children(builder, &inner_root, demote_policy, bq);
2157}
2158
/// Bookkeeping for re-injecting blockquote prefixes while grafting a
/// lifted body that originally lived inside a `> …` blockquote.
///
/// The recursive parse only ever sees bq-stripped text, so the graft
/// helpers must put the stripped prefix bytes (`BLOCK_QUOTE_MARKER` +
/// `WHITESPACE` tokens) back at the start of every source line for the
/// CST to stay byte-equal to the source.
struct BqPrefixState {
    /// Literal prefix bytes for each source line of the body, indexed
    /// by line (e.g. `"> "`, `">  "`, or `">"`).
    prefixes: Vec<String>,
    /// Index into `prefixes` of the next prefix to emit.
    line_idx: usize,
    /// `true` when the next emitted token starts a line and must be
    /// preceded by its prefix; set again after every line-ending token.
    at_line_start: bool,
}
2174
2175/// Walk a parsed inner document's top-level children and re-emit them
2176/// into `builder`. The document's wrapper node is skipped — only its
2177/// children are grafted.
2178///
2179/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2180/// as `PLAIN` — see [`LastParaDemote`].
2181///
2182/// `bq` is `Some` when grafting a body that lived inside a blockquote
2183/// — token emission then injects `BLOCK_QUOTE_MARKER + WHITESPACE`
2184/// prefix tokens at line starts. See [`BqPrefixState`].
2185fn graft_document_children(
2186    builder: &mut GreenNodeBuilder<'static>,
2187    doc: &SyntaxNode,
2188    demote_policy: LastParaDemote,
2189    bq: &mut Option<BqPrefixState>,
2190) {
2191    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2192
2193    let mut demote_idx: Option<usize> = None;
2194    match demote_policy {
2195        LastParaDemote::Never => {}
2196        LastParaDemote::SkipTrailingBlanks => {
2197            for (i, c) in children.iter().enumerate().rev() {
2198                if let rowan::NodeOrToken::Node(n) = c {
2199                    if n.kind() == SyntaxKind::BLANK_LINE {
2200                        continue;
2201                    }
2202                    if n.kind() == SyntaxKind::PARAGRAPH {
2203                        demote_idx = Some(i);
2204                    }
2205                    break;
2206                }
2207            }
2208        }
2209        LastParaDemote::OnlyIfLast => {
2210            for (i, c) in children.iter().enumerate().rev() {
2211                if let rowan::NodeOrToken::Node(n) = c {
2212                    if n.kind() == SyntaxKind::PARAGRAPH {
2213                        demote_idx = Some(i);
2214                    }
2215                    break;
2216                }
2217            }
2218        }
2219    }
2220
2221    for (i, child) in children.into_iter().enumerate() {
2222        match child {
2223            rowan::NodeOrToken::Node(n) => {
2224                if Some(i) == demote_idx {
2225                    graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2226                } else {
2227                    graft_subtree(builder, &n, bq);
2228                }
2229            }
2230            rowan::NodeOrToken::Token(t) => {
2231                emit_grafted_token(builder, t.kind(), t.text(), bq);
2232            }
2233        }
2234    }
2235}
2236
2237/// Recursively re-emit `node` and its descendants into `builder`.
2238/// Token text is copied verbatim so the result is byte-identical to
2239/// the input span (modulo bq prefix tokens injected at line starts
2240/// when `bq` is `Some`).
2241fn graft_subtree(
2242    builder: &mut GreenNodeBuilder<'static>,
2243    node: &SyntaxNode,
2244    bq: &mut Option<BqPrefixState>,
2245) {
2246    graft_subtree_as(builder, node, node.kind(), bq);
2247}
2248
2249/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2250/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2251/// the close-butted demotion rule.
2252fn graft_subtree_as(
2253    builder: &mut GreenNodeBuilder<'static>,
2254    node: &SyntaxNode,
2255    kind: SyntaxKind,
2256    bq: &mut Option<BqPrefixState>,
2257) {
2258    builder.start_node(kind.into());
2259    for child in node.children_with_tokens() {
2260        match child {
2261            rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2262            rowan::NodeOrToken::Token(t) => {
2263                emit_grafted_token(builder, t.kind(), t.text(), bq);
2264            }
2265        }
2266    }
2267    builder.finish_node();
2268}
2269
2270/// Emit a single token while optionally injecting blockquote prefix
2271/// tokens at line starts. When `bq` is `None`, this is a plain
2272/// `builder.token()` passthrough.
2273fn emit_grafted_token(
2274    builder: &mut GreenNodeBuilder<'static>,
2275    kind: SyntaxKind,
2276    text: &str,
2277    bq: &mut Option<BqPrefixState>,
2278) {
2279    if let Some(state) = bq.as_mut() {
2280        if state.at_line_start {
2281            if let Some(prefix) = state.prefixes.get(state.line_idx) {
2282                emit_bq_prefix_tokens(builder, prefix);
2283            }
2284            state.at_line_start = false;
2285        }
2286        builder.token(kind.into(), text);
2287        // `BLANK_LINE` token represents an entirely blank source line —
2288        // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2289        // token as line-ending so the per-line prefix index advances
2290        // correctly.
2291        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2292            state.line_idx += 1;
2293            state.at_line_start = true;
2294        }
2295    } else {
2296        builder.token(kind.into(), text);
2297    }
2298}
2299
2300/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2301/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2302fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2303    for ch in prefix.chars() {
2304        if ch == '>' {
2305            builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2306        } else {
2307            let mut buf = [0u8; 4];
2308            builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2309        }
2310    }
2311}
2312
/// Find the byte index, within `line`, of the `>` that closes the open
/// tag `<tag_name ATTRS>` at the start of the line.
///
/// Leading spaces and tabs are skipped, the tag name is matched ASCII
/// case-insensitively, and a `>` inside a single- or double-quoted
/// attribute value is ignored. Returns `None` when the line does not
/// begin with `<tag_name` followed by at least one more byte, or when
/// no unquoted `>` follows. This is the position-returning counterpart
/// of `probe_open_tag_line_has_close_gt`, letting the caller slice off
/// whatever trails the open tag.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line
        .bytes()
        .take_while(|b| matches!(b, b' ' | b'\t'))
        .count();
    let unindented = &line.as_bytes()[indent_len..];
    let name_end = tag_name.len() + 1;

    // Need `<`, the full tag name, and at least one byte after it.
    if unindented.len() <= name_end {
        return None;
    }
    let opens_with_tag = unindented[0] == b'<'
        && unindented[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !opens_with_tag {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in unindented[name_end..].iter().enumerate() {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
2348
2349/// Whether `slice` begins (after leading ASCII whitespace) with an
2350/// open tag whose name is a Pandoc void block tag (`<source>`,
2351/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2352/// open tags return false.
2353///
2354/// Used by the inline-block matched-pair lift gate: pandoc-native
2355/// abandons the lift when the body's first non-blank content is a
2356/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2357/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2358/// matched-pair lift).
2359fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2360    let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2361    if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2362        return false;
2363    }
2364    let Some(tag_end) = parse_open_tag(trimmed) else {
2365        return false;
2366    };
2367    let bytes = trimmed.as_bytes();
2368    let mut name_end = 1usize;
2369    while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2370    {
2371        name_end += 1;
2372    }
2373    if name_end == 1 {
2374        return false;
2375    }
2376    is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2377}
2378
2379/// Whether the body of an inline-block matched-pair (`<video>...`,
2380/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2381/// a void block tag — the condition under which pandoc-native abandons
2382/// the matched-pair lift. Probes three shapes:
2383///
2384/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2385///   after the open `>` on `first_inner` start with `<source`.
2386/// - **Single-line open + multi-line body**: open-trailing on the open
2387///   line is empty/whitespace AND the first non-blank body line
2388///   (`lines[start_pos+1..]`) starts with a void tag.
2389/// - **Multi-line open**: same body-line scan starting at
2390///   `lines[multiline_open_end+1..]`.
2391///
2392/// Returns `false` when the body begins with text, with a close tag,
2393/// or with a non-void block tag — those cases all proceed with the
2394/// matched-pair lift.
2395fn inline_block_void_interior_abandons(
2396    first_inner: &str,
2397    lines: &[&str],
2398    start_pos: usize,
2399    multiline_open_end: Option<usize>,
2400    bq_depth: usize,
2401    tag_name: &str,
2402) -> bool {
2403    let (line_no_nl, _) = strip_newline(first_inner);
2404    let (body_start_line_idx, open_trailing) = match multiline_open_end {
2405        Some(end) => (end + 1, ""),
2406        None => {
2407            let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2408            let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2409            (start_pos + 1, trailing)
2410        }
2411    };
2412    let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2413    if !trimmed.is_empty() {
2414        return slice_starts_with_void_block_tag(trimmed);
2415    }
2416    for line in &lines[body_start_line_idx..] {
2417        let inner = if bq_depth > 0 {
2418            strip_n_blockquote_markers(line, bq_depth)
2419        } else {
2420            line
2421        };
2422        let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2423        if trimmed.is_empty() {
2424            continue;
2425        }
2426        return slice_starts_with_void_block_tag(trimmed);
2427    }
2428    false
2429}
2430
/// Check that `line` starts (after leading spaces/tabs) with
/// `<tag_name` — matched ASCII case-insensitively and followed by at
/// least one more byte — and that an unquoted `>` closes the tag
/// somewhere later on the line. A `>` inside a single- or double-quoted
/// attribute value does not count.
///
/// Content after the `>` is allowed (the open-trailing shape
/// `<form>foo`); the caller is expected to capture that trailing text
/// into the structural lift's `pre_content`.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let candidate = line.trim_start_matches([' ', '\t']).as_bytes();
    let name_end = 1 + tag_name.len();

    let starts_with_tag = candidate.len() > name_end
        && candidate[0] == b'<'
        && candidate[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !starts_with_tag {
        return false;
    }

    let mut in_quote: Option<u8> = None;
    candidate[name_end..].iter().any(|&byte| {
        match in_quote {
            Some(q) if byte == q => in_quote = None,
            Some(_) => {}
            None if byte == b'"' || byte == b'\'' => in_quote = Some(byte),
            None => return byte == b'>',
        }
        false
    })
}
2465
2466/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2467/// be lifted structurally. Returns `true` only when:
2468/// - The line starts with `<tag_name` (modulo leading whitespace).
2469/// - The open tag's `>` exists with proper quote handling.
2470/// - The bytes after the open `>` contain a depth-zero matched
2471///   `</tag_name>` close (depth-aware: nested `<tag>` opens
2472///   increment depth; matching is case-insensitive, quote-aware).
2473///
2474/// Trailing bytes after the matched close are accepted and grafted
2475/// as a sibling block by the caller. Examples:
2476/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2477/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2478///   recursively parses to a `RawBlock`).
2479/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2480///   parsed recursively), trailing=`bar`.
2481fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2482    let bytes = line.as_bytes();
2483    let indent_end = bytes
2484        .iter()
2485        .position(|&b| b != b' ' && b != b'\t')
2486        .unwrap_or(bytes.len());
2487    let rest = &line[indent_end..];
2488    let rest_bytes = rest.as_bytes();
2489    let prefix_len = 1 + tag_name.len();
2490    if rest_bytes.len() < prefix_len
2491        || rest_bytes[0] != b'<'
2492        || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2493    {
2494        return false;
2495    }
2496    let after_name = &rest[prefix_len..];
2497    let after_name_bytes = after_name.as_bytes();
2498    let mut i = 0usize;
2499    let mut quote: Option<u8> = None;
2500    let mut gt_idx: Option<usize> = None;
2501    while i < after_name_bytes.len() {
2502        match (quote, after_name_bytes[i]) {
2503            (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2504            (Some(q), b2) if b2 == q => quote = None,
2505            (None, b'>') => {
2506                gt_idx = Some(i);
2507                break;
2508            }
2509            _ => {}
2510        }
2511        i += 1;
2512    }
2513    let Some(gt_idx) = gt_idx else {
2514        return false;
2515    };
2516    let trailing = &after_name[gt_idx + 1..];
2517    // Depth-aware: walk `trailing` (we begin inside the open tag at
2518    // depth 1). Return true iff a matched `</tag>` exists where depth
2519    // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2520    matched_close_offset(trailing, tag_name).is_some()
2521}
2522
/// Search `trailing` — the bytes following an open `<tag ...>`'s `>` —
/// for the `</tag>` close that balances that open tag.
///
/// Nesting depth starts at 1. Every further `<tag` open raises it
/// unless the tag is self-closing (`<tag/>`); every `</tag` close
/// lowers it. Names are compared ASCII case-insensitively and must be
/// followed by whitespace, `>`, `/`, or end of input, so `<tagfoo>` is
/// not counted. The `>` ending each tag bracket is located with
/// single/double quote awareness.
///
/// Returns `Some((close_start, close_end))` for the close that brings
/// the depth to zero, where `close_start` is the offset of its `<` and
/// `close_end` is one past its `>`. Returns `None` when the depth never
/// reaches zero or when a `<...` bracket has no terminating `>`.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let bytes = trailing.as_bytes();
    let name = tag_name.as_bytes();

    let mut depth: i32 = 1;
    let mut cursor = 0usize;

    while let Some(rel) = bytes[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let is_close = bytes.get(lt + 1) == Some(&b'/');
        let name_start = lt + 1 + usize::from(is_close);
        let name_end = name_start + name.len();

        let names_tag = matches!(
            bytes.get(name_start..name_end),
            Some(candidate) if candidate.eq_ignore_ascii_case(name)
        );
        let at_boundary = names_tag
            && matches!(
                bytes.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find this bracket's `>`, skipping quoted attribute values.
        let scan_from = if names_tag { name_end } else { lt + 1 };
        let mut quote: Option<u8> = None;
        let bracket_gt = bytes[scan_from..]
            .iter()
            .position(|&b| match quote {
                Some(q) => {
                    if b == q {
                        quote = None;
                    }
                    false
                }
                None if b == b'"' || b == b'\'' => {
                    quote = Some(b);
                    false
                }
                None => b == b'>',
            })
            .map(|off| scan_from + off);

        // An unterminated `<...` ends the search without a match.
        let gt = bracket_gt?;
        let self_close = gt > lt + 1 && bytes[gt - 1] == b'/';

        if names_tag && at_boundary {
            if is_close {
                depth -= 1;
                if depth == 0 {
                    return Some((lt, gt + 1));
                }
            } else if !self_close {
                depth += 1;
            }
        }

        cursor = gt + 1;
    }
    None
}
2605
/// Given `close_part` beginning with `</tag_name` (name matched ASCII
/// case-insensitively), return the offset one past the first `>` that
/// is not inside a quoted span, so the caller can separate the close
/// marker (`</tag>`) from any text trailing it on the same line.
///
/// Returns `None` when `close_part` does not start with `</tag_name`
/// or when no unquoted `>` follows; the caller then treats the whole
/// slice as the close marker with no trailing text.
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let bytes = close_part.as_bytes();
    let marker_len = tag_name.len() + 2;

    let head = bytes.get(..marker_len)?;
    if !head.starts_with(b"</") || !head[2..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    // Quote-aware scan for `>` starting right after `</tag`.
    let mut quote: Option<u8> = None;
    bytes[marker_len..]
        .iter()
        .position(|&b| match quote {
            Some(q) => {
                if b == q {
                    quote = None;
                }
                false
            }
            None if b == b'"' || b == b'\'' => {
                quote = Some(b);
                false
            }
            None => b == b'>',
        })
        .map(|off| marker_len + off + 1)
}
2636
2637/// Try to split the close line of an HTML_BLOCK_DIV body into a
2638/// leading content prefix and a clean `</tag>...` remainder. Returns
2639/// `Some((leading, close_part))` only when the line contains exactly
2640/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2641/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2642/// for missing close tags, or for compound shapes the parser
2643/// shouldn't attempt to lift in this pass.
2644///
2645/// `leading` may be empty (close starts at column 0) or pure
2646/// whitespace (close on an indented line). Both count as "butted" per
2647/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2648/// the trailing paragraph inside the div demotes Para→Plain.
2649fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2650    let (opens, closes) = count_tag_balance(line, tag_name);
2651    if opens != 0 || closes != 1 {
2652        return None;
2653    }
2654    // Locate the close tag's opening `<` by lowercased substring search.
2655    // Safe because we've already established (above) that the line has
2656    // exactly one `</tag>` and no `<tag>` opens, so the first match is
2657    // THE close.
2658    let needle = format!("</{}", tag_name);
2659    let lower = line.to_ascii_lowercase();
2660    let close_lt = lower.find(&needle)?;
2661    Some((&line[..close_lt], &line[close_lt..]))
2662}
2663
2664/// Depth-aware variant of `try_split_close_line` used by the same-line
2665/// lift path. Walks `line` starting at depth 1 (we begin inside the
2666/// open `<tag>`) and splits at the byte position where the matched
2667/// `</tag>` close brings depth to 0. Returns `Some((body,
2668/// close_part))` where `body` is the bytes before the matched-close
2669/// start and `close_part` is the bytes from the matched close onward.
2670///
2671/// Unlike `try_split_close_line` this accepts nested same-tag opens
2672/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2673/// body=`<div>x</div>` (a nested div the body lift parses
2674/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2675/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2676/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2677fn try_split_close_line_depth_aware<'a>(
2678    line: &'a str,
2679    tag_name: &str,
2680) -> Option<(&'a str, &'a str)> {
2681    let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2682    Some((&line[..close_start], &line[close_start..]))
2683}
2684
2685/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2686/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2687/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2688/// + TEXT(">") + TEXT(trailing)?`.
2689///
2690/// Bytes are byte-identical to the source — this only tokenizes at finer
2691/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2692/// region structurally. Falls back to a single TEXT token if the line
2693/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2694/// only retags as the lift kind when this shape was matched).
2695///
2696/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2697/// returned as `&str` instead so the caller can splice them into the
2698/// recursive-parse input for the structural body lift. When false
2699/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2700/// empty slice is returned.
2701fn emit_open_tag_tokens<'a>(
2702    builder: &mut GreenNodeBuilder<'static>,
2703    line: &'a str,
2704    tag_name: &str,
2705    lift_trailing: bool,
2706) -> &'a str {
2707    let bytes = line.as_bytes();
2708    // Leading indent (CommonMark allows up to 3 spaces).
2709    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2710    if indent_end > 0 {
2711        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2712    }
2713    let rest = &line[indent_end..];
2714    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2715    let prefix_len = 1 + tag_name.len();
2716    if !rest.starts_with('<')
2717        || rest.len() < prefix_len
2718        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2719    {
2720        builder.token(SyntaxKind::TEXT.into(), rest);
2721        return "";
2722    }
2723    let after_name = &rest[prefix_len..];
2724    let after_name_bytes = after_name.as_bytes();
2725    // Find the closing `>` of the open tag, respecting quoted attribute values.
2726    let mut i = 0usize;
2727    let mut quote: Option<u8> = None;
2728    let mut tag_close: Option<usize> = None;
2729    while i < after_name_bytes.len() {
2730        let b = after_name_bytes[i];
2731        match (quote, b) {
2732            (None, b'"') | (None, b'\'') => quote = Some(b),
2733            (Some(q), b2) if b2 == q => quote = None,
2734            (None, b'>') => {
2735                tag_close = Some(i);
2736                break;
2737            }
2738            _ => {}
2739        }
2740        i += 1;
2741    }
2742    let Some(tag_close) = tag_close else {
2743        // Open tag has no closing `>` on this line — defensive fallback.
2744        builder.token(SyntaxKind::TEXT.into(), rest);
2745        return "";
2746    };
2747    // Whitespace between the tag name and the attribute region.
2748    let attrs_inner = &after_name[..tag_close];
2749    let ws_end = attrs_inner
2750        .as_bytes()
2751        .iter()
2752        .position(|&b| !matches!(b, b' ' | b'\t'))
2753        .unwrap_or(attrs_inner.len());
2754    let leading_ws = &attrs_inner[..ws_end];
2755    // Strip a trailing self-closing slash and the whitespace before it
2756    // from the attribute region; emit them as TEXT outside the
2757    // HTML_ATTRS node so the structural region only holds attribute
2758    // bytes (not formatting punctuation).
2759    let attrs_after_ws = &attrs_inner[ws_end..];
2760    let mut attr_end = attrs_after_ws.len();
2761    let attr_bytes = attrs_after_ws.as_bytes();
2762    let mut self_close_start = attr_end;
2763    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2764        self_close_start = attr_end - 1;
2765        attr_end = self_close_start;
2766        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2767            attr_end -= 1;
2768        }
2769    }
2770    let attrs_text = &attrs_after_ws[..attr_end];
2771    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2772    let after_self_close = &attrs_after_ws[self_close_start..];
2773
2774    // Use the original source bytes for the `<tag` prefix (preserves
2775    // source casing — losslessness).
2776    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2777    if !leading_ws.is_empty() {
2778        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2779    }
2780    if !attrs_text.is_empty() {
2781        builder.start_node(SyntaxKind::HTML_ATTRS.into());
2782        builder.token(SyntaxKind::TEXT.into(), attrs_text);
2783        builder.finish_node();
2784    }
2785    if !trailing_text.is_empty() {
2786        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2787    }
2788    if !after_self_close.is_empty() {
2789        builder.token(SyntaxKind::TEXT.into(), after_self_close);
2790    }
2791    builder.token(SyntaxKind::TEXT.into(), ">");
2792    let after_gt = &after_name[tag_close + 1..];
2793    if lift_trailing {
2794        // Return trailing bytes to the caller (will be spliced into the
2795        // recursive-parse input for the body lift).
2796        return after_gt;
2797    }
2798    if !after_gt.is_empty() {
2799        builder.token(SyntaxKind::TEXT.into(), after_gt);
2800    }
2801    ""
2802}
2803
2804/// Detect a multi-line HTML open tag for `tag_name`. Returns
2805/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2806/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2807/// the existing path) or when the `>` is missing entirely.
2808///
2809/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2810/// attribute value doesn't terminate the open tag. Quote state carries
2811/// across line boundaries.
2812fn find_multiline_open_end(
2813    lines: &[&str],
2814    start_pos: usize,
2815    first_inner: &str,
2816    tag_name: &str,
2817    bq_depth: usize,
2818) -> Option<usize> {
2819    // Locate the `<tag_name` literal in `first_inner` to start scanning past
2820    // it. Match is ASCII case-insensitive; the parser preserves source casing.
2821    // `first_inner` is already bq-stripped by the caller; subsequent lines are
2822    // stripped inline below via `strip_n_blockquote_markers`.
2823    let trimmed = strip_leading_spaces(first_inner);
2824    let prefix_len = 1 + tag_name.len();
2825    if !trimmed.starts_with('<')
2826        || trimmed.len() < prefix_len
2827        || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2828    {
2829        return None;
2830    }
2831    let leading_indent = first_inner.len() - trimmed.len();
2832    let mut i = leading_indent + prefix_len; // past `<tag_name`
2833    let mut quote: Option<u8> = None;
2834
2835    // Scan first line for an unquoted `>`.
2836    let line0_bytes = first_inner.as_bytes();
2837    while i < line0_bytes.len() {
2838        match (quote, line0_bytes[i]) {
2839            (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2840            (Some(q), x) if x == q => quote = None,
2841            (None, b'>') => return None, // single-line case
2842            _ => {}
2843        }
2844        i += 1;
2845    }
2846
2847    // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2848    // blockquote markers per line so `> ` prefixes don't count toward the
2849    // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2850    let mut line_idx = start_pos + 1;
2851    while line_idx < lines.len() {
2852        let raw = lines[line_idx];
2853        let inner = if bq_depth > 0 {
2854            strip_n_blockquote_markers(raw, bq_depth)
2855        } else {
2856            raw
2857        };
2858        for &b in inner.as_bytes() {
2859            match (quote, b) {
2860                (None, b'"') | (None, b'\'') => quote = Some(b),
2861                (Some(q), x) if x == q => quote = None,
2862                (None, b'>') => return Some(line_idx),
2863                _ => {}
2864            }
2865        }
2866        line_idx += 1;
2867    }
2868
2869    None
2870}
2871
2872/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2873/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2874/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2875/// open tag (no `>` in the remaining input) as paragraph text rather than
2876/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2877/// reparse the same content recursively, causing a stack overflow.
2878///
2879/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2880/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2881/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2882/// the tag eventually closes far away.
2883pub(crate) fn pandoc_html_open_tag_closes(
2884    lines: &[&str],
2885    start_pos: usize,
2886    bq_depth: usize,
2887) -> bool {
2888    if start_pos >= lines.len() {
2889        return false;
2890    }
2891    let mut quote: Option<u8> = None;
2892    for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2893        let inner = if bq_depth > 0 {
2894            strip_n_blockquote_markers(line, bq_depth)
2895        } else {
2896            line
2897        };
2898        let bytes = inner.as_bytes();
2899        let mut i = 0usize;
2900        if offset == start_pos {
2901            while i < bytes.len() && bytes[i] == b' ' {
2902                i += 1;
2903            }
2904            if bytes.get(i) != Some(&b'<') {
2905                return false;
2906            }
2907            i += 1;
2908        }
2909        while i < bytes.len() {
2910            match (quote, bytes[i]) {
2911                (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2912                (Some(q), x) if x == q => quote = None,
2913                (None, b'>') => return true,
2914                _ => {}
2915            }
2916            i += 1;
2917        }
2918    }
2919    false
2920}
2921
2922/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2923/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2924/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2925/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2926/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2927/// Phase 6 structural lift.
2928///
2929/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2930/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2931/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2932/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2933///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
2934///
2935/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2936/// newline; `parse_html_attribute_list` tolerates whitespace.
2937#[allow(clippy::too_many_arguments)]
2938fn emit_multiline_open_tag_with_attrs(
2939    builder: &mut GreenNodeBuilder<'static>,
2940    lines: &[&str],
2941    start_pos: usize,
2942    end_line_idx: usize,
2943    tag_name: &str,
2944    bq_depth: usize,
2945    lift_trailing: bool,
2946    pre_content: &mut String,
2947) {
2948    let prefix_len = 1 + tag_name.len();
2949    for (line_idx, raw) in lines
2950        .iter()
2951        .enumerate()
2952        .take(end_line_idx + 1)
2953        .skip(start_pos)
2954    {
2955        // Strip `bq_depth` blockquote markers from the source line so
2956        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2957        // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2958        // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2959        // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2960        // before this parser runs; re-emitting it here would double
2961        // the bytes and break losslessness.
2962        let stripped = if bq_depth > 0 {
2963            strip_n_blockquote_markers(raw, bq_depth)
2964        } else {
2965            raw
2966        };
2967        let bq_prefix_len = raw.len() - stripped.len();
2968        if bq_prefix_len > 0 && line_idx != start_pos {
2969            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2970        }
2971        let line = stripped;
2972        let (line_no_nl, newline_str) = strip_newline(line);
2973
2974        if line_idx == start_pos {
2975            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2976            // + attrs)?. The closing `>` is on a later line, so any
2977            // remaining bytes after "<{tag_name}" on this line are the
2978            // start of the attribute region.
2979            let bytes = line_no_nl.as_bytes();
2980            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2981            if indent_end > 0 {
2982                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2983            }
2984            // Defensive: caller verified the line starts with `<{tag_name}`.
2985            let after_indent = &line_no_nl[indent_end..];
2986            if after_indent.len() >= prefix_len {
2987                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2988                let rest = &after_indent[prefix_len..];
2989                emit_attr_region(builder, rest);
2990            } else {
2991                builder.token(SyntaxKind::TEXT.into(), after_indent);
2992            }
2993        } else if line_idx < end_line_idx {
2994            // Pure attribute line.
2995            let bytes = line_no_nl.as_bytes();
2996            let indent_end = bytes
2997                .iter()
2998                .position(|&b| !matches!(b, b' ' | b'\t'))
2999                .unwrap_or(bytes.len());
3000            if indent_end > 0 {
3001                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3002            }
3003            let attrs_text = &line_no_nl[indent_end..];
3004            if !attrs_text.is_empty() {
3005                builder.start_node(SyntaxKind::HTML_ATTRS.into());
3006                builder.token(SyntaxKind::TEXT.into(), attrs_text);
3007                builder.finish_node();
3008            }
3009        } else {
3010            // Last line: indent + attrs + ">" + trailing.
3011            let bytes = line_no_nl.as_bytes();
3012            let indent_end = bytes
3013                .iter()
3014                .position(|&b| !matches!(b, b' ' | b'\t'))
3015                .unwrap_or(bytes.len());
3016            if indent_end > 0 {
3017                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3018            }
3019            // Find the unquoted `>` byte position in this line.
3020            let mut quote: Option<u8> = None;
3021            let mut gt_pos: Option<usize> = None;
3022            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
3023                let actual_j = indent_end + j;
3024                match (quote, b) {
3025                    (None, b'"') | (None, b'\'') => quote = Some(b),
3026                    (Some(q), x) if x == q => quote = None,
3027                    (None, b'>') => {
3028                        gt_pos = Some(actual_j);
3029                        break;
3030                    }
3031                    _ => {}
3032                }
3033            }
3034            let Some(gt) = gt_pos else {
3035                // Defensive — caller said `>` is on this line.
3036                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3037                if !newline_str.is_empty() {
3038                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3039                }
3040                continue;
3041            };
3042            // Attribute region: between indent_end and gt, with possibly
3043            // trailing whitespace before `>`.
3044            let attrs_region = &line_no_nl[indent_end..gt];
3045            let region_bytes = attrs_region.as_bytes();
3046            // Strip trailing whitespace from attrs region; emit as
3047            // separate WHITESPACE so HTML_ATTRS only contains attribute
3048            // bytes.
3049            let mut attr_end = region_bytes.len();
3050            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3051                attr_end -= 1;
3052            }
3053            let attrs_text = &attrs_region[..attr_end];
3054            let trailing_ws = &attrs_region[attr_end..];
3055            if !attrs_text.is_empty() {
3056                builder.start_node(SyntaxKind::HTML_ATTRS.into());
3057                builder.token(SyntaxKind::TEXT.into(), attrs_text);
3058                builder.finish_node();
3059            }
3060            if !trailing_ws.is_empty() {
3061                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3062            }
3063            builder.token(SyntaxKind::TEXT.into(), ">");
3064            let after_gt = &line_no_nl[gt + 1..];
3065            if lift_trailing && !after_gt.is_empty() {
3066                // Lift trailing bytes (and the trailing newline) into
3067                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3068                // with `TEXT(">")`. The recursive parse at the close-marker
3069                // site treats `pre_content` as the leading bytes of the
3070                // structural body — same shape produced by `emit_open_tag_tokens`
3071                // for single-line opens.
3072                pre_content.push_str(after_gt);
3073                pre_content.push_str(newline_str);
3074                continue;
3075            }
3076            if !after_gt.is_empty() {
3077                builder.token(SyntaxKind::TEXT.into(), after_gt);
3078            }
3079        }
3080
3081        if !newline_str.is_empty() {
3082            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3083        }
3084    }
3085}
3086
3087/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3088/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3089/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3090/// node is added. Pandoc's projector reads attributes only for `<div>` /
3091/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3092fn emit_multiline_open_tag_simple(
3093    builder: &mut GreenNodeBuilder<'static>,
3094    lines: &[&str],
3095    start_pos: usize,
3096    end_line_idx: usize,
3097    bq_depth: usize,
3098) {
3099    for (line_idx, raw) in lines
3100        .iter()
3101        .enumerate()
3102        .take(end_line_idx + 1)
3103        .skip(start_pos)
3104    {
3105        let stripped = if bq_depth > 0 {
3106            strip_n_blockquote_markers(raw, bq_depth)
3107        } else {
3108            raw
3109        };
3110        let bq_prefix_len = raw.len() - stripped.len();
3111        // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3112        // re-emit prefixes only for subsequent lines.
3113        if bq_prefix_len > 0 && line_idx != start_pos {
3114            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3115        }
3116        let (line_no_nl, newline_str) = strip_newline(stripped);
3117        if !line_no_nl.is_empty() {
3118            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3119        }
3120        if !newline_str.is_empty() {
3121            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3122        }
3123    }
3124}
3125
3126/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3127/// `<div` literal up to end-of-line. Called only from
3128/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3129/// pure attribute (and possibly inter-attribute whitespace).
3130fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3131    if region.is_empty() {
3132        return;
3133    }
3134    let bytes = region.as_bytes();
3135    // Split a leading run of whitespace into a WHITESPACE token so the
3136    // HTML_ATTRS node holds only attribute bytes.
3137    let ws_end = bytes
3138        .iter()
3139        .position(|&b| !matches!(b, b' ' | b'\t'))
3140        .unwrap_or(bytes.len());
3141    if ws_end > 0 {
3142        builder.token(SyntaxKind::WHITESPACE.into(), &region[..ws_end]);
3143    }
3144    let attrs_text = &region[ws_end..];
3145    if !attrs_text.is_empty() {
3146        builder.start_node(SyntaxKind::HTML_ATTRS.into());
3147        builder.token(SyntaxKind::TEXT.into(), attrs_text);
3148        builder.finish_node();
3149    }
3150}
3151
3152/// Emit one continuation line of an HTML block, preserving any blockquote
3153/// markers as structural tokens (so the CST stays byte-equal to the source
3154/// and downstream consumers can strip them per-context).
3155fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3156    let inner = if bq_depth > 0 {
3157        let stripped = strip_n_blockquote_markers(line, bq_depth);
3158        let prefix_len = line.len() - stripped.len();
3159        if prefix_len > 0 {
3160            for ch in line[..prefix_len].chars() {
3161                if ch == '>' {
3162                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3163                } else {
3164                    let mut buf = [0u8; 4];
3165                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3166                }
3167            }
3168        }
3169        stripped
3170    } else {
3171        line
3172    };
3173
3174    let (line_without_newline, newline_str) = strip_newline(inner);
3175    if !line_without_newline.is_empty() {
3176        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3177    }
3178    if !newline_str.is_empty() {
3179        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3180    }
3181}
3182
3183#[cfg(test)]
3184mod tests {
3185    use super::*;
3186
3187    #[test]
3188    fn test_try_parse_html_comment() {
3189        assert_eq!(
3190            try_parse_html_block_start("<!-- comment -->", false),
3191            Some(HtmlBlockType::Comment)
3192        );
3193        assert_eq!(
3194            try_parse_html_block_start("  <!-- comment -->", false),
3195            Some(HtmlBlockType::Comment)
3196        );
3197    }
3198
3199    #[test]
3200    fn test_try_parse_div_tag() {
3201        assert_eq!(
3202            try_parse_html_block_start("<div>", false),
3203            Some(HtmlBlockType::BlockTag {
3204                tag_name: "div".to_string(),
3205                is_verbatim: false,
3206                closed_by_blank_line: false,
3207                depth_aware: true,
3208                closes_at_open_tag: false,
3209                is_closing: false,
3210            })
3211        );
3212        assert_eq!(
3213            try_parse_html_block_start("<div class=\"test\">", false),
3214            Some(HtmlBlockType::BlockTag {
3215                tag_name: "div".to_string(),
3216                is_verbatim: false,
3217                closed_by_blank_line: false,
3218                depth_aware: true,
3219                closes_at_open_tag: false,
3220                is_closing: false,
3221            })
3222        );
3223    }
3224
3225    #[test]
3226    fn test_try_parse_script_tag() {
3227        assert_eq!(
3228            try_parse_html_block_start("<script>", false),
3229            Some(HtmlBlockType::BlockTag {
3230                tag_name: "script".to_string(),
3231                is_verbatim: true,
3232                closed_by_blank_line: false,
3233                depth_aware: true,
3234                closes_at_open_tag: false,
3235                is_closing: false,
3236            })
3237        );
3238    }
3239
3240    #[test]
3241    fn test_try_parse_processing_instruction() {
3242        assert_eq!(
3243            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3244            Some(HtmlBlockType::ProcessingInstruction)
3245        );
3246    }
3247
3248    #[test]
3249    fn test_try_parse_declaration() {
3250        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3251        assert_eq!(
3252            try_parse_html_block_start("<!DOCTYPE html>", true),
3253            Some(HtmlBlockType::Declaration)
3254        );
3255        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3256        // just uppercase. Lowercase doctype must match too.
3257        assert_eq!(
3258            try_parse_html_block_start("<!doctype html>", true),
3259            Some(HtmlBlockType::Declaration)
3260        );
3261        // Pandoc dialect does not — bare declarations fall through to
3262        // paragraph parsing.
3263        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3264        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3265    }
3266
3267    #[test]
3268    fn test_dialect_specific_block_tag_membership() {
3269        // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3270        // CommonMark §4.6 type-6 plus a few additions. These tags
3271        // diverge between dialects:
3272        //   CM-only block tags (Pandoc treats as inline raw HTML):
3273        //     dialog, legend, menuitem, optgroup, option, frame,
3274        //     base, basefont, link, param
3275        //   Pandoc-only block tags (CM doesn't recognize):
3276        //     canvas, hgroup, isindex, meta, output
3277        for cm_only in [
3278            "<dialog>",
3279            "<legend>",
3280            "<menuitem>",
3281            "<optgroup>",
3282            "<option>",
3283            "<frame>",
3284            "<base>",
3285            "<basefont>",
3286            "<link>",
3287            "<param>",
3288        ] {
3289            assert!(
3290                matches!(
3291                    try_parse_html_block_start(cm_only, true),
3292                    Some(HtmlBlockType::BlockTag { .. })
3293                ),
3294                "{cm_only} should be a block-tag start under CommonMark",
3295            );
3296            assert_eq!(
3297                try_parse_html_block_start(cm_only, false),
3298                None,
3299                "{cm_only} should NOT be a block-tag start under Pandoc",
3300            );
3301        }
3302        for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3303            // Under CM these are not type-6 BlockTags; they may still match
3304            // type-7 (complete tag on a line) which has different semantics.
3305            assert!(
3306                !matches!(
3307                    try_parse_html_block_start(pandoc_only, true),
3308                    Some(HtmlBlockType::BlockTag { .. })
3309                ),
3310                "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3311            );
3312            assert!(
3313                matches!(
3314                    try_parse_html_block_start(pandoc_only, false),
3315                    Some(HtmlBlockType::BlockTag { .. })
3316                ),
3317                "{pandoc_only} should be a block-tag start under Pandoc",
3318            );
3319        }
3320    }
3321
3322    #[test]
3323    fn test_pandoc_inline_block_tag_membership() {
3324        // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3325        // fresh-block positions under Pandoc dialect. We list the
3326        // non-void, non-script subset (verbatim `script` is handled
3327        // via the verbatim path; void elements are deferred — see
3328        // PANDOC_INLINE_BLOCK_TAGS docs).
3329        for tag in [
3330            "<button>",
3331            "<iframe>",
3332            "<video>",
3333            "<audio>",
3334            "<noscript>",
3335            "<object>",
3336            "<map>",
3337            "<progress>",
3338            "<del>",
3339            "<ins>",
3340            "<svg>",
3341            "<applet>",
3342        ] {
3343            assert!(
3344                matches!(
3345                    try_parse_html_block_start(tag, false),
3346                    Some(HtmlBlockType::BlockTag {
3347                        depth_aware: true,
3348                        ..
3349                    })
3350                ),
3351                "{tag} should be a depth-aware block-tag start under Pandoc",
3352            );
3353        }
3354        // Closing forms of inline-block tags also start a block under
3355        // Pandoc — pandoc-native pins `</button>` standalone as a
3356        // single-line `RawBlock`. These use `closes_at_open_tag: true`
3357        // (no balanced match — the close emits as a one-line block on
3358        // its own).
3359        for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3360            assert!(
3361                matches!(
3362                    try_parse_html_block_start(closing, false),
3363                    Some(HtmlBlockType::BlockTag {
3364                        depth_aware: false,
3365                        closes_at_open_tag: true,
3366                        ..
3367                    })
3368                ),
3369                "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3370            );
3371        }
3372    }
3373
3374    #[test]
3375    fn test_pandoc_void_block_tag_membership() {
3376        // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3377        // at fresh-block positions under Pandoc dialect, with
3378        // `closes_at_open_tag: true` — the block always ends on the
3379        // open-tag line (no closing tag to match).
3380        for tag in [
3381            "<area>",
3382            "<embed>",
3383            "<source>",
3384            "<track>",
3385            "<embed src=\"foo.swf\">",
3386            "<source src=\"foo.mp4\" type=\"video/mp4\">",
3387        ] {
3388            assert!(
3389                matches!(
3390                    try_parse_html_block_start(tag, false),
3391                    Some(HtmlBlockType::BlockTag {
3392                        depth_aware: false,
3393                        closes_at_open_tag: true,
3394                        ..
3395                    })
3396                ),
3397                "{tag} should be a void block-tag start under Pandoc",
3398            );
3399        }
3400        // Closing forms of void tags also start a single-line block
3401        // under Pandoc. Void elements have no closing tag in HTML, but
3402        // `</embed>` etc. can appear in the wild — pandoc-native still
3403        // emits them as `RawBlock`s at fresh-block positions; mirror
3404        // that with the same `closes_at_open_tag: true` shape.
3405        for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3406            assert!(
3407                matches!(
3408                    try_parse_html_block_start(closing, false),
3409                    Some(HtmlBlockType::BlockTag {
3410                        depth_aware: false,
3411                        closes_at_open_tag: true,
3412                        ..
3413                    })
3414                ),
3415                "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3416            );
3417        }
3418        // Under CommonMark dialect, the void-tag block-start path is
3419        // skipped. `<source>` and `<track>` are in the CM type-6
3420        // BLOCK_TAGS set so they DO start a block, but with CM type-6
3421        // semantics (`closed_by_blank_line: true`,
3422        // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3423        // `<embed>` and `<area>` aren't in the CM type-6 list — they
3424        // fall through to type 7 (complete tag on a line by itself).
3425        assert_eq!(
3426            try_parse_html_block_start("<embed>", true),
3427            Some(HtmlBlockType::Type7)
3428        );
3429        assert_eq!(
3430            try_parse_html_block_start("<area>", true),
3431            Some(HtmlBlockType::Type7)
3432        );
3433        assert!(matches!(
3434            try_parse_html_block_start("<source src=\"x\">", true),
3435            Some(HtmlBlockType::BlockTag {
3436                closed_by_blank_line: true,
3437                closes_at_open_tag: false,
3438                ..
3439            })
3440        ));
3441        assert!(matches!(
3442            try_parse_html_block_start("<track src=\"x\">", true),
3443            Some(HtmlBlockType::BlockTag {
3444                closed_by_blank_line: true,
3445                closes_at_open_tag: false,
3446                ..
3447            })
3448        ));
3449    }
3450
3451    #[test]
3452    fn test_find_multiline_open_end() {
3453        // Single-line opens return None (caller takes the regular path).
3454        assert_eq!(
3455            find_multiline_open_end(&["<div id=\"x\">"], 0, "<div id=\"x\">", "div", 0),
3456            None
3457        );
3458        assert_eq!(
3459            find_multiline_open_end(&["<embed src=\"x\">"], 0, "<embed src=\"x\">", "embed", 0),
3460            None
3461        );
3462        // Multi-line opens return the line index of the closing `>`.
3463        assert_eq!(
3464            find_multiline_open_end(&["<embed", "  src=\"x\">"], 0, "<embed", "embed", 0),
3465            Some(1)
3466        );
3467        assert_eq!(
3468            find_multiline_open_end(
3469                &["<embed", "  src=\"x\"", "  type=\"video\">"],
3470                0,
3471                "<embed",
3472                "embed",
3473                0
3474            ),
3475            Some(2)
3476        );
3477        // Tag-name mismatch returns None (case-insensitive on the tag name).
3478        assert_eq!(
3479            find_multiline_open_end(&["<embed", "  src=\"x\">"], 0, "<embed", "div", 0),
3480            None
3481        );
3482        assert_eq!(
3483            find_multiline_open_end(&["<EMBED", "  src=\"x\">"], 0, "<EMBED", "embed", 0),
3484            Some(1)
3485        );
3486        // Quoted `>` does not terminate the open tag; quote state threads
3487        // across line boundaries.
3488        assert_eq!(
3489            find_multiline_open_end(
3490                &["<embed title=\"a>b", "  c\">"],
3491                0,
3492                "<embed title=\"a>b",
3493                "embed",
3494                0
3495            ),
3496            Some(1)
3497        );
3498        // No `>` anywhere returns None.
3499        assert_eq!(
3500            find_multiline_open_end(&["<embed", "  src=\"x\""], 0, "<embed", "embed", 0),
3501            None
3502        );
3503        // Subsequent lines inside a blockquote: bq markers stripped before
3504        // scanning so `> ` prefixes don't count.
3505        assert_eq!(
3506            find_multiline_open_end(&["<div", ">   id=\"x\">"], 0, "<div", "div", 1),
3507            Some(1)
3508        );
3509        // Nested bq: strips two `> ` per line.
3510        assert_eq!(
3511            find_multiline_open_end(
3512                &["<section", "> >   id=\"x\">"],
3513                0,
3514                "<section",
3515                "section",
3516                2
3517            ),
3518            Some(1)
3519        );
3520    }
3521
3522    #[test]
3523    fn test_pandoc_html_open_tag_closes() {
3524        // Single-line complete: scanner finds `>` on the first line.
3525        assert!(pandoc_html_open_tag_closes(&["<div>"], 0, 0));
3526        assert!(pandoc_html_open_tag_closes(&["<embed src=\"x\">"], 0, 0));
3527        // Multi-line complete: scanner finds `>` on a later line.
3528        assert!(pandoc_html_open_tag_closes(
3529            &["<div", "  id=\"x\">", "body", "</div>"],
3530            0,
3531            0
3532        ));
3533        assert!(pandoc_html_open_tag_closes(
3534            &["<embed", "  src=\"x.png\" alt=\"y\">"],
3535            0,
3536            0
3537        ));
3538        // Quoted `>` does not close: scanner threads quote state.
3539        assert!(!pandoc_html_open_tag_closes(
3540            &["<div title=\"a>b", "  c\""],
3541            0,
3542            0
3543        ));
3544        assert!(pandoc_html_open_tag_closes(
3545            &["<div title=\"a>b", "  c\">"],
3546            0,
3547            0
3548        ));
3549        // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
3550        assert!(!pandoc_html_open_tag_closes(&["<embed"], 0, 0));
3551        assert!(!pandoc_html_open_tag_closes(&["<div", "foo", "bar"], 0, 0));
3552        // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
3553        // across them); the scan continues until EOF or `>`.
3554        assert!(pandoc_html_open_tag_closes(
3555            &["<div", "", "id=\"x\">"],
3556            0,
3557            0
3558        ));
3559    }
3560
3561    #[test]
3562    fn test_try_parse_cdata() {
3563        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
3564        assert_eq!(
3565            try_parse_html_block_start("<![CDATA[content]]>", true),
3566            Some(HtmlBlockType::CData)
3567        );
3568        // Pandoc dialect does not.
3569        assert_eq!(
3570            try_parse_html_block_start("<![CDATA[content]]>", false),
3571            None
3572        );
3573    }
3574
3575    #[test]
3576    fn test_extract_block_tag_name_open_only() {
3577        assert_eq!(
3578            extract_block_tag_name("<div>", false),
3579            Some("div".to_string())
3580        );
3581        assert_eq!(
3582            extract_block_tag_name("<div class=\"test\">", false),
3583            Some("div".to_string())
3584        );
3585        assert_eq!(
3586            extract_block_tag_name("<div/>", false),
3587            Some("div".to_string())
3588        );
3589        assert_eq!(extract_block_tag_name("</div>", false), None);
3590        assert_eq!(extract_block_tag_name("<>", false), None);
3591        assert_eq!(extract_block_tag_name("< div>", false), None);
3592    }
3593
3594    #[test]
3595    fn test_extract_block_tag_name_with_closing() {
3596        // CommonMark §4.6 type-6 starts also accept closing tags.
3597        assert_eq!(
3598            extract_block_tag_name("</div>", true),
3599            Some("div".to_string())
3600        );
3601        assert_eq!(
3602            extract_block_tag_name("</div >", true),
3603            Some("div".to_string())
3604        );
3605    }
3606
3607    #[test]
3608    fn test_commonmark_type6_closing_tag_start() {
3609        assert_eq!(
3610            try_parse_html_block_start("</div>", true),
3611            Some(HtmlBlockType::BlockTag {
3612                tag_name: "div".to_string(),
3613                is_verbatim: false,
3614                closed_by_blank_line: true,
3615                depth_aware: false,
3616                closes_at_open_tag: false,
3617                is_closing: true,
3618            })
3619        );
3620    }
3621
3622    #[test]
3623    fn test_commonmark_type7_open_tag() {
3624        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
3625        // CommonMark; rejected under non-CommonMark.
3626        assert_eq!(
3627            try_parse_html_block_start("<a href=\"foo\">", true),
3628            Some(HtmlBlockType::Type7)
3629        );
3630        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
3631    }
3632
3633    #[test]
3634    fn test_commonmark_type7_close_tag() {
3635        assert_eq!(
3636            try_parse_html_block_start("</ins>", true),
3637            Some(HtmlBlockType::Type7)
3638        );
3639    }
3640
3641    #[test]
3642    fn test_commonmark_type7_rejects_with_trailing_text() {
3643        // A complete tag must be followed only by whitespace.
3644        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
3645    }
3646
3647    #[test]
3648    fn test_is_closing_marker_comment() {
3649        let block_type = HtmlBlockType::Comment;
3650        assert!(is_closing_marker("-->", &block_type));
3651        assert!(is_closing_marker("end -->", &block_type));
3652        assert!(!is_closing_marker("<!--", &block_type));
3653    }
3654
3655    #[test]
3656    fn test_is_closing_marker_tag() {
3657        let block_type = HtmlBlockType::BlockTag {
3658            tag_name: "div".to_string(),
3659            is_verbatim: false,
3660            closed_by_blank_line: false,
3661            depth_aware: false,
3662            closes_at_open_tag: false,
3663            is_closing: false,
3664        };
3665        assert!(is_closing_marker("</div>", &block_type));
3666        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
3667        assert!(is_closing_marker("content</div>", &block_type));
3668        assert!(!is_closing_marker("<div>", &block_type));
3669    }
3670
3671    #[test]
3672    fn test_parse_html_comment_block() {
3673        let input = "<!-- comment -->\n";
3674        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3675        let mut builder = GreenNodeBuilder::new();
3676
3677        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3678        let opts = ParserOptions::default();
3679        let new_pos = parse_html_block_with_wrapper(
3680            &mut builder,
3681            &lines,
3682            0,
3683            block_type,
3684            0,
3685            SyntaxKind::HTML_BLOCK,
3686            &opts,
3687        );
3688
3689        assert_eq!(new_pos, 1);
3690    }
3691
3692    #[test]
3693    fn test_parse_div_block() {
3694        let input = "<div>\ncontent\n</div>\n";
3695        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3696        let mut builder = GreenNodeBuilder::new();
3697
3698        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3699        let opts = ParserOptions::default();
3700        let new_pos = parse_html_block_with_wrapper(
3701            &mut builder,
3702            &lines,
3703            0,
3704            block_type,
3705            0,
3706            SyntaxKind::HTML_BLOCK,
3707            &opts,
3708        );
3709
3710        assert_eq!(new_pos, 3);
3711    }
3712
3713    #[test]
3714    fn test_parse_html_block_no_closing() {
3715        let input = "<div>\ncontent\n";
3716        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3717        let mut builder = GreenNodeBuilder::new();
3718
3719        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3720        let opts = ParserOptions::default();
3721        let new_pos = parse_html_block_with_wrapper(
3722            &mut builder,
3723            &lines,
3724            0,
3725            block_type,
3726            0,
3727            SyntaxKind::HTML_BLOCK,
3728            &opts,
3729        );
3730
3731        // Should consume all lines even without closing tag
3732        assert_eq!(new_pos, 2);
3733    }
3734
3735    #[test]
3736    fn test_parse_div_block_nested_pandoc() {
3737        // Pandoc dialect: a nested `<div>...<div>...</div>...</div>` must
3738        // close on the OUTER `</div>`, not the first `</div>` seen. The
3739        // CommonMark-style "first close" scanner is wrong here; Pandoc's
3740        // div parser is depth-aware (mirrors `htmlInBalanced`).
3741        let input =
3742            "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
3743        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3744        let mut builder = GreenNodeBuilder::new();
3745
3746        // is_commonmark = false → Pandoc dialect.
3747        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3748        let opts = ParserOptions::default();
3749        let new_pos = parse_html_block_with_wrapper(
3750            &mut builder,
3751            &lines,
3752            0,
3753            block_type,
3754            0,
3755            SyntaxKind::HTML_BLOCK_DIV,
3756            &opts,
3757        );
3758
3759        // 9 lines: outer-open, blank, inner-open, blank, content, blank,
3760        // inner-close, blank, outer-close. All consumed.
3761        assert_eq!(new_pos, 9);
3762    }
3763
3764    #[test]
3765    fn test_parse_div_block_same_line_pandoc() {
3766        // <div>foo</div> on a single line: opens=1, closes=1, depth=0 →
3767        // close on first line. Depth-aware tracking must not regress this.
3768        let input = "<div>foo</div>\n";
3769        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3770        let mut builder = GreenNodeBuilder::new();
3771
3772        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3773        let opts = ParserOptions::default();
3774        let new_pos = parse_html_block_with_wrapper(
3775            &mut builder,
3776            &lines,
3777            0,
3778            block_type,
3779            0,
3780            SyntaxKind::HTML_BLOCK_DIV,
3781            &opts,
3782        );
3783        assert_eq!(new_pos, 1);
3784    }
3785
3786    #[test]
3787    fn test_commonmark_verbatim_first_close() {
3788        // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
3789        // ends at the first matching close — not depth-aware. Stash a
3790        // bogus inner `<script>` inside a JS string; the outer block
3791        // still closes at the first `</script>`.
3792        let input = "<script>\nlet x = '<script>';\n</script>\n";
3793        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3794        let mut builder = GreenNodeBuilder::new();
3795
3796        // is_commonmark = true.
3797        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3798        let opts = ParserOptions::default();
3799        let new_pos = parse_html_block_with_wrapper(
3800            &mut builder,
3801            &lines,
3802            0,
3803            block_type,
3804            0,
3805            SyntaxKind::HTML_BLOCK,
3806            &opts,
3807        );
3808        // Three lines, closed at first `</script>` (line 2). new_pos = 3.
3809        assert_eq!(new_pos, 3);
3810    }
3811
3812    #[test]
3813    fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
3814        // Multi-line open tag with the closing `>` on its own line:
3815        //
3816        //   <div
3817        //     id="x"
3818        //     class="y"
3819        //   >
3820        //
3821        //   foo
3822        //
3823        //   </div>
3824        //
3825        // Open tag spans lines 0..=3. Content starts at line 4.
3826        let input = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
3827        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3828        let mut builder = GreenNodeBuilder::new();
3829
3830        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3831        let opts = ParserOptions::default();
3832        let new_pos = parse_html_block_with_wrapper(
3833            &mut builder,
3834            &lines,
3835            0,
3836            block_type,
3837            0,
3838            SyntaxKind::HTML_BLOCK_DIV,
3839            &opts,
3840        );
3841
3842        // 8 lines: open-line 0, open-line 1 (`  id="x"`), open-line 2
3843        // (`  class="y"`), open-line 3 (`>`), blank, foo, blank, </div>.
3844        assert_eq!(new_pos, 8);
3845
3846        // CST must contain a structural HTML_ATTRS region holding the
3847        // attribute bytes (so the salsa anchor walk picks up `id="x"`).
3848        let green = builder.finish();
3849        let root = crate::syntax::SyntaxNode::new_root(green);
3850        let attrs_count = root
3851            .descendants()
3852            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3853            .count();
3854        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3855
3856        // Byte-identical losslessness check.
3857        let collected: String = root
3858            .descendants_with_tokens()
3859            .filter_map(|n| n.into_token())
3860            .map(|t| t.text().to_string())
3861            .collect();
3862        assert_eq!(collected, input);
3863    }
3864
3865    #[test]
3866    fn test_parse_div_block_multiline_open_close_inline_pandoc() {
3867        // Multi-line open tag with the closing `>` on the last attribute
3868        // line (case 0262 already covers this pattern; pin behavior to
3869        // also ensure HTML_ATTRS structural exposure).
3870        let input = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
3871        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3872        let mut builder = GreenNodeBuilder::new();
3873
3874        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3875        let opts = ParserOptions::default();
3876        let new_pos = parse_html_block_with_wrapper(
3877            &mut builder,
3878            &lines,
3879            0,
3880            block_type,
3881            0,
3882            SyntaxKind::HTML_BLOCK_DIV,
3883            &opts,
3884        );
3885
3886        // 5 lines: open-line 0, open-line 1, open-line 2 (with `>`), foo,
3887        // </div>.
3888        assert_eq!(new_pos, 5);
3889
3890        let green = builder.finish();
3891        let root = crate::syntax::SyntaxNode::new_root(green);
3892        let attrs_count = root
3893            .descendants()
3894            .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3895            .count();
3896        assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3897
3898        let collected: String = root
3899            .descendants_with_tokens()
3900            .filter_map(|n| n.into_token())
3901            .map(|t| t.text().to_string())
3902            .collect();
3903        assert_eq!(collected, input);
3904    }
3905
3906    #[test]
3907    fn test_commonmark_type6_blank_line_terminates() {
3908        let input = "<div>\nfoo\n\nbar\n";
3909        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3910        let mut builder = GreenNodeBuilder::new();
3911
3912        let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3913        let opts = ParserOptions::default();
3914        let new_pos = parse_html_block_with_wrapper(
3915            &mut builder,
3916            &lines,
3917            0,
3918            block_type,
3919            0,
3920            SyntaxKind::HTML_BLOCK,
3921            &opts,
3922        );
3923
3924        // Block contains <div>\nfoo\n; stops at blank line (line 2).
3925        assert_eq!(new_pos, 2);
3926    }
3927}