Skip to main content

panache_parser/parser/blocks/
html_blocks.rs

1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
10
/// Tag names that open a CommonMark §4.6 type-6 HTML block when they
/// appear at the start of a line.
///
/// Entries are lowercase ASCII and kept in alphabetical order (one row
/// per initial letter); lookups compare case-insensitively against this
/// lowercase form.
///
/// NOTE(review): the list contains `source` and has no `search`, which
/// appears to match CommonMark 0.30 — spec 0.31 is believed to swap the
/// two. Confirm which spec revision this parser targets.
#[rustfmt::skip]
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
77
/// Tags whose body is raw/verbatim text: no Markdown processing happens
/// between the open and close tag. These are the CommonMark §4.6 type-1
/// openers (`<pre`, `<script`, `<style`, `<textarea`). Entries are
/// lowercase ASCII.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
80
/// Strict block-level tags under the Pandoc dialect — a mirror of
/// pandoc's `blockHtmlTags`
/// (`pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`).
///
/// This set is narrower than CommonMark §4.6 type-6 ([`BLOCK_TAGS`]):
/// - it leaves out several CM type-6 tags that pandoc treats as raw
///   inline HTML (e.g. `dialog`, `legend`, `optgroup`, `option`,
///   `frame`, `link`, `param`, `base`, `basefont`, `menuitem`);
/// - it adds a few tags pandoc keeps block-level (`canvas`, `hgroup`,
///   `isindex`, `meta`, `output`).
///
/// Pandoc's `eitherBlockOrInline` tags (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`,
/// `svg`, `applet`, and the void elements `embed`, `area`, `source`,
/// `track`) are NOT listed here; they are tracked in
/// [`PANDOC_INLINE_BLOCK_TAGS`] and [`PANDOC_VOID_BLOCK_TAGS`]. Those
/// tags start a block only at fresh-block positions and stay inline
/// inside an existing HTML block (e.g.
/// `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` uses its `inline_pending` flag to keep
/// them inline once an inline-only tag or text byte has been seen since
/// the last splitter.
///
/// Entries are lowercase ASCII, alphabetical, one row per initial letter.
#[rustfmt::skip]
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "blockquote", "body",
    "canvas", "caption", "center", "col", "colgroup",
    "dd", "details", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "isindex",
    "li",
    "main", "menu", "meta",
    "nav", "noframes",
    "ol", "output",
    "p", "pre",
    "script", "section", "style", "summary",
    "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "tr",
    "ul",
];
158
159/// Whether `name` (case-insensitive) is one of the HTML block-level tags
160/// recognized by CommonMark §4.6 type-6.
161pub fn is_html_block_tag_name(name: &str) -> bool {
162    let lower = name.to_ascii_lowercase();
163    BLOCK_TAGS.contains(&lower.as_str())
164}
165
166/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
167/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
168/// Used by the pandoc-native projector's `split_html_block_by_tags` to
169/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
170/// the block — block-level tags emit as separate `RawBlock` entries;
171/// inline tags stay inline in the surrounding `Plain` content.
172pub fn is_pandoc_block_tag_name(name: &str) -> bool {
173    let lower = name.to_ascii_lowercase();
174    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
175}
176
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`).
///
/// Pandoc's `isBlockTag` accepts these tags as block starters, yet
/// `isInlineTag` accepts them too (because `name ∉ blockTags`). The
/// observable effect: at top level, or after a blank line,
/// `<iframe>foo</iframe>` becomes RawBlock + Plain + RawBlock, whereas
/// inside an existing HTML block — once a paragraph has started
/// parsing — the very same tag stays inline as a `RawInline`.
///
/// The projector's `split_html_block_by_tags` reproduces this through
/// its `inline_pending` flag: strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split, while the tags listed here split only when no inline
/// content has been buffered since the last splitter.
///
/// Deliberately absent:
/// - the void elements `area`, `embed`, `source`, `track`, which live
///   in [`PANDOC_VOID_BLOCK_TAGS`]. They obey the same `inline_pending`
///   rule but emit one RawBlock per instance rather than a
///   matched-pair lift;
/// - `script`, which is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and is caught by the
///   strict-block check first regardless.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
201
202/// Whether `name` (case-insensitive) is one of pandoc's
203/// `eitherBlockOrInline` tags (excluding void elements and `script`;
204/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
205pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
206    let lower = name.to_ascii_lowercase();
207    PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
208}
209
/// Void-element members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): `area`,
/// `embed`, `source` and `track`. Other HTML void elements (`br`,
/// `wbr`, `img`, `input`) are inline-only and therefore not listed.
///
/// Behaviour being mirrored:
/// - at a fresh-block position (or after a blank line) pandoc emits
///   each of these as a single `RawBlock`;
/// - inside a running paragraph they stay inline as `RawInline`.
///
/// Because a void element has no closing tag to match, the parser opens
/// a depth-zero HTML block that closes immediately on the open-tag
/// line, so subsequent lines start fresh blocks. Same-line splitting is
/// left to the projector's `split_html_block_by_tags`, which uses
/// `inline_pending` and emits one `RawBlock` per void-tag instance.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
225
226/// Whether `name` (case-insensitive) is one of pandoc's void
227/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
228pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
229    let lower = name.to_ascii_lowercase();
230    PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
231}
232
233/// Whether the given tag name is eligible for the Phase 6 / Fix #4
234/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
235/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
236/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
237/// and NOT void. These are the tags where pandoc parses the body as
238/// fresh markdown between RawBlock emissions of the open/close tags —
239/// exactly the shape we can lift into structural CST children.
240///
241/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
242/// additional gate at the lift-gate site: the lift is abandoned when
243/// the body's first non-blank content is a void block tag at a
244/// fresh-block position (`<video>\n<source ...>\n</video>` projects
245/// per-tag rather than matched-pair, mirroring pandoc).
246///
247/// `<div>` is intentionally excluded — it has its own lift path
248/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
249/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
250fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
251    let lower = name.to_ascii_lowercase();
252    if VERBATIM_TAGS.contains(&lower.as_str()) {
253        return false;
254    }
255    if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
256        return false;
257    }
258    if lower == "div" {
259        return false;
260    }
261    PANDOC_BLOCK_TAGS.contains(&lower.as_str())
262        || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
263}
264
265/// Open-tag-attribute tokenization gate for non-div strict-block tags
266/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
267/// open tag is eligible for finer-grained tokenization
268/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
269/// driving the full body lift — that's the `bq_clean_lift` path. The
270/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
271/// the salsa anchor index.
272///
273/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
274/// wrapper) regardless of bq depth, so this gate skips it.
275fn bq_strict_attr_emit_tag_name(
276    wrapper_kind: SyntaxKind,
277    block_type: &HtmlBlockType,
278    bq_depth: usize,
279) -> Option<&str> {
280    if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
281        return None;
282    }
283    match block_type {
284        HtmlBlockType::BlockTag {
285            tag_name,
286            is_verbatim: false,
287            closed_by_blank_line: false,
288            depth_aware: true,
289            closes_at_open_tag: false,
290            is_closing: false,
291        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
292        _ => None,
293    }
294}
295
/// Classification of a detected HTML block opening.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: `<!-- ... -->`.
    Comment,
    /// Processing instruction: `<? ... ?>`.
    ProcessingInstruction,
    /// Declaration: `<!...>`.
    Declaration,
    /// CDATA section: `<![CDATA[ ... ]]>`.
    CData,
    /// Block opened by a recognized tag (CommonMark types 6/1, or one
    /// of the Pandoc-dialect tag categories). The flags describe how
    /// the end of the block is located.
    BlockTag {
        /// Lowercased tag name found at the start position.
        tag_name: String,
        /// Whether the tag was classified as a verbatim/raw-content tag
        /// (`VERBATIM_TAGS`).
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics: the block ends at
        /// the next blank line. When `false`, the legacy
        /// close-tag-driven semantics apply (the block ends at a
        /// `</tag>` line).
        closed_by_blank_line: bool,
        /// Extend the close-tag path with balanced open/close tracking
        /// of the same tag name (mirrors pandoc's `htmlInBalanced`).
        /// Used under Pandoc dialect so nested
        /// `<div>...<div>...</div>...</div>` shapes close correctly.
        /// Ignored when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuit the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`), which have no
        /// closing tag — depth-aware matching would walk to
        /// end-of-input.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror
        /// pandoc's `isInlineTag` special cases (e.g. `</script>` is
        /// inline even when `<script>` is not — pandoc treats the
        /// close-form as always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: a complete open or close tag on a line
    /// by itself, with a tag name outside the type-1 verbatim list. The
    /// block ends at a blank line and cannot interrupt a paragraph.
    Type7,
}
340
341/// Try to detect an HTML block opening from content.
342/// Returns block type if this is a valid HTML block start.
343///
344/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
345/// accept closing tags (`</div>`), type-6 blocks end at the next blank
346/// line (rather than a matching close tag), and type 7 is recognized.
347pub(crate) fn try_parse_html_block_start(
348    content: &str,
349    is_commonmark: bool,
350) -> Option<HtmlBlockType> {
351    let trimmed = strip_leading_spaces(content);
352
353    // Must start with <
354    if !trimmed.starts_with('<') {
355        return None;
356    }
357
358    // HTML comment
359    if trimmed.starts_with("<!--") {
360        return Some(HtmlBlockType::Comment);
361    }
362
363    // Processing instruction
364    if trimmed.starts_with("<?") {
365        return Some(HtmlBlockType::ProcessingInstruction);
366    }
367
368    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
369    // recognize bare CDATA as a raw HTML block; the literal bytes fall
370    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
371    // text is parsed as inline markdown, etc).
372    if is_commonmark && trimmed.starts_with("<![CDATA[") {
373        return Some(HtmlBlockType::CData);
374    }
375
376    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
377    // does not recognize bare declarations as raw HTML blocks (its
378    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
379    // tag-shaped blocks); the bytes fall through to paragraph parsing.
380    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
381        let after_bang = &trimmed[2..];
382        if after_bang.chars().next()?.is_ascii_alphabetic() {
383            return Some(HtmlBlockType::Declaration);
384        }
385    }
386
387    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
388    // Pandoc-native recognizes standalone closing forms of strict-block tags
389    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
390    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
391    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
392    // the open-tag line via `closes_at_open_tag: true`.
393    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
394        let tag_lower = tag_name.to_lowercase();
395        let is_closing = trimmed.starts_with("</");
396
397        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
398        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
399        // Unlike inline-block / void closes, these CAN interrupt a running
400        // paragraph (the dispatcher's `cannot_interrupt` only covers the
401        // inline-block / void categories). Inline-block / void closes are
402        // handled by their own branches further below.
403        if !is_commonmark
404            && is_closing
405            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
406                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
407            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
408            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
409        {
410            return Some(HtmlBlockType::BlockTag {
411                tag_name: tag_lower,
412                is_verbatim: false,
413                closed_by_blank_line: false,
414                depth_aware: false,
415                closes_at_open_tag: true,
416                is_closing: true,
417            });
418        }
419
420        // Under Pandoc, remaining closing forms (truly inline-only tags like
421        // `</em>`, `</span>`) are not block starts — fall through to the
422        // existing inline-html path. Inline-block + void closes are caught
423        // by the dedicated branches further below.
424        if !is_commonmark
425            && is_closing
426            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
427            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
428        {
429            return None;
430        }
431
432        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
433        // membership: pandoc's `blockHtmlTags` (see
434        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
435        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
436        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
437        let is_block_tag = if is_commonmark {
438            BLOCK_TAGS.contains(&tag_lower.as_str())
439        } else {
440            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
441        };
442        if is_block_tag {
443            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
444            return Some(HtmlBlockType::BlockTag {
445                tag_name: tag_lower,
446                is_verbatim,
447                closed_by_blank_line: is_commonmark && !is_verbatim,
448                depth_aware: !is_commonmark,
449                closes_at_open_tag: false,
450                is_closing,
451            });
452        }
453
454        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
455        // starters at fresh-block positions. The block dispatcher caller
456        // gates these as `cannot_interrupt` (mirrors pandoc — they never
457        // interrupt a running paragraph; only start a fresh block when
458        // following a blank line or at document start). Closing forms
459        // (`</video>`) emit as a single-line `RawBlock` with no balanced
460        // match — pandoc-native pins this for standalone closes.
461        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
462            return Some(HtmlBlockType::BlockTag {
463                tag_name: tag_lower,
464                is_verbatim: false,
465                closed_by_blank_line: false,
466                depth_aware: !is_closing,
467                closes_at_open_tag: is_closing,
468                is_closing,
469            });
470        }
471
472        // Pandoc dialect also recognizes the void subset of
473        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
474        // These have no closing tag, so the parser closes the block
475        // immediately on the open-tag line; the projector's
476        // `split_html_block_by_tags` handles the same-line splitting
477        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
478        // non-void inline-block tags, void tags never interrupt a
479        // running paragraph (gated as `cannot_interrupt` in the
480        // dispatcher). Closing forms (`</embed>`) — semantically
481        // nonsensical for void elements — pandoc still emits as a
482        // single-line `RawBlock`; mirror that.
483        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
484            return Some(HtmlBlockType::BlockTag {
485                tag_name: tag_lower,
486                is_verbatim: false,
487                closed_by_blank_line: false,
488                depth_aware: false,
489                closes_at_open_tag: true,
490                is_closing,
491            });
492        }
493
494        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
495        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
496        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
497        // do not start a type-1 block. Letting `</pre>` through here would
498        // wrongly interrupt a paragraph.
499        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
500            return Some(HtmlBlockType::BlockTag {
501                tag_name: tag_lower,
502                is_verbatim: true,
503                closed_by_blank_line: false,
504                depth_aware: !is_commonmark,
505                closes_at_open_tag: false,
506                is_closing: false,
507            });
508        }
509    }
510
511    // Type 7 (CommonMark only): complete open or close tag on a line by
512    // itself, tag name not in the type-1 verbatim list.
513    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
514    {
515        let rest = &trimmed[end..];
516        let only_ws = rest
517            .bytes()
518            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
519        if only_ws {
520            // Reject if the tag name belongs to the type-1 verbatim set
521            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
522            // type-1 starts above, so seeing one here means the opener
523            // had a different shape (e.g. `<pre/>` self-closing) that
524            // shouldn't trigger type 7 either. Conservatively skip.
525            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
526            let name_end = leading
527                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
528                .unwrap_or(leading.len());
529            let name = leading[..name_end].to_ascii_lowercase();
530            if !VERBATIM_TAGS.contains(&name.as_str()) {
531                return Some(HtmlBlockType::Type7);
532            }
533        }
534    }
535
536    None
537}
538
/// Pull the tag name out of a candidate HTML-block-start line.
///
/// `text` must begin with `<`. Opening forms (`<tag>`) are always
/// accepted; closing forms (`</tag>`) are accepted only when
/// `accept_closing` is true (CommonMark §4.6 type 6 allows either).
///
/// Per the spec the name must be followed by a space, tab, line ending,
/// `>` or `/>`; this is approximated by cutting the name at the first
/// whitespace, `>` or `/` character (or at end of input).
///
/// Returns the name exactly as written (case preserved), or `None` when
/// the name is empty, does not start with an ASCII letter, or contains
/// anything other than ASCII alphanumerics.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    // Step over the `/` of a closing form, if the caller allows one.
    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(rest) => rest,
        None => after_bracket,
    };

    // The name runs up to the first boundary character.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()?;

    // An empty name yields no first character and bails out via `?`.
    let mut rest = name.chars();
    let starts_with_letter = rest.next()?.is_ascii_alphabetic();
    if starts_with_letter && rest.all(|c| c.is_ascii_alphanumeric()) {
        Some(name.to_string())
    } else {
        None
    }
}
583
584/// Whether this block type ends at a blank line (CommonMark types 6 & 7
585/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
586/// marker — only at end of input or the next blank line.
587fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
588    matches!(
589        block_type,
590        HtmlBlockType::Type7
591            | HtmlBlockType::BlockTag {
592                closed_by_blank_line: true,
593                ..
594            }
595    )
596}
597
598/// Check if a line contains the closing marker for the given HTML block type.
599/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
600/// blank-line-terminated types (6 in CommonMark, 7) never match here.
601fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
602    match block_type {
603        HtmlBlockType::Comment => line.contains("-->"),
604        HtmlBlockType::ProcessingInstruction => line.contains("?>"),
605        HtmlBlockType::Declaration => line.contains('>'),
606        HtmlBlockType::CData => line.contains("]]>"),
607        HtmlBlockType::BlockTag {
608            tag_name,
609            closed_by_blank_line: false,
610            ..
611        } => {
612            // Look for closing tag </tagname>
613            let closing_tag = format!("</{}>", tag_name);
614            line.to_lowercase().contains(&closing_tag)
615        }
616        HtmlBlockType::BlockTag {
617            closed_by_blank_line: true,
618            ..
619        }
620        | HtmlBlockType::Type7 => false,
621    }
622}
623
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close)
/// in `line`, returned as `(opens, closes)`.
///
/// The scanner walks `<...>` brackets and respects `"`/`'` quoting, so:
/// - self-closing forms (`<tag .../>`) are NOT counted;
/// - a tag whose name appears inside a quoted attribute value is NOT
///   counted;
/// - a longer name sharing the prefix (`<divider>` vs `div`) is NOT
///   counted — the name must be followed by whitespace, `>`, `/` or
///   end of line;
/// - an open tag whose `>` is missing on this line (`<div` at end of
///   line, as in a multi-line open tag) IS counted, after which
///   scanning stops.
///
/// A `<` that cannot start markup — one not followed by an ASCII
/// letter, `/`, `!` or `?` — is treated as literal text and skipped on
/// its own. Without this, a stray `<` (`a < b </div>`) would open a
/// bracket scan that swallows the next real tag and loses its count.
///
/// Tag-name matching is ASCII-case-insensitive and allocation-free.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`).
fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let tag = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;

        // A `<` followed by anything other than a letter, `/`, `!` or
        // `?` (including end of line) is literal text, not markup.
        let starts_markup = matches!(
            bytes.get(after).copied(),
            Some(b'/' | b'!' | b'?' | b'a'..=b'z' | b'A'..=b'Z')
        );
        if !starts_markup {
            i = after;
            continue;
        }

        let is_close = bytes[after] == b'/';
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag.len();
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag));
        // The name must end here; otherwise it is a longer tag name.
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if matched && is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to count.
            break;
        }
    }

    (opens, closes)
}
701
702/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
703/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
704/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
705/// identical to the source either way; only the wrapper retag changes.
706pub(crate) fn parse_html_block_with_wrapper(
707    builder: &mut GreenNodeBuilder<'static>,
708    lines: &[&str],
709    start_pos: usize,
710    block_type: HtmlBlockType,
711    bq_depth: usize,
712    wrapper_kind: SyntaxKind,
713    config: &ParserOptions,
714) -> usize {
715    // Start HTML block
716    builder.start_node(wrapper_kind.into());
717
718    let first_line = lines[start_pos];
719    let blank_terminated = ends_at_blank_line(&block_type);
720
721    // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
722    // tokens for the first line's blockquote prefix; emit only the inner
723    // content as TEXT to keep the CST byte-equal to the source.
724    let first_inner = if bq_depth > 0 {
725        strip_n_blockquote_markers(first_line, bq_depth)
726    } else {
727        first_line
728    };
729
730    // Detect a multi-line open tag.
731    // - `<div>` (Pandoc lift): we tokenize each line structurally so the
732    //   salsa anchor walk picks up `id` from the HTML_ATTRS region.
733    // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
734    //   `<section>`, `<header>`, …): same structural emission, exposing
735    //   `id` to the salsa anchor walk and enabling the body lift below.
736    // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
737    //   without this, the parser closes the block after line 0 and the
738    //   remainder of the open tag falls into following paragraphs;
739    //   pandoc-native treats the whole multi-line open tag as a single
740    //   `RawBlock`. Emission for void tags uses simple per-line
741    //   TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
742    //   from void tags).
743    let multiline_open_end = if bq_depth == 0 {
744        match (wrapper_kind, &block_type) {
745            (SyntaxKind::HTML_BLOCK_DIV, _) => {
746                find_multiline_open_end(lines, start_pos, first_inner, "div")
747            }
748            (
749                _,
750                HtmlBlockType::BlockTag {
751                    tag_name,
752                    closes_at_open_tag: true,
753                    ..
754                },
755            ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name),
756            (
757                _,
758                HtmlBlockType::BlockTag {
759                    tag_name,
760                    is_verbatim: false,
761                    closed_by_blank_line: false,
762                    depth_aware: true,
763                    closes_at_open_tag: false,
764                    is_closing: false,
765                },
766            ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
767                find_multiline_open_end(lines, start_pos, first_inner, tag_name)
768            }
769            _ => None,
770        }
771    } else {
772        None
773    };
774
775    // Set up depth-aware close tracking when the block type asks for it
776    // (Pandoc dialect, balanced same-name tag matching). A `None` means
777    // we fall back to the legacy "first matching close" path via
778    // `is_closing_marker`. Computed up front so the lift-mode gate
779    // below can decide whether the open line already balances the
780    // block (same-line `<div>...</div>`).
781    let depth_aware_tag: Option<String> = match &block_type {
782        HtmlBlockType::BlockTag {
783            tag_name,
784            closed_by_blank_line: false,
785            depth_aware: true,
786            ..
787        } => Some(tag_name.clone()),
788        _ => None,
789    };
790    let mut depth: i64 = 1;
791    if let Some(tag_name) = &depth_aware_tag {
792        // Sum opens/closes across all open-tag lines (single-line: just
793        // line 0; multi-line: lines 0..=end_line_idx).
794        let last_open_line = multiline_open_end.unwrap_or(start_pos);
795        let mut opens = 0usize;
796        let mut closes = 0usize;
797        for line in &lines[start_pos..=last_open_line] {
798            let inner = if bq_depth > 0 {
799                strip_n_blockquote_markers(line, bq_depth)
800            } else {
801                line
802            };
803            let (o, c) = count_tag_balance(inner, tag_name);
804            opens += o;
805            closes += c;
806        }
807        depth = opens as i64 - closes as i64;
808    }
809
810    // Same-line `<div>foo</div>` shape: the open line balances the
811    // block under depth-aware tracking. We can lift this structurally
812    // only when the open-tag trailing has exactly one `</div>` close,
813    // zero `<div>` opens, and no non-whitespace content after the
814    // close. Other same-line shapes (nested, trailing text, malformed)
815    // fall through to the byte-reparse path.
816    let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
817        && multiline_open_end.is_none()
818        && depth_aware_tag.is_some()
819        && depth <= 0;
820    let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
821        let (line_without_newline, _) = strip_newline(first_inner);
822        probe_same_line_lift(line_without_newline, "div")
823    };
824
825    // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
826    // `<nav>`, …): the body parses as fresh markdown between RawBlock
827    // emissions of the open/close tags. Covers the clean multi-line
828    // shape (open tag stands alone on its line), open-trailing
829    // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
830    // and same-line (`<form>foo</form>`). Multi-line open and
831    // blockquote-wrapped non-div shapes still fall through to the
832    // byte-walker path.
833    let strict_block_tag_name: Option<&str> =
834        if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
835            match &block_type {
836                HtmlBlockType::BlockTag {
837                    tag_name,
838                    is_verbatim: false,
839                    closed_by_blank_line: false,
840                    depth_aware: true,
841                    closes_at_open_tag: false,
842                    is_closing: false,
843                } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
844                _ => None,
845            }
846        } else {
847            None
848        };
849    // Same-line `<form>foo</form>` shape: the open line already
850    // balances the block (`depth <= 0`). Lift only when the trailing
851    // bytes after the open `>` end with `</tag>` and contain exactly
852    // one close + zero nested opens.
853    let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
854        multiline_open_end.is_none() && depth <= 0 && {
855            let (line_no_nl, _) = strip_newline(first_inner);
856            probe_same_line_lift(line_no_nl, name)
857        }
858    });
859    // Strict-block lift gate: accept (a) a multi-line open tag spanning
860    // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
861    // trailing single-line open (depth > 0, open `>` is present with
862    // quote-aware matching), or (c) a safe same-line shape. For
863    // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
864    // …) the lift additionally abandons when the body starts at a
865    // fresh-block position with a void block tag — pandoc-native pins
866    // per-tag emission rather than a matched-pair lift in that case.
867    let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
868        let (line_no_nl, _) = strip_newline(first_inner);
869        let shape_ok = if multiline_open_end.is_some() {
870            // `find_multiline_open_end` already verified the open tag
871            // closes with a quote-aware `>` somewhere in lines
872            // `start_pos+1..=end`. No same-line trailing content to
873            // probe; defer trailing-on-close-`>`-line handling to a
874            // future session (rare in practice).
875            true
876        } else if depth > 0 {
877            probe_open_tag_line_has_close_gt(line_no_nl, name)
878        } else {
879            same_line_strict_lift_safe
880        };
881        if !shape_ok {
882            return false;
883        }
884        if !is_pandoc_inline_block_tag_name(name) {
885            return true;
886        }
887        !inline_block_void_interior_abandons(
888            first_inner,
889            lines,
890            start_pos,
891            multiline_open_end,
892            bq_depth,
893            name,
894        )
895    });
896
897    // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
898    // are byte-equal to the non-bq same-line shape minus the leading
899    // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
900    // The body has no inner newlines, so no bq prefix re-injection is
901    // needed when grafting — `emit_html_block_body_lifted` (passing
902    // `bq: &mut None`) is enough. Other bq shapes (butted-close,
903    // open-trailing) still fall through to the projector's byte
904    // walker — they need per-line prefix injection.
905    let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
906        && multiline_open_end.is_none()
907        && depth_aware_tag.is_some()
908        && depth <= 0
909    {
910        let (line_no_nl, _) = strip_newline(first_inner);
911        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
912            if probe_same_line_lift(line_no_nl, "div") {
913                Some("div")
914            } else {
915                None
916            }
917        } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
918            match &block_type {
919                HtmlBlockType::BlockTag {
920                    tag_name,
921                    is_verbatim: false,
922                    closed_by_blank_line: false,
923                    depth_aware: true,
924                    closes_at_open_tag: false,
925                    is_closing: false,
926                } if is_pandoc_lift_eligible_block_tag(tag_name)
927                    && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
928                {
929                    // Inline-block tags (`<video>`, `<iframe>`, …) skip
930                    // the void-interior check at same-line — the shape
931                    // has no inner block content to interfere with.
932                    Some(tag_name.as_str())
933                }
934                _ => None,
935            }
936        } else {
937            None
938        }
939    } else {
940        None
941    };
942
943    // Messy-shape lift inside a blockquote — covers open-trailing
944    // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
945    // and open-trailing + butted-close (`> <div>foo\n> bar</div>`).
946    // The open line does NOT balance the block (depth > 0 after the
947    // open line, distinguishing this from `same_line_bq_lift_tag` which
948    // requires depth <= 0). The close line — possibly with leading body
949    // text — closes the block when depth returns to 0. Body lines (incl.
950    // open trailing and close leading) graft via prefix re-injection.
951    let bq_messy_lift_tag: Option<&str> =
952        if bq_depth > 0 && multiline_open_end.is_none() && depth_aware_tag.is_some() && depth > 0 {
953            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
954                Some("div")
955            } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
956                match &block_type {
957                    HtmlBlockType::BlockTag {
958                        tag_name,
959                        is_verbatim: false,
960                        closed_by_blank_line: false,
961                        depth_aware: true,
962                        closes_at_open_tag: false,
963                        is_closing: false,
964                    } if is_pandoc_lift_eligible_block_tag(tag_name) => {
965                        // Inline-block matched-pair tags (`<video>`, `<iframe>`,
966                        // …) abandon the lift when the body starts at a
967                        // fresh-block position with a void block tag. Same gate
968                        // as the non-bq matched-pair lift (`strict_block_lift`).
969                        if is_pandoc_inline_block_tag_name(tag_name)
970                            && inline_block_void_interior_abandons(
971                                first_inner,
972                                lines,
973                                start_pos,
974                                multiline_open_end,
975                                bq_depth,
976                                tag_name,
977                            )
978                        {
979                            None
980                        } else {
981                            Some(tag_name.as_str())
982                        }
983                    }
984                    _ => None,
985                }
986            } else {
987                None
988            }
989        } else {
990            None
991        };
992
993    // Whether this block participates in the Phase 6 structural lift
994    // (recursively parse body as Pandoc markdown and graft children).
995    // Covers `<div>` outside blockquote context. For same-line shapes
996    // the lift is gated on `same_line_*_lift_safe` — when unsafe we
997    // keep the legacy single-HTML_BLOCK_TAG shape and let the
998    // byte-reparse path handle projection.
999    let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1000        && bq_depth == 0
1001        && (!is_same_line_div || same_line_div_lift_safe))
1002        || strict_block_lift
1003        || same_line_bq_lift_tag.is_some()
1004        || bq_messy_lift_tag.is_some();
1005
1006    // Trailing content from the open tag (after `>`). When the lift is
1007    // active and the open line is `<div ATTRS>foo\n`, this captures
1008    // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1009    // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1010    // shapes (same-line / blockquote-wrapped).
1011    let mut pre_content = String::new();
1012
1013    // Emit opening line(s)
1014    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1015
1016    if let Some(end_line_idx) = multiline_open_end {
1017        if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1018            emit_multiline_open_tag_with_attrs(builder, lines, start_pos, end_line_idx, "div");
1019        } else if let Some(name) = strict_block_tag_name
1020            && strict_block_lift
1021        {
1022            emit_multiline_open_tag_with_attrs(builder, lines, start_pos, end_line_idx, name);
1023        } else {
1024            emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx);
1025        }
1026    } else {
1027        let (line_without_newline, newline_str) = strip_newline(first_inner);
1028        if !line_without_newline.is_empty() {
1029            // For HTML_BLOCK_DIV, expose the open tag's attributes
1030            // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1031            // via the same descendants walk that handles fenced-div /
1032            // heading attrs. CST bytes stay byte-equal to source — we only
1033            // tokenize at finer granularity for matched div opens.
1034            if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1035                let trailing =
1036                    emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1037                if !trailing.is_empty() {
1038                    pre_content.push_str(trailing);
1039                    pre_content.push_str(newline_str);
1040                }
1041            } else if let Some(name) = strict_block_tag_name
1042                && strict_block_lift
1043            {
1044                let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1045                if !trailing.is_empty() {
1046                    pre_content.push_str(trailing);
1047                    pre_content.push_str(newline_str);
1048                }
1049            } else if let Some(name) =
1050                bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1051            {
1052                // Inside a blockquote, lift trailing bytes into
1053                // `pre_content` when either the same-line bq gate fires
1054                // (`> <tag>body</tag>` — handled by `same_line_closed`)
1055                // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1056                // </tag>` and butted-close — handled at the close-marker
1057                // site below). For the clean-shape bq lift the open has
1058                // no trailing bytes regardless, so `lift_trailing=true`
1059                // is a no-op there.
1060                let lift_trailing =
1061                    same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1062                let trailing =
1063                    emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1064                if lift_trailing && !trailing.is_empty() {
1065                    pre_content.push_str(trailing);
1066                    pre_content.push_str(newline_str);
1067                }
1068            } else {
1069                builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1070            }
1071        }
1072        // When the open tag has trailing content under lift mode, the
1073        // newline belongs to that trailing line (it terminates the
1074        // synthetic body line, not the open tag). Don't double-emit.
1075        if pre_content.is_empty() && !newline_str.is_empty() {
1076            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1077        }
1078    }
1079
1080    builder.finish_node(); // HtmlBlockTag
1081
1082    // Check if opening line also contains closing marker. Blank-line-terminated
1083    // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1084    // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1085    // (`closes_at_open_tag: true`) close immediately — the block always
1086    // ends on the open-tag line since there is no closing tag to find.
1087    let void_block = matches!(
1088        &block_type,
1089        HtmlBlockType::BlockTag {
1090            closes_at_open_tag: true,
1091            ..
1092        }
1093    );
1094    // Void tags with a multi-line open close immediately after the open
1095    // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1096    // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1097    // a single RawBlock for the whole multi-line tag, with no following
1098    // content.
1099    if void_block && let Some(end_line_idx) = multiline_open_end {
1100        log::trace!(
1101            "HTML void block at line {} closes after multi-line open ending at line {}",
1102            start_pos + 1,
1103            end_line_idx + 1
1104        );
1105        builder.finish_node(); // HtmlBlock
1106        return end_line_idx + 1;
1107    }
1108    let same_line_closed = !blank_terminated
1109        && multiline_open_end.is_none()
1110        && (void_block
1111            || match &depth_aware_tag {
1112                Some(_) => depth <= 0,
1113                None => is_closing_marker(first_inner, &block_type),
1114            });
1115    if same_line_closed {
1116        log::trace!(
1117            "HTML block at line {} opens and closes on same line",
1118            start_pos + 1
1119        );
1120        // Same-line structural lift (div or non-div strict-block):
1121        // pre_content holds the bytes after the open `>` (including
1122        // the close `</tag>` and the trailing newline). Split into
1123        // body + close tag, emit body via recursive parse, emit close
1124        // tag as a sibling `HTML_BLOCK_TAG`.
1125        let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1126            None
1127        } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1128            Some("div")
1129        } else if same_line_strict_lift_safe {
1130            strict_block_tag_name
1131        } else if let Some(name) = same_line_bq_lift_tag {
1132            // Bq same-line: body has no inner newlines so the standard
1133            // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1134            // sufficient. The bq prefix `> ` lives on the outer
1135            // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1136            Some(name)
1137        } else {
1138            None
1139        };
1140        if let Some(tag_name) = same_line_lift_tag {
1141            let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1142            if let Some((leading, close_part)) = try_split_close_line(pre_no_nl, tag_name) {
1143                // Same-line is always close-butted; div demotes the
1144                // trailing Para→Plain via `SkipTrailingBlanks`.
1145                // Non-div strict-block uses `OnlyIfLast` (consistent
1146                // with butted-close — no trailing BLANK_LINE before
1147                // the close means the trailing Para demotes).
1148                let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1149                    LastParaDemote::SkipTrailingBlanks
1150                } else {
1151                    LastParaDemote::OnlyIfLast
1152                };
1153                emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1154                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1155                let mut close_line = String::with_capacity(close_part.len() + post_nl.len());
1156                close_line.push_str(close_part);
1157                close_line.push_str(post_nl);
1158                emit_html_block_line(builder, &close_line, 0);
1159                builder.finish_node();
1160                builder.finish_node(); // HtmlBlock
1161                return start_pos + 1;
1162            }
1163        }
1164        builder.finish_node(); // HtmlBlock
1165        return start_pos + 1;
1166    }
1167
1168    let mut current_pos = multiline_open_end
1169        .map(|end| end + 1)
1170        .unwrap_or(start_pos + 1);
1171    let mut content_lines: Vec<&str> = Vec::new();
1172    let mut found_closing = false;
1173
1174    // Parse content until we find the closing marker
1175    while current_pos < lines.len() {
1176        let line = lines[current_pos];
1177        let (line_bq_depth, inner) = count_blockquote_markers(line);
1178
1179        // Only process lines at the same or deeper blockquote depth
1180        if line_bq_depth < bq_depth {
1181            break;
1182        }
1183
1184        // Blank-line-terminated blocks (types 6/7) end before the blank line.
1185        // The blank line itself is not part of the block.
1186        if blank_terminated && inner.trim().is_empty() {
1187            break;
1188        }
1189
1190        // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1191        // count opens/closes of the same tag name and only close when depth
1192        // returns to 0; otherwise fall back to substring-match on the line.
1193        let line_closes = match &depth_aware_tag {
1194            Some(tag_name) => {
1195                let (opens, closes) = count_tag_balance(inner, tag_name);
1196                depth += opens as i64;
1197                depth -= closes as i64;
1198                depth <= 0
1199            }
1200            None => is_closing_marker(inner, &block_type),
1201        };
1202
1203        if line_closes {
1204            log::trace!("Found HTML block closing at line {}", current_pos + 1);
1205            found_closing = true;
1206
1207            // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1208            // the open and close tags stand alone on their source lines
1209            // (no trailing on open, no body content on close after
1210            // stripping bq markers), lift the body lines structurally
1211            // so the projector walks CST children instead of
1212            // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1213            //
1214            // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1215            // grafted, Para preserved), non-div strict-block tags
1216            // (`<form>`, `<section>`, …) and inline-block matched-pair
1217            // tags (`<video>`, `<iframe>`, …) — the latter two under
1218            // HTML_BLOCK with the structural lift hitting pandoc's
1219            // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1220            // demotion. Inline-block additionally bails if the body
1221            // starts at a fresh-block position with a void block tag
1222            // (mirrors the non-bq matched-pair gate).
1223            //
1224            // Other bq-wrapped shapes (butted-close / open-trailing /
1225            // same-line) still fall through to the opaque path.
1226            let bq_lift_tag: Option<&str> = if bq_depth > 0
1227                && multiline_open_end.is_none()
1228                && pre_content.is_empty()
1229            {
1230                if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1231                    Some("div")
1232                } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1233                    match &block_type {
1234                        HtmlBlockType::BlockTag {
1235                            tag_name,
1236                            is_verbatim: false,
1237                            closed_by_blank_line: false,
1238                            depth_aware: true,
1239                            closes_at_open_tag: false,
1240                            is_closing: false,
1241                        } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1242                        _ => None,
1243                    }
1244                } else {
1245                    None
1246                }
1247            } else {
1248                None
1249            };
1250
1251            let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1252                let (open_no_nl, _) = strip_newline(first_inner);
1253                if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1254                    return false;
1255                }
1256                let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1257                let (close_no_nl, _) = strip_newline(close_stripped);
1258                if !close_no_nl
1259                    .trim_start_matches([' ', '\t'])
1260                    .starts_with("</")
1261                {
1262                    return false;
1263                }
1264                if is_pandoc_inline_block_tag_name(tag_name)
1265                    && inline_block_void_interior_abandons(
1266                        first_inner,
1267                        lines,
1268                        start_pos,
1269                        multiline_open_end,
1270                        bq_depth,
1271                        tag_name,
1272                    )
1273                {
1274                    return false;
1275                }
1276                true
1277            });
1278
1279            if bq_clean_lift {
1280                let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1281                    LastParaDemote::Never
1282                } else {
1283                    LastParaDemote::OnlyIfLast
1284                };
1285                emit_html_block_body_lifted_bq(
1286                    builder,
1287                    &content_lines,
1288                    bq_depth,
1289                    demote_policy,
1290                    config,
1291                );
1292                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1293                emit_html_block_line(builder, line, bq_depth);
1294                builder.finish_node();
1295                current_pos += 1;
1296                break;
1297            }
1298
1299            // Bq messy-shape lift — single-line open with trailing or
1300            // butted-close (or both). `pre_content` already captures any
1301            // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1302            // strip the close line's bq markers before splitting so
1303            // `leading` and `close_part` are bq-prefix-free. Body parses
1304            // recursively from `pre_content + stripped(content_lines) +
1305            // leading`, with per-line bq prefixes re-injected so the CST
1306            // stays byte-equal to the source. Demote: div is keyed on
1307            // close-butted-ness (Plain when leading non-empty, Para
1308            // otherwise); non-div uses OnlyIfLast either way.
1309            if let Some(tag_name) = bq_messy_lift_tag {
1310                let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1311                let close_prefix_len = line.len() - close_stripped.len();
1312                let close_prefix = &line[..close_prefix_len];
1313                if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1314                {
1315                    let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1316                        if leading.is_empty() {
1317                            LastParaDemote::Never
1318                        } else {
1319                            LastParaDemote::SkipTrailingBlanks
1320                        }
1321                    } else {
1322                        LastParaDemote::OnlyIfLast
1323                    };
1324                    emit_html_block_body_lifted_bq_messy(
1325                        builder,
1326                        &pre_content,
1327                        &content_lines,
1328                        leading,
1329                        close_prefix,
1330                        bq_depth,
1331                        policy,
1332                        config,
1333                    );
1334                    builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1335                    // When `leading` is empty, no recursive-parse output carries
1336                    // the close line's bq prefix, so emit it here before the
1337                    // close tag. When `leading` is non-empty,
1338                    // `emit_html_block_body_lifted_bq_messy` already injected
1339                    // the prefix at the start of the leading bytes (via the
1340                    // BqPrefixState entry); emitting again would double the
1341                    // prefix bytes and break losslessness.
1342                    if leading.is_empty() {
1343                        emit_bq_prefix_tokens(builder, close_prefix);
1344                    }
1345                    emit_html_block_line(builder, close_part, 0);
1346                    builder.finish_node();
1347                    current_pos += 1;
1348                    break;
1349                }
1350            }
1351
1352            // Under lift mode, try to split the close line into a
1353            // leading "body content" prefix and a clean `</tag>...`
1354            // remainder. Lift only when the close line has exactly one
1355            // `</tag>` and no nested `<tag>` opens — depth-aware corner
1356            // cases (e.g. `<inner></inner></tag>` on the close line)
1357            // fall back to the non-lift path. For `<div>`, non-empty
1358            // `leading` propagates pandoc's `markdown_in_html_blocks`
1359            // Plain demotion rule. For non-div strict-block tags,
1360            // demotion follows pandoc's `OnlyIfLast` rule (demote the
1361            // trailing Para only when no blank line precedes the close).
1362            let close_split_tag = if lift_mode {
1363                if strict_block_lift {
1364                    strict_block_tag_name
1365                } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1366                    Some("div")
1367                } else {
1368                    None
1369                }
1370            } else {
1371                None
1372            };
1373            let close_split = close_split_tag.and_then(|name| try_split_close_line(line, name));
1374
1375            if let Some((leading, close_part)) = close_split {
1376                let policy = if strict_block_lift {
1377                    LastParaDemote::OnlyIfLast
1378                } else if !leading.is_empty() {
1379                    LastParaDemote::SkipTrailingBlanks
1380                } else {
1381                    LastParaDemote::Never
1382                };
1383                emit_html_block_body_lifted(
1384                    builder,
1385                    &pre_content,
1386                    &content_lines,
1387                    leading,
1388                    policy,
1389                    config,
1390                );
1391                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1392                emit_html_block_line(builder, close_part, 0);
1393                builder.finish_node();
1394            } else {
1395                emit_html_block_body(
1396                    builder,
1397                    &pre_content,
1398                    &content_lines,
1399                    bq_depth,
1400                    wrapper_kind,
1401                    lift_mode,
1402                    config,
1403                );
1404                builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1405                emit_html_block_line(builder, line, bq_depth);
1406                builder.finish_node();
1407            }
1408
1409            current_pos += 1;
1410            break;
1411        }
1412
1413        // Regular content line
1414        content_lines.push(line);
1415        current_pos += 1;
1416    }
1417
1418    // If we didn't find a closing marker, emit what we collected
1419    if !found_closing {
1420        log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1421        emit_html_block_body(
1422            builder,
1423            &pre_content,
1424            &content_lines,
1425            bq_depth,
1426            wrapper_kind,
1427            lift_mode,
1428            config,
1429        );
1430    }
1431
1432    builder.finish_node(); // HtmlBlock
1433    current_pos
1434}
1435
1436/// Emit the collected inner content lines for an HTML block.
1437///
1438/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1439/// line `<div>` open outside blockquote), recursively parse the inner
1440/// content (including any open-tag trailing) as Pandoc-flavored
1441/// markdown and graft the resulting top-level blocks as direct children
1442/// of the wrapper. This is the Phase 6 structural lift — the projector
1443/// and downstream consumers (linter, salsa, LSP) can walk the
1444/// structural children instead of re-tokenizing the body bytes.
1445///
1446/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1447/// blockquote, multi-line open, or no content at all — fall through to
1448/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1449///
1450/// CST bytes remain byte-identical to source: the recursive parser is
1451/// lossless on the same byte slice the legacy path would have captured
1452/// as TEXT.
1453fn emit_html_block_body(
1454    builder: &mut GreenNodeBuilder<'static>,
1455    pre_content: &str,
1456    content_lines: &[&str],
1457    bq_depth: usize,
1458    wrapper_kind: SyntaxKind,
1459    lift_mode: bool,
1460    config: &ParserOptions,
1461) {
1462    if pre_content.is_empty() && content_lines.is_empty() {
1463        return;
1464    }
1465    if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1466        // Reached when the parser walked to end-of-input without finding
1467        // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1468        emit_html_block_body_lifted(
1469            builder,
1470            pre_content,
1471            content_lines,
1472            "",
1473            LastParaDemote::Never,
1474            config,
1475        );
1476        return;
1477    }
1478    // Legacy path: opaque TEXT capture. `pre_content` is always empty
1479    // here (lift_mode is the only path that populates it), but be
1480    // defensive — if a trailing prefix snuck in, emit it as TEXT so
1481    // bytes are preserved.
1482    builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1483    if !pre_content.is_empty() {
1484        builder.token(SyntaxKind::TEXT.into(), pre_content);
1485    }
1486    for content_line in content_lines {
1487        emit_html_block_line(builder, content_line, bq_depth);
1488    }
1489    builder.finish_node();
1490}
1491
/// Policy deciding whether the final `PARAGRAPH` of a lifted HTML-block
/// body is retagged as `PLAIN` while its children are grafted.
#[derive(Debug, Clone, Copy)]
enum LastParaDemote {
    /// Leave every paragraph alone (pandoc keeps the trailing `Para`).
    Never,
    /// Look past trailing `BLANK_LINE` children and demote the
    /// paragraph found there, if the first non-blank node from the end
    /// is one. Intended for `<div>` shapes whose close tag is butted
    /// against the paragraph text on the same source line (pandoc's
    /// `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Demote only when the very last top-level node is a `PARAGRAPH`;
    /// a trailing `BLANK_LINE` before the close tag blocks the
    /// demotion. Intended for non-div strict-block tags, where pandoc
    /// turns the trailing `Para` into `Plain` unless a blank line
    /// separates it from the close-tag `RawBlock`.
    OnlyIfLast,
}
1511
1512/// Lift the HTML-block body into structural CST children: build the
1513/// inner text from `pre_content` + `content_lines` + `post_content`
1514/// (in order), recursively parse it as Pandoc-flavored markdown, and
1515/// graft the resulting top-level blocks into `builder`. `demote_policy`
1516/// controls whether the trailing paragraph is retagged as `PLAIN` to
1517/// encode pandoc's Plain/Para adjacency rules structurally.
1518fn emit_html_block_body_lifted(
1519    builder: &mut GreenNodeBuilder<'static>,
1520    pre_content: &str,
1521    content_lines: &[&str],
1522    post_content: &str,
1523    demote_policy: LastParaDemote,
1524    config: &ParserOptions,
1525) {
1526    emit_html_block_body_lifted_inner(
1527        builder,
1528        pre_content,
1529        content_lines,
1530        post_content,
1531        demote_policy,
1532        config,
1533        &mut None,
1534    )
1535}
1536
1537/// Body-lift variant for `<div>` inside a blockquote. Strips
1538/// `bq_depth` levels of blockquote markers from each `content_line`,
1539/// captures the per-line prefix bytes, and grafts the recursive parse
1540/// with prefix injection so the output CST stays byte-equal to the
1541/// source. `pre_content` and `post_content` must be empty (the bq
1542/// clean lift only handles the shape where the open and close tags
1543/// stand alone on their source lines).
1544fn emit_html_block_body_lifted_bq(
1545    builder: &mut GreenNodeBuilder<'static>,
1546    content_lines: &[&str],
1547    bq_depth: usize,
1548    demote_policy: LastParaDemote,
1549    config: &ParserOptions,
1550) {
1551    let mut prefixes: Vec<String> = Vec::with_capacity(content_lines.len());
1552    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
1553    for cl in content_lines {
1554        let stripped = strip_n_blockquote_markers(cl, bq_depth);
1555        let prefix_len = cl.len() - stripped.len();
1556        prefixes.push(cl[..prefix_len].to_string());
1557        stripped_lines.push(stripped);
1558    }
1559    let mut bq = Some(BqPrefixState {
1560        prefixes,
1561        line_idx: 0,
1562        at_line_start: true,
1563    });
1564    emit_html_block_body_lifted_inner(
1565        builder,
1566        "",
1567        &stripped_lines,
1568        "",
1569        demote_policy,
1570        config,
1571        &mut bq,
1572    )
1573}
1574
1575/// Body-lift variant for the bq messy-shape lift — open-trailing,
1576/// butted-close, or both. The open-trailing bytes (if any) sit in
1577/// `pre_content` (line 0 of the body — no bq prefix in source because
1578/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
1579/// each carry their own bq prefix. The close line's `leading` (body
1580/// bytes before `</tag>`) sits on the close line, prefixed in source
1581/// by `close_line_prefix` (the bq prefix captured from `line`).
1582///
1583/// Builds `prefixes` so each emitted line in the recursive parse
1584/// output gets the right per-line bq prefix re-injected at line start:
1585/// `pre_content` → empty prefix (no source `> ` precedes it); each
1586/// content line → its stripped prefix; `leading` → `close_line_prefix`.
1587/// Result CST stays byte-equal to source.
1588#[allow(clippy::too_many_arguments)]
1589fn emit_html_block_body_lifted_bq_messy(
1590    builder: &mut GreenNodeBuilder<'static>,
1591    pre_content: &str,
1592    content_lines: &[&str],
1593    leading: &str,
1594    close_line_prefix: &str,
1595    bq_depth: usize,
1596    demote_policy: LastParaDemote,
1597    config: &ParserOptions,
1598) {
1599    let mut prefixes: Vec<String> = Vec::new();
1600    if !pre_content.is_empty() {
1601        prefixes.push(String::new());
1602    }
1603    let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
1604    for cl in content_lines {
1605        let stripped = strip_n_blockquote_markers(cl, bq_depth);
1606        let prefix_len = cl.len() - stripped.len();
1607        prefixes.push(cl[..prefix_len].to_string());
1608        stripped_lines.push(stripped);
1609    }
1610    if !leading.is_empty() {
1611        prefixes.push(close_line_prefix.to_string());
1612    }
1613    let mut bq = Some(BqPrefixState {
1614        prefixes,
1615        line_idx: 0,
1616        at_line_start: true,
1617    });
1618    emit_html_block_body_lifted_inner(
1619        builder,
1620        pre_content,
1621        &stripped_lines,
1622        leading,
1623        demote_policy,
1624        config,
1625        &mut bq,
1626    )
1627}
1628
1629fn emit_html_block_body_lifted_inner(
1630    builder: &mut GreenNodeBuilder<'static>,
1631    pre_content: &str,
1632    content_lines: &[&str],
1633    post_content: &str,
1634    demote_policy: LastParaDemote,
1635    config: &ParserOptions,
1636    bq: &mut Option<BqPrefixState>,
1637) {
1638    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
1639        return;
1640    }
1641    let mut inner_text = String::with_capacity(
1642        pre_content.len()
1643            + content_lines.iter().map(|s| s.len()).sum::<usize>()
1644            + post_content.len(),
1645    );
1646    inner_text.push_str(pre_content);
1647    for line in content_lines {
1648        inner_text.push_str(line);
1649    }
1650    inner_text.push_str(post_content);
1651
1652    let mut inner_options = config.clone();
1653    let refdefs = config.refdef_labels.clone().unwrap_or_default();
1654    inner_options.refdef_labels = Some(refdefs.clone());
1655    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
1656    graft_document_children(builder, &inner_root, demote_policy, bq);
1657}
1658
/// Bookkeeping for re-injecting blockquote prefixes while grafting a
/// lifted body that originally lived inside a `> …` blockquote.
///
/// The nested parse only sees text with the blockquote markers
/// stripped, so the stripped bytes have to be written back at the start
/// of every source line for the CST to stay byte-equal to the source.
struct BqPrefixState {
    /// Literal prefix bytes per body line, in source order (for
    /// example `"> "`, `">  "` or `">"`).
    prefixes: Vec<String>,
    /// Index into `prefixes` of the next prefix to emit.
    line_idx: usize,
    /// `true` when the next grafted token begins a new line and must be
    /// preceded by its prefix; set again after every line-ending token.
    at_line_start: bool,
}
1674
1675/// Walk a parsed inner document's top-level children and re-emit them
1676/// into `builder`. The document's wrapper node is skipped — only its
1677/// children are grafted.
1678///
1679/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
1680/// as `PLAIN` — see [`LastParaDemote`].
1681///
1682/// `bq` is `Some` when grafting a body that lived inside a blockquote
1683/// — token emission then injects `BLOCK_QUOTE_MARKER + WHITESPACE`
1684/// prefix tokens at line starts. See [`BqPrefixState`].
1685fn graft_document_children(
1686    builder: &mut GreenNodeBuilder<'static>,
1687    doc: &SyntaxNode,
1688    demote_policy: LastParaDemote,
1689    bq: &mut Option<BqPrefixState>,
1690) {
1691    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
1692
1693    let mut demote_idx: Option<usize> = None;
1694    match demote_policy {
1695        LastParaDemote::Never => {}
1696        LastParaDemote::SkipTrailingBlanks => {
1697            for (i, c) in children.iter().enumerate().rev() {
1698                if let rowan::NodeOrToken::Node(n) = c {
1699                    if n.kind() == SyntaxKind::BLANK_LINE {
1700                        continue;
1701                    }
1702                    if n.kind() == SyntaxKind::PARAGRAPH {
1703                        demote_idx = Some(i);
1704                    }
1705                    break;
1706                }
1707            }
1708        }
1709        LastParaDemote::OnlyIfLast => {
1710            for (i, c) in children.iter().enumerate().rev() {
1711                if let rowan::NodeOrToken::Node(n) = c {
1712                    if n.kind() == SyntaxKind::PARAGRAPH {
1713                        demote_idx = Some(i);
1714                    }
1715                    break;
1716                }
1717            }
1718        }
1719    }
1720
1721    for (i, child) in children.into_iter().enumerate() {
1722        match child {
1723            rowan::NodeOrToken::Node(n) => {
1724                if Some(i) == demote_idx {
1725                    graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
1726                } else {
1727                    graft_subtree(builder, &n, bq);
1728                }
1729            }
1730            rowan::NodeOrToken::Token(t) => {
1731                emit_grafted_token(builder, t.kind(), t.text(), bq);
1732            }
1733        }
1734    }
1735}
1736
1737/// Recursively re-emit `node` and its descendants into `builder`.
1738/// Token text is copied verbatim so the result is byte-identical to
1739/// the input span (modulo bq prefix tokens injected at line starts
1740/// when `bq` is `Some`).
1741fn graft_subtree(
1742    builder: &mut GreenNodeBuilder<'static>,
1743    node: &SyntaxNode,
1744    bq: &mut Option<BqPrefixState>,
1745) {
1746    graft_subtree_as(builder, node, node.kind(), bq);
1747}
1748
1749/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
1750/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
1751/// the close-butted demotion rule.
1752fn graft_subtree_as(
1753    builder: &mut GreenNodeBuilder<'static>,
1754    node: &SyntaxNode,
1755    kind: SyntaxKind,
1756    bq: &mut Option<BqPrefixState>,
1757) {
1758    builder.start_node(kind.into());
1759    for child in node.children_with_tokens() {
1760        match child {
1761            rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
1762            rowan::NodeOrToken::Token(t) => {
1763                emit_grafted_token(builder, t.kind(), t.text(), bq);
1764            }
1765        }
1766    }
1767    builder.finish_node();
1768}
1769
1770/// Emit a single token while optionally injecting blockquote prefix
1771/// tokens at line starts. When `bq` is `None`, this is a plain
1772/// `builder.token()` passthrough.
1773fn emit_grafted_token(
1774    builder: &mut GreenNodeBuilder<'static>,
1775    kind: SyntaxKind,
1776    text: &str,
1777    bq: &mut Option<BqPrefixState>,
1778) {
1779    if let Some(state) = bq.as_mut() {
1780        if state.at_line_start {
1781            if let Some(prefix) = state.prefixes.get(state.line_idx) {
1782                emit_bq_prefix_tokens(builder, prefix);
1783            }
1784            state.at_line_start = false;
1785        }
1786        builder.token(kind.into(), text);
1787        // `BLANK_LINE` token represents an entirely blank source line —
1788        // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
1789        // token as line-ending so the per-line prefix index advances
1790        // correctly.
1791        if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
1792            state.line_idx += 1;
1793            state.at_line_start = true;
1794        }
1795    } else {
1796        builder.token(kind.into(), text);
1797    }
1798}
1799
1800/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
1801/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
1802fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
1803    for ch in prefix.chars() {
1804        if ch == '>' {
1805            builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
1806        } else {
1807            let mut buf = [0u8; 4];
1808            builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
1809        }
1810    }
1811}
1812
/// Find the byte offset, within `line`, of the `>` that closes the open
/// tag `<tag_name …>`.
///
/// Leading spaces and tabs are skipped. The remainder must start with
/// `<` followed by `tag_name` (ASCII case-insensitive) and at least one
/// more byte. The bytes after the name are then scanned with quote
/// tracking, so a `>` inside a `"…"` or `'…'` attribute value is not
/// taken as the end of the tag.
///
/// Returns `None` when the line does not have that shape or no unquoted
/// `>` follows the name. The name comparison is a plain prefix match:
/// no boundary is required after `tag_name`.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let candidate = line[indent_len..].as_bytes();
    let name_end = 1 + tag_name.len();

    let head = candidate.get(..name_end)?;
    let tail = &candidate[name_end..];
    let shape_ok = !tail.is_empty()
        && head[0] == b'<'
        && head[1..].eq_ignore_ascii_case(tag_name.as_bytes());
    if !shape_ok {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in tail.iter().enumerate() {
        match open_quote {
            Some(delim) => {
                if byte == delim {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
1848
1849/// Whether `slice` begins (after leading ASCII whitespace) with an
1850/// open tag whose name is a Pandoc void block tag (`<source>`,
1851/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
1852/// open tags return false.
1853///
1854/// Used by the inline-block matched-pair lift gate: pandoc-native
1855/// abandons the lift when the body's first non-blank content is a
1856/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
1857/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
1858/// matched-pair lift).
1859fn slice_starts_with_void_block_tag(slice: &str) -> bool {
1860    let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
1861    if !trimmed.starts_with('<') || trimmed.starts_with("</") {
1862        return false;
1863    }
1864    let Some(tag_end) = parse_open_tag(trimmed) else {
1865        return false;
1866    };
1867    let bytes = trimmed.as_bytes();
1868    let mut name_end = 1usize;
1869    while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
1870    {
1871        name_end += 1;
1872    }
1873    if name_end == 1 {
1874        return false;
1875    }
1876    is_pandoc_void_block_tag_name(&trimmed[1..name_end])
1877}
1878
1879/// Whether the body of an inline-block matched-pair (`<video>...`,
1880/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
1881/// a void block tag — the condition under which pandoc-native abandons
1882/// the matched-pair lift. Probes three shapes:
1883///
1884/// - **Same-line** (`<video><source ...></video>`): trailing bytes
1885///   after the open `>` on `first_inner` start with `<source`.
1886/// - **Single-line open + multi-line body**: open-trailing on the open
1887///   line is empty/whitespace AND the first non-blank body line
1888///   (`lines[start_pos+1..]`) starts with a void tag.
1889/// - **Multi-line open**: same body-line scan starting at
1890///   `lines[multiline_open_end+1..]`.
1891///
1892/// Returns `false` when the body begins with text, with a close tag,
1893/// or with a non-void block tag — those cases all proceed with the
1894/// matched-pair lift.
1895fn inline_block_void_interior_abandons(
1896    first_inner: &str,
1897    lines: &[&str],
1898    start_pos: usize,
1899    multiline_open_end: Option<usize>,
1900    bq_depth: usize,
1901    tag_name: &str,
1902) -> bool {
1903    let (line_no_nl, _) = strip_newline(first_inner);
1904    let (body_start_line_idx, open_trailing) = match multiline_open_end {
1905        Some(end) => (end + 1, ""),
1906        None => {
1907            let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
1908            let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
1909            (start_pos + 1, trailing)
1910        }
1911    };
1912    let trimmed = open_trailing.trim_start_matches([' ', '\t']);
1913    if !trimmed.is_empty() {
1914        return slice_starts_with_void_block_tag(trimmed);
1915    }
1916    for line in &lines[body_start_line_idx..] {
1917        let inner = if bq_depth > 0 {
1918            strip_n_blockquote_markers(line, bq_depth)
1919        } else {
1920            line
1921        };
1922        let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
1923        if trimmed.is_empty() {
1924            continue;
1925        }
1926        return slice_starts_with_void_block_tag(trimmed);
1927    }
1928    false
1929}
1930
/// Check that `line` holds a complete open tag for `tag_name`: after
/// optional leading spaces/tabs it starts with `<tag_name` (ASCII
/// case-insensitive), has at least one more byte, and an unquoted `>`
/// appears somewhere after the name.
///
/// A `>` inside a `"…"` or `'…'` attribute value does not count.
/// Content after the closing `>` is allowed (the open-trailing shape
/// `<form>foo`); capturing it is left to the caller.
fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let after_indent = line.trim_start_matches([' ', '\t']).as_bytes();
    let name_end = 1 + tag_name.len();
    if after_indent.len() <= name_end {
        return false;
    }

    let (head, attrs) = after_indent.split_at(name_end);
    if head[0] != b'<' || !head[1..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return false;
    }

    let mut in_quote: Option<u8> = None;
    attrs.iter().any(|&byte| {
        match (in_quote, byte) {
            (None, b'"' | b'\'') => in_quote = Some(byte),
            (Some(delim), current) if current == delim => in_quote = None,
            (None, b'>') => return true,
            _ => {}
        }
        false
    })
}
1965
1966/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
1967/// be lifted structurally. Returns `true` only when:
1968/// - The line starts with `<tag_name` (modulo leading whitespace).
1969/// - The open tag's `>` exists with proper quote handling.
1970/// - The bytes after the open `>` end with `</tag_name>` (case-
1971///   insensitive, allowing trailing whitespace).
1972/// - The trailing has exactly one `</tag_name>` close and zero
1973///   `<tag_name>` opens (rejects nested same-line shapes).
1974///
1975/// Trailing non-whitespace content after `</tag_name>` (e.g.
1976/// `<form>foo</form>extra`) rejects the lift — pandoc projects that
1977/// shape as RawBlock + content + RawBlock + trailing-Para, which the
1978/// byte walker handles via `split_html_block_by_tags`.
1979fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
1980    let bytes = line.as_bytes();
1981    let indent_end = bytes
1982        .iter()
1983        .position(|&b| b != b' ' && b != b'\t')
1984        .unwrap_or(bytes.len());
1985    let rest = &line[indent_end..];
1986    let rest_bytes = rest.as_bytes();
1987    let prefix_len = 1 + tag_name.len();
1988    if rest_bytes.len() < prefix_len
1989        || rest_bytes[0] != b'<'
1990        || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
1991    {
1992        return false;
1993    }
1994    let after_name = &rest[prefix_len..];
1995    let after_name_bytes = after_name.as_bytes();
1996    let mut i = 0usize;
1997    let mut quote: Option<u8> = None;
1998    let mut gt_idx: Option<usize> = None;
1999    while i < after_name_bytes.len() {
2000        match (quote, after_name_bytes[i]) {
2001            (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2002            (Some(q), b2) if b2 == q => quote = None,
2003            (None, b'>') => {
2004                gt_idx = Some(i);
2005                break;
2006            }
2007            _ => {}
2008        }
2009        i += 1;
2010    }
2011    let Some(gt_idx) = gt_idx else {
2012        return false;
2013    };
2014    let trailing = &after_name[gt_idx + 1..];
2015    let trimmed = trailing.trim_end_matches([' ', '\t']);
2016    let close_marker = format!("</{}>", tag_name);
2017    if !trimmed
2018        .to_ascii_lowercase()
2019        .ends_with(&close_marker.to_ascii_lowercase())
2020    {
2021        return false;
2022    }
2023    let (opens, closes) = count_tag_balance(trailing, tag_name);
2024    opens == 0 && closes == 1
2025}
2026
2027/// Try to split the close line of an HTML_BLOCK_DIV body into a
2028/// leading content prefix and a clean `</tag>...` remainder. Returns
2029/// `Some((leading, close_part))` only when the line contains exactly
2030/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2031/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2032/// for missing close tags, or for compound shapes the parser
2033/// shouldn't attempt to lift in this pass.
2034///
2035/// `leading` may be empty (close starts at column 0) or pure
2036/// whitespace (close on an indented line). Both count as "butted" per
2037/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2038/// the trailing paragraph inside the div demotes Para→Plain.
2039fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2040    let (opens, closes) = count_tag_balance(line, tag_name);
2041    if opens != 0 || closes != 1 {
2042        return None;
2043    }
2044    // Locate the close tag's opening `<` by lowercased substring search.
2045    // Safe because we've already established (above) that the line has
2046    // exactly one `</tag>` and no `<tag>` opens, so the first match is
2047    // THE close.
2048    let needle = format!("</{}", tag_name);
2049    let lower = line.to_ascii_lowercase();
2050    let close_lt = lower.find(&needle)?;
2051    Some((&line[..close_lt], &line[close_lt..]))
2052}
2053
2054/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2055/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2056/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2057/// + TEXT(">") + TEXT(trailing)?`.
2058///
2059/// Bytes are byte-identical to the source — this only tokenizes at finer
2060/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2061/// region structurally. Falls back to a single TEXT token if the line
2062/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2063/// only retags as the lift kind when this shape was matched).
2064///
2065/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2066/// returned as `&str` instead so the caller can splice them into the
2067/// recursive-parse input for the structural body lift. When false
2068/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2069/// empty slice is returned.
2070fn emit_open_tag_tokens<'a>(
2071    builder: &mut GreenNodeBuilder<'static>,
2072    line: &'a str,
2073    tag_name: &str,
2074    lift_trailing: bool,
2075) -> &'a str {
2076    let bytes = line.as_bytes();
2077    // Leading indent (CommonMark allows up to 3 spaces).
2078    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2079    if indent_end > 0 {
2080        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2081    }
2082    let rest = &line[indent_end..];
2083    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2084    let prefix_len = 1 + tag_name.len();
2085    if !rest.starts_with('<')
2086        || rest.len() < prefix_len
2087        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2088    {
2089        builder.token(SyntaxKind::TEXT.into(), rest);
2090        return "";
2091    }
2092    let after_name = &rest[prefix_len..];
2093    let after_name_bytes = after_name.as_bytes();
2094    // Find the closing `>` of the open tag, respecting quoted attribute values.
2095    let mut i = 0usize;
2096    let mut quote: Option<u8> = None;
2097    let mut tag_close: Option<usize> = None;
2098    while i < after_name_bytes.len() {
2099        let b = after_name_bytes[i];
2100        match (quote, b) {
2101            (None, b'"') | (None, b'\'') => quote = Some(b),
2102            (Some(q), b2) if b2 == q => quote = None,
2103            (None, b'>') => {
2104                tag_close = Some(i);
2105                break;
2106            }
2107            _ => {}
2108        }
2109        i += 1;
2110    }
2111    let Some(tag_close) = tag_close else {
2112        // Open tag has no closing `>` on this line — defensive fallback.
2113        builder.token(SyntaxKind::TEXT.into(), rest);
2114        return "";
2115    };
2116    // Whitespace between the tag name and the attribute region.
2117    let attrs_inner = &after_name[..tag_close];
2118    let ws_end = attrs_inner
2119        .as_bytes()
2120        .iter()
2121        .position(|&b| !matches!(b, b' ' | b'\t'))
2122        .unwrap_or(attrs_inner.len());
2123    let leading_ws = &attrs_inner[..ws_end];
2124    // Strip a trailing self-closing slash and the whitespace before it
2125    // from the attribute region; emit them as TEXT outside the
2126    // HTML_ATTRS node so the structural region only holds attribute
2127    // bytes (not formatting punctuation).
2128    let attrs_after_ws = &attrs_inner[ws_end..];
2129    let mut attr_end = attrs_after_ws.len();
2130    let attr_bytes = attrs_after_ws.as_bytes();
2131    let mut self_close_start = attr_end;
2132    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2133        self_close_start = attr_end - 1;
2134        attr_end = self_close_start;
2135        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2136            attr_end -= 1;
2137        }
2138    }
2139    let attrs_text = &attrs_after_ws[..attr_end];
2140    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2141    let after_self_close = &attrs_after_ws[self_close_start..];
2142
2143    // Use the original source bytes for the `<tag` prefix (preserves
2144    // source casing — losslessness).
2145    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2146    if !leading_ws.is_empty() {
2147        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2148    }
2149    if !attrs_text.is_empty() {
2150        builder.start_node(SyntaxKind::HTML_ATTRS.into());
2151        builder.token(SyntaxKind::TEXT.into(), attrs_text);
2152        builder.finish_node();
2153    }
2154    if !trailing_text.is_empty() {
2155        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2156    }
2157    if !after_self_close.is_empty() {
2158        builder.token(SyntaxKind::TEXT.into(), after_self_close);
2159    }
2160    builder.token(SyntaxKind::TEXT.into(), ">");
2161    let after_gt = &after_name[tag_close + 1..];
2162    if lift_trailing {
2163        // Return trailing bytes to the caller (will be spliced into the
2164        // recursive-parse input for the body lift).
2165        return after_gt;
2166    }
2167    if !after_gt.is_empty() {
2168        builder.token(SyntaxKind::TEXT.into(), after_gt);
2169    }
2170    ""
2171}
2172
2173/// Detect a multi-line HTML open tag for `tag_name`. Returns
2174/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2175/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2176/// the existing path) or when the `>` is missing entirely.
2177///
2178/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2179/// attribute value doesn't terminate the open tag. Quote state carries
2180/// across line boundaries.
2181fn find_multiline_open_end(
2182    lines: &[&str],
2183    start_pos: usize,
2184    first_inner: &str,
2185    tag_name: &str,
2186) -> Option<usize> {
2187    // Locate the `<tag_name` literal in `first_inner` to start scanning past
2188    // it. Match is ASCII case-insensitive; the parser preserves source casing.
2189    let trimmed = strip_leading_spaces(first_inner);
2190    let prefix_len = 1 + tag_name.len();
2191    if !trimmed.starts_with('<')
2192        || trimmed.len() < prefix_len
2193        || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2194    {
2195        return None;
2196    }
2197    let leading_indent = first_inner.len() - trimmed.len();
2198    let mut i = leading_indent + prefix_len; // past `<tag_name`
2199    let mut quote: Option<u8> = None;
2200
2201    // Scan first line for an unquoted `>`.
2202    let line0_bytes = first_inner.as_bytes();
2203    while i < line0_bytes.len() {
2204        match (quote, line0_bytes[i]) {
2205            (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2206            (Some(q), x) if x == q => quote = None,
2207            (None, b'>') => return None, // single-line case
2208            _ => {}
2209        }
2210        i += 1;
2211    }
2212
2213    // No `>` on first line. Scan subsequent lines.
2214    let mut line_idx = start_pos + 1;
2215    while line_idx < lines.len() {
2216        let bytes = lines[line_idx].as_bytes();
2217        for &b in bytes {
2218            match (quote, b) {
2219                (None, b'"') | (None, b'\'') => quote = Some(b),
2220                (Some(q), x) if x == q => quote = None,
2221                (None, b'>') => return Some(line_idx),
2222                _ => {}
2223            }
2224        }
2225        line_idx += 1;
2226    }
2227
2228    None
2229}
2230
2231/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2232/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2233/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2234/// open tag (no `>` in the remaining input) as paragraph text rather than
2235/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2236/// reparse the same content recursively, causing a stack overflow.
2237///
2238/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2239/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2240/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2241/// the tag eventually closes far away.
2242pub(crate) fn pandoc_html_open_tag_closes(
2243    lines: &[&str],
2244    start_pos: usize,
2245    bq_depth: usize,
2246) -> bool {
2247    if start_pos >= lines.len() {
2248        return false;
2249    }
2250    let mut quote: Option<u8> = None;
2251    for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2252        let inner = if bq_depth > 0 {
2253            strip_n_blockquote_markers(line, bq_depth)
2254        } else {
2255            line
2256        };
2257        let bytes = inner.as_bytes();
2258        let mut i = 0usize;
2259        if offset == start_pos {
2260            while i < bytes.len() && bytes[i] == b' ' {
2261                i += 1;
2262            }
2263            if bytes.get(i) != Some(&b'<') {
2264                return false;
2265            }
2266            i += 1;
2267        }
2268        while i < bytes.len() {
2269            match (quote, bytes[i]) {
2270                (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2271                (Some(q), x) if x == q => quote = None,
2272                (None, b'>') => return true,
2273                _ => {}
2274            }
2275            i += 1;
2276        }
2277    }
2278    false
2279}
2280
2281/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2282/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2283/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2284/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2285/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2286/// Phase 6 structural lift.
2287///
2288/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2289/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2290/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2291/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2292///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
2293///
2294/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2295/// newline; `parse_html_attribute_list` tolerates whitespace.
2296fn emit_multiline_open_tag_with_attrs(
2297    builder: &mut GreenNodeBuilder<'static>,
2298    lines: &[&str],
2299    start_pos: usize,
2300    end_line_idx: usize,
2301    tag_name: &str,
2302) {
2303    let prefix_len = 1 + tag_name.len();
2304    for (line_idx, line) in lines
2305        .iter()
2306        .enumerate()
2307        .take(end_line_idx + 1)
2308        .skip(start_pos)
2309    {
2310        let (line_no_nl, newline_str) = strip_newline(line);
2311
2312        if line_idx == start_pos {
2313            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2314            // + attrs)?. The closing `>` is on a later line, so any
2315            // remaining bytes after "<{tag_name}" on this line are the
2316            // start of the attribute region.
2317            let bytes = line_no_nl.as_bytes();
2318            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2319            if indent_end > 0 {
2320                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2321            }
2322            // Defensive: caller verified the line starts with `<{tag_name}`.
2323            let after_indent = &line_no_nl[indent_end..];
2324            if after_indent.len() >= prefix_len {
2325                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2326                let rest = &after_indent[prefix_len..];
2327                emit_attr_region(builder, rest);
2328            } else {
2329                builder.token(SyntaxKind::TEXT.into(), after_indent);
2330            }
2331        } else if line_idx < end_line_idx {
2332            // Pure attribute line.
2333            let bytes = line_no_nl.as_bytes();
2334            let indent_end = bytes
2335                .iter()
2336                .position(|&b| !matches!(b, b' ' | b'\t'))
2337                .unwrap_or(bytes.len());
2338            if indent_end > 0 {
2339                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2340            }
2341            let attrs_text = &line_no_nl[indent_end..];
2342            if !attrs_text.is_empty() {
2343                builder.start_node(SyntaxKind::HTML_ATTRS.into());
2344                builder.token(SyntaxKind::TEXT.into(), attrs_text);
2345                builder.finish_node();
2346            }
2347        } else {
2348            // Last line: indent + attrs + ">" + trailing.
2349            let bytes = line_no_nl.as_bytes();
2350            let indent_end = bytes
2351                .iter()
2352                .position(|&b| !matches!(b, b' ' | b'\t'))
2353                .unwrap_or(bytes.len());
2354            if indent_end > 0 {
2355                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2356            }
2357            // Find the unquoted `>` byte position in this line.
2358            let mut quote: Option<u8> = None;
2359            let mut gt_pos: Option<usize> = None;
2360            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
2361                let actual_j = indent_end + j;
2362                match (quote, b) {
2363                    (None, b'"') | (None, b'\'') => quote = Some(b),
2364                    (Some(q), x) if x == q => quote = None,
2365                    (None, b'>') => {
2366                        gt_pos = Some(actual_j);
2367                        break;
2368                    }
2369                    _ => {}
2370                }
2371            }
2372            let Some(gt) = gt_pos else {
2373                // Defensive — caller said `>` is on this line.
2374                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
2375                if !newline_str.is_empty() {
2376                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2377                }
2378                continue;
2379            };
2380            // Attribute region: between indent_end and gt, with possibly
2381            // trailing whitespace before `>`.
2382            let attrs_region = &line_no_nl[indent_end..gt];
2383            let region_bytes = attrs_region.as_bytes();
2384            // Strip trailing whitespace from attrs region; emit as
2385            // separate WHITESPACE so HTML_ATTRS only contains attribute
2386            // bytes.
2387            let mut attr_end = region_bytes.len();
2388            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
2389                attr_end -= 1;
2390            }
2391            let attrs_text = &attrs_region[..attr_end];
2392            let trailing_ws = &attrs_region[attr_end..];
2393            if !attrs_text.is_empty() {
2394                builder.start_node(SyntaxKind::HTML_ATTRS.into());
2395                builder.token(SyntaxKind::TEXT.into(), attrs_text);
2396                builder.finish_node();
2397            }
2398            if !trailing_ws.is_empty() {
2399                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
2400            }
2401            builder.token(SyntaxKind::TEXT.into(), ">");
2402            let after_gt = &line_no_nl[gt + 1..];
2403            if !after_gt.is_empty() {
2404                builder.token(SyntaxKind::TEXT.into(), after_gt);
2405            }
2406        }
2407
2408        if !newline_str.is_empty() {
2409            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2410        }
2411    }
2412}
2413
2414/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
2415/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
2416/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
2417/// node is added. Pandoc's projector reads attributes only for `<div>` /
2418/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
2419fn emit_multiline_open_tag_simple(
2420    builder: &mut GreenNodeBuilder<'static>,
2421    lines: &[&str],
2422    start_pos: usize,
2423    end_line_idx: usize,
2424) {
2425    for line in lines.iter().take(end_line_idx + 1).skip(start_pos) {
2426        let (line_no_nl, newline_str) = strip_newline(line);
2427        if !line_no_nl.is_empty() {
2428            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
2429        }
2430        if !newline_str.is_empty() {
2431            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2432        }
2433    }
2434}
2435
2436/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
2437/// `<div` literal up to end-of-line. Called only from
2438/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
2439/// pure attribute (and possibly inter-attribute whitespace).
2440fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
2441    if region.is_empty() {
2442        return;
2443    }
2444    let bytes = region.as_bytes();
2445    // Split a leading run of whitespace into a WHITESPACE token so the
2446    // HTML_ATTRS node holds only attribute bytes.
2447    let ws_end = bytes
2448        .iter()
2449        .position(|&b| !matches!(b, b' ' | b'\t'))
2450        .unwrap_or(bytes.len());
2451    if ws_end > 0 {
2452        builder.token(SyntaxKind::WHITESPACE.into(), &region[..ws_end]);
2453    }
2454    let attrs_text = &region[ws_end..];
2455    if !attrs_text.is_empty() {
2456        builder.start_node(SyntaxKind::HTML_ATTRS.into());
2457        builder.token(SyntaxKind::TEXT.into(), attrs_text);
2458        builder.finish_node();
2459    }
2460}
2461
2462/// Emit one continuation line of an HTML block, preserving any blockquote
2463/// markers as structural tokens (so the CST stays byte-equal to the source
2464/// and downstream consumers can strip them per-context).
2465fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
2466    let inner = if bq_depth > 0 {
2467        let stripped = strip_n_blockquote_markers(line, bq_depth);
2468        let prefix_len = line.len() - stripped.len();
2469        if prefix_len > 0 {
2470            for ch in line[..prefix_len].chars() {
2471                if ch == '>' {
2472                    builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2473                } else {
2474                    let mut buf = [0u8; 4];
2475                    builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2476                }
2477            }
2478        }
2479        stripped
2480    } else {
2481        line
2482    };
2483
2484    let (line_without_newline, newline_str) = strip_newline(inner);
2485    if !line_without_newline.is_empty() {
2486        builder.token(SyntaxKind::TEXT.into(), line_without_newline);
2487    }
2488    if !newline_str.is_empty() {
2489        builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2490    }
2491}
2492
2493#[cfg(test)]
2494mod tests {
2495    use super::*;
2496
2497    #[test]
2498    fn test_try_parse_html_comment() {
2499        assert_eq!(
2500            try_parse_html_block_start("<!-- comment -->", false),
2501            Some(HtmlBlockType::Comment)
2502        );
2503        assert_eq!(
2504            try_parse_html_block_start("  <!-- comment -->", false),
2505            Some(HtmlBlockType::Comment)
2506        );
2507    }
2508
2509    #[test]
2510    fn test_try_parse_div_tag() {
2511        assert_eq!(
2512            try_parse_html_block_start("<div>", false),
2513            Some(HtmlBlockType::BlockTag {
2514                tag_name: "div".to_string(),
2515                is_verbatim: false,
2516                closed_by_blank_line: false,
2517                depth_aware: true,
2518                closes_at_open_tag: false,
2519                is_closing: false,
2520            })
2521        );
2522        assert_eq!(
2523            try_parse_html_block_start("<div class=\"test\">", false),
2524            Some(HtmlBlockType::BlockTag {
2525                tag_name: "div".to_string(),
2526                is_verbatim: false,
2527                closed_by_blank_line: false,
2528                depth_aware: true,
2529                closes_at_open_tag: false,
2530                is_closing: false,
2531            })
2532        );
2533    }
2534
2535    #[test]
2536    fn test_try_parse_script_tag() {
2537        assert_eq!(
2538            try_parse_html_block_start("<script>", false),
2539            Some(HtmlBlockType::BlockTag {
2540                tag_name: "script".to_string(),
2541                is_verbatim: true,
2542                closed_by_blank_line: false,
2543                depth_aware: true,
2544                closes_at_open_tag: false,
2545                is_closing: false,
2546            })
2547        );
2548    }
2549
2550    #[test]
2551    fn test_try_parse_processing_instruction() {
2552        assert_eq!(
2553            try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
2554            Some(HtmlBlockType::ProcessingInstruction)
2555        );
2556    }
2557
2558    #[test]
2559    fn test_try_parse_declaration() {
2560        // CommonMark dialect recognizes declarations as type-4 HTML blocks.
2561        assert_eq!(
2562            try_parse_html_block_start("<!DOCTYPE html>", true),
2563            Some(HtmlBlockType::Declaration)
2564        );
2565        // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
2566        // just uppercase. Lowercase doctype must match too.
2567        assert_eq!(
2568            try_parse_html_block_start("<!doctype html>", true),
2569            Some(HtmlBlockType::Declaration)
2570        );
2571        // Pandoc dialect does not — bare declarations fall through to
2572        // paragraph parsing.
2573        assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
2574        assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
2575    }
2576
2577    #[test]
2578    fn test_dialect_specific_block_tag_membership() {
2579        // Pandoc-markdown's `blockHtmlTags` is a strict subset of
2580        // CommonMark §4.6 type-6 plus a few additions. These tags
2581        // diverge between dialects:
2582        //   CM-only block tags (Pandoc treats as inline raw HTML):
2583        //     dialog, legend, menuitem, optgroup, option, frame,
2584        //     base, basefont, link, param
2585        //   Pandoc-only block tags (CM doesn't recognize):
2586        //     canvas, hgroup, isindex, meta, output
2587        for cm_only in [
2588            "<dialog>",
2589            "<legend>",
2590            "<menuitem>",
2591            "<optgroup>",
2592            "<option>",
2593            "<frame>",
2594            "<base>",
2595            "<basefont>",
2596            "<link>",
2597            "<param>",
2598        ] {
2599            assert!(
2600                matches!(
2601                    try_parse_html_block_start(cm_only, true),
2602                    Some(HtmlBlockType::BlockTag { .. })
2603                ),
2604                "{cm_only} should be a block-tag start under CommonMark",
2605            );
2606            assert_eq!(
2607                try_parse_html_block_start(cm_only, false),
2608                None,
2609                "{cm_only} should NOT be a block-tag start under Pandoc",
2610            );
2611        }
2612        for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
2613            // Under CM these are not type-6 BlockTags; they may still match
2614            // type-7 (complete tag on a line) which has different semantics.
2615            assert!(
2616                !matches!(
2617                    try_parse_html_block_start(pandoc_only, true),
2618                    Some(HtmlBlockType::BlockTag { .. })
2619                ),
2620                "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
2621            );
2622            assert!(
2623                matches!(
2624                    try_parse_html_block_start(pandoc_only, false),
2625                    Some(HtmlBlockType::BlockTag { .. })
2626                ),
2627                "{pandoc_only} should be a block-tag start under Pandoc",
2628            );
2629        }
2630    }
2631
2632    #[test]
2633    fn test_pandoc_inline_block_tag_membership() {
2634        // Pandoc's `eitherBlockOrInline` tags start an HTML block at
2635        // fresh-block positions under Pandoc dialect. We list the
2636        // non-void, non-script subset (verbatim `script` is handled
2637        // via the verbatim path; void elements are deferred — see
2638        // PANDOC_INLINE_BLOCK_TAGS docs).
2639        for tag in [
2640            "<button>",
2641            "<iframe>",
2642            "<video>",
2643            "<audio>",
2644            "<noscript>",
2645            "<object>",
2646            "<map>",
2647            "<progress>",
2648            "<del>",
2649            "<ins>",
2650            "<svg>",
2651            "<applet>",
2652        ] {
2653            assert!(
2654                matches!(
2655                    try_parse_html_block_start(tag, false),
2656                    Some(HtmlBlockType::BlockTag {
2657                        depth_aware: true,
2658                        ..
2659                    })
2660                ),
2661                "{tag} should be a depth-aware block-tag start under Pandoc",
2662            );
2663        }
2664        // Closing forms of inline-block tags also start a block under
2665        // Pandoc — pandoc-native pins `</button>` standalone as a
2666        // single-line `RawBlock`. These use `closes_at_open_tag: true`
2667        // (no balanced match — the close emits as a one-line block on
2668        // its own).
2669        for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
2670            assert!(
2671                matches!(
2672                    try_parse_html_block_start(closing, false),
2673                    Some(HtmlBlockType::BlockTag {
2674                        depth_aware: false,
2675                        closes_at_open_tag: true,
2676                        ..
2677                    })
2678                ),
2679                "{closing} (closing form) should be a single-line block-tag start under Pandoc",
2680            );
2681        }
2682    }
2683
2684    #[test]
2685    fn test_pandoc_void_block_tag_membership() {
2686        // Pandoc's void `eitherBlockOrInline` tags start an HTML block
2687        // at fresh-block positions under Pandoc dialect, with
2688        // `closes_at_open_tag: true` — the block always ends on the
2689        // open-tag line (no closing tag to match).
2690        for tag in [
2691            "<area>",
2692            "<embed>",
2693            "<source>",
2694            "<track>",
2695            "<embed src=\"foo.swf\">",
2696            "<source src=\"foo.mp4\" type=\"video/mp4\">",
2697        ] {
2698            assert!(
2699                matches!(
2700                    try_parse_html_block_start(tag, false),
2701                    Some(HtmlBlockType::BlockTag {
2702                        depth_aware: false,
2703                        closes_at_open_tag: true,
2704                        ..
2705                    })
2706                ),
2707                "{tag} should be a void block-tag start under Pandoc",
2708            );
2709        }
2710        // Closing forms of void tags also start a single-line block
2711        // under Pandoc. Void elements have no closing tag in HTML, but
2712        // `</embed>` etc. can appear in the wild — pandoc-native still
2713        // emits them as `RawBlock`s at fresh-block positions; mirror
2714        // that with the same `closes_at_open_tag: true` shape.
2715        for closing in ["</area>", "</embed>", "</source>", "</track>"] {
2716            assert!(
2717                matches!(
2718                    try_parse_html_block_start(closing, false),
2719                    Some(HtmlBlockType::BlockTag {
2720                        depth_aware: false,
2721                        closes_at_open_tag: true,
2722                        ..
2723                    })
2724                ),
2725                "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
2726            );
2727        }
2728        // Under CommonMark dialect, the void-tag block-start path is
2729        // skipped. `<source>` and `<track>` are in the CM type-6
2730        // BLOCK_TAGS set so they DO start a block, but with CM type-6
2731        // semantics (`closed_by_blank_line: true`,
2732        // `closes_at_open_tag: false`), not the Pandoc void-tag path.
2733        // `<embed>` and `<area>` aren't in the CM type-6 list — they
2734        // fall through to type 7 (complete tag on a line by itself).
2735        assert_eq!(
2736            try_parse_html_block_start("<embed>", true),
2737            Some(HtmlBlockType::Type7)
2738        );
2739        assert_eq!(
2740            try_parse_html_block_start("<area>", true),
2741            Some(HtmlBlockType::Type7)
2742        );
2743        assert!(matches!(
2744            try_parse_html_block_start("<source src=\"x\">", true),
2745            Some(HtmlBlockType::BlockTag {
2746                closed_by_blank_line: true,
2747                closes_at_open_tag: false,
2748                ..
2749            })
2750        ));
2751        assert!(matches!(
2752            try_parse_html_block_start("<track src=\"x\">", true),
2753            Some(HtmlBlockType::BlockTag {
2754                closed_by_blank_line: true,
2755                closes_at_open_tag: false,
2756                ..
2757            })
2758        ));
2759    }
2760
2761    #[test]
2762    fn test_find_multiline_open_end() {
2763        // Single-line opens return None (caller takes the regular path).
2764        assert_eq!(
2765            find_multiline_open_end(&["<div id=\"x\">"], 0, "<div id=\"x\">", "div"),
2766            None
2767        );
2768        assert_eq!(
2769            find_multiline_open_end(&["<embed src=\"x\">"], 0, "<embed src=\"x\">", "embed"),
2770            None
2771        );
2772        // Multi-line opens return the line index of the closing `>`.
2773        assert_eq!(
2774            find_multiline_open_end(&["<embed", "  src=\"x\">"], 0, "<embed", "embed"),
2775            Some(1)
2776        );
2777        assert_eq!(
2778            find_multiline_open_end(
2779                &["<embed", "  src=\"x\"", "  type=\"video\">"],
2780                0,
2781                "<embed",
2782                "embed"
2783            ),
2784            Some(2)
2785        );
2786        // Tag-name mismatch returns None (case-insensitive on the tag name).
2787        assert_eq!(
2788            find_multiline_open_end(&["<embed", "  src=\"x\">"], 0, "<embed", "div"),
2789            None
2790        );
2791        assert_eq!(
2792            find_multiline_open_end(&["<EMBED", "  src=\"x\">"], 0, "<EMBED", "embed"),
2793            Some(1)
2794        );
2795        // Quoted `>` does not terminate the open tag; quote state threads
2796        // across line boundaries.
2797        assert_eq!(
2798            find_multiline_open_end(
2799                &["<embed title=\"a>b", "  c\">"],
2800                0,
2801                "<embed title=\"a>b",
2802                "embed"
2803            ),
2804            Some(1)
2805        );
2806        // No `>` anywhere returns None.
2807        assert_eq!(
2808            find_multiline_open_end(&["<embed", "  src=\"x\""], 0, "<embed", "embed"),
2809            None
2810        );
2811    }
2812
2813    #[test]
2814    fn test_pandoc_html_open_tag_closes() {
2815        // Single-line complete: scanner finds `>` on the first line.
2816        assert!(pandoc_html_open_tag_closes(&["<div>"], 0, 0));
2817        assert!(pandoc_html_open_tag_closes(&["<embed src=\"x\">"], 0, 0));
2818        // Multi-line complete: scanner finds `>` on a later line.
2819        assert!(pandoc_html_open_tag_closes(
2820            &["<div", "  id=\"x\">", "body", "</div>"],
2821            0,
2822            0
2823        ));
2824        assert!(pandoc_html_open_tag_closes(
2825            &["<embed", "  src=\"x.png\" alt=\"y\">"],
2826            0,
2827            0
2828        ));
2829        // Quoted `>` does not close: scanner threads quote state.
2830        assert!(!pandoc_html_open_tag_closes(
2831            &["<div title=\"a>b", "  c\""],
2832            0,
2833            0
2834        ));
2835        assert!(pandoc_html_open_tag_closes(
2836            &["<div title=\"a>b", "  c\">"],
2837            0,
2838            0
2839        ));
2840        // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
2841        assert!(!pandoc_html_open_tag_closes(&["<embed"], 0, 0));
2842        assert!(!pandoc_html_open_tag_closes(&["<div", "foo", "bar"], 0, 0));
2843        // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
2844        // across them); the scan continues until EOF or `>`.
2845        assert!(pandoc_html_open_tag_closes(
2846            &["<div", "", "id=\"x\">"],
2847            0,
2848            0
2849        ));
2850    }
2851
2852    #[test]
2853    fn test_try_parse_cdata() {
2854        // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
2855        assert_eq!(
2856            try_parse_html_block_start("<![CDATA[content]]>", true),
2857            Some(HtmlBlockType::CData)
2858        );
2859        // Pandoc dialect does not.
2860        assert_eq!(
2861            try_parse_html_block_start("<![CDATA[content]]>", false),
2862            None
2863        );
2864    }
2865
2866    #[test]
2867    fn test_extract_block_tag_name_open_only() {
2868        assert_eq!(
2869            extract_block_tag_name("<div>", false),
2870            Some("div".to_string())
2871        );
2872        assert_eq!(
2873            extract_block_tag_name("<div class=\"test\">", false),
2874            Some("div".to_string())
2875        );
2876        assert_eq!(
2877            extract_block_tag_name("<div/>", false),
2878            Some("div".to_string())
2879        );
2880        assert_eq!(extract_block_tag_name("</div>", false), None);
2881        assert_eq!(extract_block_tag_name("<>", false), None);
2882        assert_eq!(extract_block_tag_name("< div>", false), None);
2883    }
2884
2885    #[test]
2886    fn test_extract_block_tag_name_with_closing() {
2887        // CommonMark §4.6 type-6 starts also accept closing tags.
2888        assert_eq!(
2889            extract_block_tag_name("</div>", true),
2890            Some("div".to_string())
2891        );
2892        assert_eq!(
2893            extract_block_tag_name("</div >", true),
2894            Some("div".to_string())
2895        );
2896    }
2897
2898    #[test]
2899    fn test_commonmark_type6_closing_tag_start() {
2900        assert_eq!(
2901            try_parse_html_block_start("</div>", true),
2902            Some(HtmlBlockType::BlockTag {
2903                tag_name: "div".to_string(),
2904                is_verbatim: false,
2905                closed_by_blank_line: true,
2906                depth_aware: false,
2907                closes_at_open_tag: false,
2908                is_closing: true,
2909            })
2910        );
2911    }
2912
2913    #[test]
2914    fn test_commonmark_type7_open_tag() {
2915        // `<a>` (not a type-6 tag) on a line by itself is type 7 under
2916        // CommonMark; rejected under non-CommonMark.
2917        assert_eq!(
2918            try_parse_html_block_start("<a href=\"foo\">", true),
2919            Some(HtmlBlockType::Type7)
2920        );
2921        assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
2922    }
2923
2924    #[test]
2925    fn test_commonmark_type7_close_tag() {
2926        assert_eq!(
2927            try_parse_html_block_start("</ins>", true),
2928            Some(HtmlBlockType::Type7)
2929        );
2930    }
2931
2932    #[test]
2933    fn test_commonmark_type7_rejects_with_trailing_text() {
2934        // A complete tag must be followed only by whitespace.
2935        assert_eq!(try_parse_html_block_start("<a> hi", true), None);
2936    }
2937
2938    #[test]
2939    fn test_is_closing_marker_comment() {
2940        let block_type = HtmlBlockType::Comment;
2941        assert!(is_closing_marker("-->", &block_type));
2942        assert!(is_closing_marker("end -->", &block_type));
2943        assert!(!is_closing_marker("<!--", &block_type));
2944    }
2945
2946    #[test]
2947    fn test_is_closing_marker_tag() {
2948        let block_type = HtmlBlockType::BlockTag {
2949            tag_name: "div".to_string(),
2950            is_verbatim: false,
2951            closed_by_blank_line: false,
2952            depth_aware: false,
2953            closes_at_open_tag: false,
2954            is_closing: false,
2955        };
2956        assert!(is_closing_marker("</div>", &block_type));
2957        assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
2958        assert!(is_closing_marker("content</div>", &block_type));
2959        assert!(!is_closing_marker("<div>", &block_type));
2960    }
2961
2962    #[test]
2963    fn test_parse_html_comment_block() {
2964        let input = "<!-- comment -->\n";
2965        let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
2966        let mut builder = GreenNodeBuilder::new();
2967
2968        let block_type = try_parse_html_block_start(lines[0], false).unwrap();
2969        let opts = ParserOptions::default();
2970        let new_pos = parse_html_block_with_wrapper(
2971            &mut builder,
2972            &lines,
2973            0,
2974            block_type,
2975            0,
2976            SyntaxKind::HTML_BLOCK,
2977            &opts,
2978        );
2979
2980        assert_eq!(new_pos, 1);
2981    }
2982
#[test]
fn test_parse_div_block() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Open tag, one content line, close tag: all three lines belong to
    // the block.
    let source = "<div>\ncontent\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    assert_eq!(next_line, 3);
}
3003
#[test]
fn test_parse_html_block_no_closing() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The `<div>` is never closed; the input simply ends.
    let source = "<div>\ncontent\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    // With no closing tag, every remaining line is swallowed by the block.
    assert_eq!(next_line, 2);
}
3025
#[test]
fn test_parse_div_block_nested_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Pandoc dialect, nested divs: the block must end at the OUTER
    // `</div>` rather than at the first `</div>` encountered. A
    // CommonMark-style "first close wins" scan gives the wrong answer
    // here; Pandoc's div parser tracks depth (mirrors `htmlInBalanced`).
    let source =
        "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Passing `false` for is_commonmark selects the Pandoc dialect.
    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &opts,
    );

    // Nine lines in total, every one of them consumed: outer-open, blank,
    // inner-open, blank, content, blank, inner-close, blank, outer-close.
    assert_eq!(next_line, 9);
}
3054
#[test]
fn test_parse_div_block_same_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // `<div>foo</div>` on one line has one open and one close, so the
    // depth returns to zero and the block ends on that first line.
    // Guards against depth-aware tracking regressing this case.
    let source = "<div>foo</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &opts,
    );

    assert_eq!(next_line, 1);
}
3076
#[test]
fn test_commonmark_verbatim_first_close() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // CommonMark §4.6 type-1 (verbatim tags such as `<script>`): the
    // block ends at the first matching close tag, with no depth
    // tracking. The JS string below hides a fake inner `<script>`; it
    // must not delay the close past the first `</script>`.
    let source = "<script>\nlet x = '<script>';\n</script>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Passing `true` for is_commonmark selects the CommonMark dialect.
    let opener = try_parse_html_block_start(lines[0], true).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    // The first `</script>` sits on line index 2, so the block covers
    // three lines and the next position is 3.
    assert_eq!(next_line, 3);
}
3102
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag is spread over several lines and its terminating `>`
    // sits alone on the last of them:
    //
    //   <div
    //     id="x"
    //     class="y"
    //   >
    //
    //   foo
    //
    //   </div>
    //
    // Lines 0..=3 make up the open tag; content begins at line 4.
    let source = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &opts,
    );

    // Eight lines consumed: four open-tag lines (`<div`, `  id="x"`,
    // `  class="y"`, `>`), then blank, foo, blank, </div>.
    assert_eq!(next_line, 8);

    let root = crate::syntax::SyntaxNode::new_root(tree.finish());

    // The attribute bytes must be exposed as a structural HTML_ATTRS
    // region in the CST (so the salsa anchor walk picks up `id="x"`).
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token must reproduce the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3155
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag is spread over several lines and its terminating `>`
    // shares the final attribute line. Case 0262 already covers this
    // shape; this test pins it and additionally checks that HTML_ATTRS
    // is exposed structurally.
    let source = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    let opener = try_parse_html_block_start(lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &opts,
    );

    // Five lines consumed: three open-tag lines (the last one carrying
    // `>`), then foo, then </div>.
    assert_eq!(next_line, 5);

    let root = crate::syntax::SyntaxNode::new_root(tree.finish());

    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenating every token must reproduce the input exactly.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3196
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // An unclosed `<div>` followed by a content line, a blank line, and
    // more text.
    let source = "<div>\nfoo\n\nbar\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let opts = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Passing `true` for is_commonmark selects the CommonMark dialect.
    let opener = try_parse_html_block_start(lines[0], true).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut tree,
        &lines,
        0,
        opener,
        0,
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    // The block holds `<div>\nfoo\n` and ends at the blank line, which is
    // line index 2.
    assert_eq!(next_line, 2);
}
3218}