// panache_parser/parser/blocks/html_blocks.rs
//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10 ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::attributes::emit_html_attrs_node;
13use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
14
/// HTML block-level tag names as defined by the CommonMark spec (§4.6,
/// start condition 6).
///
/// A line whose first tag carries one of these names starts an HTML block.
/// Entries are lowercase ASCII and kept in alphabetical order; lookups in
/// this file lowercase (or compare case-insensitively) before matching.
///
/// NOTE(review): the list contains `source` and lacks `search`, which
/// corresponds to the CommonMark 0.30 type-6 set — confirm which spec
/// revision this parser targets before changing membership.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
81
/// Tags whose body is raw/verbatim content: no Markdown processing happens
/// between the opening tag and its matching close.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
84
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`).
///
/// Pandoc-markdown uses this narrower set rather than CommonMark §4.6
/// type-6: it omits a number of CM type-6 tags (e.g. `dialog`, `legend`,
/// `optgroup`, `option`, `frame`, `link`, `param`, `base`, `basefont`,
/// `menuitem`) that pandoc treats as raw inline HTML, and adds a few pandoc
/// keeps as block-level (`canvas`, `hgroup`, `isindex`, `meta`, `output`).
///
/// Pandoc's `eitherBlockOrInline` set (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`,
/// `applet`, plus the void elements `embed`, `area`, `source`, `track`
/// and the verbatim `script`) is tracked separately as
/// [`PANDOC_INLINE_BLOCK_TAGS`]. Those tags act as block starters at
/// fresh-block positions but stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries are lowercase ASCII and kept in alphabetical order.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
162
163/// Whether `name` (case-insensitive) is one of the HTML block-level tags
164/// recognized by CommonMark §4.6 type-6.
165pub fn is_html_block_tag_name(name: &str) -> bool {
166 let lower = name.to_ascii_lowercase();
167 BLOCK_TAGS.contains(&lower.as_str())
168}
169
170/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
171/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
172/// Used by the pandoc-native projector's `split_html_block_by_tags` to
173/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
174/// the block — block-level tags emit as separate `RawBlock` entries;
175/// inline tags stay inline in the surrounding `Plain` content.
176pub fn is_pandoc_block_tag_name(name: &str) -> bool {
177 let lower = name.to_ascii_lowercase();
178 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
179}
180
/// Pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`). At top level (or after a blank line)
/// pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but
/// inside an existing HTML block once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Void elements (`area`, `embed`, `source`, `track`) live in
/// [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
/// rule as non-void inline-block tags but emit a single RawBlock per
/// instance instead of a matched-pair lift.
///
/// `script` is omitted because it is already verbatim (handled by the
/// `<script>...</script>` raw-text path) and the strict-block check
/// fires first regardless.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
205
206/// Whether `name` (case-insensitive) is one of pandoc's
207/// `eitherBlockOrInline` tags (excluding void elements and `script`;
208/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
209pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
210 let lower = name.to_ascii_lowercase();
211 PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
212}
213
/// Pandoc's void-element subset of `eitherBlockOrInline` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`'s void list
/// minus those handled elsewhere: `br` and `wbr` are inline-only;
/// `img` and `input` are inline-only; HTML void elements that pandoc
/// classifies as `eitherBlockOrInline` are `area`, `embed`, `source`,
/// `track`).
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
229
230/// Whether `name` (case-insensitive) is one of pandoc's void
231/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
232pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
233 let lower = name.to_ascii_lowercase();
234 PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
235}
236
237/// Whether the given tag name is eligible for the Phase 6 / Fix #4
238/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
239/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
240/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
241/// and NOT void. These are the tags where pandoc parses the body as
242/// fresh markdown between RawBlock emissions of the open/close tags —
243/// exactly the shape we can lift into structural CST children.
244///
245/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
246/// additional gate at the lift-gate site: the lift is abandoned when
247/// the body's first non-blank content is a void block tag at a
248/// fresh-block position (`<video>\n<source ...>\n</video>` projects
249/// per-tag rather than matched-pair, mirroring pandoc).
250///
251/// `<div>` is intentionally excluded — it has its own lift path
252/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
253/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
254pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
255 let lower = name.to_ascii_lowercase();
256 if VERBATIM_TAGS.contains(&lower.as_str()) {
257 return false;
258 }
259 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
260 return false;
261 }
262 if lower == "div" {
263 return false;
264 }
265 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
266 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
267}
268
269/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
270/// — anything that has an opening and a matching closing form whose
271/// `</tag>` would be recognized by the dispatcher as a separate block
272/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
273/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
274/// Void tags are excluded — they have no close form.
275///
276/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
277/// an open inside the buffer whose close would otherwise interrupt the
278/// list item mid-construct.
279pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
280 let lower = name.to_ascii_lowercase();
281 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
282 return false;
283 }
284 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
285 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
286 || VERBATIM_TAGS.contains(&lower.as_str())
287}
288
289/// Open-tag-attribute tokenization gate for non-div strict-block tags
290/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
291/// open tag is eligible for finer-grained tokenization
292/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
293/// driving the full body lift — that's the `bq_clean_lift` path. The
294/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
295/// the salsa anchor index.
296///
297/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
298/// wrapper) regardless of bq depth, so this gate skips it.
299fn bq_strict_attr_emit_tag_name(
300 wrapper_kind: SyntaxKind,
301 block_type: &HtmlBlockType,
302 bq_depth: usize,
303) -> Option<&str> {
304 if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
305 return None;
306 }
307 match block_type {
308 HtmlBlockType::BlockTag {
309 tag_name,
310 is_verbatim: false,
311 closed_by_blank_line: false,
312 depth_aware: true,
313 closes_at_open_tag: false,
314 is_closing: false,
315 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
316 _ => None,
317 }
318}
319
/// Information about a detected HTML block opening.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: `<!-- ... -->`.
    Comment,
    /// Processing instruction: `<? ... ?>`.
    ProcessingInstruction,
    /// Declaration: `<!...>`.
    Declaration,
    /// CDATA section: `<![CDATA[ ... ]]>`.
    CData,
    /// Block-level tag (CommonMark types 6/1 — `tag_name` is one of
    /// `BLOCK_TAGS` or `VERBATIM_TAGS`).
    BlockTag {
        /// Tag name as detected at the start position.
        tag_name: String,
        /// Whether the tag is one of the raw-text `VERBATIM_TAGS`.
        is_verbatim: bool,
        /// When set, CommonMark §4.6 type-6 end semantics apply (the block
        /// ends at a blank line); otherwise the legacy "ends at matching
        /// `</tag>`" semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored when
        /// `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to end-of-input.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror
        /// pandoc's `isInlineTag` special cases (e.g. `</script>` is inline
        /// even when `<script>` is not — pandoc treats the close-form as
        /// always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
364
365/// Try to detect an HTML block opening from content.
366/// Returns block type if this is a valid HTML block start.
367///
368/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
369/// accept closing tags (`</div>`), type-6 blocks end at the next blank
370/// line (rather than a matching close tag), and type 7 is recognized.
371pub(crate) fn try_parse_html_block_start(
372 content: &str,
373 is_commonmark: bool,
374) -> Option<HtmlBlockType> {
375 let trimmed = strip_leading_spaces(content);
376
377 // Must start with <
378 if !trimmed.starts_with('<') {
379 return None;
380 }
381
382 // HTML comment
383 if trimmed.starts_with("<!--") {
384 return Some(HtmlBlockType::Comment);
385 }
386
387 // Processing instruction
388 if trimmed.starts_with("<?") {
389 return Some(HtmlBlockType::ProcessingInstruction);
390 }
391
392 // CDATA section — CommonMark dialect only. Pandoc-markdown does not
393 // recognize bare CDATA as a raw HTML block; the literal bytes fall
394 // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
395 // text is parsed as inline markdown, etc).
396 if is_commonmark && trimmed.starts_with("<![CDATA[") {
397 return Some(HtmlBlockType::CData);
398 }
399
400 // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
401 // does not recognize bare declarations as raw HTML blocks (its
402 // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
403 // tag-shaped blocks); the bytes fall through to paragraph parsing.
404 if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
405 let after_bang = &trimmed[2..];
406 if after_bang.chars().next()?.is_ascii_alphabetic() {
407 return Some(HtmlBlockType::Declaration);
408 }
409 }
410
411 // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
412 // Pandoc-native recognizes standalone closing forms of strict-block tags
413 // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
414 // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
415 // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
416 // the open-tag line via `closes_at_open_tag: true`.
417 if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
418 let tag_lower = tag_name.to_lowercase();
419 let is_closing = trimmed.starts_with("</");
420
421 // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
422 // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
423 // Unlike inline-block / void closes, these CAN interrupt a running
424 // paragraph (the dispatcher's `cannot_interrupt` only covers the
425 // inline-block / void categories). Inline-block / void closes are
426 // handled by their own branches further below.
427 if !is_commonmark
428 && is_closing
429 && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
430 || VERBATIM_TAGS.contains(&tag_lower.as_str()))
431 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
432 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
433 {
434 return Some(HtmlBlockType::BlockTag {
435 tag_name: tag_lower,
436 is_verbatim: false,
437 closed_by_blank_line: false,
438 depth_aware: false,
439 closes_at_open_tag: true,
440 is_closing: true,
441 });
442 }
443
444 // Under Pandoc, remaining closing forms (truly inline-only tags like
445 // `</em>`, `</span>`) are not block starts — fall through to the
446 // existing inline-html path. Inline-block + void closes are caught
447 // by the dedicated branches further below.
448 if !is_commonmark
449 && is_closing
450 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
451 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
452 {
453 return None;
454 }
455
456 // Check if it's a block-level tag. Pandoc and CommonMark disagree on
457 // membership: pandoc's `blockHtmlTags` (see
458 // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
459 // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
460 // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
461 let is_block_tag = if is_commonmark {
462 BLOCK_TAGS.contains(&tag_lower.as_str())
463 } else {
464 PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
465 };
466 if is_block_tag {
467 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
468 return Some(HtmlBlockType::BlockTag {
469 tag_name: tag_lower,
470 is_verbatim,
471 closed_by_blank_line: is_commonmark && !is_verbatim,
472 depth_aware: !is_commonmark,
473 closes_at_open_tag: false,
474 is_closing,
475 });
476 }
477
478 // Pandoc dialect also treats `eitherBlockOrInline` tags as block
479 // starters at fresh-block positions. The block dispatcher caller
480 // gates these as `cannot_interrupt` (mirrors pandoc — they never
481 // interrupt a running paragraph; only start a fresh block when
482 // following a blank line or at document start). Closing forms
483 // (`</video>`) emit as a single-line `RawBlock` with no balanced
484 // match — pandoc-native pins this for standalone closes.
485 if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
486 return Some(HtmlBlockType::BlockTag {
487 tag_name: tag_lower,
488 is_verbatim: false,
489 closed_by_blank_line: false,
490 depth_aware: !is_closing,
491 closes_at_open_tag: is_closing,
492 is_closing,
493 });
494 }
495
496 // Pandoc dialect also recognizes the void subset of
497 // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
498 // These have no closing tag, so the parser closes the block
499 // immediately on the open-tag line; the projector's
500 // `split_html_block_by_tags` handles the same-line splitting
501 // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
502 // non-void inline-block tags, void tags never interrupt a
503 // running paragraph (gated as `cannot_interrupt` in the
504 // dispatcher). Closing forms (`</embed>`) — semantically
505 // nonsensical for void elements — pandoc still emits as a
506 // single-line `RawBlock`; mirror that.
507 if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
508 return Some(HtmlBlockType::BlockTag {
509 tag_name: tag_lower,
510 is_verbatim: false,
511 closed_by_blank_line: false,
512 depth_aware: false,
513 closes_at_open_tag: true,
514 is_closing,
515 });
516 }
517
518 // Also accept verbatim tags even if not in BLOCK_TAGS list — but
519 // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
520 // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
521 // do not start a type-1 block. Letting `</pre>` through here would
522 // wrongly interrupt a paragraph.
523 if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
524 return Some(HtmlBlockType::BlockTag {
525 tag_name: tag_lower,
526 is_verbatim: true,
527 closed_by_blank_line: false,
528 depth_aware: !is_commonmark,
529 closes_at_open_tag: false,
530 is_closing: false,
531 });
532 }
533 }
534
535 // Type 7 (CommonMark only): complete open or close tag on a line by
536 // itself, tag name not in the type-1 verbatim list.
537 if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
538 {
539 let rest = &trimmed[end..];
540 let only_ws = rest
541 .bytes()
542 .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
543 if only_ws {
544 // Reject if the tag name belongs to the type-1 verbatim set
545 // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
546 // type-1 starts above, so seeing one here means the opener
547 // had a different shape (e.g. `<pre/>` self-closing) that
548 // shouldn't trigger type 7 either. Conservatively skip.
549 let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
550 let name_end = leading
551 .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
552 .unwrap_or(leading.len());
553 let name = leading[..name_end].to_ascii_lowercase();
554 if !VERBATIM_TAGS.contains(&name.as_str()) {
555 return Some(HtmlBlockType::Type7);
556 }
557 }
558 }
559
560 None
561}
562
/// Extract the tag name for HTML-block-start detection.
///
/// Accepts both opening (`<tag>`) and closing (`</tag>`) forms when
/// `accept_closing` is true (CommonMark §4.6 type 6 allows either); with
/// `accept_closing == false` a closing form yields `None`.
///
/// The name must be followed by a space, tab, line ending, `>`, or `/>` per
/// the spec — approximated here by ending the name at the first space, tab,
/// `\n`, `\r`, `>` or `/` (or at end of input). Only these ASCII delimiters
/// end a name, the same boundary set `count_tag_balance` uses; other Unicode
/// whitespace (e.g. U+00A0) stays part of the candidate and makes it invalid.
///
/// Returns the name with its original casing, or `None` when `text` does
/// not start with `<`, the name is empty, does not start with an ASCII
/// letter, or contains anything other than ASCII alphanumerics.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let name_region = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    let tag_end = name_region
        .find(|c: char| matches!(c, ' ' | '\t' | '\n' | '\r' | '>' | '/'))
        .unwrap_or(name_region.len());
    let tag_name = &name_region[..tag_end];

    // `next()?` bails out on an empty name (e.g. `<>` or `</ div>`).
    let mut chars = tag_name.chars();
    let starts_with_letter = chars.next()?.is_ascii_alphabetic();
    let is_valid = starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric());

    is_valid.then(|| tag_name.to_string())
}
607
608/// Whether this block type ends at a blank line (CommonMark types 6 & 7
609/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
610/// marker — only at end of input or the next blank line.
611fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
612 matches!(
613 block_type,
614 HtmlBlockType::Type7
615 | HtmlBlockType::BlockTag {
616 closed_by_blank_line: true,
617 ..
618 }
619 )
620}
621
622/// Check if a line contains the closing marker for the given HTML block type.
623/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
624/// blank-line-terminated types (6 in CommonMark, 7) never match here.
625fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
626 match block_type {
627 HtmlBlockType::Comment => line.contains("-->"),
628 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
629 HtmlBlockType::Declaration => line.contains('>'),
630 HtmlBlockType::CData => line.contains("]]>"),
631 HtmlBlockType::BlockTag {
632 tag_name,
633 closed_by_blank_line: false,
634 ..
635 } => {
636 // Look for closing tag </tagname>
637 let closing_tag = format!("</{}>", tag_name);
638 line.to_lowercase().contains(&closing_tag)
639 }
640 HtmlBlockType::BlockTag {
641 closed_by_blank_line: true,
642 ..
643 }
644 | HtmlBlockType::Type7 => false,
645 }
646}
647
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`.
///
/// Self-closing forms (`<tag .../>`) and tags whose name appears inside a
/// quoted attribute value are NOT counted — the scanner walks `<...>`
/// brackets and respects `"`/`'` quoting. Name matching is ASCII
/// case-insensitive, and the name must be followed by a space, tab, `\n`,
/// `\r`, `>`, `/` or end of input.
///
/// A `<` that cannot start markup (not followed by an ASCII letter, `!` or
/// `?`; for `</`, not followed by an ASCII letter) is treated as literal
/// text, so `a < b` does not swallow a tag that follows it on the same line.
/// An empty `tag_name` yields `(0, 0)`.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let tag = tag_name.as_bytes();
    if tag.is_empty() {
        return (0, 0);
    }

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut cursor = 0usize;

    while let Some(offset) = bytes[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + offset;
        let is_close = bytes.get(lt + 1) == Some(&b'/');
        let name_start = lt + 1 + usize::from(is_close);
        let name_end = name_start + tag.len();

        let matched = bytes
            .get(name_start..name_end)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag));
        let is_boundary = matched
            && matches!(
                bytes.get(name_end).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // A `<` that does not begin a tag, comment, declaration or
        // processing instruction is plain text: step over it instead of
        // scanning to the next `>` (which may belong to a real tag).
        let starts_markup = matched
            || match bytes.get(name_start) {
                Some(b) if b.is_ascii_alphabetic() => true,
                Some(b'!' | b'?') => !is_close,
                _ => false,
            };
        if !starts_markup {
            cursor = lt + 1;
            continue;
        }

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { name_end } else { lt + 1 };
        let mut quote: Option<u8> = None;
        let mut found_gt = false;
        while let Some(&b) = bytes.get(j) {
            match (quote, b) {
                (Some(q), _) if b == q => quote = None,
                (None, b'"' | b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    break;
                }
                _ => {}
            }
            j += 1;
        }
        let self_close = found_gt && j > lt + 1 && bytes[j - 1] == b'/';

        // Counted even when the bracket is unterminated, so an open tag
        // whose attributes continue on a later line still registers.
        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if !found_gt {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to scan.
            break;
        }
        cursor = j + 1;
    }

    (opens, closes)
}
728
729/// Pandoc-dialect lift for HTML comments / processing instructions
730/// whose close marker is followed by additional bytes (same-line
731/// trailing or following lines). Pandoc-native emits a `RawBlock` for
732/// the marker bytes only, then parses the remainder as fresh blocks.
733///
734/// Returns `Some(consumed_lines)` when the split fires (caller must
735/// NOT enter the legacy emission); `None` to fall back to the legacy
736/// path (no close marker found, or no trailing content to split).
737///
738/// CST shape on success:
739/// ```text
740/// HTML_BLOCK
741/// HTML_BLOCK_TAG (open) // line[0] up to and incl close marker
742/// TEXT "<!-- hi -->" // or with HTML_BLOCK_CONTENT in between
743/// ... // for multi-line `<!--\n…\n-->` shape
744/// <sibling blocks> // recursive parse of trailing + lines[M+1..]
745/// ```
746/// The CST node kind to emit for an opaque single-construct HTML block.
747/// Under `Dialect::Pandoc`, comments, processing instructions, and
748/// verbatim raw-text elements (`<pre>`/`<script>`/`<style>`/`<textarea>`)
749/// each project to exactly one `RawBlock "html"`; tagging the wrapper
750/// `HTML_BLOCK_RAW` lets the pandoc-native projector route by kind instead
751/// of re-sniffing the leading bytes. This changes only the wrapper `u16` —
752/// the child tokens are emitted byte-for-byte identically, so the CST stays
753/// lossless (the `HTML_BLOCK_DIV` precedent). The behavioral `wrapper_kind`
754/// stays `HTML_BLOCK` everywhere else in `parse_html_block_with_wrapper`, so
755/// no lift gate changes. CommonMark dialect keeps the opaque `HTML_BLOCK`
756/// shape.
757fn html_block_node_kind(
758 wrapper_kind: SyntaxKind,
759 block_type: &HtmlBlockType,
760 dialect: crate::options::Dialect,
761) -> SyntaxKind {
762 if wrapper_kind == SyntaxKind::HTML_BLOCK
763 && dialect == crate::options::Dialect::Pandoc
764 && matches!(
765 block_type,
766 HtmlBlockType::Comment
767 | HtmlBlockType::ProcessingInstruction
768 | HtmlBlockType::BlockTag {
769 is_verbatim: true,
770 ..
771 }
772 )
773 {
774 SyntaxKind::HTML_BLOCK_RAW
775 } else {
776 wrapper_kind
777 }
778}
779
780fn try_parse_comment_pi_with_trailing_split(
781 builder: &mut GreenNodeBuilder<'static>,
782 lines: &[&str],
783 start_pos: usize,
784 block_type: &HtmlBlockType,
785 wrapper_kind: SyntaxKind,
786 bq_depth: usize,
787 config: &ParserOptions,
788) -> Option<usize> {
789 let marker: &str = match block_type {
790 HtmlBlockType::Comment => "-->",
791 HtmlBlockType::ProcessingInstruction => "?>",
792 _ => return None,
793 };
794
795 // Find the close marker in the bq-stripped line content. For
796 // bq_depth == 0 the inner content equals the raw line; for
797 // bq_depth > 0 we look past the `>` markers stripped by the
798 // outer dispatcher (line 0) and emitted as bq prefix below
799 // (lines > 0). `marker_end_in_inner` is the byte offset of the
800 // first byte AFTER the close marker, measured from the start
801 // of the inner (post-strip) content.
802 let mut close_line_idx: Option<usize> = None;
803 let mut marker_end_in_inner: usize = 0;
804 for (offset, line) in lines[start_pos..].iter().enumerate() {
805 let inner = if bq_depth > 0 {
806 strip_n_blockquote_markers(line, bq_depth)
807 } else {
808 line
809 };
810 if let Some(pos) = inner.find(marker) {
811 close_line_idx = Some(start_pos + offset);
812 marker_end_in_inner = pos + marker.len();
813 break;
814 }
815 }
816 let close_line_idx = close_line_idx?;
817 let close_line = lines[close_line_idx];
818 let close_inner = if bq_depth > 0 {
819 strip_n_blockquote_markers(close_line, bq_depth)
820 } else {
821 close_line
822 };
823 let close_prefix_len = close_line.len() - close_inner.len();
824 let trailing = &close_inner[marker_end_in_inner..];
825
826 // Only fire when there is non-whitespace content AFTER the close
827 // marker on the close line. The legacy path correctly handles
828 // the close-line-ends-at-close-marker shapes (`-->\n` followed
829 // by separate blocks); only the same-line-trailing case needs
830 // structural splitting. Trailing-whitespace-only handling
831 // (`--> \n`) is a projector-side trim — separate concern.
832 let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
833 if !has_non_ws_trailing {
834 return None;
835 }
836
837 builder.start_node(html_block_node_kind(wrapper_kind, block_type, config.dialect).into());
838
839 // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
840 // middle `HTML_BLOCK_CONTENT` lines between open and close. The
841 // close `HTML_BLOCK_TAG` carries only the bytes up to and
842 // including the close marker — trailing bytes go to the sibling.
843 if close_line_idx == start_pos {
844 // Same-line shape: one HTML_BLOCK_TAG containing the close
845 // marker's bytes. The newline lives on the trailing sibling.
846 // Line 0's bq prefix (if any) was already emitted by the
847 // outer dispatcher; emit only the inner marker bytes.
848 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
849 let close_part = &close_inner[..marker_end_in_inner];
850 if !close_part.is_empty() {
851 builder.token(SyntaxKind::TEXT.into(), close_part);
852 }
853 builder.finish_node();
854 } else {
855 // Multi-line shape: open tag covers lines[start_pos..close],
856 // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
857 // only the marker bytes. Line 0's bq prefix was emitted by
858 // the outer dispatcher; subsequent lines (middle + close)
859 // need bq prefix re-emission inside the wrapper.
860 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
861 let first_line = lines[start_pos];
862 let first_inner = if bq_depth > 0 {
863 strip_n_blockquote_markers(first_line, bq_depth)
864 } else {
865 first_line
866 };
867 let (line_no_nl, nl) = strip_newline(first_inner);
868 if !line_no_nl.is_empty() {
869 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
870 }
871 if !nl.is_empty() {
872 builder.token(SyntaxKind::NEWLINE.into(), nl);
873 }
874 builder.finish_node();
875
876 if close_line_idx > start_pos + 1 {
877 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
878 for content_line in &lines[start_pos + 1..close_line_idx] {
879 emit_html_block_line(builder, content_line, bq_depth);
880 }
881 builder.finish_node();
882 }
883
884 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
885 if bq_depth > 0 && close_prefix_len > 0 {
886 emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
887 }
888 let close_part = &close_inner[..marker_end_in_inner];
889 if !close_part.is_empty() {
890 builder.token(SyntaxKind::TEXT.into(), close_part);
891 }
892 builder.finish_node();
893 }
894
895 builder.finish_node(); // HTML_BLOCK
896
897 // Recursively parse JUST the trailing bytes on the close line
898 // and graft top-level children as siblings of the HTML_BLOCK we
899 // just closed. We do NOT consume subsequent lines here — the
900 // outer dispatcher continues from `close_line_idx + 1` and
901 // handles container-boundary lines (`:::` div closes, blockquote
902 // markers, list-marker continuations) correctly. Multi-line
903 // softbreak continuation (`<!-- --> trailing\nmore\n` →
904 // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
905 // outer dispatcher sees `more` after the close line and starts
906 // a fresh paragraph. Refdefs flow through from the outer config
907 // (same pattern as `emit_html_block_body_lifted_inner`).
908 if !trailing.is_empty() {
909 let mut inner_options = config.clone();
910 let refdefs = config.refdef_labels.clone().unwrap_or_default();
911 inner_options.refdef_labels = Some(refdefs.clone());
912 let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
913 let mut bq = None;
914 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
915 }
916
917 Some(close_line_idx + 1)
918}
919
920/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
921/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
922/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
923/// identical to the source either way; only the wrapper retag changes.
924pub(crate) fn parse_html_block_with_wrapper(
925 builder: &mut GreenNodeBuilder<'static>,
926 lines: &[&str],
927 start_pos: usize,
928 block_type: HtmlBlockType,
929 prefix: &ContainerPrefix,
930 wrapper_kind: SyntaxKind,
931 config: &ParserOptions,
932) -> usize {
933 let bq_depth = prefix.bq_depth();
934 // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
935 // closes the RawBlock at the close marker (`-->` / `?>`) and parses
936 // any subsequent bytes (same-line trailing or following lines) as
937 // fresh blocks. The legacy path absorbs them into the HTML block
938 // wrapper, producing one oversized RawBlock. Handle the split here
939 // before entering the legacy emission so the CST encodes the
940 // sibling structure.
941 if config.dialect == crate::options::Dialect::Pandoc
942 && matches!(
943 block_type,
944 HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
945 )
946 && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
947 builder,
948 lines,
949 start_pos,
950 &block_type,
951 wrapper_kind,
952 bq_depth,
953 config,
954 )
955 {
956 return consumed;
957 }
958
959 // Start HTML block. The node kind may retag to `HTML_BLOCK_RAW` for
960 // single-construct opaque shapes (comment / PI / verbatim) under
961 // Pandoc; `wrapper_kind` itself stays the behavioral gate below so no
962 // lift logic changes and the child tokens stay byte-identical.
963 builder.start_node(html_block_node_kind(wrapper_kind, &block_type, config.dialect).into());
964
965 let first_line = lines[start_pos];
966 let blank_terminated = ends_at_blank_line(&block_type);
967
968 // The block dispatcher has already emitted the bq prefix tokens for
969 // the first line; emit only the inner content as TEXT to keep the
970 // CST byte-equal to the source. List-marker bytes are stripped only
971 // when this dispatch fires on a list-marker line — for
972 // continuation-line dispatches (the much more common case) the
973 // leading indent is inner content, not upstream-emitted prefix.
974 let first_inner = prefix.strip_line_0_for_emission(first_line);
975
976 // Detect a multi-line open tag.
977 // - `<div>` (Pandoc lift): we tokenize each line structurally so the
978 // salsa anchor walk picks up `id` from the HTML_ATTRS region.
979 // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
980 // `<section>`, `<header>`, …): same structural emission, exposing
981 // `id` to the salsa anchor walk and enabling the body lift below.
982 // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
983 // without this, the parser closes the block after line 0 and the
984 // remainder of the open tag falls into following paragraphs;
985 // pandoc-native treats the whole multi-line open tag as a single
986 // `RawBlock`. Emission for void tags uses simple per-line
987 // TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
988 // from void tags).
989 let multiline_open_end = match (wrapper_kind, &block_type) {
990 (SyntaxKind::HTML_BLOCK_DIV, _) => {
991 find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
992 }
993 (
994 _,
995 HtmlBlockType::BlockTag {
996 tag_name,
997 closes_at_open_tag: true,
998 ..
999 },
1000 ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
1001 (
1002 _,
1003 HtmlBlockType::BlockTag {
1004 tag_name,
1005 is_verbatim: false,
1006 closed_by_blank_line: false,
1007 depth_aware: true,
1008 closes_at_open_tag: false,
1009 is_closing: false,
1010 },
1011 ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
1012 find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
1013 }
1014 _ => None,
1015 };
1016
1017 // Set up depth-aware close tracking when the block type asks for it
1018 // (Pandoc dialect, balanced same-name tag matching). A `None` means
1019 // we fall back to the legacy "first matching close" path via
1020 // `is_closing_marker`. Computed up front so the lift-mode gate
1021 // below can decide whether the open line already balances the
1022 // block (same-line `<div>...</div>`).
1023 let depth_aware_tag: Option<String> = match &block_type {
1024 HtmlBlockType::BlockTag {
1025 tag_name,
1026 closed_by_blank_line: false,
1027 depth_aware: true,
1028 ..
1029 } => Some(tag_name.clone()),
1030 _ => None,
1031 };
1032 let mut depth: i64 = 1;
1033 if let Some(tag_name) = &depth_aware_tag {
1034 // Sum opens/closes across all open-tag lines (single-line: just
1035 // line 0; multi-line: lines 0..=end_line_idx).
1036 let last_open_line = multiline_open_end.unwrap_or(start_pos);
1037 let mut opens = 0usize;
1038 let mut closes = 0usize;
1039 for line in &lines[start_pos..=last_open_line] {
1040 let inner = prefix.strip(line);
1041 let (o, c) = count_tag_balance(inner, tag_name);
1042 opens += o;
1043 closes += c;
1044 }
1045 depth = opens as i64 - closes as i64;
1046 }
1047
1048 // Same-line `<div>foo</div>` shape: the open line balances the
1049 // block under depth-aware tracking. We can lift this structurally
1050 // only when the open-tag trailing has exactly one `</div>` close,
1051 // zero `<div>` opens, and no non-whitespace content after the
1052 // close. Other same-line shapes (nested, trailing text, malformed)
1053 // fall through to the byte-reparse path.
1054 let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1055 && multiline_open_end.is_none()
1056 && depth_aware_tag.is_some()
1057 && depth <= 0;
1058 let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1059 let (line_without_newline, _) = strip_newline(first_inner);
1060 probe_same_line_lift(line_without_newline, "div")
1061 };
1062
1063 // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1064 // `<nav>`, …): the body parses as fresh markdown between RawBlock
1065 // emissions of the open/close tags. Covers the clean multi-line
1066 // shape (open tag stands alone on its line), open-trailing
1067 // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1068 // and same-line (`<form>foo</form>`). Multi-line open and
1069 // blockquote-wrapped non-div shapes still fall through to the
1070 // byte-walker path.
1071 let strict_block_tag_name: Option<&str> =
1072 if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1073 match &block_type {
1074 HtmlBlockType::BlockTag {
1075 tag_name,
1076 is_verbatim: false,
1077 closed_by_blank_line: false,
1078 depth_aware: true,
1079 closes_at_open_tag: false,
1080 is_closing: false,
1081 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1082 _ => None,
1083 }
1084 } else {
1085 None
1086 };
1087 // Same-line `<form>foo</form>` shape: the open line already
1088 // balances the block (`depth <= 0`). Lift only when the trailing
1089 // bytes after the open `>` end with `</tag>` and contain exactly
1090 // one close + zero nested opens.
1091 let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1092 multiline_open_end.is_none() && depth <= 0 && {
1093 let (line_no_nl, _) = strip_newline(first_inner);
1094 probe_same_line_lift(line_no_nl, name)
1095 }
1096 });
1097 // Strict-block lift gate: accept (a) a multi-line open tag spanning
1098 // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1099 // trailing single-line open (depth > 0, open `>` is present with
1100 // quote-aware matching), or (c) a safe same-line shape. For
1101 // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1102 // …) the lift additionally abandons when the body starts at a
1103 // fresh-block position with a void block tag — pandoc-native pins
1104 // per-tag emission rather than a matched-pair lift in that case.
1105 let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1106 let (line_no_nl, _) = strip_newline(first_inner);
1107 let shape_ok = if multiline_open_end.is_some() {
1108 // `find_multiline_open_end` already verified the open tag
1109 // closes with a quote-aware `>` somewhere in lines
1110 // `start_pos+1..=end`. No same-line trailing content to
1111 // probe; defer trailing-on-close-`>`-line handling to a
1112 // future session (rare in practice).
1113 true
1114 } else if depth > 0 {
1115 probe_open_tag_line_has_close_gt(line_no_nl, name)
1116 } else {
1117 same_line_strict_lift_safe
1118 };
1119 if !shape_ok {
1120 return false;
1121 }
1122 if !is_pandoc_inline_block_tag_name(name) {
1123 return true;
1124 }
1125 !inline_block_void_interior_abandons(
1126 first_inner,
1127 lines,
1128 start_pos,
1129 multiline_open_end,
1130 bq_depth,
1131 name,
1132 )
1133 });
1134
1135 // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1136 // are byte-equal to the non-bq same-line shape minus the leading
1137 // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1138 // The body has no inner newlines, so no bq prefix re-injection is
1139 // needed when grafting — `emit_html_block_body_lifted` (passing
1140 // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1141 // open-trailing) still fall through to the projector's byte
1142 // walker — they need per-line prefix injection.
1143 let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1144 && multiline_open_end.is_none()
1145 && depth_aware_tag.is_some()
1146 && depth <= 0
1147 {
1148 let (line_no_nl, _) = strip_newline(first_inner);
1149 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1150 if probe_same_line_lift(line_no_nl, "div") {
1151 Some("div")
1152 } else {
1153 None
1154 }
1155 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1156 match &block_type {
1157 HtmlBlockType::BlockTag {
1158 tag_name,
1159 is_verbatim: false,
1160 closed_by_blank_line: false,
1161 depth_aware: true,
1162 closes_at_open_tag: false,
1163 is_closing: false,
1164 } if is_pandoc_lift_eligible_block_tag(tag_name)
1165 && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1166 {
1167 // Inline-block tags (`<video>`, `<iframe>`, …) skip
1168 // the void-interior check at same-line — the shape
1169 // has no inner block content to interfere with.
1170 Some(tag_name.as_str())
1171 }
1172 _ => None,
1173 }
1174 } else {
1175 None
1176 }
1177 } else {
1178 None
1179 };
1180
1181 // Messy-shape lift inside a blockquote — covers open-trailing
1182 // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1183 // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1184 // including the multi-line-open variants (`> <div\n> id="x">foo\n>
1185 // body\n> </div>`) where the trailing is captured into `pre_content`
1186 // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1187 // The open line does NOT balance the block (depth > 0 after the
1188 // open line, distinguishing this from `same_line_bq_lift_tag` which
1189 // requires depth <= 0). The close line — possibly with leading body
1190 // text — closes the block when depth returns to 0. Body lines (incl.
1191 // open trailing and close leading) graft via prefix re-injection.
1192 let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1193 {
1194 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1195 Some("div")
1196 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1197 match &block_type {
1198 HtmlBlockType::BlockTag {
1199 tag_name,
1200 is_verbatim: false,
1201 closed_by_blank_line: false,
1202 depth_aware: true,
1203 closes_at_open_tag: false,
1204 is_closing: false,
1205 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1206 // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1207 // …) abandon the lift when the body starts at a
1208 // fresh-block position with a void block tag. Same gate
1209 // as the non-bq matched-pair lift (`strict_block_lift`).
1210 if is_pandoc_inline_block_tag_name(tag_name)
1211 && inline_block_void_interior_abandons(
1212 first_inner,
1213 lines,
1214 start_pos,
1215 multiline_open_end,
1216 bq_depth,
1217 tag_name,
1218 )
1219 {
1220 None
1221 } else {
1222 Some(tag_name.as_str())
1223 }
1224 }
1225 _ => None,
1226 }
1227 } else {
1228 None
1229 }
1230 } else {
1231 None
1232 };
1233
1234 // Multi-line open + matched close-on-the-open's-last-line shape inside
1235 // a blockquote (`> <div\n> id="x">foo</div>` and depth-aware variants:
1236 // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1237 // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1238 // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1239 // close is on the open's last line (`depth <= 0` after the open lines)
1240 // AND `multiline_open_end.is_some()`. The trailing bytes after the
1241 // last `>` get lifted into `pre_content` via
1242 // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1243 // new branch below splits `pre_content` at the matched close marker
1244 // and grafts body + close + any trailing siblings.
1245 let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1246 && multiline_open_end.is_some()
1247 && depth_aware_tag.is_some()
1248 && depth <= 0
1249 {
1250 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1251 Some("div")
1252 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1253 match &block_type {
1254 HtmlBlockType::BlockTag {
1255 tag_name,
1256 is_verbatim: false,
1257 closed_by_blank_line: false,
1258 depth_aware: true,
1259 closes_at_open_tag: false,
1260 is_closing: false,
1261 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1262 if is_pandoc_inline_block_tag_name(tag_name)
1263 && inline_block_void_interior_abandons(
1264 first_inner,
1265 lines,
1266 start_pos,
1267 multiline_open_end,
1268 bq_depth,
1269 tag_name,
1270 )
1271 {
1272 None
1273 } else {
1274 Some(tag_name.as_str())
1275 }
1276 }
1277 _ => None,
1278 }
1279 } else {
1280 None
1281 }
1282 } else {
1283 None
1284 };
1285
1286 // Whether this block participates in the Phase 6 structural lift
1287 // (recursively parse body as Pandoc markdown and graft children).
1288 // Covers `<div>` outside blockquote context. For same-line shapes
1289 // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1290 // keep the legacy single-HTML_BLOCK_TAG shape and let the
1291 // byte-reparse path handle projection.
1292 let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1293 && bq_depth == 0
1294 && (!is_same_line_div || same_line_div_lift_safe))
1295 || strict_block_lift
1296 || same_line_bq_lift_tag.is_some()
1297 || bq_messy_lift_tag.is_some()
1298 || bq_multiline_close_lift_tag.is_some();
1299
1300 // Trailing content from the open tag (after `>`). When the lift is
1301 // active and the open line is `<div ATTRS>foo\n`, this captures
1302 // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1303 // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1304 // shapes (same-line / blockquote-wrapped).
1305 let mut pre_content = String::new();
1306
1307 // Emit opening line(s)
1308 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1309
1310 if let Some(end_line_idx) = multiline_open_end {
1311 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1312 emit_multiline_open_tag_with_attrs(
1313 builder,
1314 lines,
1315 start_pos,
1316 end_line_idx,
1317 "div",
1318 bq_depth,
1319 lift_mode,
1320 &mut pre_content,
1321 );
1322 } else if let Some(name) = strict_block_tag_name
1323 && strict_block_lift
1324 {
1325 emit_multiline_open_tag_with_attrs(
1326 builder,
1327 lines,
1328 start_pos,
1329 end_line_idx,
1330 name,
1331 bq_depth,
1332 lift_mode,
1333 &mut pre_content,
1334 );
1335 } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1336 {
1337 // Multi-line open of a lift-eligible strict-block tag inside a
1338 // blockquote (`> <section\n> id=...>`). The non-bq
1339 // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1340 // covers the bq side so the open tag emits HTML_ATTRS regions
1341 // for `AttributeNode::cast` and the projector's canonicalizer.
1342 //
1343 // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1344 // call below: only push trailing bytes into `pre_content` when
1345 // the structural lift will consume them (bq messy lift). The
1346 // bq clean-lift requires `pre_content.is_empty()`, so for clean
1347 // multi-line opens the trailing is empty anyway and this is
1348 // a no-op.
1349 let lift_trailing =
1350 bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1351 emit_multiline_open_tag_with_attrs(
1352 builder,
1353 lines,
1354 start_pos,
1355 end_line_idx,
1356 name,
1357 bq_depth,
1358 lift_trailing,
1359 &mut pre_content,
1360 );
1361 } else {
1362 emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1363 }
1364 } else {
1365 let (line_without_newline, newline_str) = strip_newline(first_inner);
1366 if !line_without_newline.is_empty() {
1367 // For HTML_BLOCK_DIV, expose the open tag's attributes
1368 // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1369 // via the same descendants walk that handles fenced-div /
1370 // heading attrs. CST bytes stay byte-equal to source — we only
1371 // tokenize at finer granularity for matched div opens.
1372 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1373 let trailing =
1374 emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1375 if !trailing.is_empty() {
1376 pre_content.push_str(trailing);
1377 pre_content.push_str(newline_str);
1378 }
1379 } else if let Some(name) = strict_block_tag_name
1380 && strict_block_lift
1381 {
1382 let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1383 if !trailing.is_empty() {
1384 pre_content.push_str(trailing);
1385 pre_content.push_str(newline_str);
1386 }
1387 } else if let Some(name) =
1388 bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1389 {
1390 // Inside a blockquote, lift trailing bytes into
1391 // `pre_content` when either the same-line bq gate fires
1392 // (`> <tag>body</tag>` — handled by `same_line_closed`)
1393 // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1394 // </tag>` and butted-close — handled at the close-marker
1395 // site below). For the clean-shape bq lift the open has
1396 // no trailing bytes regardless, so `lift_trailing=true`
1397 // is a no-op there.
1398 let lift_trailing =
1399 same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1400 let trailing =
1401 emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1402 if lift_trailing && !trailing.is_empty() {
1403 pre_content.push_str(trailing);
1404 pre_content.push_str(newline_str);
1405 }
1406 } else {
1407 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1408 }
1409 }
1410 // When the open tag has trailing content under lift mode, the
1411 // newline belongs to that trailing line (it terminates the
1412 // synthetic body line, not the open tag). Don't double-emit.
1413 if pre_content.is_empty() && !newline_str.is_empty() {
1414 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1415 }
1416 }
1417
1418 builder.finish_node(); // HtmlBlockTag
1419
1420 // Check if opening line also contains closing marker. Blank-line-terminated
1421 // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1422 // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1423 // (`closes_at_open_tag: true`) close immediately — the block always
1424 // ends on the open-tag line since there is no closing tag to find.
1425 let void_block = matches!(
1426 &block_type,
1427 HtmlBlockType::BlockTag {
1428 closes_at_open_tag: true,
1429 ..
1430 }
1431 );
1432 // Void tags with a multi-line open close immediately after the open
1433 // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1434 // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1435 // a single RawBlock for the whole multi-line tag, with no following
1436 // content.
1437 if void_block && let Some(end_line_idx) = multiline_open_end {
1438 log::trace!(
1439 "HTML void block at line {} closes after multi-line open ending at line {}",
1440 start_pos + 1,
1441 end_line_idx + 1
1442 );
1443 builder.finish_node(); // HtmlBlock
1444 return end_line_idx + 1;
1445 }
1446 // Multi-line open with all matched closes on the open's last line:
1447 // `pre_content` holds the bytes after the last open `>` (lifted there
1448 // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1449 // When `depth <= 0` after the multi-line open and the trailing bytes
1450 // contain the depth-zero matched close, do the same-line lift on
1451 // `pre_content` directly. Mirrors the single-line `same_line_closed`
1452 // lift below — same body / close-marker / trailing-graft shape, just
1453 // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1454 //
1455 // The body bytes of `pre_content` come from the open's last line,
1456 // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1457 // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1458 // tag thus inherit the bq context without per-line prefix injection,
1459 // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1460 // both the non-bq and bq variants of this shape.
1461 if let Some(end_line_idx) = multiline_open_end
1462 && !blank_terminated
1463 && depth_aware_tag.is_some()
1464 && depth <= 0
1465 && lift_mode
1466 && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1467 && !pre_content.is_empty()
1468 {
1469 let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1470 Some("div")
1471 } else if strict_block_lift {
1472 strict_block_tag_name
1473 } else if let Some(name) = bq_multiline_close_lift_tag {
1474 Some(name)
1475 } else {
1476 None
1477 };
1478 if let Some(tag_name) = tag_name_opt {
1479 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1480 if let Some((leading, close_part)) =
1481 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1482 {
1483 let close_marker_end =
1484 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1485 let close_marker = &close_part[..close_marker_end];
1486 let same_line_trailing = &close_part[close_marker_end..];
1487 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1488 LastParaDemote::SkipTrailingBlanks
1489 } else {
1490 LastParaDemote::OnlyIfLast
1491 };
1492 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1493 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1494 if same_line_trailing.is_empty() {
1495 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1496 close_line.push_str(close_marker);
1497 close_line.push_str(post_nl);
1498 emit_html_block_line(builder, &close_line, 0);
1499 builder.finish_node();
1500 builder.finish_node(); // HtmlBlock
1501 } else {
1502 builder.token(SyntaxKind::TEXT.into(), close_marker);
1503 builder.finish_node(); // HTML_BLOCK_TAG
1504 builder.finish_node(); // HtmlBlock
1505
1506 let mut trailing_text =
1507 String::with_capacity(same_line_trailing.len() + post_nl.len());
1508 trailing_text.push_str(same_line_trailing);
1509 trailing_text.push_str(post_nl);
1510 let mut inner_options = config.clone();
1511 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1512 inner_options.refdef_labels = Some(refdefs.clone());
1513 let inner_root = crate::parser::parse_with_refdefs(
1514 &trailing_text,
1515 Some(inner_options),
1516 refdefs,
1517 );
1518 let mut bq = None;
1519 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1520 }
1521 return end_line_idx + 1;
1522 }
1523 }
1524 }
1525
1526 let same_line_closed = !blank_terminated
1527 && multiline_open_end.is_none()
1528 && (void_block
1529 || match &depth_aware_tag {
1530 Some(_) => depth <= 0,
1531 None => is_closing_marker(first_inner, &block_type),
1532 });
1533 if same_line_closed {
1534 log::trace!(
1535 "HTML block at line {} opens and closes on same line",
1536 start_pos + 1
1537 );
1538 // Same-line structural lift (div or non-div strict-block):
1539 // pre_content holds the bytes after the open `>` (including
1540 // the close `</tag>` and the trailing newline). Split into
1541 // body + close tag, emit body via recursive parse, emit close
1542 // tag as a sibling `HTML_BLOCK_TAG`.
1543 let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1544 None
1545 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1546 Some("div")
1547 } else if same_line_strict_lift_safe {
1548 strict_block_tag_name
1549 } else if let Some(name) = same_line_bq_lift_tag {
1550 // Bq same-line: body has no inner newlines so the standard
1551 // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1552 // sufficient. The bq prefix `> ` lives on the outer
1553 // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1554 Some(name)
1555 } else {
1556 None
1557 };
1558 if let Some(tag_name) = same_line_lift_tag {
1559 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1560 // Depth-aware split: handles `<tag>foo</tag>bar` (single
1561 // close, trailing text), `<tag>foo</tag></tag>` (matched
1562 // close + unmatched trailing close → sibling RawBlock),
1563 // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1564 // recursive body parse).
1565 if let Some((leading, close_part)) =
1566 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1567 {
1568 // `close_part` starts with `</tag` and contains the close
1569 // marker followed by any same-line trailing text. Split
1570 // off the close marker bytes (`</tag>`) so the close
1571 // `HTML_BLOCK_TAG` carries only those bytes; trailing
1572 // text is parsed and grafted as a sibling block at the
1573 // parent level (matches pandoc-native shape:
1574 // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1575 let close_marker_end =
1576 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1577 let close_marker = &close_part[..close_marker_end];
1578 let same_line_trailing = &close_part[close_marker_end..];
1579
1580 // Same-line is always close-butted; div demotes the
1581 // trailing Para→Plain via `SkipTrailingBlanks`.
1582 // Non-div strict-block uses `OnlyIfLast` (consistent
1583 // with butted-close — no trailing BLANK_LINE before
1584 // the close means the trailing Para demotes).
1585 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1586 LastParaDemote::SkipTrailingBlanks
1587 } else {
1588 LastParaDemote::OnlyIfLast
1589 };
1590 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1591 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1592 if same_line_trailing.is_empty() {
1593 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1594 close_line.push_str(close_marker);
1595 close_line.push_str(post_nl);
1596 emit_html_block_line(builder, &close_line, 0);
1597 builder.finish_node();
1598 builder.finish_node(); // HtmlBlock
1599 } else {
1600 // Close tag holds only the close-marker bytes;
1601 // trailing + newline graft as siblings of the
1602 // wrapper (matches pandoc's per-tag block split).
1603 builder.token(SyntaxKind::TEXT.into(), close_marker);
1604 builder.finish_node(); // HTML_BLOCK_TAG
1605 builder.finish_node(); // HtmlBlock
1606
1607 let mut trailing_text =
1608 String::with_capacity(same_line_trailing.len() + post_nl.len());
1609 trailing_text.push_str(same_line_trailing);
1610 trailing_text.push_str(post_nl);
1611 let mut inner_options = config.clone();
1612 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1613 inner_options.refdef_labels = Some(refdefs.clone());
1614 let inner_root = crate::parser::parse_with_refdefs(
1615 &trailing_text,
1616 Some(inner_options),
1617 refdefs,
1618 );
1619 let mut bq = None;
1620 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1621 }
1622 return start_pos + 1;
1623 }
1624 }
1625 builder.finish_node(); // HtmlBlock
1626 return start_pos + 1;
1627 }
1628
1629 let mut current_pos = multiline_open_end
1630 .map(|end| end + 1)
1631 .unwrap_or(start_pos + 1);
1632 let mut content_lines: Vec<&str> = Vec::new();
1633 let mut found_closing = false;
1634
1635 // Parse content until we find the closing marker
1636 while current_pos < lines.len() {
1637 let line = lines[current_pos];
1638 let (line_bq_depth, inner) = count_blockquote_markers(line);
1639
1640 // Only process lines at the same or deeper blockquote depth
1641 if line_bq_depth < bq_depth {
1642 break;
1643 }
1644
1645 // Blank-line-terminated blocks (types 6/7) end before the blank line.
1646 // The blank line itself is not part of the block.
1647 if blank_terminated && inner.trim().is_empty() {
1648 break;
1649 }
1650
1651 // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1652 // count opens/closes of the same tag name and only close when depth
1653 // returns to 0; otherwise fall back to substring-match on the line.
1654 let line_closes = match &depth_aware_tag {
1655 Some(tag_name) => {
1656 let (opens, closes) = count_tag_balance(inner, tag_name);
1657 depth += opens as i64;
1658 depth -= closes as i64;
1659 depth <= 0
1660 }
1661 None => is_closing_marker(inner, &block_type),
1662 };
1663
1664 if line_closes {
1665 log::trace!("Found HTML block closing at line {}", current_pos + 1);
1666 found_closing = true;
1667
1668 // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1669 // the open and close tags stand alone on their source lines
1670 // (no trailing on open, no body content on close after
1671 // stripping bq markers), lift the body lines structurally
1672 // so the projector walks CST children instead of
1673 // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1674 //
1675 // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1676 // grafted, Para preserved), non-div strict-block tags
1677 // (`<form>`, `<section>`, …) and inline-block matched-pair
1678 // tags (`<video>`, `<iframe>`, …) — the latter two under
1679 // HTML_BLOCK with the structural lift hitting pandoc's
1680 // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1681 // demotion. Inline-block additionally bails if the body
1682 // starts at a fresh-block position with a void block tag
1683 // (mirrors the non-bq matched-pair gate).
1684 //
1685 // Other bq-wrapped shapes (butted-close / open-trailing /
1686 // same-line) still fall through to the opaque path.
1687 // Multi-line opens are allowed here as of 2026-05-12: the
1688 // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1689 // per attr line and per-line bq prefix tokens) by the bq-aware
1690 // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1691 // empty for multi-line opens (the emitter writes any trailing
1692 // bytes on the last open line directly as TEXT inside
1693 // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1694 // trailing falls through to the opaque path, matching the non-
1695 // bq deferral.
1696 let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1697 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1698 Some("div")
1699 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1700 match &block_type {
1701 HtmlBlockType::BlockTag {
1702 tag_name,
1703 is_verbatim: false,
1704 closed_by_blank_line: false,
1705 depth_aware: true,
1706 closes_at_open_tag: false,
1707 is_closing: false,
1708 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1709 _ => None,
1710 }
1711 } else {
1712 None
1713 }
1714 } else {
1715 None
1716 };
1717
1718 let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1719 // Open-shape: last open line must end with `>` (clean
1720 // close-of-open). For single-line, that's `first_inner`
1721 // (already bq-stripped); for multi-line, strip bq markers
1722 // from `lines[end_line_idx]` and check the same.
1723 let last_open_line: &str = match multiline_open_end {
1724 None => first_inner,
1725 Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1726 prefix.strip(lines[end])
1727 }
1728 Some(end) => lines[end],
1729 };
1730 let (open_no_nl, _) = strip_newline(last_open_line);
1731 if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1732 return false;
1733 }
1734 let close_stripped = prefix.strip(line);
1735 let (close_no_nl, _) = strip_newline(close_stripped);
1736 if !close_no_nl
1737 .trim_start_matches([' ', '\t'])
1738 .starts_with("</")
1739 {
1740 return false;
1741 }
1742 if is_pandoc_inline_block_tag_name(tag_name)
1743 && inline_block_void_interior_abandons(
1744 first_inner,
1745 lines,
1746 start_pos,
1747 multiline_open_end,
1748 bq_depth,
1749 tag_name,
1750 )
1751 {
1752 return false;
1753 }
1754 true
1755 });
1756
1757 if bq_clean_lift {
1758 let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1759 LastParaDemote::Never
1760 } else {
1761 LastParaDemote::OnlyIfLast
1762 };
1763 emit_html_block_body_lifted_bq(
1764 builder,
1765 &content_lines,
1766 prefix,
1767 demote_policy,
1768 config,
1769 );
1770 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1771 emit_html_block_line(builder, line, bq_depth);
1772 builder.finish_node();
1773 current_pos += 1;
1774 break;
1775 }
1776
1777 // Bq messy-shape lift — single-line open with trailing or
1778 // butted-close (or both). `pre_content` already captures any
1779 // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1780 // strip the close line's bq markers before splitting so
1781 // `leading` and `close_part` are bq-prefix-free. Body parses
1782 // recursively from `pre_content + stripped(content_lines) +
1783 // leading`, with per-line bq prefixes re-injected so the CST
1784 // stays byte-equal to the source. Demote: div is keyed on
1785 // close-butted-ness (Plain when leading non-empty, Para
1786 // otherwise); non-div uses OnlyIfLast either way.
1787 if let Some(tag_name) = bq_messy_lift_tag {
1788 let close_stripped = prefix.strip(line);
1789 let close_prefix_len = line.len() - close_stripped.len();
1790 let close_prefix = &line[..close_prefix_len];
1791 if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1792 {
1793 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1794 if leading.is_empty() {
1795 LastParaDemote::Never
1796 } else {
1797 LastParaDemote::SkipTrailingBlanks
1798 }
1799 } else {
1800 LastParaDemote::OnlyIfLast
1801 };
1802 emit_html_block_body_lifted_bq_messy(
1803 builder,
1804 &pre_content,
1805 &content_lines,
1806 leading,
1807 close_prefix,
1808 prefix,
1809 policy,
1810 config,
1811 );
1812 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1813 // When `leading` is empty, no recursive-parse output carries
1814 // the close line's bq prefix, so emit it here before the
1815 // close tag. When `leading` is non-empty,
1816 // `emit_html_block_body_lifted_bq_messy` already injected
1817 // the prefix at the start of the leading bytes (via the
1818 // BqPrefixState entry); emitting again would double the
1819 // prefix bytes and break losslessness.
1820 if leading.is_empty() {
1821 emit_bq_prefix_tokens(builder, close_prefix);
1822 }
1823 emit_html_block_line(builder, close_part, 0);
1824 builder.finish_node();
1825 current_pos += 1;
1826 break;
1827 }
1828 }
1829
1830 // Under lift mode, try to split the close line into a
1831 // leading "body content" prefix and the close-marker
1832 // remainder using depth-aware matching. Walks at depth 1
1833 // (we're inside the open tag) so nested same-tag opens
1834 // (e.g. `<inner></inner></tag>` style with a nested div)
1835 // are absorbed into the body and parsed recursively, and
1836 // multi-close shapes (`foo</div></div>` on the close line)
1837 // peel off the matched-pair close — the unmatched
1838 // trailing close projects as a sibling `RawBlock` per
1839 // pandoc-native. For `<div>`, non-empty `leading`
1840 // propagates pandoc's `markdown_in_html_blocks` Plain
1841 // demotion rule. For non-div strict-block tags, demotion
1842 // follows pandoc's `OnlyIfLast` rule (demote the trailing
1843 // Para only when no blank line precedes the close).
1844 let close_split_tag = if lift_mode {
1845 if strict_block_lift {
1846 strict_block_tag_name
1847 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1848 Some("div")
1849 } else {
1850 None
1851 }
1852 } else {
1853 None
1854 };
1855 let (close_no_nl, close_post_nl) = strip_newline(line);
1856 let close_split = close_split_tag
1857 .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1858
1859 if let Some((leading, close_part)) = close_split {
1860 // Close-line leading that is whitespace-only is close-tag
1861 // indentation, not body content (pandoc-native strips it
1862 // from the close RawBlock and treats the close as butted —
1863 // see ` </tag>` shapes). Route those bytes into the
1864 // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1865 // projector strips them; keep the demote policy keyed on
1866 // the original leading so butted-close detection (Plain
1867 // demotion for div, OnlyIfLast for non-div) still fires.
1868 let leading_is_ws_only =
1869 !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1870 let body_leading = if leading_is_ws_only { "" } else { leading };
1871 let policy = if strict_block_lift {
1872 LastParaDemote::OnlyIfLast
1873 } else if !leading.is_empty() {
1874 LastParaDemote::SkipTrailingBlanks
1875 } else {
1876 LastParaDemote::Never
1877 };
1878 // Split close_part into close-marker bytes (`</tag>`)
1879 // and trailing bytes (e.g. an extra `</div>` for the
1880 // double-close case, or `bar` for trailing text after
1881 // a normal close). Trailing bytes are recursively
1882 // parsed and grafted as siblings of the HTML_BLOCK_DIV
1883 // wrapper.
1884 let close_tag_name = close_split_tag.expect("close_split_tag present");
1885 let close_marker_end =
1886 split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1887 let close_marker = &close_part[..close_marker_end];
1888 let close_trailing = &close_part[close_marker_end..];
1889
1890 emit_html_block_body_lifted(
1891 builder,
1892 &pre_content,
1893 &content_lines,
1894 body_leading,
1895 policy,
1896 config,
1897 );
1898 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1899 if leading_is_ws_only {
1900 builder.token(SyntaxKind::WHITESPACE.into(), leading);
1901 }
1902 if close_trailing.is_empty() {
1903 let mut close_line =
1904 String::with_capacity(close_marker.len() + close_post_nl.len());
1905 close_line.push_str(close_marker);
1906 close_line.push_str(close_post_nl);
1907 emit_html_block_line(builder, &close_line, 0);
1908 builder.finish_node();
1909 } else {
1910 // Close tag holds only the close-marker bytes;
1911 // trailing + newline graft as siblings.
1912 builder.token(SyntaxKind::TEXT.into(), close_marker);
1913 builder.finish_node(); // HTML_BLOCK_TAG
1914 builder.finish_node(); // HtmlBlock
1915
1916 let mut trailing_text =
1917 String::with_capacity(close_trailing.len() + close_post_nl.len());
1918 trailing_text.push_str(close_trailing);
1919 trailing_text.push_str(close_post_nl);
1920 let mut inner_options = config.clone();
1921 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1922 inner_options.refdef_labels = Some(refdefs.clone());
1923 let inner_root = crate::parser::parse_with_refdefs(
1924 &trailing_text,
1925 Some(inner_options),
1926 refdefs,
1927 );
1928 let mut bq = None;
1929 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1930 current_pos += 1;
1931 return current_pos;
1932 }
1933 } else {
1934 emit_html_block_body(
1935 builder,
1936 &pre_content,
1937 &content_lines,
1938 bq_depth,
1939 wrapper_kind,
1940 lift_mode,
1941 config,
1942 );
1943 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1944 emit_html_block_line(builder, line, bq_depth);
1945 builder.finish_node();
1946 }
1947
1948 current_pos += 1;
1949 break;
1950 }
1951
1952 // Regular content line
1953 content_lines.push(line);
1954 current_pos += 1;
1955 }
1956
1957 // If we didn't find a closing marker, emit what we collected
1958 if !found_closing {
1959 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1960 emit_html_block_body(
1961 builder,
1962 &pre_content,
1963 &content_lines,
1964 bq_depth,
1965 wrapper_kind,
1966 lift_mode,
1967 config,
1968 );
1969 }
1970
1971 builder.finish_node(); // HtmlBlock
1972 current_pos
1973}
1974
1975/// Emit the collected inner content lines for an HTML block.
1976///
1977/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1978/// line `<div>` open outside blockquote), recursively parse the inner
1979/// content (including any open-tag trailing) as Pandoc-flavored
1980/// markdown and graft the resulting top-level blocks as direct children
1981/// of the wrapper. This is the Phase 6 structural lift — the projector
1982/// and downstream consumers (linter, salsa, LSP) can walk the
1983/// structural children instead of re-tokenizing the body bytes.
1984///
1985/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1986/// blockquote, multi-line open, or no content at all — fall through to
1987/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1988///
1989/// CST bytes remain byte-identical to source: the recursive parser is
1990/// lossless on the same byte slice the legacy path would have captured
1991/// as TEXT.
1992fn emit_html_block_body(
1993 builder: &mut GreenNodeBuilder<'static>,
1994 pre_content: &str,
1995 content_lines: &[&str],
1996 bq_depth: usize,
1997 wrapper_kind: SyntaxKind,
1998 lift_mode: bool,
1999 config: &ParserOptions,
2000) {
2001 if pre_content.is_empty() && content_lines.is_empty() {
2002 return;
2003 }
2004 if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
2005 // Reached when the parser walked to end-of-input without finding
2006 // `</div>` (unbalanced div) — no close tag, no Plain demotion.
2007 emit_html_block_body_lifted(
2008 builder,
2009 pre_content,
2010 content_lines,
2011 "",
2012 LastParaDemote::Never,
2013 config,
2014 );
2015 return;
2016 }
2017 // Legacy path: opaque TEXT capture. `pre_content` is always empty
2018 // here (lift_mode is the only path that populates it), but be
2019 // defensive — if a trailing prefix snuck in, emit it as TEXT so
2020 // bytes are preserved.
2021 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
2022 if !pre_content.is_empty() {
2023 builder.token(SyntaxKind::TEXT.into(), pre_content);
2024 }
2025 for content_line in content_lines {
2026 emit_html_block_line(builder, content_line, bq_depth);
2027 }
2028 builder.finish_node();
2029}
2030
/// Policy deciding which (if any) trailing `PARAGRAPH` of a grafted
/// HTML-block body is retagged as `PLAIN`.
#[derive(Debug, Clone, Copy)]
enum LastParaDemote {
    /// Leave every paragraph as a `PARAGRAPH`; nothing is retagged.
    Never,
    /// Look past any trailing `BLANK_LINE` children and retag the node
    /// found there when it is a `PARAGRAPH`. Intended for `<div>` bodies
    /// whose close tag is butted against the paragraph text on the same
    /// source line (pandoc's `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Retag the final top-level node only when that node itself is a
    /// `PARAGRAPH`; a trailing `BLANK_LINE` therefore suppresses the
    /// demotion. Intended for non-div strict-block tags, where pandoc
    /// keeps the `Para` whenever a blank line separates it from the
    /// close tag.
    OnlyIfLast,
}
2050
2051/// Lift the HTML-block body into structural CST children: build the
2052/// inner text from `pre_content` + `content_lines` + `post_content`
2053/// (in order), recursively parse it as Pandoc-flavored markdown, and
2054/// graft the resulting top-level blocks into `builder`. `demote_policy`
2055/// controls whether the trailing paragraph is retagged as `PLAIN` to
2056/// encode pandoc's Plain/Para adjacency rules structurally.
2057fn emit_html_block_body_lifted(
2058 builder: &mut GreenNodeBuilder<'static>,
2059 pre_content: &str,
2060 content_lines: &[&str],
2061 post_content: &str,
2062 demote_policy: LastParaDemote,
2063 config: &ParserOptions,
2064) {
2065 emit_html_block_body_lifted_inner(
2066 builder,
2067 pre_content,
2068 content_lines,
2069 post_content,
2070 demote_policy,
2071 config,
2072 &mut None,
2073 )
2074}
2075
2076/// Body-lift variant for `<div>` inside a blockquote. Strips
2077/// `bq_depth` levels of blockquote markers from each `content_line`,
2078/// captures the per-line prefix bytes, and grafts the recursive parse
2079/// with prefix injection so the output CST stays byte-equal to the
2080/// source. `pre_content` and `post_content` must be empty (the bq
2081/// clean lift only handles the shape where the open and close tags
2082/// stand alone on their source lines).
2083fn emit_html_block_body_lifted_bq(
2084 builder: &mut GreenNodeBuilder<'static>,
2085 content_lines: &[&str],
2086 prefix: &ContainerPrefix,
2087 demote_policy: LastParaDemote,
2088 config: &ParserOptions,
2089) {
2090 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2091 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2092 for cl in content_lines {
2093 let (li, bq, inner) = prefix.split(cl);
2094 prefix_lines.push(ContainerPrefixLine {
2095 list_indent: li.to_string(),
2096 bq_prefix: bq.to_string(),
2097 });
2098 stripped_lines.push(inner);
2099 }
2100 let mut state = ContainerPrefixState::new(prefix_lines);
2101 emit_html_block_body_lifted_inner(
2102 builder,
2103 "",
2104 &stripped_lines,
2105 "",
2106 demote_policy,
2107 config,
2108 &mut state,
2109 )
2110}
2111
2112/// Body-lift variant for the bq messy-shape lift — open-trailing,
2113/// butted-close, or both. The open-trailing bytes (if any) sit in
2114/// `pre_content` (line 0 of the body — no bq prefix in source because
2115/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2116/// each carry their own bq prefix. The close line's `leading` (body
2117/// bytes before `</tag>`) sits on the close line, prefixed in source
2118/// by `close_line_prefix` (the bq prefix captured from `line`).
2119///
2120/// Builds `prefixes` so each emitted line in the recursive parse
2121/// output gets the right per-line bq prefix re-injected at line start:
2122/// `pre_content` → empty prefix (no source `> ` precedes it); each
2123/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2124/// Result CST stays byte-equal to source.
2125#[allow(clippy::too_many_arguments)]
2126fn emit_html_block_body_lifted_bq_messy(
2127 builder: &mut GreenNodeBuilder<'static>,
2128 pre_content: &str,
2129 content_lines: &[&str],
2130 leading: &str,
2131 close_line_prefix: &str,
2132 prefix: &ContainerPrefix,
2133 demote_policy: LastParaDemote,
2134 config: &ParserOptions,
2135) {
2136 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2137 if !pre_content.is_empty() {
2138 prefix_lines.push(ContainerPrefixLine::default());
2139 }
2140 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2141 for cl in content_lines {
2142 let (li, bq, inner) = prefix.split(cl);
2143 prefix_lines.push(ContainerPrefixLine {
2144 list_indent: li.to_string(),
2145 bq_prefix: bq.to_string(),
2146 });
2147 stripped_lines.push(inner);
2148 }
2149 if !leading.is_empty() {
2150 // The close line carries its own captured prefix bytes; treat
2151 // them as bq-prefix only (no list-indent split applied) to keep
2152 // the legacy bq-only re-injection behavior for messy-shape
2153 // close-line lifts.
2154 prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2155 }
2156 let mut state = ContainerPrefixState::new(prefix_lines);
2157 emit_html_block_body_lifted_inner(
2158 builder,
2159 pre_content,
2160 &stripped_lines,
2161 leading,
2162 demote_policy,
2163 config,
2164 &mut state,
2165 )
2166}
2167
2168fn emit_html_block_body_lifted_inner(
2169 builder: &mut GreenNodeBuilder<'static>,
2170 pre_content: &str,
2171 content_lines: &[&str],
2172 post_content: &str,
2173 demote_policy: LastParaDemote,
2174 config: &ParserOptions,
2175 bq: &mut Option<ContainerPrefixState>,
2176) {
2177 if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2178 return;
2179 }
2180 let mut inner_text = String::with_capacity(
2181 pre_content.len()
2182 + content_lines.iter().map(|s| s.len()).sum::<usize>()
2183 + post_content.len(),
2184 );
2185 inner_text.push_str(pre_content);
2186 for line in content_lines {
2187 inner_text.push_str(line);
2188 }
2189 inner_text.push_str(post_content);
2190
2191 let mut inner_options = config.clone();
2192 let refdefs = config.refdef_labels.clone().unwrap_or_default();
2193 inner_options.refdef_labels = Some(refdefs.clone());
2194 let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2195 graft_document_children(builder, &inner_root, demote_policy, bq);
2196}
2197
2198/// Walk a parsed inner document's top-level children and re-emit them
2199/// into `builder`. The document's wrapper node is skipped — only its
2200/// children are grafted.
2201///
2202/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2203/// as `PLAIN` — see [`LastParaDemote`].
2204///
2205/// `bq` is `Some` when grafting a body that lived inside an outer
2206/// container (blockquote, list-item, or both) — token emission then
2207/// injects the captured per-line prefix tokens at line starts so the
2208/// CST stays byte-equal to source. See
2209/// [`super::container_prefix::ContainerPrefixState`].
2210fn graft_document_children(
2211 builder: &mut GreenNodeBuilder<'static>,
2212 doc: &SyntaxNode,
2213 demote_policy: LastParaDemote,
2214 bq: &mut Option<ContainerPrefixState>,
2215) {
2216 let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2217
2218 let mut demote_idx: Option<usize> = None;
2219 match demote_policy {
2220 LastParaDemote::Never => {}
2221 LastParaDemote::SkipTrailingBlanks => {
2222 for (i, c) in children.iter().enumerate().rev() {
2223 if let rowan::NodeOrToken::Node(n) = c {
2224 if n.kind() == SyntaxKind::BLANK_LINE {
2225 continue;
2226 }
2227 if n.kind() == SyntaxKind::PARAGRAPH {
2228 demote_idx = Some(i);
2229 }
2230 break;
2231 }
2232 }
2233 }
2234 LastParaDemote::OnlyIfLast => {
2235 for (i, c) in children.iter().enumerate().rev() {
2236 if let rowan::NodeOrToken::Node(n) = c {
2237 if n.kind() == SyntaxKind::PARAGRAPH {
2238 demote_idx = Some(i);
2239 }
2240 break;
2241 }
2242 }
2243 }
2244 }
2245
2246 for (i, child) in children.into_iter().enumerate() {
2247 match child {
2248 rowan::NodeOrToken::Node(n) => {
2249 if Some(i) == demote_idx {
2250 graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2251 } else {
2252 graft_subtree(builder, &n, bq);
2253 }
2254 }
2255 rowan::NodeOrToken::Token(t) => {
2256 emit_grafted_token(builder, t.kind(), t.text(), bq);
2257 }
2258 }
2259 }
2260}
2261
2262/// Recursively re-emit `node` and its descendants into `builder`.
2263/// Token text is copied verbatim so the result is byte-identical to
2264/// the input span (modulo bq prefix tokens injected at line starts
2265/// when `bq` is `Some`).
2266fn graft_subtree(
2267 builder: &mut GreenNodeBuilder<'static>,
2268 node: &SyntaxNode,
2269 bq: &mut Option<ContainerPrefixState>,
2270) {
2271 graft_subtree_as(builder, node, node.kind(), bq);
2272}
2273
2274/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2275/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2276/// the close-butted demotion rule.
2277fn graft_subtree_as(
2278 builder: &mut GreenNodeBuilder<'static>,
2279 node: &SyntaxNode,
2280 kind: SyntaxKind,
2281 bq: &mut Option<ContainerPrefixState>,
2282) {
2283 builder.start_node(kind.into());
2284 for child in node.children_with_tokens() {
2285 match child {
2286 rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2287 rowan::NodeOrToken::Token(t) => {
2288 emit_grafted_token(builder, t.kind(), t.text(), bq);
2289 }
2290 }
2291 }
2292 builder.finish_node();
2293}
2294
2295/// Emit a single token while optionally injecting blockquote prefix
2296/// tokens at line starts. When `bq` is `None`, this is a plain
2297/// `builder.token()` passthrough.
2298fn emit_grafted_token(
2299 builder: &mut GreenNodeBuilder<'static>,
2300 kind: SyntaxKind,
2301 text: &str,
2302 bq: &mut Option<ContainerPrefixState>,
2303) {
2304 if let Some(state) = bq.as_mut() {
2305 if state.at_line_start {
2306 if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2307 emit_container_prefix_tokens(builder, line_prefix);
2308 }
2309 state.at_line_start = false;
2310 }
2311 builder.token(kind.into(), text);
2312 // `BLANK_LINE` token represents an entirely blank source line —
2313 // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2314 // token as line-ending so the per-line prefix index advances
2315 // correctly.
2316 if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2317 state.line_idx += 1;
2318 state.at_line_start = true;
2319 }
2320 } else {
2321 builder.token(kind.into(), text);
2322 }
2323}
2324
2325/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2326/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2327fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2328 for ch in prefix.chars() {
2329 if ch == '>' {
2330 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2331 } else {
2332 let mut buf = [0u8; 4];
2333 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2334 }
2335 }
2336}
2337
/// Find the byte index, within `line`, of the `>` that ends the open tag
/// `<tag_name ...>`.
///
/// Leading spaces and tabs are skipped, then `<` followed by `tag_name`
/// (compared ASCII-case-insensitively) must come next, with at least one
/// more byte after the name. The remainder is scanned quote-aware: a `>`
/// inside a `"..."` or `'...'` run is ignored, and a quote run is only
/// closed by the same quote character that opened it.
///
/// Returns `None` when the line does not start with the tag or when no
/// unquoted `>` follows the name.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let after_lt = line[indent_len..].strip_prefix('<')?.as_bytes();

    let name = tag_name.as_bytes();
    // Require the name plus at least one following byte.
    if after_lt.len() <= name.len() || !after_lt[..name.len()].eq_ignore_ascii_case(name) {
        return None;
    }

    // Offset in `line` of the first byte after `<tag_name`.
    let scan_start = indent_len + 1 + name.len();
    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in after_lt[name.len()..].iter().enumerate() {
        match open_quote {
            Some(quote_byte) => {
                if byte == quote_byte {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(scan_start + offset),
                _ => {}
            },
        }
    }
    None
}
2373
2374/// Whether `slice` begins (after leading ASCII whitespace) with an
2375/// open tag whose name is a Pandoc void block tag (`<source>`,
2376/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2377/// open tags return false.
2378///
2379/// Used by the inline-block matched-pair lift gate: pandoc-native
2380/// abandons the lift when the body's first non-blank content is a
2381/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2382/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2383/// matched-pair lift).
2384fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2385 let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2386 if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2387 return false;
2388 }
2389 let Some(tag_end) = parse_open_tag(trimmed) else {
2390 return false;
2391 };
2392 let bytes = trimmed.as_bytes();
2393 let mut name_end = 1usize;
2394 while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2395 {
2396 name_end += 1;
2397 }
2398 if name_end == 1 {
2399 return false;
2400 }
2401 is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2402}
2403
2404/// Whether the body of an inline-block matched-pair (`<video>...`,
2405/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2406/// a void block tag — the condition under which pandoc-native abandons
2407/// the matched-pair lift. Probes three shapes:
2408///
2409/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2410/// after the open `>` on `first_inner` start with `<source`.
2411/// - **Single-line open + multi-line body**: open-trailing on the open
2412/// line is empty/whitespace AND the first non-blank body line
2413/// (`lines[start_pos+1..]`) starts with a void tag.
2414/// - **Multi-line open**: same body-line scan starting at
2415/// `lines[multiline_open_end+1..]`.
2416///
2417/// Returns `false` when the body begins with text, with a close tag,
2418/// or with a non-void block tag — those cases all proceed with the
2419/// matched-pair lift.
2420fn inline_block_void_interior_abandons(
2421 first_inner: &str,
2422 lines: &[&str],
2423 start_pos: usize,
2424 multiline_open_end: Option<usize>,
2425 bq_depth: usize,
2426 tag_name: &str,
2427) -> bool {
2428 let (line_no_nl, _) = strip_newline(first_inner);
2429 let (body_start_line_idx, open_trailing) = match multiline_open_end {
2430 Some(end) => (end + 1, ""),
2431 None => {
2432 let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2433 let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2434 (start_pos + 1, trailing)
2435 }
2436 };
2437 let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2438 if !trimmed.is_empty() {
2439 return slice_starts_with_void_block_tag(trimmed);
2440 }
2441 for line in &lines[body_start_line_idx..] {
2442 let inner = if bq_depth > 0 {
2443 strip_n_blockquote_markers(line, bq_depth)
2444 } else {
2445 line
2446 };
2447 let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2448 if trimmed.is_empty() {
2449 continue;
2450 }
2451 return slice_starts_with_void_block_tag(trimmed);
2452 }
2453 false
2454}
2455
/// Report whether `line` holds an open tag `<tag_name ...>` whose closing
/// `>` is present on the same line.
///
/// Leading spaces and tabs are skipped; `<` plus `tag_name` (compared
/// ASCII-case-insensitively) must follow, with at least one more byte
/// after the name. The rest of the line is scanned quote-aware, so a `>`
/// inside `"..."` or `'...'` does not count. Content after the closing
/// `>` (the open-trailing shape `<form>foo`) is accepted; capturing it
/// is left to the caller.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let unindented = line.trim_start_matches([' ', '\t']).as_bytes();
    let Some((&b'<', after_lt)) = unindented.split_first() else {
        return false;
    };

    let name = tag_name.as_bytes();
    // Require the name plus at least one following byte.
    if after_lt.len() <= name.len() || !after_lt[..name.len()].eq_ignore_ascii_case(name) {
        return false;
    }

    // Tracks which quote character (if any) is currently open.
    let mut open_quote: Option<u8> = None;
    after_lt[name.len()..].iter().any(|&byte| match open_quote {
        Some(quote_byte) => {
            if byte == quote_byte {
                open_quote = None;
            }
            false
        }
        None if byte == b'"' || byte == b'\'' => {
            open_quote = Some(byte);
            false
        }
        None => byte == b'>',
    })
}
2490
2491/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2492/// be lifted structurally. Returns `true` only when:
2493/// - The line starts with `<tag_name` (modulo leading whitespace).
2494/// - The open tag's `>` exists with proper quote handling.
2495/// - The bytes after the open `>` contain a depth-zero matched
2496/// `</tag_name>` close (depth-aware: nested `<tag>` opens
2497/// increment depth; matching is case-insensitive, quote-aware).
2498///
2499/// Trailing bytes after the matched close are accepted and grafted
2500/// as a sibling block by the caller. Examples:
2501/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2502/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2503/// recursively parses to a `RawBlock`).
2504/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2505/// parsed recursively), trailing=`bar`.
2506fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2507 let bytes = line.as_bytes();
2508 let indent_end = bytes
2509 .iter()
2510 .position(|&b| b != b' ' && b != b'\t')
2511 .unwrap_or(bytes.len());
2512 let rest = &line[indent_end..];
2513 let rest_bytes = rest.as_bytes();
2514 let prefix_len = 1 + tag_name.len();
2515 if rest_bytes.len() < prefix_len
2516 || rest_bytes[0] != b'<'
2517 || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2518 {
2519 return false;
2520 }
2521 let after_name = &rest[prefix_len..];
2522 let after_name_bytes = after_name.as_bytes();
2523 let mut i = 0usize;
2524 let mut quote: Option<u8> = None;
2525 let mut gt_idx: Option<usize> = None;
2526 while i < after_name_bytes.len() {
2527 match (quote, after_name_bytes[i]) {
2528 (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2529 (Some(q), b2) if b2 == q => quote = None,
2530 (None, b'>') => {
2531 gt_idx = Some(i);
2532 break;
2533 }
2534 _ => {}
2535 }
2536 i += 1;
2537 }
2538 let Some(gt_idx) = gt_idx else {
2539 return false;
2540 };
2541 let trailing = &after_name[gt_idx + 1..];
2542 // Depth-aware: walk `trailing` (we begin inside the open tag at
2543 // depth 1). Return true iff a matched `</tag>` exists where depth
2544 // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2545 matched_close_offset(trailing, tag_name).is_some()
2546}
2547
/// Scan `trailing` (the bytes that follow an open `<tag ...>`'s closing `>`)
/// for the `</tag>` that balances that open tag.
///
/// Nesting depth starts at 1 because the scan begins inside the open tag.
/// Every further `<tag ...>` raises the depth unless it is self-closing
/// (`<tag/>`), and every `</tag ...>` lowers it. Tag names are compared
/// ASCII case-insensitively and must be followed by whitespace, `>`, `/`,
/// or end of input. A `>` inside a quoted attribute value does not end a
/// tag bracket.
///
/// Returns `Some((close_start, close_end))` where `close_start` is the byte
/// offset of the `<` of the balancing `</tag>` and `close_end` is one past
/// its `>`.
///
/// Returns `None` when the depth never returns to zero, or when a `<...`
/// bracket is left unterminated before a balancing close is found.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let src = trailing.as_bytes();
    let name = tag_name.as_bytes();

    let mut open_count: i32 = 1;
    let mut cursor = 0usize;

    while cursor < src.len() {
        if src[cursor] != b'<' {
            cursor += 1;
            continue;
        }
        let lt = cursor;
        let closing = src.get(lt + 1) == Some(&b'/');
        let name_at = if closing { lt + 2 } else { lt + 1 };
        let name_end = name_at + name.len();

        // Compare the candidate name in place, ignoring ASCII case.
        let names_tag = match src.get(name_at..name_end) {
            Some(candidate) => candidate.eq_ignore_ascii_case(name),
            None => false,
        };
        let at_boundary = names_tag
            && matches!(
                src.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find this bracket's terminating `>`, skipping quoted values.
        let scan_from = if names_tag { name_end } else { lt + 1 };
        let mut in_quote: Option<u8> = None;
        let mut gt_at: Option<usize> = None;
        for (idx, &b) in src.iter().enumerate().skip(scan_from) {
            match in_quote {
                Some(q) => {
                    if b == q {
                        in_quote = None;
                    }
                }
                None if b == b'"' || b == b'\'' => in_quote = Some(b),
                None if b == b'>' => {
                    gt_at = Some(idx);
                    break;
                }
                None => {}
            }
        }
        // `/>` directly before the terminating `>` marks a self-closing tag.
        let self_closing = matches!(gt_at, Some(gt) if gt > lt + 1 && src[gt - 1] == b'/');

        if at_boundary {
            if closing {
                open_count -= 1;
                if let (0, Some(gt)) = (open_count, gt_at) {
                    return Some((lt, gt + 1));
                }
            } else if !self_closing {
                open_count += 1;
            }
        }

        match gt_at {
            Some(gt) => cursor = gt + 1,
            // Unterminated `<...` — nothing further can be matched.
            None => break,
        }
    }
    None
}
2630
/// Find where the close marker at the start of `close_part` ends.
///
/// `close_part` is expected to begin with `</tag_name` (ASCII
/// case-insensitive). On success the result is the byte offset one past the
/// first unquoted `>` after that prefix, so the caller can split
/// `close_part` into the close marker and any same-line trailing text.
///
/// Returns `None` when the prefix shape is missing or no unquoted `>`
/// follows it; the caller then treats the whole slice as the close marker
/// (no trailing text).
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let raw = close_part.as_bytes();
    let name_end = 2 + tag_name.len();

    // `get` doubles as the length check for the `</tag` prefix.
    let head = raw.get(..name_end)?;
    if !head.starts_with(b"</") || !head[2..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    // Walk the remainder looking for the first `>` outside quotes.
    let mut in_quote: Option<u8> = None;
    for (idx, &b) in raw.iter().enumerate().skip(name_end) {
        match in_quote {
            Some(q) => {
                if b == q {
                    in_quote = None;
                }
            }
            None => match b {
                b'"' | b'\'' => in_quote = Some(b),
                b'>' => return Some(idx + 1),
                _ => {}
            },
        }
    }
    None
}
2661
2662/// Try to split the close line of an HTML_BLOCK_DIV body into a
2663/// leading content prefix and a clean `</tag>...` remainder. Returns
2664/// `Some((leading, close_part))` only when the line contains exactly
2665/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2666/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2667/// for missing close tags, or for compound shapes the parser
2668/// shouldn't attempt to lift in this pass.
2669///
2670/// `leading` may be empty (close starts at column 0) or pure
2671/// whitespace (close on an indented line). Both count as "butted" per
2672/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2673/// the trailing paragraph inside the div demotes Para→Plain.
2674fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2675 let (opens, closes) = count_tag_balance(line, tag_name);
2676 if opens != 0 || closes != 1 {
2677 return None;
2678 }
2679 // Locate the close tag's opening `<` by lowercased substring search.
2680 // Safe because we've already established (above) that the line has
2681 // exactly one `</tag>` and no `<tag>` opens, so the first match is
2682 // THE close.
2683 let needle = format!("</{}", tag_name);
2684 let lower = line.to_ascii_lowercase();
2685 let close_lt = lower.find(&needle)?;
2686 Some((&line[..close_lt], &line[close_lt..]))
2687}
2688
2689/// Depth-aware variant of `try_split_close_line` used by the same-line
2690/// lift path. Walks `line` starting at depth 1 (we begin inside the
2691/// open `<tag>`) and splits at the byte position where the matched
2692/// `</tag>` close brings depth to 0. Returns `Some((body,
2693/// close_part))` where `body` is the bytes before the matched-close
2694/// start and `close_part` is the bytes from the matched close onward.
2695///
2696/// Unlike `try_split_close_line` this accepts nested same-tag opens
2697/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2698/// body=`<div>x</div>` (a nested div the body lift parses
2699/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2700/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2701/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2702fn try_split_close_line_depth_aware<'a>(
2703 line: &'a str,
2704 tag_name: &str,
2705) -> Option<(&'a str, &'a str)> {
2706 let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2707 Some((&line[..close_start], &line[close_start..]))
2708}
2709
2710/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2711/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2712/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2713/// + TEXT(">") + TEXT(trailing)?`.
2714///
2715/// Bytes are byte-identical to the source — this only tokenizes at finer
2716/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2717/// region structurally. Falls back to a single TEXT token if the line
2718/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2719/// only retags as the lift kind when this shape was matched).
2720///
2721/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2722/// returned as `&str` instead so the caller can splice them into the
2723/// recursive-parse input for the structural body lift. When false
2724/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2725/// empty slice is returned.
2726fn emit_open_tag_tokens<'a>(
2727 builder: &mut GreenNodeBuilder<'static>,
2728 line: &'a str,
2729 tag_name: &str,
2730 lift_trailing: bool,
2731) -> &'a str {
2732 let bytes = line.as_bytes();
2733 // Leading indent (CommonMark allows up to 3 spaces).
2734 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2735 if indent_end > 0 {
2736 builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2737 }
2738 let rest = &line[indent_end..];
2739 // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2740 let prefix_len = 1 + tag_name.len();
2741 if !rest.starts_with('<')
2742 || rest.len() < prefix_len
2743 || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2744 {
2745 builder.token(SyntaxKind::TEXT.into(), rest);
2746 return "";
2747 }
2748 let after_name = &rest[prefix_len..];
2749 let after_name_bytes = after_name.as_bytes();
2750 // Find the closing `>` of the open tag, respecting quoted attribute values.
2751 let mut i = 0usize;
2752 let mut quote: Option<u8> = None;
2753 let mut tag_close: Option<usize> = None;
2754 while i < after_name_bytes.len() {
2755 let b = after_name_bytes[i];
2756 match (quote, b) {
2757 (None, b'"') | (None, b'\'') => quote = Some(b),
2758 (Some(q), b2) if b2 == q => quote = None,
2759 (None, b'>') => {
2760 tag_close = Some(i);
2761 break;
2762 }
2763 _ => {}
2764 }
2765 i += 1;
2766 }
2767 let Some(tag_close) = tag_close else {
2768 // Open tag has no closing `>` on this line — defensive fallback.
2769 builder.token(SyntaxKind::TEXT.into(), rest);
2770 return "";
2771 };
2772 // Whitespace between the tag name and the attribute region.
2773 let attrs_inner = &after_name[..tag_close];
2774 let ws_end = attrs_inner
2775 .as_bytes()
2776 .iter()
2777 .position(|&b| !matches!(b, b' ' | b'\t'))
2778 .unwrap_or(attrs_inner.len());
2779 let leading_ws = &attrs_inner[..ws_end];
2780 // Strip a trailing self-closing slash and the whitespace before it
2781 // from the attribute region; emit them as TEXT outside the
2782 // HTML_ATTRS node so the structural region only holds attribute
2783 // bytes (not formatting punctuation).
2784 let attrs_after_ws = &attrs_inner[ws_end..];
2785 let mut attr_end = attrs_after_ws.len();
2786 let attr_bytes = attrs_after_ws.as_bytes();
2787 let mut self_close_start = attr_end;
2788 if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2789 self_close_start = attr_end - 1;
2790 attr_end = self_close_start;
2791 while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2792 attr_end -= 1;
2793 }
2794 }
2795 let attrs_text = &attrs_after_ws[..attr_end];
2796 let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2797 let after_self_close = &attrs_after_ws[self_close_start..];
2798
2799 // Use the original source bytes for the `<tag` prefix (preserves
2800 // source casing — losslessness).
2801 builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2802 if !leading_ws.is_empty() {
2803 builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2804 }
2805 if !attrs_text.is_empty() {
2806 emit_html_attrs_node(builder, attrs_text);
2807 }
2808 if !trailing_text.is_empty() {
2809 builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2810 }
2811 if !after_self_close.is_empty() {
2812 builder.token(SyntaxKind::TEXT.into(), after_self_close);
2813 }
2814 builder.token(SyntaxKind::TEXT.into(), ">");
2815 let after_gt = &after_name[tag_close + 1..];
2816 if lift_trailing {
2817 // Return trailing bytes to the caller (will be spliced into the
2818 // recursive-parse input for the body lift).
2819 return after_gt;
2820 }
2821 if !after_gt.is_empty() {
2822 builder.token(SyntaxKind::TEXT.into(), after_gt);
2823 }
2824 ""
2825}
2826
2827/// Detect a multi-line HTML open tag for `tag_name`. Returns
2828/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2829/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2830/// the existing path) or when the `>` is missing entirely.
2831///
2832/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2833/// attribute value doesn't terminate the open tag. Quote state carries
2834/// across line boundaries.
2835fn find_multiline_open_end(
2836 lines: &[&str],
2837 start_pos: usize,
2838 first_inner: &str,
2839 tag_name: &str,
2840 prefix: &ContainerPrefix,
2841) -> Option<usize> {
2842 // Locate the `<tag_name` literal in `first_inner` to start scanning past
2843 // it. Match is ASCII case-insensitive; the parser preserves source casing.
2844 // `first_inner` is already bq-stripped by the caller; subsequent lines are
2845 // stripped inline below via `strip_n_blockquote_markers`.
2846 let trimmed = strip_leading_spaces(first_inner);
2847 let prefix_len = 1 + tag_name.len();
2848 if !trimmed.starts_with('<')
2849 || trimmed.len() < prefix_len
2850 || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2851 {
2852 return None;
2853 }
2854 let leading_indent = first_inner.len() - trimmed.len();
2855 let mut i = leading_indent + prefix_len; // past `<tag_name`
2856 let mut quote: Option<u8> = None;
2857
2858 // Scan first line for an unquoted `>`.
2859 let line0_bytes = first_inner.as_bytes();
2860 while i < line0_bytes.len() {
2861 match (quote, line0_bytes[i]) {
2862 (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2863 (Some(q), x) if x == q => quote = None,
2864 (None, b'>') => return None, // single-line case
2865 _ => {}
2866 }
2867 i += 1;
2868 }
2869
2870 // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2871 // blockquote markers per line so `> ` prefixes don't count toward the
2872 // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2873 let mut line_idx = start_pos + 1;
2874 while line_idx < lines.len() {
2875 let raw = lines[line_idx];
2876 let inner = prefix.strip(raw);
2877 for &b in inner.as_bytes() {
2878 match (quote, b) {
2879 (None, b'"') | (None, b'\'') => quote = Some(b),
2880 (Some(q), x) if x == q => quote = None,
2881 (None, b'>') => return Some(line_idx),
2882 _ => {}
2883 }
2884 }
2885 line_idx += 1;
2886 }
2887
2888 None
2889}
2890
2891/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2892/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2893/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2894/// open tag (no `>` in the remaining input) as paragraph text rather than
2895/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2896/// reparse the same content recursively, causing a stack overflow.
2897///
2898/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2899/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2900/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2901/// the tag eventually closes far away.
2902pub(crate) fn pandoc_html_open_tag_closes(
2903 lines: &[&str],
2904 start_pos: usize,
2905 prefix: &ContainerPrefix,
2906) -> bool {
2907 if start_pos >= lines.len() {
2908 return false;
2909 }
2910 let mut quote: Option<u8> = None;
2911 for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2912 let inner = prefix.strip(line);
2913 let bytes = inner.as_bytes();
2914 let mut i = 0usize;
2915 if offset == start_pos {
2916 while i < bytes.len() && bytes[i] == b' ' {
2917 i += 1;
2918 }
2919 if bytes.get(i) != Some(&b'<') {
2920 return false;
2921 }
2922 i += 1;
2923 }
2924 while i < bytes.len() {
2925 match (quote, bytes[i]) {
2926 (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2927 (Some(q), x) if x == q => quote = None,
2928 (None, b'>') => return true,
2929 _ => {}
2930 }
2931 i += 1;
2932 }
2933 }
2934 false
2935}
2936
2937/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2938/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2939/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2940/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2941/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2942/// Phase 6 structural lift.
2943///
2944/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2945/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2946/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2947/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2948/// + TEXT(">") + (TEXT(trailing))? + NEWLINE
2949///
2950/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2951/// newline; `parse_html_attribute_list` tolerates whitespace.
2952#[allow(clippy::too_many_arguments)]
2953fn emit_multiline_open_tag_with_attrs(
2954 builder: &mut GreenNodeBuilder<'static>,
2955 lines: &[&str],
2956 start_pos: usize,
2957 end_line_idx: usize,
2958 tag_name: &str,
2959 bq_depth: usize,
2960 lift_trailing: bool,
2961 pre_content: &mut String,
2962) {
2963 let prefix_len = 1 + tag_name.len();
2964 for (line_idx, raw) in lines
2965 .iter()
2966 .enumerate()
2967 .take(end_line_idx + 1)
2968 .skip(start_pos)
2969 {
2970 // Strip `bq_depth` blockquote markers from the source line so
2971 // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2972 // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2973 // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2974 // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2975 // before this parser runs; re-emitting it here would double
2976 // the bytes and break losslessness.
2977 let stripped = if bq_depth > 0 {
2978 strip_n_blockquote_markers(raw, bq_depth)
2979 } else {
2980 raw
2981 };
2982 let bq_prefix_len = raw.len() - stripped.len();
2983 if bq_prefix_len > 0 && line_idx != start_pos {
2984 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2985 }
2986 let line = stripped;
2987 let (line_no_nl, newline_str) = strip_newline(line);
2988
2989 if line_idx == start_pos {
2990 // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2991 // + attrs)?. The closing `>` is on a later line, so any
2992 // remaining bytes after "<{tag_name}" on this line are the
2993 // start of the attribute region.
2994 let bytes = line_no_nl.as_bytes();
2995 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2996 if indent_end > 0 {
2997 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2998 }
2999 // Defensive: caller verified the line starts with `<{tag_name}`.
3000 let after_indent = &line_no_nl[indent_end..];
3001 if after_indent.len() >= prefix_len {
3002 builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
3003 let rest = &after_indent[prefix_len..];
3004 emit_attr_region(builder, rest);
3005 } else {
3006 builder.token(SyntaxKind::TEXT.into(), after_indent);
3007 }
3008 } else if line_idx < end_line_idx {
3009 // Pure attribute line.
3010 let bytes = line_no_nl.as_bytes();
3011 let indent_end = bytes
3012 .iter()
3013 .position(|&b| !matches!(b, b' ' | b'\t'))
3014 .unwrap_or(bytes.len());
3015 if indent_end > 0 {
3016 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3017 }
3018 let attrs_text = &line_no_nl[indent_end..];
3019 if !attrs_text.is_empty() {
3020 emit_html_attrs_node(builder, attrs_text);
3021 }
3022 } else {
3023 // Last line: indent + attrs + ">" + trailing.
3024 let bytes = line_no_nl.as_bytes();
3025 let indent_end = bytes
3026 .iter()
3027 .position(|&b| !matches!(b, b' ' | b'\t'))
3028 .unwrap_or(bytes.len());
3029 if indent_end > 0 {
3030 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3031 }
3032 // Find the unquoted `>` byte position in this line.
3033 let mut quote: Option<u8> = None;
3034 let mut gt_pos: Option<usize> = None;
3035 for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
3036 let actual_j = indent_end + j;
3037 match (quote, b) {
3038 (None, b'"') | (None, b'\'') => quote = Some(b),
3039 (Some(q), x) if x == q => quote = None,
3040 (None, b'>') => {
3041 gt_pos = Some(actual_j);
3042 break;
3043 }
3044 _ => {}
3045 }
3046 }
3047 let Some(gt) = gt_pos else {
3048 // Defensive — caller said `>` is on this line.
3049 builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3050 if !newline_str.is_empty() {
3051 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3052 }
3053 continue;
3054 };
3055 // Attribute region: between indent_end and gt, with possibly
3056 // trailing whitespace before `>`.
3057 let attrs_region = &line_no_nl[indent_end..gt];
3058 let region_bytes = attrs_region.as_bytes();
3059 // Strip trailing whitespace from attrs region; emit as
3060 // separate WHITESPACE so HTML_ATTRS only contains attribute
3061 // bytes.
3062 let mut attr_end = region_bytes.len();
3063 while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3064 attr_end -= 1;
3065 }
3066 let attrs_text = &attrs_region[..attr_end];
3067 let trailing_ws = &attrs_region[attr_end..];
3068 if !attrs_text.is_empty() {
3069 emit_html_attrs_node(builder, attrs_text);
3070 }
3071 if !trailing_ws.is_empty() {
3072 builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3073 }
3074 builder.token(SyntaxKind::TEXT.into(), ">");
3075 let after_gt = &line_no_nl[gt + 1..];
3076 if lift_trailing && !after_gt.is_empty() {
3077 // Lift trailing bytes (and the trailing newline) into
3078 // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3079 // with `TEXT(">")`. The recursive parse at the close-marker
3080 // site treats `pre_content` as the leading bytes of the
3081 // structural body — same shape produced by `emit_open_tag_tokens`
3082 // for single-line opens.
3083 pre_content.push_str(after_gt);
3084 pre_content.push_str(newline_str);
3085 continue;
3086 }
3087 if !after_gt.is_empty() {
3088 builder.token(SyntaxKind::TEXT.into(), after_gt);
3089 }
3090 }
3091
3092 if !newline_str.is_empty() {
3093 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3094 }
3095 }
3096}
3097
3098/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3099/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3100/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3101/// node is added. Pandoc's projector reads attributes only for `<div>` /
3102/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3103fn emit_multiline_open_tag_simple(
3104 builder: &mut GreenNodeBuilder<'static>,
3105 lines: &[&str],
3106 start_pos: usize,
3107 end_line_idx: usize,
3108 bq_depth: usize,
3109) {
3110 for (line_idx, raw) in lines
3111 .iter()
3112 .enumerate()
3113 .take(end_line_idx + 1)
3114 .skip(start_pos)
3115 {
3116 let stripped = if bq_depth > 0 {
3117 strip_n_blockquote_markers(raw, bq_depth)
3118 } else {
3119 raw
3120 };
3121 let bq_prefix_len = raw.len() - stripped.len();
3122 // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3123 // re-emit prefixes only for subsequent lines.
3124 if bq_prefix_len > 0 && line_idx != start_pos {
3125 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3126 }
3127 let (line_no_nl, newline_str) = strip_newline(stripped);
3128 if !line_no_nl.is_empty() {
3129 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3130 }
3131 if !newline_str.is_empty() {
3132 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3133 }
3134 }
3135}
3136
3137/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3138/// `<div` literal up to end-of-line. Called only from
3139/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3140/// pure attribute (and possibly inter-attribute whitespace).
3141fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3142 if region.is_empty() {
3143 return;
3144 }
3145 let bytes = region.as_bytes();
3146 // Split a leading run of whitespace into a WHITESPACE token so the
3147 // HTML_ATTRS node holds only attribute bytes.
3148 let ws_end = bytes
3149 .iter()
3150 .position(|&b| !matches!(b, b' ' | b'\t'))
3151 .unwrap_or(bytes.len());
3152 if ws_end > 0 {
3153 builder.token(SyntaxKind::WHITESPACE.into(), ®ion[..ws_end]);
3154 }
3155 let attrs_text = ®ion[ws_end..];
3156 if !attrs_text.is_empty() {
3157 emit_html_attrs_node(builder, attrs_text);
3158 }
3159}
3160
3161/// Emit one continuation line of an HTML block, preserving any blockquote
3162/// markers as structural tokens (so the CST stays byte-equal to the source
3163/// and downstream consumers can strip them per-context).
3164fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3165 let inner = if bq_depth > 0 {
3166 let stripped = strip_n_blockquote_markers(line, bq_depth);
3167 let prefix_len = line.len() - stripped.len();
3168 if prefix_len > 0 {
3169 for ch in line[..prefix_len].chars() {
3170 if ch == '>' {
3171 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3172 } else {
3173 let mut buf = [0u8; 4];
3174 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3175 }
3176 }
3177 }
3178 stripped
3179 } else {
3180 line
3181 };
3182
3183 let (line_without_newline, newline_str) = strip_newline(inner);
3184 if !line_without_newline.is_empty() {
3185 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3186 }
3187 if !newline_str.is_empty() {
3188 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3189 }
3190}
3191
3192#[cfg(test)]
3193mod tests {
3194 use super::*;
3195
3196 #[test]
3197 fn test_try_parse_html_comment() {
3198 assert_eq!(
3199 try_parse_html_block_start("<!-- comment -->", false),
3200 Some(HtmlBlockType::Comment)
3201 );
3202 assert_eq!(
3203 try_parse_html_block_start(" <!-- comment -->", false),
3204 Some(HtmlBlockType::Comment)
3205 );
3206 }
3207
3208 #[test]
3209 fn test_try_parse_div_tag() {
3210 assert_eq!(
3211 try_parse_html_block_start("<div>", false),
3212 Some(HtmlBlockType::BlockTag {
3213 tag_name: "div".to_string(),
3214 is_verbatim: false,
3215 closed_by_blank_line: false,
3216 depth_aware: true,
3217 closes_at_open_tag: false,
3218 is_closing: false,
3219 })
3220 );
3221 assert_eq!(
3222 try_parse_html_block_start("<div class=\"test\">", false),
3223 Some(HtmlBlockType::BlockTag {
3224 tag_name: "div".to_string(),
3225 is_verbatim: false,
3226 closed_by_blank_line: false,
3227 depth_aware: true,
3228 closes_at_open_tag: false,
3229 is_closing: false,
3230 })
3231 );
3232 }
3233
3234 #[test]
3235 fn test_try_parse_script_tag() {
3236 assert_eq!(
3237 try_parse_html_block_start("<script>", false),
3238 Some(HtmlBlockType::BlockTag {
3239 tag_name: "script".to_string(),
3240 is_verbatim: true,
3241 closed_by_blank_line: false,
3242 depth_aware: true,
3243 closes_at_open_tag: false,
3244 is_closing: false,
3245 })
3246 );
3247 }
3248
3249 #[test]
3250 fn test_try_parse_processing_instruction() {
3251 assert_eq!(
3252 try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3253 Some(HtmlBlockType::ProcessingInstruction)
3254 );
3255 }
3256
3257 #[test]
3258 fn test_try_parse_declaration() {
3259 // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3260 assert_eq!(
3261 try_parse_html_block_start("<!DOCTYPE html>", true),
3262 Some(HtmlBlockType::Declaration)
3263 );
3264 // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3265 // just uppercase. Lowercase doctype must match too.
3266 assert_eq!(
3267 try_parse_html_block_start("<!doctype html>", true),
3268 Some(HtmlBlockType::Declaration)
3269 );
3270 // Pandoc dialect does not — bare declarations fall through to
3271 // paragraph parsing.
3272 assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3273 assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3274 }
3275
3276 #[test]
3277 fn test_dialect_specific_block_tag_membership() {
3278 // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3279 // CommonMark §4.6 type-6 plus a few additions. These tags
3280 // diverge between dialects:
3281 // CM-only block tags (Pandoc treats as inline raw HTML):
3282 // dialog, legend, menuitem, optgroup, option, frame,
3283 // base, basefont, link, param
3284 // Pandoc-only block tags (CM doesn't recognize):
3285 // canvas, hgroup, isindex, meta, output
3286 for cm_only in [
3287 "<dialog>",
3288 "<legend>",
3289 "<menuitem>",
3290 "<optgroup>",
3291 "<option>",
3292 "<frame>",
3293 "<base>",
3294 "<basefont>",
3295 "<link>",
3296 "<param>",
3297 ] {
3298 assert!(
3299 matches!(
3300 try_parse_html_block_start(cm_only, true),
3301 Some(HtmlBlockType::BlockTag { .. })
3302 ),
3303 "{cm_only} should be a block-tag start under CommonMark",
3304 );
3305 assert_eq!(
3306 try_parse_html_block_start(cm_only, false),
3307 None,
3308 "{cm_only} should NOT be a block-tag start under Pandoc",
3309 );
3310 }
3311 for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3312 // Under CM these are not type-6 BlockTags; they may still match
3313 // type-7 (complete tag on a line) which has different semantics.
3314 assert!(
3315 !matches!(
3316 try_parse_html_block_start(pandoc_only, true),
3317 Some(HtmlBlockType::BlockTag { .. })
3318 ),
3319 "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3320 );
3321 assert!(
3322 matches!(
3323 try_parse_html_block_start(pandoc_only, false),
3324 Some(HtmlBlockType::BlockTag { .. })
3325 ),
3326 "{pandoc_only} should be a block-tag start under Pandoc",
3327 );
3328 }
3329 }
3330
3331 #[test]
3332 fn test_pandoc_inline_block_tag_membership() {
3333 // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3334 // fresh-block positions under Pandoc dialect. We list the
3335 // non-void, non-script subset (verbatim `script` is handled
3336 // via the verbatim path; void elements are deferred — see
3337 // PANDOC_INLINE_BLOCK_TAGS docs).
3338 for tag in [
3339 "<button>",
3340 "<iframe>",
3341 "<video>",
3342 "<audio>",
3343 "<noscript>",
3344 "<object>",
3345 "<map>",
3346 "<progress>",
3347 "<del>",
3348 "<ins>",
3349 "<svg>",
3350 "<applet>",
3351 ] {
3352 assert!(
3353 matches!(
3354 try_parse_html_block_start(tag, false),
3355 Some(HtmlBlockType::BlockTag {
3356 depth_aware: true,
3357 ..
3358 })
3359 ),
3360 "{tag} should be a depth-aware block-tag start under Pandoc",
3361 );
3362 }
3363 // Closing forms of inline-block tags also start a block under
3364 // Pandoc — pandoc-native pins `</button>` standalone as a
3365 // single-line `RawBlock`. These use `closes_at_open_tag: true`
3366 // (no balanced match — the close emits as a one-line block on
3367 // its own).
3368 for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3369 assert!(
3370 matches!(
3371 try_parse_html_block_start(closing, false),
3372 Some(HtmlBlockType::BlockTag {
3373 depth_aware: false,
3374 closes_at_open_tag: true,
3375 ..
3376 })
3377 ),
3378 "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3379 );
3380 }
3381 }
3382
#[test]
fn test_pandoc_void_block_tag_membership() {
    // Shape shared by every Pandoc-dialect assertion below: a block tag
    // that is not depth-aware and that ends on the open-tag line.
    let is_single_line_block = |line: &str| {
        matches!(
            try_parse_html_block_start(line, false),
            Some(HtmlBlockType::BlockTag {
                depth_aware: false,
                closes_at_open_tag: true,
                ..
            })
        )
    };

    // Pandoc's void `eitherBlockOrInline` tags start an HTML block at
    // fresh-block positions under the Pandoc dialect; there is no closing
    // tag to match, so the block always ends on the open-tag line.
    let void_opens = [
        "<area>",
        "<embed>",
        "<source>",
        "<track>",
        "<embed src=\"foo.swf\">",
        "<source src=\"foo.mp4\" type=\"video/mp4\">",
    ];
    for tag in void_opens {
        assert!(
            is_single_line_block(tag),
            "{tag} should be a void block-tag start under Pandoc",
        );
    }

    // Void elements have no closing tag in HTML, yet `</embed>` and
    // friends show up in the wild. pandoc-native still emits them as
    // `RawBlock`s at fresh-block positions, so the closing forms get the
    // same single-line shape.
    let void_closes = ["</area>", "</embed>", "</source>", "</track>"];
    for closing in void_closes {
        assert!(
            is_single_line_block(closing),
            "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
        );
    }

    // CommonMark dialect skips the void-tag block-start path entirely.
    // `<embed>` and `<area>` are not CM type-6 tags, so they fall through
    // to type 7 (a complete tag on a line by itself).
    for tag in ["<embed>", "<area>"] {
        assert_eq!(
            try_parse_html_block_start(tag, true),
            Some(HtmlBlockType::Type7)
        );
    }

    // `<source>` and `<track>` ARE in the CM type-6 BLOCK_TAGS set, so
    // they start a block with type-6 semantics (blank line closes it,
    // open-tag line does not) rather than the Pandoc void-tag shape.
    for tag in ["<source src=\"x\">", "<track src=\"x\">"] {
        assert!(matches!(
            try_parse_html_block_start(tag, true),
            Some(HtmlBlockType::BlockTag {
                closed_by_blank_line: true,
                closes_at_open_tag: false,
                ..
            })
        ));
    }
}
3459
#[test]
fn test_find_multiline_open_end() {
    // Every case starts scanning at line 0 and hands the first line in as
    // the "first line" argument, so only the lines, the tag name and the
    // container prefix vary between cases.
    let scan = |lines: &[&str], tag: &str, prefix: &ContainerPrefix| {
        find_multiline_open_end(lines, 0, lines[0], tag, prefix)
    };
    let bare = ContainerPrefix::default();

    // An open tag that completes on its first line yields None (the
    // caller takes the regular path).
    assert_eq!(scan(&["<div id=\"x\">"], "div", &bare), None);
    assert_eq!(scan(&["<embed src=\"x\">"], "embed", &bare), None);

    // A multi-line open yields the index of the line holding the
    // closing `>`.
    assert_eq!(scan(&["<embed", " src=\"x\">"], "embed", &bare), Some(1));
    assert_eq!(
        scan(
            &["<embed", " src=\"x\"", " type=\"video\">"],
            "embed",
            &bare
        ),
        Some(2)
    );

    // A tag-name mismatch yields None; the comparison on the tag name is
    // case-insensitive.
    assert_eq!(scan(&["<embed", " src=\"x\">"], "div", &bare), None);
    assert_eq!(scan(&["<EMBED", " src=\"x\">"], "embed", &bare), Some(1));

    // A `>` inside quotes does not end the open tag, and the quote state
    // carries over line boundaries.
    assert_eq!(
        scan(&["<embed title=\"a>b", " c\">"], "embed", &bare),
        Some(1)
    );

    // Without any `>` at all the result is None.
    assert_eq!(scan(&["<embed", " src=\"x\""], "embed", &bare), None);

    // Continuation lines inside a blockquote: the bq markers are stripped
    // before scanning, so the `> ` prefix is not mistaken for a close.
    assert_eq!(
        scan(&["<div", "> id=\"x\">"], "div", &ContainerPrefix::bq_only(1)),
        Some(1)
    );

    // Nested blockquote: two `> ` markers are stripped per line.
    assert_eq!(
        scan(
            &["<section", "> > id=\"x\">"],
            "section",
            &ContainerPrefix::bq_only(2)
        ),
        Some(1)
    );
}
3572
#[test]
fn test_pandoc_html_open_tag_closes() {
    // All cases scan from line 0 with an empty container prefix.
    let closes =
        |lines: &[&str]| pandoc_html_open_tag_closes(lines, 0, &ContainerPrefix::default());

    // Complete on a single line: the `>` is found on the first line.
    assert!(closes(&["<div>"]));
    assert!(closes(&["<embed src=\"x\">"]));

    // Complete across lines: the `>` is found on a later line.
    assert!(closes(&["<div", " id=\"x\">", "body", "</div>"]));
    assert!(closes(&["<embed", " src=\"x.png\" alt=\"y\">"]));

    // A quoted `>` does not close the tag; quote state is threaded
    // through the scan.
    assert!(!closes(&["<div title=\"a>b", " c\""]));
    assert!(closes(&["<div title=\"a>b", " c\">"]));

    // Incomplete: no `>` anywhere, which pandoc treats as paragraph text.
    assert!(!closes(&["<embed"]));
    assert!(!closes(&["<div", "foo", "bar"]));

    // Pandoc tolerates blank lines in the middle of an open tag (its
    // `htmlTag` reads across them); scanning continues until EOF or `>`.
    assert!(closes(&["<div", "", "id=\"x\">"]));
}
3627
#[test]
fn test_try_parse_cdata() {
    let cdata_line = "<![CDATA[content]]>";

    // Under the CommonMark dialect a CDATA section is a type-5 HTML block.
    let under_commonmark = try_parse_html_block_start(cdata_line, true);
    assert_eq!(under_commonmark, Some(HtmlBlockType::CData));

    // The Pandoc dialect does not recognize it as a block start.
    let under_pandoc = try_parse_html_block_start(cdata_line, false);
    assert_eq!(under_pandoc, None);
}
3641
#[test]
fn test_extract_block_tag_name_open_only() {
    // (line, expected tag name) pairs with closing tags NOT accepted.
    let cases: [(&str, Option<&str>); 6] = [
        ("<div>", Some("div")),
        ("<div class=\"test\">", Some("div")),
        ("<div/>", Some("div")),
        // Closing tags, an empty tag and a tag name preceded by a space
        // all yield no name.
        ("</div>", None),
        ("<>", None),
        ("< div>", None),
    ];

    for (line, expected) in cases {
        assert_eq!(
            extract_block_tag_name(line, false),
            expected.map(String::from)
        );
    }
}
3660
#[test]
fn test_extract_block_tag_name_with_closing() {
    // CommonMark §4.6 type-6 starts accept closing tags as well, with or
    // without a space before the `>`.
    for closing in ["</div>", "</div >"] {
        let name = extract_block_tag_name(closing, true);
        assert_eq!(name, Some(String::from("div")));
    }
}
3673
#[test]
fn test_commonmark_type6_closing_tag_start() {
    // Under CommonMark, a bare `</div>` must parse to exactly this block
    // tag: flagged as a closing form and closed by a blank line.
    let expected = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_closing: true,
        closed_by_blank_line: true,
        is_verbatim: false,
        depth_aware: false,
        closes_at_open_tag: false,
    };

    let parsed = try_parse_html_block_start("</div>", true);
    assert_eq!(parsed, Some(expected));
}
3688
#[test]
fn test_commonmark_type7_open_tag() {
    // `<a>` is not a type-6 tag. Alone on a line it is a type-7 start
    // under CommonMark and is rejected under the non-CommonMark dialect.
    let anchor_open = "<a href=\"foo\">";

    let commonmark = try_parse_html_block_start(anchor_open, true);
    assert_eq!(commonmark, Some(HtmlBlockType::Type7));

    let non_commonmark = try_parse_html_block_start(anchor_open, false);
    assert_eq!(non_commonmark, None);
}
3699
#[test]
fn test_commonmark_type7_close_tag() {
    // A closing tag on a line by itself is a type-7 start under
    // CommonMark.
    let parsed = try_parse_html_block_start("</ins>", true);
    assert_eq!(parsed, Some(HtmlBlockType::Type7));
}
3707
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
    // Only whitespace may follow a complete tag; trailing text means the
    // line is not a block start.
    let parsed = try_parse_html_block_start("<a> hi", true);
    assert_eq!(parsed, None);
}
3713
#[test]
fn test_is_closing_marker_comment() {
    let comment = HtmlBlockType::Comment;

    // Lines carrying `-->` close a comment block, with or without text
    // before the marker.
    for line in ["-->", "end -->"] {
        assert!(is_closing_marker(line, &comment));
    }

    // The opening marker does not close it.
    assert!(!is_closing_marker("<!--", &comment));
}
3721
#[test]
fn test_is_closing_marker_tag() {
    let div_block = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: false,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: false,
    };

    // Accepted closers: the exact tag, an upper-case variant (matching is
    // case-insensitive), and a close tag preceded by content.
    for line in ["</div>", "</DIV>", "content</div>"] {
        assert!(is_closing_marker(line, &div_block));
    }

    // An opening tag is not a closing marker.
    assert!(!is_closing_marker("<div>", &div_block));
}
3737
#[test]
fn test_parse_html_comment_block() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // A comment that opens and closes on the same line.
    let source = "<!-- comment -->\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &ParserOptions::default(),
    );

    // The returned position is just past the single comment line.
    assert_eq!(next_line, 1);
}
3758
#[test]
fn test_parse_div_block() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Open tag, one content line, close tag: three lines in total.
    let source = "<div>\ncontent\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &ParserOptions::default(),
    );

    // The returned position is past all three lines.
    assert_eq!(next_line, 3);
}
3779
#[test]
fn test_parse_html_block_no_closing() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The `<div>` is never closed before the input ends.
    let source = "<div>\ncontent\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &ParserOptions::default(),
    );

    // Even without a closing tag, every line is consumed.
    assert_eq!(next_line, 2);
}
3801
#[test]
fn test_parse_div_block_nested_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Pandoc dialect: with `<div>...<div>...</div>...</div>` the block
    // must end at the OUTER `</div>`, not at the first `</div>` seen. A
    // CommonMark-style "first close" scan would be wrong here; Pandoc's
    // div parser is depth-aware (mirrors `htmlInBalanced`).
    let source = concat!(
        "<div id=\"outer\">\n",
        "\n",
        "<div id=\"inner\">\n",
        "\n",
        "deep content\n",
        "\n",
        "</div>\n",
        "\n",
        "</div>\n",
    );
    let lines: Vec<&str> = split_lines_inclusive(source);

    // `false` selects the Pandoc dialect.
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK_DIV,
        &ParserOptions::default(),
    );

    // Nine lines, all consumed: outer-open, blank, inner-open, blank,
    // content, blank, inner-close, blank, outer-close.
    assert_eq!(next_line, 9);
}
3830
#[test]
fn test_parse_div_block_same_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // `<div>foo</div>` on one line: one open, one close, depth back at
    // zero, so the block closes on its first line. Depth-aware tracking
    // must not regress this.
    let source = "<div>foo</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK_DIV,
        &ParserOptions::default(),
    );

    assert_eq!(next_line, 1);
}
3852
#[test]
fn test_commonmark_verbatim_first_close() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // CommonMark §4.6 type-1 verbatim tag (`<script>`): the block ends at
    // the first matching close and is not depth-aware. A bogus inner
    // `<script>` hidden in a JS string must not keep the block open past
    // the first `</script>`.
    let source = "<script>\nlet x = '<script>';\n</script>\n";
    let lines: Vec<&str> = split_lines_inclusive(source);

    // `true` selects the CommonMark dialect.
    let start = try_parse_html_block_start(lines[0], true).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &ParserOptions::default(),
    );

    // Three lines; the close is on line 2, so the returned position is 3.
    assert_eq!(next_line, 3);
}
3878
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag spreads over several lines and its closing `>` sits
    // alone on a line:
    //
    // <div
    // id="x"
    // class="y"
    // >
    //
    // foo
    //
    // </div>
    //
    // The open tag covers lines 0..=3; content begins at line 4.
    let input = "<div\n id=\"x\"\n class=\"y\"\n>\n\nfoo\n\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(input);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK_DIV,
        &ParserOptions::default(),
    );

    // Eight lines: open-line 0, open-line 1 (` id="x"`), open-line 2
    // (` class="y"`), open-line 3 (`>`), blank, foo, blank, </div>.
    assert_eq!(next_line, 8);

    let root = crate::syntax::SyntaxNode::new_root(builder.finish());

    // The CST must expose a structural HTML_ATTRS region holding the
    // attribute bytes (so the salsa anchor walk picks up `id="x"`).
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token reproduces the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, input);
}
3931
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag spreads over several lines and its closing `>` sits at
    // the end of the last attribute line. Case 0262 already covers this
    // pattern; it is pinned here too so the structural HTML_ATTRS
    // exposure is checked as well.
    let input = "<div\n id=\"x\"\n class=\"y\">\nfoo\n</div>\n";
    let lines: Vec<&str> = split_lines_inclusive(input);
    let start = try_parse_html_block_start(lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK_DIV,
        &ParserOptions::default(),
    );

    // Five lines: open-line 0, open-line 1, open-line 2 (with `>`), foo,
    // </div>.
    assert_eq!(next_line, 5);

    let root = crate::syntax::SyntaxNode::new_root(builder.finish());

    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenating every token must give back the input unchanged.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, input);
}
3972
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // A CommonMark `<div>` block followed by a blank line and more text.
    let source = "<div>\nfoo\n\nbar\n";
    let lines: Vec<&str> = split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], true).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        start,
        &ContainerPrefix::default(),
        SyntaxKind::HTML_BLOCK,
        &ParserOptions::default(),
    );

    // The block holds `<div>\nfoo\n` and stops at the blank line
    // (line 2).
    assert_eq!(next_line, 2);
}
3994}