panache_parser/parser/blocks/html_blocks.rs
1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10 ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
13
/// HTML block-level tag names used for CommonMark §4.6 type-6 HTML
/// block starts: a line beginning with one of these tags (open or close
/// form) starts an HTML block under the CommonMark dialect.
///
/// Entries are lowercase ASCII; membership checks in this file compare
/// case-insensitively.
///
/// NOTE(review): the list contains `source` and lacks `search`, which
/// looks like the CommonMark 0.30 type-6 set rather than 0.31 — confirm
/// which spec revision this is meant to track.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
80
/// Tags that contain raw/verbatim content (no Markdown processing inside).
/// These are the CommonMark §4.6 type-1 start tags: `<script>`, `<style>`,
/// `<pre>` and `<textarea>`. Entries are lowercase ASCII.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
83
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`). The verbatim tags
/// (`pre`, `script`, `style`, `textarea`) are part of this list as well.
///
/// Pandoc's `eitherBlockOrInline` set is tracked separately:
/// - the non-void members (`audio`, `button`, `iframe`, `noscript`,
///   `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`, `applet`)
///   live in [`PANDOC_INLINE_BLOCK_TAGS`];
/// - the void members (`embed`, `area`, `source`, `track`) live in
///   [`PANDOC_VOID_BLOCK_TAGS`];
/// - `script` is listed here, as a strict block tag.
///
/// Inline-block tags act as block starters at fresh-block positions but
/// stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries are lowercase ASCII; membership checks in this file compare
/// case-insensitively.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
161
162/// Whether `name` (case-insensitive) is one of the HTML block-level tags
163/// recognized by CommonMark §4.6 type-6.
164pub fn is_html_block_tag_name(name: &str) -> bool {
165 let lower = name.to_ascii_lowercase();
166 BLOCK_TAGS.contains(&lower.as_str())
167}
168
169/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
170/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
171/// Used by the pandoc-native projector's `split_html_block_by_tags` to
172/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
173/// the block — block-level tags emit as separate `RawBlock` entries;
174/// inline tags stay inline in the surrounding `Plain` content.
175pub fn is_pandoc_block_tag_name(name: &str) -> bool {
176 let lower = name.to_ascii_lowercase();
177 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
178}
179
/// Non-void, non-verbatim members of pandoc's `eitherBlockOrInline` set
/// (mirrors `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`):
/// tags that `isBlockTag` accepts as block starters but `isInlineTag`
/// ALSO accepts (because `name ∉ blockTags`). At top level (or after a
/// blank line) pandoc treats `<iframe>foo</iframe>` as
/// RawBlock+Plain+RawBlock, but inside an existing HTML block once a
/// paragraph has started parsing, the same tag stays inline as
/// `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// What is deliberately NOT in this table:
/// - void elements (`area`, `embed`, `source`, `track`) — they live in
///   [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
///   rule but emit a single RawBlock per instance instead of a
///   matched-pair lift;
/// - `script` — it is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and the strict-block check
///   fires first regardless.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];

/// Reports whether `name` is one of pandoc's `eitherBlockOrInline` tags,
/// excluding void elements and `script` (see
/// [`PANDOC_INLINE_BLOCK_TAGS`]). The comparison ignores ASCII case.
pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
    PANDOC_INLINE_BLOCK_TAGS
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(name))
}
212
/// Void-element subset of pandoc's `eitherBlockOrInline` set (mirrors
/// the void list in
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs` minus those
/// handled elsewhere: `br`, `wbr`, `img` and `input` are inline-only).
/// What remains is `area`, `embed`, `source` and `track`.
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];

/// Reports whether `name` is one of pandoc's void `eitherBlockOrInline`
/// tags (`area`, `embed`, `source`, `track`). The comparison ignores
/// ASCII case.
pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
    PANDOC_VOID_BLOCK_TAGS
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(name))
}
235
236/// Whether the given tag name is eligible for the Phase 6 / Fix #4
237/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
238/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
239/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
240/// and NOT void. These are the tags where pandoc parses the body as
241/// fresh markdown between RawBlock emissions of the open/close tags —
242/// exactly the shape we can lift into structural CST children.
243///
244/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
245/// additional gate at the lift-gate site: the lift is abandoned when
246/// the body's first non-blank content is a void block tag at a
247/// fresh-block position (`<video>\n<source ...>\n</video>` projects
248/// per-tag rather than matched-pair, mirroring pandoc).
249///
250/// `<div>` is intentionally excluded — it has its own lift path
251/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
252/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
253pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
254 let lower = name.to_ascii_lowercase();
255 if VERBATIM_TAGS.contains(&lower.as_str()) {
256 return false;
257 }
258 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
259 return false;
260 }
261 if lower == "div" {
262 return false;
263 }
264 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
265 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
266}
267
268/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
269/// — anything that has an opening and a matching closing form whose
270/// `</tag>` would be recognized by the dispatcher as a separate block
271/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
272/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
273/// Void tags are excluded — they have no close form.
274///
275/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
276/// an open inside the buffer whose close would otherwise interrupt the
277/// list item mid-construct.
278pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
279 let lower = name.to_ascii_lowercase();
280 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
281 return false;
282 }
283 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
284 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
285 || VERBATIM_TAGS.contains(&lower.as_str())
286}
287
288/// Open-tag-attribute tokenization gate for non-div strict-block tags
289/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
290/// open tag is eligible for finer-grained tokenization
291/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
292/// driving the full body lift — that's the `bq_clean_lift` path. The
293/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
294/// the salsa anchor index.
295///
296/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
297/// wrapper) regardless of bq depth, so this gate skips it.
298fn bq_strict_attr_emit_tag_name(
299 wrapper_kind: SyntaxKind,
300 block_type: &HtmlBlockType,
301 bq_depth: usize,
302) -> Option<&str> {
303 if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
304 return None;
305 }
306 match block_type {
307 HtmlBlockType::BlockTag {
308 tag_name,
309 is_verbatim: false,
310 closed_by_blank_line: false,
311 depth_aware: true,
312 closes_at_open_tag: false,
313 is_closing: false,
314 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
315 _ => None,
316 }
317}
318
/// Information about a detected HTML block opening.
///
/// Produced by `try_parse_html_block_start`; the variant (and, for
/// `BlockTag`, its flags) determines how the end of the block is found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: `<!-- ... -->`
    Comment,
    /// Processing instruction: `<? ... ?>`
    ProcessingInstruction,
    /// Declaration: `<!...>`
    Declaration,
    /// CDATA section: `<![CDATA[ ... ]]>`
    CData,
    /// Tag-shaped block start (CommonMark types 6/1, or one of the
    /// Pandoc tag categories under Pandoc dialect). The flags below
    /// select the end-of-block semantics.
    BlockTag {
        /// Tag name, lowercased by `try_parse_html_block_start`. Under
        /// CommonMark it comes from `BLOCK_TAGS` or `VERBATIM_TAGS`;
        /// under Pandoc from `PANDOC_BLOCK_TAGS`,
        /// `PANDOC_INLINE_BLOCK_TAGS`, `PANDOC_VOID_BLOCK_TAGS` or
        /// `VERBATIM_TAGS`.
        tag_name: String,
        /// Whether the tag holds raw/verbatim content (`VERBATIM_TAGS`).
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics (block ends at blank
        /// line); otherwise the legacy "ends at matching `</tag>`"
        /// semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored
        /// when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to
        /// end-of-input — and for standalone closing forms under Pandoc.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror
        /// pandoc's `isInlineTag` special cases (e.g. `</script>` is
        /// inline even when `<script>` is not — pandoc treats the
        /// close-form as always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
363
/// Try to detect an HTML block opening from content.
///
/// Returns the block type if `content` (after `strip_leading_spaces`) is
/// a valid HTML block start for the selected dialect, `None` otherwise.
///
/// Recognition order (first match wins):
/// 1. comment (`<!--`) and processing instruction (`<?`) — both dialects;
/// 2. CDATA and declarations — CommonMark only;
/// 3. tag-shaped starts, classified against the dialect's tag tables;
/// 4. CommonMark type 7 (any complete tag alone on its line).
///
/// `is_commonmark` selects CommonMark §4.6 semantics: type-6 blocks end
/// at the next blank line (rather than at a matching close tag), closing
/// forms of type-6 tags start a block, and type 7 is recognized. With
/// `is_commonmark == false` (Pandoc dialect) closing forms of
/// strict-block, verbatim, inline-block and void tags are recognized as
/// single-line blocks (`closes_at_open_tag: true`), and every other
/// closing form is rejected.
pub(crate) fn try_parse_html_block_start(
    content: &str,
    is_commonmark: bool,
) -> Option<HtmlBlockType> {
    let trimmed = strip_leading_spaces(content);

    // Must start with <
    if !trimmed.starts_with('<') {
        return None;
    }

    // HTML comment
    if trimmed.starts_with("<!--") {
        return Some(HtmlBlockType::Comment);
    }

    // Processing instruction
    if trimmed.starts_with("<?") {
        return Some(HtmlBlockType::ProcessingInstruction);
    }

    // CDATA section — CommonMark dialect only. Pandoc-markdown does not
    // recognize bare CDATA as a raw HTML block; the literal bytes fall
    // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
    // text is parsed as inline markdown, etc).
    if is_commonmark && trimmed.starts_with("<![CDATA[") {
        return Some(HtmlBlockType::CData);
    }

    // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
    // does not recognize bare declarations as raw HTML blocks (its
    // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
    // tag-shaped blocks); the bytes fall through to paragraph parsing.
    if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
        // `<!` is two ASCII bytes, so slicing at 2 is on a char boundary,
        // and the `len() > 2` guard means `next()` below always yields a char.
        let after_bang = &trimmed[2..];
        if after_bang.chars().next()?.is_ascii_alphabetic() {
            return Some(HtmlBlockType::Declaration);
        }
    }

    // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
    // Pandoc-native recognizes standalone closing forms of strict-block tags
    // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
    // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
    // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
    // the open-tag line via `closes_at_open_tag: true`.
    if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
        let tag_lower = tag_name.to_lowercase();
        let is_closing = trimmed.starts_with("</");

        // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
        // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
        // Unlike inline-block / void closes, these CAN interrupt a running
        // paragraph (the dispatcher's `cannot_interrupt` only covers the
        // inline-block / void categories). Inline-block / void closes are
        // handled by their own branches further below.
        if !is_commonmark
            && is_closing
            && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
                || VERBATIM_TAGS.contains(&tag_lower.as_str()))
            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
        {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: false,
                closes_at_open_tag: true,
                is_closing: true,
            });
        }

        // Under Pandoc, remaining closing forms (truly inline-only tags like
        // `</em>`, `</span>`) are not block starts — fall through to the
        // existing inline-html path. Inline-block + void closes are caught
        // by the dedicated branches further below.
        if !is_commonmark
            && is_closing
            && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
            && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
        {
            return None;
        }

        // Check if it's a block-level tag. Pandoc and CommonMark disagree on
        // membership: pandoc's `blockHtmlTags` (see
        // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
        // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
        // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
        let is_block_tag = if is_commonmark {
            BLOCK_TAGS.contains(&tag_lower.as_str())
        } else {
            PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
        };
        if is_block_tag {
            // Under Pandoc only opening forms reach this point (closing
            // forms returned above); under CommonMark `is_closing` may be
            // either value.
            let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim,
                closed_by_blank_line: is_commonmark && !is_verbatim,
                depth_aware: !is_commonmark,
                closes_at_open_tag: false,
                is_closing,
            });
        }

        // Pandoc dialect also treats `eitherBlockOrInline` tags as block
        // starters at fresh-block positions. The block dispatcher caller
        // gates these as `cannot_interrupt` (mirrors pandoc — they never
        // interrupt a running paragraph; only start a fresh block when
        // following a blank line or at document start). Closing forms
        // (`</video>`) emit as a single-line `RawBlock` with no balanced
        // match — pandoc-native pins this for standalone closes.
        if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: !is_closing,
                closes_at_open_tag: is_closing,
                is_closing,
            });
        }

        // Pandoc dialect also recognizes the void subset of
        // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
        // These have no closing tag, so the parser closes the block
        // immediately on the open-tag line; the projector's
        // `split_html_block_by_tags` handles the same-line splitting
        // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
        // non-void inline-block tags, void tags never interrupt a
        // running paragraph (gated as `cannot_interrupt` in the
        // dispatcher). Closing forms (`</embed>`) — semantically
        // nonsensical for void elements — pandoc still emits as a
        // single-line `RawBlock`; mirror that.
        if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: false,
                closed_by_blank_line: false,
                depth_aware: false,
                closes_at_open_tag: true,
                is_closing,
            });
        }

        // Also accept verbatim tags even if not in BLOCK_TAGS list — but
        // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
        // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
        // do not start a type-1 block. Letting `</pre>` through here would
        // wrongly interrupt a paragraph.
        if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
            return Some(HtmlBlockType::BlockTag {
                tag_name: tag_lower,
                is_verbatim: true,
                closed_by_blank_line: false,
                depth_aware: !is_commonmark,
                closes_at_open_tag: false,
                is_closing: false,
            });
        }
    }

    // Type 7 (CommonMark only): complete open or close tag on a line by
    // itself, tag name not in the type-1 verbatim list.
    if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
    {
        // `end` is used as the byte offset just past the parsed tag; only
        // whitespace may follow it on the line.
        let rest = &trimmed[end..];
        let only_ws = rest
            .bytes()
            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
        if only_ws {
            // Reject if the tag name belongs to the type-1 verbatim set
            // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
            // type-1 starts above, so seeing one here means the opener
            // had a different shape (e.g. `<pre/>` self-closing) that
            // shouldn't trigger type 7 either. Conservatively skip.
            let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
            let name_end = leading
                .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
                .unwrap_or(leading.len());
            let name = leading[..name_end].to_ascii_lowercase();
            if !VERBATIM_TAGS.contains(&name.as_str()) {
                return Some(HtmlBlockType::Type7);
            }
        }
    }

    None
}
561
/// Pulls the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted
/// only when `accept_closing` is true (CommonMark §4.6 type 6 allows
/// either form); otherwise it yields `None`.
///
/// The name runs up to the first whitespace character, `>` or `/`, or
/// to the end of `text`. Per the spec the tag must be followed by a
/// space, tab, line ending, `>`, or `/>` — this boundary check is an
/// approximation of that rule.
///
/// Returns `None` when the name is empty, does not start with an ASCII
/// letter, or contains anything other than ASCII letters and digits.
/// The returned name keeps its original case.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let inside = text.strip_prefix('<')?;

    let candidate = match inside.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => inside,
    };

    // `split` always yields at least one (possibly empty) piece.
    let name = candidate
        .split(|c: char| c.is_whitespace() || matches!(c, '>' | '/'))
        .next()
        .unwrap_or("");

    // First character: ASCII letter. Remaining characters: ASCII alphanumeric.
    let mut chars = name.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    if starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric()) {
        Some(name.to_owned())
    } else {
        None
    }
}
606
607/// Whether this block type ends at a blank line (CommonMark types 6 & 7
608/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
609/// marker — only at end of input or the next blank line.
610fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
611 matches!(
612 block_type,
613 HtmlBlockType::Type7
614 | HtmlBlockType::BlockTag {
615 closed_by_blank_line: true,
616 ..
617 }
618 )
619}
620
621/// Check if a line contains the closing marker for the given HTML block type.
622/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
623/// blank-line-terminated types (6 in CommonMark, 7) never match here.
624fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
625 match block_type {
626 HtmlBlockType::Comment => line.contains("-->"),
627 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
628 HtmlBlockType::Declaration => line.contains('>'),
629 HtmlBlockType::CData => line.contains("]]>"),
630 HtmlBlockType::BlockTag {
631 tag_name,
632 closed_by_blank_line: false,
633 ..
634 } => {
635 // Look for closing tag </tagname>
636 let closing_tag = format!("</{}>", tag_name);
637 line.to_lowercase().contains(&closing_tag)
638 }
639 HtmlBlockType::BlockTag {
640 closed_by_blank_line: true,
641 ..
642 }
643 | HtmlBlockType::Type7 => false,
644 }
645}
646
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`.
///
/// Matching rules:
/// - the tag name is compared ASCII case-insensitively and must be
///   followed by whitespace, `>`, `/` or end of line;
/// - self-closing forms (`<tag .../>`) are NOT counted;
/// - tags whose name appears inside a quoted attribute value are NOT
///   counted — the scanner walks `<...>` brackets and respects `"`/`'`
///   quoting;
/// - a `<` that cannot begin markup (not followed by an ASCII letter,
///   `/`, `!` or `?`, e.g. the `<` in `a < b`) is literal text and does
///   not open a bracket, so it cannot hide a real tag that follows it;
/// - an unterminated trailing `<tag` (no `>` on this line) still counts,
///   after which scanning stops;
/// - an empty `tag_name` never matches and yields `(0, 0)`.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    if tag_name.is_empty() {
        return (0, 0);
    }

    let bytes = line.as_bytes();
    let tag_bytes = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;

        // A `<` followed by anything other than a letter, `/`, `!` or `?`
        // is plain text. Skip just this byte so a later real tag on the
        // same line is still seen.
        let starts_markup = bytes
            .get(after)
            .copied()
            .is_some_and(|b| b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?'));
        if !starts_markup {
            i = after;
            continue;
        }

        let is_close = bytes[after] == b'/';
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        // Byte-wise comparison: no lowercased copy of the line is needed,
        // and byte ranges cannot split a UTF-8 sequence in a harmful way.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag_bytes));
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to scan.
            break;
        }
    }

    (opens, closes)
}
727
/// Pandoc-dialect lift for HTML comments / processing instructions
/// whose close marker (`-->` / `?>`) is followed by non-whitespace bytes
/// on the SAME line. Pandoc-native emits a `RawBlock` for the marker
/// bytes only, then parses the remainder as fresh blocks.
///
/// Only the bytes trailing the close marker on the close line are
/// re-parsed here; lines after the close line are left to the outer
/// dispatcher.
///
/// Returns `Some(close_line_idx + 1)` — the index of the first line the
/// caller should continue from — when the split fires (caller must NOT
/// enter the legacy emission). Returns `None` to fall back to the legacy
/// path: `block_type` is neither `Comment` nor `ProcessingInstruction`,
/// no close marker was found, or only whitespace follows the marker on
/// the close line. Nothing is written to `builder` on the `None` paths.
///
/// CST shape on success:
/// ```text
/// HTML_BLOCK                   // `wrapper_kind`
///   HTML_BLOCK_TAG (open)      // same-line shape: bytes up to and incl
///     TEXT "<!-- hi -->"       // the close marker
///   ...                        // multi-line `<!--\n…\n-->` shape instead
///                              // emits: open HTML_BLOCK_TAG (first line),
///                              // optional HTML_BLOCK_CONTENT (middle
///                              // lines), close HTML_BLOCK_TAG (marker)
/// <sibling blocks>             // recursive parse of the same-line
///                              // trailing bytes
/// ```
fn try_parse_comment_pi_with_trailing_split(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    block_type: &HtmlBlockType,
    wrapper_kind: SyntaxKind,
    bq_depth: usize,
    config: &ParserOptions,
) -> Option<usize> {
    let marker: &str = match block_type {
        HtmlBlockType::Comment => "-->",
        HtmlBlockType::ProcessingInstruction => "?>",
        _ => return None,
    };

    // Find the close marker in the bq-stripped line content. For
    // bq_depth == 0 the inner content equals the raw line; for
    // bq_depth > 0 we look past the `>` markers stripped by the
    // outer dispatcher (line 0) and emitted as bq prefix below
    // (lines > 0). `marker_end_in_inner` is the byte offset of the
    // first byte AFTER the close marker, measured from the start
    // of the inner (post-strip) content.
    let mut close_line_idx: Option<usize> = None;
    let mut marker_end_in_inner: usize = 0;
    for (offset, line) in lines[start_pos..].iter().enumerate() {
        let inner = if bq_depth > 0 {
            strip_n_blockquote_markers(line, bq_depth)
        } else {
            line
        };
        if let Some(pos) = inner.find(marker) {
            close_line_idx = Some(start_pos + offset);
            marker_end_in_inner = pos + marker.len();
            break;
        }
    }
    let close_line_idx = close_line_idx?;
    let close_line = lines[close_line_idx];
    let close_inner = if bq_depth > 0 {
        strip_n_blockquote_markers(close_line, bq_depth)
    } else {
        close_line
    };
    // Byte length of the blockquote prefix removed from the close line.
    // NOTE(review): assumes `strip_n_blockquote_markers` returns a suffix
    // of its input — confirm against the helper.
    let close_prefix_len = close_line.len() - close_inner.len();
    let trailing = &close_inner[marker_end_in_inner..];

    // Only fire when there is non-whitespace content AFTER the close
    // marker on the close line. The legacy path correctly handles
    // the close-line-ends-at-close-marker shapes (`-->\n` followed
    // by separate blocks); only the same-line-trailing case needs
    // structural splitting. Trailing-whitespace-only handling
    // (`--> \n`) is a projector-side trim — separate concern.
    let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
    if !has_non_ws_trailing {
        return None;
    }

    // From here on the split is committed: every path below returns `Some`.
    builder.start_node(wrapper_kind.into());

    // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
    // middle `HTML_BLOCK_CONTENT` lines between open and close. The
    // close `HTML_BLOCK_TAG` carries only the bytes up to and
    // including the close marker — trailing bytes go to the sibling.
    if close_line_idx == start_pos {
        // Same-line shape: one HTML_BLOCK_TAG containing the close
        // marker's bytes. The newline lives on the trailing sibling.
        // Line 0's bq prefix (if any) was already emitted by the
        // outer dispatcher; emit only the inner marker bytes.
        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
        let close_part = &close_inner[..marker_end_in_inner];
        if !close_part.is_empty() {
            builder.token(SyntaxKind::TEXT.into(), close_part);
        }
        builder.finish_node();
    } else {
        // Multi-line shape: the open tag holds the first line
        // (`lines[start_pos]`), middle lines go inside
        // HTML_BLOCK_CONTENT, close tag holds only the bytes up to and
        // including the marker. Line 0's bq prefix was emitted by
        // the outer dispatcher; subsequent lines (middle + close)
        // need bq prefix re-emission inside the wrapper.
        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
        let first_line = lines[start_pos];
        let first_inner = if bq_depth > 0 {
            strip_n_blockquote_markers(first_line, bq_depth)
        } else {
            first_line
        };
        let (line_no_nl, nl) = strip_newline(first_inner);
        if !line_no_nl.is_empty() {
            builder.token(SyntaxKind::TEXT.into(), line_no_nl);
        }
        if !nl.is_empty() {
            builder.token(SyntaxKind::NEWLINE.into(), nl);
        }
        builder.finish_node();

        // Middle lines exist only when the close line is not directly
        // after the open line.
        if close_line_idx > start_pos + 1 {
            builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
            for content_line in &lines[start_pos + 1..close_line_idx] {
                emit_html_block_line(builder, content_line, bq_depth);
            }
            builder.finish_node();
        }

        builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
        if bq_depth > 0 && close_prefix_len > 0 {
            emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
        }
        let close_part = &close_inner[..marker_end_in_inner];
        if !close_part.is_empty() {
            builder.token(SyntaxKind::TEXT.into(), close_part);
        }
        builder.finish_node();
    }

    builder.finish_node(); // HTML_BLOCK

    // Recursively parse JUST the trailing bytes on the close line
    // and graft top-level children as siblings of the HTML_BLOCK we
    // just closed. We do NOT consume subsequent lines here — the
    // outer dispatcher continues from `close_line_idx + 1` and
    // handles container-boundary lines (`:::` div closes, blockquote
    // markers, list-marker continuations) correctly. Multi-line
    // softbreak continuation (`<!-- --> trailing\nmore\n` →
    // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
    // outer dispatcher sees `more` after the close line and starts
    // a fresh paragraph. Refdefs flow through from the outer config
    // (same pattern as `emit_html_block_body_lifted_inner`).
    //
    // `trailing` is always non-empty at this point (it contains at
    // least one non-whitespace byte, checked above); the guard is kept
    // as a defensive check.
    if !trailing.is_empty() {
        let mut inner_options = config.clone();
        let refdefs = config.refdef_labels.clone().unwrap_or_default();
        inner_options.refdef_labels = Some(refdefs.clone());
        let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
        let mut bq = None;
        graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
    }

    Some(close_line_idx + 1)
}
884
885/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
886/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
887/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
888/// identical to the source either way; only the wrapper retag changes.
889pub(crate) fn parse_html_block_with_wrapper(
890 builder: &mut GreenNodeBuilder<'static>,
891 lines: &[&str],
892 start_pos: usize,
893 block_type: HtmlBlockType,
894 prefix: &ContainerPrefix,
895 wrapper_kind: SyntaxKind,
896 config: &ParserOptions,
897) -> usize {
898 let bq_depth = prefix.bq_depth();
899 // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
900 // closes the RawBlock at the close marker (`-->` / `?>`) and parses
901 // any subsequent bytes (same-line trailing or following lines) as
902 // fresh blocks. The legacy path absorbs them into the HTML block
903 // wrapper, producing one oversized RawBlock. Handle the split here
904 // before entering the legacy emission so the CST encodes the
905 // sibling structure.
906 if config.dialect == crate::options::Dialect::Pandoc
907 && matches!(
908 block_type,
909 HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
910 )
911 && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
912 builder,
913 lines,
914 start_pos,
915 &block_type,
916 wrapper_kind,
917 bq_depth,
918 config,
919 )
920 {
921 return consumed;
922 }
923
924 // Start HTML block
925 builder.start_node(wrapper_kind.into());
926
927 let first_line = lines[start_pos];
928 let blank_terminated = ends_at_blank_line(&block_type);
929
930 // The block dispatcher has already emitted the bq prefix tokens for
931 // the first line; emit only the inner content as TEXT to keep the
932 // CST byte-equal to the source. List-marker bytes are stripped only
933 // when this dispatch fires on a list-marker line — for
934 // continuation-line dispatches (the much more common case) the
935 // leading indent is inner content, not upstream-emitted prefix.
936 let first_inner = prefix.strip_line_0_for_emission(first_line);
937
938 // Detect a multi-line open tag.
939 // - `<div>` (Pandoc lift): we tokenize each line structurally so the
940 // salsa anchor walk picks up `id` from the HTML_ATTRS region.
941 // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
942 // `<section>`, `<header>`, …): same structural emission, exposing
943 // `id` to the salsa anchor walk and enabling the body lift below.
944 // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
945 // without this, the parser closes the block after line 0 and the
946 // remainder of the open tag falls into following paragraphs;
947 // pandoc-native treats the whole multi-line open tag as a single
948 // `RawBlock`. Emission for void tags uses simple per-line
949 // TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
950 // from void tags).
951 let multiline_open_end = match (wrapper_kind, &block_type) {
952 (SyntaxKind::HTML_BLOCK_DIV, _) => {
953 find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
954 }
955 (
956 _,
957 HtmlBlockType::BlockTag {
958 tag_name,
959 closes_at_open_tag: true,
960 ..
961 },
962 ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
963 (
964 _,
965 HtmlBlockType::BlockTag {
966 tag_name,
967 is_verbatim: false,
968 closed_by_blank_line: false,
969 depth_aware: true,
970 closes_at_open_tag: false,
971 is_closing: false,
972 },
973 ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
974 find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
975 }
976 _ => None,
977 };
978
979 // Set up depth-aware close tracking when the block type asks for it
980 // (Pandoc dialect, balanced same-name tag matching). A `None` means
981 // we fall back to the legacy "first matching close" path via
982 // `is_closing_marker`. Computed up front so the lift-mode gate
983 // below can decide whether the open line already balances the
984 // block (same-line `<div>...</div>`).
985 let depth_aware_tag: Option<String> = match &block_type {
986 HtmlBlockType::BlockTag {
987 tag_name,
988 closed_by_blank_line: false,
989 depth_aware: true,
990 ..
991 } => Some(tag_name.clone()),
992 _ => None,
993 };
994 let mut depth: i64 = 1;
995 if let Some(tag_name) = &depth_aware_tag {
996 // Sum opens/closes across all open-tag lines (single-line: just
997 // line 0; multi-line: lines 0..=end_line_idx).
998 let last_open_line = multiline_open_end.unwrap_or(start_pos);
999 let mut opens = 0usize;
1000 let mut closes = 0usize;
1001 for line in &lines[start_pos..=last_open_line] {
1002 let inner = prefix.strip(line);
1003 let (o, c) = count_tag_balance(inner, tag_name);
1004 opens += o;
1005 closes += c;
1006 }
1007 depth = opens as i64 - closes as i64;
1008 }
1009
1010 // Same-line `<div>foo</div>` shape: the open line balances the
1011 // block under depth-aware tracking. We can lift this structurally
1012 // only when the open-tag trailing has exactly one `</div>` close,
1013 // zero `<div>` opens, and no non-whitespace content after the
1014 // close. Other same-line shapes (nested, trailing text, malformed)
1015 // fall through to the byte-reparse path.
1016 let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1017 && multiline_open_end.is_none()
1018 && depth_aware_tag.is_some()
1019 && depth <= 0;
1020 let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1021 let (line_without_newline, _) = strip_newline(first_inner);
1022 probe_same_line_lift(line_without_newline, "div")
1023 };
1024
1025 // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1026 // `<nav>`, …): the body parses as fresh markdown between RawBlock
1027 // emissions of the open/close tags. Covers the clean multi-line
1028 // shape (open tag stands alone on its line), open-trailing
1029 // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1030 // and same-line (`<form>foo</form>`). Multi-line open and
1031 // blockquote-wrapped non-div shapes still fall through to the
1032 // byte-walker path.
1033 let strict_block_tag_name: Option<&str> =
1034 if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1035 match &block_type {
1036 HtmlBlockType::BlockTag {
1037 tag_name,
1038 is_verbatim: false,
1039 closed_by_blank_line: false,
1040 depth_aware: true,
1041 closes_at_open_tag: false,
1042 is_closing: false,
1043 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1044 _ => None,
1045 }
1046 } else {
1047 None
1048 };
1049 // Same-line `<form>foo</form>` shape: the open line already
1050 // balances the block (`depth <= 0`). Lift only when the trailing
1051 // bytes after the open `>` end with `</tag>` and contain exactly
1052 // one close + zero nested opens.
1053 let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1054 multiline_open_end.is_none() && depth <= 0 && {
1055 let (line_no_nl, _) = strip_newline(first_inner);
1056 probe_same_line_lift(line_no_nl, name)
1057 }
1058 });
1059 // Strict-block lift gate: accept (a) a multi-line open tag spanning
1060 // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1061 // trailing single-line open (depth > 0, open `>` is present with
1062 // quote-aware matching), or (c) a safe same-line shape. For
1063 // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1064 // …) the lift additionally abandons when the body starts at a
1065 // fresh-block position with a void block tag — pandoc-native pins
1066 // per-tag emission rather than a matched-pair lift in that case.
1067 let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1068 let (line_no_nl, _) = strip_newline(first_inner);
1069 let shape_ok = if multiline_open_end.is_some() {
1070 // `find_multiline_open_end` already verified the open tag
1071 // closes with a quote-aware `>` somewhere in lines
1072 // `start_pos+1..=end`. No same-line trailing content to
1073 // probe; defer trailing-on-close-`>`-line handling to a
1074 // future session (rare in practice).
1075 true
1076 } else if depth > 0 {
1077 probe_open_tag_line_has_close_gt(line_no_nl, name)
1078 } else {
1079 same_line_strict_lift_safe
1080 };
1081 if !shape_ok {
1082 return false;
1083 }
1084 if !is_pandoc_inline_block_tag_name(name) {
1085 return true;
1086 }
1087 !inline_block_void_interior_abandons(
1088 first_inner,
1089 lines,
1090 start_pos,
1091 multiline_open_end,
1092 bq_depth,
1093 name,
1094 )
1095 });
1096
1097 // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1098 // are byte-equal to the non-bq same-line shape minus the leading
1099 // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1100 // The body has no inner newlines, so no bq prefix re-injection is
1101 // needed when grafting — `emit_html_block_body_lifted` (passing
1102 // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1103 // open-trailing) need per-line prefix injection and are gated
1104 // separately by `bq_messy_lift_tag` below.
1105 let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1106 && multiline_open_end.is_none()
1107 && depth_aware_tag.is_some()
1108 && depth <= 0
1109 {
1110 let (line_no_nl, _) = strip_newline(first_inner);
1111 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1112 if probe_same_line_lift(line_no_nl, "div") {
1113 Some("div")
1114 } else {
1115 None
1116 }
1117 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1118 match &block_type {
1119 HtmlBlockType::BlockTag {
1120 tag_name,
1121 is_verbatim: false,
1122 closed_by_blank_line: false,
1123 depth_aware: true,
1124 closes_at_open_tag: false,
1125 is_closing: false,
1126 } if is_pandoc_lift_eligible_block_tag(tag_name)
1127 && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1128 {
1129 // Inline-block tags (`<video>`, `<iframe>`, …) skip
1130 // the void-interior check at same-line — the shape
1131 // has no inner block content to interfere with.
1132 Some(tag_name.as_str())
1133 }
1134 _ => None,
1135 }
1136 } else {
1137 None
1138 }
1139 } else {
1140 None
1141 };
1142
1143 // Messy-shape lift inside a blockquote — covers open-trailing
1144 // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1145 // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1146 // including the multi-line-open variants (`> <div\n> id="x">foo\n>
1147 // body\n> </div>`) where the trailing is captured into `pre_content`
1148 // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1149 // The open line does NOT balance the block (depth > 0 after the
1150 // open line, distinguishing this from `same_line_bq_lift_tag` which
1151 // requires depth <= 0). The close line — possibly with leading body
1152 // text — closes the block when depth returns to 0. Body lines (incl.
1153 // open trailing and close leading) graft via prefix re-injection.
1154 let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1155 {
1156 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1157 Some("div")
1158 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1159 match &block_type {
1160 HtmlBlockType::BlockTag {
1161 tag_name,
1162 is_verbatim: false,
1163 closed_by_blank_line: false,
1164 depth_aware: true,
1165 closes_at_open_tag: false,
1166 is_closing: false,
1167 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1168 // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1169 // …) abandon the lift when the body starts at a
1170 // fresh-block position with a void block tag. Same gate
1171 // as the non-bq matched-pair lift (`strict_block_lift`).
1172 if is_pandoc_inline_block_tag_name(tag_name)
1173 && inline_block_void_interior_abandons(
1174 first_inner,
1175 lines,
1176 start_pos,
1177 multiline_open_end,
1178 bq_depth,
1179 tag_name,
1180 )
1181 {
1182 None
1183 } else {
1184 Some(tag_name.as_str())
1185 }
1186 }
1187 _ => None,
1188 }
1189 } else {
1190 None
1191 }
1192 } else {
1193 None
1194 };
1195
1196 // Multi-line open + matched close-on-the-open's-last-line shape inside
1197 // a blockquote (`> <div\n> id="x">foo</div>` and depth-aware variants:
1198 // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1199 // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1200 // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1201 // close is on the open's last line (`depth <= 0` after the open lines)
1202 // AND `multiline_open_end.is_some()`. The trailing bytes after the
1203 // last `>` get lifted into `pre_content` via
1204 // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1205 // new branch below splits `pre_content` at the matched close marker
1206 // and grafts body + close + any trailing siblings.
1207 let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1208 && multiline_open_end.is_some()
1209 && depth_aware_tag.is_some()
1210 && depth <= 0
1211 {
1212 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1213 Some("div")
1214 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1215 match &block_type {
1216 HtmlBlockType::BlockTag {
1217 tag_name,
1218 is_verbatim: false,
1219 closed_by_blank_line: false,
1220 depth_aware: true,
1221 closes_at_open_tag: false,
1222 is_closing: false,
1223 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1224 if is_pandoc_inline_block_tag_name(tag_name)
1225 && inline_block_void_interior_abandons(
1226 first_inner,
1227 lines,
1228 start_pos,
1229 multiline_open_end,
1230 bq_depth,
1231 tag_name,
1232 )
1233 {
1234 None
1235 } else {
1236 Some(tag_name.as_str())
1237 }
1238 }
1239 _ => None,
1240 }
1241 } else {
1242 None
1243 }
1244 } else {
1245 None
1246 };
1247
1248 // Whether this block participates in the Phase 6 structural lift
1249 // (recursively parse body as Pandoc markdown and graft children).
1250 // Covers `<div>` outside blockquote context. For same-line shapes
1251 // the lift is gated on `same_line_*_lift_safe` — when unsafe we
1252 // keep the legacy single-HTML_BLOCK_TAG shape and let the
1253 // byte-reparse path handle projection.
1254 let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1255 && bq_depth == 0
1256 && (!is_same_line_div || same_line_div_lift_safe))
1257 || strict_block_lift
1258 || same_line_bq_lift_tag.is_some()
1259 || bq_messy_lift_tag.is_some()
1260 || bq_multiline_close_lift_tag.is_some();
1261
1262 // Trailing content from the open tag (after `>`). When the lift is
1263 // active and the open line is `<div ATTRS>foo\n`, this captures
1264 // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1265 // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1266 // shapes (same-line / blockquote-wrapped).
1267 let mut pre_content = String::new();
1268
1269 // Emit opening line(s)
1270 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1271
1272 if let Some(end_line_idx) = multiline_open_end {
1273 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1274 emit_multiline_open_tag_with_attrs(
1275 builder,
1276 lines,
1277 start_pos,
1278 end_line_idx,
1279 "div",
1280 bq_depth,
1281 lift_mode,
1282 &mut pre_content,
1283 );
1284 } else if let Some(name) = strict_block_tag_name
1285 && strict_block_lift
1286 {
1287 emit_multiline_open_tag_with_attrs(
1288 builder,
1289 lines,
1290 start_pos,
1291 end_line_idx,
1292 name,
1293 bq_depth,
1294 lift_mode,
1295 &mut pre_content,
1296 );
1297 } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1298 {
1299 // Multi-line open of a lift-eligible strict-block tag inside a
1300 // blockquote (`> <section\n> id=...>`). The non-bq
1301 // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1302 // covers the bq side so the open tag emits HTML_ATTRS regions
1303 // for `AttributeNode::cast` and the projector's canonicalizer.
1304 //
1305 // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1306 // call below: only push trailing bytes into `pre_content` when
1307 // the structural lift will consume them (bq messy lift). The
1308 // bq clean-lift requires `pre_content.is_empty()`, so for clean
1309 // multi-line opens the trailing is empty anyway and this is
1310 // a no-op.
1311 let lift_trailing =
1312 bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1313 emit_multiline_open_tag_with_attrs(
1314 builder,
1315 lines,
1316 start_pos,
1317 end_line_idx,
1318 name,
1319 bq_depth,
1320 lift_trailing,
1321 &mut pre_content,
1322 );
1323 } else {
1324 emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1325 }
1326 } else {
1327 let (line_without_newline, newline_str) = strip_newline(first_inner);
1328 if !line_without_newline.is_empty() {
1329 // For HTML_BLOCK_DIV, expose the open tag's attributes
1330 // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1331 // via the same descendants walk that handles fenced-div /
1332 // heading attrs. CST bytes stay byte-equal to source — we only
1333 // tokenize at finer granularity for matched div opens.
1334 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1335 let trailing =
1336 emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1337 if !trailing.is_empty() {
1338 pre_content.push_str(trailing);
1339 pre_content.push_str(newline_str);
1340 }
1341 } else if let Some(name) = strict_block_tag_name
1342 && strict_block_lift
1343 {
1344 let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1345 if !trailing.is_empty() {
1346 pre_content.push_str(trailing);
1347 pre_content.push_str(newline_str);
1348 }
1349 } else if let Some(name) =
1350 bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1351 {
1352 // Inside a blockquote, lift trailing bytes into
1353 // `pre_content` when either the same-line bq gate fires
1354 // (`> <tag>body</tag>` — handled by `same_line_closed`)
1355 // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1356 // </tag>` and butted-close — handled at the close-marker
1357 // site below). For the clean-shape bq lift the open has
1358 // no trailing bytes regardless, so `lift_trailing=true`
1359 // is a no-op there.
1360 let lift_trailing =
1361 same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1362 let trailing =
1363 emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1364 if lift_trailing && !trailing.is_empty() {
1365 pre_content.push_str(trailing);
1366 pre_content.push_str(newline_str);
1367 }
1368 } else {
1369 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1370 }
1371 }
1372 // When the open tag has trailing content under lift mode, the
1373 // newline belongs to that trailing line (it terminates the
1374 // synthetic body line, not the open tag). Don't double-emit.
1375 if pre_content.is_empty() && !newline_str.is_empty() {
1376 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1377 }
1378 }
1379
1380 builder.finish_node(); // HtmlBlockTag
1381
1382 // Check if opening line also contains closing marker. Blank-line-terminated
1383 // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1384 // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1385 // (`closes_at_open_tag: true`) close immediately — the block always
1386 // ends on the open-tag line since there is no closing tag to find.
1387 let void_block = matches!(
1388 &block_type,
1389 HtmlBlockType::BlockTag {
1390 closes_at_open_tag: true,
1391 ..
1392 }
1393 );
1394 // Void tags with a multi-line open close immediately after the open
1395 // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1396 // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1397 // a single RawBlock for the whole multi-line tag, with no following
1398 // content.
1399 if void_block && let Some(end_line_idx) = multiline_open_end {
1400 log::trace!(
1401 "HTML void block at line {} closes after multi-line open ending at line {}",
1402 start_pos + 1,
1403 end_line_idx + 1
1404 );
1405 builder.finish_node(); // HtmlBlock
1406 return end_line_idx + 1;
1407 }
1408 // Multi-line open with all matched closes on the open's last line:
1409 // `pre_content` holds the bytes after the last open `>` (lifted there
1410 // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1411 // When `depth <= 0` after the multi-line open and the trailing bytes
1412 // contain the depth-zero matched close, do the same-line lift on
1413 // `pre_content` directly. Mirrors the single-line `same_line_closed`
1414 // lift below — same body / close-marker / trailing-graft shape, just
1415 // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1416 //
1417 // The body bytes of `pre_content` come from the open's last line,
1418 // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1419 // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1420 // tag thus inherit the bq context without per-line prefix injection,
1421 // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1422 // both the non-bq and bq variants of this shape.
1423 if let Some(end_line_idx) = multiline_open_end
1424 && !blank_terminated
1425 && depth_aware_tag.is_some()
1426 && depth <= 0
1427 && lift_mode
1428 && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1429 && !pre_content.is_empty()
1430 {
1431 let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1432 Some("div")
1433 } else if strict_block_lift {
1434 strict_block_tag_name
1435 } else if let Some(name) = bq_multiline_close_lift_tag {
1436 Some(name)
1437 } else {
1438 None
1439 };
1440 if let Some(tag_name) = tag_name_opt {
1441 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1442 if let Some((leading, close_part)) =
1443 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1444 {
1445 let close_marker_end =
1446 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1447 let close_marker = &close_part[..close_marker_end];
1448 let same_line_trailing = &close_part[close_marker_end..];
1449 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1450 LastParaDemote::SkipTrailingBlanks
1451 } else {
1452 LastParaDemote::OnlyIfLast
1453 };
1454 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1455 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1456 if same_line_trailing.is_empty() {
1457 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1458 close_line.push_str(close_marker);
1459 close_line.push_str(post_nl);
1460 emit_html_block_line(builder, &close_line, 0);
1461 builder.finish_node();
1462 builder.finish_node(); // HtmlBlock
1463 } else {
1464 builder.token(SyntaxKind::TEXT.into(), close_marker);
1465 builder.finish_node(); // HTML_BLOCK_TAG
1466 builder.finish_node(); // HtmlBlock
1467
1468 let mut trailing_text =
1469 String::with_capacity(same_line_trailing.len() + post_nl.len());
1470 trailing_text.push_str(same_line_trailing);
1471 trailing_text.push_str(post_nl);
1472 let mut inner_options = config.clone();
1473 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1474 inner_options.refdef_labels = Some(refdefs.clone());
1475 let inner_root = crate::parser::parse_with_refdefs(
1476 &trailing_text,
1477 Some(inner_options),
1478 refdefs,
1479 );
1480 let mut bq = None;
1481 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1482 }
1483 return end_line_idx + 1;
1484 }
1485 }
1486 }
1487
1488 let same_line_closed = !blank_terminated
1489 && multiline_open_end.is_none()
1490 && (void_block
1491 || match &depth_aware_tag {
1492 Some(_) => depth <= 0,
1493 None => is_closing_marker(first_inner, &block_type),
1494 });
1495 if same_line_closed {
1496 log::trace!(
1497 "HTML block at line {} opens and closes on same line",
1498 start_pos + 1
1499 );
1500 // Same-line structural lift (div or non-div strict-block):
1501 // pre_content holds the bytes after the open `>` (including
1502 // the close `</tag>` and the trailing newline). Split into
1503 // body + close tag, emit body via recursive parse, emit close
1504 // tag as a sibling `HTML_BLOCK_TAG`.
1505 let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1506 None
1507 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1508 Some("div")
1509 } else if same_line_strict_lift_safe {
1510 strict_block_tag_name
1511 } else if let Some(name) = same_line_bq_lift_tag {
1512 // Bq same-line: body has no inner newlines so the standard
1513 // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1514 // sufficient. The bq prefix `> ` lives on the outer
1515 // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1516 Some(name)
1517 } else {
1518 None
1519 };
1520 if let Some(tag_name) = same_line_lift_tag {
1521 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1522 // Depth-aware split: handles `<tag>foo</tag>bar` (single
1523 // close, trailing text), `<tag>foo</tag></tag>` (matched
1524 // close + unmatched trailing close → sibling RawBlock),
1525 // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1526 // recursive body parse).
1527 if let Some((leading, close_part)) =
1528 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1529 {
1530 // `close_part` starts with `</tag` and contains the close
1531 // marker followed by any same-line trailing text. Split
1532 // off the close marker bytes (`</tag>`) so the close
1533 // `HTML_BLOCK_TAG` carries only those bytes; trailing
1534 // text is parsed and grafted as a sibling block at the
1535 // parent level (matches pandoc-native shape:
1536 // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1537 let close_marker_end =
1538 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1539 let close_marker = &close_part[..close_marker_end];
1540 let same_line_trailing = &close_part[close_marker_end..];
1541
1542 // Same-line is always close-butted; div demotes the
1543 // trailing Para→Plain via `SkipTrailingBlanks`.
1544 // Non-div strict-block uses `OnlyIfLast` (consistent
1545 // with butted-close — no trailing BLANK_LINE before
1546 // the close means the trailing Para demotes).
1547 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1548 LastParaDemote::SkipTrailingBlanks
1549 } else {
1550 LastParaDemote::OnlyIfLast
1551 };
1552 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1553 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1554 if same_line_trailing.is_empty() {
1555 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1556 close_line.push_str(close_marker);
1557 close_line.push_str(post_nl);
1558 emit_html_block_line(builder, &close_line, 0);
1559 builder.finish_node();
1560 builder.finish_node(); // HtmlBlock
1561 } else {
1562 // Close tag holds only the close-marker bytes;
1563 // trailing + newline graft as siblings of the
1564 // wrapper (matches pandoc's per-tag block split).
1565 builder.token(SyntaxKind::TEXT.into(), close_marker);
1566 builder.finish_node(); // HTML_BLOCK_TAG
1567 builder.finish_node(); // HtmlBlock
1568
1569 let mut trailing_text =
1570 String::with_capacity(same_line_trailing.len() + post_nl.len());
1571 trailing_text.push_str(same_line_trailing);
1572 trailing_text.push_str(post_nl);
1573 let mut inner_options = config.clone();
1574 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1575 inner_options.refdef_labels = Some(refdefs.clone());
1576 let inner_root = crate::parser::parse_with_refdefs(
1577 &trailing_text,
1578 Some(inner_options),
1579 refdefs,
1580 );
1581 let mut bq = None;
1582 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1583 }
1584 return start_pos + 1;
1585 }
1586 }
1587 builder.finish_node(); // HtmlBlock
1588 return start_pos + 1;
1589 }
1590
1591 let mut current_pos = multiline_open_end
1592 .map(|end| end + 1)
1593 .unwrap_or(start_pos + 1);
1594 let mut content_lines: Vec<&str> = Vec::new();
1595 let mut found_closing = false;
1596
1597 // Parse content until we find the closing marker
1598 while current_pos < lines.len() {
1599 let line = lines[current_pos];
1600 let (line_bq_depth, inner) = count_blockquote_markers(line);
1601
1602 // Only process lines at the same or deeper blockquote depth
1603 if line_bq_depth < bq_depth {
1604 break;
1605 }
1606
1607 // Blank-line-terminated blocks (types 6/7) end before the blank line.
1608 // The blank line itself is not part of the block.
1609 if blank_terminated && inner.trim().is_empty() {
1610 break;
1611 }
1612
1613 // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1614 // count opens/closes of the same tag name and only close when depth
1615 // returns to 0; otherwise fall back to substring-match on the line.
1616 let line_closes = match &depth_aware_tag {
1617 Some(tag_name) => {
1618 let (opens, closes) = count_tag_balance(inner, tag_name);
1619 depth += opens as i64;
1620 depth -= closes as i64;
1621 depth <= 0
1622 }
1623 None => is_closing_marker(inner, &block_type),
1624 };
1625
1626 if line_closes {
1627 log::trace!("Found HTML block closing at line {}", current_pos + 1);
1628 found_closing = true;
1629
1630 // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1631 // the open and close tags stand alone on their source lines
1632 // (no trailing on open, no body content on close after
1633 // stripping bq markers), lift the body lines structurally
1634 // so the projector walks CST children instead of
1635 // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1636 //
1637 // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1638 // grafted, Para preserved), non-div strict-block tags
1639 // (`<form>`, `<section>`, …) and inline-block matched-pair
1640 // tags (`<video>`, `<iframe>`, …) — the latter two under
1641 // HTML_BLOCK with the structural lift hitting pandoc's
1642 // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1643 // demotion. Inline-block additionally bails if the body
1644 // starts at a fresh-block position with a void block tag
1645 // (mirrors the non-bq matched-pair gate).
1646 //
1647 // Other bq-wrapped shapes (butted-close / open-trailing / same-line)
1648 // are gated above (`bq_messy_lift_tag`, `same_line_bq_lift_tag`).
1649 // Multi-line opens are allowed here as of 2026-05-12: the
1650 // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1651 // per attr line and per-line bq prefix tokens) by the bq-aware
1652 // `emit_multiline_open_tag_with_attrs`. That emitter receives
1653 // `&mut pre_content` plus a lift flag, so trailing bytes on the
1654 // last open line may have been captured into `pre_content`; the
1655 // `pre_content.is_empty()` check below therefore keeps multi-line
1656 // + trailing out of this clean-shape gate (NOTE(review): confirm
1657 // which later branch picks that shape up).
1658 let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1659 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1660 Some("div")
1661 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1662 match &block_type {
1663 HtmlBlockType::BlockTag {
1664 tag_name,
1665 is_verbatim: false,
1666 closed_by_blank_line: false,
1667 depth_aware: true,
1668 closes_at_open_tag: false,
1669 is_closing: false,
1670 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1671 _ => None,
1672 }
1673 } else {
1674 None
1675 }
1676 } else {
1677 None
1678 };
1679
1680 let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1681 // Open-shape: last open line must end with `>` (clean
1682 // close-of-open). For single-line, that's `first_inner`
1683 // (already bq-stripped); for multi-line, strip bq markers
1684 // from `lines[end_line_idx]` and check the same.
1685 let last_open_line: &str = match multiline_open_end {
1686 None => first_inner,
1687 Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1688 prefix.strip(lines[end])
1689 }
1690 Some(end) => lines[end],
1691 };
1692 let (open_no_nl, _) = strip_newline(last_open_line);
1693 if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1694 return false;
1695 }
1696 let close_stripped = prefix.strip(line);
1697 let (close_no_nl, _) = strip_newline(close_stripped);
1698 if !close_no_nl
1699 .trim_start_matches([' ', '\t'])
1700 .starts_with("</")
1701 {
1702 return false;
1703 }
1704 if is_pandoc_inline_block_tag_name(tag_name)
1705 && inline_block_void_interior_abandons(
1706 first_inner,
1707 lines,
1708 start_pos,
1709 multiline_open_end,
1710 bq_depth,
1711 tag_name,
1712 )
1713 {
1714 return false;
1715 }
1716 true
1717 });
1718
1719 if bq_clean_lift {
1720 let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1721 LastParaDemote::Never
1722 } else {
1723 LastParaDemote::OnlyIfLast
1724 };
1725 emit_html_block_body_lifted_bq(
1726 builder,
1727 &content_lines,
1728 prefix,
1729 demote_policy,
1730 config,
1731 );
1732 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1733 emit_html_block_line(builder, line, bq_depth);
1734 builder.finish_node();
1735 current_pos += 1;
1736 break;
1737 }
1738
1739 // Bq messy-shape lift — single-line open with trailing or
1740 // butted-close (or both). `pre_content` already captures any
1741 // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1742 // strip the close line's bq markers before splitting so
1743 // `leading` and `close_part` are bq-prefix-free. Body parses
1744 // recursively from `pre_content + stripped(content_lines) +
1745 // leading`, with per-line bq prefixes re-injected so the CST
1746 // stays byte-equal to the source. Demote: div is keyed on
1747 // close-butted-ness (Plain when leading non-empty, Para
1748 // otherwise); non-div uses OnlyIfLast either way.
1749 if let Some(tag_name) = bq_messy_lift_tag {
1750 let close_stripped = prefix.strip(line);
1751 let close_prefix_len = line.len() - close_stripped.len();
1752 let close_prefix = &line[..close_prefix_len];
1753 if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1754 {
1755 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1756 if leading.is_empty() {
1757 LastParaDemote::Never
1758 } else {
1759 LastParaDemote::SkipTrailingBlanks
1760 }
1761 } else {
1762 LastParaDemote::OnlyIfLast
1763 };
1764 emit_html_block_body_lifted_bq_messy(
1765 builder,
1766 &pre_content,
1767 &content_lines,
1768 leading,
1769 close_prefix,
1770 prefix,
1771 policy,
1772 config,
1773 );
1774 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1775 // When `leading` is empty, no recursive-parse output carries
1776 // the close line's bq prefix, so emit it here before the
1777 // close tag. When `leading` is non-empty,
1778 // `emit_html_block_body_lifted_bq_messy` already injected
1779 // the prefix at the start of the leading bytes (via the
1780 // BqPrefixState entry); emitting again would double the
1781 // prefix bytes and break losslessness.
1782 if leading.is_empty() {
1783 emit_bq_prefix_tokens(builder, close_prefix);
1784 }
1785 emit_html_block_line(builder, close_part, 0);
1786 builder.finish_node();
1787 current_pos += 1;
1788 break;
1789 }
1790 }
1791
1792 // Under lift mode, try to split the close line into a
1793 // leading "body content" prefix and the close-marker
1794 // remainder using depth-aware matching. Walks at depth 1
1795 // (we're inside the open tag) so nested same-tag opens
1796 // (e.g. `<inner></inner></tag>` style with a nested div)
1797 // are absorbed into the body and parsed recursively, and
1798 // multi-close shapes (`foo</div></div>` on the close line)
1799 // peel off the matched-pair close — the unmatched
1800 // trailing close projects as a sibling `RawBlock` per
1801 // pandoc-native. For `<div>`, non-empty `leading`
1802 // propagates pandoc's `markdown_in_html_blocks` Plain
1803 // demotion rule. For non-div strict-block tags, demotion
1804 // follows pandoc's `OnlyIfLast` rule (demote the trailing
1805 // Para only when no blank line precedes the close).
1806 let close_split_tag = if lift_mode {
1807 if strict_block_lift {
1808 strict_block_tag_name
1809 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1810 Some("div")
1811 } else {
1812 None
1813 }
1814 } else {
1815 None
1816 };
1817 let (close_no_nl, close_post_nl) = strip_newline(line);
1818 let close_split = close_split_tag
1819 .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1820
1821 if let Some((leading, close_part)) = close_split {
1822 // Close-line leading that is whitespace-only is close-tag
1823 // indentation, not body content (pandoc-native strips it
1824 // from the close RawBlock and treats the close as butted —
1825 // see ` </tag>` shapes). Route those bytes into the
1826 // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1827 // projector strips them; keep the demote policy keyed on
1828 // the original leading so butted-close detection (Plain
1829 // demotion for div, OnlyIfLast for non-div) still fires.
1830 let leading_is_ws_only =
1831 !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1832 let body_leading = if leading_is_ws_only { "" } else { leading };
1833 let policy = if strict_block_lift {
1834 LastParaDemote::OnlyIfLast
1835 } else if !leading.is_empty() {
1836 LastParaDemote::SkipTrailingBlanks
1837 } else {
1838 LastParaDemote::Never
1839 };
1840 // Split close_part into close-marker bytes (`</tag>`)
1841 // and trailing bytes (e.g. an extra `</div>` for the
1842 // double-close case, or `bar` for trailing text after
1843 // a normal close). Trailing bytes are recursively
1844 // parsed and grafted as siblings of the HTML_BLOCK_DIV
1845 // wrapper.
1846 let close_tag_name = close_split_tag.expect("close_split_tag present");
1847 let close_marker_end =
1848 split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1849 let close_marker = &close_part[..close_marker_end];
1850 let close_trailing = &close_part[close_marker_end..];
1851
1852 emit_html_block_body_lifted(
1853 builder,
1854 &pre_content,
1855 &content_lines,
1856 body_leading,
1857 policy,
1858 config,
1859 );
1860 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1861 if leading_is_ws_only {
1862 builder.token(SyntaxKind::WHITESPACE.into(), leading);
1863 }
1864 if close_trailing.is_empty() {
1865 let mut close_line =
1866 String::with_capacity(close_marker.len() + close_post_nl.len());
1867 close_line.push_str(close_marker);
1868 close_line.push_str(close_post_nl);
1869 emit_html_block_line(builder, &close_line, 0);
1870 builder.finish_node();
1871 } else {
1872 // Close tag holds only the close-marker bytes;
1873 // trailing + newline graft as siblings.
1874 builder.token(SyntaxKind::TEXT.into(), close_marker);
1875 builder.finish_node(); // HTML_BLOCK_TAG
1876 builder.finish_node(); // HtmlBlock
1877
1878 let mut trailing_text =
1879 String::with_capacity(close_trailing.len() + close_post_nl.len());
1880 trailing_text.push_str(close_trailing);
1881 trailing_text.push_str(close_post_nl);
1882 let mut inner_options = config.clone();
1883 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1884 inner_options.refdef_labels = Some(refdefs.clone());
1885 let inner_root = crate::parser::parse_with_refdefs(
1886 &trailing_text,
1887 Some(inner_options),
1888 refdefs,
1889 );
1890 let mut bq = None;
1891 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1892 current_pos += 1;
1893 return current_pos;
1894 }
1895 } else {
1896 emit_html_block_body(
1897 builder,
1898 &pre_content,
1899 &content_lines,
1900 bq_depth,
1901 wrapper_kind,
1902 lift_mode,
1903 config,
1904 );
1905 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1906 emit_html_block_line(builder, line, bq_depth);
1907 builder.finish_node();
1908 }
1909
1910 current_pos += 1;
1911 break;
1912 }
1913
1914 // Regular content line
1915 content_lines.push(line);
1916 current_pos += 1;
1917 }
1918
1919 // If we didn't find a closing marker, emit what we collected
1920 if !found_closing {
1921 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1922 emit_html_block_body(
1923 builder,
1924 &pre_content,
1925 &content_lines,
1926 bq_depth,
1927 wrapper_kind,
1928 lift_mode,
1929 config,
1930 );
1931 }
1932
1933 builder.finish_node(); // HtmlBlock
1934 current_pos
1935}
1936
1937/// Emit the collected inner content lines for an HTML block.
1938///
1939/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1940/// line `<div>` open outside blockquote), recursively parse the inner
1941/// content (including any open-tag trailing) as Pandoc-flavored
1942/// markdown and graft the resulting top-level blocks as direct children
1943/// of the wrapper. This is the Phase 6 structural lift — the projector
1944/// and downstream consumers (linter, salsa, LSP) can walk the
1945/// structural children instead of re-tokenizing the body bytes.
1946///
1947/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1948/// blockquote, multi-line open, or no content at all — fall through to
1949/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1950///
1951/// CST bytes remain byte-identical to source: the recursive parser is
1952/// lossless on the same byte slice the legacy path would have captured
1953/// as TEXT.
1954fn emit_html_block_body(
1955 builder: &mut GreenNodeBuilder<'static>,
1956 pre_content: &str,
1957 content_lines: &[&str],
1958 bq_depth: usize,
1959 wrapper_kind: SyntaxKind,
1960 lift_mode: bool,
1961 config: &ParserOptions,
1962) {
1963 if pre_content.is_empty() && content_lines.is_empty() {
1964 return;
1965 }
1966 if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1967 // Reached when the parser walked to end-of-input without finding
1968 // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1969 emit_html_block_body_lifted(
1970 builder,
1971 pre_content,
1972 content_lines,
1973 "",
1974 LastParaDemote::Never,
1975 config,
1976 );
1977 return;
1978 }
1979 // Legacy path: opaque TEXT capture. `pre_content` is always empty
1980 // here (lift_mode is the only path that populates it), but be
1981 // defensive — if a trailing prefix snuck in, emit it as TEXT so
1982 // bytes are preserved.
1983 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1984 if !pre_content.is_empty() {
1985 builder.token(SyntaxKind::TEXT.into(), pre_content);
1986 }
1987 for content_line in content_lines {
1988 emit_html_block_line(builder, content_line, bq_depth);
1989 }
1990 builder.finish_node();
1991}
1992
/// Policy for demoting the trailing `PARAGRAPH` of an HTML-block body
/// to `PLAIN` while its children are grafted into the structural CST.
#[derive(Clone, Copy, Debug)]
enum LastParaDemote {
    /// Leave every paragraph alone — the trailing `Para` is preserved.
    Never,
    /// Look past any trailing `BLANK_LINE` children and demote the
    /// last remaining child if it is a `PARAGRAPH`. Used for `<div>`
    /// shapes whose close tag is butted against the paragraph text on
    /// the same source line (pandoc's `markdown_in_html_blocks` Plain
    /// demotion).
    SkipTrailingBlanks,
    /// Demote only when the very last top-level child is itself a
    /// `PARAGRAPH`; a trailing `BLANK_LINE` before the close tag
    /// suppresses the demotion. Used for non-div strict-block tags,
    /// whose body sits at top level next to the close-tag `RawBlock`:
    /// pandoc turns that trailing `Para` into `Plain` unless a blank
    /// line separates the two.
    OnlyIfLast,
}
2012
2013/// Lift the HTML-block body into structural CST children: build the
2014/// inner text from `pre_content` + `content_lines` + `post_content`
2015/// (in order), recursively parse it as Pandoc-flavored markdown, and
2016/// graft the resulting top-level blocks into `builder`. `demote_policy`
2017/// controls whether the trailing paragraph is retagged as `PLAIN` to
2018/// encode pandoc's Plain/Para adjacency rules structurally.
2019fn emit_html_block_body_lifted(
2020 builder: &mut GreenNodeBuilder<'static>,
2021 pre_content: &str,
2022 content_lines: &[&str],
2023 post_content: &str,
2024 demote_policy: LastParaDemote,
2025 config: &ParserOptions,
2026) {
2027 emit_html_block_body_lifted_inner(
2028 builder,
2029 pre_content,
2030 content_lines,
2031 post_content,
2032 demote_policy,
2033 config,
2034 &mut None,
2035 )
2036}
2037
2038/// Body-lift variant for `<div>` inside a blockquote. Strips
2039/// `bq_depth` levels of blockquote markers from each `content_line`,
2040/// captures the per-line prefix bytes, and grafts the recursive parse
2041/// with prefix injection so the output CST stays byte-equal to the
2042/// source. `pre_content` and `post_content` must be empty (the bq
2043/// clean lift only handles the shape where the open and close tags
2044/// stand alone on their source lines).
2045fn emit_html_block_body_lifted_bq(
2046 builder: &mut GreenNodeBuilder<'static>,
2047 content_lines: &[&str],
2048 prefix: &ContainerPrefix,
2049 demote_policy: LastParaDemote,
2050 config: &ParserOptions,
2051) {
2052 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2053 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2054 for cl in content_lines {
2055 let (li, bq, inner) = prefix.split(cl);
2056 prefix_lines.push(ContainerPrefixLine {
2057 list_indent: li.to_string(),
2058 bq_prefix: bq.to_string(),
2059 });
2060 stripped_lines.push(inner);
2061 }
2062 let mut state = ContainerPrefixState::new(prefix_lines);
2063 emit_html_block_body_lifted_inner(
2064 builder,
2065 "",
2066 &stripped_lines,
2067 "",
2068 demote_policy,
2069 config,
2070 &mut state,
2071 )
2072}
2073
2074/// Body-lift variant for the bq messy-shape lift — open-trailing,
2075/// butted-close, or both. The open-trailing bytes (if any) sit in
2076/// `pre_content` (line 0 of the body — no bq prefix in source because
2077/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2078/// each carry their own bq prefix. The close line's `leading` (body
2079/// bytes before `</tag>`) sits on the close line, prefixed in source
2080/// by `close_line_prefix` (the bq prefix captured from `line`).
2081///
2082/// Builds `prefixes` so each emitted line in the recursive parse
2083/// output gets the right per-line bq prefix re-injected at line start:
2084/// `pre_content` → empty prefix (no source `> ` precedes it); each
2085/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2086/// Result CST stays byte-equal to source.
2087#[allow(clippy::too_many_arguments)]
2088fn emit_html_block_body_lifted_bq_messy(
2089 builder: &mut GreenNodeBuilder<'static>,
2090 pre_content: &str,
2091 content_lines: &[&str],
2092 leading: &str,
2093 close_line_prefix: &str,
2094 prefix: &ContainerPrefix,
2095 demote_policy: LastParaDemote,
2096 config: &ParserOptions,
2097) {
2098 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2099 if !pre_content.is_empty() {
2100 prefix_lines.push(ContainerPrefixLine::default());
2101 }
2102 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2103 for cl in content_lines {
2104 let (li, bq, inner) = prefix.split(cl);
2105 prefix_lines.push(ContainerPrefixLine {
2106 list_indent: li.to_string(),
2107 bq_prefix: bq.to_string(),
2108 });
2109 stripped_lines.push(inner);
2110 }
2111 if !leading.is_empty() {
2112 // The close line carries its own captured prefix bytes; treat
2113 // them as bq-prefix only (no list-indent split applied) to keep
2114 // the legacy bq-only re-injection behavior for messy-shape
2115 // close-line lifts.
2116 prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2117 }
2118 let mut state = ContainerPrefixState::new(prefix_lines);
2119 emit_html_block_body_lifted_inner(
2120 builder,
2121 pre_content,
2122 &stripped_lines,
2123 leading,
2124 demote_policy,
2125 config,
2126 &mut state,
2127 )
2128}
2129
2130fn emit_html_block_body_lifted_inner(
2131 builder: &mut GreenNodeBuilder<'static>,
2132 pre_content: &str,
2133 content_lines: &[&str],
2134 post_content: &str,
2135 demote_policy: LastParaDemote,
2136 config: &ParserOptions,
2137 bq: &mut Option<ContainerPrefixState>,
2138) {
2139 if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2140 return;
2141 }
2142 let mut inner_text = String::with_capacity(
2143 pre_content.len()
2144 + content_lines.iter().map(|s| s.len()).sum::<usize>()
2145 + post_content.len(),
2146 );
2147 inner_text.push_str(pre_content);
2148 for line in content_lines {
2149 inner_text.push_str(line);
2150 }
2151 inner_text.push_str(post_content);
2152
2153 let mut inner_options = config.clone();
2154 let refdefs = config.refdef_labels.clone().unwrap_or_default();
2155 inner_options.refdef_labels = Some(refdefs.clone());
2156 let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2157 graft_document_children(builder, &inner_root, demote_policy, bq);
2158}
2159
2160/// Walk a parsed inner document's top-level children and re-emit them
2161/// into `builder`. The document's wrapper node is skipped — only its
2162/// children are grafted.
2163///
2164/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2165/// as `PLAIN` — see [`LastParaDemote`].
2166///
2167/// `bq` is `Some` when grafting a body that lived inside an outer
2168/// container (blockquote, list-item, or both) — token emission then
2169/// injects the captured per-line prefix tokens at line starts so the
2170/// CST stays byte-equal to source. See
2171/// [`super::container_prefix::ContainerPrefixState`].
2172fn graft_document_children(
2173 builder: &mut GreenNodeBuilder<'static>,
2174 doc: &SyntaxNode,
2175 demote_policy: LastParaDemote,
2176 bq: &mut Option<ContainerPrefixState>,
2177) {
2178 let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2179
2180 let mut demote_idx: Option<usize> = None;
2181 match demote_policy {
2182 LastParaDemote::Never => {}
2183 LastParaDemote::SkipTrailingBlanks => {
2184 for (i, c) in children.iter().enumerate().rev() {
2185 if let rowan::NodeOrToken::Node(n) = c {
2186 if n.kind() == SyntaxKind::BLANK_LINE {
2187 continue;
2188 }
2189 if n.kind() == SyntaxKind::PARAGRAPH {
2190 demote_idx = Some(i);
2191 }
2192 break;
2193 }
2194 }
2195 }
2196 LastParaDemote::OnlyIfLast => {
2197 for (i, c) in children.iter().enumerate().rev() {
2198 if let rowan::NodeOrToken::Node(n) = c {
2199 if n.kind() == SyntaxKind::PARAGRAPH {
2200 demote_idx = Some(i);
2201 }
2202 break;
2203 }
2204 }
2205 }
2206 }
2207
2208 for (i, child) in children.into_iter().enumerate() {
2209 match child {
2210 rowan::NodeOrToken::Node(n) => {
2211 if Some(i) == demote_idx {
2212 graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2213 } else {
2214 graft_subtree(builder, &n, bq);
2215 }
2216 }
2217 rowan::NodeOrToken::Token(t) => {
2218 emit_grafted_token(builder, t.kind(), t.text(), bq);
2219 }
2220 }
2221 }
2222}
2223
2224/// Recursively re-emit `node` and its descendants into `builder`.
2225/// Token text is copied verbatim so the result is byte-identical to
2226/// the input span (modulo bq prefix tokens injected at line starts
2227/// when `bq` is `Some`).
2228fn graft_subtree(
2229 builder: &mut GreenNodeBuilder<'static>,
2230 node: &SyntaxNode,
2231 bq: &mut Option<ContainerPrefixState>,
2232) {
2233 graft_subtree_as(builder, node, node.kind(), bq);
2234}
2235
2236/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2237/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2238/// the close-butted demotion rule.
2239fn graft_subtree_as(
2240 builder: &mut GreenNodeBuilder<'static>,
2241 node: &SyntaxNode,
2242 kind: SyntaxKind,
2243 bq: &mut Option<ContainerPrefixState>,
2244) {
2245 builder.start_node(kind.into());
2246 for child in node.children_with_tokens() {
2247 match child {
2248 rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2249 rowan::NodeOrToken::Token(t) => {
2250 emit_grafted_token(builder, t.kind(), t.text(), bq);
2251 }
2252 }
2253 }
2254 builder.finish_node();
2255}
2256
2257/// Emit a single token while optionally injecting blockquote prefix
2258/// tokens at line starts. When `bq` is `None`, this is a plain
2259/// `builder.token()` passthrough.
2260fn emit_grafted_token(
2261 builder: &mut GreenNodeBuilder<'static>,
2262 kind: SyntaxKind,
2263 text: &str,
2264 bq: &mut Option<ContainerPrefixState>,
2265) {
2266 if let Some(state) = bq.as_mut() {
2267 if state.at_line_start {
2268 if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2269 emit_container_prefix_tokens(builder, line_prefix);
2270 }
2271 state.at_line_start = false;
2272 }
2273 builder.token(kind.into(), text);
2274 // `BLANK_LINE` token represents an entirely blank source line —
2275 // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2276 // token as line-ending so the per-line prefix index advances
2277 // correctly.
2278 if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2279 state.line_idx += 1;
2280 state.at_line_start = true;
2281 }
2282 } else {
2283 builder.token(kind.into(), text);
2284 }
2285}
2286
2287/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2288/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2289fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2290 for ch in prefix.chars() {
2291 if ch == '>' {
2292 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2293 } else {
2294 let mut buf = [0u8; 4];
2295 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2296 }
2297 }
2298}
2299
/// Find the byte index, within `line`, of the `>` that ends the open
/// tag `<tag_name ATTRS>`.
///
/// Leading spaces and tabs are skipped, then the line must continue
/// with `<` followed by `tag_name` (ASCII case-insensitive) and at
/// least one more byte. No check is made that the name ends at a word
/// boundary. The remainder is scanned quote-aware: a `>` inside a
/// `"..."` or `'...'` attribute value does not count.
///
/// Returns `None` when the line does not start with the tag or no
/// unquoted `>` follows. This performs the same scan as
/// `probe_open_tag_line_has_close_gt` but reports the position, so the
/// caller can slice off whatever trails the open tag.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let candidate = &line.as_bytes()[indent_len..];

    // `<` plus the tag name.
    let name_end = 1 + tag_name.len();
    if candidate.len() <= name_end {
        return None;
    }
    let opens_with_tag =
        candidate[0] == b'<' && candidate[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !opens_with_tag {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in candidate[name_end..].iter().enumerate() {
        match open_quote {
            Some(quote_byte) => {
                if byte == quote_byte {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
2335
2336/// Whether `slice` begins (after leading ASCII whitespace) with an
2337/// open tag whose name is a Pandoc void block tag (`<source>`,
2338/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2339/// open tags return false.
2340///
2341/// Used by the inline-block matched-pair lift gate: pandoc-native
2342/// abandons the lift when the body's first non-blank content is a
2343/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2344/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2345/// matched-pair lift).
2346fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2347 let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2348 if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2349 return false;
2350 }
2351 let Some(tag_end) = parse_open_tag(trimmed) else {
2352 return false;
2353 };
2354 let bytes = trimmed.as_bytes();
2355 let mut name_end = 1usize;
2356 while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2357 {
2358 name_end += 1;
2359 }
2360 if name_end == 1 {
2361 return false;
2362 }
2363 is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2364}
2365
2366/// Whether the body of an inline-block matched-pair (`<video>...`,
2367/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2368/// a void block tag — the condition under which pandoc-native abandons
2369/// the matched-pair lift. Probes three shapes:
2370///
2371/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2372/// after the open `>` on `first_inner` start with `<source`.
2373/// - **Single-line open + multi-line body**: open-trailing on the open
2374/// line is empty/whitespace AND the first non-blank body line
2375/// (`lines[start_pos+1..]`) starts with a void tag.
2376/// - **Multi-line open**: same body-line scan starting at
2377/// `lines[multiline_open_end+1..]`.
2378///
2379/// Returns `false` when the body begins with text, with a close tag,
2380/// or with a non-void block tag — those cases all proceed with the
2381/// matched-pair lift.
2382fn inline_block_void_interior_abandons(
2383 first_inner: &str,
2384 lines: &[&str],
2385 start_pos: usize,
2386 multiline_open_end: Option<usize>,
2387 bq_depth: usize,
2388 tag_name: &str,
2389) -> bool {
2390 let (line_no_nl, _) = strip_newline(first_inner);
2391 let (body_start_line_idx, open_trailing) = match multiline_open_end {
2392 Some(end) => (end + 1, ""),
2393 None => {
2394 let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2395 let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2396 (start_pos + 1, trailing)
2397 }
2398 };
2399 let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2400 if !trimmed.is_empty() {
2401 return slice_starts_with_void_block_tag(trimmed);
2402 }
2403 for line in &lines[body_start_line_idx..] {
2404 let inner = if bq_depth > 0 {
2405 strip_n_blockquote_markers(line, bq_depth)
2406 } else {
2407 line
2408 };
2409 let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2410 if trimmed.is_empty() {
2411 continue;
2412 }
2413 return slice_starts_with_void_block_tag(trimmed);
2414 }
2415 false
2416}
2417
/// Check whether `line` holds a complete open tag for `tag_name`: after
/// optional leading spaces/tabs it must read `<tag_name` (ASCII
/// case-insensitive) and then contain a `>` that is not inside a
/// `"..."` or `'...'` attribute value.
///
/// Content after that `>` is allowed (the open-trailing shape
/// `<form>foo`); the caller is expected to capture it into the
/// structural lift's `pre_content`. No check is made that the tag name
/// ends at a word boundary.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let after_indent = line.trim_start_matches([' ', '\t']).as_bytes();

    // `<` plus the tag name; at least one further byte must follow.
    let name_end = tag_name.len() + 1;
    let opens_with_tag = after_indent.len() > name_end
        && after_indent[0] == b'<'
        && after_indent[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !opens_with_tag {
        return false;
    }

    let mut open_quote: Option<u8> = None;
    after_indent[name_end..].iter().any(|&byte| match open_quote {
        Some(quote_byte) if byte == quote_byte => {
            open_quote = None;
            false
        }
        Some(_) => false,
        None if byte == b'"' || byte == b'\'' => {
            open_quote = Some(byte);
            false
        }
        None => byte == b'>',
    })
}
2452
2453/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2454/// be lifted structurally. Returns `true` only when:
2455/// - The line starts with `<tag_name` (modulo leading whitespace).
2456/// - The open tag's `>` exists with proper quote handling.
2457/// - The bytes after the open `>` contain a depth-zero matched
2458/// `</tag_name>` close (depth-aware: nested `<tag>` opens
2459/// increment depth; matching is case-insensitive, quote-aware).
2460///
2461/// Trailing bytes after the matched close are accepted and grafted
2462/// as a sibling block by the caller. Examples:
2463/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2464/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2465/// recursively parses to a `RawBlock`).
2466/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2467/// parsed recursively), trailing=`bar`.
2468fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2469 let bytes = line.as_bytes();
2470 let indent_end = bytes
2471 .iter()
2472 .position(|&b| b != b' ' && b != b'\t')
2473 .unwrap_or(bytes.len());
2474 let rest = &line[indent_end..];
2475 let rest_bytes = rest.as_bytes();
2476 let prefix_len = 1 + tag_name.len();
2477 if rest_bytes.len() < prefix_len
2478 || rest_bytes[0] != b'<'
2479 || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2480 {
2481 return false;
2482 }
2483 let after_name = &rest[prefix_len..];
2484 let after_name_bytes = after_name.as_bytes();
2485 let mut i = 0usize;
2486 let mut quote: Option<u8> = None;
2487 let mut gt_idx: Option<usize> = None;
2488 while i < after_name_bytes.len() {
2489 match (quote, after_name_bytes[i]) {
2490 (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2491 (Some(q), b2) if b2 == q => quote = None,
2492 (None, b'>') => {
2493 gt_idx = Some(i);
2494 break;
2495 }
2496 _ => {}
2497 }
2498 i += 1;
2499 }
2500 let Some(gt_idx) = gt_idx else {
2501 return false;
2502 };
2503 let trailing = &after_name[gt_idx + 1..];
2504 // Depth-aware: walk `trailing` (we begin inside the open tag at
2505 // depth 1). Return true iff a matched `</tag>` exists where depth
2506 // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2507 matched_close_offset(trailing, tag_name).is_some()
2508}
2509
/// Walk `trailing` (the bytes after an open `<tag ...>`'s closing `>`)
/// looking for the `</tag>` that closes that open tag.
///
/// Depth starts at 1 because the scan begins inside the open tag.
/// Every `<tag_name ...>` open raises the depth and every `</tag_name>`
/// lowers it; self-closing opens (`<tag/>`) leave it unchanged. A name
/// only counts when it is followed by whitespace, `>`, `/` or the end
/// of input, so `<divider>` does not match `div`. Names are compared
/// ASCII case-insensitively, and `>` inside a quoted attribute value
/// does not terminate a tag.
///
/// Returns `Some((close_start, close_end))` where:
/// - `close_start` is the byte offset of `<` in the matched `</tag>`.
/// - `close_end` is one past the matched `>`.
///
/// Returns `None` when no matched close is present (the depth never
/// returns to 0, or a `<...` bracket is left unterminated).
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let bytes = trailing.as_bytes();
    // Compared with `eq_ignore_ascii_case` below, so no lowercased
    // copies of the input or the tag name are needed.
    let tag_bytes = tag_name.as_bytes();

    let mut depth: i32 = 1;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;
        let is_close = bytes.get(after) == Some(&b'/');
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|name| name.eq_ignore_ascii_case(tag_bytes));
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Scan forward to this tag bracket's `>`, respecting quoted
        // attribute values; track self-closing form (`/>`).
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if matched && is_boundary {
            if is_close {
                depth -= 1;
                if depth == 0 && found_gt {
                    return Some((i, j + 1));
                }
            } else if !self_close {
                depth += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — give up.
            break;
        }
    }
    None
}
2592
/// Find where the close marker at the start of `close_part` ends.
///
/// `close_part` is expected to begin with `</tag` (tag name compared
/// ASCII case-insensitively). On success the result is the byte offset
/// one past the first unquoted `>` that follows, so the caller can
/// split `close_part` into the close-marker bytes and any same-line
/// trailing text.
///
/// Returns `None` when `close_part` does not start with `</tag` or no
/// unquoted `>` follows; the caller then treats the whole slice as the
/// close marker (no trailing text).
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let raw = close_part.as_bytes();
    let name = tag_name.as_bytes();
    let scan_start = 2 + name.len();

    if !raw.starts_with(b"</") {
        return None;
    }
    let candidate = raw.get(2..scan_start)?;
    if !candidate.eq_ignore_ascii_case(name) {
        return None;
    }

    // Walk the bytes after `</tag`, ignoring any `>` inside quotes.
    let mut open_quote: Option<u8> = None;
    for (pos, &byte) in raw.iter().enumerate().skip(scan_start) {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(pos + 1),
                _ => {}
            },
        }
    }
    None
}
2623
2624/// Try to split the close line of an HTML_BLOCK_DIV body into a
2625/// leading content prefix and a clean `</tag>...` remainder. Returns
2626/// `Some((leading, close_part))` only when the line contains exactly
2627/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2628/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2629/// for missing close tags, or for compound shapes the parser
2630/// shouldn't attempt to lift in this pass.
2631///
2632/// `leading` may be empty (close starts at column 0) or pure
2633/// whitespace (close on an indented line). Both count as "butted" per
2634/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2635/// the trailing paragraph inside the div demotes Para→Plain.
2636fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2637 let (opens, closes) = count_tag_balance(line, tag_name);
2638 if opens != 0 || closes != 1 {
2639 return None;
2640 }
2641 // Locate the close tag's opening `<` by lowercased substring search.
2642 // Safe because we've already established (above) that the line has
2643 // exactly one `</tag>` and no `<tag>` opens, so the first match is
2644 // THE close.
2645 let needle = format!("</{}", tag_name);
2646 let lower = line.to_ascii_lowercase();
2647 let close_lt = lower.find(&needle)?;
2648 Some((&line[..close_lt], &line[close_lt..]))
2649}
2650
2651/// Depth-aware variant of `try_split_close_line` used by the same-line
2652/// lift path. Walks `line` starting at depth 1 (we begin inside the
2653/// open `<tag>`) and splits at the byte position where the matched
2654/// `</tag>` close brings depth to 0. Returns `Some((body,
2655/// close_part))` where `body` is the bytes before the matched-close
2656/// start and `close_part` is the bytes from the matched close onward.
2657///
2658/// Unlike `try_split_close_line` this accepts nested same-tag opens
2659/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2660/// body=`<div>x</div>` (a nested div the body lift parses
2661/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2662/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2663/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2664fn try_split_close_line_depth_aware<'a>(
2665 line: &'a str,
2666 tag_name: &str,
2667) -> Option<(&'a str, &'a str)> {
2668 let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2669 Some((&line[..close_start], &line[close_start..]))
2670}
2671
2672/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2673/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2674/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2675/// + TEXT(">") + TEXT(trailing)?`.
2676///
2677/// Bytes are byte-identical to the source — this only tokenizes at finer
2678/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2679/// region structurally. Falls back to a single TEXT token if the line
2680/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2681/// only retags as the lift kind when this shape was matched).
2682///
2683/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2684/// returned as `&str` instead so the caller can splice them into the
2685/// recursive-parse input for the structural body lift. When false
2686/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2687/// empty slice is returned.
2688fn emit_open_tag_tokens<'a>(
2689 builder: &mut GreenNodeBuilder<'static>,
2690 line: &'a str,
2691 tag_name: &str,
2692 lift_trailing: bool,
2693) -> &'a str {
2694 let bytes = line.as_bytes();
2695 // Leading indent (CommonMark allows up to 3 spaces).
2696 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2697 if indent_end > 0 {
2698 builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2699 }
2700 let rest = &line[indent_end..];
2701 // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2702 let prefix_len = 1 + tag_name.len();
2703 if !rest.starts_with('<')
2704 || rest.len() < prefix_len
2705 || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2706 {
2707 builder.token(SyntaxKind::TEXT.into(), rest);
2708 return "";
2709 }
2710 let after_name = &rest[prefix_len..];
2711 let after_name_bytes = after_name.as_bytes();
2712 // Find the closing `>` of the open tag, respecting quoted attribute values.
2713 let mut i = 0usize;
2714 let mut quote: Option<u8> = None;
2715 let mut tag_close: Option<usize> = None;
2716 while i < after_name_bytes.len() {
2717 let b = after_name_bytes[i];
2718 match (quote, b) {
2719 (None, b'"') | (None, b'\'') => quote = Some(b),
2720 (Some(q), b2) if b2 == q => quote = None,
2721 (None, b'>') => {
2722 tag_close = Some(i);
2723 break;
2724 }
2725 _ => {}
2726 }
2727 i += 1;
2728 }
2729 let Some(tag_close) = tag_close else {
2730 // Open tag has no closing `>` on this line — defensive fallback.
2731 builder.token(SyntaxKind::TEXT.into(), rest);
2732 return "";
2733 };
2734 // Whitespace between the tag name and the attribute region.
2735 let attrs_inner = &after_name[..tag_close];
2736 let ws_end = attrs_inner
2737 .as_bytes()
2738 .iter()
2739 .position(|&b| !matches!(b, b' ' | b'\t'))
2740 .unwrap_or(attrs_inner.len());
2741 let leading_ws = &attrs_inner[..ws_end];
2742 // Strip a trailing self-closing slash and the whitespace before it
2743 // from the attribute region; emit them as TEXT outside the
2744 // HTML_ATTRS node so the structural region only holds attribute
2745 // bytes (not formatting punctuation).
2746 let attrs_after_ws = &attrs_inner[ws_end..];
2747 let mut attr_end = attrs_after_ws.len();
2748 let attr_bytes = attrs_after_ws.as_bytes();
2749 let mut self_close_start = attr_end;
2750 if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2751 self_close_start = attr_end - 1;
2752 attr_end = self_close_start;
2753 while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2754 attr_end -= 1;
2755 }
2756 }
2757 let attrs_text = &attrs_after_ws[..attr_end];
2758 let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2759 let after_self_close = &attrs_after_ws[self_close_start..];
2760
2761 // Use the original source bytes for the `<tag` prefix (preserves
2762 // source casing — losslessness).
2763 builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2764 if !leading_ws.is_empty() {
2765 builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2766 }
2767 if !attrs_text.is_empty() {
2768 builder.start_node(SyntaxKind::HTML_ATTRS.into());
2769 builder.token(SyntaxKind::TEXT.into(), attrs_text);
2770 builder.finish_node();
2771 }
2772 if !trailing_text.is_empty() {
2773 builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2774 }
2775 if !after_self_close.is_empty() {
2776 builder.token(SyntaxKind::TEXT.into(), after_self_close);
2777 }
2778 builder.token(SyntaxKind::TEXT.into(), ">");
2779 let after_gt = &after_name[tag_close + 1..];
2780 if lift_trailing {
2781 // Return trailing bytes to the caller (will be spliced into the
2782 // recursive-parse input for the body lift).
2783 return after_gt;
2784 }
2785 if !after_gt.is_empty() {
2786 builder.token(SyntaxKind::TEXT.into(), after_gt);
2787 }
2788 ""
2789}
2790
2791/// Detect a multi-line HTML open tag for `tag_name`. Returns
2792/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2793/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2794/// the existing path) or when the `>` is missing entirely.
2795///
2796/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2797/// attribute value doesn't terminate the open tag. Quote state carries
2798/// across line boundaries.
2799fn find_multiline_open_end(
2800 lines: &[&str],
2801 start_pos: usize,
2802 first_inner: &str,
2803 tag_name: &str,
2804 prefix: &ContainerPrefix,
2805) -> Option<usize> {
2806 // Locate the `<tag_name` literal in `first_inner` to start scanning past
2807 // it. Match is ASCII case-insensitive; the parser preserves source casing.
2808 // `first_inner` is already bq-stripped by the caller; subsequent lines are
2809 // stripped inline below via `strip_n_blockquote_markers`.
2810 let trimmed = strip_leading_spaces(first_inner);
2811 let prefix_len = 1 + tag_name.len();
2812 if !trimmed.starts_with('<')
2813 || trimmed.len() < prefix_len
2814 || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2815 {
2816 return None;
2817 }
2818 let leading_indent = first_inner.len() - trimmed.len();
2819 let mut i = leading_indent + prefix_len; // past `<tag_name`
2820 let mut quote: Option<u8> = None;
2821
2822 // Scan first line for an unquoted `>`.
2823 let line0_bytes = first_inner.as_bytes();
2824 while i < line0_bytes.len() {
2825 match (quote, line0_bytes[i]) {
2826 (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2827 (Some(q), x) if x == q => quote = None,
2828 (None, b'>') => return None, // single-line case
2829 _ => {}
2830 }
2831 i += 1;
2832 }
2833
2834 // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2835 // blockquote markers per line so `> ` prefixes don't count toward the
2836 // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2837 let mut line_idx = start_pos + 1;
2838 while line_idx < lines.len() {
2839 let raw = lines[line_idx];
2840 let inner = prefix.strip(raw);
2841 for &b in inner.as_bytes() {
2842 match (quote, b) {
2843 (None, b'"') | (None, b'\'') => quote = Some(b),
2844 (Some(q), x) if x == q => quote = None,
2845 (None, b'>') => return Some(line_idx),
2846 _ => {}
2847 }
2848 }
2849 line_idx += 1;
2850 }
2851
2852 None
2853}
2854
2855/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2856/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2857/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2858/// open tag (no `>` in the remaining input) as paragraph text rather than
2859/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2860/// reparse the same content recursively, causing a stack overflow.
2861///
2862/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2863/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2864/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2865/// the tag eventually closes far away.
2866pub(crate) fn pandoc_html_open_tag_closes(
2867 lines: &[&str],
2868 start_pos: usize,
2869 prefix: &ContainerPrefix,
2870) -> bool {
2871 if start_pos >= lines.len() {
2872 return false;
2873 }
2874 let mut quote: Option<u8> = None;
2875 for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2876 let inner = prefix.strip(line);
2877 let bytes = inner.as_bytes();
2878 let mut i = 0usize;
2879 if offset == start_pos {
2880 while i < bytes.len() && bytes[i] == b' ' {
2881 i += 1;
2882 }
2883 if bytes.get(i) != Some(&b'<') {
2884 return false;
2885 }
2886 i += 1;
2887 }
2888 while i < bytes.len() {
2889 match (quote, bytes[i]) {
2890 (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2891 (Some(q), x) if x == q => quote = None,
2892 (None, b'>') => return true,
2893 _ => {}
2894 }
2895 i += 1;
2896 }
2897 }
2898 false
2899}
2900
/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
/// Phase 6 structural lift.
///
/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
/// - Line 0: (optional WHITESPACE indent) + TEXT("<{tag_name}")
///   + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
///
/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
/// newline; `parse_html_attribute_list` tolerates whitespace.
///
/// Parameters:
/// - `builder`: green-tree builder receiving the tokens.
/// - `lines`, `start_pos`, `end_line_idx`: the line buffer, the index of the
///   line holding `<{tag_name}`, and the index of the line holding the
///   tag's unquoted `>`.
/// - `tag_name`: only its length is used here, to size the `<{tag_name}`
///   TEXT token on line 0.
/// - `bq_depth`: number of blockquote markers stripped from each line before
///   tokenizing; `0` leaves lines untouched.
/// - `lift_trailing`: when true and the last line has bytes after `>`, those
///   bytes and the line's newline are appended to `pre_content` instead of
///   being emitted as TEXT / NEWLINE tokens.
/// - `pre_content`: accumulator for the lifted trailing bytes.
#[allow(clippy::too_many_arguments)]
fn emit_multiline_open_tag_with_attrs(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    end_line_idx: usize,
    tag_name: &str,
    bq_depth: usize,
    lift_trailing: bool,
    pre_content: &mut String,
) {
    // Byte length of the `<{tag_name}` literal.
    let prefix_len = 1 + tag_name.len();
    // Visit exactly the lines `start_pos..=end_line_idx` (clamped to the
    // buffer), keeping their absolute indices.
    for (line_idx, raw) in lines
        .iter()
        .enumerate()
        .take(end_line_idx + 1)
        .skip(start_pos)
    {
        // Strip `bq_depth` blockquote markers from the source line so
        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
        // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
        // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
        // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
        // before this parser runs; re-emitting it here would double
        // the bytes and break losslessness.
        let stripped = if bq_depth > 0 {
            strip_n_blockquote_markers(raw, bq_depth)
        } else {
            raw
        };
        // Number of leading bytes the strip removed from `raw`.
        let bq_prefix_len = raw.len() - stripped.len();
        if bq_prefix_len > 0 && line_idx != start_pos {
            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
        }
        let line = stripped;
        // The newline is emitted separately at the bottom of the loop
        // (or lifted into `pre_content` on the last line).
        let (line_no_nl, newline_str) = strip_newline(line);

        if line_idx == start_pos {
            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
            // + attrs)?. The closing `>` is on a later line, so any
            // remaining bytes after "<{tag_name}" on this line are the
            // start of the attribute region.
            let bytes = line_no_nl.as_bytes();
            // Line 0's indent counts spaces only.
            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            // Defensive: caller verified the line starts with `<{tag_name}`.
            let after_indent = &line_no_nl[indent_end..];
            if after_indent.len() >= prefix_len {
                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
                let rest = &after_indent[prefix_len..];
                emit_attr_region(builder, rest);
            } else {
                // Too short to hold `<{tag_name}` — keep the bytes as one
                // TEXT token so nothing is lost.
                builder.token(SyntaxKind::TEXT.into(), after_indent);
            }
        } else if line_idx < end_line_idx {
            // Pure attribute line.
            let bytes = line_no_nl.as_bytes();
            // Continuation-line indent may mix spaces and tabs.
            let indent_end = bytes
                .iter()
                .position(|&b| !matches!(b, b' ' | b'\t'))
                .unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            let attrs_text = &line_no_nl[indent_end..];
            if !attrs_text.is_empty() {
                builder.start_node(SyntaxKind::HTML_ATTRS.into());
                builder.token(SyntaxKind::TEXT.into(), attrs_text);
                builder.finish_node();
            }
        } else {
            // Last line: indent + attrs + ">" + trailing.
            let bytes = line_no_nl.as_bytes();
            let indent_end = bytes
                .iter()
                .position(|&b| !matches!(b, b' ' | b'\t'))
                .unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            // Find the unquoted `>` byte position in this line.
            // NOTE(review): quote state starts fresh on this line rather
            // than being carried over from earlier lines — confirm this
            // matches how the caller located `end_line_idx`.
            let mut quote: Option<u8> = None;
            let mut gt_pos: Option<usize> = None;
            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
                // `j` is relative to the indent; convert to a line offset.
                let actual_j = indent_end + j;
                match (quote, b) {
                    (None, b'"') | (None, b'\'') => quote = Some(b),
                    (Some(q), x) if x == q => quote = None,
                    (None, b'>') => {
                        gt_pos = Some(actual_j);
                        break;
                    }
                    _ => {}
                }
            }
            let Some(gt) = gt_pos else {
                // Defensive — caller said `>` is on this line.
                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
                if !newline_str.is_empty() {
                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
                }
                // The newline was emitted just above; skip the shared
                // emission at the bottom of the loop.
                continue;
            };
            // Attribute region: between indent_end and gt, with possibly
            // trailing whitespace before `>`.
            let attrs_region = &line_no_nl[indent_end..gt];
            let region_bytes = attrs_region.as_bytes();
            // Strip trailing whitespace from attrs region; emit as
            // separate WHITESPACE so HTML_ATTRS only contains attribute
            // bytes.
            let mut attr_end = region_bytes.len();
            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
                attr_end -= 1;
            }
            let attrs_text = &attrs_region[..attr_end];
            let trailing_ws = &attrs_region[attr_end..];
            if !attrs_text.is_empty() {
                builder.start_node(SyntaxKind::HTML_ATTRS.into());
                builder.token(SyntaxKind::TEXT.into(), attrs_text);
                builder.finish_node();
            }
            if !trailing_ws.is_empty() {
                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
            }
            builder.token(SyntaxKind::TEXT.into(), ">");
            let after_gt = &line_no_nl[gt + 1..];
            if lift_trailing && !after_gt.is_empty() {
                // Lift trailing bytes (and the trailing newline) into
                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
                // with `TEXT(">")`. The recursive parse at the close-marker
                // site treats `pre_content` as the leading bytes of the
                // structural body — same shape produced by `emit_open_tag_tokens`
                // for single-line opens.
                pre_content.push_str(after_gt);
                pre_content.push_str(newline_str);
                // The newline now lives in `pre_content`; do not also emit
                // it as a NEWLINE token.
                continue;
            }
            if !after_gt.is_empty() {
                builder.token(SyntaxKind::TEXT.into(), after_gt);
            }
        }

        // Shared line terminator for every path that did not `continue`.
        if !newline_str.is_empty() {
            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
        }
    }
}
3065
3066/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3067/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3068/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3069/// node is added. Pandoc's projector reads attributes only for `<div>` /
3070/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3071fn emit_multiline_open_tag_simple(
3072 builder: &mut GreenNodeBuilder<'static>,
3073 lines: &[&str],
3074 start_pos: usize,
3075 end_line_idx: usize,
3076 bq_depth: usize,
3077) {
3078 for (line_idx, raw) in lines
3079 .iter()
3080 .enumerate()
3081 .take(end_line_idx + 1)
3082 .skip(start_pos)
3083 {
3084 let stripped = if bq_depth > 0 {
3085 strip_n_blockquote_markers(raw, bq_depth)
3086 } else {
3087 raw
3088 };
3089 let bq_prefix_len = raw.len() - stripped.len();
3090 // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3091 // re-emit prefixes only for subsequent lines.
3092 if bq_prefix_len > 0 && line_idx != start_pos {
3093 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3094 }
3095 let (line_no_nl, newline_str) = strip_newline(stripped);
3096 if !line_no_nl.is_empty() {
3097 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3098 }
3099 if !newline_str.is_empty() {
3100 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3101 }
3102 }
3103}
3104
3105/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3106/// `<div` literal up to end-of-line. Called only from
3107/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3108/// pure attribute (and possibly inter-attribute whitespace).
3109fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3110 if region.is_empty() {
3111 return;
3112 }
3113 let bytes = region.as_bytes();
3114 // Split a leading run of whitespace into a WHITESPACE token so the
3115 // HTML_ATTRS node holds only attribute bytes.
3116 let ws_end = bytes
3117 .iter()
3118 .position(|&b| !matches!(b, b' ' | b'\t'))
3119 .unwrap_or(bytes.len());
3120 if ws_end > 0 {
3121 builder.token(SyntaxKind::WHITESPACE.into(), ®ion[..ws_end]);
3122 }
3123 let attrs_text = ®ion[ws_end..];
3124 if !attrs_text.is_empty() {
3125 builder.start_node(SyntaxKind::HTML_ATTRS.into());
3126 builder.token(SyntaxKind::TEXT.into(), attrs_text);
3127 builder.finish_node();
3128 }
3129}
3130
3131/// Emit one continuation line of an HTML block, preserving any blockquote
3132/// markers as structural tokens (so the CST stays byte-equal to the source
3133/// and downstream consumers can strip them per-context).
3134fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3135 let inner = if bq_depth > 0 {
3136 let stripped = strip_n_blockquote_markers(line, bq_depth);
3137 let prefix_len = line.len() - stripped.len();
3138 if prefix_len > 0 {
3139 for ch in line[..prefix_len].chars() {
3140 if ch == '>' {
3141 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3142 } else {
3143 let mut buf = [0u8; 4];
3144 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3145 }
3146 }
3147 }
3148 stripped
3149 } else {
3150 line
3151 };
3152
3153 let (line_without_newline, newline_str) = strip_newline(inner);
3154 if !line_without_newline.is_empty() {
3155 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3156 }
3157 if !newline_str.is_empty() {
3158 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3159 }
3160}
3161
3162#[cfg(test)]
3163mod tests {
3164 use super::*;
3165
3166 #[test]
3167 fn test_try_parse_html_comment() {
3168 assert_eq!(
3169 try_parse_html_block_start("<!-- comment -->", false),
3170 Some(HtmlBlockType::Comment)
3171 );
3172 assert_eq!(
3173 try_parse_html_block_start(" <!-- comment -->", false),
3174 Some(HtmlBlockType::Comment)
3175 );
3176 }
3177
3178 #[test]
3179 fn test_try_parse_div_tag() {
3180 assert_eq!(
3181 try_parse_html_block_start("<div>", false),
3182 Some(HtmlBlockType::BlockTag {
3183 tag_name: "div".to_string(),
3184 is_verbatim: false,
3185 closed_by_blank_line: false,
3186 depth_aware: true,
3187 closes_at_open_tag: false,
3188 is_closing: false,
3189 })
3190 );
3191 assert_eq!(
3192 try_parse_html_block_start("<div class=\"test\">", false),
3193 Some(HtmlBlockType::BlockTag {
3194 tag_name: "div".to_string(),
3195 is_verbatim: false,
3196 closed_by_blank_line: false,
3197 depth_aware: true,
3198 closes_at_open_tag: false,
3199 is_closing: false,
3200 })
3201 );
3202 }
3203
3204 #[test]
3205 fn test_try_parse_script_tag() {
3206 assert_eq!(
3207 try_parse_html_block_start("<script>", false),
3208 Some(HtmlBlockType::BlockTag {
3209 tag_name: "script".to_string(),
3210 is_verbatim: true,
3211 closed_by_blank_line: false,
3212 depth_aware: true,
3213 closes_at_open_tag: false,
3214 is_closing: false,
3215 })
3216 );
3217 }
3218
3219 #[test]
3220 fn test_try_parse_processing_instruction() {
3221 assert_eq!(
3222 try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3223 Some(HtmlBlockType::ProcessingInstruction)
3224 );
3225 }
3226
3227 #[test]
3228 fn test_try_parse_declaration() {
3229 // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3230 assert_eq!(
3231 try_parse_html_block_start("<!DOCTYPE html>", true),
3232 Some(HtmlBlockType::Declaration)
3233 );
3234 // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3235 // just uppercase. Lowercase doctype must match too.
3236 assert_eq!(
3237 try_parse_html_block_start("<!doctype html>", true),
3238 Some(HtmlBlockType::Declaration)
3239 );
3240 // Pandoc dialect does not — bare declarations fall through to
3241 // paragraph parsing.
3242 assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3243 assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3244 }
3245
3246 #[test]
3247 fn test_dialect_specific_block_tag_membership() {
3248 // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3249 // CommonMark §4.6 type-6 plus a few additions. These tags
3250 // diverge between dialects:
3251 // CM-only block tags (Pandoc treats as inline raw HTML):
3252 // dialog, legend, menuitem, optgroup, option, frame,
3253 // base, basefont, link, param
3254 // Pandoc-only block tags (CM doesn't recognize):
3255 // canvas, hgroup, isindex, meta, output
3256 for cm_only in [
3257 "<dialog>",
3258 "<legend>",
3259 "<menuitem>",
3260 "<optgroup>",
3261 "<option>",
3262 "<frame>",
3263 "<base>",
3264 "<basefont>",
3265 "<link>",
3266 "<param>",
3267 ] {
3268 assert!(
3269 matches!(
3270 try_parse_html_block_start(cm_only, true),
3271 Some(HtmlBlockType::BlockTag { .. })
3272 ),
3273 "{cm_only} should be a block-tag start under CommonMark",
3274 );
3275 assert_eq!(
3276 try_parse_html_block_start(cm_only, false),
3277 None,
3278 "{cm_only} should NOT be a block-tag start under Pandoc",
3279 );
3280 }
3281 for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3282 // Under CM these are not type-6 BlockTags; they may still match
3283 // type-7 (complete tag on a line) which has different semantics.
3284 assert!(
3285 !matches!(
3286 try_parse_html_block_start(pandoc_only, true),
3287 Some(HtmlBlockType::BlockTag { .. })
3288 ),
3289 "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3290 );
3291 assert!(
3292 matches!(
3293 try_parse_html_block_start(pandoc_only, false),
3294 Some(HtmlBlockType::BlockTag { .. })
3295 ),
3296 "{pandoc_only} should be a block-tag start under Pandoc",
3297 );
3298 }
3299 }
3300
3301 #[test]
3302 fn test_pandoc_inline_block_tag_membership() {
3303 // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3304 // fresh-block positions under Pandoc dialect. We list the
3305 // non-void, non-script subset (verbatim `script` is handled
3306 // via the verbatim path; void elements are deferred — see
3307 // PANDOC_INLINE_BLOCK_TAGS docs).
3308 for tag in [
3309 "<button>",
3310 "<iframe>",
3311 "<video>",
3312 "<audio>",
3313 "<noscript>",
3314 "<object>",
3315 "<map>",
3316 "<progress>",
3317 "<del>",
3318 "<ins>",
3319 "<svg>",
3320 "<applet>",
3321 ] {
3322 assert!(
3323 matches!(
3324 try_parse_html_block_start(tag, false),
3325 Some(HtmlBlockType::BlockTag {
3326 depth_aware: true,
3327 ..
3328 })
3329 ),
3330 "{tag} should be a depth-aware block-tag start under Pandoc",
3331 );
3332 }
3333 // Closing forms of inline-block tags also start a block under
3334 // Pandoc — pandoc-native pins `</button>` standalone as a
3335 // single-line `RawBlock`. These use `closes_at_open_tag: true`
3336 // (no balanced match — the close emits as a one-line block on
3337 // its own).
3338 for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3339 assert!(
3340 matches!(
3341 try_parse_html_block_start(closing, false),
3342 Some(HtmlBlockType::BlockTag {
3343 depth_aware: false,
3344 closes_at_open_tag: true,
3345 ..
3346 })
3347 ),
3348 "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3349 );
3350 }
3351 }
3352
3353 #[test]
3354 fn test_pandoc_void_block_tag_membership() {
3355 // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3356 // at fresh-block positions under Pandoc dialect, with
3357 // `closes_at_open_tag: true` — the block always ends on the
3358 // open-tag line (no closing tag to match).
3359 for tag in [
3360 "<area>",
3361 "<embed>",
3362 "<source>",
3363 "<track>",
3364 "<embed src=\"foo.swf\">",
3365 "<source src=\"foo.mp4\" type=\"video/mp4\">",
3366 ] {
3367 assert!(
3368 matches!(
3369 try_parse_html_block_start(tag, false),
3370 Some(HtmlBlockType::BlockTag {
3371 depth_aware: false,
3372 closes_at_open_tag: true,
3373 ..
3374 })
3375 ),
3376 "{tag} should be a void block-tag start under Pandoc",
3377 );
3378 }
3379 // Closing forms of void tags also start a single-line block
3380 // under Pandoc. Void elements have no closing tag in HTML, but
3381 // `</embed>` etc. can appear in the wild — pandoc-native still
3382 // emits them as `RawBlock`s at fresh-block positions; mirror
3383 // that with the same `closes_at_open_tag: true` shape.
3384 for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3385 assert!(
3386 matches!(
3387 try_parse_html_block_start(closing, false),
3388 Some(HtmlBlockType::BlockTag {
3389 depth_aware: false,
3390 closes_at_open_tag: true,
3391 ..
3392 })
3393 ),
3394 "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3395 );
3396 }
3397 // Under CommonMark dialect, the void-tag block-start path is
3398 // skipped. `<source>` and `<track>` are in the CM type-6
3399 // BLOCK_TAGS set so they DO start a block, but with CM type-6
3400 // semantics (`closed_by_blank_line: true`,
3401 // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3402 // `<embed>` and `<area>` aren't in the CM type-6 list — they
3403 // fall through to type 7 (complete tag on a line by itself).
3404 assert_eq!(
3405 try_parse_html_block_start("<embed>", true),
3406 Some(HtmlBlockType::Type7)
3407 );
3408 assert_eq!(
3409 try_parse_html_block_start("<area>", true),
3410 Some(HtmlBlockType::Type7)
3411 );
3412 assert!(matches!(
3413 try_parse_html_block_start("<source src=\"x\">", true),
3414 Some(HtmlBlockType::BlockTag {
3415 closed_by_blank_line: true,
3416 closes_at_open_tag: false,
3417 ..
3418 })
3419 ));
3420 assert!(matches!(
3421 try_parse_html_block_start("<track src=\"x\">", true),
3422 Some(HtmlBlockType::BlockTag {
3423 closed_by_blank_line: true,
3424 closes_at_open_tag: false,
3425 ..
3426 })
3427 ));
3428 }
3429
#[test]
fn test_find_multiline_open_end() {
    // Every case scans from line 0 and hands that same line in as the
    // first-line argument, so one closure can drive all of them.
    let scan = |lines: &[&str], tag: &str, prefix: &ContainerPrefix| {
        find_multiline_open_end(lines, 0, lines[0], tag, prefix)
    };
    let no_prefix = ContainerPrefix::default();

    // An open tag that already ends on its first line yields None
    // (the caller takes the regular path).
    assert_eq!(scan(&["<div id=\"x\">"], "div", &no_prefix), None);
    assert_eq!(scan(&["<embed src=\"x\">"], "embed", &no_prefix), None);

    // For a multi-line open tag the result is the index of the line
    // carrying the closing `>`.
    assert_eq!(
        scan(&["<embed", " src=\"x\">"], "embed", &no_prefix),
        Some(1)
    );
    assert_eq!(
        scan(
            &["<embed", " src=\"x\"", " type=\"video\">"],
            "embed",
            &no_prefix
        ),
        Some(2)
    );

    // The requested tag name has to match the one on the line; the
    // comparison ignores ASCII case.
    assert_eq!(scan(&["<embed", " src=\"x\">"], "div", &no_prefix), None);
    assert_eq!(
        scan(&["<EMBED", " src=\"x\">"], "embed", &no_prefix),
        Some(1)
    );

    // A `>` inside a quoted attribute value is not the end of the tag,
    // and the open quote carries over onto the next line.
    assert_eq!(
        scan(&["<embed title=\"a>b", " c\">"], "embed", &no_prefix),
        Some(1)
    );

    // Running out of lines without ever seeing `>` yields None.
    assert_eq!(scan(&["<embed", " src=\"x\""], "embed", &no_prefix), None);

    // Continuation lines inside a blockquote have their `> ` markers
    // stripped before scanning, so the markers are not mistaken for the
    // closing `>`.
    assert_eq!(
        scan(
            &["<div", "> id=\"x\">"],
            "div",
            &ContainerPrefix::bq_only(1)
        ),
        Some(1)
    );

    // Two levels of blockquote: two `> ` markers are stripped per line.
    assert_eq!(
        scan(
            &["<section", "> > id=\"x\">"],
            "section",
            &ContainerPrefix::bq_only(2)
        ),
        Some(1)
    );
}
3542
#[test]
fn test_pandoc_html_open_tag_closes() {
    // All cases start the scan at line 0 with an empty container prefix.
    let closes_from_top =
        |lines: &[&str]| pandoc_html_open_tag_closes(lines, 0, &ContainerPrefix::default());

    // The `>` is found on the very first line.
    assert!(closes_from_top(&["<div>"]));
    assert!(closes_from_top(&["<embed src=\"x\">"]));

    // The `>` is found on a later line.
    assert!(closes_from_top(&["<div", " id=\"x\">", "body", "</div>"]));
    assert!(closes_from_top(&["<embed", " src=\"x.png\" alt=\"y\">"]));

    // Quote state is threaded through the scan: a `>` inside a quoted
    // value does not close the tag, an unquoted one afterwards does.
    assert!(!closes_from_top(&["<div title=\"a>b", " c\""]));
    assert!(closes_from_top(&["<div title=\"a>b", " c\">"]));

    // No `>` anywhere means the tag is incomplete — pandoc treats it as
    // paragraph text.
    assert!(!closes_from_top(&["<embed"]));
    assert!(!closes_from_top(&["<div", "foo", "bar"]));

    // Blank lines in the middle of an open tag are tolerated (pandoc's
    // `htmlTag` reads across them); scanning goes on until EOF or `>`.
    assert!(closes_from_top(&["<div", "", "id=\"x\">"]));
}
3597
#[test]
fn test_try_parse_cdata() {
    let cdata_line = "<![CDATA[content]]>";

    // With the CommonMark flag set, CDATA starts a type-5 HTML block.
    let under_commonmark = try_parse_html_block_start(cdata_line, true);
    assert_eq!(under_commonmark, Some(HtmlBlockType::CData));

    // With the flag cleared (Pandoc dialect) the same line starts nothing.
    let under_pandoc = try_parse_html_block_start(cdata_line, false);
    assert_eq!(under_pandoc, None);
}
3611
#[test]
fn test_extract_block_tag_name_open_only() {
    // Plain, attribute-bearing and self-closing opens all yield `div`.
    for open_tag in ["<div>", "<div class=\"test\">", "<div/>"] {
        assert_eq!(
            extract_block_tag_name(open_tag, false).as_deref(),
            Some("div")
        );
    }

    // With the second argument `false`, a closing tag, an empty tag and
    // a tag with a space before its name all yield no name.
    for rejected in ["</div>", "<>", "< div>"] {
        assert_eq!(extract_block_tag_name(rejected, false), None);
    }
}
3630
#[test]
fn test_extract_block_tag_name_with_closing() {
    // CommonMark §4.6 type-6 starts accept closing tags as well, with or
    // without whitespace before the final `>`.
    let expected = Some("div".to_string());
    assert_eq!(extract_block_tag_name("</div>", true), expected);
    assert_eq!(extract_block_tag_name("</div >", true), expected);
}
3643
#[test]
fn test_commonmark_type6_closing_tag_start() {
    // Pin every field of the block type produced for a closing `</div>`
    // under CommonMark.
    let expected = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: true,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: true,
    };

    let parsed = try_parse_html_block_start("</div>", true);
    assert_eq!(parsed, Some(expected));
}
3658
#[test]
fn test_commonmark_type7_open_tag() {
    // `<a>` is not a type-6 tag. Alone on a line it is a type-7 start
    // under CommonMark and no block start at all otherwise.
    let anchor_open = "<a href=\"foo\">";

    let commonmark = try_parse_html_block_start(anchor_open, true);
    assert_eq!(commonmark, Some(HtmlBlockType::Type7));

    let non_commonmark = try_parse_html_block_start(anchor_open, false);
    assert_eq!(non_commonmark, None);
}
3669
#[test]
fn test_commonmark_type7_close_tag() {
    // A lone closing tag is reported as a type-7 start under CommonMark.
    let parsed = try_parse_html_block_start("</ins>", true);
    assert_eq!(parsed, Some(HtmlBlockType::Type7));
}
3677
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
    // Only whitespace may follow the complete tag; trailing text means
    // the line is not a block start.
    let parsed = try_parse_html_block_start("<a> hi", true);
    assert_eq!(parsed, None);
}
3683
#[test]
fn test_is_closing_marker_comment() {
    let comment = HtmlBlockType::Comment;

    // `-->` closes a comment block, alone or after other text.
    for closing_line in ["-->", "end -->"] {
        assert!(is_closing_marker(closing_line, &comment));
    }

    // The comment opener is not a closing marker.
    assert!(!is_closing_marker("<!--", &comment));
}
3691
#[test]
fn test_is_closing_marker_tag() {
    let div_block = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: false,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: false,
    };

    // The close tag matches regardless of letter case, and also when it
    // follows other content on the line.
    for closing_line in ["</div>", "</DIV>", "content</div>"] {
        assert!(is_closing_marker(closing_line, &div_block));
    }

    // An opening tag of the same name does not close the block.
    assert!(!is_closing_marker("<div>", &div_block));
}
3707
#[test]
fn test_parse_html_comment_block() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // A comment that opens and closes on one line.
    let source = "<!-- comment -->\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // Only the single comment line is consumed.
    assert_eq!(next_line, 1);
}
3728
#[test]
fn test_parse_div_block() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Open tag, one content line, close tag.
    let source = "<div>\ncontent\n</div>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // All three lines belong to the block.
    assert_eq!(next_line, 3);
}
3749
#[test]
fn test_parse_html_block_no_closing() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The `<div>` is never closed before the input ends.
    let source = "<div>\ncontent\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // Without a closing tag the block still swallows every remaining line.
    assert_eq!(next_line, 2);
}
3771
#[test]
fn test_parse_div_block_nested_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // Under the Pandoc dialect, `<div>...<div>...</div>...</div>` has to
    // end at the OUTER `</div>` rather than at the first `</div>`
    // encountered. A CommonMark-style "first close" scan gets this
    // wrong; Pandoc's div parser tracks depth (mirrors `htmlInBalanced`).
    let source =
        "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    // Passing `false` for is_commonmark selects the Pandoc dialect.
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Nine lines in total — outer open, blank, inner open, blank,
    // content, blank, inner close, blank, outer close — and every one
    // of them is consumed.
    assert_eq!(next_line, 9);
}
3800
#[test]
fn test_parse_div_block_same_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // `<div>foo</div>` on one line: one open and one close bring the
    // depth back to zero, so the block ends on its first line. Guards
    // the depth-aware tracking against regressing this case.
    let source = "<div>foo</div>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    assert_eq!(next_line, 1);
}
3822
#[test]
fn test_commonmark_verbatim_first_close() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // A CommonMark verbatim tag (`<script>`) ends, per CommonMark §4.6
    // type-1, at the first matching close — there is no depth tracking.
    // The JS string hides a bogus inner `<script>`; the outer block
    // must still end at the first `</script>`.
    let source = "<script>\nlet x = '<script>';\n</script>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    // Passing `true` for is_commonmark selects the CommonMark dialect.
    let start = try_parse_html_block_start(source_lines[0], true).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // Three lines, with the close on line index 2, so parsing resumes
    // at index 3.
    assert_eq!(next_line, 3);
}
3848
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag is spread over several lines and its closing `>`
    // sits on a line of its own:
    //
    // <div
    // id="x"
    // class="y"
    // >
    //
    // foo
    //
    // </div>
    //
    // The open tag covers lines 0..=3; content begins at line 4.
    let source = "<div\n id=\"x\"\n class=\"y\"\n>\n\nfoo\n\n</div>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Eight lines: four open-tag lines (`<div`, ` id="x"`, ` class="y"`,
    // `>`), then blank, foo, blank, </div>.
    assert_eq!(next_line, 8);

    let tree = builder.finish();
    let root = crate::syntax::SyntaxNode::new_root(tree);

    // The CST has to expose a structural HTML_ATTRS region holding the
    // attribute bytes (so the salsa anchor walk picks up `id="x"`).
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token reproduces the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3901
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // The open tag is spread over several lines and its closing `>`
    // ends the last attribute line. Case 0262 already covers this
    // pattern; this pins the behavior and also checks that HTML_ATTRS
    // is exposed structurally.
    let source = "<div\n id=\"x\"\n class=\"y\">\nfoo\n</div>\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], false).unwrap();

    let mut builder = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut builder,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Five lines: three open-tag lines (the third carries the `>`),
    // then foo and </div>.
    assert_eq!(next_line, 5);

    let tree = builder.finish();
    let root = crate::syntax::SyntaxNode::new_root(tree);

    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenating every token must give back the input unchanged.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3942
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    use crate::parser::utils::helpers::split_lines_inclusive;

    // A `<div>` with no closing tag, followed by a blank line and more text.
    let source = "<div>\nfoo\n\nbar\n";
    let source_lines: Vec<&str> = split_lines_inclusive(source);
    let options = ParserOptions::default();
    let no_containers = ContainerPrefix::default();
    let start = try_parse_html_block_start(source_lines[0], true).unwrap();

    let mut green = GreenNodeBuilder::new();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &source_lines,
        0,
        start,
        &no_containers,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The block holds `<div>\nfoo\n` and stops at the blank line, which
    // is line index 2.
    assert_eq!(next_line, 2);
}
3964}