panache_parser/parser/blocks/html_blocks.rs
1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use super::container_prefix::{
10 ContainerPrefix, ContainerPrefixLine, ContainerPrefixState, emit_container_prefix_tokens,
11};
12use crate::parser::utils::attributes::emit_html_attrs_node;
13use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
14
/// Tag names that open an HTML block at the start of a line under the
/// CommonMark spec (the §4.6 "type 6" set).
///
/// Entries are lowercase and kept in alphabetical order, grouped by
/// initial letter for scanning.
#[rustfmt::skip] // keep the per-letter grouping instead of one entry per line
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
81
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// These are the same four names that start a CommonMark §4.6 type-1
/// block (`<pre`, `<script`, `<style`, `<textarea`). Entries are lowercase.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
84
/// Strict block-level tags for the Pandoc dialect — a mirror of pandoc's
/// `blockHtmlTags` (`pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`).
///
/// Compared with the CommonMark §4.6 type-6 set this table is narrower in
/// some places and wider in others:
///
/// * omitted (pandoc treats them as raw inline HTML): e.g. `dialog`,
///   `legend`, `optgroup`, `option`, `frame`, `link`, `param`, `base`,
///   `basefont`, `menuitem`;
/// * added (pandoc keeps them block-level): `canvas`, `hgroup`, `isindex`,
///   `meta`, `output`.
///
/// Pandoc's `eitherBlockOrInline` tags (`audio`, `button`, `iframe`,
/// `noscript`, `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`,
/// `applet`, the void elements `embed`, `area`, `source`, `track`, and the
/// verbatim `script`) are tracked separately — see
/// [`PANDOC_INLINE_BLOCK_TAGS`]. Those act as block starters at fresh-block
/// positions but stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them inline
/// once an inline-only tag or text byte has been seen since the last
/// splitter.
///
/// Entries are lowercase and alphabetically ordered.
#[rustfmt::skip] // keep the per-letter grouping instead of one entry per line
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "blockquote", "body",
    "canvas", "caption", "center", "col", "colgroup",
    "dd", "details", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "isindex",
    "li",
    "main", "menu", "meta",
    "nav", "noframes",
    "ol", "output",
    "p", "pre",
    "script", "section", "style", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "tr",
    "ul",
];
162
163/// Whether `name` (case-insensitive) is one of the HTML block-level tags
164/// recognized by CommonMark §4.6 type-6.
165pub fn is_html_block_tag_name(name: &str) -> bool {
166 let lower = name.to_ascii_lowercase();
167 BLOCK_TAGS.contains(&lower.as_str())
168}
169
170/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
171/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
172/// Used by the pandoc-native projector's `split_html_block_by_tags` to
173/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
174/// the block — block-level tags emit as separate `RawBlock` entries;
175/// inline tags stay inline in the surrounding `Plain` content.
176pub fn is_pandoc_block_tag_name(name: &str) -> bool {
177 let lower = name.to_ascii_lowercase();
178 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
179}
180
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`).
///
/// At top level (or after a blank line) pandoc treats
/// `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but inside an
/// existing HTML block, once a paragraph has started parsing, the same
/// tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Two groups from pandoc's set are deliberately absent from this table:
///
/// * the void elements (`area`, `embed`, `source`, `track`) live in
///   [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
///   rule but emit a single RawBlock per instance instead of a
///   matched-pair lift;
/// * `script` is omitted because it is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and the strict-block check
///   fires first regardless.
///
/// Entries are lowercase.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
205
206/// Whether `name` (case-insensitive) is one of pandoc's
207/// `eitherBlockOrInline` tags (excluding void elements and `script`;
208/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
209pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
210 let lower = name.to_ascii_lowercase();
211 PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
212}
213
/// The void elements that pandoc classifies as `eitherBlockOrInline`
/// (mirrors `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`):
/// `area`, `embed`, `source`, `track`.
///
/// Other HTML void elements are not listed here because they are
/// inline-only: `br`, `wbr`, `img`, and `input`.
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Entries are lowercase.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
229
230/// Whether `name` (case-insensitive) is one of pandoc's void
231/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
232pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
233 let lower = name.to_ascii_lowercase();
234 PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
235}
236
237/// Whether the given tag name is eligible for the Phase 6 / Fix #4
238/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
239/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
240/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
241/// and NOT void. These are the tags where pandoc parses the body as
242/// fresh markdown between RawBlock emissions of the open/close tags —
243/// exactly the shape we can lift into structural CST children.
244///
245/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
246/// additional gate at the lift-gate site: the lift is abandoned when
247/// the body's first non-blank content is a void block tag at a
248/// fresh-block position (`<video>\n<source ...>\n</video>` projects
249/// per-tag rather than matched-pair, mirroring pandoc).
250///
251/// `<div>` is intentionally excluded — it has its own lift path
252/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
253/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
254pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
255 let lower = name.to_ascii_lowercase();
256 if VERBATIM_TAGS.contains(&lower.as_str()) {
257 return false;
258 }
259 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
260 return false;
261 }
262 if lower == "div" {
263 return false;
264 }
265 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
266 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
267}
268
269/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
270/// — anything that has an opening and a matching closing form whose
271/// `</tag>` would be recognized by the dispatcher as a separate block
272/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
273/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
274/// Void tags are excluded — they have no close form.
275///
276/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
277/// an open inside the buffer whose close would otherwise interrupt the
278/// list item mid-construct.
279pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
280 let lower = name.to_ascii_lowercase();
281 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
282 return false;
283 }
284 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
285 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
286 || VERBATIM_TAGS.contains(&lower.as_str())
287}
288
289/// Open-tag-attribute tokenization gate for non-div strict-block tags
290/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
291/// open tag is eligible for finer-grained tokenization
292/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
293/// driving the full body lift — that's the `bq_clean_lift` path. The
294/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
295/// the salsa anchor index.
296///
297/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
298/// wrapper) regardless of bq depth, so this gate skips it.
299fn bq_strict_attr_emit_tag_name(
300 wrapper_kind: SyntaxKind,
301 block_type: &HtmlBlockType,
302 bq_depth: usize,
303) -> Option<&str> {
304 if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
305 return None;
306 }
307 match block_type {
308 HtmlBlockType::BlockTag {
309 tag_name,
310 is_verbatim: false,
311 closed_by_blank_line: false,
312 depth_aware: true,
313 closes_at_open_tag: false,
314 is_closing: false,
315 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
316 _ => None,
317 }
318}
319
/// Information about a detected HTML block opening.
///
/// Produced by `try_parse_html_block_start`; the variant (and, for
/// `BlockTag`, its flags) determines how the end of the block is found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Block-level tag (CommonMark types 6/1 — `tag_name` is one of
    /// `BLOCK_TAGS` or `VERBATIM_TAGS`).
    ///
    /// NOTE(review): under the Pandoc dialect
    /// `try_parse_html_block_start` also builds this variant for names
    /// from `PANDOC_BLOCK_TAGS`, `PANDOC_INLINE_BLOCK_TAGS` and
    /// `PANDOC_VOID_BLOCK_TAGS`, so the tag is not always from the two
    /// CommonMark tables.
    BlockTag {
        /// Tag name; `try_parse_html_block_start` stores it lowercased.
        tag_name: String,
        /// Whether the tag holds raw/verbatim content. The Pandoc
        /// standalone-close path sets this to `false` even for verbatim
        /// tag names.
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics (block ends at blank
        /// line); otherwise the legacy "ends at matching `</tag>`"
        /// semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored
        /// when `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after
        /// the open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to
        /// end-of-input. Also set for standalone closing forms under the
        /// Pandoc dialect.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror
        /// pandoc's `isInlineTag` special cases (e.g. `</script>` is
        /// inline even when `<script>` is not — pandoc treats the
        /// close-form as always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
364
365/// Try to detect an HTML block opening from content.
366/// Returns block type if this is a valid HTML block start.
367///
368/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
369/// accept closing tags (`</div>`), type-6 blocks end at the next blank
370/// line (rather than a matching close tag), and type 7 is recognized.
371pub(crate) fn try_parse_html_block_start(
372 content: &str,
373 is_commonmark: bool,
374) -> Option<HtmlBlockType> {
375 let trimmed = strip_leading_spaces(content);
376
377 // Must start with <
378 if !trimmed.starts_with('<') {
379 return None;
380 }
381
382 // HTML comment
383 if trimmed.starts_with("<!--") {
384 return Some(HtmlBlockType::Comment);
385 }
386
387 // Processing instruction
388 if trimmed.starts_with("<?") {
389 return Some(HtmlBlockType::ProcessingInstruction);
390 }
391
392 // CDATA section — CommonMark dialect only. Pandoc-markdown does not
393 // recognize bare CDATA as a raw HTML block; the literal bytes fall
394 // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
395 // text is parsed as inline markdown, etc).
396 if is_commonmark && trimmed.starts_with("<![CDATA[") {
397 return Some(HtmlBlockType::CData);
398 }
399
400 // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
401 // does not recognize bare declarations as raw HTML blocks (its
402 // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
403 // tag-shaped blocks); the bytes fall through to paragraph parsing.
404 if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
405 let after_bang = &trimmed[2..];
406 if after_bang.chars().next()?.is_ascii_alphabetic() {
407 return Some(HtmlBlockType::Declaration);
408 }
409 }
410
411 // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
412 // Pandoc-native recognizes standalone closing forms of strict-block tags
413 // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
414 // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
415 // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
416 // the open-tag line via `closes_at_open_tag: true`.
417 if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
418 let tag_lower = tag_name.to_lowercase();
419 let is_closing = trimmed.starts_with("</");
420
421 // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
422 // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
423 // Unlike inline-block / void closes, these CAN interrupt a running
424 // paragraph (the dispatcher's `cannot_interrupt` only covers the
425 // inline-block / void categories). Inline-block / void closes are
426 // handled by their own branches further below.
427 if !is_commonmark
428 && is_closing
429 && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
430 || VERBATIM_TAGS.contains(&tag_lower.as_str()))
431 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
432 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
433 {
434 return Some(HtmlBlockType::BlockTag {
435 tag_name: tag_lower,
436 is_verbatim: false,
437 closed_by_blank_line: false,
438 depth_aware: false,
439 closes_at_open_tag: true,
440 is_closing: true,
441 });
442 }
443
444 // Under Pandoc, remaining closing forms (truly inline-only tags like
445 // `</em>`, `</span>`) are not block starts — fall through to the
446 // existing inline-html path. Inline-block + void closes are caught
447 // by the dedicated branches further below.
448 if !is_commonmark
449 && is_closing
450 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
451 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
452 {
453 return None;
454 }
455
456 // Check if it's a block-level tag. Pandoc and CommonMark disagree on
457 // membership: pandoc's `blockHtmlTags` (see
458 // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
459 // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
460 // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
461 let is_block_tag = if is_commonmark {
462 BLOCK_TAGS.contains(&tag_lower.as_str())
463 } else {
464 PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
465 };
466 if is_block_tag {
467 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
468 return Some(HtmlBlockType::BlockTag {
469 tag_name: tag_lower,
470 is_verbatim,
471 closed_by_blank_line: is_commonmark && !is_verbatim,
472 depth_aware: !is_commonmark,
473 closes_at_open_tag: false,
474 is_closing,
475 });
476 }
477
478 // Pandoc dialect also treats `eitherBlockOrInline` tags as block
479 // starters at fresh-block positions. The block dispatcher caller
480 // gates these as `cannot_interrupt` (mirrors pandoc — they never
481 // interrupt a running paragraph; only start a fresh block when
482 // following a blank line or at document start). Closing forms
483 // (`</video>`) emit as a single-line `RawBlock` with no balanced
484 // match — pandoc-native pins this for standalone closes.
485 if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
486 return Some(HtmlBlockType::BlockTag {
487 tag_name: tag_lower,
488 is_verbatim: false,
489 closed_by_blank_line: false,
490 depth_aware: !is_closing,
491 closes_at_open_tag: is_closing,
492 is_closing,
493 });
494 }
495
496 // Pandoc dialect also recognizes the void subset of
497 // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
498 // These have no closing tag, so the parser closes the block
499 // immediately on the open-tag line; the projector's
500 // `split_html_block_by_tags` handles the same-line splitting
501 // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
502 // non-void inline-block tags, void tags never interrupt a
503 // running paragraph (gated as `cannot_interrupt` in the
504 // dispatcher). Closing forms (`</embed>`) — semantically
505 // nonsensical for void elements — pandoc still emits as a
506 // single-line `RawBlock`; mirror that.
507 if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
508 return Some(HtmlBlockType::BlockTag {
509 tag_name: tag_lower,
510 is_verbatim: false,
511 closed_by_blank_line: false,
512 depth_aware: false,
513 closes_at_open_tag: true,
514 is_closing,
515 });
516 }
517
518 // Also accept verbatim tags even if not in BLOCK_TAGS list — but
519 // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
520 // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
521 // do not start a type-1 block. Letting `</pre>` through here would
522 // wrongly interrupt a paragraph.
523 if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
524 return Some(HtmlBlockType::BlockTag {
525 tag_name: tag_lower,
526 is_verbatim: true,
527 closed_by_blank_line: false,
528 depth_aware: !is_commonmark,
529 closes_at_open_tag: false,
530 is_closing: false,
531 });
532 }
533 }
534
535 // Type 7 (CommonMark only): complete open or close tag on a line by
536 // itself, tag name not in the type-1 verbatim list.
537 if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
538 {
539 let rest = &trimmed[end..];
540 let only_ws = rest
541 .bytes()
542 .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
543 if only_ws {
544 // Reject if the tag name belongs to the type-1 verbatim set
545 // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
546 // type-1 starts above, so seeing one here means the opener
547 // had a different shape (e.g. `<pre/>` self-closing) that
548 // shouldn't trigger type 7 either. Conservatively skip.
549 let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
550 let name_end = leading
551 .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
552 .unwrap_or(leading.len());
553 let name = leading[..name_end].to_ascii_lowercase();
554 if !VERBATIM_TAGS.contains(&name.as_str()) {
555 return Some(HtmlBlockType::Type7);
556 }
557 }
558 }
559
560 None
561}
562
/// Pulls the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted only
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form). The name runs up to the first whitespace character, `>`, or
/// `/` — an approximation of the spec's "followed by a space, tab, line
/// ending, `>`, or `/>`" rule — or to the end of `text`.
///
/// Returns `None` when `text` does not start with `<`, when a closing
/// form is present but not accepted, when the name is empty, when it does
/// not start with an ASCII letter, or when it contains anything other
/// than ASCII alphanumerics. The name is returned with its original case.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    // `split` always yields at least one (possibly empty) piece.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    // An empty name has no first character and is rejected by `?`.
    let mut chars = name.chars();
    let starts_with_letter = chars.next()?.is_ascii_alphabetic();
    let rest_is_alphanumeric = chars.all(|c| c.is_ascii_alphanumeric());

    (starts_with_letter && rest_is_alphanumeric).then(|| name.to_owned())
}
607
608/// Whether this block type ends at a blank line (CommonMark types 6 & 7
609/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
610/// marker — only at end of input or the next blank line.
611fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
612 matches!(
613 block_type,
614 HtmlBlockType::Type7
615 | HtmlBlockType::BlockTag {
616 closed_by_blank_line: true,
617 ..
618 }
619 )
620}
621
622/// Check if a line contains the closing marker for the given HTML block type.
623/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
624/// blank-line-terminated types (6 in CommonMark, 7) never match here.
625fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
626 match block_type {
627 HtmlBlockType::Comment => line.contains("-->"),
628 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
629 HtmlBlockType::Declaration => line.contains('>'),
630 HtmlBlockType::CData => line.contains("]]>"),
631 HtmlBlockType::BlockTag {
632 tag_name,
633 closed_by_blank_line: false,
634 ..
635 } => {
636 // Look for closing tag </tagname>
637 let closing_tag = format!("</{}>", tag_name);
638 line.to_lowercase().contains(&closing_tag)
639 }
640 HtmlBlockType::BlockTag {
641 closed_by_blank_line: true,
642 ..
643 }
644 | HtmlBlockType::Type7 => false,
645 }
646}
647
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close) in
/// `line`, returned as `(opens, closes)`.
///
/// The tag name is compared ASCII case-insensitively and must be followed
/// by a space, tab, line ending, `>`, `/`, or the end of `line`.
/// Self-closing forms (`<tag .../>`) and tags whose name appears inside a
/// quoted attribute value are NOT counted — the scanner walks `<...>`
/// brackets and respects `"`/`'` quoting.
///
/// A `<` only opens a bracket when the next byte can begin markup: an
/// ASCII letter, `/`, `!`, or `?`. Any other `<` (e.g. the comparison in
/// `a < b <div>`) is plain text and is skipped on its own, so it cannot
/// swallow a real tag that follows it on the same line.
///
/// An unterminated bracket (`<...` with no closing `>`) ends the scan; a
/// matching open or close tag name seen in that bracket is still counted.
/// An empty `tag_name` yields `(0, 0)`.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    let bytes = line.as_bytes();
    let tag_bytes = tag_name.as_bytes();
    if tag_bytes.is_empty() {
        // An empty name cannot identify a tag.
        return (0, 0);
    }

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;

        // A `<` that cannot start a tag, comment, declaration, or
        // processing instruction is literal text: step over just this
        // byte so that later tags on the line are still seen.
        let starts_markup = matches!(
            bytes.get(after),
            Some(&b) if b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?')
        );
        if !starts_markup {
            i = after;
            continue;
        }

        let is_close = bytes[after] == b'/';
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        // Case-insensitive comparison in place; `get` also covers a name
        // that would run past the end of the line.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(tag_bytes));
        // The name must end here; otherwise `<divx>` would count as `<div>`.
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this tag bracket, skipping
        // inside quoted attribute values. Self-closing form ends with `/>`.
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if matched && is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        if found_gt {
            i = j + 1;
        } else {
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to scan.
            break;
        }
    }

    (opens, closes)
}
728
729/// Pandoc-dialect lift for HTML comments / processing instructions
730/// whose close marker is followed by additional bytes (same-line
731/// trailing or following lines). Pandoc-native emits a `RawBlock` for
732/// the marker bytes only, then parses the remainder as fresh blocks.
733///
734/// Returns `Some(consumed_lines)` when the split fires (caller must
735/// NOT enter the legacy emission); `None` to fall back to the legacy
736/// path (no close marker found, or no trailing content to split).
737///
738/// CST shape on success:
739/// ```text
740/// HTML_BLOCK
741/// HTML_BLOCK_TAG (open) // line[0] up to and incl close marker
742/// TEXT "<!-- hi -->" // or with HTML_BLOCK_CONTENT in between
743/// ... // for multi-line `<!--\n…\n-->` shape
744/// <sibling blocks> // recursive parse of trailing + lines[M+1..]
745/// ```
746fn try_parse_comment_pi_with_trailing_split(
747 builder: &mut GreenNodeBuilder<'static>,
748 lines: &[&str],
749 start_pos: usize,
750 block_type: &HtmlBlockType,
751 wrapper_kind: SyntaxKind,
752 bq_depth: usize,
753 config: &ParserOptions,
754) -> Option<usize> {
755 let marker: &str = match block_type {
756 HtmlBlockType::Comment => "-->",
757 HtmlBlockType::ProcessingInstruction => "?>",
758 _ => return None,
759 };
760
761 // Find the close marker in the bq-stripped line content. For
762 // bq_depth == 0 the inner content equals the raw line; for
763 // bq_depth > 0 we look past the `>` markers stripped by the
764 // outer dispatcher (line 0) and emitted as bq prefix below
765 // (lines > 0). `marker_end_in_inner` is the byte offset of the
766 // first byte AFTER the close marker, measured from the start
767 // of the inner (post-strip) content.
768 let mut close_line_idx: Option<usize> = None;
769 let mut marker_end_in_inner: usize = 0;
770 for (offset, line) in lines[start_pos..].iter().enumerate() {
771 let inner = if bq_depth > 0 {
772 strip_n_blockquote_markers(line, bq_depth)
773 } else {
774 line
775 };
776 if let Some(pos) = inner.find(marker) {
777 close_line_idx = Some(start_pos + offset);
778 marker_end_in_inner = pos + marker.len();
779 break;
780 }
781 }
782 let close_line_idx = close_line_idx?;
783 let close_line = lines[close_line_idx];
784 let close_inner = if bq_depth > 0 {
785 strip_n_blockquote_markers(close_line, bq_depth)
786 } else {
787 close_line
788 };
789 let close_prefix_len = close_line.len() - close_inner.len();
790 let trailing = &close_inner[marker_end_in_inner..];
791
792 // Only fire when there is non-whitespace content AFTER the close
793 // marker on the close line. The legacy path correctly handles
794 // the close-line-ends-at-close-marker shapes (`-->\n` followed
795 // by separate blocks); only the same-line-trailing case needs
796 // structural splitting. Trailing-whitespace-only handling
797 // (`--> \n`) is a projector-side trim — separate concern.
798 let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
799 if !has_non_ws_trailing {
800 return None;
801 }
802
803 builder.start_node(wrapper_kind.into());
804
805 // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
806 // middle `HTML_BLOCK_CONTENT` lines between open and close. The
807 // close `HTML_BLOCK_TAG` carries only the bytes up to and
808 // including the close marker — trailing bytes go to the sibling.
809 if close_line_idx == start_pos {
810 // Same-line shape: one HTML_BLOCK_TAG containing the close
811 // marker's bytes. The newline lives on the trailing sibling.
812 // Line 0's bq prefix (if any) was already emitted by the
813 // outer dispatcher; emit only the inner marker bytes.
814 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
815 let close_part = &close_inner[..marker_end_in_inner];
816 if !close_part.is_empty() {
817 builder.token(SyntaxKind::TEXT.into(), close_part);
818 }
819 builder.finish_node();
820 } else {
821 // Multi-line shape: open tag covers lines[start_pos..close],
822 // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
823 // only the marker bytes. Line 0's bq prefix was emitted by
824 // the outer dispatcher; subsequent lines (middle + close)
825 // need bq prefix re-emission inside the wrapper.
826 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
827 let first_line = lines[start_pos];
828 let first_inner = if bq_depth > 0 {
829 strip_n_blockquote_markers(first_line, bq_depth)
830 } else {
831 first_line
832 };
833 let (line_no_nl, nl) = strip_newline(first_inner);
834 if !line_no_nl.is_empty() {
835 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
836 }
837 if !nl.is_empty() {
838 builder.token(SyntaxKind::NEWLINE.into(), nl);
839 }
840 builder.finish_node();
841
842 if close_line_idx > start_pos + 1 {
843 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
844 for content_line in &lines[start_pos + 1..close_line_idx] {
845 emit_html_block_line(builder, content_line, bq_depth);
846 }
847 builder.finish_node();
848 }
849
850 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
851 if bq_depth > 0 && close_prefix_len > 0 {
852 emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
853 }
854 let close_part = &close_inner[..marker_end_in_inner];
855 if !close_part.is_empty() {
856 builder.token(SyntaxKind::TEXT.into(), close_part);
857 }
858 builder.finish_node();
859 }
860
861 builder.finish_node(); // HTML_BLOCK
862
863 // Recursively parse JUST the trailing bytes on the close line
864 // and graft top-level children as siblings of the HTML_BLOCK we
865 // just closed. We do NOT consume subsequent lines here — the
866 // outer dispatcher continues from `close_line_idx + 1` and
867 // handles container-boundary lines (`:::` div closes, blockquote
868 // markers, list-marker continuations) correctly. Multi-line
869 // softbreak continuation (`<!-- --> trailing\nmore\n` →
870 // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
871 // outer dispatcher sees `more` after the close line and starts
872 // a fresh paragraph. Refdefs flow through from the outer config
873 // (same pattern as `emit_html_block_body_lifted_inner`).
874 if !trailing.is_empty() {
875 let mut inner_options = config.clone();
876 let refdefs = config.refdef_labels.clone().unwrap_or_default();
877 inner_options.refdef_labels = Some(refdefs.clone());
878 let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
879 let mut bq = None;
880 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
881 }
882
883 Some(close_line_idx + 1)
884}
885
886/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
887/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
888/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
889/// identical to the source either way; only the wrapper retag changes.
890pub(crate) fn parse_html_block_with_wrapper(
891 builder: &mut GreenNodeBuilder<'static>,
892 lines: &[&str],
893 start_pos: usize,
894 block_type: HtmlBlockType,
895 prefix: &ContainerPrefix,
896 wrapper_kind: SyntaxKind,
897 config: &ParserOptions,
898) -> usize {
899 let bq_depth = prefix.bq_depth();
900 // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
901 // closes the RawBlock at the close marker (`-->` / `?>`) and parses
902 // any subsequent bytes (same-line trailing or following lines) as
903 // fresh blocks. The legacy path absorbs them into the HTML block
904 // wrapper, producing one oversized RawBlock. Handle the split here
905 // before entering the legacy emission so the CST encodes the
906 // sibling structure.
907 if config.dialect == crate::options::Dialect::Pandoc
908 && matches!(
909 block_type,
910 HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
911 )
912 && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
913 builder,
914 lines,
915 start_pos,
916 &block_type,
917 wrapper_kind,
918 bq_depth,
919 config,
920 )
921 {
922 return consumed;
923 }
924
925 // Start HTML block
926 builder.start_node(wrapper_kind.into());
927
928 let first_line = lines[start_pos];
929 let blank_terminated = ends_at_blank_line(&block_type);
930
931 // The block dispatcher has already emitted the bq prefix tokens for
932 // the first line; emit only the inner content as TEXT to keep the
933 // CST byte-equal to the source. List-marker bytes are stripped only
934 // when this dispatch fires on a list-marker line — for
935 // continuation-line dispatches (the much more common case) the
936 // leading indent is inner content, not upstream-emitted prefix.
937 let first_inner = prefix.strip_line_0_for_emission(first_line);
938
939 // Detect a multi-line open tag.
940 // - `<div>` (Pandoc lift): we tokenize each line structurally so the
941 // salsa anchor walk picks up `id` from the HTML_ATTRS region.
942 // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
943 // `<section>`, `<header>`, …): same structural emission, exposing
944 // `id` to the salsa anchor walk and enabling the body lift below.
945 // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
946 // without this, the parser closes the block after line 0 and the
947 // remainder of the open tag falls into following paragraphs;
948 // pandoc-native treats the whole multi-line open tag as a single
949 // `RawBlock`. Emission for void tags uses simple per-line
950 // TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
951 // from void tags).
952 let multiline_open_end = match (wrapper_kind, &block_type) {
953 (SyntaxKind::HTML_BLOCK_DIV, _) => {
954 find_multiline_open_end(lines, start_pos, first_inner, "div", prefix)
955 }
956 (
957 _,
958 HtmlBlockType::BlockTag {
959 tag_name,
960 closes_at_open_tag: true,
961 ..
962 },
963 ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix),
964 (
965 _,
966 HtmlBlockType::BlockTag {
967 tag_name,
968 is_verbatim: false,
969 closed_by_blank_line: false,
970 depth_aware: true,
971 closes_at_open_tag: false,
972 is_closing: false,
973 },
974 ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
975 find_multiline_open_end(lines, start_pos, first_inner, tag_name, prefix)
976 }
977 _ => None,
978 };
979
980 // Set up depth-aware close tracking when the block type asks for it
981 // (Pandoc dialect, balanced same-name tag matching). A `None` means
982 // we fall back to the legacy "first matching close" path via
983 // `is_closing_marker`. Computed up front so the lift-mode gate
984 // below can decide whether the open line already balances the
985 // block (same-line `<div>...</div>`).
986 let depth_aware_tag: Option<String> = match &block_type {
987 HtmlBlockType::BlockTag {
988 tag_name,
989 closed_by_blank_line: false,
990 depth_aware: true,
991 ..
992 } => Some(tag_name.clone()),
993 _ => None,
994 };
995 let mut depth: i64 = 1;
996 if let Some(tag_name) = &depth_aware_tag {
997 // Sum opens/closes across all open-tag lines (single-line: just
998 // line 0; multi-line: lines 0..=end_line_idx).
999 let last_open_line = multiline_open_end.unwrap_or(start_pos);
1000 let mut opens = 0usize;
1001 let mut closes = 0usize;
1002 for line in &lines[start_pos..=last_open_line] {
1003 let inner = prefix.strip(line);
1004 let (o, c) = count_tag_balance(inner, tag_name);
1005 opens += o;
1006 closes += c;
1007 }
1008 depth = opens as i64 - closes as i64;
1009 }
1010
1011 // Same-line `<div>foo</div>` shape: the open line balances the
1012 // block under depth-aware tracking. We can lift this structurally
1013 // only when the open-tag trailing has exactly one `</div>` close,
1014 // zero `<div>` opens, and no non-whitespace content after the
1015 // close. Other same-line shapes (nested, trailing text, malformed)
1016 // fall through to the byte-reparse path.
1017 let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1018 && multiline_open_end.is_none()
1019 && depth_aware_tag.is_some()
1020 && depth <= 0;
1021 let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1022 let (line_without_newline, _) = strip_newline(first_inner);
1023 probe_same_line_lift(line_without_newline, "div")
1024 };
1025
1026 // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1027 // `<nav>`, …): the body parses as fresh markdown between RawBlock
1028 // emissions of the open/close tags. Covers the clean multi-line
1029 // shape (open tag stands alone on its line), open-trailing
1030 // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1031 // and same-line (`<form>foo</form>`). This gate is non-bq only
1032 // (`bq_depth == 0`); blockquote-wrapped shapes go through the
1033 // separate bq lift gates below.
1034 let strict_block_tag_name: Option<&str> =
1035 if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1036 match &block_type {
1037 HtmlBlockType::BlockTag {
1038 tag_name,
1039 is_verbatim: false,
1040 closed_by_blank_line: false,
1041 depth_aware: true,
1042 closes_at_open_tag: false,
1043 is_closing: false,
1044 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1045 _ => None,
1046 }
1047 } else {
1048 None
1049 };
1050 // Same-line `<form>foo</form>` shape: the open line already
1051 // balances the block (`depth <= 0`). Lift only when the trailing
1052 // bytes after the open `>` end with `</tag>` and contain exactly
1053 // one close + zero nested opens.
1054 let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1055 multiline_open_end.is_none() && depth <= 0 && {
1056 let (line_no_nl, _) = strip_newline(first_inner);
1057 probe_same_line_lift(line_no_nl, name)
1058 }
1059 });
1060 // Strict-block lift gate: accept (a) a multi-line open tag spanning
1061 // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1062 // trailing single-line open (depth > 0, open `>` is present with
1063 // quote-aware matching), or (c) a safe same-line shape. For
1064 // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1065 // …) the lift additionally abandons when the body starts at a
1066 // fresh-block position with a void block tag — pandoc-native pins
1067 // per-tag emission rather than a matched-pair lift in that case.
1068 let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1069 let (line_no_nl, _) = strip_newline(first_inner);
1070 let shape_ok = if multiline_open_end.is_some() {
1071 // `find_multiline_open_end` already verified the open tag
1072 // closes with a quote-aware `>` somewhere in lines
1073 // `start_pos+1..=end`. No same-line trailing content to
1074 // probe; defer trailing-on-close-`>`-line handling to a
1075 // future session (rare in practice).
1076 true
1077 } else if depth > 0 {
1078 probe_open_tag_line_has_close_gt(line_no_nl, name)
1079 } else {
1080 same_line_strict_lift_safe
1081 };
1082 if !shape_ok {
1083 return false;
1084 }
1085 if !is_pandoc_inline_block_tag_name(name) {
1086 return true;
1087 }
1088 !inline_block_void_interior_abandons(
1089 first_inner,
1090 lines,
1091 start_pos,
1092 multiline_open_end,
1093 bq_depth,
1094 name,
1095 )
1096 });
1097
1098 // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1099 // are byte-equal to the non-bq same-line shape minus the leading
1100 // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1101 // The body has no inner newlines, so no bq prefix re-injection is
1102 // needed when grafting — `emit_html_block_body_lifted` (passing
1103 // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1104 // open-trailing) need per-line prefix injection and are handled
1105 // by the messy-shape gate (`bq_messy_lift_tag`) below.
1106 let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1107 && multiline_open_end.is_none()
1108 && depth_aware_tag.is_some()
1109 && depth <= 0
1110 {
1111 let (line_no_nl, _) = strip_newline(first_inner);
1112 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1113 if probe_same_line_lift(line_no_nl, "div") {
1114 Some("div")
1115 } else {
1116 None
1117 }
1118 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1119 match &block_type {
1120 HtmlBlockType::BlockTag {
1121 tag_name,
1122 is_verbatim: false,
1123 closed_by_blank_line: false,
1124 depth_aware: true,
1125 closes_at_open_tag: false,
1126 is_closing: false,
1127 } if is_pandoc_lift_eligible_block_tag(tag_name)
1128 && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1129 {
1130 // Inline-block tags (`<video>`, `<iframe>`, …) skip
1131 // the void-interior check at same-line — the shape
1132 // has no inner block content to interfere with.
1133 Some(tag_name.as_str())
1134 }
1135 _ => None,
1136 }
1137 } else {
1138 None
1139 }
1140 } else {
1141 None
1142 };
1143
1144 // Messy-shape lift inside a blockquote — covers open-trailing
1145 // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1146 // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1147 // including the multi-line-open variants (`> <div\n> id="x">foo\n>
1148 // body\n> </div>`) where the trailing is captured into `pre_content`
1149 // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1150 // The open line does NOT balance the block (depth > 0 after the
1151 // open line, distinguishing this from `same_line_bq_lift_tag` which
1152 // requires depth <= 0). The close line — possibly with leading body
1153 // text — closes the block when depth returns to 0. Body lines (incl.
1154 // open trailing and close leading) graft via prefix re-injection.
1155 let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1156 {
1157 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1158 Some("div")
1159 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1160 match &block_type {
1161 HtmlBlockType::BlockTag {
1162 tag_name,
1163 is_verbatim: false,
1164 closed_by_blank_line: false,
1165 depth_aware: true,
1166 closes_at_open_tag: false,
1167 is_closing: false,
1168 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1169 // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1170 // …) abandon the lift when the body starts at a
1171 // fresh-block position with a void block tag. Same gate
1172 // as the non-bq matched-pair lift (`strict_block_lift`).
1173 if is_pandoc_inline_block_tag_name(tag_name)
1174 && inline_block_void_interior_abandons(
1175 first_inner,
1176 lines,
1177 start_pos,
1178 multiline_open_end,
1179 bq_depth,
1180 tag_name,
1181 )
1182 {
1183 None
1184 } else {
1185 Some(tag_name.as_str())
1186 }
1187 }
1188 _ => None,
1189 }
1190 } else {
1191 None
1192 }
1193 } else {
1194 None
1195 };
1196
1197 // Multi-line open + matched close-on-the-open's-last-line shape inside
1198 // a blockquote (`> <div\n> id="x">foo</div>` and depth-aware variants:
1199 // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1200 // Mirrors the non-bq `pre_content`-close branch (further below) but inside
1201 // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1202 // close is on the open's last line (`depth <= 0` after the open lines)
1203 // AND `multiline_open_end.is_some()`. The trailing bytes after the
1204 // last `>` get lifted into `pre_content` via
1205 // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1206 // new branch below splits `pre_content` at the matched close marker
1207 // and grafts body + close + any trailing siblings.
1208 let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1209 && multiline_open_end.is_some()
1210 && depth_aware_tag.is_some()
1211 && depth <= 0
1212 {
1213 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1214 Some("div")
1215 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1216 match &block_type {
1217 HtmlBlockType::BlockTag {
1218 tag_name,
1219 is_verbatim: false,
1220 closed_by_blank_line: false,
1221 depth_aware: true,
1222 closes_at_open_tag: false,
1223 is_closing: false,
1224 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1225 if is_pandoc_inline_block_tag_name(tag_name)
1226 && inline_block_void_interior_abandons(
1227 first_inner,
1228 lines,
1229 start_pos,
1230 multiline_open_end,
1231 bq_depth,
1232 tag_name,
1233 )
1234 {
1235 None
1236 } else {
1237 Some(tag_name.as_str())
1238 }
1239 }
1240 _ => None,
1241 }
1242 } else {
1243 None
1244 }
1245 } else {
1246 None
1247 };
1248
1249 // Whether this block participates in the Phase 6 structural lift
1250 // (recursively parse body as Pandoc markdown and graft children).
1251 // Covers `<div>` outside blockquote context, the strict-block lift,
1252 // and the three bq lift gates above. Same-line shapes are gated on
1253 // `same_line_*_lift_safe` — when unsafe we keep the legacy
1254 // single-HTML_BLOCK_TAG shape; the byte-reparse path handles projection.
1255 let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1256 && bq_depth == 0
1257 && (!is_same_line_div || same_line_div_lift_safe))
1258 || strict_block_lift
1259 || same_line_bq_lift_tag.is_some()
1260 || bq_messy_lift_tag.is_some()
1261 || bq_multiline_close_lift_tag.is_some();
1262
1263 // Trailing content from the open tag (after `>`). When the lift is
1264 // active and the open line is `<div ATTRS>foo\n`, this captures
1265 // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1266 // input. Stays empty for clean opens (`<div>\n`) and for shapes
1267 // that are not lifted.
1268 let mut pre_content = String::new();
1269
1270 // Emit opening line(s)
1271 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1272
1273 if let Some(end_line_idx) = multiline_open_end {
1274 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1275 emit_multiline_open_tag_with_attrs(
1276 builder,
1277 lines,
1278 start_pos,
1279 end_line_idx,
1280 "div",
1281 bq_depth,
1282 lift_mode,
1283 &mut pre_content,
1284 );
1285 } else if let Some(name) = strict_block_tag_name
1286 && strict_block_lift
1287 {
1288 emit_multiline_open_tag_with_attrs(
1289 builder,
1290 lines,
1291 start_pos,
1292 end_line_idx,
1293 name,
1294 bq_depth,
1295 lift_mode,
1296 &mut pre_content,
1297 );
1298 } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1299 {
1300 // Multi-line open of a lift-eligible strict-block tag inside a
1301 // blockquote (`> <section\n> id=...>`). The non-bq
1302 // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1303 // covers the bq side so the open tag emits HTML_ATTRS regions
1304 // for `AttributeNode::cast` and the projector's canonicalizer.
1305 //
1306 // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1307 // call below: only push trailing bytes into `pre_content` when
1308 // a structural lift will consume them (bq messy lift or bq
1309 // multi-line-close lift). The bq clean-lift requires
1310 // `pre_content.is_empty()`, so for clean multi-line opens the
1311 // trailing is empty anyway and this is a no-op.
1312 let lift_trailing =
1313 bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1314 emit_multiline_open_tag_with_attrs(
1315 builder,
1316 lines,
1317 start_pos,
1318 end_line_idx,
1319 name,
1320 bq_depth,
1321 lift_trailing,
1322 &mut pre_content,
1323 );
1324 } else {
1325 emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1326 }
1327 } else {
1328 let (line_without_newline, newline_str) = strip_newline(first_inner);
1329 if !line_without_newline.is_empty() {
1330 // For HTML_BLOCK_DIV, expose the open tag's attributes
1331 // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1332 // via the same descendants walk that handles fenced-div /
1333 // heading attrs. CST bytes stay byte-equal to source — we only
1334 // tokenize at finer granularity for matched div opens.
1335 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1336 let trailing =
1337 emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1338 if !trailing.is_empty() {
1339 pre_content.push_str(trailing);
1340 pre_content.push_str(newline_str);
1341 }
1342 } else if let Some(name) = strict_block_tag_name
1343 && strict_block_lift
1344 {
1345 let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1346 if !trailing.is_empty() {
1347 pre_content.push_str(trailing);
1348 pre_content.push_str(newline_str);
1349 }
1350 } else if let Some(name) =
1351 bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1352 {
1353 // Inside a blockquote, lift trailing bytes into
1354 // `pre_content` when either the same-line bq gate fires
1355 // (`> <tag>body</tag>` — handled by `same_line_closed`)
1356 // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1357 // </tag>` and butted-close — handled at the close-marker
1358 // site below). For the clean-shape bq lift the open has
1359 // no trailing bytes regardless, so `lift_trailing=true`
1360 // is a no-op there.
1361 let lift_trailing =
1362 same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1363 let trailing =
1364 emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1365 if lift_trailing && !trailing.is_empty() {
1366 pre_content.push_str(trailing);
1367 pre_content.push_str(newline_str);
1368 }
1369 } else {
1370 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1371 }
1372 }
1373 // When the open tag has trailing content under lift mode, the
1374 // newline belongs to that trailing line (it terminates the
1375 // synthetic body line, not the open tag). Don't double-emit.
1376 if pre_content.is_empty() && !newline_str.is_empty() {
1377 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1378 }
1379 }
1380
1381 builder.finish_node(); // HtmlBlockTag
1382
1383 // Check if opening line also contains closing marker. Blank-line-terminated
1384 // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1385 // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1386 // (`closes_at_open_tag: true`) close immediately — the block always
1387 // ends on the open-tag line since there is no closing tag to find.
1388 let void_block = matches!(
1389 &block_type,
1390 HtmlBlockType::BlockTag {
1391 closes_at_open_tag: true,
1392 ..
1393 }
1394 );
1395 // Void tags with a multi-line open close immediately after the open
1396 // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1397 // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1398 // a single RawBlock for the whole multi-line tag, with no following
1399 // content.
1400 if void_block && let Some(end_line_idx) = multiline_open_end {
1401 log::trace!(
1402 "HTML void block at line {} closes after multi-line open ending at line {}",
1403 start_pos + 1,
1404 end_line_idx + 1
1405 );
1406 builder.finish_node(); // HtmlBlock
1407 return end_line_idx + 1;
1408 }
1409 // Multi-line open with all matched closes on the open's last line:
1410 // `pre_content` holds the bytes after the last open `>` (lifted there
1411 // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1412 // When `depth <= 0` after the multi-line open and the trailing bytes
1413 // contain the depth-zero matched close, do the same-line lift on
1414 // `pre_content` directly. Mirrors the single-line `same_line_closed`
1415 // lift below — same body / close-marker / trailing-graft shape, just
1416 // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1417 //
1418 // The body bytes of `pre_content` come from the open's last line,
1419 // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1420 // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1421 // tag thus inherit the bq context without per-line prefix injection,
1422 // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1423 // both the non-bq and bq variants of this shape.
1424 if let Some(end_line_idx) = multiline_open_end
1425 && !blank_terminated
1426 && depth_aware_tag.is_some()
1427 && depth <= 0
1428 && lift_mode
1429 && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1430 && !pre_content.is_empty()
1431 {
1432 let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1433 Some("div")
1434 } else if strict_block_lift {
1435 strict_block_tag_name
1436 } else if let Some(name) = bq_multiline_close_lift_tag {
1437 Some(name)
1438 } else {
1439 None
1440 };
1441 if let Some(tag_name) = tag_name_opt {
1442 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1443 if let Some((leading, close_part)) =
1444 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1445 {
1446 let close_marker_end =
1447 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1448 let close_marker = &close_part[..close_marker_end];
1449 let same_line_trailing = &close_part[close_marker_end..];
1450 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1451 LastParaDemote::SkipTrailingBlanks
1452 } else {
1453 LastParaDemote::OnlyIfLast
1454 };
1455 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1456 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1457 if same_line_trailing.is_empty() {
1458 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1459 close_line.push_str(close_marker);
1460 close_line.push_str(post_nl);
1461 emit_html_block_line(builder, &close_line, 0);
1462 builder.finish_node();
1463 builder.finish_node(); // HtmlBlock
1464 } else {
1465 builder.token(SyntaxKind::TEXT.into(), close_marker);
1466 builder.finish_node(); // HTML_BLOCK_TAG
1467 builder.finish_node(); // HtmlBlock
1468
1469 let mut trailing_text =
1470 String::with_capacity(same_line_trailing.len() + post_nl.len());
1471 trailing_text.push_str(same_line_trailing);
1472 trailing_text.push_str(post_nl);
1473 let mut inner_options = config.clone();
1474 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1475 inner_options.refdef_labels = Some(refdefs.clone());
1476 let inner_root = crate::parser::parse_with_refdefs(
1477 &trailing_text,
1478 Some(inner_options),
1479 refdefs,
1480 );
1481 let mut bq = None;
1482 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1483 }
1484 return end_line_idx + 1;
1485 }
1486 }
1487 }
1488
1489 let same_line_closed = !blank_terminated
1490 && multiline_open_end.is_none()
1491 && (void_block
1492 || match &depth_aware_tag {
1493 Some(_) => depth <= 0,
1494 None => is_closing_marker(first_inner, &block_type),
1495 });
1496 if same_line_closed {
1497 log::trace!(
1498 "HTML block at line {} opens and closes on same line",
1499 start_pos + 1
1500 );
1501 // Same-line structural lift (div or non-div strict-block):
1502 // pre_content holds the bytes after the open `>` (including
1503 // the close `</tag>` and the trailing newline). Split into
1504 // body + close tag, emit body via recursive parse, emit close
1505 // tag as a sibling `HTML_BLOCK_TAG`.
1506 let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1507 None
1508 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1509 Some("div")
1510 } else if same_line_strict_lift_safe {
1511 strict_block_tag_name
1512 } else if let Some(name) = same_line_bq_lift_tag {
1513 // Bq same-line: body has no inner newlines so the standard
1514 // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1515 // sufficient. The bq prefix `> ` lives on the outer
1516 // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1517 Some(name)
1518 } else {
1519 None
1520 };
1521 if let Some(tag_name) = same_line_lift_tag {
1522 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1523 // Depth-aware split: handles `<tag>foo</tag>bar` (single
1524 // close, trailing text), `<tag>foo</tag></tag>` (matched
1525 // close + unmatched trailing close → sibling RawBlock),
1526 // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1527 // recursive body parse).
1528 if let Some((leading, close_part)) =
1529 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1530 {
1531 // `close_part` starts with `</tag` and contains the close
1532 // marker followed by any same-line trailing text. Split
1533 // off the close marker bytes (`</tag>`) so the close
1534 // `HTML_BLOCK_TAG` carries only those bytes; trailing
1535 // text is parsed and grafted as a sibling block at the
1536 // parent level (matches pandoc-native shape:
1537 // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1538 let close_marker_end =
1539 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1540 let close_marker = &close_part[..close_marker_end];
1541 let same_line_trailing = &close_part[close_marker_end..];
1542
1543 // Same-line is always close-butted; div demotes the
1544 // trailing Para→Plain via `SkipTrailingBlanks`.
1545 // Non-div strict-block uses `OnlyIfLast` (consistent
1546 // with butted-close — no trailing BLANK_LINE before
1547 // the close means the trailing Para demotes).
1548 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1549 LastParaDemote::SkipTrailingBlanks
1550 } else {
1551 LastParaDemote::OnlyIfLast
1552 };
1553 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1554 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1555 if same_line_trailing.is_empty() {
1556 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1557 close_line.push_str(close_marker);
1558 close_line.push_str(post_nl);
1559 emit_html_block_line(builder, &close_line, 0);
1560 builder.finish_node();
1561 builder.finish_node(); // HtmlBlock
1562 } else {
1563 // Close tag holds only the close-marker bytes;
1564 // trailing + newline graft as siblings of the
1565 // wrapper (matches pandoc's per-tag block split).
1566 builder.token(SyntaxKind::TEXT.into(), close_marker);
1567 builder.finish_node(); // HTML_BLOCK_TAG
1568 builder.finish_node(); // HtmlBlock
1569
1570 let mut trailing_text =
1571 String::with_capacity(same_line_trailing.len() + post_nl.len());
1572 trailing_text.push_str(same_line_trailing);
1573 trailing_text.push_str(post_nl);
1574 let mut inner_options = config.clone();
1575 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1576 inner_options.refdef_labels = Some(refdefs.clone());
1577 let inner_root = crate::parser::parse_with_refdefs(
1578 &trailing_text,
1579 Some(inner_options),
1580 refdefs,
1581 );
1582 let mut bq = None;
1583 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1584 }
1585 return start_pos + 1;
1586 }
1587 }
1588 builder.finish_node(); // HtmlBlock
1589 return start_pos + 1;
1590 }
1591
1592 let mut current_pos = multiline_open_end
1593 .map(|end| end + 1)
1594 .unwrap_or(start_pos + 1);
1595 let mut content_lines: Vec<&str> = Vec::new();
1596 let mut found_closing = false;
1597
1598 // Parse content until we find the closing marker
1599 while current_pos < lines.len() {
1600 let line = lines[current_pos];
1601 let (line_bq_depth, inner) = count_blockquote_markers(line);
1602
1603 // Only process lines at the same or deeper blockquote depth
1604 if line_bq_depth < bq_depth {
1605 break;
1606 }
1607
1608 // Blank-line-terminated blocks (types 6/7) end before the blank line.
1609 // The blank line itself is not part of the block.
1610 if blank_terminated && inner.trim().is_empty() {
1611 break;
1612 }
1613
1614 // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1615 // count opens/closes of the same tag name and only close when depth
1616 // returns to 0; otherwise fall back to substring-match on the line.
1617 let line_closes = match &depth_aware_tag {
1618 Some(tag_name) => {
1619 let (opens, closes) = count_tag_balance(inner, tag_name);
1620 depth += opens as i64;
1621 depth -= closes as i64;
1622 depth <= 0
1623 }
1624 None => is_closing_marker(inner, &block_type),
1625 };
1626
1627 if line_closes {
1628 log::trace!("Found HTML block closing at line {}", current_pos + 1);
1629 found_closing = true;
1630
1631 // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1632 // the open and close tags stand alone on their source lines
1633 // (no trailing on open, no body content on close after
1634 // stripping bq markers), lift the body lines structurally
1635 // so the projector walks CST children instead of
1636 // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1637 //
1638 // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1639 // grafted, Para preserved), non-div strict-block tags
1640 // (`<form>`, `<section>`, …) and inline-block matched-pair
1641 // tags (`<video>`, `<iframe>`, …) — the latter two under
1642 // HTML_BLOCK with the structural lift hitting pandoc's
1643 // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1644 // demotion. Inline-block additionally bails if the body
1645 // starts at a fresh-block position with a void block tag
1646 // (mirrors the non-bq matched-pair gate).
1647 //
1648 // Other bq-wrapped shapes (butted-close / open-trailing /
1649 // same-line) still fall through to the opaque path.
1650 // Multi-line opens are allowed here as of 2026-05-12: the
1651 // open `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS
1652 // per attr line and per-line bq prefix tokens) by the bq-aware
1653 // `emit_multiline_open_tag_with_attrs`. `pre_content` stays
1654 // empty for multi-line opens (the emitter writes any trailing
1655 // bytes on the last open line directly as TEXT inside
1656 // HTML_BLOCK_TAG, not into `pre_content`) — so multi-line +
1657 // trailing falls through to the opaque path, matching the non-
1658 // bq deferral.
1659 let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1660 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1661 Some("div")
1662 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1663 match &block_type {
1664 HtmlBlockType::BlockTag {
1665 tag_name,
1666 is_verbatim: false,
1667 closed_by_blank_line: false,
1668 depth_aware: true,
1669 closes_at_open_tag: false,
1670 is_closing: false,
1671 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1672 _ => None,
1673 }
1674 } else {
1675 None
1676 }
1677 } else {
1678 None
1679 };
1680
1681 let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1682 // Open-shape: last open line must end with `>` (clean
1683 // close-of-open). For single-line, that's `first_inner`
1684 // (already bq-stripped); for multi-line, strip bq markers
1685 // from `lines[end_line_idx]` and check the same.
1686 let last_open_line: &str = match multiline_open_end {
1687 None => first_inner,
1688 Some(end) if prefix.bq_depth() > 0 || prefix.list_content_col() > 0 => {
1689 prefix.strip(lines[end])
1690 }
1691 Some(end) => lines[end],
1692 };
1693 let (open_no_nl, _) = strip_newline(last_open_line);
1694 if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1695 return false;
1696 }
1697 let close_stripped = prefix.strip(line);
1698 let (close_no_nl, _) = strip_newline(close_stripped);
1699 if !close_no_nl
1700 .trim_start_matches([' ', '\t'])
1701 .starts_with("</")
1702 {
1703 return false;
1704 }
1705 if is_pandoc_inline_block_tag_name(tag_name)
1706 && inline_block_void_interior_abandons(
1707 first_inner,
1708 lines,
1709 start_pos,
1710 multiline_open_end,
1711 bq_depth,
1712 tag_name,
1713 )
1714 {
1715 return false;
1716 }
1717 true
1718 });
1719
1720 if bq_clean_lift {
1721 let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1722 LastParaDemote::Never
1723 } else {
1724 LastParaDemote::OnlyIfLast
1725 };
1726 emit_html_block_body_lifted_bq(
1727 builder,
1728 &content_lines,
1729 prefix,
1730 demote_policy,
1731 config,
1732 );
1733 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1734 emit_html_block_line(builder, line, bq_depth);
1735 builder.finish_node();
1736 current_pos += 1;
1737 break;
1738 }
1739
1740 // Bq messy-shape lift — single-line open with trailing or
1741 // butted-close (or both). `pre_content` already captures any
1742 // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1743 // strip the close line's bq markers before splitting so
1744 // `leading` and `close_part` are bq-prefix-free. Body parses
1745 // recursively from `pre_content + stripped(content_lines) +
1746 // leading`, with per-line bq prefixes re-injected so the CST
1747 // stays byte-equal to the source. Demote: div is keyed on
1748 // close-butted-ness (Plain when leading non-empty, Para
1749 // otherwise); non-div uses OnlyIfLast either way.
1750 if let Some(tag_name) = bq_messy_lift_tag {
1751 let close_stripped = prefix.strip(line);
1752 let close_prefix_len = line.len() - close_stripped.len();
1753 let close_prefix = &line[..close_prefix_len];
1754 if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1755 {
1756 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1757 if leading.is_empty() {
1758 LastParaDemote::Never
1759 } else {
1760 LastParaDemote::SkipTrailingBlanks
1761 }
1762 } else {
1763 LastParaDemote::OnlyIfLast
1764 };
1765 emit_html_block_body_lifted_bq_messy(
1766 builder,
1767 &pre_content,
1768 &content_lines,
1769 leading,
1770 close_prefix,
1771 prefix,
1772 policy,
1773 config,
1774 );
1775 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1776 // When `leading` is empty, no recursive-parse output carries
1777 // the close line's bq prefix, so emit it here before the
1778 // close tag. When `leading` is non-empty,
1779 // `emit_html_block_body_lifted_bq_messy` already injected
1780 // the prefix at the start of the leading bytes (via the
1781 // BqPrefixState entry); emitting again would double the
1782 // prefix bytes and break losslessness.
1783 if leading.is_empty() {
1784 emit_bq_prefix_tokens(builder, close_prefix);
1785 }
1786 emit_html_block_line(builder, close_part, 0);
1787 builder.finish_node();
1788 current_pos += 1;
1789 break;
1790 }
1791 }
1792
1793 // Under lift mode, try to split the close line into a
1794 // leading "body content" prefix and the close-marker
1795 // remainder using depth-aware matching. Walks at depth 1
1796 // (we're inside the open tag) so nested same-tag opens
1797 // (e.g. `<inner></inner></tag>` style with a nested div)
1798 // are absorbed into the body and parsed recursively, and
1799 // multi-close shapes (`foo</div></div>` on the close line)
1800 // peel off the matched-pair close — the unmatched
1801 // trailing close projects as a sibling `RawBlock` per
1802 // pandoc-native. For `<div>`, non-empty `leading`
1803 // propagates pandoc's `markdown_in_html_blocks` Plain
1804 // demotion rule. For non-div strict-block tags, demotion
1805 // follows pandoc's `OnlyIfLast` rule (demote the trailing
1806 // Para only when no blank line precedes the close).
1807 let close_split_tag = if lift_mode {
1808 if strict_block_lift {
1809 strict_block_tag_name
1810 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1811 Some("div")
1812 } else {
1813 None
1814 }
1815 } else {
1816 None
1817 };
1818 let (close_no_nl, close_post_nl) = strip_newline(line);
1819 let close_split = close_split_tag
1820 .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1821
1822 if let Some((leading, close_part)) = close_split {
1823 // Close-line leading that is whitespace-only is close-tag
1824 // indentation, not body content (pandoc-native strips it
1825 // from the close RawBlock and treats the close as butted —
1826 // see ` </tag>` shapes). Route those bytes into the
1827 // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1828 // projector strips them; keep the demote policy keyed on
1829 // the original leading so butted-close detection (Plain
1830 // demotion for div, OnlyIfLast for non-div) still fires.
1831 let leading_is_ws_only =
1832 !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1833 let body_leading = if leading_is_ws_only { "" } else { leading };
1834 let policy = if strict_block_lift {
1835 LastParaDemote::OnlyIfLast
1836 } else if !leading.is_empty() {
1837 LastParaDemote::SkipTrailingBlanks
1838 } else {
1839 LastParaDemote::Never
1840 };
1841 // Split close_part into close-marker bytes (`</tag>`)
1842 // and trailing bytes (e.g. an extra `</div>` for the
1843 // double-close case, or `bar` for trailing text after
1844 // a normal close). Trailing bytes are recursively
1845 // parsed and grafted as siblings of the HTML_BLOCK_DIV
1846 // wrapper.
1847 let close_tag_name = close_split_tag.expect("close_split_tag present");
1848 let close_marker_end =
1849 split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1850 let close_marker = &close_part[..close_marker_end];
1851 let close_trailing = &close_part[close_marker_end..];
1852
1853 emit_html_block_body_lifted(
1854 builder,
1855 &pre_content,
1856 &content_lines,
1857 body_leading,
1858 policy,
1859 config,
1860 );
1861 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1862 if leading_is_ws_only {
1863 builder.token(SyntaxKind::WHITESPACE.into(), leading);
1864 }
1865 if close_trailing.is_empty() {
1866 let mut close_line =
1867 String::with_capacity(close_marker.len() + close_post_nl.len());
1868 close_line.push_str(close_marker);
1869 close_line.push_str(close_post_nl);
1870 emit_html_block_line(builder, &close_line, 0);
1871 builder.finish_node();
1872 } else {
1873 // Close tag holds only the close-marker bytes;
1874 // trailing + newline graft as siblings.
1875 builder.token(SyntaxKind::TEXT.into(), close_marker);
1876 builder.finish_node(); // HTML_BLOCK_TAG
1877 builder.finish_node(); // HtmlBlock
1878
1879 let mut trailing_text =
1880 String::with_capacity(close_trailing.len() + close_post_nl.len());
1881 trailing_text.push_str(close_trailing);
1882 trailing_text.push_str(close_post_nl);
1883 let mut inner_options = config.clone();
1884 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1885 inner_options.refdef_labels = Some(refdefs.clone());
1886 let inner_root = crate::parser::parse_with_refdefs(
1887 &trailing_text,
1888 Some(inner_options),
1889 refdefs,
1890 );
1891 let mut bq = None;
1892 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1893 current_pos += 1;
1894 return current_pos;
1895 }
1896 } else {
1897 emit_html_block_body(
1898 builder,
1899 &pre_content,
1900 &content_lines,
1901 bq_depth,
1902 wrapper_kind,
1903 lift_mode,
1904 config,
1905 );
1906 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1907 emit_html_block_line(builder, line, bq_depth);
1908 builder.finish_node();
1909 }
1910
1911 current_pos += 1;
1912 break;
1913 }
1914
1915 // Regular content line
1916 content_lines.push(line);
1917 current_pos += 1;
1918 }
1919
1920 // If we didn't find a closing marker, emit what we collected
1921 if !found_closing {
1922 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1923 emit_html_block_body(
1924 builder,
1925 &pre_content,
1926 &content_lines,
1927 bq_depth,
1928 wrapper_kind,
1929 lift_mode,
1930 config,
1931 );
1932 }
1933
1934 builder.finish_node(); // HtmlBlock
1935 current_pos
1936}
1937
1938/// Emit the collected inner content lines for an HTML block.
1939///
1940/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1941/// line `<div>` open outside blockquote), recursively parse the inner
1942/// content (including any open-tag trailing) as Pandoc-flavored
1943/// markdown and graft the resulting top-level blocks as direct children
1944/// of the wrapper. This is the Phase 6 structural lift — the projector
1945/// and downstream consumers (linter, salsa, LSP) can walk the
1946/// structural children instead of re-tokenizing the body bytes.
1947///
1948/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1949/// blockquote, multi-line open, or no content at all — fall through to
1950/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1951///
1952/// CST bytes remain byte-identical to source: the recursive parser is
1953/// lossless on the same byte slice the legacy path would have captured
1954/// as TEXT.
1955fn emit_html_block_body(
1956 builder: &mut GreenNodeBuilder<'static>,
1957 pre_content: &str,
1958 content_lines: &[&str],
1959 bq_depth: usize,
1960 wrapper_kind: SyntaxKind,
1961 lift_mode: bool,
1962 config: &ParserOptions,
1963) {
1964 if pre_content.is_empty() && content_lines.is_empty() {
1965 return;
1966 }
1967 if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1968 // Reached when the parser walked to end-of-input without finding
1969 // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1970 emit_html_block_body_lifted(
1971 builder,
1972 pre_content,
1973 content_lines,
1974 "",
1975 LastParaDemote::Never,
1976 config,
1977 );
1978 return;
1979 }
1980 // Legacy path: opaque TEXT capture. `pre_content` is always empty
1981 // here (lift_mode is the only path that populates it), but be
1982 // defensive — if a trailing prefix snuck in, emit it as TEXT so
1983 // bytes are preserved.
1984 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1985 if !pre_content.is_empty() {
1986 builder.token(SyntaxKind::TEXT.into(), pre_content);
1987 }
1988 for content_line in content_lines {
1989 emit_html_block_line(builder, content_line, bq_depth);
1990 }
1991 builder.finish_node();
1992}
1993
/// Policy deciding whether the trailing `PARAGRAPH` of a lifted
/// HTML-block body is retagged as `PLAIN` while its children are
/// grafted into the structural CST.
#[derive(Debug, Clone, Copy)]
enum LastParaDemote {
    /// Leave the trailing paragraph alone — pandoc keeps it as `Para`.
    Never,
    /// Look past any trailing `BLANK_LINE` children and demote the
    /// last remaining child if it is a `PARAGRAPH`. Applies to `<div>`
    /// shapes whose close tag is butted against the paragraph text on
    /// the same source line (pandoc's `markdown_in_html_blocks` Plain
    /// demotion).
    SkipTrailingBlanks,
    /// Demote only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` sits between it and the close
    /// tag. Applies to non-div strict-block tags, whose body is
    /// emitted at top level next to the close-tag `RawBlock`; pandoc
    /// turns that trailing `Para` into `Plain` unless a blank line
    /// separates the two.
    OnlyIfLast,
}
2013
2014/// Lift the HTML-block body into structural CST children: build the
2015/// inner text from `pre_content` + `content_lines` + `post_content`
2016/// (in order), recursively parse it as Pandoc-flavored markdown, and
2017/// graft the resulting top-level blocks into `builder`. `demote_policy`
2018/// controls whether the trailing paragraph is retagged as `PLAIN` to
2019/// encode pandoc's Plain/Para adjacency rules structurally.
2020fn emit_html_block_body_lifted(
2021 builder: &mut GreenNodeBuilder<'static>,
2022 pre_content: &str,
2023 content_lines: &[&str],
2024 post_content: &str,
2025 demote_policy: LastParaDemote,
2026 config: &ParserOptions,
2027) {
2028 emit_html_block_body_lifted_inner(
2029 builder,
2030 pre_content,
2031 content_lines,
2032 post_content,
2033 demote_policy,
2034 config,
2035 &mut None,
2036 )
2037}
2038
2039/// Body-lift variant for `<div>` inside a blockquote. Strips
2040/// `bq_depth` levels of blockquote markers from each `content_line`,
2041/// captures the per-line prefix bytes, and grafts the recursive parse
2042/// with prefix injection so the output CST stays byte-equal to the
2043/// source. `pre_content` and `post_content` must be empty (the bq
2044/// clean lift only handles the shape where the open and close tags
2045/// stand alone on their source lines).
2046fn emit_html_block_body_lifted_bq(
2047 builder: &mut GreenNodeBuilder<'static>,
2048 content_lines: &[&str],
2049 prefix: &ContainerPrefix,
2050 demote_policy: LastParaDemote,
2051 config: &ParserOptions,
2052) {
2053 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::with_capacity(content_lines.len());
2054 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2055 for cl in content_lines {
2056 let (li, bq, inner) = prefix.split(cl);
2057 prefix_lines.push(ContainerPrefixLine {
2058 list_indent: li.to_string(),
2059 bq_prefix: bq.to_string(),
2060 });
2061 stripped_lines.push(inner);
2062 }
2063 let mut state = ContainerPrefixState::new(prefix_lines);
2064 emit_html_block_body_lifted_inner(
2065 builder,
2066 "",
2067 &stripped_lines,
2068 "",
2069 demote_policy,
2070 config,
2071 &mut state,
2072 )
2073}
2074
2075/// Body-lift variant for the bq messy-shape lift — open-trailing,
2076/// butted-close, or both. The open-trailing bytes (if any) sit in
2077/// `pre_content` (line 0 of the body — no bq prefix in source because
2078/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2079/// each carry their own bq prefix. The close line's `leading` (body
2080/// bytes before `</tag>`) sits on the close line, prefixed in source
2081/// by `close_line_prefix` (the bq prefix captured from `line`).
2082///
2083/// Builds `prefixes` so each emitted line in the recursive parse
2084/// output gets the right per-line bq prefix re-injected at line start:
2085/// `pre_content` → empty prefix (no source `> ` precedes it); each
2086/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2087/// Result CST stays byte-equal to source.
2088#[allow(clippy::too_many_arguments)]
2089fn emit_html_block_body_lifted_bq_messy(
2090 builder: &mut GreenNodeBuilder<'static>,
2091 pre_content: &str,
2092 content_lines: &[&str],
2093 leading: &str,
2094 close_line_prefix: &str,
2095 prefix: &ContainerPrefix,
2096 demote_policy: LastParaDemote,
2097 config: &ParserOptions,
2098) {
2099 let mut prefix_lines: Vec<ContainerPrefixLine> = Vec::new();
2100 if !pre_content.is_empty() {
2101 prefix_lines.push(ContainerPrefixLine::default());
2102 }
2103 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2104 for cl in content_lines {
2105 let (li, bq, inner) = prefix.split(cl);
2106 prefix_lines.push(ContainerPrefixLine {
2107 list_indent: li.to_string(),
2108 bq_prefix: bq.to_string(),
2109 });
2110 stripped_lines.push(inner);
2111 }
2112 if !leading.is_empty() {
2113 // The close line carries its own captured prefix bytes; treat
2114 // them as bq-prefix only (no list-indent split applied) to keep
2115 // the legacy bq-only re-injection behavior for messy-shape
2116 // close-line lifts.
2117 prefix_lines.push(ContainerPrefixLine::bq_only(close_line_prefix.to_string()));
2118 }
2119 let mut state = ContainerPrefixState::new(prefix_lines);
2120 emit_html_block_body_lifted_inner(
2121 builder,
2122 pre_content,
2123 &stripped_lines,
2124 leading,
2125 demote_policy,
2126 config,
2127 &mut state,
2128 )
2129}
2130
2131fn emit_html_block_body_lifted_inner(
2132 builder: &mut GreenNodeBuilder<'static>,
2133 pre_content: &str,
2134 content_lines: &[&str],
2135 post_content: &str,
2136 demote_policy: LastParaDemote,
2137 config: &ParserOptions,
2138 bq: &mut Option<ContainerPrefixState>,
2139) {
2140 if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2141 return;
2142 }
2143 let mut inner_text = String::with_capacity(
2144 pre_content.len()
2145 + content_lines.iter().map(|s| s.len()).sum::<usize>()
2146 + post_content.len(),
2147 );
2148 inner_text.push_str(pre_content);
2149 for line in content_lines {
2150 inner_text.push_str(line);
2151 }
2152 inner_text.push_str(post_content);
2153
2154 let mut inner_options = config.clone();
2155 let refdefs = config.refdef_labels.clone().unwrap_or_default();
2156 inner_options.refdef_labels = Some(refdefs.clone());
2157 let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2158 graft_document_children(builder, &inner_root, demote_policy, bq);
2159}
2160
2161/// Walk a parsed inner document's top-level children and re-emit them
2162/// into `builder`. The document's wrapper node is skipped — only its
2163/// children are grafted.
2164///
2165/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2166/// as `PLAIN` — see [`LastParaDemote`].
2167///
2168/// `bq` is `Some` when grafting a body that lived inside an outer
2169/// container (blockquote, list-item, or both) — token emission then
2170/// injects the captured per-line prefix tokens at line starts so the
2171/// CST stays byte-equal to source. See
2172/// [`super::container_prefix::ContainerPrefixState`].
2173fn graft_document_children(
2174 builder: &mut GreenNodeBuilder<'static>,
2175 doc: &SyntaxNode,
2176 demote_policy: LastParaDemote,
2177 bq: &mut Option<ContainerPrefixState>,
2178) {
2179 let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2180
2181 let mut demote_idx: Option<usize> = None;
2182 match demote_policy {
2183 LastParaDemote::Never => {}
2184 LastParaDemote::SkipTrailingBlanks => {
2185 for (i, c) in children.iter().enumerate().rev() {
2186 if let rowan::NodeOrToken::Node(n) = c {
2187 if n.kind() == SyntaxKind::BLANK_LINE {
2188 continue;
2189 }
2190 if n.kind() == SyntaxKind::PARAGRAPH {
2191 demote_idx = Some(i);
2192 }
2193 break;
2194 }
2195 }
2196 }
2197 LastParaDemote::OnlyIfLast => {
2198 for (i, c) in children.iter().enumerate().rev() {
2199 if let rowan::NodeOrToken::Node(n) = c {
2200 if n.kind() == SyntaxKind::PARAGRAPH {
2201 demote_idx = Some(i);
2202 }
2203 break;
2204 }
2205 }
2206 }
2207 }
2208
2209 for (i, child) in children.into_iter().enumerate() {
2210 match child {
2211 rowan::NodeOrToken::Node(n) => {
2212 if Some(i) == demote_idx {
2213 graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2214 } else {
2215 graft_subtree(builder, &n, bq);
2216 }
2217 }
2218 rowan::NodeOrToken::Token(t) => {
2219 emit_grafted_token(builder, t.kind(), t.text(), bq);
2220 }
2221 }
2222 }
2223}
2224
2225/// Recursively re-emit `node` and its descendants into `builder`.
2226/// Token text is copied verbatim so the result is byte-identical to
2227/// the input span (modulo bq prefix tokens injected at line starts
2228/// when `bq` is `Some`).
2229fn graft_subtree(
2230 builder: &mut GreenNodeBuilder<'static>,
2231 node: &SyntaxNode,
2232 bq: &mut Option<ContainerPrefixState>,
2233) {
2234 graft_subtree_as(builder, node, node.kind(), bq);
2235}
2236
2237/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2238/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2239/// the close-butted demotion rule.
2240fn graft_subtree_as(
2241 builder: &mut GreenNodeBuilder<'static>,
2242 node: &SyntaxNode,
2243 kind: SyntaxKind,
2244 bq: &mut Option<ContainerPrefixState>,
2245) {
2246 builder.start_node(kind.into());
2247 for child in node.children_with_tokens() {
2248 match child {
2249 rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2250 rowan::NodeOrToken::Token(t) => {
2251 emit_grafted_token(builder, t.kind(), t.text(), bq);
2252 }
2253 }
2254 }
2255 builder.finish_node();
2256}
2257
2258/// Emit a single token while optionally injecting blockquote prefix
2259/// tokens at line starts. When `bq` is `None`, this is a plain
2260/// `builder.token()` passthrough.
2261fn emit_grafted_token(
2262 builder: &mut GreenNodeBuilder<'static>,
2263 kind: SyntaxKind,
2264 text: &str,
2265 bq: &mut Option<ContainerPrefixState>,
2266) {
2267 if let Some(state) = bq.as_mut() {
2268 if state.at_line_start {
2269 if let Some(line_prefix) = state.prefixes.get(state.line_idx) {
2270 emit_container_prefix_tokens(builder, line_prefix);
2271 }
2272 state.at_line_start = false;
2273 }
2274 builder.token(kind.into(), text);
2275 // `BLANK_LINE` token represents an entirely blank source line —
2276 // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2277 // token as line-ending so the per-line prefix index advances
2278 // correctly.
2279 if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2280 state.line_idx += 1;
2281 state.at_line_start = true;
2282 }
2283 } else {
2284 builder.token(kind.into(), text);
2285 }
2286}
2287
2288/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2289/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2290fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2291 for ch in prefix.chars() {
2292 if ch == '>' {
2293 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2294 } else {
2295 let mut buf = [0u8; 4];
2296 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2297 }
2298 }
2299}
2300
/// Find the byte index within `line` of the `>` that terminates the
/// open tag `<tag_name ATTRS>`.
///
/// Leading spaces and tabs are skipped, the tag name is compared
/// ASCII-case-insensitively, and a `>` inside a single- or
/// double-quoted attribute value is not taken as the terminator.
/// Returns `None` when the line does not start with `<tag_name`
/// followed by at least one more byte, or when no unquoted `>`
/// follows. Same scan as `probe_open_tag_line_has_close_gt`, but the
/// position is exposed so the caller can slice off the trailing bytes.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let indent_len = line.len() - line.trim_start_matches([' ', '\t']).len();
    let raw = &line.as_bytes()[indent_len..];
    // Length of `<` plus the tag name.
    let name_end = tag_name.len() + 1;

    let opens_with_tag = raw.len() > name_end
        && raw[0] == b'<'
        && raw[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !opens_with_tag {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in raw[name_end..].iter().enumerate() {
        match open_quote {
            // Inside a quoted value: only the matching quote matters.
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
2336
2337/// Whether `slice` begins (after leading ASCII whitespace) with an
2338/// open tag whose name is a Pandoc void block tag (`<source>`,
2339/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2340/// open tags return false.
2341///
2342/// Used by the inline-block matched-pair lift gate: pandoc-native
2343/// abandons the lift when the body's first non-blank content is a
2344/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2345/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2346/// matched-pair lift).
2347fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2348 let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2349 if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2350 return false;
2351 }
2352 let Some(tag_end) = parse_open_tag(trimmed) else {
2353 return false;
2354 };
2355 let bytes = trimmed.as_bytes();
2356 let mut name_end = 1usize;
2357 while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2358 {
2359 name_end += 1;
2360 }
2361 if name_end == 1 {
2362 return false;
2363 }
2364 is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2365}
2366
2367/// Whether the body of an inline-block matched-pair (`<video>...`,
2368/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2369/// a void block tag — the condition under which pandoc-native abandons
2370/// the matched-pair lift. Probes three shapes:
2371///
2372/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2373/// after the open `>` on `first_inner` start with `<source`.
2374/// - **Single-line open + multi-line body**: open-trailing on the open
2375/// line is empty/whitespace AND the first non-blank body line
2376/// (`lines[start_pos+1..]`) starts with a void tag.
2377/// - **Multi-line open**: same body-line scan starting at
2378/// `lines[multiline_open_end+1..]`.
2379///
2380/// Returns `false` when the body begins with text, with a close tag,
2381/// or with a non-void block tag — those cases all proceed with the
2382/// matched-pair lift.
2383fn inline_block_void_interior_abandons(
2384 first_inner: &str,
2385 lines: &[&str],
2386 start_pos: usize,
2387 multiline_open_end: Option<usize>,
2388 bq_depth: usize,
2389 tag_name: &str,
2390) -> bool {
2391 let (line_no_nl, _) = strip_newline(first_inner);
2392 let (body_start_line_idx, open_trailing) = match multiline_open_end {
2393 Some(end) => (end + 1, ""),
2394 None => {
2395 let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2396 let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2397 (start_pos + 1, trailing)
2398 }
2399 };
2400 let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2401 if !trimmed.is_empty() {
2402 return slice_starts_with_void_block_tag(trimmed);
2403 }
2404 for line in &lines[body_start_line_idx..] {
2405 let inner = if bq_depth > 0 {
2406 strip_n_blockquote_markers(line, bq_depth)
2407 } else {
2408 line
2409 };
2410 let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2411 if trimmed.is_empty() {
2412 continue;
2413 }
2414 return slice_starts_with_void_block_tag(trimmed);
2415 }
2416 false
2417}
2418
/// Check that `line` is an open-tag line for `tag_name` whose tag is
/// actually terminated: after optional leading spaces/tabs it starts
/// with `<tag_name` (ASCII-case-insensitive) and an unquoted `>`
/// follows somewhere after the name.
///
/// Content after that `>` is allowed (the open-trailing shape
/// `<form>foo`); the caller is expected to capture it into the
/// structural lift's `pre_content`.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let candidate = line.trim_start_matches([' ', '\t']).as_bytes();
    // Length of `<` plus the tag name.
    let name_end = tag_name.len() + 1;

    if candidate.len() <= name_end {
        return false;
    }
    if candidate[0] != b'<' {
        return false;
    }
    if !candidate[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return false;
    }

    // Tracks the quote character of the attribute value being scanned,
    // so a `>` inside quotes is not mistaken for the end of the tag.
    let mut in_quote: Option<u8> = None;
    candidate[name_end..].iter().any(|&byte| match in_quote {
        Some(q) => {
            if byte == q {
                in_quote = None;
            }
            false
        }
        None => {
            if byte == b'"' || byte == b'\'' {
                in_quote = Some(byte);
                false
            } else {
                byte == b'>'
            }
        }
    })
}
2453
2454/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2455/// be lifted structurally. Returns `true` only when:
2456/// - The line starts with `<tag_name` (modulo leading whitespace).
2457/// - The open tag's `>` exists with proper quote handling.
2458/// - The bytes after the open `>` contain a depth-zero matched
2459/// `</tag_name>` close (depth-aware: nested `<tag>` opens
2460/// increment depth; matching is case-insensitive, quote-aware).
2461///
2462/// Trailing bytes after the matched close are accepted and grafted
2463/// as a sibling block by the caller. Examples:
2464/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2465/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2466/// recursively parses to a `RawBlock`).
2467/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2468/// parsed recursively), trailing=`bar`.
2469fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2470 let bytes = line.as_bytes();
2471 let indent_end = bytes
2472 .iter()
2473 .position(|&b| b != b' ' && b != b'\t')
2474 .unwrap_or(bytes.len());
2475 let rest = &line[indent_end..];
2476 let rest_bytes = rest.as_bytes();
2477 let prefix_len = 1 + tag_name.len();
2478 if rest_bytes.len() < prefix_len
2479 || rest_bytes[0] != b'<'
2480 || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2481 {
2482 return false;
2483 }
2484 let after_name = &rest[prefix_len..];
2485 let after_name_bytes = after_name.as_bytes();
2486 let mut i = 0usize;
2487 let mut quote: Option<u8> = None;
2488 let mut gt_idx: Option<usize> = None;
2489 while i < after_name_bytes.len() {
2490 match (quote, after_name_bytes[i]) {
2491 (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2492 (Some(q), b2) if b2 == q => quote = None,
2493 (None, b'>') => {
2494 gt_idx = Some(i);
2495 break;
2496 }
2497 _ => {}
2498 }
2499 i += 1;
2500 }
2501 let Some(gt_idx) = gt_idx else {
2502 return false;
2503 };
2504 let trailing = &after_name[gt_idx + 1..];
2505 // Depth-aware: walk `trailing` (we begin inside the open tag at
2506 // depth 1). Return true iff a matched `</tag>` exists where depth
2507 // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2508 matched_close_offset(trailing, tag_name).is_some()
2509}
2510
/// Scan `trailing` — the bytes that follow an open `<tag ...>`'s
/// closing `>` — for the `</tag>` that closes that open tag.
///
/// Depth starts at 1 because the scan begins inside the open tag.
/// Every further `<tag ...>` open raises the depth unless it is
/// self-closing (`<tag/>`), and every `</tag>` lowers it. Tag names
/// are compared ASCII-case-insensitively and must be followed by
/// whitespace, `>`, `/` or end of input, so `<divider>` is not
/// counted as `<div>`. While looking for the `>` that ends any `<...`
/// bracket, quoted attribute values are skipped.
///
/// Returns `Some((close_start, close_end))` where:
/// - `close_start` is the byte offset of `<` in the matched `</tag>`;
/// - `close_end` is one past its `>`.
///
/// Returns `None` when depth never returns to 0, or when the scan
/// runs into a `<...` bracket with no terminating `>`.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let bytes = trailing.as_bytes();
    let tag_bytes = tag_name.as_bytes();

    let mut depth: i32 = 1;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }
        let after = i + 1;
        let is_close = bytes.get(after) == Some(&b'/');
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag_bytes.len();
        // Case-insensitive compare done in place: no lower-cased copy
        // of the input or the tag name is allocated.
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|name| name.eq_ignore_ascii_case(tag_bytes));
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Scan forward to this tag bracket's `>`, respecting quoted
        // attribute values; track self-closing form (`/>`).
        let mut j = if matched { after_name } else { after };
        let mut quote: Option<u8> = None;
        let mut self_close = false;
        let mut found_gt = false;
        while j < bytes.len() {
            let b = bytes[j];
            match (quote, b) {
                (Some(q), x) if x == q => quote = None,
                (None, b'"') | (None, b'\'') => quote = Some(b),
                (None, b'>') => {
                    found_gt = true;
                    if j > i + 1 && bytes[j - 1] == b'/' {
                        self_close = true;
                    }
                    break;
                }
                _ => {}
            }
            j += 1;
        }

        if matched && is_boundary {
            if is_close {
                depth -= 1;
                if depth == 0 && found_gt {
                    return Some((i, j + 1));
                }
            } else if !self_close {
                depth += 1;
            }
        }

        if !found_gt {
            // Unterminated `<...` — give up.
            break;
        }
        i = j + 1;
    }
    None
}
2593
/// Find where the close marker at the start of `close_part` ends.
///
/// `close_part` is expected to begin with `</tag` (the tag name compared
/// ASCII case-insensitively against `tag_name`). The function scans from
/// just past that prefix to the first `>` that is not inside a quoted
/// run and returns the offset one past it, so the caller can split
/// `close_part` into the close-marker bytes (`</tag>`) and any same-line
/// trailing text.
///
/// Returns `None` when `close_part` does not start with `</tag`, or when
/// no unquoted `>` follows — the caller then treats the whole slice as
/// the close marker (no trailing text).
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let raw = close_part.as_bytes();
    let name = tag_name.as_bytes();

    // Shape check: `<`, `/`, then the tag name.
    let [b'<', b'/', after_slash @ ..] = raw else {
        return None;
    };
    let candidate = after_slash.get(..name.len())?;
    if !candidate.eq_ignore_ascii_case(name) {
        return None;
    }

    // Walk the remainder looking for the first `>` outside quotes.
    let name_end = 2 + name.len();
    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in raw.iter().enumerate().skip(name_end) {
        match open_quote {
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(offset + 1),
                _ => {}
            },
        }
    }
    None
}
2624
2625/// Try to split the close line of an HTML_BLOCK_DIV body into a
2626/// leading content prefix and a clean `</tag>...` remainder. Returns
2627/// `Some((leading, close_part))` only when the line contains exactly
2628/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2629/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2630/// for missing close tags, or for compound shapes the parser
2631/// shouldn't attempt to lift in this pass.
2632///
2633/// `leading` may be empty (close starts at column 0) or pure
2634/// whitespace (close on an indented line). Both count as "butted" per
2635/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2636/// the trailing paragraph inside the div demotes Para→Plain.
2637fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2638 let (opens, closes) = count_tag_balance(line, tag_name);
2639 if opens != 0 || closes != 1 {
2640 return None;
2641 }
2642 // Locate the close tag's opening `<` by lowercased substring search.
2643 // Safe because we've already established (above) that the line has
2644 // exactly one `</tag>` and no `<tag>` opens, so the first match is
2645 // THE close.
2646 let needle = format!("</{}", tag_name);
2647 let lower = line.to_ascii_lowercase();
2648 let close_lt = lower.find(&needle)?;
2649 Some((&line[..close_lt], &line[close_lt..]))
2650}
2651
2652/// Depth-aware variant of `try_split_close_line` used by the same-line
2653/// lift path. Walks `line` starting at depth 1 (we begin inside the
2654/// open `<tag>`) and splits at the byte position where the matched
2655/// `</tag>` close brings depth to 0. Returns `Some((body,
2656/// close_part))` where `body` is the bytes before the matched-close
2657/// start and `close_part` is the bytes from the matched close onward.
2658///
2659/// Unlike `try_split_close_line` this accepts nested same-tag opens
2660/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2661/// body=`<div>x</div>` (a nested div the body lift parses
2662/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2663/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2664/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2665fn try_split_close_line_depth_aware<'a>(
2666 line: &'a str,
2667 tag_name: &str,
2668) -> Option<(&'a str, &'a str)> {
2669 let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2670 Some((&line[..close_start], &line[close_start..]))
2671}
2672
2673/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2674/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2675/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2676/// + TEXT(">") + TEXT(trailing)?`.
2677///
2678/// Bytes are byte-identical to the source — this only tokenizes at finer
2679/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2680/// region structurally. Falls back to a single TEXT token if the line
2681/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2682/// only retags as the lift kind when this shape was matched).
2683///
2684/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2685/// returned as `&str` instead so the caller can splice them into the
2686/// recursive-parse input for the structural body lift. When false
2687/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2688/// empty slice is returned.
2689fn emit_open_tag_tokens<'a>(
2690 builder: &mut GreenNodeBuilder<'static>,
2691 line: &'a str,
2692 tag_name: &str,
2693 lift_trailing: bool,
2694) -> &'a str {
2695 let bytes = line.as_bytes();
2696 // Leading indent (CommonMark allows up to 3 spaces).
2697 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2698 if indent_end > 0 {
2699 builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2700 }
2701 let rest = &line[indent_end..];
2702 // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2703 let prefix_len = 1 + tag_name.len();
2704 if !rest.starts_with('<')
2705 || rest.len() < prefix_len
2706 || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2707 {
2708 builder.token(SyntaxKind::TEXT.into(), rest);
2709 return "";
2710 }
2711 let after_name = &rest[prefix_len..];
2712 let after_name_bytes = after_name.as_bytes();
2713 // Find the closing `>` of the open tag, respecting quoted attribute values.
2714 let mut i = 0usize;
2715 let mut quote: Option<u8> = None;
2716 let mut tag_close: Option<usize> = None;
2717 while i < after_name_bytes.len() {
2718 let b = after_name_bytes[i];
2719 match (quote, b) {
2720 (None, b'"') | (None, b'\'') => quote = Some(b),
2721 (Some(q), b2) if b2 == q => quote = None,
2722 (None, b'>') => {
2723 tag_close = Some(i);
2724 break;
2725 }
2726 _ => {}
2727 }
2728 i += 1;
2729 }
2730 let Some(tag_close) = tag_close else {
2731 // Open tag has no closing `>` on this line — defensive fallback.
2732 builder.token(SyntaxKind::TEXT.into(), rest);
2733 return "";
2734 };
2735 // Whitespace between the tag name and the attribute region.
2736 let attrs_inner = &after_name[..tag_close];
2737 let ws_end = attrs_inner
2738 .as_bytes()
2739 .iter()
2740 .position(|&b| !matches!(b, b' ' | b'\t'))
2741 .unwrap_or(attrs_inner.len());
2742 let leading_ws = &attrs_inner[..ws_end];
2743 // Strip a trailing self-closing slash and the whitespace before it
2744 // from the attribute region; emit them as TEXT outside the
2745 // HTML_ATTRS node so the structural region only holds attribute
2746 // bytes (not formatting punctuation).
2747 let attrs_after_ws = &attrs_inner[ws_end..];
2748 let mut attr_end = attrs_after_ws.len();
2749 let attr_bytes = attrs_after_ws.as_bytes();
2750 let mut self_close_start = attr_end;
2751 if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2752 self_close_start = attr_end - 1;
2753 attr_end = self_close_start;
2754 while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2755 attr_end -= 1;
2756 }
2757 }
2758 let attrs_text = &attrs_after_ws[..attr_end];
2759 let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2760 let after_self_close = &attrs_after_ws[self_close_start..];
2761
2762 // Use the original source bytes for the `<tag` prefix (preserves
2763 // source casing — losslessness).
2764 builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2765 if !leading_ws.is_empty() {
2766 builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2767 }
2768 if !attrs_text.is_empty() {
2769 emit_html_attrs_node(builder, attrs_text);
2770 }
2771 if !trailing_text.is_empty() {
2772 builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2773 }
2774 if !after_self_close.is_empty() {
2775 builder.token(SyntaxKind::TEXT.into(), after_self_close);
2776 }
2777 builder.token(SyntaxKind::TEXT.into(), ">");
2778 let after_gt = &after_name[tag_close + 1..];
2779 if lift_trailing {
2780 // Return trailing bytes to the caller (will be spliced into the
2781 // recursive-parse input for the body lift).
2782 return after_gt;
2783 }
2784 if !after_gt.is_empty() {
2785 builder.token(SyntaxKind::TEXT.into(), after_gt);
2786 }
2787 ""
2788}
2789
2790/// Detect a multi-line HTML open tag for `tag_name`. Returns
2791/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2792/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2793/// the existing path) or when the `>` is missing entirely.
2794///
2795/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2796/// attribute value doesn't terminate the open tag. Quote state carries
2797/// across line boundaries.
2798fn find_multiline_open_end(
2799 lines: &[&str],
2800 start_pos: usize,
2801 first_inner: &str,
2802 tag_name: &str,
2803 prefix: &ContainerPrefix,
2804) -> Option<usize> {
2805 // Locate the `<tag_name` literal in `first_inner` to start scanning past
2806 // it. Match is ASCII case-insensitive; the parser preserves source casing.
2807 // `first_inner` is already bq-stripped by the caller; subsequent lines are
2808 // stripped inline below via `strip_n_blockquote_markers`.
2809 let trimmed = strip_leading_spaces(first_inner);
2810 let prefix_len = 1 + tag_name.len();
2811 if !trimmed.starts_with('<')
2812 || trimmed.len() < prefix_len
2813 || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2814 {
2815 return None;
2816 }
2817 let leading_indent = first_inner.len() - trimmed.len();
2818 let mut i = leading_indent + prefix_len; // past `<tag_name`
2819 let mut quote: Option<u8> = None;
2820
2821 // Scan first line for an unquoted `>`.
2822 let line0_bytes = first_inner.as_bytes();
2823 while i < line0_bytes.len() {
2824 match (quote, line0_bytes[i]) {
2825 (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2826 (Some(q), x) if x == q => quote = None,
2827 (None, b'>') => return None, // single-line case
2828 _ => {}
2829 }
2830 i += 1;
2831 }
2832
2833 // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2834 // blockquote markers per line so `> ` prefixes don't count toward the
2835 // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2836 let mut line_idx = start_pos + 1;
2837 while line_idx < lines.len() {
2838 let raw = lines[line_idx];
2839 let inner = prefix.strip(raw);
2840 for &b in inner.as_bytes() {
2841 match (quote, b) {
2842 (None, b'"') | (None, b'\'') => quote = Some(b),
2843 (Some(q), x) if x == q => quote = None,
2844 (None, b'>') => return Some(line_idx),
2845 _ => {}
2846 }
2847 }
2848 line_idx += 1;
2849 }
2850
2851 None
2852}
2853
2854/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2855/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2856/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2857/// open tag (no `>` in the remaining input) as paragraph text rather than
2858/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2859/// reparse the same content recursively, causing a stack overflow.
2860///
2861/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2862/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2863/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2864/// the tag eventually closes far away.
2865pub(crate) fn pandoc_html_open_tag_closes(
2866 lines: &[&str],
2867 start_pos: usize,
2868 prefix: &ContainerPrefix,
2869) -> bool {
2870 if start_pos >= lines.len() {
2871 return false;
2872 }
2873 let mut quote: Option<u8> = None;
2874 for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2875 let inner = prefix.strip(line);
2876 let bytes = inner.as_bytes();
2877 let mut i = 0usize;
2878 if offset == start_pos {
2879 while i < bytes.len() && bytes[i] == b' ' {
2880 i += 1;
2881 }
2882 if bytes.get(i) != Some(&b'<') {
2883 return false;
2884 }
2885 i += 1;
2886 }
2887 while i < bytes.len() {
2888 match (quote, bytes[i]) {
2889 (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2890 (Some(q), x) if x == q => quote = None,
2891 (None, b'>') => return true,
2892 _ => {}
2893 }
2894 i += 1;
2895 }
2896 }
2897 false
2898}
2899
/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
/// Phase 6 structural lift.
///
/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
/// - Line 0: (optional WHITESPACE indent) + TEXT("<{tag_name}")
///   + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
///   + TEXT(">") + (TEXT(trailing))? + NEWLINE
///
/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
/// newline; `parse_html_attribute_list` tolerates whitespace.
///
/// Parameters:
/// - `bq_depth`: number of blockquote markers stripped from each source
///   line before it is split. For every line after `start_pos` the stripped
///   prefix is re-emitted through `emit_bq_prefix_tokens`.
/// - `lift_trailing`: when true and the last line has bytes after `>`,
///   those bytes and the line's newline are appended to `pre_content`
///   instead of being emitted as tokens.
/// - `pre_content`: buffer that receives the lifted trailing bytes; it is
///   only appended to, never cleared here.
// Eight parameters exceed clippy's default limit; the lint is silenced
// rather than bundling the arguments into a struct.
#[allow(clippy::too_many_arguments)]
fn emit_multiline_open_tag_with_attrs(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    end_line_idx: usize,
    tag_name: &str,
    bq_depth: usize,
    lift_trailing: bool,
    pre_content: &mut String,
) {
    let prefix_len = 1 + tag_name.len();
    for (line_idx, raw) in lines
        .iter()
        .enumerate()
        .take(end_line_idx + 1)
        .skip(start_pos)
    {
        // Strip `bq_depth` blockquote markers from the source line so
        // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
        // Re-emit the stripped prefix via `emit_bq_prefix_tokens` — but
        // ONLY for lines past `start_pos`. Line 0's bq prefix is consumed
        // by the outer BLOCK_QUOTE node before this parser runs;
        // re-emitting it here would double the bytes and break
        // losslessness.
        let stripped = if bq_depth > 0 {
            strip_n_blockquote_markers(raw, bq_depth)
        } else {
            raw
        };
        let bq_prefix_len = raw.len() - stripped.len();
        if bq_prefix_len > 0 && line_idx != start_pos {
            emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
        }
        let line = stripped;
        // The newline is emitted separately, at the bottom of the loop body.
        let (line_no_nl, newline_str) = strip_newline(line);

        if line_idx == start_pos {
            // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
            // + attrs)?. The closing `>` is on a later line, so any
            // remaining bytes after "<{tag_name}" on this line are the
            // start of the attribute region.
            let bytes = line_no_nl.as_bytes();
            // Only spaces count as indent on line 0 (tabs are not skipped).
            let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            // Defensive: caller verified the line starts with `<{tag_name}`.
            // If the line is shorter than that prefix, emit it whole as TEXT.
            let after_indent = &line_no_nl[indent_end..];
            if after_indent.len() >= prefix_len {
                builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
                let rest = &after_indent[prefix_len..];
                emit_attr_region(builder, rest);
            } else {
                builder.token(SyntaxKind::TEXT.into(), after_indent);
            }
        } else if line_idx < end_line_idx {
            // Pure attribute line: indent (spaces or tabs), then the whole
            // remainder as one HTML_ATTRS node.
            let bytes = line_no_nl.as_bytes();
            let indent_end = bytes
                .iter()
                .position(|&b| !matches!(b, b' ' | b'\t'))
                .unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            let attrs_text = &line_no_nl[indent_end..];
            if !attrs_text.is_empty() {
                emit_html_attrs_node(builder, attrs_text);
            }
        } else {
            // Last line: indent + attrs + ">" + trailing.
            let bytes = line_no_nl.as_bytes();
            let indent_end = bytes
                .iter()
                .position(|&b| !matches!(b, b' ' | b'\t'))
                .unwrap_or(bytes.len());
            if indent_end > 0 {
                builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
            }
            // Find the unquoted `>` byte position in this line. Quote state
            // starts fresh here; it is not carried over from earlier lines.
            let mut quote: Option<u8> = None;
            let mut gt_pos: Option<usize> = None;
            for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
                // `j` is relative to `indent_end`; convert to a line offset.
                let actual_j = indent_end + j;
                match (quote, b) {
                    (None, b'"') | (None, b'\'') => quote = Some(b),
                    (Some(q), x) if x == q => quote = None,
                    (None, b'>') => {
                        gt_pos = Some(actual_j);
                        break;
                    }
                    _ => {}
                }
            }
            let Some(gt) = gt_pos else {
                // Defensive — caller said `>` is on this line. Emit the
                // rest of the line as TEXT plus its newline and move on.
                builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
                if !newline_str.is_empty() {
                    builder.token(SyntaxKind::NEWLINE.into(), newline_str);
                }
                continue;
            };
            // Attribute region: between indent_end and gt, with possibly
            // trailing whitespace before `>`.
            let attrs_region = &line_no_nl[indent_end..gt];
            let region_bytes = attrs_region.as_bytes();
            // Strip trailing whitespace from attrs region; emit as
            // separate WHITESPACE so HTML_ATTRS only contains attribute
            // bytes.
            let mut attr_end = region_bytes.len();
            while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
                attr_end -= 1;
            }
            let attrs_text = &attrs_region[..attr_end];
            let trailing_ws = &attrs_region[attr_end..];
            if !attrs_text.is_empty() {
                emit_html_attrs_node(builder, attrs_text);
            }
            if !trailing_ws.is_empty() {
                builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
            }
            builder.token(SyntaxKind::TEXT.into(), ">");
            let after_gt = &line_no_nl[gt + 1..];
            if lift_trailing && !after_gt.is_empty() {
                // Lift trailing bytes (and the trailing newline) into
                // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
                // with `TEXT(">")`. The recursive parse at the close-marker
                // site treats `pre_content` as the leading bytes of the
                // structural body — same shape produced by `emit_open_tag_tokens`
                // for single-line opens. The `continue` skips the NEWLINE
                // token below because the newline went into `pre_content`.
                pre_content.push_str(after_gt);
                pre_content.push_str(newline_str);
                continue;
            }
            if !after_gt.is_empty() {
                builder.token(SyntaxKind::TEXT.into(), after_gt);
            }
        }

        // Common tail for every branch that did not `continue`.
        if !newline_str.is_empty() {
            builder.token(SyntaxKind::NEWLINE.into(), newline_str);
        }
    }
}
3060
3061/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3062/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3063/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3064/// node is added. Pandoc's projector reads attributes only for `<div>` /
3065/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3066fn emit_multiline_open_tag_simple(
3067 builder: &mut GreenNodeBuilder<'static>,
3068 lines: &[&str],
3069 start_pos: usize,
3070 end_line_idx: usize,
3071 bq_depth: usize,
3072) {
3073 for (line_idx, raw) in lines
3074 .iter()
3075 .enumerate()
3076 .take(end_line_idx + 1)
3077 .skip(start_pos)
3078 {
3079 let stripped = if bq_depth > 0 {
3080 strip_n_blockquote_markers(raw, bq_depth)
3081 } else {
3082 raw
3083 };
3084 let bq_prefix_len = raw.len() - stripped.len();
3085 // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3086 // re-emit prefixes only for subsequent lines.
3087 if bq_prefix_len > 0 && line_idx != start_pos {
3088 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3089 }
3090 let (line_no_nl, newline_str) = strip_newline(stripped);
3091 if !line_no_nl.is_empty() {
3092 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3093 }
3094 if !newline_str.is_empty() {
3095 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3096 }
3097 }
3098}
3099
3100/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3101/// `<div` literal up to end-of-line. Called only from
3102/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3103/// pure attribute (and possibly inter-attribute whitespace).
3104fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3105 if region.is_empty() {
3106 return;
3107 }
3108 let bytes = region.as_bytes();
3109 // Split a leading run of whitespace into a WHITESPACE token so the
3110 // HTML_ATTRS node holds only attribute bytes.
3111 let ws_end = bytes
3112 .iter()
3113 .position(|&b| !matches!(b, b' ' | b'\t'))
3114 .unwrap_or(bytes.len());
3115 if ws_end > 0 {
3116 builder.token(SyntaxKind::WHITESPACE.into(), ®ion[..ws_end]);
3117 }
3118 let attrs_text = ®ion[ws_end..];
3119 if !attrs_text.is_empty() {
3120 emit_html_attrs_node(builder, attrs_text);
3121 }
3122}
3123
3124/// Emit one continuation line of an HTML block, preserving any blockquote
3125/// markers as structural tokens (so the CST stays byte-equal to the source
3126/// and downstream consumers can strip them per-context).
3127fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3128 let inner = if bq_depth > 0 {
3129 let stripped = strip_n_blockquote_markers(line, bq_depth);
3130 let prefix_len = line.len() - stripped.len();
3131 if prefix_len > 0 {
3132 for ch in line[..prefix_len].chars() {
3133 if ch == '>' {
3134 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3135 } else {
3136 let mut buf = [0u8; 4];
3137 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3138 }
3139 }
3140 }
3141 stripped
3142 } else {
3143 line
3144 };
3145
3146 let (line_without_newline, newline_str) = strip_newline(inner);
3147 if !line_without_newline.is_empty() {
3148 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3149 }
3150 if !newline_str.is_empty() {
3151 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3152 }
3153}
3154
3155#[cfg(test)]
3156mod tests {
3157 use super::*;
3158
3159 #[test]
3160 fn test_try_parse_html_comment() {
3161 assert_eq!(
3162 try_parse_html_block_start("<!-- comment -->", false),
3163 Some(HtmlBlockType::Comment)
3164 );
3165 assert_eq!(
3166 try_parse_html_block_start(" <!-- comment -->", false),
3167 Some(HtmlBlockType::Comment)
3168 );
3169 }
3170
3171 #[test]
3172 fn test_try_parse_div_tag() {
3173 assert_eq!(
3174 try_parse_html_block_start("<div>", false),
3175 Some(HtmlBlockType::BlockTag {
3176 tag_name: "div".to_string(),
3177 is_verbatim: false,
3178 closed_by_blank_line: false,
3179 depth_aware: true,
3180 closes_at_open_tag: false,
3181 is_closing: false,
3182 })
3183 );
3184 assert_eq!(
3185 try_parse_html_block_start("<div class=\"test\">", false),
3186 Some(HtmlBlockType::BlockTag {
3187 tag_name: "div".to_string(),
3188 is_verbatim: false,
3189 closed_by_blank_line: false,
3190 depth_aware: true,
3191 closes_at_open_tag: false,
3192 is_closing: false,
3193 })
3194 );
3195 }
3196
3197 #[test]
3198 fn test_try_parse_script_tag() {
3199 assert_eq!(
3200 try_parse_html_block_start("<script>", false),
3201 Some(HtmlBlockType::BlockTag {
3202 tag_name: "script".to_string(),
3203 is_verbatim: true,
3204 closed_by_blank_line: false,
3205 depth_aware: true,
3206 closes_at_open_tag: false,
3207 is_closing: false,
3208 })
3209 );
3210 }
3211
3212 #[test]
3213 fn test_try_parse_processing_instruction() {
3214 assert_eq!(
3215 try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3216 Some(HtmlBlockType::ProcessingInstruction)
3217 );
3218 }
3219
3220 #[test]
3221 fn test_try_parse_declaration() {
3222 // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3223 assert_eq!(
3224 try_parse_html_block_start("<!DOCTYPE html>", true),
3225 Some(HtmlBlockType::Declaration)
3226 );
3227 // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3228 // just uppercase. Lowercase doctype must match too.
3229 assert_eq!(
3230 try_parse_html_block_start("<!doctype html>", true),
3231 Some(HtmlBlockType::Declaration)
3232 );
3233 // Pandoc dialect does not — bare declarations fall through to
3234 // paragraph parsing.
3235 assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3236 assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3237 }
3238
3239 #[test]
3240 fn test_dialect_specific_block_tag_membership() {
3241 // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3242 // CommonMark §4.6 type-6 plus a few additions. These tags
3243 // diverge between dialects:
3244 // CM-only block tags (Pandoc treats as inline raw HTML):
3245 // dialog, legend, menuitem, optgroup, option, frame,
3246 // base, basefont, link, param
3247 // Pandoc-only block tags (CM doesn't recognize):
3248 // canvas, hgroup, isindex, meta, output
3249 for cm_only in [
3250 "<dialog>",
3251 "<legend>",
3252 "<menuitem>",
3253 "<optgroup>",
3254 "<option>",
3255 "<frame>",
3256 "<base>",
3257 "<basefont>",
3258 "<link>",
3259 "<param>",
3260 ] {
3261 assert!(
3262 matches!(
3263 try_parse_html_block_start(cm_only, true),
3264 Some(HtmlBlockType::BlockTag { .. })
3265 ),
3266 "{cm_only} should be a block-tag start under CommonMark",
3267 );
3268 assert_eq!(
3269 try_parse_html_block_start(cm_only, false),
3270 None,
3271 "{cm_only} should NOT be a block-tag start under Pandoc",
3272 );
3273 }
3274 for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3275 // Under CM these are not type-6 BlockTags; they may still match
3276 // type-7 (complete tag on a line) which has different semantics.
3277 assert!(
3278 !matches!(
3279 try_parse_html_block_start(pandoc_only, true),
3280 Some(HtmlBlockType::BlockTag { .. })
3281 ),
3282 "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3283 );
3284 assert!(
3285 matches!(
3286 try_parse_html_block_start(pandoc_only, false),
3287 Some(HtmlBlockType::BlockTag { .. })
3288 ),
3289 "{pandoc_only} should be a block-tag start under Pandoc",
3290 );
3291 }
3292 }
3293
3294 #[test]
3295 fn test_pandoc_inline_block_tag_membership() {
3296 // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3297 // fresh-block positions under Pandoc dialect. We list the
3298 // non-void, non-script subset (verbatim `script` is handled
3299 // via the verbatim path; void elements are deferred — see
3300 // PANDOC_INLINE_BLOCK_TAGS docs).
3301 for tag in [
3302 "<button>",
3303 "<iframe>",
3304 "<video>",
3305 "<audio>",
3306 "<noscript>",
3307 "<object>",
3308 "<map>",
3309 "<progress>",
3310 "<del>",
3311 "<ins>",
3312 "<svg>",
3313 "<applet>",
3314 ] {
3315 assert!(
3316 matches!(
3317 try_parse_html_block_start(tag, false),
3318 Some(HtmlBlockType::BlockTag {
3319 depth_aware: true,
3320 ..
3321 })
3322 ),
3323 "{tag} should be a depth-aware block-tag start under Pandoc",
3324 );
3325 }
3326 // Closing forms of inline-block tags also start a block under
3327 // Pandoc — pandoc-native pins `</button>` standalone as a
3328 // single-line `RawBlock`. These use `closes_at_open_tag: true`
3329 // (no balanced match — the close emits as a one-line block on
3330 // its own).
3331 for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3332 assert!(
3333 matches!(
3334 try_parse_html_block_start(closing, false),
3335 Some(HtmlBlockType::BlockTag {
3336 depth_aware: false,
3337 closes_at_open_tag: true,
3338 ..
3339 })
3340 ),
3341 "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3342 );
3343 }
3344 }
3345
3346 #[test]
3347 fn test_pandoc_void_block_tag_membership() {
3348 // Pandoc's void `eitherBlockOrInline` tags start an HTML block
3349 // at fresh-block positions under Pandoc dialect, with
3350 // `closes_at_open_tag: true` — the block always ends on the
3351 // open-tag line (no closing tag to match).
3352 for tag in [
3353 "<area>",
3354 "<embed>",
3355 "<source>",
3356 "<track>",
3357 "<embed src=\"foo.swf\">",
3358 "<source src=\"foo.mp4\" type=\"video/mp4\">",
3359 ] {
3360 assert!(
3361 matches!(
3362 try_parse_html_block_start(tag, false),
3363 Some(HtmlBlockType::BlockTag {
3364 depth_aware: false,
3365 closes_at_open_tag: true,
3366 ..
3367 })
3368 ),
3369 "{tag} should be a void block-tag start under Pandoc",
3370 );
3371 }
3372 // Closing forms of void tags also start a single-line block
3373 // under Pandoc. Void elements have no closing tag in HTML, but
3374 // `</embed>` etc. can appear in the wild — pandoc-native still
3375 // emits them as `RawBlock`s at fresh-block positions; mirror
3376 // that with the same `closes_at_open_tag: true` shape.
3377 for closing in ["</area>", "</embed>", "</source>", "</track>"] {
3378 assert!(
3379 matches!(
3380 try_parse_html_block_start(closing, false),
3381 Some(HtmlBlockType::BlockTag {
3382 depth_aware: false,
3383 closes_at_open_tag: true,
3384 ..
3385 })
3386 ),
3387 "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
3388 );
3389 }
3390 // Under CommonMark dialect, the void-tag block-start path is
3391 // skipped. `<source>` and `<track>` are in the CM type-6
3392 // BLOCK_TAGS set so they DO start a block, but with CM type-6
3393 // semantics (`closed_by_blank_line: true`,
3394 // `closes_at_open_tag: false`), not the Pandoc void-tag path.
3395 // `<embed>` and `<area>` aren't in the CM type-6 list — they
3396 // fall through to type 7 (complete tag on a line by itself).
3397 assert_eq!(
3398 try_parse_html_block_start("<embed>", true),
3399 Some(HtmlBlockType::Type7)
3400 );
3401 assert_eq!(
3402 try_parse_html_block_start("<area>", true),
3403 Some(HtmlBlockType::Type7)
3404 );
3405 assert!(matches!(
3406 try_parse_html_block_start("<source src=\"x\">", true),
3407 Some(HtmlBlockType::BlockTag {
3408 closed_by_blank_line: true,
3409 closes_at_open_tag: false,
3410 ..
3411 })
3412 ));
3413 assert!(matches!(
3414 try_parse_html_block_start("<track src=\"x\">", true),
3415 Some(HtmlBlockType::BlockTag {
3416 closed_by_blank_line: true,
3417 closes_at_open_tag: false,
3418 ..
3419 })
3420 ));
3421 }
3422
3423 #[test]
3424 fn test_find_multiline_open_end() {
3425 // Single-line opens return None (caller takes the regular path).
3426 assert_eq!(
3427 find_multiline_open_end(
3428 &["<div id=\"x\">"],
3429 0,
3430 "<div id=\"x\">",
3431 "div",
3432 &ContainerPrefix::default()
3433 ),
3434 None
3435 );
3436 assert_eq!(
3437 find_multiline_open_end(
3438 &["<embed src=\"x\">"],
3439 0,
3440 "<embed src=\"x\">",
3441 "embed",
3442 &ContainerPrefix::default()
3443 ),
3444 None
3445 );
3446 // Multi-line opens return the line index of the closing `>`.
3447 assert_eq!(
3448 find_multiline_open_end(
3449 &["<embed", " src=\"x\">"],
3450 0,
3451 "<embed",
3452 "embed",
3453 &ContainerPrefix::default()
3454 ),
3455 Some(1)
3456 );
3457 assert_eq!(
3458 find_multiline_open_end(
3459 &["<embed", " src=\"x\"", " type=\"video\">"],
3460 0,
3461 "<embed",
3462 "embed",
3463 &ContainerPrefix::default()
3464 ),
3465 Some(2)
3466 );
3467 // Tag-name mismatch returns None (case-insensitive on the tag name).
3468 assert_eq!(
3469 find_multiline_open_end(
3470 &["<embed", " src=\"x\">"],
3471 0,
3472 "<embed",
3473 "div",
3474 &ContainerPrefix::default()
3475 ),
3476 None
3477 );
3478 assert_eq!(
3479 find_multiline_open_end(
3480 &["<EMBED", " src=\"x\">"],
3481 0,
3482 "<EMBED",
3483 "embed",
3484 &ContainerPrefix::default()
3485 ),
3486 Some(1)
3487 );
3488 // Quoted `>` does not terminate the open tag; quote state threads
3489 // across line boundaries.
3490 assert_eq!(
3491 find_multiline_open_end(
3492 &["<embed title=\"a>b", " c\">"],
3493 0,
3494 "<embed title=\"a>b",
3495 "embed",
3496 &ContainerPrefix::default()
3497 ),
3498 Some(1)
3499 );
3500 // No `>` anywhere returns None.
3501 assert_eq!(
3502 find_multiline_open_end(
3503 &["<embed", " src=\"x\""],
3504 0,
3505 "<embed",
3506 "embed",
3507 &ContainerPrefix::default()
3508 ),
3509 None
3510 );
3511 // Subsequent lines inside a blockquote: bq markers stripped before
3512 // scanning so `> ` prefixes don't count.
3513 assert_eq!(
3514 find_multiline_open_end(
3515 &["<div", "> id=\"x\">"],
3516 0,
3517 "<div",
3518 "div",
3519 &ContainerPrefix::bq_only(1)
3520 ),
3521 Some(1)
3522 );
3523 // Nested bq: strips two `> ` per line.
3524 assert_eq!(
3525 find_multiline_open_end(
3526 &["<section", "> > id=\"x\">"],
3527 0,
3528 "<section",
3529 "section",
3530 &ContainerPrefix::bq_only(2)
3531 ),
3532 Some(1)
3533 );
3534 }
3535
3536 #[test]
3537 fn test_pandoc_html_open_tag_closes() {
3538 // Single-line complete: scanner finds `>` on the first line.
3539 assert!(pandoc_html_open_tag_closes(
3540 &["<div>"],
3541 0,
3542 &ContainerPrefix::default()
3543 ));
3544 assert!(pandoc_html_open_tag_closes(
3545 &["<embed src=\"x\">"],
3546 0,
3547 &ContainerPrefix::default()
3548 ));
3549 // Multi-line complete: scanner finds `>` on a later line.
3550 assert!(pandoc_html_open_tag_closes(
3551 &["<div", " id=\"x\">", "body", "</div>"],
3552 0,
3553 &ContainerPrefix::default()
3554 ));
3555 assert!(pandoc_html_open_tag_closes(
3556 &["<embed", " src=\"x.png\" alt=\"y\">"],
3557 0,
3558 &ContainerPrefix::default()
3559 ));
3560 // Quoted `>` does not close: scanner threads quote state.
3561 assert!(!pandoc_html_open_tag_closes(
3562 &["<div title=\"a>b", " c\""],
3563 0,
3564 &ContainerPrefix::default()
3565 ));
3566 assert!(pandoc_html_open_tag_closes(
3567 &["<div title=\"a>b", " c\">"],
3568 0,
3569 &ContainerPrefix::default()
3570 ));
3571 // Incomplete: no `>` anywhere — pandoc treats as paragraph text.
3572 assert!(!pandoc_html_open_tag_closes(
3573 &["<embed"],
3574 0,
3575 &ContainerPrefix::default()
3576 ));
3577 assert!(!pandoc_html_open_tag_closes(
3578 &["<div", "foo", "bar"],
3579 0,
3580 &ContainerPrefix::default()
3581 ));
3582 // Pandoc tolerates blank lines mid-open-tag (its `htmlTag` reads
3583 // across them); the scan continues until EOF or `>`.
3584 assert!(pandoc_html_open_tag_closes(
3585 &["<div", "", "id=\"x\">"],
3586 0,
3587 &ContainerPrefix::default()
3588 ));
3589 }
3590
3591 #[test]
3592 fn test_try_parse_cdata() {
3593 // CommonMark dialect recognizes CDATA as type-5 HTML blocks.
3594 assert_eq!(
3595 try_parse_html_block_start("<![CDATA[content]]>", true),
3596 Some(HtmlBlockType::CData)
3597 );
3598 // Pandoc dialect does not.
3599 assert_eq!(
3600 try_parse_html_block_start("<![CDATA[content]]>", false),
3601 None
3602 );
3603 }
3604
3605 #[test]
3606 fn test_extract_block_tag_name_open_only() {
3607 assert_eq!(
3608 extract_block_tag_name("<div>", false),
3609 Some("div".to_string())
3610 );
3611 assert_eq!(
3612 extract_block_tag_name("<div class=\"test\">", false),
3613 Some("div".to_string())
3614 );
3615 assert_eq!(
3616 extract_block_tag_name("<div/>", false),
3617 Some("div".to_string())
3618 );
3619 assert_eq!(extract_block_tag_name("</div>", false), None);
3620 assert_eq!(extract_block_tag_name("<>", false), None);
3621 assert_eq!(extract_block_tag_name("< div>", false), None);
3622 }
3623
3624 #[test]
3625 fn test_extract_block_tag_name_with_closing() {
3626 // CommonMark §4.6 type-6 starts also accept closing tags.
3627 assert_eq!(
3628 extract_block_tag_name("</div>", true),
3629 Some("div".to_string())
3630 );
3631 assert_eq!(
3632 extract_block_tag_name("</div >", true),
3633 Some("div".to_string())
3634 );
3635 }
3636
3637 #[test]
3638 fn test_commonmark_type6_closing_tag_start() {
3639 assert_eq!(
3640 try_parse_html_block_start("</div>", true),
3641 Some(HtmlBlockType::BlockTag {
3642 tag_name: "div".to_string(),
3643 is_verbatim: false,
3644 closed_by_blank_line: true,
3645 depth_aware: false,
3646 closes_at_open_tag: false,
3647 is_closing: true,
3648 })
3649 );
3650 }
3651
3652 #[test]
3653 fn test_commonmark_type7_open_tag() {
3654 // `<a>` (not a type-6 tag) on a line by itself is type 7 under
3655 // CommonMark; rejected under non-CommonMark.
3656 assert_eq!(
3657 try_parse_html_block_start("<a href=\"foo\">", true),
3658 Some(HtmlBlockType::Type7)
3659 );
3660 assert_eq!(try_parse_html_block_start("<a href=\"foo\">", false), None);
3661 }
3662
3663 #[test]
3664 fn test_commonmark_type7_close_tag() {
3665 assert_eq!(
3666 try_parse_html_block_start("</ins>", true),
3667 Some(HtmlBlockType::Type7)
3668 );
3669 }
3670
3671 #[test]
3672 fn test_commonmark_type7_rejects_with_trailing_text() {
3673 // A complete tag must be followed only by whitespace.
3674 assert_eq!(try_parse_html_block_start("<a> hi", true), None);
3675 }
3676
3677 #[test]
3678 fn test_is_closing_marker_comment() {
3679 let block_type = HtmlBlockType::Comment;
3680 assert!(is_closing_marker("-->", &block_type));
3681 assert!(is_closing_marker("end -->", &block_type));
3682 assert!(!is_closing_marker("<!--", &block_type));
3683 }
3684
3685 #[test]
3686 fn test_is_closing_marker_tag() {
3687 let block_type = HtmlBlockType::BlockTag {
3688 tag_name: "div".to_string(),
3689 is_verbatim: false,
3690 closed_by_blank_line: false,
3691 depth_aware: false,
3692 closes_at_open_tag: false,
3693 is_closing: false,
3694 };
3695 assert!(is_closing_marker("</div>", &block_type));
3696 assert!(is_closing_marker("</DIV>", &block_type)); // Case insensitive
3697 assert!(is_closing_marker("content</div>", &block_type));
3698 assert!(!is_closing_marker("<div>", &block_type));
3699 }
3700
3701 #[test]
3702 fn test_parse_html_comment_block() {
3703 let input = "<!-- comment -->\n";
3704 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3705 let mut builder = GreenNodeBuilder::new();
3706
3707 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3708 let opts = ParserOptions::default();
3709 let new_pos = parse_html_block_with_wrapper(
3710 &mut builder,
3711 &lines,
3712 0,
3713 block_type,
3714 &ContainerPrefix::default(),
3715 SyntaxKind::HTML_BLOCK,
3716 &opts,
3717 );
3718
3719 assert_eq!(new_pos, 1);
3720 }
3721
3722 #[test]
3723 fn test_parse_div_block() {
3724 let input = "<div>\ncontent\n</div>\n";
3725 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3726 let mut builder = GreenNodeBuilder::new();
3727
3728 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3729 let opts = ParserOptions::default();
3730 let new_pos = parse_html_block_with_wrapper(
3731 &mut builder,
3732 &lines,
3733 0,
3734 block_type,
3735 &ContainerPrefix::default(),
3736 SyntaxKind::HTML_BLOCK,
3737 &opts,
3738 );
3739
3740 assert_eq!(new_pos, 3);
3741 }
3742
3743 #[test]
3744 fn test_parse_html_block_no_closing() {
3745 let input = "<div>\ncontent\n";
3746 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3747 let mut builder = GreenNodeBuilder::new();
3748
3749 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3750 let opts = ParserOptions::default();
3751 let new_pos = parse_html_block_with_wrapper(
3752 &mut builder,
3753 &lines,
3754 0,
3755 block_type,
3756 &ContainerPrefix::default(),
3757 SyntaxKind::HTML_BLOCK,
3758 &opts,
3759 );
3760
3761 // Should consume all lines even without closing tag
3762 assert_eq!(new_pos, 2);
3763 }
3764
3765 #[test]
3766 fn test_parse_div_block_nested_pandoc() {
3767 // Pandoc dialect: a nested `<div>...<div>...</div>...</div>` must
3768 // close on the OUTER `</div>`, not the first `</div>` seen. The
3769 // CommonMark-style "first close" scanner is wrong here; Pandoc's
3770 // div parser is depth-aware (mirrors `htmlInBalanced`).
3771 let input =
3772 "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
3773 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3774 let mut builder = GreenNodeBuilder::new();
3775
3776 // is_commonmark = false → Pandoc dialect.
3777 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3778 let opts = ParserOptions::default();
3779 let new_pos = parse_html_block_with_wrapper(
3780 &mut builder,
3781 &lines,
3782 0,
3783 block_type,
3784 &ContainerPrefix::default(),
3785 SyntaxKind::HTML_BLOCK_DIV,
3786 &opts,
3787 );
3788
3789 // 9 lines: outer-open, blank, inner-open, blank, content, blank,
3790 // inner-close, blank, outer-close. All consumed.
3791 assert_eq!(new_pos, 9);
3792 }
3793
3794 #[test]
3795 fn test_parse_div_block_same_line_pandoc() {
3796 // <div>foo</div> on a single line: opens=1, closes=1, depth=0 →
3797 // close on first line. Depth-aware tracking must not regress this.
3798 let input = "<div>foo</div>\n";
3799 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3800 let mut builder = GreenNodeBuilder::new();
3801
3802 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3803 let opts = ParserOptions::default();
3804 let new_pos = parse_html_block_with_wrapper(
3805 &mut builder,
3806 &lines,
3807 0,
3808 block_type,
3809 &ContainerPrefix::default(),
3810 SyntaxKind::HTML_BLOCK_DIV,
3811 &opts,
3812 );
3813 assert_eq!(new_pos, 1);
3814 }
3815
3816 #[test]
3817 fn test_commonmark_verbatim_first_close() {
3818 // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
3819 // ends at the first matching close — not depth-aware. Stash a
3820 // bogus inner `<script>` inside a JS string; the outer block
3821 // still closes at the first `</script>`.
3822 let input = "<script>\nlet x = '<script>';\n</script>\n";
3823 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3824 let mut builder = GreenNodeBuilder::new();
3825
3826 // is_commonmark = true.
3827 let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3828 let opts = ParserOptions::default();
3829 let new_pos = parse_html_block_with_wrapper(
3830 &mut builder,
3831 &lines,
3832 0,
3833 block_type,
3834 &ContainerPrefix::default(),
3835 SyntaxKind::HTML_BLOCK,
3836 &opts,
3837 );
3838 // Three lines, closed at first `</script>` (line 2). new_pos = 3.
3839 assert_eq!(new_pos, 3);
3840 }
3841
3842 #[test]
3843 fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
3844 // Multi-line open tag with the closing `>` on its own line:
3845 //
3846 // <div
3847 // id="x"
3848 // class="y"
3849 // >
3850 //
3851 // foo
3852 //
3853 // </div>
3854 //
3855 // Open tag spans lines 0..=3. Content starts at line 4.
3856 let input = "<div\n id=\"x\"\n class=\"y\"\n>\n\nfoo\n\n</div>\n";
3857 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3858 let mut builder = GreenNodeBuilder::new();
3859
3860 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3861 let opts = ParserOptions::default();
3862 let new_pos = parse_html_block_with_wrapper(
3863 &mut builder,
3864 &lines,
3865 0,
3866 block_type,
3867 &ContainerPrefix::default(),
3868 SyntaxKind::HTML_BLOCK_DIV,
3869 &opts,
3870 );
3871
3872 // 8 lines: open-line 0, open-line 1 (` id="x"`), open-line 2
3873 // (` class="y"`), open-line 3 (`>`), blank, foo, blank, </div>.
3874 assert_eq!(new_pos, 8);
3875
3876 // CST must contain a structural HTML_ATTRS region holding the
3877 // attribute bytes (so the salsa anchor walk picks up `id="x"`).
3878 let green = builder.finish();
3879 let root = crate::syntax::SyntaxNode::new_root(green);
3880 let attrs_count = root
3881 .descendants()
3882 .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3883 .count();
3884 assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3885
3886 // Byte-identical losslessness check.
3887 let collected: String = root
3888 .descendants_with_tokens()
3889 .filter_map(|n| n.into_token())
3890 .map(|t| t.text().to_string())
3891 .collect();
3892 assert_eq!(collected, input);
3893 }
3894
3895 #[test]
3896 fn test_parse_div_block_multiline_open_close_inline_pandoc() {
3897 // Multi-line open tag with the closing `>` on the last attribute
3898 // line (case 0262 already covers this pattern; pin behavior to
3899 // also ensure HTML_ATTRS structural exposure).
3900 let input = "<div\n id=\"x\"\n class=\"y\">\nfoo\n</div>\n";
3901 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3902 let mut builder = GreenNodeBuilder::new();
3903
3904 let block_type = try_parse_html_block_start(lines[0], false).unwrap();
3905 let opts = ParserOptions::default();
3906 let new_pos = parse_html_block_with_wrapper(
3907 &mut builder,
3908 &lines,
3909 0,
3910 block_type,
3911 &ContainerPrefix::default(),
3912 SyntaxKind::HTML_BLOCK_DIV,
3913 &opts,
3914 );
3915
3916 // 5 lines: open-line 0, open-line 1, open-line 2 (with `>`), foo,
3917 // </div>.
3918 assert_eq!(new_pos, 5);
3919
3920 let green = builder.finish();
3921 let root = crate::syntax::SyntaxNode::new_root(green);
3922 let attrs_count = root
3923 .descendants()
3924 .filter(|n| n.kind() == SyntaxKind::HTML_ATTRS)
3925 .count();
3926 assert!(attrs_count >= 1, "expected at least one HTML_ATTRS node");
3927
3928 let collected: String = root
3929 .descendants_with_tokens()
3930 .filter_map(|n| n.into_token())
3931 .map(|t| t.text().to_string())
3932 .collect();
3933 assert_eq!(collected, input);
3934 }
3935
3936 #[test]
3937 fn test_commonmark_type6_blank_line_terminates() {
3938 let input = "<div>\nfoo\n\nbar\n";
3939 let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
3940 let mut builder = GreenNodeBuilder::new();
3941
3942 let block_type = try_parse_html_block_start(lines[0], true).unwrap();
3943 let opts = ParserOptions::default();
3944 let new_pos = parse_html_block_with_wrapper(
3945 &mut builder,
3946 &lines,
3947 0,
3948 block_type,
3949 &ContainerPrefix::default(),
3950 SyntaxKind::HTML_BLOCK,
3951 &opts,
3952 );
3953
3954 // Block contains <div>\nfoo\n; stops at blank line (line 2).
3955 assert_eq!(new_pos, 2);
3956 }
3957}