panache_parser/parser/blocks/html_blocks.rs
1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
10
/// HTML block-level tag names that start a CommonMark §4.6 type-6 HTML block
/// when found at the start of a line.
///
/// Every entry is lowercase ASCII; callers lowercase the candidate name
/// before looking it up. The verbatim (type-1) tags are kept separately in
/// `VERBATIM_TAGS`, and the Pandoc dialect consults `PANDOC_BLOCK_TAGS`
/// instead of this list.
///
/// NOTE(review): the list contains `source` and has no `search` entry, which
/// looks like the CommonMark 0.30 tag set rather than 0.31 — confirm which
/// spec revision this parser targets.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
77
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// Entries are lowercase ASCII. Block-start detection reports these with
/// `is_verbatim: true` when they appear as opening tags.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
80
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`). The verbatim tags
/// (`pre`, `script`, `style`, `textarea`) are listed here as well.
///
/// Pandoc's `eitherBlockOrInline` set is tracked separately: the non-void
/// members (`audio`, `button`, `iframe`, `noscript`, `object`, `map`,
/// `progress`, `video`, `del`, `ins`, `svg`, `applet`) live in
/// [`PANDOC_INLINE_BLOCK_TAGS`], and the void members (`embed`, `area`,
/// `source`, `track`) live in [`PANDOC_VOID_BLOCK_TAGS`]. `script` is the
/// exception: it appears in this strict-block list. The
/// `eitherBlockOrInline` tags act as block starters at fresh-block
/// positions but stay inline inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Every entry is lowercase ASCII; callers lowercase the candidate name
/// before looking it up.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
158
159/// Whether `name` (case-insensitive) is one of the HTML block-level tags
160/// recognized by CommonMark §4.6 type-6.
161pub fn is_html_block_tag_name(name: &str) -> bool {
162 let lower = name.to_ascii_lowercase();
163 BLOCK_TAGS.contains(&lower.as_str())
164}
165
166/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
167/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
168/// Used by the pandoc-native projector's `split_html_block_by_tags` to
169/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
170/// the block — block-level tags emit as separate `RawBlock` entries;
171/// inline tags stay inline in the surrounding `Plain` content.
172pub fn is_pandoc_block_tag_name(name: &str) -> bool {
173 let lower = name.to_ascii_lowercase();
174 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
175}
176
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`). At top level (or after a blank line)
/// pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but
/// inside an existing HTML block once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Void elements (`area`, `embed`, `source`, `track`) live in
/// [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
/// rule as non-void inline-block tags but emit a single RawBlock per
/// instance instead of a matched-pair lift.
/// `script` is omitted because it is already verbatim (handled by the
/// `<script>...</script>` raw-text path) and the strict-block check
/// fires first regardless.
///
/// Every entry is lowercase ASCII; callers lowercase the candidate name
/// before looking it up.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
201
202/// Whether `name` (case-insensitive) is one of pandoc's
203/// `eitherBlockOrInline` tags (excluding void elements and `script`;
204/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
205pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
206 let lower = name.to_ascii_lowercase();
207 PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
208}
209
/// Pandoc's void-element subset of `eitherBlockOrInline` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`'s void list
/// minus those handled elsewhere: `br` and `wbr` are inline-only;
/// `img` and `input` are inline-only; HTML void elements that pandoc
/// classifies as `eitherBlockOrInline` are `area`, `embed`, `source`,
/// `track`).
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Every entry is lowercase ASCII; callers lowercase the candidate name
/// before looking it up.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
225
226/// Whether `name` (case-insensitive) is one of pandoc's void
227/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
228pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
229 let lower = name.to_ascii_lowercase();
230 PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
231}
232
233/// Whether the given tag name is eligible for the Phase 6 / Fix #4
234/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
235/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
236/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
237/// and NOT void. These are the tags where pandoc parses the body as
238/// fresh markdown between RawBlock emissions of the open/close tags —
239/// exactly the shape we can lift into structural CST children.
240///
241/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
242/// additional gate at the lift-gate site: the lift is abandoned when
243/// the body's first non-blank content is a void block tag at a
244/// fresh-block position (`<video>\n<source ...>\n</video>` projects
245/// per-tag rather than matched-pair, mirroring pandoc).
246///
247/// `<div>` is intentionally excluded — it has its own lift path
248/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
249/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
250pub(crate) fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
251 let lower = name.to_ascii_lowercase();
252 if VERBATIM_TAGS.contains(&lower.as_str()) {
253 return false;
254 }
255 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
256 return false;
257 }
258 if lower == "div" {
259 return false;
260 }
261 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
262 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
263}
264
265/// Whether `name` (case-insensitive) is a Pandoc matched-pair block tag
266/// — anything that has an opening and a matching closing form whose
267/// `</tag>` would be recognized by the dispatcher as a separate block
268/// start. Covers strict-block tags (incl. `<div>`), inline-block tags,
269/// and verbatim tags (`<pre>`, `<style>`, `<script>`, `<textarea>`).
270/// Void tags are excluded — they have no close form.
271///
272/// Used by `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to detect
273/// an open inside the buffer whose close would otherwise interrupt the
274/// list item mid-construct.
275pub(crate) fn is_pandoc_matched_pair_tag(name: &str) -> bool {
276 let lower = name.to_ascii_lowercase();
277 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
278 return false;
279 }
280 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
281 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
282 || VERBATIM_TAGS.contains(&lower.as_str())
283}
284
285/// Open-tag-attribute tokenization gate for non-div strict-block tags
286/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
287/// open tag is eligible for finer-grained tokenization
288/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
289/// driving the full body lift — that's the `bq_clean_lift` path. The
290/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
291/// the salsa anchor index.
292///
293/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
294/// wrapper) regardless of bq depth, so this gate skips it.
295fn bq_strict_attr_emit_tag_name(
296 wrapper_kind: SyntaxKind,
297 block_type: &HtmlBlockType,
298 bq_depth: usize,
299) -> Option<&str> {
300 if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
301 return None;
302 }
303 match block_type {
304 HtmlBlockType::BlockTag {
305 tag_name,
306 is_verbatim: false,
307 closed_by_blank_line: false,
308 depth_aware: true,
309 closes_at_open_tag: false,
310 is_closing: false,
311 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
312 _ => None,
313 }
314}
315
/// Information about a detected HTML block opening, as produced by
/// `try_parse_html_block_start`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Block-level tag. `tag_name` is the lowercased tag name: under
    /// CommonMark it comes from `BLOCK_TAGS` or `VERBATIM_TAGS` (types
    /// 6/1); under the Pandoc dialect it comes from `PANDOC_BLOCK_TAGS`,
    /// `PANDOC_INLINE_BLOCK_TAGS`, `PANDOC_VOID_BLOCK_TAGS` or
    /// `VERBATIM_TAGS`.
    /// `is_verbatim` marks tags whose body is raw content
    /// (`VERBATIM_TAGS`).
    /// Set `closed_by_blank_line` to use CommonMark §4.6 type-6 end
    /// semantics (block ends at blank line); otherwise the legacy "ends
    /// at matching `</tag>`" semantics apply.
    /// `depth_aware` extends the matching-tag close path with balanced
    /// open/close tracking of the same tag name (mirrors pandoc's
    /// `htmlInBalanced`); used under Pandoc dialect to handle nested
    /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored when
    /// `closed_by_blank_line` is true.
    /// `closes_at_open_tag` short-circuits the close search: the block
    /// always ends after the open-tag line. Used for void
    /// `eitherBlockOrInline` tags (`<embed>`, `<area>`, `<source>`,
    /// `<track>`) which have no closing tag — depth-aware matching
    /// would walk to end-of-input — and for the standalone closing forms
    /// recognized under the Pandoc dialect.
    /// `is_closing` records whether the tag at the start position is a
    /// closing form (`</tag>`) rather than an opening form (`<tag>`).
    /// The dispatcher's `cannot_interrupt` consults this to mirror
    /// pandoc's `isInlineTag` special cases (e.g. `</script>` is inline
    /// even when `<script>` is not — pandoc treats the close-form as
    /// always-inline regardless of attributes).
    BlockTag {
        tag_name: String,
        is_verbatim: bool,
        closed_by_blank_line: bool,
        depth_aware: bool,
        closes_at_open_tag: bool,
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
360
361/// Try to detect an HTML block opening from content.
362/// Returns block type if this is a valid HTML block start.
363///
364/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
365/// accept closing tags (`</div>`), type-6 blocks end at the next blank
366/// line (rather than a matching close tag), and type 7 is recognized.
367pub(crate) fn try_parse_html_block_start(
368 content: &str,
369 is_commonmark: bool,
370) -> Option<HtmlBlockType> {
371 let trimmed = strip_leading_spaces(content);
372
373 // Must start with <
374 if !trimmed.starts_with('<') {
375 return None;
376 }
377
378 // HTML comment
379 if trimmed.starts_with("<!--") {
380 return Some(HtmlBlockType::Comment);
381 }
382
383 // Processing instruction
384 if trimmed.starts_with("<?") {
385 return Some(HtmlBlockType::ProcessingInstruction);
386 }
387
388 // CDATA section — CommonMark dialect only. Pandoc-markdown does not
389 // recognize bare CDATA as a raw HTML block; the literal bytes fall
390 // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
391 // text is parsed as inline markdown, etc).
392 if is_commonmark && trimmed.starts_with("<![CDATA[") {
393 return Some(HtmlBlockType::CData);
394 }
395
396 // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
397 // does not recognize bare declarations as raw HTML blocks (its
398 // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
399 // tag-shaped blocks); the bytes fall through to paragraph parsing.
400 if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
401 let after_bang = &trimmed[2..];
402 if after_bang.chars().next()?.is_ascii_alphabetic() {
403 return Some(HtmlBlockType::Declaration);
404 }
405 }
406
407 // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
408 // Pandoc-native recognizes standalone closing forms of strict-block tags
409 // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
410 // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
411 // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
412 // the open-tag line via `closes_at_open_tag: true`.
413 if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
414 let tag_lower = tag_name.to_lowercase();
415 let is_closing = trimmed.starts_with("</");
416
417 // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
418 // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
419 // Unlike inline-block / void closes, these CAN interrupt a running
420 // paragraph (the dispatcher's `cannot_interrupt` only covers the
421 // inline-block / void categories). Inline-block / void closes are
422 // handled by their own branches further below.
423 if !is_commonmark
424 && is_closing
425 && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
426 || VERBATIM_TAGS.contains(&tag_lower.as_str()))
427 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
428 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
429 {
430 return Some(HtmlBlockType::BlockTag {
431 tag_name: tag_lower,
432 is_verbatim: false,
433 closed_by_blank_line: false,
434 depth_aware: false,
435 closes_at_open_tag: true,
436 is_closing: true,
437 });
438 }
439
440 // Under Pandoc, remaining closing forms (truly inline-only tags like
441 // `</em>`, `</span>`) are not block starts — fall through to the
442 // existing inline-html path. Inline-block + void closes are caught
443 // by the dedicated branches further below.
444 if !is_commonmark
445 && is_closing
446 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
447 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
448 {
449 return None;
450 }
451
452 // Check if it's a block-level tag. Pandoc and CommonMark disagree on
453 // membership: pandoc's `blockHtmlTags` (see
454 // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
455 // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
456 // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
457 let is_block_tag = if is_commonmark {
458 BLOCK_TAGS.contains(&tag_lower.as_str())
459 } else {
460 PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
461 };
462 if is_block_tag {
463 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
464 return Some(HtmlBlockType::BlockTag {
465 tag_name: tag_lower,
466 is_verbatim,
467 closed_by_blank_line: is_commonmark && !is_verbatim,
468 depth_aware: !is_commonmark,
469 closes_at_open_tag: false,
470 is_closing,
471 });
472 }
473
474 // Pandoc dialect also treats `eitherBlockOrInline` tags as block
475 // starters at fresh-block positions. The block dispatcher caller
476 // gates these as `cannot_interrupt` (mirrors pandoc — they never
477 // interrupt a running paragraph; only start a fresh block when
478 // following a blank line or at document start). Closing forms
479 // (`</video>`) emit as a single-line `RawBlock` with no balanced
480 // match — pandoc-native pins this for standalone closes.
481 if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
482 return Some(HtmlBlockType::BlockTag {
483 tag_name: tag_lower,
484 is_verbatim: false,
485 closed_by_blank_line: false,
486 depth_aware: !is_closing,
487 closes_at_open_tag: is_closing,
488 is_closing,
489 });
490 }
491
492 // Pandoc dialect also recognizes the void subset of
493 // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
494 // These have no closing tag, so the parser closes the block
495 // immediately on the open-tag line; the projector's
496 // `split_html_block_by_tags` handles the same-line splitting
497 // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
498 // non-void inline-block tags, void tags never interrupt a
499 // running paragraph (gated as `cannot_interrupt` in the
500 // dispatcher). Closing forms (`</embed>`) — semantically
501 // nonsensical for void elements — pandoc still emits as a
502 // single-line `RawBlock`; mirror that.
503 if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
504 return Some(HtmlBlockType::BlockTag {
505 tag_name: tag_lower,
506 is_verbatim: false,
507 closed_by_blank_line: false,
508 depth_aware: false,
509 closes_at_open_tag: true,
510 is_closing,
511 });
512 }
513
514 // Also accept verbatim tags even if not in BLOCK_TAGS list — but
515 // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
516 // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
517 // do not start a type-1 block. Letting `</pre>` through here would
518 // wrongly interrupt a paragraph.
519 if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
520 return Some(HtmlBlockType::BlockTag {
521 tag_name: tag_lower,
522 is_verbatim: true,
523 closed_by_blank_line: false,
524 depth_aware: !is_commonmark,
525 closes_at_open_tag: false,
526 is_closing: false,
527 });
528 }
529 }
530
531 // Type 7 (CommonMark only): complete open or close tag on a line by
532 // itself, tag name not in the type-1 verbatim list.
533 if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
534 {
535 let rest = &trimmed[end..];
536 let only_ws = rest
537 .bytes()
538 .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
539 if only_ws {
540 // Reject if the tag name belongs to the type-1 verbatim set
541 // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
542 // type-1 starts above, so seeing one here means the opener
543 // had a different shape (e.g. `<pre/>` self-closing) that
544 // shouldn't trigger type 7 either. Conservatively skip.
545 let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
546 let name_end = leading
547 .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
548 .unwrap_or(leading.len());
549 let name = leading[..name_end].to_ascii_lowercase();
550 if !VERBATIM_TAGS.contains(&name.as_str()) {
551 return Some(HtmlBlockType::Type7);
552 }
553 }
554 }
555
556 None
557}
558
/// Pull the tag name out of `text` for HTML-block-start detection.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted only
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form). The name runs up to the first whitespace character, `>`, or
/// `/`, or to the end of `text`. This approximates the spec's requirement
/// that the tag be followed by a space, tab, line ending, `>`, or `/>`.
///
/// Returns the name with its original casing, or `None` when the name is
/// empty, does not start with an ASCII letter, or contains anything other
/// than ASCII letters and digits.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let body = text.strip_prefix('<')?;

    let candidate = match body.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => body,
    };

    // `split` always yields a first piece, possibly empty.
    let name = candidate
        .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
        .next()
        .unwrap_or("");

    let mut letters = name.chars();
    let first = letters.next()?;
    let well_formed = first.is_ascii_alphabetic() && letters.all(|c| c.is_ascii_alphanumeric());

    well_formed.then(|| name.to_owned())
}
603
604/// Whether this block type ends at a blank line (CommonMark types 6 & 7
605/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
606/// marker — only at end of input or the next blank line.
607fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
608 matches!(
609 block_type,
610 HtmlBlockType::Type7
611 | HtmlBlockType::BlockTag {
612 closed_by_blank_line: true,
613 ..
614 }
615 )
616}
617
618/// Check if a line contains the closing marker for the given HTML block type.
619/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
620/// blank-line-terminated types (6 in CommonMark, 7) never match here.
621fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
622 match block_type {
623 HtmlBlockType::Comment => line.contains("-->"),
624 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
625 HtmlBlockType::Declaration => line.contains('>'),
626 HtmlBlockType::CData => line.contains("]]>"),
627 HtmlBlockType::BlockTag {
628 tag_name,
629 closed_by_blank_line: false,
630 ..
631 } => {
632 // Look for closing tag </tagname>
633 let closing_tag = format!("</{}>", tag_name);
634 line.to_lowercase().contains(&closing_tag)
635 }
636 HtmlBlockType::BlockTag {
637 closed_by_blank_line: true,
638 ..
639 }
640 | HtmlBlockType::Type7 => false,
641 }
642}
643
/// Count the opening (`<tag_name ...>`) and closing (`</tag_name>`) tags
/// named `tag_name` in `line`, returned as `(opens, closes)`.
///
/// The scan walks `<...>` brackets one at a time and respects `"`/`'`
/// quoting, so a tag name inside a quoted attribute value is NOT counted.
/// Self-closing forms (`<tag .../>`) are not counted as opens. The name
/// comparison is ASCII-case-insensitive, and the name must be followed by
/// a space, tab, line ending, `>`, `/`, or the end of `line`.
///
/// A `<` with no terminating `>` ends the scan. If that unterminated
/// bracket names the tag, it is still counted.
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`), and by
/// `ListItemBuffer::unclosed_pandoc_matched_pair_tag` to suppress the
/// close-form dispatch that would otherwise break the list-item buffer
/// mid-`<div>...</div>`.
pub(crate) fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    /// Index of the first unquoted `>` at or after `from`, if any.
    fn bracket_end(bytes: &[u8], from: usize) -> Option<usize> {
        let mut open_quote: Option<u8> = None;
        for (idx, &byte) in bytes.iter().enumerate().skip(from) {
            match open_quote {
                Some(q) => {
                    if byte == q {
                        open_quote = None;
                    }
                }
                None => match byte {
                    b'"' | b'\'' => open_quote = Some(byte),
                    b'>' => return Some(idx),
                    _ => {}
                },
            }
        }
        None
    }

    let raw = line.as_bytes();
    let folded_line = line.to_ascii_lowercase();
    let folded = folded_line.as_bytes();
    let needle_owned = tag_name.to_ascii_lowercase();
    let needle = needle_owned.as_bytes();

    let (mut opens, mut closes) = (0usize, 0usize);
    let mut cursor = 0usize;

    while let Some(rel) = raw[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let is_close = raw.get(lt + 1) == Some(&b'/');
        let name_start = lt + 1 + usize::from(is_close);
        let name_end = name_start + needle.len();

        let matched = folded.get(name_start..name_end) == Some(needle);
        let at_boundary = matched
            && matches!(
                raw.get(name_end).copied(),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/')
            );

        // Find where this bracket ends, skipping quoted attribute values.
        let scan_from = if matched { name_end } else { lt + 1 };
        let gt = bracket_end(raw, scan_from);
        let self_closing = gt.is_some_and(|g| g > lt + 1 && raw[g - 1] == b'/');

        if at_boundary {
            if is_close {
                closes += 1;
            } else if !self_closing {
                opens += 1;
            }
        }

        match gt {
            Some(g) => cursor = g + 1,
            // Unterminated `<...`: the remaining bytes don't form a
            // complete tag, so stop scanning.
            None => break,
        }
    }

    (opens, closes)
}
724
725/// Pandoc-dialect lift for HTML comments / processing instructions
726/// whose close marker is followed by additional bytes (same-line
727/// trailing or following lines). Pandoc-native emits a `RawBlock` for
728/// the marker bytes only, then parses the remainder as fresh blocks.
729///
730/// Returns `Some(consumed_lines)` when the split fires (caller must
731/// NOT enter the legacy emission); `None` to fall back to the legacy
732/// path (no close marker found, or no trailing content to split).
733///
734/// CST shape on success:
735/// ```text
736/// HTML_BLOCK
737/// HTML_BLOCK_TAG (open) // line[0] up to and incl close marker
738/// TEXT "<!-- hi -->" // or with HTML_BLOCK_CONTENT in between
739/// ... // for multi-line `<!--\n…\n-->` shape
740/// <sibling blocks> // recursive parse of trailing + lines[M+1..]
741/// ```
742fn try_parse_comment_pi_with_trailing_split(
743 builder: &mut GreenNodeBuilder<'static>,
744 lines: &[&str],
745 start_pos: usize,
746 block_type: &HtmlBlockType,
747 wrapper_kind: SyntaxKind,
748 bq_depth: usize,
749 config: &ParserOptions,
750) -> Option<usize> {
751 let marker: &str = match block_type {
752 HtmlBlockType::Comment => "-->",
753 HtmlBlockType::ProcessingInstruction => "?>",
754 _ => return None,
755 };
756
757 // Find the close marker in the bq-stripped line content. For
758 // bq_depth == 0 the inner content equals the raw line; for
759 // bq_depth > 0 we look past the `>` markers stripped by the
760 // outer dispatcher (line 0) and emitted as bq prefix below
761 // (lines > 0). `marker_end_in_inner` is the byte offset of the
762 // first byte AFTER the close marker, measured from the start
763 // of the inner (post-strip) content.
764 let mut close_line_idx: Option<usize> = None;
765 let mut marker_end_in_inner: usize = 0;
766 for (offset, line) in lines[start_pos..].iter().enumerate() {
767 let inner = if bq_depth > 0 {
768 strip_n_blockquote_markers(line, bq_depth)
769 } else {
770 line
771 };
772 if let Some(pos) = inner.find(marker) {
773 close_line_idx = Some(start_pos + offset);
774 marker_end_in_inner = pos + marker.len();
775 break;
776 }
777 }
778 let close_line_idx = close_line_idx?;
779 let close_line = lines[close_line_idx];
780 let close_inner = if bq_depth > 0 {
781 strip_n_blockquote_markers(close_line, bq_depth)
782 } else {
783 close_line
784 };
785 let close_prefix_len = close_line.len() - close_inner.len();
786 let trailing = &close_inner[marker_end_in_inner..];
787
788 // Only fire when there is non-whitespace content AFTER the close
789 // marker on the close line. The legacy path correctly handles
790 // the close-line-ends-at-close-marker shapes (`-->\n` followed
791 // by separate blocks); only the same-line-trailing case needs
792 // structural splitting. Trailing-whitespace-only handling
793 // (`--> \n`) is a projector-side trim — separate concern.
794 let has_non_ws_trailing = trailing.bytes().any(|b| !b.is_ascii_whitespace());
795 if !has_non_ws_trailing {
796 return None;
797 }
798
799 builder.start_node(wrapper_kind.into());
800
801 // Emit open `HTML_BLOCK_TAG` (the opening marker line(s)) and any
802 // middle `HTML_BLOCK_CONTENT` lines between open and close. The
803 // close `HTML_BLOCK_TAG` carries only the bytes up to and
804 // including the close marker — trailing bytes go to the sibling.
805 if close_line_idx == start_pos {
806 // Same-line shape: one HTML_BLOCK_TAG containing the close
807 // marker's bytes. The newline lives on the trailing sibling.
808 // Line 0's bq prefix (if any) was already emitted by the
809 // outer dispatcher; emit only the inner marker bytes.
810 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
811 let close_part = &close_inner[..marker_end_in_inner];
812 if !close_part.is_empty() {
813 builder.token(SyntaxKind::TEXT.into(), close_part);
814 }
815 builder.finish_node();
816 } else {
817 // Multi-line shape: open tag covers lines[start_pos..close],
818 // middle lines go inside HTML_BLOCK_CONTENT, close tag holds
819 // only the marker bytes. Line 0's bq prefix was emitted by
820 // the outer dispatcher; subsequent lines (middle + close)
821 // need bq prefix re-emission inside the wrapper.
822 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
823 let first_line = lines[start_pos];
824 let first_inner = if bq_depth > 0 {
825 strip_n_blockquote_markers(first_line, bq_depth)
826 } else {
827 first_line
828 };
829 let (line_no_nl, nl) = strip_newline(first_inner);
830 if !line_no_nl.is_empty() {
831 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
832 }
833 if !nl.is_empty() {
834 builder.token(SyntaxKind::NEWLINE.into(), nl);
835 }
836 builder.finish_node();
837
838 if close_line_idx > start_pos + 1 {
839 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
840 for content_line in &lines[start_pos + 1..close_line_idx] {
841 emit_html_block_line(builder, content_line, bq_depth);
842 }
843 builder.finish_node();
844 }
845
846 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
847 if bq_depth > 0 && close_prefix_len > 0 {
848 emit_bq_prefix_tokens(builder, &close_line[..close_prefix_len]);
849 }
850 let close_part = &close_inner[..marker_end_in_inner];
851 if !close_part.is_empty() {
852 builder.token(SyntaxKind::TEXT.into(), close_part);
853 }
854 builder.finish_node();
855 }
856
857 builder.finish_node(); // HTML_BLOCK
858
859 // Recursively parse JUST the trailing bytes on the close line
860 // and graft top-level children as siblings of the HTML_BLOCK we
861 // just closed. We do NOT consume subsequent lines here — the
862 // outer dispatcher continues from `close_line_idx + 1` and
863 // handles container-boundary lines (`:::` div closes, blockquote
864 // markers, list-marker continuations) correctly. Multi-line
865 // softbreak continuation (`<!-- --> trailing\nmore\n` →
866 // `Para [trailing, SoftBreak, more]`) is NOT modeled — the
867 // outer dispatcher sees `more` after the close line and starts
868 // a fresh paragraph. Refdefs flow through from the outer config
869 // (same pattern as `emit_html_block_body_lifted_inner`).
870 if !trailing.is_empty() {
871 let mut inner_options = config.clone();
872 let refdefs = config.refdef_labels.clone().unwrap_or_default();
873 inner_options.refdef_labels = Some(refdefs.clone());
874 let inner_root = crate::parser::parse_with_refdefs(trailing, Some(inner_options), refdefs);
875 let mut bq = None;
876 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
877 }
878
879 Some(close_line_idx + 1)
880}
881
882/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
883/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
884/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
885/// identical to the source either way; only the wrapper retag changes.
886pub(crate) fn parse_html_block_with_wrapper(
887 builder: &mut GreenNodeBuilder<'static>,
888 lines: &[&str],
889 start_pos: usize,
890 block_type: HtmlBlockType,
891 bq_depth: usize,
892 wrapper_kind: SyntaxKind,
893 config: &ParserOptions,
894) -> usize {
895 // Pandoc-dialect Comment / PI trailing-text split. Pandoc-native
896 // closes the RawBlock at the close marker (`-->` / `?>`) and parses
897 // any subsequent bytes (same-line trailing or following lines) as
898 // fresh blocks. The legacy path absorbs them into the HTML block
899 // wrapper, producing one oversized RawBlock. Handle the split here
900 // before entering the legacy emission so the CST encodes the
901 // sibling structure.
902 if config.dialect == crate::options::Dialect::Pandoc
903 && matches!(
904 block_type,
905 HtmlBlockType::Comment | HtmlBlockType::ProcessingInstruction
906 )
907 && let Some(consumed) = try_parse_comment_pi_with_trailing_split(
908 builder,
909 lines,
910 start_pos,
911 &block_type,
912 wrapper_kind,
913 bq_depth,
914 config,
915 )
916 {
917 return consumed;
918 }
919
920 // Start HTML block
921 builder.start_node(wrapper_kind.into());
922
923 let first_line = lines[start_pos];
924 let blank_terminated = ends_at_blank_line(&block_type);
925
926 // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
927 // tokens for the first line's blockquote prefix; emit only the inner
928 // content as TEXT to keep the CST byte-equal to the source.
929 let first_inner = if bq_depth > 0 {
930 strip_n_blockquote_markers(first_line, bq_depth)
931 } else {
932 first_line
933 };
934
935 // Detect a multi-line open tag.
936 // - `<div>` (Pandoc lift): we tokenize each line structurally so the
937 // salsa anchor walk picks up `id` from the HTML_ATTRS region.
938 // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
939 // `<section>`, `<header>`, …): same structural emission, exposing
940 // `id` to the salsa anchor walk and enabling the body lift below.
941 // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
942 // without this, the parser closes the block after line 0 and the
943 // remainder of the open tag falls into following paragraphs;
944 // pandoc-native treats the whole multi-line open tag as a single
945 // `RawBlock`. Emission for void tags uses simple per-line
946 // TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
947 // from void tags).
948 let multiline_open_end = match (wrapper_kind, &block_type) {
949 (SyntaxKind::HTML_BLOCK_DIV, _) => {
950 find_multiline_open_end(lines, start_pos, first_inner, "div", bq_depth)
951 }
952 (
953 _,
954 HtmlBlockType::BlockTag {
955 tag_name,
956 closes_at_open_tag: true,
957 ..
958 },
959 ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name, bq_depth),
960 (
961 _,
962 HtmlBlockType::BlockTag {
963 tag_name,
964 is_verbatim: false,
965 closed_by_blank_line: false,
966 depth_aware: true,
967 closes_at_open_tag: false,
968 is_closing: false,
969 },
970 ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
971 find_multiline_open_end(lines, start_pos, first_inner, tag_name, bq_depth)
972 }
973 _ => None,
974 };
975
976 // Set up depth-aware close tracking when the block type asks for it
977 // (Pandoc dialect, balanced same-name tag matching). A `None` means
978 // we fall back to the legacy "first matching close" path via
979 // `is_closing_marker`. Computed up front so the lift-mode gate
980 // below can decide whether the open line already balances the
981 // block (same-line `<div>...</div>`).
982 let depth_aware_tag: Option<String> = match &block_type {
983 HtmlBlockType::BlockTag {
984 tag_name,
985 closed_by_blank_line: false,
986 depth_aware: true,
987 ..
988 } => Some(tag_name.clone()),
989 _ => None,
990 };
991 let mut depth: i64 = 1;
992 if let Some(tag_name) = &depth_aware_tag {
993 // Sum opens/closes across all open-tag lines (single-line: just
994 // line 0; multi-line: lines 0..=end_line_idx).
995 let last_open_line = multiline_open_end.unwrap_or(start_pos);
996 let mut opens = 0usize;
997 let mut closes = 0usize;
998 for line in &lines[start_pos..=last_open_line] {
999 let inner = if bq_depth > 0 {
1000 strip_n_blockquote_markers(line, bq_depth)
1001 } else {
1002 line
1003 };
1004 let (o, c) = count_tag_balance(inner, tag_name);
1005 opens += o;
1006 closes += c;
1007 }
1008 depth = opens as i64 - closes as i64;
1009 }
1010
1011 // Same-line `<div>foo</div>` shape: the open line balances the
1012 // block under depth-aware tracking. We can lift this structurally
1013 // only when the open-tag trailing has exactly one `</div>` close,
1014 // zero `<div>` opens, and no non-whitespace content after the
1015 // close. Other same-line shapes (nested, trailing text, malformed)
1016 // fall through to the byte-reparse path.
1017 let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1018 && multiline_open_end.is_none()
1019 && depth_aware_tag.is_some()
1020 && depth <= 0;
1021 let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
1022 let (line_without_newline, _) = strip_newline(first_inner);
1023 probe_same_line_lift(line_without_newline, "div")
1024 };
1025
1026 // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
1027 // `<nav>`, …): the body parses as fresh markdown between RawBlock
1028 // emissions of the open/close tags. Covers the clean multi-line
1029 // shape (open tag stands alone on its line), open-trailing
1030 // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
1031 // and same-line (`<form>foo</form>`). Multi-line opens are accepted
1032 // by `strict_block_lift` below; blockquote-wrapped shapes are excluded
1033 // here (`bq_depth == 0`) and go through the separate bq lift gates.
1034 let strict_block_tag_name: Option<&str> =
1035 if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
1036 match &block_type {
1037 HtmlBlockType::BlockTag {
1038 tag_name,
1039 is_verbatim: false,
1040 closed_by_blank_line: false,
1041 depth_aware: true,
1042 closes_at_open_tag: false,
1043 is_closing: false,
1044 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1045 _ => None,
1046 }
1047 } else {
1048 None
1049 };
1050 // Same-line `<form>foo</form>` shape: the open line already
1051 // balances the block (`depth <= 0`). Lift only when the trailing
1052 // bytes after the open `>` end with `</tag>` and contain exactly
1053 // one close + zero nested opens.
1054 let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
1055 multiline_open_end.is_none() && depth <= 0 && {
1056 let (line_no_nl, _) = strip_newline(first_inner);
1057 probe_same_line_lift(line_no_nl, name)
1058 }
1059 });
1060 // Strict-block lift gate: accept (a) a multi-line open tag spanning
1061 // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
1062 // trailing single-line open (depth > 0, open `>` is present with
1063 // quote-aware matching), or (c) a safe same-line shape. For
1064 // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
1065 // …) the lift additionally abandons when the body starts at a
1066 // fresh-block position with a void block tag — pandoc-native pins
1067 // per-tag emission rather than a matched-pair lift in that case.
1068 let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
1069 let (line_no_nl, _) = strip_newline(first_inner);
1070 let shape_ok = if multiline_open_end.is_some() {
1071 // `find_multiline_open_end` already verified the open tag
1072 // closes with a quote-aware `>` somewhere in lines
1073 // `start_pos+1..=end`. No same-line trailing content to
1074 // probe; defer trailing-on-close-`>`-line handling to a
1075 // future session (rare in practice).
1076 true
1077 } else if depth > 0 {
1078 probe_open_tag_line_has_close_gt(line_no_nl, name)
1079 } else {
1080 same_line_strict_lift_safe
1081 };
1082 if !shape_ok {
1083 return false;
1084 }
1085 if !is_pandoc_inline_block_tag_name(name) {
1086 return true;
1087 }
1088 !inline_block_void_interior_abandons(
1089 first_inner,
1090 lines,
1091 start_pos,
1092 multiline_open_end,
1093 bq_depth,
1094 name,
1095 )
1096 });
1097
1098 // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
1099 // are byte-equal to the non-bq same-line shape minus the leading
1100 // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
1101 // The body has no inner newlines, so no bq prefix re-injection is
1102 // needed when grafting — `emit_html_block_body_lifted` (passing
1103 // `bq: &mut None`) is enough. Other bq shapes (butted-close,
1104 // open-trailing) need per-line prefix injection and are covered
1105 // by the messy-shape gate (`bq_messy_lift_tag`) instead.
1106 let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
1107 && multiline_open_end.is_none()
1108 && depth_aware_tag.is_some()
1109 && depth <= 0
1110 {
1111 let (line_no_nl, _) = strip_newline(first_inner);
1112 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1113 if probe_same_line_lift(line_no_nl, "div") {
1114 Some("div")
1115 } else {
1116 None
1117 }
1118 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1119 match &block_type {
1120 HtmlBlockType::BlockTag {
1121 tag_name,
1122 is_verbatim: false,
1123 closed_by_blank_line: false,
1124 depth_aware: true,
1125 closes_at_open_tag: false,
1126 is_closing: false,
1127 } if is_pandoc_lift_eligible_block_tag(tag_name)
1128 && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
1129 {
1130 // Inline-block tags (`<video>`, `<iframe>`, …) skip
1131 // the void-interior check at same-line — the shape
1132 // has no inner block content to interfere with.
1133 Some(tag_name.as_str())
1134 }
1135 _ => None,
1136 }
1137 } else {
1138 None
1139 }
1140 } else {
1141 None
1142 };
1143
1144 // Messy-shape lift inside a blockquote — covers open-trailing
1145 // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
1146 // and open-trailing + butted-close (`> <div>foo\n> bar</div>`),
1147 // including the multi-line-open variants (`> <div\n> id="x">foo\n>
1148 // body\n> </div>`) where the trailing is captured into `pre_content`
1149 // by `emit_multiline_open_tag_with_attrs` with `lift_trailing=true`.
1150 // The open line does NOT balance the block (depth > 0 after the
1151 // open line, distinguishing this from `same_line_bq_lift_tag` which
1152 // requires depth <= 0). The close line — possibly with leading body
1153 // text — closes the block when depth returns to 0. Body lines (incl.
1154 // open trailing and close leading) graft via prefix re-injection.
1155 let bq_messy_lift_tag: Option<&str> = if bq_depth > 0 && depth_aware_tag.is_some() && depth > 0
1156 {
1157 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1158 Some("div")
1159 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1160 match &block_type {
1161 HtmlBlockType::BlockTag {
1162 tag_name,
1163 is_verbatim: false,
1164 closed_by_blank_line: false,
1165 depth_aware: true,
1166 closes_at_open_tag: false,
1167 is_closing: false,
1168 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1169 // Inline-block matched-pair tags (`<video>`, `<iframe>`,
1170 // …) abandon the lift when the body starts at a
1171 // fresh-block position with a void block tag. Same gate
1172 // as the non-bq matched-pair lift (`strict_block_lift`).
1173 if is_pandoc_inline_block_tag_name(tag_name)
1174 && inline_block_void_interior_abandons(
1175 first_inner,
1176 lines,
1177 start_pos,
1178 multiline_open_end,
1179 bq_depth,
1180 tag_name,
1181 )
1182 {
1183 None
1184 } else {
1185 Some(tag_name.as_str())
1186 }
1187 }
1188 _ => None,
1189 }
1190 } else {
1191 None
1192 }
1193 } else {
1194 None
1195 };
1196
1197 // Multi-line open + matched close-on-the-open's-last-line shape inside
1198 // a blockquote (`> <div\n> id="x">foo</div>` and depth-aware variants:
1199 // nested same-tag, trailing close, trailing text, strict-block `<form>`).
1200 // Mirrors the non-bq `pre_content`-close branch (line ~1363) but inside
1201 // a blockquote. Distinguishing features from `bq_messy_lift_tag`: the
1202 // close is on the open's last line (`depth <= 0` after the open lines)
1203 // AND `multiline_open_end.is_some()`. The trailing bytes after the
1204 // last `>` get lifted into `pre_content` via
1205 // `emit_multiline_open_tag_with_attrs(... lift_trailing=true)`, then the
1206 // new branch below splits `pre_content` at the matched close marker
1207 // and grafts body + close + any trailing siblings.
1208 let bq_multiline_close_lift_tag: Option<&str> = if bq_depth > 0
1209 && multiline_open_end.is_some()
1210 && depth_aware_tag.is_some()
1211 && depth <= 0
1212 {
1213 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1214 Some("div")
1215 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1216 match &block_type {
1217 HtmlBlockType::BlockTag {
1218 tag_name,
1219 is_verbatim: false,
1220 closed_by_blank_line: false,
1221 depth_aware: true,
1222 closes_at_open_tag: false,
1223 is_closing: false,
1224 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
1225 if is_pandoc_inline_block_tag_name(tag_name)
1226 && inline_block_void_interior_abandons(
1227 first_inner,
1228 lines,
1229 start_pos,
1230 multiline_open_end,
1231 bq_depth,
1232 tag_name,
1233 )
1234 {
1235 None
1236 } else {
1237 Some(tag_name.as_str())
1238 }
1239 }
1240 _ => None,
1241 }
1242 } else {
1243 None
1244 }
1245 } else {
1246 None
1247 };
1248
1249 // Whether this block participates in the Phase 6 structural lift
1250 // (recursively parse body as Pandoc markdown and graft children).
1251 // Covers `<div>` outside blockquotes, the strict-block lift, and the
1252 // three bq lift gates. Same-line shapes lift only when
1253 // `probe_same_line_lift` accepts them — when unsafe we keep the legacy
1254 // single-HTML_BLOCK_TAG shape and let the byte-reparse path handle projection.
1255 let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1256 && bq_depth == 0
1257 && (!is_same_line_div || same_line_div_lift_safe))
1258 || strict_block_lift
1259 || same_line_bq_lift_tag.is_some()
1260 || bq_messy_lift_tag.is_some()
1261 || bq_multiline_close_lift_tag.is_some();
1262
1263 // Trailing content from the open tag (after `>`). When the lift is
1264 // active and the open line is `<div ATTRS>foo\n`, this captures
1265 // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1266 // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1267 // shapes (same-line / blockquote-wrapped).
1268 let mut pre_content = String::new();
1269
1270 // Emit opening line(s)
1271 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1272
1273 if let Some(end_line_idx) = multiline_open_end {
1274 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1275 emit_multiline_open_tag_with_attrs(
1276 builder,
1277 lines,
1278 start_pos,
1279 end_line_idx,
1280 "div",
1281 bq_depth,
1282 lift_mode,
1283 &mut pre_content,
1284 );
1285 } else if let Some(name) = strict_block_tag_name
1286 && strict_block_lift
1287 {
1288 emit_multiline_open_tag_with_attrs(
1289 builder,
1290 lines,
1291 start_pos,
1292 end_line_idx,
1293 name,
1294 bq_depth,
1295 lift_mode,
1296 &mut pre_content,
1297 );
1298 } else if let Some(name) = bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1299 {
1300 // Multi-line open of a lift-eligible strict-block tag inside a
1301 // blockquote (`> <section\n> id=...>`). The non-bq
1302 // `strict_block_tag_name` gate is `bq_depth == 0`; this branch
1303 // covers the bq side so the open tag emits HTML_ATTRS regions
1304 // for `AttributeNode::cast` and the projector's canonicalizer.
1305 //
1306 // `lift_trailing` mirrors the single-line `emit_open_tag_tokens`
1307 // call below: only push trailing bytes into `pre_content` when
1308 // the structural lift will consume them (bq messy lift). The
1309 // bq clean-lift requires `pre_content.is_empty()`, so for clean
1310 // multi-line opens the trailing is empty anyway and this is
1311 // a no-op.
1312 let lift_trailing =
1313 bq_messy_lift_tag == Some(name) || bq_multiline_close_lift_tag == Some(name);
1314 emit_multiline_open_tag_with_attrs(
1315 builder,
1316 lines,
1317 start_pos,
1318 end_line_idx,
1319 name,
1320 bq_depth,
1321 lift_trailing,
1322 &mut pre_content,
1323 );
1324 } else {
1325 emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx, bq_depth);
1326 }
1327 } else {
1328 let (line_without_newline, newline_str) = strip_newline(first_inner);
1329 if !line_without_newline.is_empty() {
1330 // For HTML_BLOCK_DIV, expose the open tag's attributes
1331 // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1332 // via the same descendants walk that handles fenced-div /
1333 // heading attrs. CST bytes stay byte-equal to source — we only
1334 // tokenize at finer granularity for matched div opens.
1335 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1336 let trailing =
1337 emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1338 if !trailing.is_empty() {
1339 pre_content.push_str(trailing);
1340 pre_content.push_str(newline_str);
1341 }
1342 } else if let Some(name) = strict_block_tag_name
1343 && strict_block_lift
1344 {
1345 let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1346 if !trailing.is_empty() {
1347 pre_content.push_str(trailing);
1348 pre_content.push_str(newline_str);
1349 }
1350 } else if let Some(name) =
1351 bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1352 {
1353 // Inside a blockquote, lift trailing bytes into
1354 // `pre_content` when either the same-line bq gate fires
1355 // (`> <tag>body</tag>` — handled by `same_line_closed`)
1356 // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1357 // </tag>` and butted-close — handled at the close-marker
1358 // site below). For the clean-shape bq lift the open has
1359 // no trailing bytes regardless, so `lift_trailing=true`
1360 // is a no-op there.
1361 let lift_trailing =
1362 same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1363 let trailing =
1364 emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1365 if lift_trailing && !trailing.is_empty() {
1366 pre_content.push_str(trailing);
1367 pre_content.push_str(newline_str);
1368 }
1369 } else {
1370 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1371 }
1372 }
1373 // When the open tag has trailing content under lift mode, the
1374 // newline belongs to that trailing line (it terminates the
1375 // synthetic body line, not the open tag). Don't double-emit.
1376 if pre_content.is_empty() && !newline_str.is_empty() {
1377 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1378 }
1379 }
1380
1381 builder.finish_node(); // HtmlBlockTag
1382
1383 // Check if opening line also contains closing marker. Blank-line-terminated
1384 // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1385 // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1386 // (`closes_at_open_tag: true`) close immediately — the block always
1387 // ends on the open-tag line since there is no closing tag to find.
1388 let void_block = matches!(
1389 &block_type,
1390 HtmlBlockType::BlockTag {
1391 closes_at_open_tag: true,
1392 ..
1393 }
1394 );
1395 // Void tags with a multi-line open close immediately after the open
1396 // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1397 // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1398 // a single RawBlock for the whole multi-line tag, with no following
1399 // content.
1400 if void_block && let Some(end_line_idx) = multiline_open_end {
1401 log::trace!(
1402 "HTML void block at line {} closes after multi-line open ending at line {}",
1403 start_pos + 1,
1404 end_line_idx + 1
1405 );
1406 builder.finish_node(); // HtmlBlock
1407 return end_line_idx + 1;
1408 }
1409 // Multi-line open with all matched closes on the open's last line:
1410 // `pre_content` holds the bytes after the last open `>` (lifted there
1411 // by `emit_multiline_open_tag_with_attrs` when `lift_trailing=true`).
1412 // When `depth <= 0` after the multi-line open and the trailing bytes
1413 // contain the depth-zero matched close, do the same-line lift on
1414 // `pre_content` directly. Mirrors the single-line `same_line_closed`
1415 // lift below — same body / close-marker / trailing-graft shape, just
1416 // consuming `end_line_idx + 1` lines instead of `start_pos + 1`.
1417 //
1418 // The body bytes of `pre_content` come from the open's last line,
1419 // which `emit_multiline_open_tag_with_attrs` already prefixed with the
1420 // re-emitted bq prefix tokens (for `bq_depth > 0`). The body and close
1421 // tag thus inherit the bq context without per-line prefix injection,
1422 // so `emit_html_block_body_lifted` (with `bq: &mut None`) suffices for
1423 // both the non-bq and bq variants of this shape.
1424 if let Some(end_line_idx) = multiline_open_end
1425 && !blank_terminated
1426 && depth_aware_tag.is_some()
1427 && depth <= 0
1428 && lift_mode
1429 && (bq_depth == 0 || bq_multiline_close_lift_tag.is_some())
1430 && !pre_content.is_empty()
1431 {
1432 let tag_name_opt: Option<&str> = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1433 Some("div")
1434 } else if strict_block_lift {
1435 strict_block_tag_name
1436 } else if let Some(name) = bq_multiline_close_lift_tag {
1437 Some(name)
1438 } else {
1439 None
1440 };
1441 if let Some(tag_name) = tag_name_opt {
1442 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1443 if let Some((leading, close_part)) =
1444 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1445 {
1446 let close_marker_end =
1447 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1448 let close_marker = &close_part[..close_marker_end];
1449 let same_line_trailing = &close_part[close_marker_end..];
1450 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1451 LastParaDemote::SkipTrailingBlanks
1452 } else {
1453 LastParaDemote::OnlyIfLast
1454 };
1455 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1456 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1457 if same_line_trailing.is_empty() {
1458 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1459 close_line.push_str(close_marker);
1460 close_line.push_str(post_nl);
1461 emit_html_block_line(builder, &close_line, 0);
1462 builder.finish_node();
1463 builder.finish_node(); // HtmlBlock
1464 } else {
1465 builder.token(SyntaxKind::TEXT.into(), close_marker);
1466 builder.finish_node(); // HTML_BLOCK_TAG
1467 builder.finish_node(); // HtmlBlock
1468
1469 let mut trailing_text =
1470 String::with_capacity(same_line_trailing.len() + post_nl.len());
1471 trailing_text.push_str(same_line_trailing);
1472 trailing_text.push_str(post_nl);
1473 let mut inner_options = config.clone();
1474 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1475 inner_options.refdef_labels = Some(refdefs.clone());
1476 let inner_root = crate::parser::parse_with_refdefs(
1477 &trailing_text,
1478 Some(inner_options),
1479 refdefs,
1480 );
1481 let mut bq = None;
1482 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1483 }
1484 return end_line_idx + 1;
1485 }
1486 }
1487 }
1488
1489 let same_line_closed = !blank_terminated
1490 && multiline_open_end.is_none()
1491 && (void_block
1492 || match &depth_aware_tag {
1493 Some(_) => depth <= 0,
1494 None => is_closing_marker(first_inner, &block_type),
1495 });
1496 if same_line_closed {
1497 log::trace!(
1498 "HTML block at line {} opens and closes on same line",
1499 start_pos + 1
1500 );
1501 // Same-line structural lift (div or non-div strict-block):
1502 // pre_content holds the bytes after the open `>` (including
1503 // the close `</tag>` and the trailing newline). Split into
1504 // body + close tag, emit body via recursive parse, emit close
1505 // tag as a sibling `HTML_BLOCK_TAG`.
1506 let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1507 None
1508 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1509 Some("div")
1510 } else if same_line_strict_lift_safe {
1511 strict_block_tag_name
1512 } else if let Some(name) = same_line_bq_lift_tag {
1513 // Bq same-line: body has no inner newlines so the standard
1514 // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1515 // sufficient. The bq prefix `> ` lives on the outer
1516 // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1517 Some(name)
1518 } else {
1519 None
1520 };
1521 if let Some(tag_name) = same_line_lift_tag {
1522 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1523 // Depth-aware split: handles `<tag>foo</tag>bar` (single
1524 // close, trailing text), `<tag>foo</tag></tag>` (matched
1525 // close + unmatched trailing close → sibling RawBlock),
1526 // and `<tag><tag>x</tag></tag>bar` (nested same-tag,
1527 // recursive body parse).
1528 if let Some((leading, close_part)) =
1529 try_split_close_line_depth_aware(pre_no_nl, tag_name)
1530 {
1531 // `close_part` starts with `</tag` and contains the close
1532 // marker followed by any same-line trailing text. Split
1533 // off the close marker bytes (`</tag>`) so the close
1534 // `HTML_BLOCK_TAG` carries only those bytes; trailing
1535 // text is parsed and grafted as a sibling block at the
1536 // parent level (matches pandoc-native shape:
1537 // `<div>foo</div>bar` → `Div [Plain[foo]] + Para [bar]`).
1538 let close_marker_end =
1539 split_close_marker_end(close_part, tag_name).unwrap_or(close_part.len());
1540 let close_marker = &close_part[..close_marker_end];
1541 let same_line_trailing = &close_part[close_marker_end..];
1542
1543 // Same-line is always close-butted; div demotes the
1544 // trailing Para→Plain via `SkipTrailingBlanks`.
1545 // Non-div strict-block uses `OnlyIfLast` (consistent
1546 // with butted-close — no trailing BLANK_LINE before
1547 // the close means the trailing Para demotes).
1548 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1549 LastParaDemote::SkipTrailingBlanks
1550 } else {
1551 LastParaDemote::OnlyIfLast
1552 };
1553 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1554 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1555 if same_line_trailing.is_empty() {
1556 let mut close_line = String::with_capacity(close_marker.len() + post_nl.len());
1557 close_line.push_str(close_marker);
1558 close_line.push_str(post_nl);
1559 emit_html_block_line(builder, &close_line, 0);
1560 builder.finish_node();
1561 builder.finish_node(); // HtmlBlock
1562 } else {
1563 // Close tag holds only the close-marker bytes;
1564 // trailing + newline graft as siblings of the
1565 // wrapper (matches pandoc's per-tag block split).
1566 builder.token(SyntaxKind::TEXT.into(), close_marker);
1567 builder.finish_node(); // HTML_BLOCK_TAG
1568 builder.finish_node(); // HtmlBlock
1569
1570 let mut trailing_text =
1571 String::with_capacity(same_line_trailing.len() + post_nl.len());
1572 trailing_text.push_str(same_line_trailing);
1573 trailing_text.push_str(post_nl);
1574 let mut inner_options = config.clone();
1575 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1576 inner_options.refdef_labels = Some(refdefs.clone());
1577 let inner_root = crate::parser::parse_with_refdefs(
1578 &trailing_text,
1579 Some(inner_options),
1580 refdefs,
1581 );
1582 let mut bq = None;
1583 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1584 }
1585 return start_pos + 1;
1586 }
1587 }
1588 builder.finish_node(); // HtmlBlock
1589 return start_pos + 1;
1590 }
1591
1592 let mut current_pos = multiline_open_end
1593 .map(|end| end + 1)
1594 .unwrap_or(start_pos + 1);
1595 let mut content_lines: Vec<&str> = Vec::new();
1596 let mut found_closing = false;
1597
1598 // Parse content until we find the closing marker
1599 while current_pos < lines.len() {
1600 let line = lines[current_pos];
1601 let (line_bq_depth, inner) = count_blockquote_markers(line);
1602
1603 // Only process lines at the same or deeper blockquote depth
1604 if line_bq_depth < bq_depth {
1605 break;
1606 }
1607
1608 // Blank-line-terminated blocks (types 6/7) end before the blank line.
1609 // The blank line itself is not part of the block.
1610 if blank_terminated && inner.trim().is_empty() {
1611 break;
1612 }
1613
1614 // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1615 // count opens/closes of the same tag name and only close when depth
1616 // returns to 0; otherwise fall back to substring-match on the line.
1617 let line_closes = match &depth_aware_tag {
1618 Some(tag_name) => {
1619 let (opens, closes) = count_tag_balance(inner, tag_name);
1620 depth += opens as i64;
1621 depth -= closes as i64;
1622 depth <= 0
1623 }
1624 None => is_closing_marker(inner, &block_type),
1625 };
1626
1627 if line_closes {
1628 log::trace!("Found HTML block closing at line {}", current_pos + 1);
1629 found_closing = true;
1630
1631 // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1632 // the open and close tags stand alone on their source lines
1633 // (no trailing on open, no body content on close after
1634 // stripping bq markers), lift the body lines structurally
1635 // so the projector walks CST children instead of
1636 // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1637 //
1638 // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1639 // grafted, Para preserved), non-div strict-block tags
1640 // (`<form>`, `<section>`, …) and inline-block matched-pair
1641 // tags (`<video>`, `<iframe>`, …) — the latter two under
1642 // HTML_BLOCK with the structural lift hitting pandoc's
1643 // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1644 // demotion. Inline-block additionally bails if the body
1645 // starts at a fresh-block position with a void block tag
1646 // (mirrors the non-bq matched-pair gate).
1647 //
1648 // Other bq-wrapped shapes skip this clean-shape lift: same-line
1649 // returned earlier (`same_line_closed`); butted-close and
1650 // open-trailing fall to the messy-shape lift below. Multi-line
1651 // opens are allowed here as of 2026-05-12: the open
1652 // `HTML_BLOCK_TAG` was emitted (potentially with HTML_ATTRS per
1653 // attr line and per-line bq prefix tokens) by the bq-aware
1654 // `emit_multiline_open_tag_with_attrs`. Trailing bytes on the last
1655 // open line land in `pre_content` only when `lift_trailing` was
1656 // set; either way multi-line + trailing never lifts here (a
1657 // non-empty `pre_content` fails the gate, un-lifted trailing fails
1658 // the ends-with-`>` check below).
1659 let bq_lift_tag: Option<&str> = if bq_depth > 0 && pre_content.is_empty() {
1660 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1661 Some("div")
1662 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1663 match &block_type {
1664 HtmlBlockType::BlockTag {
1665 tag_name,
1666 is_verbatim: false,
1667 closed_by_blank_line: false,
1668 depth_aware: true,
1669 closes_at_open_tag: false,
1670 is_closing: false,
1671 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1672 _ => None,
1673 }
1674 } else {
1675 None
1676 }
1677 } else {
1678 None
1679 };
1680
1681 let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1682 // Open-shape: last open line must end with `>` (clean
1683 // close-of-open). For single-line, that's `first_inner`
1684 // (already bq-stripped); for multi-line, strip bq markers
1685 // from `lines[end_line_idx]` and check the same.
1686 let last_open_line: &str = match multiline_open_end {
1687 None => first_inner,
1688 Some(end) if bq_depth > 0 => strip_n_blockquote_markers(lines[end], bq_depth),
1689 Some(end) => lines[end],
1690 };
1691 let (open_no_nl, _) = strip_newline(last_open_line);
1692 if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1693 return false;
1694 }
1695 let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1696 let (close_no_nl, _) = strip_newline(close_stripped);
1697 if !close_no_nl
1698 .trim_start_matches([' ', '\t'])
1699 .starts_with("</")
1700 {
1701 return false;
1702 }
1703 if is_pandoc_inline_block_tag_name(tag_name)
1704 && inline_block_void_interior_abandons(
1705 first_inner,
1706 lines,
1707 start_pos,
1708 multiline_open_end,
1709 bq_depth,
1710 tag_name,
1711 )
1712 {
1713 return false;
1714 }
1715 true
1716 });
1717
1718 if bq_clean_lift {
1719 let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1720 LastParaDemote::Never
1721 } else {
1722 LastParaDemote::OnlyIfLast
1723 };
1724 emit_html_block_body_lifted_bq(
1725 builder,
1726 &content_lines,
1727 bq_depth,
1728 demote_policy,
1729 config,
1730 );
1731 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1732 emit_html_block_line(builder, line, bq_depth);
1733 builder.finish_node();
1734 current_pos += 1;
1735 break;
1736 }
1737
1738 // Bq messy-shape lift — single-line open with trailing or
1739 // butted-close (or both). `pre_content` already captures any
1740 // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1741 // strip the close line's bq markers before splitting so
1742 // `leading` and `close_part` are bq-prefix-free. Body parses
1743 // recursively from `pre_content + stripped(content_lines) +
1744 // leading`, with per-line bq prefixes re-injected so the CST
1745 // stays byte-equal to the source. Demote: div is keyed on
1746 // close-butted-ness (Plain when leading non-empty, Para
1747 // otherwise); non-div uses OnlyIfLast either way.
1748 if let Some(tag_name) = bq_messy_lift_tag {
1749 let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1750 let close_prefix_len = line.len() - close_stripped.len();
1751 let close_prefix = &line[..close_prefix_len];
1752 if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1753 {
1754 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1755 if leading.is_empty() {
1756 LastParaDemote::Never
1757 } else {
1758 LastParaDemote::SkipTrailingBlanks
1759 }
1760 } else {
1761 LastParaDemote::OnlyIfLast
1762 };
1763 emit_html_block_body_lifted_bq_messy(
1764 builder,
1765 &pre_content,
1766 &content_lines,
1767 leading,
1768 close_prefix,
1769 bq_depth,
1770 policy,
1771 config,
1772 );
1773 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1774 // When `leading` is empty, no recursive-parse output carries
1775 // the close line's bq prefix, so emit it here before the
1776 // close tag. When `leading` is non-empty,
1777 // `emit_html_block_body_lifted_bq_messy` already injected
1778 // the prefix at the start of the leading bytes (via the
1779 // BqPrefixState entry); emitting again would double the
1780 // prefix bytes and break losslessness.
1781 if leading.is_empty() {
1782 emit_bq_prefix_tokens(builder, close_prefix);
1783 }
1784 emit_html_block_line(builder, close_part, 0);
1785 builder.finish_node();
1786 current_pos += 1;
1787 break;
1788 }
1789 }
1790
1791 // Under lift mode, try to split the close line into a
1792 // leading "body content" prefix and the close-marker
1793 // remainder using depth-aware matching. Walks at depth 1
1794 // (we're inside the open tag) so nested same-tag opens
1795 // (e.g. `<inner></inner></tag>` style with a nested div)
1796 // are absorbed into the body and parsed recursively, and
1797 // multi-close shapes (`foo</div></div>` on the close line)
1798 // peel off the matched-pair close — the unmatched
1799 // trailing close projects as a sibling `RawBlock` per
1800 // pandoc-native. For `<div>`, non-empty `leading`
1801 // propagates pandoc's `markdown_in_html_blocks` Plain
1802 // demotion rule. For non-div strict-block tags, demotion
1803 // follows pandoc's `OnlyIfLast` rule (demote the trailing
1804 // Para only when no blank line precedes the close).
1805 let close_split_tag = if lift_mode {
1806 if strict_block_lift {
1807 strict_block_tag_name
1808 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1809 Some("div")
1810 } else {
1811 None
1812 }
1813 } else {
1814 None
1815 };
1816 let (close_no_nl, close_post_nl) = strip_newline(line);
1817 let close_split = close_split_tag
1818 .and_then(|name| try_split_close_line_depth_aware(close_no_nl, name));
1819
1820 if let Some((leading, close_part)) = close_split {
1821 // Close-line leading that is whitespace-only is close-tag
1822 // indentation, not body content (pandoc-native strips it
1823 // from the close RawBlock and treats the close as butted —
1824 // see ` </tag>` shapes). Route those bytes into the
1825 // close `HTML_BLOCK_TAG` as a WHITESPACE token so the
1826 // projector strips them; keep the demote policy keyed on
1827 // the original leading so butted-close detection (Plain
1828 // demotion for div, OnlyIfLast for non-div) still fires.
1829 let leading_is_ws_only =
1830 !leading.is_empty() && leading.bytes().all(|b| b == b' ' || b == b'\t');
1831 let body_leading = if leading_is_ws_only { "" } else { leading };
1832 let policy = if strict_block_lift {
1833 LastParaDemote::OnlyIfLast
1834 } else if !leading.is_empty() {
1835 LastParaDemote::SkipTrailingBlanks
1836 } else {
1837 LastParaDemote::Never
1838 };
1839 // Split close_part into close-marker bytes (`</tag>`)
1840 // and trailing bytes (e.g. an extra `</div>` for the
1841 // double-close case, or `bar` for trailing text after
1842 // a normal close). Trailing bytes are recursively
1843 // parsed and grafted as siblings of the HTML_BLOCK_DIV
1844 // wrapper.
1845 let close_tag_name = close_split_tag.expect("close_split_tag present");
1846 let close_marker_end =
1847 split_close_marker_end(close_part, close_tag_name).unwrap_or(close_part.len());
1848 let close_marker = &close_part[..close_marker_end];
1849 let close_trailing = &close_part[close_marker_end..];
1850
1851 emit_html_block_body_lifted(
1852 builder,
1853 &pre_content,
1854 &content_lines,
1855 body_leading,
1856 policy,
1857 config,
1858 );
1859 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1860 if leading_is_ws_only {
1861 builder.token(SyntaxKind::WHITESPACE.into(), leading);
1862 }
1863 if close_trailing.is_empty() {
1864 let mut close_line =
1865 String::with_capacity(close_marker.len() + close_post_nl.len());
1866 close_line.push_str(close_marker);
1867 close_line.push_str(close_post_nl);
1868 emit_html_block_line(builder, &close_line, 0);
1869 builder.finish_node();
1870 } else {
1871 // Close tag holds only the close-marker bytes;
1872 // trailing + newline graft as siblings.
1873 builder.token(SyntaxKind::TEXT.into(), close_marker);
1874 builder.finish_node(); // HTML_BLOCK_TAG
1875 builder.finish_node(); // HtmlBlock
1876
1877 let mut trailing_text =
1878 String::with_capacity(close_trailing.len() + close_post_nl.len());
1879 trailing_text.push_str(close_trailing);
1880 trailing_text.push_str(close_post_nl);
1881 let mut inner_options = config.clone();
1882 let refdefs = config.refdef_labels.clone().unwrap_or_default();
1883 inner_options.refdef_labels = Some(refdefs.clone());
1884 let inner_root = crate::parser::parse_with_refdefs(
1885 &trailing_text,
1886 Some(inner_options),
1887 refdefs,
1888 );
1889 let mut bq = None;
1890 graft_document_children(builder, &inner_root, LastParaDemote::Never, &mut bq);
1891 current_pos += 1;
1892 return current_pos;
1893 }
1894 } else {
1895 emit_html_block_body(
1896 builder,
1897 &pre_content,
1898 &content_lines,
1899 bq_depth,
1900 wrapper_kind,
1901 lift_mode,
1902 config,
1903 );
1904 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1905 emit_html_block_line(builder, line, bq_depth);
1906 builder.finish_node();
1907 }
1908
1909 current_pos += 1;
1910 break;
1911 }
1912
1913 // Regular content line
1914 content_lines.push(line);
1915 current_pos += 1;
1916 }
1917
1918 // If we didn't find a closing marker, emit what we collected
1919 if !found_closing {
1920 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1921 emit_html_block_body(
1922 builder,
1923 &pre_content,
1924 &content_lines,
1925 bq_depth,
1926 wrapper_kind,
1927 lift_mode,
1928 config,
1929 );
1930 }
1931
1932 builder.finish_node(); // HtmlBlock
1933 current_pos
1934}
1935
1936/// Emit the collected inner content lines for an HTML block.
1937///
1938/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1939/// line `<div>` open outside blockquote), recursively parse the inner
1940/// content (including any open-tag trailing) as Pandoc-flavored
1941/// markdown and graft the resulting top-level blocks as direct children
1942/// of the wrapper. This is the Phase 6 structural lift — the projector
1943/// and downstream consumers (linter, salsa, LSP) can walk the
1944/// structural children instead of re-tokenizing the body bytes.
1945///
1946/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1947/// blockquote, multi-line open, or no content at all — fall through to
1948/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1949///
1950/// CST bytes remain byte-identical to source: the recursive parser is
1951/// lossless on the same byte slice the legacy path would have captured
1952/// as TEXT.
1953fn emit_html_block_body(
1954 builder: &mut GreenNodeBuilder<'static>,
1955 pre_content: &str,
1956 content_lines: &[&str],
1957 bq_depth: usize,
1958 wrapper_kind: SyntaxKind,
1959 lift_mode: bool,
1960 config: &ParserOptions,
1961) {
1962 if pre_content.is_empty() && content_lines.is_empty() {
1963 return;
1964 }
1965 if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1966 // Reached when the parser walked to end-of-input without finding
1967 // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1968 emit_html_block_body_lifted(
1969 builder,
1970 pre_content,
1971 content_lines,
1972 "",
1973 LastParaDemote::Never,
1974 config,
1975 );
1976 return;
1977 }
1978 // Legacy path: opaque TEXT capture. `pre_content` is always empty
1979 // here (lift_mode is the only path that populates it), but be
1980 // defensive — if a trailing prefix snuck in, emit it as TEXT so
1981 // bytes are preserved.
1982 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1983 if !pre_content.is_empty() {
1984 builder.token(SyntaxKind::TEXT.into(), pre_content);
1985 }
1986 for content_line in content_lines {
1987 emit_html_block_line(builder, content_line, bq_depth);
1988 }
1989 builder.finish_node();
1990}
1991
/// Rule for demoting the trailing `PARAGRAPH` of an HTML-block body
/// to `PLAIN` when grafting children into the structural CST.
///
/// The policy is interpreted by `graft_document_children`, which
/// selects at most one top-level child node to retag. Tokens at the
/// top level are never candidates; only nodes are inspected.
#[derive(Copy, Clone, Debug)]
enum LastParaDemote {
    /// Never demote — pandoc preserves the trailing `Para`.
    Never,
    /// Look at the LAST top-level node that is not a `BLANK_LINE`
    /// (i.e. skip any trailing `BLANK_LINE` children) and demote it if
    /// it is a `PARAGRAPH`. Used for `<div>` shapes where the close
    /// tag is butted against the paragraph text on its source line —
    /// pandoc's `markdown_in_html_blocks` Plain demotion.
    SkipTrailingBlanks,
    /// Look at the LAST top-level node of any kind and demote it only
    /// when it is a `PARAGRAPH`; a trailing `BLANK_LINE` therefore
    /// blocks the demotion. Used for non-div strict-block tags whose
    /// body emits at top-level adjacent to the close-tag `RawBlock`;
    /// pandoc's rule there demotes the trailing `Para` to `Plain`
    /// unless a blank line separates them.
    OnlyIfLast,
}
2011
2012/// Lift the HTML-block body into structural CST children: build the
2013/// inner text from `pre_content` + `content_lines` + `post_content`
2014/// (in order), recursively parse it as Pandoc-flavored markdown, and
2015/// graft the resulting top-level blocks into `builder`. `demote_policy`
2016/// controls whether the trailing paragraph is retagged as `PLAIN` to
2017/// encode pandoc's Plain/Para adjacency rules structurally.
2018fn emit_html_block_body_lifted(
2019 builder: &mut GreenNodeBuilder<'static>,
2020 pre_content: &str,
2021 content_lines: &[&str],
2022 post_content: &str,
2023 demote_policy: LastParaDemote,
2024 config: &ParserOptions,
2025) {
2026 emit_html_block_body_lifted_inner(
2027 builder,
2028 pre_content,
2029 content_lines,
2030 post_content,
2031 demote_policy,
2032 config,
2033 &mut None,
2034 )
2035}
2036
2037/// Body-lift variant for `<div>` inside a blockquote. Strips
2038/// `bq_depth` levels of blockquote markers from each `content_line`,
2039/// captures the per-line prefix bytes, and grafts the recursive parse
2040/// with prefix injection so the output CST stays byte-equal to the
2041/// source. `pre_content` and `post_content` must be empty (the bq
2042/// clean lift only handles the shape where the open and close tags
2043/// stand alone on their source lines).
2044fn emit_html_block_body_lifted_bq(
2045 builder: &mut GreenNodeBuilder<'static>,
2046 content_lines: &[&str],
2047 bq_depth: usize,
2048 demote_policy: LastParaDemote,
2049 config: &ParserOptions,
2050) {
2051 let mut prefixes: Vec<String> = Vec::with_capacity(content_lines.len());
2052 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2053 for cl in content_lines {
2054 let stripped = strip_n_blockquote_markers(cl, bq_depth);
2055 let prefix_len = cl.len() - stripped.len();
2056 prefixes.push(cl[..prefix_len].to_string());
2057 stripped_lines.push(stripped);
2058 }
2059 let mut bq = Some(BqPrefixState {
2060 prefixes,
2061 line_idx: 0,
2062 at_line_start: true,
2063 });
2064 emit_html_block_body_lifted_inner(
2065 builder,
2066 "",
2067 &stripped_lines,
2068 "",
2069 demote_policy,
2070 config,
2071 &mut bq,
2072 )
2073}
2074
2075/// Body-lift variant for the bq messy-shape lift — open-trailing,
2076/// butted-close, or both. The open-trailing bytes (if any) sit in
2077/// `pre_content` (line 0 of the body — no bq prefix in source because
2078/// line 0's `> ` is consumed by the outer BLOCK_QUOTE). Content lines
2079/// each carry their own bq prefix. The close line's `leading` (body
2080/// bytes before `</tag>`) sits on the close line, prefixed in source
2081/// by `close_line_prefix` (the bq prefix captured from `line`).
2082///
2083/// Builds `prefixes` so each emitted line in the recursive parse
2084/// output gets the right per-line bq prefix re-injected at line start:
2085/// `pre_content` → empty prefix (no source `> ` precedes it); each
2086/// content line → its stripped prefix; `leading` → `close_line_prefix`.
2087/// Result CST stays byte-equal to source.
2088#[allow(clippy::too_many_arguments)]
2089fn emit_html_block_body_lifted_bq_messy(
2090 builder: &mut GreenNodeBuilder<'static>,
2091 pre_content: &str,
2092 content_lines: &[&str],
2093 leading: &str,
2094 close_line_prefix: &str,
2095 bq_depth: usize,
2096 demote_policy: LastParaDemote,
2097 config: &ParserOptions,
2098) {
2099 let mut prefixes: Vec<String> = Vec::new();
2100 if !pre_content.is_empty() {
2101 prefixes.push(String::new());
2102 }
2103 let mut stripped_lines: Vec<&str> = Vec::with_capacity(content_lines.len());
2104 for cl in content_lines {
2105 let stripped = strip_n_blockquote_markers(cl, bq_depth);
2106 let prefix_len = cl.len() - stripped.len();
2107 prefixes.push(cl[..prefix_len].to_string());
2108 stripped_lines.push(stripped);
2109 }
2110 if !leading.is_empty() {
2111 prefixes.push(close_line_prefix.to_string());
2112 }
2113 let mut bq = Some(BqPrefixState {
2114 prefixes,
2115 line_idx: 0,
2116 at_line_start: true,
2117 });
2118 emit_html_block_body_lifted_inner(
2119 builder,
2120 pre_content,
2121 &stripped_lines,
2122 leading,
2123 demote_policy,
2124 config,
2125 &mut bq,
2126 )
2127}
2128
2129fn emit_html_block_body_lifted_inner(
2130 builder: &mut GreenNodeBuilder<'static>,
2131 pre_content: &str,
2132 content_lines: &[&str],
2133 post_content: &str,
2134 demote_policy: LastParaDemote,
2135 config: &ParserOptions,
2136 bq: &mut Option<BqPrefixState>,
2137) {
2138 if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
2139 return;
2140 }
2141 let mut inner_text = String::with_capacity(
2142 pre_content.len()
2143 + content_lines.iter().map(|s| s.len()).sum::<usize>()
2144 + post_content.len(),
2145 );
2146 inner_text.push_str(pre_content);
2147 for line in content_lines {
2148 inner_text.push_str(line);
2149 }
2150 inner_text.push_str(post_content);
2151
2152 let mut inner_options = config.clone();
2153 let refdefs = config.refdef_labels.clone().unwrap_or_default();
2154 inner_options.refdef_labels = Some(refdefs.clone());
2155 let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
2156 graft_document_children(builder, &inner_root, demote_policy, bq);
2157}
2158
/// Per-line blockquote-prefix injection state used by the graft helpers
/// when the lifted body originated inside a `> …` blockquote: the
/// recursive parse was fed the bq-stripped text, so the prefix bytes
/// (`BLOCK_QUOTE_MARKER` + `WHITESPACE`) must be re-emitted at the
/// start of each source line to keep the CST byte-equal to the source.
///
/// The state is advanced by `emit_grafted_token`: before a token at a
/// line start it emits `prefixes[line_idx]` (when that entry exists),
/// and after a `NEWLINE` or `BLANK_LINE` token it moves on to the next
/// line.
struct BqPrefixState {
    /// Literal prefix bytes for each source line of the body, indexed
    /// by body line (e.g. `"> "` or `">"`). An entry may be empty when
    /// the line has no prefix in the source.
    prefixes: Vec<String>,
    /// Index into `prefixes` of the line currently being emitted, i.e.
    /// the prefix that will be written at the next line start.
    line_idx: usize,
    /// `true` when the next token emitted starts a new line and must
    /// therefore be preceded by that line's prefix.
    at_line_start: bool,
}
2174
2175/// Walk a parsed inner document's top-level children and re-emit them
2176/// into `builder`. The document's wrapper node is skipped — only its
2177/// children are grafted.
2178///
2179/// `demote_policy` controls whether a trailing `PARAGRAPH` is retagged
2180/// as `PLAIN` — see [`LastParaDemote`].
2181///
2182/// `bq` is `Some` when grafting a body that lived inside a blockquote
2183/// — token emission then injects `BLOCK_QUOTE_MARKER + WHITESPACE`
2184/// prefix tokens at line starts. See [`BqPrefixState`].
2185fn graft_document_children(
2186 builder: &mut GreenNodeBuilder<'static>,
2187 doc: &SyntaxNode,
2188 demote_policy: LastParaDemote,
2189 bq: &mut Option<BqPrefixState>,
2190) {
2191 let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();
2192
2193 let mut demote_idx: Option<usize> = None;
2194 match demote_policy {
2195 LastParaDemote::Never => {}
2196 LastParaDemote::SkipTrailingBlanks => {
2197 for (i, c) in children.iter().enumerate().rev() {
2198 if let rowan::NodeOrToken::Node(n) = c {
2199 if n.kind() == SyntaxKind::BLANK_LINE {
2200 continue;
2201 }
2202 if n.kind() == SyntaxKind::PARAGRAPH {
2203 demote_idx = Some(i);
2204 }
2205 break;
2206 }
2207 }
2208 }
2209 LastParaDemote::OnlyIfLast => {
2210 for (i, c) in children.iter().enumerate().rev() {
2211 if let rowan::NodeOrToken::Node(n) = c {
2212 if n.kind() == SyntaxKind::PARAGRAPH {
2213 demote_idx = Some(i);
2214 }
2215 break;
2216 }
2217 }
2218 }
2219 }
2220
2221 for (i, child) in children.into_iter().enumerate() {
2222 match child {
2223 rowan::NodeOrToken::Node(n) => {
2224 if Some(i) == demote_idx {
2225 graft_subtree_as(builder, &n, SyntaxKind::PLAIN, bq);
2226 } else {
2227 graft_subtree(builder, &n, bq);
2228 }
2229 }
2230 rowan::NodeOrToken::Token(t) => {
2231 emit_grafted_token(builder, t.kind(), t.text(), bq);
2232 }
2233 }
2234 }
2235}
2236
2237/// Recursively re-emit `node` and its descendants into `builder`.
2238/// Token text is copied verbatim so the result is byte-identical to
2239/// the input span (modulo bq prefix tokens injected at line starts
2240/// when `bq` is `Some`).
2241fn graft_subtree(
2242 builder: &mut GreenNodeBuilder<'static>,
2243 node: &SyntaxNode,
2244 bq: &mut Option<BqPrefixState>,
2245) {
2246 graft_subtree_as(builder, node, node.kind(), bq);
2247}
2248
2249/// Like `graft_subtree` but the outer wrapper's `SyntaxKind` is
2250/// overridden. Used to retag a top-level `PARAGRAPH` as `PLAIN` for
2251/// the close-butted demotion rule.
2252fn graft_subtree_as(
2253 builder: &mut GreenNodeBuilder<'static>,
2254 node: &SyntaxNode,
2255 kind: SyntaxKind,
2256 bq: &mut Option<BqPrefixState>,
2257) {
2258 builder.start_node(kind.into());
2259 for child in node.children_with_tokens() {
2260 match child {
2261 rowan::NodeOrToken::Node(n) => graft_subtree(builder, &n, bq),
2262 rowan::NodeOrToken::Token(t) => {
2263 emit_grafted_token(builder, t.kind(), t.text(), bq);
2264 }
2265 }
2266 }
2267 builder.finish_node();
2268}
2269
2270/// Emit a single token while optionally injecting blockquote prefix
2271/// tokens at line starts. When `bq` is `None`, this is a plain
2272/// `builder.token()` passthrough.
2273fn emit_grafted_token(
2274 builder: &mut GreenNodeBuilder<'static>,
2275 kind: SyntaxKind,
2276 text: &str,
2277 bq: &mut Option<BqPrefixState>,
2278) {
2279 if let Some(state) = bq.as_mut() {
2280 if state.at_line_start {
2281 if let Some(prefix) = state.prefixes.get(state.line_idx) {
2282 emit_bq_prefix_tokens(builder, prefix);
2283 }
2284 state.at_line_start = false;
2285 }
2286 builder.token(kind.into(), text);
2287 // `BLANK_LINE` token represents an entirely blank source line —
2288 // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
2289 // token as line-ending so the per-line prefix index advances
2290 // correctly.
2291 if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
2292 state.line_idx += 1;
2293 state.at_line_start = true;
2294 }
2295 } else {
2296 builder.token(kind.into(), text);
2297 }
2298}
2299
2300/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
2301/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
2302fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
2303 for ch in prefix.chars() {
2304 if ch == '>' {
2305 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2306 } else {
2307 let mut buf = [0u8; 4];
2308 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2309 }
2310 }
2311}
2312
/// Find the byte index, within `line`, of the `>` that ends the open
/// tag `<tag_name ATTRS>` at the start of the line.
///
/// Leading spaces and tabs are skipped. The line must then continue
/// with `<` followed by `tag_name` (compared ASCII-case-insensitively)
/// and at least one more byte. From there the scan looks for the first
/// `>` that is not inside a single- or double-quoted attribute value.
/// No check is made on the byte that follows the tag name.
///
/// Returns `None` when the line does not start with the expected
/// `<tag_name` prefix or when no unquoted `>` follows it. The position
/// is exposed (rather than a plain yes/no) so callers can slice off the
/// bytes that trail the open tag.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let unindented = line.trim_start_matches([' ', '\t']);
    let indent = line.len() - unindented.len();
    let raw = unindented.as_bytes();

    // Offset (within `raw`) of the first byte after `<tag_name`.
    let name_end = 1 + tag_name.len();
    if raw.len() <= name_end {
        return None;
    }
    if raw[0] != b'<' || !raw[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in raw.iter().enumerate().skip(name_end) {
        match open_quote {
            // Inside a quoted value: only the matching quote matters.
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent + offset),
                _ => {}
            },
        }
    }
    None
}
2348
2349/// Whether `slice` begins (after leading ASCII whitespace) with an
2350/// open tag whose name is a Pandoc void block tag (`<source>`,
2351/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
2352/// open tags return false.
2353///
2354/// Used by the inline-block matched-pair lift gate: pandoc-native
2355/// abandons the lift when the body's first non-blank content is a
2356/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
2357/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
2358/// matched-pair lift).
2359fn slice_starts_with_void_block_tag(slice: &str) -> bool {
2360 let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
2361 if !trimmed.starts_with('<') || trimmed.starts_with("</") {
2362 return false;
2363 }
2364 let Some(tag_end) = parse_open_tag(trimmed) else {
2365 return false;
2366 };
2367 let bytes = trimmed.as_bytes();
2368 let mut name_end = 1usize;
2369 while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
2370 {
2371 name_end += 1;
2372 }
2373 if name_end == 1 {
2374 return false;
2375 }
2376 is_pandoc_void_block_tag_name(&trimmed[1..name_end])
2377}
2378
2379/// Whether the body of an inline-block matched-pair (`<video>...`,
2380/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
2381/// a void block tag — the condition under which pandoc-native abandons
2382/// the matched-pair lift. Probes three shapes:
2383///
2384/// - **Same-line** (`<video><source ...></video>`): trailing bytes
2385/// after the open `>` on `first_inner` start with `<source`.
2386/// - **Single-line open + multi-line body**: open-trailing on the open
2387/// line is empty/whitespace AND the first non-blank body line
2388/// (`lines[start_pos+1..]`) starts with a void tag.
2389/// - **Multi-line open**: same body-line scan starting at
2390/// `lines[multiline_open_end+1..]`.
2391///
2392/// Returns `false` when the body begins with text, with a close tag,
2393/// or with a non-void block tag — those cases all proceed with the
2394/// matched-pair lift.
2395fn inline_block_void_interior_abandons(
2396 first_inner: &str,
2397 lines: &[&str],
2398 start_pos: usize,
2399 multiline_open_end: Option<usize>,
2400 bq_depth: usize,
2401 tag_name: &str,
2402) -> bool {
2403 let (line_no_nl, _) = strip_newline(first_inner);
2404 let (body_start_line_idx, open_trailing) = match multiline_open_end {
2405 Some(end) => (end + 1, ""),
2406 None => {
2407 let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
2408 let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
2409 (start_pos + 1, trailing)
2410 }
2411 };
2412 let trimmed = open_trailing.trim_start_matches([' ', '\t']);
2413 if !trimmed.is_empty() {
2414 return slice_starts_with_void_block_tag(trimmed);
2415 }
2416 for line in &lines[body_start_line_idx..] {
2417 let inner = if bq_depth > 0 {
2418 strip_n_blockquote_markers(line, bq_depth)
2419 } else {
2420 line
2421 };
2422 let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
2423 if trimmed.is_empty() {
2424 continue;
2425 }
2426 return slice_starts_with_void_block_tag(trimmed);
2427 }
2428 false
2429}
2430
/// Report whether `line` starts with the open tag `<tag_name ...>` and
/// that tag is terminated by a `>` on the same line.
///
/// Leading spaces and tabs are ignored. The tag name is compared
/// ASCII-case-insensitively and must be followed by at least one more
/// byte; the byte after the name is not otherwise checked. The
/// terminating `>` must sit outside single- and double-quoted
/// attribute values.
///
/// Content after the `>` is allowed (the open-trailing shape
/// `<form>foo`) — the caller is expected to capture that trailing text
/// into the structural lift's `pre_content`.
pub(crate) fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let unindented = line.trim_start_matches([' ', '\t']).as_bytes();
    let Some((&b'<', after_lt)) = unindented.split_first() else {
        return false;
    };
    // Need the whole tag name plus at least one byte after it.
    if after_lt.len() <= tag_name.len() {
        return false;
    }
    let (name, attrs) = after_lt.split_at(tag_name.len());
    if !name.eq_ignore_ascii_case(tag_name.as_bytes()) {
        return false;
    }

    let mut pending_quote: Option<u8> = None;
    attrs.iter().any(|&byte| match (pending_quote, byte) {
        (Some(q), b) if b == q => {
            pending_quote = None;
            false
        }
        (Some(_), _) => false,
        (None, b'"' | b'\'') => {
            pending_quote = Some(byte);
            false
        }
        (None, b'>') => true,
        (None, _) => false,
    })
}
2465
2466/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
2467/// be lifted structurally. Returns `true` only when:
2468/// - The line starts with `<tag_name` (modulo leading whitespace).
2469/// - The open tag's `>` exists with proper quote handling.
2470/// - The bytes after the open `>` contain a depth-zero matched
2471/// `</tag_name>` close (depth-aware: nested `<tag>` opens
2472/// increment depth; matching is case-insensitive, quote-aware).
2473///
2474/// Trailing bytes after the matched close are accepted and grafted
2475/// as a sibling block by the caller. Examples:
2476/// - `<div>foo</div>bar` → body=`foo`, trailing=`bar`.
2477/// - `<div>foo</div></div>` → body=`foo`, trailing=`</div>` (which
2478/// recursively parses to a `RawBlock`).
2479/// - `<div><div>x</div></div>bar` → body=`<div>x</div>` (nested div
2480/// parsed recursively), trailing=`bar`.
2481fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
2482 let bytes = line.as_bytes();
2483 let indent_end = bytes
2484 .iter()
2485 .position(|&b| b != b' ' && b != b'\t')
2486 .unwrap_or(bytes.len());
2487 let rest = &line[indent_end..];
2488 let rest_bytes = rest.as_bytes();
2489 let prefix_len = 1 + tag_name.len();
2490 if rest_bytes.len() < prefix_len
2491 || rest_bytes[0] != b'<'
2492 || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2493 {
2494 return false;
2495 }
2496 let after_name = &rest[prefix_len..];
2497 let after_name_bytes = after_name.as_bytes();
2498 let mut i = 0usize;
2499 let mut quote: Option<u8> = None;
2500 let mut gt_idx: Option<usize> = None;
2501 while i < after_name_bytes.len() {
2502 match (quote, after_name_bytes[i]) {
2503 (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2504 (Some(q), b2) if b2 == q => quote = None,
2505 (None, b'>') => {
2506 gt_idx = Some(i);
2507 break;
2508 }
2509 _ => {}
2510 }
2511 i += 1;
2512 }
2513 let Some(gt_idx) = gt_idx else {
2514 return false;
2515 };
2516 let trailing = &after_name[gt_idx + 1..];
2517 // Depth-aware: walk `trailing` (we begin inside the open tag at
2518 // depth 1). Return true iff a matched `</tag>` exists where depth
2519 // returns to 0. Self-closing `<tag/>` opens don't bump depth.
2520 matched_close_offset(trailing, tag_name).is_some()
2521}
2522
/// Find the `</tag>` that closes the element whose open tag precedes
/// `trailing` (the bytes after that open tag's `>`).
///
/// The walk is depth-aware and starts at depth 1 (we begin inside the
/// open tag):
///
/// - `<tag ...>` raises the depth, unless it is self-closing (`/>`).
/// - `</tag ...>` lowers the depth; the close that brings it to 0 is
///   the match.
/// - A tag counts as `tag_name` only when the name (compared
///   ASCII-case-insensitively) is followed by whitespace, `>`, `/` or
///   the end of input, so `<divider>` is not a `<div>`.
/// - Every `<...>` bracket is skipped as a unit up to its `>`, with
///   `>` inside single- or double-quoted attribute values ignored.
///
/// Returns `Some((close_start, close_end))`, where `close_start` is
/// the byte offset of `<` in the matched `</tag>` and `close_end` is
/// one past its `>`.
///
/// Returns `None` when the depth never returns to 0, or as soon as a
/// `<` is met whose bracket has no terminating `>`.
///
/// Names are compared in place with `eq_ignore_ascii_case`, so no
/// lowercase copy of `trailing` or `tag_name` is allocated.
fn matched_close_offset(trailing: &str, tag_name: &str) -> Option<(usize, usize)> {
    let bytes = trailing.as_bytes();
    let tag = tag_name.as_bytes();

    let mut depth: usize = 1;
    let mut cursor = 0usize;

    while let Some(rel) = bytes[cursor..].iter().position(|&b| b == b'<') {
        let lt = cursor + rel;
        let is_close = bytes.get(lt + 1) == Some(&b'/');
        let name_start = if is_close { lt + 2 } else { lt + 1 };
        let name_end = name_start + tag.len();

        let name_matches = bytes
            .get(name_start..name_end)
            .is_some_and(|name| name.eq_ignore_ascii_case(tag));
        let is_our_tag = name_matches
            && matches!(
                bytes.get(name_end).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Scan forward to this bracket's `>`, respecting quoted
        // attribute values. An unterminated `<...` ends the search.
        let scan_from = if name_matches { name_end } else { lt + 1 };
        let mut quote: Option<u8> = None;
        let gt = bytes
            .iter()
            .enumerate()
            .skip(scan_from)
            .find_map(|(idx, &b)| match (quote, b) {
                (Some(q), x) if x == q => {
                    quote = None;
                    None
                }
                (None, b'"' | b'\'') => {
                    quote = Some(b);
                    None
                }
                (None, b'>') => Some(idx),
                _ => None,
            })?;

        if is_our_tag {
            if is_close {
                depth -= 1;
                if depth == 0 {
                    return Some((lt, gt + 1));
                }
            } else {
                // `<tag/>` opens and closes at once: depth unchanged.
                let self_closing = gt > lt + 1 && bytes[gt - 1] == b'/';
                if !self_closing {
                    depth += 1;
                }
            }
        }

        cursor = gt + 1;
    }
    None
}
2605
/// Find where the close marker at the very start of `close_part` ends.
///
/// `close_part` is expected to begin with `</tag` (tag name compared ASCII
/// case-insensitively). On success the result is the byte offset one past
/// the first `>` that is not inside `"..."` / `'...'`, so the caller can
/// split `close_part` into the close marker and any same-line trailing
/// text.
///
/// Returns `None` when `close_part` does not start with `</tag` or when no
/// unquoted `>` follows it.
fn split_close_marker_end(close_part: &str, tag_name: &str) -> Option<usize> {
    let raw = close_part.as_bytes();
    let name_end = 2 + tag_name.len();

    // `get` doubles as the length check for the `</tag` prefix.
    let head = raw.get(..name_end)?;
    if !head.starts_with(b"</") || !head[2..].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut in_quote: Option<u8> = None;
    for (offset, &byte) in raw.iter().enumerate().skip(name_end) {
        match in_quote {
            Some(q) if byte == q => in_quote = None,
            Some(_) => {}
            None => match byte {
                b'"' | b'\'' => in_quote = Some(byte),
                b'>' => return Some(offset + 1),
                _ => {}
            },
        }
    }
    None
}
2636
2637/// Try to split the close line of an HTML_BLOCK_DIV body into a
2638/// leading content prefix and a clean `</tag>...` remainder. Returns
2639/// `Some((leading, close_part))` only when the line contains exactly
2640/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2641/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2642/// for missing close tags, or for compound shapes the parser
2643/// shouldn't attempt to lift in this pass.
2644///
2645/// `leading` may be empty (close starts at column 0) or pure
2646/// whitespace (close on an indented line). Both count as "butted" per
2647/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2648/// the trailing paragraph inside the div demotes Para→Plain.
2649fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2650 let (opens, closes) = count_tag_balance(line, tag_name);
2651 if opens != 0 || closes != 1 {
2652 return None;
2653 }
2654 // Locate the close tag's opening `<` by lowercased substring search.
2655 // Safe because we've already established (above) that the line has
2656 // exactly one `</tag>` and no `<tag>` opens, so the first match is
2657 // THE close.
2658 let needle = format!("</{}", tag_name);
2659 let lower = line.to_ascii_lowercase();
2660 let close_lt = lower.find(&needle)?;
2661 Some((&line[..close_lt], &line[close_lt..]))
2662}
2663
2664/// Depth-aware variant of `try_split_close_line` used by the same-line
2665/// lift path. Walks `line` starting at depth 1 (we begin inside the
2666/// open `<tag>`) and splits at the byte position where the matched
2667/// `</tag>` close brings depth to 0. Returns `Some((body,
2668/// close_part))` where `body` is the bytes before the matched-close
2669/// start and `close_part` is the bytes from the matched close onward.
2670///
2671/// Unlike `try_split_close_line` this accepts nested same-tag opens
2672/// and multiple closes: for `<div><div>x</div></div>bar` it returns
2673/// body=`<div>x</div>` (a nested div the body lift parses
2674/// recursively) and close_part=`</div>bar`. For `<div>foo</div></div>`
2675/// it returns body=`foo`, close_part=`</div></div>` — the unmatched
2676/// trailing close projects as a sibling `RawBlock` per pandoc-native.
2677fn try_split_close_line_depth_aware<'a>(
2678 line: &'a str,
2679 tag_name: &str,
2680) -> Option<(&'a str, &'a str)> {
2681 let (close_start, _close_end) = matched_close_offset(line, tag_name)?;
2682 Some((&line[..close_start], &line[close_start..]))
2683}
2684
2685/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
2686/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
2687/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
2688/// + TEXT(">") + TEXT(trailing)?`.
2689///
2690/// Bytes are byte-identical to the source — this only tokenizes at finer
2691/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
2692/// region structurally. Falls back to a single TEXT token if the line
2693/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
2694/// only retags as the lift kind when this shape was matched).
2695///
2696/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
2697/// returned as `&str` instead so the caller can splice them into the
2698/// recursive-parse input for the structural body lift. When false
2699/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
2700/// empty slice is returned.
2701fn emit_open_tag_tokens<'a>(
2702 builder: &mut GreenNodeBuilder<'static>,
2703 line: &'a str,
2704 tag_name: &str,
2705 lift_trailing: bool,
2706) -> &'a str {
2707 let bytes = line.as_bytes();
2708 // Leading indent (CommonMark allows up to 3 spaces).
2709 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2710 if indent_end > 0 {
2711 builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
2712 }
2713 let rest = &line[indent_end..];
2714 // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
2715 let prefix_len = 1 + tag_name.len();
2716 if !rest.starts_with('<')
2717 || rest.len() < prefix_len
2718 || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
2719 {
2720 builder.token(SyntaxKind::TEXT.into(), rest);
2721 return "";
2722 }
2723 let after_name = &rest[prefix_len..];
2724 let after_name_bytes = after_name.as_bytes();
2725 // Find the closing `>` of the open tag, respecting quoted attribute values.
2726 let mut i = 0usize;
2727 let mut quote: Option<u8> = None;
2728 let mut tag_close: Option<usize> = None;
2729 while i < after_name_bytes.len() {
2730 let b = after_name_bytes[i];
2731 match (quote, b) {
2732 (None, b'"') | (None, b'\'') => quote = Some(b),
2733 (Some(q), b2) if b2 == q => quote = None,
2734 (None, b'>') => {
2735 tag_close = Some(i);
2736 break;
2737 }
2738 _ => {}
2739 }
2740 i += 1;
2741 }
2742 let Some(tag_close) = tag_close else {
2743 // Open tag has no closing `>` on this line — defensive fallback.
2744 builder.token(SyntaxKind::TEXT.into(), rest);
2745 return "";
2746 };
2747 // Whitespace between the tag name and the attribute region.
2748 let attrs_inner = &after_name[..tag_close];
2749 let ws_end = attrs_inner
2750 .as_bytes()
2751 .iter()
2752 .position(|&b| !matches!(b, b' ' | b'\t'))
2753 .unwrap_or(attrs_inner.len());
2754 let leading_ws = &attrs_inner[..ws_end];
2755 // Strip a trailing self-closing slash and the whitespace before it
2756 // from the attribute region; emit them as TEXT outside the
2757 // HTML_ATTRS node so the structural region only holds attribute
2758 // bytes (not formatting punctuation).
2759 let attrs_after_ws = &attrs_inner[ws_end..];
2760 let mut attr_end = attrs_after_ws.len();
2761 let attr_bytes = attrs_after_ws.as_bytes();
2762 let mut self_close_start = attr_end;
2763 if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
2764 self_close_start = attr_end - 1;
2765 attr_end = self_close_start;
2766 while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
2767 attr_end -= 1;
2768 }
2769 }
2770 let attrs_text = &attrs_after_ws[..attr_end];
2771 let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
2772 let after_self_close = &attrs_after_ws[self_close_start..];
2773
2774 // Use the original source bytes for the `<tag` prefix (preserves
2775 // source casing — losslessness).
2776 builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
2777 if !leading_ws.is_empty() {
2778 builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
2779 }
2780 if !attrs_text.is_empty() {
2781 builder.start_node(SyntaxKind::HTML_ATTRS.into());
2782 builder.token(SyntaxKind::TEXT.into(), attrs_text);
2783 builder.finish_node();
2784 }
2785 if !trailing_text.is_empty() {
2786 builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
2787 }
2788 if !after_self_close.is_empty() {
2789 builder.token(SyntaxKind::TEXT.into(), after_self_close);
2790 }
2791 builder.token(SyntaxKind::TEXT.into(), ">");
2792 let after_gt = &after_name[tag_close + 1..];
2793 if lift_trailing {
2794 // Return trailing bytes to the caller (will be spliced into the
2795 // recursive-parse input for the body lift).
2796 return after_gt;
2797 }
2798 if !after_gt.is_empty() {
2799 builder.token(SyntaxKind::TEXT.into(), after_gt);
2800 }
2801 ""
2802}
2803
2804/// Detect a multi-line HTML open tag for `tag_name`. Returns
2805/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2806/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2807/// the existing path) or when the `>` is missing entirely.
2808///
2809/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2810/// attribute value doesn't terminate the open tag. Quote state carries
2811/// across line boundaries.
2812fn find_multiline_open_end(
2813 lines: &[&str],
2814 start_pos: usize,
2815 first_inner: &str,
2816 tag_name: &str,
2817 bq_depth: usize,
2818) -> Option<usize> {
2819 // Locate the `<tag_name` literal in `first_inner` to start scanning past
2820 // it. Match is ASCII case-insensitive; the parser preserves source casing.
2821 // `first_inner` is already bq-stripped by the caller; subsequent lines are
2822 // stripped inline below via `strip_n_blockquote_markers`.
2823 let trimmed = strip_leading_spaces(first_inner);
2824 let prefix_len = 1 + tag_name.len();
2825 if !trimmed.starts_with('<')
2826 || trimmed.len() < prefix_len
2827 || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2828 {
2829 return None;
2830 }
2831 let leading_indent = first_inner.len() - trimmed.len();
2832 let mut i = leading_indent + prefix_len; // past `<tag_name`
2833 let mut quote: Option<u8> = None;
2834
2835 // Scan first line for an unquoted `>`.
2836 let line0_bytes = first_inner.as_bytes();
2837 while i < line0_bytes.len() {
2838 match (quote, line0_bytes[i]) {
2839 (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2840 (Some(q), x) if x == q => quote = None,
2841 (None, b'>') => return None, // single-line case
2842 _ => {}
2843 }
2844 i += 1;
2845 }
2846
2847 // No `>` on first line. Scan subsequent lines, stripping `bq_depth`
2848 // blockquote markers per line so `> ` prefixes don't count toward the
2849 // quote-aware scan. Mirrors `pandoc_html_open_tag_closes`.
2850 let mut line_idx = start_pos + 1;
2851 while line_idx < lines.len() {
2852 let raw = lines[line_idx];
2853 let inner = if bq_depth > 0 {
2854 strip_n_blockquote_markers(raw, bq_depth)
2855 } else {
2856 raw
2857 };
2858 for &b in inner.as_bytes() {
2859 match (quote, b) {
2860 (None, b'"') | (None, b'\'') => quote = Some(b),
2861 (Some(q), x) if x == q => quote = None,
2862 (None, b'>') => return Some(line_idx),
2863 _ => {}
2864 }
2865 }
2866 line_idx += 1;
2867 }
2868
2869 None
2870}
2871
2872/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2873/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2874/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2875/// open tag (no `>` in the remaining input) as paragraph text rather than
2876/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2877/// reparse the same content recursively, causing a stack overflow.
2878///
2879/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2880/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2881/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2882/// the tag eventually closes far away.
2883pub(crate) fn pandoc_html_open_tag_closes(
2884 lines: &[&str],
2885 start_pos: usize,
2886 bq_depth: usize,
2887) -> bool {
2888 if start_pos >= lines.len() {
2889 return false;
2890 }
2891 let mut quote: Option<u8> = None;
2892 for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2893 let inner = if bq_depth > 0 {
2894 strip_n_blockquote_markers(line, bq_depth)
2895 } else {
2896 line
2897 };
2898 let bytes = inner.as_bytes();
2899 let mut i = 0usize;
2900 if offset == start_pos {
2901 while i < bytes.len() && bytes[i] == b' ' {
2902 i += 1;
2903 }
2904 if bytes.get(i) != Some(&b'<') {
2905 return false;
2906 }
2907 i += 1;
2908 }
2909 while i < bytes.len() {
2910 match (quote, bytes[i]) {
2911 (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2912 (Some(q), x) if x == q => quote = None,
2913 (None, b'>') => return true,
2914 _ => {}
2915 }
2916 i += 1;
2917 }
2918 }
2919 false
2920}
2921
2922/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2923/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2924/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2925/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2926/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2927/// Phase 6 structural lift.
2928///
2929/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2930/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2931/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2932/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2933/// + TEXT(">") + (TEXT(trailing))? + NEWLINE
2934///
2935/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2936/// newline; `parse_html_attribute_list` tolerates whitespace.
2937#[allow(clippy::too_many_arguments)]
2938fn emit_multiline_open_tag_with_attrs(
2939 builder: &mut GreenNodeBuilder<'static>,
2940 lines: &[&str],
2941 start_pos: usize,
2942 end_line_idx: usize,
2943 tag_name: &str,
2944 bq_depth: usize,
2945 lift_trailing: bool,
2946 pre_content: &mut String,
2947) {
2948 let prefix_len = 1 + tag_name.len();
2949 for (line_idx, raw) in lines
2950 .iter()
2951 .enumerate()
2952 .take(end_line_idx + 1)
2953 .skip(start_pos)
2954 {
2955 // Strip `bq_depth` blockquote markers from the source line so
2956 // indent/HTML_ATTRS/TEXT splitting ignores the bq prefix bytes.
2957 // Re-emit the stripped prefix as `BLOCK_QUOTE_MARKER` /
2958 // `WHITESPACE` tokens — but ONLY for lines past `start_pos`.
2959 // Line 0's bq prefix is consumed by the outer BLOCK_QUOTE node
2960 // before this parser runs; re-emitting it here would double
2961 // the bytes and break losslessness.
2962 let stripped = if bq_depth > 0 {
2963 strip_n_blockquote_markers(raw, bq_depth)
2964 } else {
2965 raw
2966 };
2967 let bq_prefix_len = raw.len() - stripped.len();
2968 if bq_prefix_len > 0 && line_idx != start_pos {
2969 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
2970 }
2971 let line = stripped;
2972 let (line_no_nl, newline_str) = strip_newline(line);
2973
2974 if line_idx == start_pos {
2975 // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2976 // + attrs)?. The closing `>` is on a later line, so any
2977 // remaining bytes after "<{tag_name}" on this line are the
2978 // start of the attribute region.
2979 let bytes = line_no_nl.as_bytes();
2980 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2981 if indent_end > 0 {
2982 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2983 }
2984 // Defensive: caller verified the line starts with `<{tag_name}`.
2985 let after_indent = &line_no_nl[indent_end..];
2986 if after_indent.len() >= prefix_len {
2987 builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2988 let rest = &after_indent[prefix_len..];
2989 emit_attr_region(builder, rest);
2990 } else {
2991 builder.token(SyntaxKind::TEXT.into(), after_indent);
2992 }
2993 } else if line_idx < end_line_idx {
2994 // Pure attribute line.
2995 let bytes = line_no_nl.as_bytes();
2996 let indent_end = bytes
2997 .iter()
2998 .position(|&b| !matches!(b, b' ' | b'\t'))
2999 .unwrap_or(bytes.len());
3000 if indent_end > 0 {
3001 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3002 }
3003 let attrs_text = &line_no_nl[indent_end..];
3004 if !attrs_text.is_empty() {
3005 builder.start_node(SyntaxKind::HTML_ATTRS.into());
3006 builder.token(SyntaxKind::TEXT.into(), attrs_text);
3007 builder.finish_node();
3008 }
3009 } else {
3010 // Last line: indent + attrs + ">" + trailing.
3011 let bytes = line_no_nl.as_bytes();
3012 let indent_end = bytes
3013 .iter()
3014 .position(|&b| !matches!(b, b' ' | b'\t'))
3015 .unwrap_or(bytes.len());
3016 if indent_end > 0 {
3017 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
3018 }
3019 // Find the unquoted `>` byte position in this line.
3020 let mut quote: Option<u8> = None;
3021 let mut gt_pos: Option<usize> = None;
3022 for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
3023 let actual_j = indent_end + j;
3024 match (quote, b) {
3025 (None, b'"') | (None, b'\'') => quote = Some(b),
3026 (Some(q), x) if x == q => quote = None,
3027 (None, b'>') => {
3028 gt_pos = Some(actual_j);
3029 break;
3030 }
3031 _ => {}
3032 }
3033 }
3034 let Some(gt) = gt_pos else {
3035 // Defensive — caller said `>` is on this line.
3036 builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
3037 if !newline_str.is_empty() {
3038 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3039 }
3040 continue;
3041 };
3042 // Attribute region: between indent_end and gt, with possibly
3043 // trailing whitespace before `>`.
3044 let attrs_region = &line_no_nl[indent_end..gt];
3045 let region_bytes = attrs_region.as_bytes();
3046 // Strip trailing whitespace from attrs region; emit as
3047 // separate WHITESPACE so HTML_ATTRS only contains attribute
3048 // bytes.
3049 let mut attr_end = region_bytes.len();
3050 while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
3051 attr_end -= 1;
3052 }
3053 let attrs_text = &attrs_region[..attr_end];
3054 let trailing_ws = &attrs_region[attr_end..];
3055 if !attrs_text.is_empty() {
3056 builder.start_node(SyntaxKind::HTML_ATTRS.into());
3057 builder.token(SyntaxKind::TEXT.into(), attrs_text);
3058 builder.finish_node();
3059 }
3060 if !trailing_ws.is_empty() {
3061 builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
3062 }
3063 builder.token(SyntaxKind::TEXT.into(), ">");
3064 let after_gt = &line_no_nl[gt + 1..];
3065 if lift_trailing && !after_gt.is_empty() {
3066 // Lift trailing bytes (and the trailing newline) into
3067 // `pre_content` so the open `HTML_BLOCK_TAG` ends cleanly
3068 // with `TEXT(">")`. The recursive parse at the close-marker
3069 // site treats `pre_content` as the leading bytes of the
3070 // structural body — same shape produced by `emit_open_tag_tokens`
3071 // for single-line opens.
3072 pre_content.push_str(after_gt);
3073 pre_content.push_str(newline_str);
3074 continue;
3075 }
3076 if !after_gt.is_empty() {
3077 builder.token(SyntaxKind::TEXT.into(), after_gt);
3078 }
3079 }
3080
3081 if !newline_str.is_empty() {
3082 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3083 }
3084 }
3085}
3086
3087/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
3088/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
3089/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
3090/// node is added. Pandoc's projector reads attributes only for `<div>` /
3091/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
3092fn emit_multiline_open_tag_simple(
3093 builder: &mut GreenNodeBuilder<'static>,
3094 lines: &[&str],
3095 start_pos: usize,
3096 end_line_idx: usize,
3097 bq_depth: usize,
3098) {
3099 for (line_idx, raw) in lines
3100 .iter()
3101 .enumerate()
3102 .take(end_line_idx + 1)
3103 .skip(start_pos)
3104 {
3105 let stripped = if bq_depth > 0 {
3106 strip_n_blockquote_markers(raw, bq_depth)
3107 } else {
3108 raw
3109 };
3110 let bq_prefix_len = raw.len() - stripped.len();
3111 // Line 0's bq prefix is owned by the outer BLOCK_QUOTE node;
3112 // re-emit prefixes only for subsequent lines.
3113 if bq_prefix_len > 0 && line_idx != start_pos {
3114 emit_bq_prefix_tokens(builder, &raw[..bq_prefix_len]);
3115 }
3116 let (line_no_nl, newline_str) = strip_newline(stripped);
3117 if !line_no_nl.is_empty() {
3118 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
3119 }
3120 if !newline_str.is_empty() {
3121 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3122 }
3123 }
3124}
3125
3126/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
3127/// `<div` literal up to end-of-line. Called only from
3128/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
3129/// pure attribute (and possibly inter-attribute whitespace).
3130fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
3131 if region.is_empty() {
3132 return;
3133 }
3134 let bytes = region.as_bytes();
3135 // Split a leading run of whitespace into a WHITESPACE token so the
3136 // HTML_ATTRS node holds only attribute bytes.
3137 let ws_end = bytes
3138 .iter()
3139 .position(|&b| !matches!(b, b' ' | b'\t'))
3140 .unwrap_or(bytes.len());
3141 if ws_end > 0 {
3142 builder.token(SyntaxKind::WHITESPACE.into(), ®ion[..ws_end]);
3143 }
3144 let attrs_text = ®ion[ws_end..];
3145 if !attrs_text.is_empty() {
3146 builder.start_node(SyntaxKind::HTML_ATTRS.into());
3147 builder.token(SyntaxKind::TEXT.into(), attrs_text);
3148 builder.finish_node();
3149 }
3150}
3151
3152/// Emit one continuation line of an HTML block, preserving any blockquote
3153/// markers as structural tokens (so the CST stays byte-equal to the source
3154/// and downstream consumers can strip them per-context).
3155fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
3156 let inner = if bq_depth > 0 {
3157 let stripped = strip_n_blockquote_markers(line, bq_depth);
3158 let prefix_len = line.len() - stripped.len();
3159 if prefix_len > 0 {
3160 for ch in line[..prefix_len].chars() {
3161 if ch == '>' {
3162 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
3163 } else {
3164 let mut buf = [0u8; 4];
3165 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
3166 }
3167 }
3168 }
3169 stripped
3170 } else {
3171 line
3172 };
3173
3174 let (line_without_newline, newline_str) = strip_newline(inner);
3175 if !line_without_newline.is_empty() {
3176 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
3177 }
3178 if !newline_str.is_empty() {
3179 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
3180 }
3181}
3182
3183#[cfg(test)]
3184mod tests {
3185 use super::*;
3186
3187 #[test]
3188 fn test_try_parse_html_comment() {
3189 assert_eq!(
3190 try_parse_html_block_start("<!-- comment -->", false),
3191 Some(HtmlBlockType::Comment)
3192 );
3193 assert_eq!(
3194 try_parse_html_block_start(" <!-- comment -->", false),
3195 Some(HtmlBlockType::Comment)
3196 );
3197 }
3198
3199 #[test]
3200 fn test_try_parse_div_tag() {
3201 assert_eq!(
3202 try_parse_html_block_start("<div>", false),
3203 Some(HtmlBlockType::BlockTag {
3204 tag_name: "div".to_string(),
3205 is_verbatim: false,
3206 closed_by_blank_line: false,
3207 depth_aware: true,
3208 closes_at_open_tag: false,
3209 is_closing: false,
3210 })
3211 );
3212 assert_eq!(
3213 try_parse_html_block_start("<div class=\"test\">", false),
3214 Some(HtmlBlockType::BlockTag {
3215 tag_name: "div".to_string(),
3216 is_verbatim: false,
3217 closed_by_blank_line: false,
3218 depth_aware: true,
3219 closes_at_open_tag: false,
3220 is_closing: false,
3221 })
3222 );
3223 }
3224
3225 #[test]
3226 fn test_try_parse_script_tag() {
3227 assert_eq!(
3228 try_parse_html_block_start("<script>", false),
3229 Some(HtmlBlockType::BlockTag {
3230 tag_name: "script".to_string(),
3231 is_verbatim: true,
3232 closed_by_blank_line: false,
3233 depth_aware: true,
3234 closes_at_open_tag: false,
3235 is_closing: false,
3236 })
3237 );
3238 }
3239
3240 #[test]
3241 fn test_try_parse_processing_instruction() {
3242 assert_eq!(
3243 try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
3244 Some(HtmlBlockType::ProcessingInstruction)
3245 );
3246 }
3247
3248 #[test]
3249 fn test_try_parse_declaration() {
3250 // CommonMark dialect recognizes declarations as type-4 HTML blocks.
3251 assert_eq!(
3252 try_parse_html_block_start("<!DOCTYPE html>", true),
3253 Some(HtmlBlockType::Declaration)
3254 );
3255 // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
3256 // just uppercase. Lowercase doctype must match too.
3257 assert_eq!(
3258 try_parse_html_block_start("<!doctype html>", true),
3259 Some(HtmlBlockType::Declaration)
3260 );
3261 // Pandoc dialect does not — bare declarations fall through to
3262 // paragraph parsing.
3263 assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
3264 assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
3265 }
3266
3267 #[test]
3268 fn test_dialect_specific_block_tag_membership() {
3269 // Pandoc-markdown's `blockHtmlTags` is a strict subset of
3270 // CommonMark §4.6 type-6 plus a few additions. These tags
3271 // diverge between dialects:
3272 // CM-only block tags (Pandoc treats as inline raw HTML):
3273 // dialog, legend, menuitem, optgroup, option, frame,
3274 // base, basefont, link, param
3275 // Pandoc-only block tags (CM doesn't recognize):
3276 // canvas, hgroup, isindex, meta, output
3277 for cm_only in [
3278 "<dialog>",
3279 "<legend>",
3280 "<menuitem>",
3281 "<optgroup>",
3282 "<option>",
3283 "<frame>",
3284 "<base>",
3285 "<basefont>",
3286 "<link>",
3287 "<param>",
3288 ] {
3289 assert!(
3290 matches!(
3291 try_parse_html_block_start(cm_only, true),
3292 Some(HtmlBlockType::BlockTag { .. })
3293 ),
3294 "{cm_only} should be a block-tag start under CommonMark",
3295 );
3296 assert_eq!(
3297 try_parse_html_block_start(cm_only, false),
3298 None,
3299 "{cm_only} should NOT be a block-tag start under Pandoc",
3300 );
3301 }
3302 for pandoc_only in ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"] {
3303 // Under CM these are not type-6 BlockTags; they may still match
3304 // type-7 (complete tag on a line) which has different semantics.
3305 assert!(
3306 !matches!(
3307 try_parse_html_block_start(pandoc_only, true),
3308 Some(HtmlBlockType::BlockTag { .. })
3309 ),
3310 "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
3311 );
3312 assert!(
3313 matches!(
3314 try_parse_html_block_start(pandoc_only, false),
3315 Some(HtmlBlockType::BlockTag { .. })
3316 ),
3317 "{pandoc_only} should be a block-tag start under Pandoc",
3318 );
3319 }
3320 }
3321
3322 #[test]
3323 fn test_pandoc_inline_block_tag_membership() {
3324 // Pandoc's `eitherBlockOrInline` tags start an HTML block at
3325 // fresh-block positions under Pandoc dialect. We list the
3326 // non-void, non-script subset (verbatim `script` is handled
3327 // via the verbatim path; void elements are deferred — see
3328 // PANDOC_INLINE_BLOCK_TAGS docs).
3329 for tag in [
3330 "<button>",
3331 "<iframe>",
3332 "<video>",
3333 "<audio>",
3334 "<noscript>",
3335 "<object>",
3336 "<map>",
3337 "<progress>",
3338 "<del>",
3339 "<ins>",
3340 "<svg>",
3341 "<applet>",
3342 ] {
3343 assert!(
3344 matches!(
3345 try_parse_html_block_start(tag, false),
3346 Some(HtmlBlockType::BlockTag {
3347 depth_aware: true,
3348 ..
3349 })
3350 ),
3351 "{tag} should be a depth-aware block-tag start under Pandoc",
3352 );
3353 }
3354 // Closing forms of inline-block tags also start a block under
3355 // Pandoc — pandoc-native pins `</button>` standalone as a
3356 // single-line `RawBlock`. These use `closes_at_open_tag: true`
3357 // (no balanced match — the close emits as a one-line block on
3358 // its own).
3359 for closing in ["</button>", "</iframe>", "</video>", "</audio>"] {
3360 assert!(
3361 matches!(
3362 try_parse_html_block_start(closing, false),
3363 Some(HtmlBlockType::BlockTag {
3364 depth_aware: false,
3365 closes_at_open_tag: true,
3366 ..
3367 })
3368 ),
3369 "{closing} (closing form) should be a single-line block-tag start under Pandoc",
3370 );
3371 }
3372 }
3373
#[test]
fn test_pandoc_void_block_tag_membership() {
    // Pandoc dialect (`is_commonmark = false`): void tags from pandoc's
    // `eitherBlockOrInline` set open an HTML block at a fresh-block
    // position. There is no closing tag to wait for, so the block type
    // carries `closes_at_open_tag: true` and ends on the open-tag line.
    let is_pandoc_void_start = |line: &str| {
        matches!(
            try_parse_html_block_start(line, false),
            Some(HtmlBlockType::BlockTag {
                depth_aware: false,
                closes_at_open_tag: true,
                ..
            })
        )
    };

    let open_forms = [
        "<area>",
        "<embed>",
        "<source>",
        "<track>",
        "<embed src=\"foo.swf\">",
        "<source src=\"foo.mp4\" type=\"video/mp4\">",
    ];
    for tag in open_forms {
        assert!(
            is_pandoc_void_start(tag),
            "{tag} should be a void block-tag start under Pandoc",
        );
    }

    // HTML defines no closing tag for void elements, yet `</embed>` and
    // friends do show up in real documents. pandoc-native emits them as
    // `RawBlock`s at fresh-block positions, so the closing forms must
    // produce the same single-line `closes_at_open_tag: true` shape.
    let closing_forms = ["</area>", "</embed>", "</source>", "</track>"];
    for closing in closing_forms {
        assert!(
            is_pandoc_void_start(closing),
            "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
        );
    }

    // CommonMark dialect skips the void-tag block-start path entirely.
    // `<embed>` and `<area>` are not CM type-6 tags, so they fall through
    // to type 7 (a complete tag on a line by itself).
    for line in ["<embed>", "<area>"] {
        assert_eq!(
            try_parse_html_block_start(line, true),
            Some(HtmlBlockType::Type7)
        );
    }

    // `<source>` and `<track>` are in the CM type-6 BLOCK_TAGS set, so they
    // still start a block, but with type-6 semantics
    // (`closed_by_blank_line: true`, `closes_at_open_tag: false`) rather
    // than the Pandoc void-tag shape.
    for line in ["<source src=\"x\">", "<track src=\"x\">"] {
        assert!(matches!(
            try_parse_html_block_start(line, true),
            Some(HtmlBlockType::BlockTag {
                closed_by_blank_line: true,
                closes_at_open_tag: false,
                ..
            })
        ));
    }
}
3450
#[test]
fn test_find_multiline_open_end() {
    // Every case scans from line 0, and the first-line text handed to the
    // scanner is always that same line, so only the lines, the expected
    // tag name and the blockquote depth vary between cases.
    let open_end = |lines: &[&str], tag: &str, bq_depth| {
        find_multiline_open_end(lines, 0, lines[0], tag, bq_depth)
    };

    // An open tag completed on its first line yields None; the caller
    // then takes the regular single-line path.
    assert_eq!(open_end(&["<div id=\"x\">"], "div", 0), None);
    assert_eq!(open_end(&["<embed src=\"x\">"], "embed", 0), None);

    // An open tag spread over several lines yields the index of the line
    // holding the closing `>`.
    assert_eq!(open_end(&["<embed", " src=\"x\">"], "embed", 0), Some(1));
    assert_eq!(
        open_end(&["<embed", " src=\"x\"", " type=\"video\">"], "embed", 0),
        Some(2)
    );

    // A different tag name is rejected; the comparison on the tag name
    // ignores case.
    assert_eq!(open_end(&["<embed", " src=\"x\">"], "div", 0), None);
    assert_eq!(open_end(&["<EMBED", " src=\"x\">"], "embed", 0), Some(1));

    // A `>` inside a quoted attribute value does not end the tag, and the
    // quote state carries over from one line to the next.
    assert_eq!(
        open_end(&["<embed title=\"a>b", " c\">"], "embed", 0),
        Some(1)
    );

    // Without any `>` the result is None.
    assert_eq!(open_end(&["<embed", " src=\"x\""], "embed", 0), None);

    // Continuation lines inside a blockquote have their `> ` markers
    // stripped before scanning, so the marker is not taken for the end of
    // the tag.
    assert_eq!(open_end(&["<div", "> id=\"x\">"], "div", 1), Some(1));

    // Nested blockquote: two `> ` markers are stripped per line.
    assert_eq!(
        open_end(&["<section", "> > id=\"x\">"], "section", 2),
        Some(1)
    );
}
3521
#[test]
fn test_pandoc_html_open_tag_closes() {
    // All cases pass 0 for both trailing arguments, exactly as the direct
    // calls did.
    let closes = |lines: &[&str]| pandoc_html_open_tag_closes(lines, 0, 0);

    // Complete on a single line: the `>` is on the first line.
    assert!(closes(&["<div>"]));
    assert!(closes(&["<embed src=\"x\">"]));

    // Complete across lines: the `>` is found on a later line.
    assert!(closes(&["<div", " id=\"x\">", "body", "</div>"]));
    assert!(closes(&["<embed", " src=\"x.png\" alt=\"y\">"]));

    // A `>` inside quotes is not a close; quote state is threaded through
    // the scan, so only the unquoted `>` counts.
    assert!(!closes(&["<div title=\"a>b", " c\""]));
    assert!(closes(&["<div title=\"a>b", " c\">"]));

    // Never closed: with no `>` anywhere pandoc treats the text as a
    // paragraph.
    assert!(!closes(&["<embed"]));
    assert!(!closes(&["<div", "foo", "bar"]));

    // Blank lines in the middle of an open tag are tolerated (pandoc's
    // `htmlTag` reads across them); scanning goes on until `>` or EOF.
    assert!(closes(&["<div", "", "id=\"x\">"]));
}
3560
#[test]
fn test_try_parse_cdata() {
    let cdata_line = "<![CDATA[content]]>";

    // Under CommonMark a CDATA section is a type-5 HTML block start.
    let under_commonmark = try_parse_html_block_start(cdata_line, true);
    assert_eq!(under_commonmark, Some(HtmlBlockType::CData));

    // The Pandoc dialect does not recognize it as a block start.
    let under_pandoc = try_parse_html_block_start(cdata_line, false);
    assert_eq!(under_pandoc, None);
}
3574
#[test]
fn test_extract_block_tag_name_open_only() {
    // Second argument `false`: only opening tags are accepted.
    let open_name = |line: &str| extract_block_tag_name(line, false);

    // Plain, attribute-carrying and self-closing opens all yield `div`.
    for line in ["<div>", "<div class=\"test\">", "<div/>"] {
        assert_eq!(open_name(line).as_deref(), Some("div"));
    }

    // A closing tag, an empty tag and a tag with a space after `<` yield
    // no name.
    for line in ["</div>", "<>", "< div>"] {
        assert_eq!(open_name(line), None);
    }
}
3593
#[test]
fn test_extract_block_tag_name_with_closing() {
    // CommonMark §4.6 type-6 block starts include closing tags, so with
    // the second argument set to `true` a closing tag yields its name,
    // with or without whitespace before the `>`.
    for line in ["</div>", "</div >"] {
        let name = extract_block_tag_name(line, true);
        assert_eq!(name.as_deref(), Some("div"));
    }
}
3606
#[test]
fn test_commonmark_type6_closing_tag_start() {
    // Under CommonMark a bare `</div>` starts a block-tag HTML block that
    // is flagged as a closing form and ends at a blank line.
    let expected = HtmlBlockType::BlockTag {
        tag_name: "div".to_string(),
        is_verbatim: false,
        closed_by_blank_line: true,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: true,
    };

    let parsed = try_parse_html_block_start("</div>", true);
    assert_eq!(parsed, Some(expected));
}
3621
#[test]
fn test_commonmark_type7_open_tag() {
    // `<a>` is not a type-6 tag. Alone on a line it is a type-7 start
    // under CommonMark and no block start at all otherwise.
    let anchor_line = "<a href=\"foo\">";

    let under_commonmark = try_parse_html_block_start(anchor_line, true);
    assert_eq!(under_commonmark, Some(HtmlBlockType::Type7));

    let under_pandoc = try_parse_html_block_start(anchor_line, false);
    assert_eq!(under_pandoc, None);
}
3632
#[test]
fn test_commonmark_type7_close_tag() {
    // A lone closing tag is a type-7 start under CommonMark.
    let parsed = try_parse_html_block_start("</ins>", true);
    assert_eq!(parsed, Some(HtmlBlockType::Type7));
}
3640
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
    // Type 7 needs the complete tag to be followed by whitespace only;
    // trailing text disqualifies the line.
    let parsed = try_parse_html_block_start("<a> hi", true);
    assert_eq!(parsed, None);
}
3646
#[test]
fn test_is_closing_marker_comment() {
    let comment = HtmlBlockType::Comment;

    // `-->` closes a comment block, alone or after other text on the line.
    for line in ["-->", "end -->"] {
        assert!(is_closing_marker(line, &comment));
    }

    // The opening marker is not a closing marker.
    assert!(!is_closing_marker("<!--", &comment));
}
3654
#[test]
fn test_is_closing_marker_tag() {
    let div_block = HtmlBlockType::BlockTag {
        tag_name: "div".to_string(),
        is_verbatim: false,
        closed_by_blank_line: false,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: false,
    };

    // The close tag matches regardless of case and also when it follows
    // other content on the same line.
    for line in ["</div>", "</DIV>", "content</div>"] {
        assert!(is_closing_marker(line, &div_block));
    }

    // An opening `<div>` does not close the block.
    assert!(!is_closing_marker("<div>", &div_block));
}
3670
#[test]
fn test_parse_html_comment_block() {
    // A comment opened and closed on one line: parsing from line 0 must
    // hand back position 1.
    let source = "<!-- comment -->\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    assert_eq!(next_line, 1);
}
3691
#[test]
fn test_parse_div_block() {
    // Open tag, one content line, close tag: all three lines belong to
    // the block, so the parser returns position 3.
    let source = "<div>\ncontent\n</div>\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    assert_eq!(next_line, 3);
}
3712
#[test]
fn test_parse_html_block_no_closing() {
    // The `<div>` is never closed: the block must still swallow every
    // remaining line, ending at position 2.
    let source = "<div>\ncontent\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    assert_eq!(next_line, 2);
}
3734
#[test]
fn test_parse_div_block_nested_pandoc() {
    // Pandoc dialect: in `<div>...<div>...</div>...</div>` the block ends
    // at the OUTER `</div>`, not at the first `</div>` encountered. A
    // CommonMark-style "first close" scan would stop too early; Pandoc's
    // div parser tracks nesting depth (mirrors `htmlInBalanced`).
    let source = concat!(
        "<div id=\"outer\">\n",
        "\n",
        "<div id=\"inner\">\n",
        "\n",
        "deep content\n",
        "\n",
        "</div>\n",
        "\n",
        "</div>\n",
    );
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    // `false` selects the Pandoc dialect (is_commonmark = false).
    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Nine lines in total (outer open, blank, inner open, blank, content,
    // blank, inner close, blank, outer close) and every one is consumed.
    assert_eq!(next_line, 9);
}
3763
#[test]
fn test_parse_div_block_same_line_pandoc() {
    // `<div>foo</div>` on one line has one open and one close, so depth
    // returns to 0 and the block closes on its first line. Guards the
    // depth-aware tracking against regressing this case.
    let source = "<div>foo</div>\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    assert_eq!(next_line, 1);
}
3785
#[test]
fn test_commonmark_verbatim_first_close() {
    // CommonMark §4.6 type 1: a verbatim tag such as `<script>` ends at
    // the first matching close tag, with no depth tracking. The JS string
    // below contains a fake inner `<script>`; the block must still end at
    // the first `</script>`.
    let source = "<script>\nlet x = '<script>';\n</script>\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    // `true` selects the CommonMark dialect.
    let start = try_parse_html_block_start(doc_lines[0], true).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The close tag sits on line 2, so the position after the three-line
    // block is 3.
    assert_eq!(next_line, 3);
}
3811
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    // The open tag spans several lines and its closing `>` sits alone on
    // the last of them:
    //
    //   <div
    //     id="x"
    //     class="y"
    //   >
    //
    //   foo
    //
    //   </div>
    //
    // Lines 0..=3 form the open tag; content begins at line 4.
    let source = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Eight lines consumed: four open-tag lines (`<div`, `  id="x"`,
    // `  class="y"`, `>`), then blank, foo, blank, `</div>`.
    assert_eq!(next_line, 8);

    let tree = crate::syntax::SyntaxNode::new_root(green.finish());

    // The attribute bytes must be exposed as a structural HTML_ATTRS
    // region in the CST (so the salsa anchor walk picks up `id="x"`).
    let has_attrs = tree
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token must reproduce the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in tree.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3864
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    // The open tag spans several lines and its closing `>` ends the last
    // attribute line. Case 0262 already covers this pattern; this test
    // pins the behavior and additionally checks that HTML_ATTRS is
    // exposed structurally.
    let source = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], false).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Five lines consumed: three open-tag lines (the last one carrying
    // the `>`), then foo, then `</div>`.
    assert_eq!(next_line, 5);

    let tree = crate::syntax::SyntaxNode::new_root(green.finish());

    let has_attrs = tree
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenated token text must equal the input exactly.
    let mut rebuilt = String::new();
    for element in tree.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3905
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    // CommonMark dialect: the `<div>` block is ended by the first blank
    // line, so `bar` after it is not part of the block.
    let source = "<div>\nfoo\n\nbar\n";
    let doc_lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let options = ParserOptions::default();
    let mut green = GreenNodeBuilder::new();

    let start = try_parse_html_block_start(doc_lines[0], true).unwrap();
    let next_line = parse_html_block_with_wrapper(
        &mut green,
        &doc_lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The block holds `<div>\nfoo\n` and stops at the blank line (line 2).
    assert_eq!(next_line, 2);
}
3927}