panache_parser/parser/blocks/html_blocks.rs
1//! HTML block parsing utilities.
2
3use crate::options::ParserOptions;
4use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
5use crate::syntax::{SyntaxKind, SyntaxNode};
6use rowan::GreenNodeBuilder;
7
8use super::blockquotes::{count_blockquote_markers, strip_n_blockquote_markers};
9use crate::parser::utils::helpers::{strip_leading_spaces, strip_newline};
10
/// Tag names that start a CommonMark §4.6 type-6 HTML block.
///
/// Consulted by `try_parse_html_block_start` under the CommonMark dialect
/// (the Pandoc dialect uses `PANDOC_BLOCK_TAGS` instead) and by
/// `is_html_block_tag_name`. Entries are lowercase ASCII, kept in
/// alphabetical order, and matched case-insensitively against the
/// candidate tag name.
///
/// NOTE(review): the list carries `source` and lacks `search`, which looks
/// like the 0.30 revision of the spec's type-6 list — confirm which spec
/// version is targeted.
const BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
77
/// Tags that contain raw/verbatim content (no Markdown processing inside).
///
/// These are the CommonMark §4.6 type-1 start tags (`<pre`, `<script`,
/// `<style`, `<textarea`). Under the CommonMark dialect only their opening
/// forms start a block; under the Pandoc dialect the same four names are
/// also members of `PANDOC_BLOCK_TAGS`.
const VERBATIM_TAGS: &[&str] = &["script", "style", "pre", "textarea"];
80
/// Pandoc's `blockHtmlTags` (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`). Pandoc-markdown
/// uses this narrower set rather than CommonMark §4.6 type-6: it omits a
/// number of CM type-6 tags (e.g. `dialog`, `legend`, `optgroup`, `option`,
/// `frame`, `link`, `param`, `base`, `basefont`, `menuitem`) that pandoc
/// treats as raw inline HTML, and adds a few pandoc keeps as block-level
/// (`canvas`, `hgroup`, `isindex`, `meta`, `output`). The verbatim tags
/// (`pre`, `script`, `style`, `textarea`) are members of this list too.
///
/// Pandoc's `eitherBlockOrInline` set is tracked separately, in two
/// tables: the non-void members (`audio`, `button`, `iframe`, `noscript`,
/// `object`, `map`, `progress`, `video`, `del`, `ins`, `svg`, `applet`)
/// live in [`PANDOC_INLINE_BLOCK_TAGS`], and the void members (`embed`,
/// `area`, `source`, `track`) live in [`PANDOC_VOID_BLOCK_TAGS`]. Those
/// tags act as block starters at fresh-block positions but stay inline
/// inside an existing HTML block
/// (e.g. `<form><input><button>X</button></form>`); the projector's
/// `split_html_block_by_tags` keys on `inline_pending` to keep them
/// inline once an inline-only tag or text byte has been seen since the
/// last splitter.
///
/// Entries are lowercase ASCII and matched case-insensitively against the
/// candidate tag name.
const PANDOC_BLOCK_TAGS: &[&str] = &[
    "address",
    "article",
    "aside",
    "blockquote",
    "body",
    "canvas",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "isindex",
    "li",
    "main",
    "menu",
    "meta",
    "nav",
    "noframes",
    "ol",
    "output",
    "p",
    "pre",
    "script",
    "section",
    "style",
    "summary",
    "table",
    "tbody",
    "td",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "tr",
    "ul",
];
158
159/// Whether `name` (case-insensitive) is one of the HTML block-level tags
160/// recognized by CommonMark §4.6 type-6.
161pub fn is_html_block_tag_name(name: &str) -> bool {
162 let lower = name.to_ascii_lowercase();
163 BLOCK_TAGS.contains(&lower.as_str())
164}
165
166/// Whether `name` (case-insensitive) is one of pandoc's `blockHtmlTags` —
167/// the narrower set pandoc-markdown's `htmlBlock` reader recognizes.
168/// Used by the pandoc-native projector's `split_html_block_by_tags` to
169/// decide whether a complete HTML tag inside an `HTML_BLOCK` should split
170/// the block — block-level tags emit as separate `RawBlock` entries;
171/// inline tags stay inline in the surrounding `Plain` content.
172pub fn is_pandoc_block_tag_name(name: &str) -> bool {
173 let lower = name.to_ascii_lowercase();
174 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
175}
176
/// Non-void members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): tags that
/// `isBlockTag` accepts as block starters but `isInlineTag` ALSO accepts
/// (because `name ∉ blockTags`). At top level (or after a blank line)
/// pandoc treats `<iframe>foo</iframe>` as RawBlock+Plain+RawBlock, but
/// inside an existing HTML block, once a paragraph has started parsing,
/// the same tag stays inline as `RawInline`.
///
/// The projector's `split_html_block_by_tags` mirrors this with an
/// `inline_pending` flag — strict block tags ([`PANDOC_BLOCK_TAGS`])
/// always split; inline-block tags split only when no inline content
/// has been buffered since the last splitter.
///
/// Two groups of `eitherBlockOrInline` tags are deliberately absent:
///
/// - The void elements (`area`, `embed`, `source`, `track`) live in
///   [`PANDOC_VOID_BLOCK_TAGS`]; they follow the same `inline_pending`
///   rule as the tags listed here but emit a single RawBlock per
///   instance instead of a matched-pair lift.
/// - `script` is omitted because it is already verbatim (handled by the
///   `<script>...</script>` raw-text path) and the strict-block check
///   fires first regardless.
///
/// Entries are lowercase ASCII and matched case-insensitively against the
/// candidate tag name.
const PANDOC_INLINE_BLOCK_TAGS: &[&str] = &[
    "applet", "audio", "button", "del", "iframe", "ins", "map", "noscript", "object", "progress",
    "svg", "video",
];
201
202/// Whether `name` (case-insensitive) is one of pandoc's
203/// `eitherBlockOrInline` tags (excluding void elements and `script`;
204/// see [`PANDOC_INLINE_BLOCK_TAGS`]).
205pub fn is_pandoc_inline_block_tag_name(name: &str) -> bool {
206 let lower = name.to_ascii_lowercase();
207 PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
208}
209
/// Void-element members of pandoc's `eitherBlockOrInline` set (mirrors
/// `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`): `area`,
/// `embed`, `source` and `track`. Other HTML void elements are not listed
/// because they are handled elsewhere — `br`, `wbr`, `img` and `input`
/// are inline-only.
///
/// At fresh-block positions (or after a blank line) pandoc emits these
/// as a single `RawBlock`; inside a running paragraph they stay inline
/// as `RawInline`. The parser opens a depth-zero HTML block (closes
/// immediately on the open-tag line — there is no closing tag to
/// match) so subsequent lines start fresh blocks; the projector's
/// `split_html_block_by_tags` handles the same-line splitting via
/// `inline_pending`, emitting one `RawBlock` per void-tag instance.
///
/// Entries are lowercase ASCII and matched case-insensitively against the
/// candidate tag name.
const PANDOC_VOID_BLOCK_TAGS: &[&str] = &["area", "embed", "source", "track"];
225
226/// Whether `name` (case-insensitive) is one of pandoc's void
227/// `eitherBlockOrInline` tags (`area`, `embed`, `source`, `track`).
228pub fn is_pandoc_void_block_tag_name(name: &str) -> bool {
229 let lower = name.to_ascii_lowercase();
230 PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str())
231}
232
233/// Whether the given tag name is eligible for the Phase 6 / Fix #4
234/// structural body lift inside an `HTML_BLOCK` wrapper: it's a Pandoc
235/// block-level tag (strict-block from `PANDOC_BLOCK_TAGS` OR non-void
236/// inline-block from `PANDOC_INLINE_BLOCK_TAGS`) that is NOT verbatim
237/// and NOT void. These are the tags where pandoc parses the body as
238/// fresh markdown between RawBlock emissions of the open/close tags —
239/// exactly the shape we can lift into structural CST children.
240///
241/// Inline-block tags (`<video>`, `<iframe>`, `<button>`, …) have an
242/// additional gate at the lift-gate site: the lift is abandoned when
243/// the body's first non-blank content is a void block tag at a
244/// fresh-block position (`<video>\n<source ...>\n</video>` projects
245/// per-tag rather than matched-pair, mirroring pandoc).
246///
247/// `<div>` is intentionally excluded — it has its own lift path
248/// (`HTML_BLOCK_DIV` wrapper retag) with different demotion rules
249/// (Plain/Para keyed on `close_butted`, not on trailing blank line).
250fn is_pandoc_lift_eligible_block_tag(name: &str) -> bool {
251 let lower = name.to_ascii_lowercase();
252 if VERBATIM_TAGS.contains(&lower.as_str()) {
253 return false;
254 }
255 if PANDOC_VOID_BLOCK_TAGS.contains(&lower.as_str()) {
256 return false;
257 }
258 if lower == "div" {
259 return false;
260 }
261 PANDOC_BLOCK_TAGS.contains(&lower.as_str())
262 || PANDOC_INLINE_BLOCK_TAGS.contains(&lower.as_str())
263}
264
265/// Open-tag-attribute tokenization gate for non-div strict-block tags
266/// inside a blockquote (`bq_depth > 0`). Returns the tag name when the
267/// open tag is eligible for finer-grained tokenization
268/// (`TEXT("<tag") + WS + HTML_ATTRS{TEXT(attrs)} + TEXT(">")`) without
269/// driving the full body lift — that's the `bq_clean_lift` path. The
270/// HTML_ATTRS region lets `AttributeNode::cast` register any `id` with
271/// the salsa anchor index.
272///
273/// `<div>` is handled by its own structural path (`HTML_BLOCK_DIV`
274/// wrapper) regardless of bq depth, so this gate skips it.
275fn bq_strict_attr_emit_tag_name(
276 wrapper_kind: SyntaxKind,
277 block_type: &HtmlBlockType,
278 bq_depth: usize,
279) -> Option<&str> {
280 if bq_depth == 0 || wrapper_kind != SyntaxKind::HTML_BLOCK {
281 return None;
282 }
283 match block_type {
284 HtmlBlockType::BlockTag {
285 tag_name,
286 is_verbatim: false,
287 closed_by_blank_line: false,
288 depth_aware: true,
289 closes_at_open_tag: false,
290 is_closing: false,
291 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
292 _ => None,
293 }
294}
295
/// Classification of a detected HTML block opening, as produced by
/// `try_parse_html_block_start`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum HtmlBlockType {
    /// HTML comment: <!-- ... -->
    Comment,
    /// Processing instruction: <? ... ?>
    ProcessingInstruction,
    /// Declaration: <!...>
    Declaration,
    /// CDATA section: <![CDATA[ ... ]]>
    CData,
    /// Tag-shaped block. Under the CommonMark dialect the tag is a type-6
    /// (`BLOCK_TAGS`) or type-1 (`VERBATIM_TAGS`) start; under the Pandoc
    /// dialect it comes from `PANDOC_BLOCK_TAGS`,
    /// `PANDOC_INLINE_BLOCK_TAGS`, `PANDOC_VOID_BLOCK_TAGS` or
    /// `VERBATIM_TAGS`. The flags below select how the block is closed.
    BlockTag {
        /// Tag name; `try_parse_html_block_start` stores it lowercased.
        tag_name: String,
        /// The tag is one of `VERBATIM_TAGS` and was matched in its opening
        /// form (no Markdown processing inside the block).
        is_verbatim: bool,
        /// Use CommonMark §4.6 type-6 end semantics (block ends at a blank
        /// line); otherwise the legacy "ends at matching `</tag>`"
        /// semantics apply.
        closed_by_blank_line: bool,
        /// Extends the matching-tag close path with balanced open/close
        /// tracking of the same tag name (mirrors pandoc's
        /// `htmlInBalanced`); used under Pandoc dialect to handle nested
        /// `<div>...<div>...</div>...</div>` shapes correctly. Ignored when
        /// `closed_by_blank_line` is true.
        depth_aware: bool,
        /// Short-circuits the close search: the block always ends after the
        /// open-tag line. Used for void `eitherBlockOrInline` tags
        /// (`<embed>`, `<area>`, `<source>`, `<track>`) which have no
        /// closing tag — depth-aware matching would walk to end-of-input —
        /// and for the standalone closing forms that the Pandoc dialect
        /// emits as a single-line block.
        closes_at_open_tag: bool,
        /// Whether the tag at the start position is a closing form
        /// (`</tag>`) rather than an opening form (`<tag>`). The
        /// dispatcher's `cannot_interrupt` consults this to mirror pandoc's
        /// `isInlineTag` special cases (e.g. `</script>` is inline even
        /// when `<script>` is not — pandoc treats the close-form as
        /// always-inline regardless of attributes).
        is_closing: bool,
    },
    /// CommonMark §4.6 type 7: complete open or close tag on a line by
    /// itself, tag name not in the type-1 verbatim list. Block ends at
    /// blank line. Cannot interrupt a paragraph.
    Type7,
}
340
341/// Try to detect an HTML block opening from content.
342/// Returns block type if this is a valid HTML block start.
343///
344/// `is_commonmark` enables CommonMark §4.6 semantics: type-6 starts also
345/// accept closing tags (`</div>`), type-6 blocks end at the next blank
346/// line (rather than a matching close tag), and type 7 is recognized.
347pub(crate) fn try_parse_html_block_start(
348 content: &str,
349 is_commonmark: bool,
350) -> Option<HtmlBlockType> {
351 let trimmed = strip_leading_spaces(content);
352
353 // Must start with <
354 if !trimmed.starts_with('<') {
355 return None;
356 }
357
358 // HTML comment
359 if trimmed.starts_with("<!--") {
360 return Some(HtmlBlockType::Comment);
361 }
362
363 // Processing instruction
364 if trimmed.starts_with("<?") {
365 return Some(HtmlBlockType::ProcessingInstruction);
366 }
367
368 // CDATA section — CommonMark dialect only. Pandoc-markdown does not
369 // recognize bare CDATA as a raw HTML block; the literal bytes fall
370 // through to paragraph parsing (`<![CDATA[` becomes Str, the inner
371 // text is parsed as inline markdown, etc).
372 if is_commonmark && trimmed.starts_with("<![CDATA[") {
373 return Some(HtmlBlockType::CData);
374 }
375
376 // Declaration (DOCTYPE, etc.) — CommonMark dialect only. Pandoc-markdown
377 // does not recognize bare declarations as raw HTML blocks (its
378 // `htmlBlock` reader uses `htmlTag isBlockTag`, which only matches
379 // tag-shaped blocks); the bytes fall through to paragraph parsing.
380 if is_commonmark && trimmed.starts_with("<!") && trimmed.len() > 2 {
381 let after_bang = &trimmed[2..];
382 if after_bang.chars().next()?.is_ascii_alphabetic() {
383 return Some(HtmlBlockType::Declaration);
384 }
385 }
386
387 // Try to parse as opening tag (or closing tag, under CommonMark and Pandoc).
388 // Pandoc-native recognizes standalone closing forms of strict-block tags
389 // (`</p>`, `</nav>`, `</section>`), verbatim tags (`</pre>`, `</style>`,
390 // `</script>`, `</textarea>`), and inline-block / void tags (`</video>`,
391 // `</button>`, `</embed>`) as single-line `RawBlock`s — they always end on
392 // the open-tag line via `closes_at_open_tag: true`.
393 if let Some(tag_name) = extract_block_tag_name(trimmed, true) {
394 let tag_lower = tag_name.to_lowercase();
395 let is_closing = trimmed.starts_with("</");
396
397 // Pandoc dialect: strict-block (`PANDOC_BLOCK_TAGS`) and verbatim
398 // (`VERBATIM_TAGS`) closing forms emit as single-line `RawBlock`.
399 // Unlike inline-block / void closes, these CAN interrupt a running
400 // paragraph (the dispatcher's `cannot_interrupt` only covers the
401 // inline-block / void categories). Inline-block / void closes are
402 // handled by their own branches further below.
403 if !is_commonmark
404 && is_closing
405 && (PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
406 || VERBATIM_TAGS.contains(&tag_lower.as_str()))
407 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
408 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
409 {
410 return Some(HtmlBlockType::BlockTag {
411 tag_name: tag_lower,
412 is_verbatim: false,
413 closed_by_blank_line: false,
414 depth_aware: false,
415 closes_at_open_tag: true,
416 is_closing: true,
417 });
418 }
419
420 // Under Pandoc, remaining closing forms (truly inline-only tags like
421 // `</em>`, `</span>`) are not block starts — fall through to the
422 // existing inline-html path. Inline-block + void closes are caught
423 // by the dedicated branches further below.
424 if !is_commonmark
425 && is_closing
426 && !PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str())
427 && !PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str())
428 {
429 return None;
430 }
431
432 // Check if it's a block-level tag. Pandoc and CommonMark disagree on
433 // membership: pandoc's `blockHtmlTags` (see
434 // `pandoc/src/Text/Pandoc/Readers/HTML/TagCategories.hs`) treats some
435 // CM type-6 tags as inline (e.g. `dialog`, `legend`, `option`) and
436 // some non-CM tags as block (e.g. `canvas`, `hgroup`, `meta`).
437 let is_block_tag = if is_commonmark {
438 BLOCK_TAGS.contains(&tag_lower.as_str())
439 } else {
440 PANDOC_BLOCK_TAGS.contains(&tag_lower.as_str())
441 };
442 if is_block_tag {
443 let is_verbatim = VERBATIM_TAGS.contains(&tag_lower.as_str());
444 return Some(HtmlBlockType::BlockTag {
445 tag_name: tag_lower,
446 is_verbatim,
447 closed_by_blank_line: is_commonmark && !is_verbatim,
448 depth_aware: !is_commonmark,
449 closes_at_open_tag: false,
450 is_closing,
451 });
452 }
453
454 // Pandoc dialect also treats `eitherBlockOrInline` tags as block
455 // starters at fresh-block positions. The block dispatcher caller
456 // gates these as `cannot_interrupt` (mirrors pandoc — they never
457 // interrupt a running paragraph; only start a fresh block when
458 // following a blank line or at document start). Closing forms
459 // (`</video>`) emit as a single-line `RawBlock` with no balanced
460 // match — pandoc-native pins this for standalone closes.
461 if !is_commonmark && PANDOC_INLINE_BLOCK_TAGS.contains(&tag_lower.as_str()) {
462 return Some(HtmlBlockType::BlockTag {
463 tag_name: tag_lower,
464 is_verbatim: false,
465 closed_by_blank_line: false,
466 depth_aware: !is_closing,
467 closes_at_open_tag: is_closing,
468 is_closing,
469 });
470 }
471
472 // Pandoc dialect also recognizes the void subset of
473 // `eitherBlockOrInline` (`area`, `embed`, `source`, `track`).
474 // These have no closing tag, so the parser closes the block
475 // immediately on the open-tag line; the projector's
476 // `split_html_block_by_tags` handles the same-line splitting
477 // (e.g. `<embed src="a"> trailing` → RawBlock + Para). Like
478 // non-void inline-block tags, void tags never interrupt a
479 // running paragraph (gated as `cannot_interrupt` in the
480 // dispatcher). Closing forms (`</embed>`) — semantically
481 // nonsensical for void elements — pandoc still emits as a
482 // single-line `RawBlock`; mirror that.
483 if !is_commonmark && PANDOC_VOID_BLOCK_TAGS.contains(&tag_lower.as_str()) {
484 return Some(HtmlBlockType::BlockTag {
485 tag_name: tag_lower,
486 is_verbatim: false,
487 closed_by_blank_line: false,
488 depth_aware: false,
489 closes_at_open_tag: true,
490 is_closing,
491 });
492 }
493
494 // Also accept verbatim tags even if not in BLOCK_TAGS list — but
495 // only as opening tags. CommonMark §4.6 type 1 starts with `<pre`,
496 // `<script`, `<style`, or `<textarea`; closing forms like `</pre>`
497 // do not start a type-1 block. Letting `</pre>` through here would
498 // wrongly interrupt a paragraph.
499 if !is_closing && VERBATIM_TAGS.contains(&tag_lower.as_str()) {
500 return Some(HtmlBlockType::BlockTag {
501 tag_name: tag_lower,
502 is_verbatim: true,
503 closed_by_blank_line: false,
504 depth_aware: !is_commonmark,
505 closes_at_open_tag: false,
506 is_closing: false,
507 });
508 }
509 }
510
511 // Type 7 (CommonMark only): complete open or close tag on a line by
512 // itself, tag name not in the type-1 verbatim list.
513 if is_commonmark && let Some(end) = parse_open_tag(trimmed).or_else(|| parse_close_tag(trimmed))
514 {
515 let rest = &trimmed[end..];
516 let only_ws = rest
517 .bytes()
518 .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'));
519 if only_ws {
520 // Reject if the tag name belongs to the type-1 verbatim set
521 // (`<pre>`, `<script>`, `<style>`, `<textarea>`) — those are
522 // type-1 starts above, so seeing one here means the opener
523 // had a different shape (e.g. `<pre/>` self-closing) that
524 // shouldn't trigger type 7 either. Conservatively skip.
525 let leading = trimmed.strip_prefix("</").unwrap_or_else(|| &trimmed[1..]);
526 let name_end = leading
527 .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
528 .unwrap_or(leading.len());
529 let name = leading[..name_end].to_ascii_lowercase();
530 if !VERBATIM_TAGS.contains(&name.as_str()) {
531 return Some(HtmlBlockType::Type7);
532 }
533 }
534 }
535
536 None
537}
538
/// Pull the tag name out of a tag-shaped prefix for HTML-block-start
/// detection.
///
/// `text` must begin with `<`. A closing form (`</tag>`) is accepted only
/// when `accept_closing` is true (CommonMark §4.6 type 6 allows either
/// form); otherwise it yields `None`.
///
/// The name runs up to the first whitespace character, `>`, or `/` (or to
/// the end of `text`). The spec requires the name to be followed by a
/// space, tab, line ending, `>`, or `/>`; this boundary check is an
/// approximation of that rule, so e.g. `<div/foo` still yields `div`.
///
/// Returns the name with its original casing, or `None` when the name is
/// empty, does not start with an ASCII letter, or contains anything other
/// than ASCII alphanumerics.
fn extract_block_tag_name(text: &str, accept_closing: bool) -> Option<String> {
    let after_bracket = text.strip_prefix('<')?;

    let candidate = match after_bracket.strip_prefix('/') {
        Some(_) if !accept_closing => return None,
        Some(after_slash) => after_slash,
        None => after_bracket,
    };

    // `split` always yields at least one (possibly empty) piece.
    let name = candidate
        .split(|c: char| c.is_whitespace() || matches!(c, '>' | '/'))
        .next()
        .unwrap_or("");

    let mut chars = name.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    let well_formed = starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric());

    well_formed.then(|| name.to_owned())
}
583
584/// Whether this block type ends at a blank line (CommonMark types 6 & 7
585/// in CommonMark dialect). Such blocks do NOT close on a matching tag /
586/// marker — only at end of input or the next blank line.
587fn ends_at_blank_line(block_type: &HtmlBlockType) -> bool {
588 matches!(
589 block_type,
590 HtmlBlockType::Type7
591 | HtmlBlockType::BlockTag {
592 closed_by_blank_line: true,
593 ..
594 }
595 )
596}
597
598/// Check if a line contains the closing marker for the given HTML block type.
599/// Only meaningful for types 1–5 and the legacy "type 6 closed by tag" path;
600/// blank-line-terminated types (6 in CommonMark, 7) never match here.
601fn is_closing_marker(line: &str, block_type: &HtmlBlockType) -> bool {
602 match block_type {
603 HtmlBlockType::Comment => line.contains("-->"),
604 HtmlBlockType::ProcessingInstruction => line.contains("?>"),
605 HtmlBlockType::Declaration => line.contains('>'),
606 HtmlBlockType::CData => line.contains("]]>"),
607 HtmlBlockType::BlockTag {
608 tag_name,
609 closed_by_blank_line: false,
610 ..
611 } => {
612 // Look for closing tag </tagname>
613 let closing_tag = format!("</{}>", tag_name);
614 line.to_lowercase().contains(&closing_tag)
615 }
616 HtmlBlockType::BlockTag {
617 closed_by_blank_line: true,
618 ..
619 }
620 | HtmlBlockType::Type7 => false,
621 }
622}
623
/// Count occurrences of `<tag_name ...>` (open) and `</tag_name>` (close)
/// in `line`, returned as `(opens, closes)`.
///
/// The tag name is matched ignoring ASCII case and must be followed by a
/// space, tab, line ending, `>`, `/`, or the end of `line`. The scanner
/// walks each `<...>` bracket and respects `"`/`'` quoting, so:
///
/// - self-closing forms (`<tag .../>`) are NOT counted,
/// - a tag whose name appears inside a quoted attribute value of another
///   tag is NOT counted,
/// - an open tag whose `>` is not on this line (multi-line open tag) is
///   still counted; scanning then stops because the rest of the line
///   belongs to that unterminated bracket.
///
/// A `<` that cannot begin markup — one not followed by an ASCII letter,
/// `/`, `!`, or `?`, such as the `<` in `1 < 2` — is treated as literal
/// text. It therefore no longer swallows a real tag that follows it on the
/// same line (`<p>1 < 2</p>` counts one open and one close).
///
/// Used by [`parse_html_block_with_wrapper`] to balance nested same-name
/// tags under Pandoc dialect (mirrors pandoc's `htmlInBalanced`).
fn count_tag_balance(line: &str, tag_name: &str) -> (usize, usize) {
    // Index of the first `>` at or after `from` that is not inside a
    // single- or double-quoted attribute value.
    fn bracket_end(bytes: &[u8], from: usize) -> Option<usize> {
        let mut quote: Option<u8> = None;
        for (idx, &b) in bytes.iter().enumerate().skip(from) {
            match quote {
                Some(q) => {
                    if b == q {
                        quote = None;
                    }
                }
                None => match b {
                    b'"' | b'\'' => quote = Some(b),
                    b'>' => return Some(idx),
                    _ => {}
                },
            }
        }
        None
    }

    let bytes = line.as_bytes();
    let tag = tag_name.as_bytes();

    let mut opens = 0usize;
    let mut closes = 0usize;
    let mut i = 0usize;

    while i < bytes.len() {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }

        let after = i + 1;
        let next = bytes.get(after).copied();

        // Only a letter (tag), `/` (end tag), `!` (comment / declaration)
        // or `?` (processing instruction) makes `<` the start of markup.
        // Anything else is a literal `<` in text; skip just that byte.
        let starts_markup = matches!(
            next,
            Some(b) if b.is_ascii_alphabetic() || matches!(b, b'/' | b'!' | b'?')
        );
        if !starts_markup {
            i = after;
            continue;
        }

        let is_close = next == Some(b'/');
        let name_start = if is_close { after + 1 } else { after };
        let after_name = name_start + tag.len();
        let matched = bytes
            .get(name_start..after_name)
            .is_some_and(|name| name.eq_ignore_ascii_case(tag));
        // The name must end at a tag boundary so `<divx>` is not `<div>`.
        let is_boundary = matched
            && matches!(
                bytes.get(after_name).copied(),
                Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
            );

        // Walk forward to the closing `>` of this bracket, skipping over
        // quoted attribute values. Self-closing form ends with `/>`.
        let scan_from = if matched { after_name } else { after };
        let gt = bracket_end(bytes, scan_from);
        let self_close = gt.is_some_and(|end| end > after && bytes[end - 1] == b'/');

        if is_boundary {
            if is_close {
                closes += 1;
            } else if !self_close {
                opens += 1;
            }
        }

        match gt {
            Some(end) => i = end + 1,
            // Unterminated `<...` — the remaining bytes don't form a
            // complete tag, so there is nothing further to count.
            None => break,
        }
    }

    (opens, closes)
}
701
702/// Parse an HTML block, allowing the caller to pick the wrapper SyntaxKind
703/// (`HTML_BLOCK` for opaque preservation, `HTML_BLOCK_DIV` for the
704/// Pandoc-dialect `<div>` lift). Children are emitted byte-for-byte
705/// identical to the source either way; only the wrapper retag changes.
706pub(crate) fn parse_html_block_with_wrapper(
707 builder: &mut GreenNodeBuilder<'static>,
708 lines: &[&str],
709 start_pos: usize,
710 block_type: HtmlBlockType,
711 bq_depth: usize,
712 wrapper_kind: SyntaxKind,
713 config: &ParserOptions,
714) -> usize {
715 // Start HTML block
716 builder.start_node(wrapper_kind.into());
717
718 let first_line = lines[start_pos];
719 let blank_terminated = ends_at_blank_line(&block_type);
720
721 // The block dispatcher has already emitted BLOCK_QUOTE_MARKER + WHITESPACE
722 // tokens for the first line's blockquote prefix; emit only the inner
723 // content as TEXT to keep the CST byte-equal to the source.
724 let first_inner = if bq_depth > 0 {
725 strip_n_blockquote_markers(first_line, bq_depth)
726 } else {
727 first_line
728 };
729
730 // Detect a multi-line open tag.
731 // - `<div>` (Pandoc lift): we tokenize each line structurally so the
732 // salsa anchor walk picks up `id` from the HTML_ATTRS region.
733 // - Pandoc strict-block tags eligible for the Fix #4 lift (`<form>`,
734 // `<section>`, `<header>`, …): same structural emission, exposing
735 // `id` to the salsa anchor walk and enabling the body lift below.
736 // - Void block tags (`<embed>`, `<area>`, `<source>`, `<track>`):
737 // without this, the parser closes the block after line 0 and the
738 // remainder of the open tag falls into following paragraphs;
739 // pandoc-native treats the whole multi-line open tag as a single
740 // `RawBlock`. Emission for void tags uses simple per-line
741 // TEXT + NEWLINE (no HTML_ATTRS — the projector doesn't read attrs
742 // from void tags).
743 let multiline_open_end = if bq_depth == 0 {
744 match (wrapper_kind, &block_type) {
745 (SyntaxKind::HTML_BLOCK_DIV, _) => {
746 find_multiline_open_end(lines, start_pos, first_inner, "div")
747 }
748 (
749 _,
750 HtmlBlockType::BlockTag {
751 tag_name,
752 closes_at_open_tag: true,
753 ..
754 },
755 ) => find_multiline_open_end(lines, start_pos, first_inner, tag_name),
756 (
757 _,
758 HtmlBlockType::BlockTag {
759 tag_name,
760 is_verbatim: false,
761 closed_by_blank_line: false,
762 depth_aware: true,
763 closes_at_open_tag: false,
764 is_closing: false,
765 },
766 ) if is_pandoc_lift_eligible_block_tag(tag_name) => {
767 find_multiline_open_end(lines, start_pos, first_inner, tag_name)
768 }
769 _ => None,
770 }
771 } else {
772 None
773 };
774
775 // Set up depth-aware close tracking when the block type asks for it
776 // (Pandoc dialect, balanced same-name tag matching). A `None` means
777 // we fall back to the legacy "first matching close" path via
778 // `is_closing_marker`. Computed up front so the lift-mode gate
779 // below can decide whether the open line already balances the
780 // block (same-line `<div>...</div>`).
781 let depth_aware_tag: Option<String> = match &block_type {
782 HtmlBlockType::BlockTag {
783 tag_name,
784 closed_by_blank_line: false,
785 depth_aware: true,
786 ..
787 } => Some(tag_name.clone()),
788 _ => None,
789 };
790 let mut depth: i64 = 1;
791 if let Some(tag_name) = &depth_aware_tag {
792 // Sum opens/closes across all open-tag lines (single-line: just
793 // line 0; multi-line: lines 0..=end_line_idx).
794 let last_open_line = multiline_open_end.unwrap_or(start_pos);
795 let mut opens = 0usize;
796 let mut closes = 0usize;
797 for line in &lines[start_pos..=last_open_line] {
798 let inner = if bq_depth > 0 {
799 strip_n_blockquote_markers(line, bq_depth)
800 } else {
801 line
802 };
803 let (o, c) = count_tag_balance(inner, tag_name);
804 opens += o;
805 closes += c;
806 }
807 depth = opens as i64 - closes as i64;
808 }
809
810 // Same-line `<div>foo</div>` shape: the open line balances the
811 // block under depth-aware tracking. We can lift this structurally
812 // only when the open-tag trailing has exactly one `</div>` close,
813 // zero `<div>` opens, and no non-whitespace content after the
814 // close. Other same-line shapes (nested, trailing text, malformed)
815 // fall through to the byte-reparse path.
816 let is_same_line_div = wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
817 && multiline_open_end.is_none()
818 && depth_aware_tag.is_some()
819 && depth <= 0;
820 let same_line_div_lift_safe = is_same_line_div && bq_depth == 0 && {
821 let (line_without_newline, _) = strip_newline(first_inner);
822 probe_same_line_lift(line_without_newline, "div")
823 };
824
825 // Strict-block-tag Fix #4 lift (`<form>`, `<section>`, `<header>`,
826 // `<nav>`, …): the body parses as fresh markdown between RawBlock
827 // emissions of the open/close tags. Covers the clean multi-line
828 // shape (open tag stands alone on its line), open-trailing
829 // (`<form>foo\n…\n</form>`), butted-close (`<form>\n…\nfoo</form>`),
830 // and same-line (`<form>foo</form>`). Multi-line open and
831 // blockquote-wrapped non-div shapes still fall through to the
832 // byte-walker path.
833 let strict_block_tag_name: Option<&str> =
834 if wrapper_kind == SyntaxKind::HTML_BLOCK && bq_depth == 0 {
835 match &block_type {
836 HtmlBlockType::BlockTag {
837 tag_name,
838 is_verbatim: false,
839 closed_by_blank_line: false,
840 depth_aware: true,
841 closes_at_open_tag: false,
842 is_closing: false,
843 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
844 _ => None,
845 }
846 } else {
847 None
848 };
849 // Same-line `<form>foo</form>` shape: the open line already
850 // balances the block (`depth <= 0`). Lift only when the trailing
851 // bytes after the open `>` end with `</tag>` and contain exactly
852 // one close + zero nested opens.
853 let same_line_strict_lift_safe = strict_block_tag_name.is_some_and(|name| {
854 multiline_open_end.is_none() && depth <= 0 && {
855 let (line_no_nl, _) = strip_newline(first_inner);
856 probe_same_line_lift(line_no_nl, name)
857 }
858 });
859 // Strict-block lift gate: accept (a) a multi-line open tag spanning
860 // `lines[start_pos..=multiline_open_end]`, or (b) a clean / open-
861 // trailing single-line open (depth > 0, open `>` is present with
862 // quote-aware matching), or (c) a safe same-line shape. For
863 // inline-block matched-pair tags (`<video>`, `<iframe>`, `<button>`,
864 // …) the lift additionally abandons when the body starts at a
865 // fresh-block position with a void block tag — pandoc-native pins
866 // per-tag emission rather than a matched-pair lift in that case.
867 let strict_block_lift = strict_block_tag_name.is_some_and(|name| {
868 let (line_no_nl, _) = strip_newline(first_inner);
869 let shape_ok = if multiline_open_end.is_some() {
870 // `find_multiline_open_end` already verified the open tag
871 // closes with a quote-aware `>` somewhere in lines
872 // `start_pos+1..=end`. No same-line trailing content to
873 // probe; defer trailing-on-close-`>`-line handling to a
874 // future session (rare in practice).
875 true
876 } else if depth > 0 {
877 probe_open_tag_line_has_close_gt(line_no_nl, name)
878 } else {
879 same_line_strict_lift_safe
880 };
881 if !shape_ok {
882 return false;
883 }
884 if !is_pandoc_inline_block_tag_name(name) {
885 return true;
886 }
887 !inline_block_void_interior_abandons(
888 first_inner,
889 lines,
890 start_pos,
891 multiline_open_end,
892 bq_depth,
893 name,
894 )
895 });
896
897 // Same-line lift inside a blockquote (`> <tag>body</tag>`). Bytes
898 // are byte-equal to the non-bq same-line shape minus the leading
899 // `> ` (which sits on the outer BLOCK_QUOTE, not inside HTML_BLOCK).
900 // The body has no inner newlines, so no bq prefix re-injection is
901 // needed when grafting — `emit_html_block_body_lifted` (passing
902 // `bq: &mut None`) is enough. Other bq shapes (butted-close,
903 // open-trailing) still fall through to the projector's byte
904 // walker — they need per-line prefix injection.
905 let same_line_bq_lift_tag: Option<&str> = if bq_depth > 0
906 && multiline_open_end.is_none()
907 && depth_aware_tag.is_some()
908 && depth <= 0
909 {
910 let (line_no_nl, _) = strip_newline(first_inner);
911 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
912 if probe_same_line_lift(line_no_nl, "div") {
913 Some("div")
914 } else {
915 None
916 }
917 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
918 match &block_type {
919 HtmlBlockType::BlockTag {
920 tag_name,
921 is_verbatim: false,
922 closed_by_blank_line: false,
923 depth_aware: true,
924 closes_at_open_tag: false,
925 is_closing: false,
926 } if is_pandoc_lift_eligible_block_tag(tag_name)
927 && probe_same_line_lift(line_no_nl, tag_name.as_str()) =>
928 {
929 // Inline-block tags (`<video>`, `<iframe>`, …) skip
930 // the void-interior check at same-line — the shape
931 // has no inner block content to interfere with.
932 Some(tag_name.as_str())
933 }
934 _ => None,
935 }
936 } else {
937 None
938 }
939 } else {
940 None
941 };
942
943 // Messy-shape lift inside a blockquote — covers open-trailing
944 // (`> <div>foo\n> </div>`), butted-close (`> <div>\n> foo</div>`),
945 // and open-trailing + butted-close (`> <div>foo\n> bar</div>`).
946 // The open line does NOT balance the block (depth > 0 after the
947 // open line, distinguishing this from `same_line_bq_lift_tag` which
948 // requires depth <= 0). The close line — possibly with leading body
949 // text — closes the block when depth returns to 0. Body lines (incl.
950 // open trailing and close leading) graft via prefix re-injection.
951 let bq_messy_lift_tag: Option<&str> =
952 if bq_depth > 0 && multiline_open_end.is_none() && depth_aware_tag.is_some() && depth > 0 {
953 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
954 Some("div")
955 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
956 match &block_type {
957 HtmlBlockType::BlockTag {
958 tag_name,
959 is_verbatim: false,
960 closed_by_blank_line: false,
961 depth_aware: true,
962 closes_at_open_tag: false,
963 is_closing: false,
964 } if is_pandoc_lift_eligible_block_tag(tag_name) => {
965 // Inline-block matched-pair tags (`<video>`, `<iframe>`,
966 // …) abandon the lift when the body starts at a
967 // fresh-block position with a void block tag. Same gate
968 // as the non-bq matched-pair lift (`strict_block_lift`).
969 if is_pandoc_inline_block_tag_name(tag_name)
970 && inline_block_void_interior_abandons(
971 first_inner,
972 lines,
973 start_pos,
974 multiline_open_end,
975 bq_depth,
976 tag_name,
977 )
978 {
979 None
980 } else {
981 Some(tag_name.as_str())
982 }
983 }
984 _ => None,
985 }
986 } else {
987 None
988 }
989 } else {
990 None
991 };
992
993 // Whether this block participates in the Phase 6 structural lift
994 // (recursively parse body as Pandoc markdown and graft children).
995 // Covers `<div>` outside blockquote context. For same-line shapes
996 // the lift is gated on `same_line_*_lift_safe` — when unsafe we
997 // keep the legacy single-HTML_BLOCK_TAG shape and let the
998 // byte-reparse path handle projection.
999 let lift_mode = (wrapper_kind == SyntaxKind::HTML_BLOCK_DIV
1000 && bq_depth == 0
1001 && (!is_same_line_div || same_line_div_lift_safe))
1002 || strict_block_lift
1003 || same_line_bq_lift_tag.is_some()
1004 || bq_messy_lift_tag.is_some();
1005
1006 // Trailing content from the open tag (after `>`). When the lift is
1007 // active and the open line is `<div ATTRS>foo\n`, this captures
1008 // `"foo\n"` so it becomes the leading bytes of the recursive-parse
1009 // input. Stays empty for clean opens (`<div>\n`) and for non-lift
1010 // shapes (same-line / blockquote-wrapped).
1011 let mut pre_content = String::new();
1012
1013 // Emit opening line(s)
1014 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1015
1016 if let Some(end_line_idx) = multiline_open_end {
1017 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1018 emit_multiline_open_tag_with_attrs(builder, lines, start_pos, end_line_idx, "div");
1019 } else if let Some(name) = strict_block_tag_name
1020 && strict_block_lift
1021 {
1022 emit_multiline_open_tag_with_attrs(builder, lines, start_pos, end_line_idx, name);
1023 } else {
1024 emit_multiline_open_tag_simple(builder, lines, start_pos, end_line_idx);
1025 }
1026 } else {
1027 let (line_without_newline, newline_str) = strip_newline(first_inner);
1028 if !line_without_newline.is_empty() {
1029 // For HTML_BLOCK_DIV, expose the open tag's attributes
1030 // structurally so `AttributeNode::cast(HTML_ATTRS)` finds them
1031 // via the same descendants walk that handles fenced-div /
1032 // heading attrs. CST bytes stay byte-equal to source — we only
1033 // tokenize at finer granularity for matched div opens.
1034 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1035 let trailing =
1036 emit_open_tag_tokens(builder, line_without_newline, "div", lift_mode);
1037 if !trailing.is_empty() {
1038 pre_content.push_str(trailing);
1039 pre_content.push_str(newline_str);
1040 }
1041 } else if let Some(name) = strict_block_tag_name
1042 && strict_block_lift
1043 {
1044 let trailing = emit_open_tag_tokens(builder, line_without_newline, name, lift_mode);
1045 if !trailing.is_empty() {
1046 pre_content.push_str(trailing);
1047 pre_content.push_str(newline_str);
1048 }
1049 } else if let Some(name) =
1050 bq_strict_attr_emit_tag_name(wrapper_kind, &block_type, bq_depth)
1051 {
1052 // Inside a blockquote, lift trailing bytes into
1053 // `pre_content` when either the same-line bq gate fires
1054 // (`> <tag>body</tag>` — handled by `same_line_closed`)
1055 // or the messy-shape bq gate fires (`> <tag>foo\n…\n>
1056 // </tag>` and butted-close — handled at the close-marker
1057 // site below). For the clean-shape bq lift the open has
1058 // no trailing bytes regardless, so `lift_trailing=true`
1059 // is a no-op there.
1060 let lift_trailing =
1061 same_line_bq_lift_tag == Some(name) || bq_messy_lift_tag == Some(name);
1062 let trailing =
1063 emit_open_tag_tokens(builder, line_without_newline, name, lift_trailing);
1064 if lift_trailing && !trailing.is_empty() {
1065 pre_content.push_str(trailing);
1066 pre_content.push_str(newline_str);
1067 }
1068 } else {
1069 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
1070 }
1071 }
1072 // When the open tag has trailing content under lift mode, the
1073 // newline belongs to that trailing line (it terminates the
1074 // synthetic body line, not the open tag). Don't double-emit.
1075 if pre_content.is_empty() && !newline_str.is_empty() {
1076 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
1077 }
1078 }
1079
1080 builder.finish_node(); // HtmlBlockTag
1081
1082 // Check if opening line also contains closing marker. Blank-line-terminated
1083 // blocks (CommonMark types 6 & 7) ignore inline close markers — they only
1084 // end at a blank line or end of input. Void `eitherBlockOrInline` tags
1085 // (`closes_at_open_tag: true`) close immediately — the block always
1086 // ends on the open-tag line since there is no closing tag to find.
1087 let void_block = matches!(
1088 &block_type,
1089 HtmlBlockType::BlockTag {
1090 closes_at_open_tag: true,
1091 ..
1092 }
1093 );
1094 // Void tags with a multi-line open close immediately after the open
1095 // tag's last line. The HTML_BLOCK_TAG already covers all open-tag
1096 // lines (`emit_multiline_open_tag_simple` above); pandoc-native emits
1097 // a single RawBlock for the whole multi-line tag, with no following
1098 // content.
1099 if void_block && let Some(end_line_idx) = multiline_open_end {
1100 log::trace!(
1101 "HTML void block at line {} closes after multi-line open ending at line {}",
1102 start_pos + 1,
1103 end_line_idx + 1
1104 );
1105 builder.finish_node(); // HtmlBlock
1106 return end_line_idx + 1;
1107 }
1108 let same_line_closed = !blank_terminated
1109 && multiline_open_end.is_none()
1110 && (void_block
1111 || match &depth_aware_tag {
1112 Some(_) => depth <= 0,
1113 None => is_closing_marker(first_inner, &block_type),
1114 });
1115 if same_line_closed {
1116 log::trace!(
1117 "HTML block at line {} opens and closes on same line",
1118 start_pos + 1
1119 );
1120 // Same-line structural lift (div or non-div strict-block):
1121 // pre_content holds the bytes after the open `>` (including
1122 // the close `</tag>` and the trailing newline). Split into
1123 // body + close tag, emit body via recursive parse, emit close
1124 // tag as a sibling `HTML_BLOCK_TAG`.
1125 let same_line_lift_tag: Option<&str> = if !lift_mode || pre_content.is_empty() {
1126 None
1127 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV && same_line_div_lift_safe {
1128 Some("div")
1129 } else if same_line_strict_lift_safe {
1130 strict_block_tag_name
1131 } else if let Some(name) = same_line_bq_lift_tag {
1132 // Bq same-line: body has no inner newlines so the standard
1133 // `emit_html_block_body_lifted` (with `bq: &mut None`) is
1134 // sufficient. The bq prefix `> ` lives on the outer
1135 // BLOCK_QUOTE, outside the HTML_BLOCK[_DIV] span.
1136 Some(name)
1137 } else {
1138 None
1139 };
1140 if let Some(tag_name) = same_line_lift_tag {
1141 let (pre_no_nl, post_nl) = strip_newline(&pre_content);
1142 if let Some((leading, close_part)) = try_split_close_line(pre_no_nl, tag_name) {
1143 // Same-line is always close-butted; div demotes the
1144 // trailing Para→Plain via `SkipTrailingBlanks`.
1145 // Non-div strict-block uses `OnlyIfLast` (consistent
1146 // with butted-close — no trailing BLANK_LINE before
1147 // the close means the trailing Para demotes).
1148 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1149 LastParaDemote::SkipTrailingBlanks
1150 } else {
1151 LastParaDemote::OnlyIfLast
1152 };
1153 emit_html_block_body_lifted(builder, "", &[], leading, policy, config);
1154 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1155 let mut close_line = String::with_capacity(close_part.len() + post_nl.len());
1156 close_line.push_str(close_part);
1157 close_line.push_str(post_nl);
1158 emit_html_block_line(builder, &close_line, 0);
1159 builder.finish_node();
1160 builder.finish_node(); // HtmlBlock
1161 return start_pos + 1;
1162 }
1163 }
1164 builder.finish_node(); // HtmlBlock
1165 return start_pos + 1;
1166 }
1167
1168 let mut current_pos = multiline_open_end
1169 .map(|end| end + 1)
1170 .unwrap_or(start_pos + 1);
1171 let mut content_lines: Vec<&str> = Vec::new();
1172 let mut found_closing = false;
1173
1174 // Parse content until we find the closing marker
1175 while current_pos < lines.len() {
1176 let line = lines[current_pos];
1177 let (line_bq_depth, inner) = count_blockquote_markers(line);
1178
1179 // Only process lines at the same or deeper blockquote depth
1180 if line_bq_depth < bq_depth {
1181 break;
1182 }
1183
1184 // Blank-line-terminated blocks (types 6/7) end before the blank line.
1185 // The blank line itself is not part of the block.
1186 if blank_terminated && inner.trim().is_empty() {
1187 break;
1188 }
1189
1190 // Check for closing marker. Under depth-aware mode (Pandoc dialect)
1191 // count opens/closes of the same tag name and only close when depth
1192 // returns to 0; otherwise fall back to substring-match on the line.
1193 let line_closes = match &depth_aware_tag {
1194 Some(tag_name) => {
1195 let (opens, closes) = count_tag_balance(inner, tag_name);
1196 depth += opens as i64;
1197 depth -= closes as i64;
1198 depth <= 0
1199 }
1200 None => is_closing_marker(inner, &block_type),
1201 };
1202
1203 if line_closes {
1204 log::trace!("Found HTML block closing at line {}", current_pos + 1);
1205 found_closing = true;
1206
1207 // Pandoc-dialect blockquote-wrapped clean-shape lift: when
1208 // the open and close tags stand alone on their source lines
1209 // (no trailing on open, no body content on close after
1210 // stripping bq markers), lift the body lines structurally
1211 // so the projector walks CST children instead of
1212 // byte-reparsing via `collect_html_block_text_skip_bq_markers`.
1213 //
1214 // Covers `<div>` (HTML_BLOCK_DIV → Block::Div with body
1215 // grafted, Para preserved), non-div strict-block tags
1216 // (`<form>`, `<section>`, …) and inline-block matched-pair
1217 // tags (`<video>`, `<iframe>`, …) — the latter two under
1218 // HTML_BLOCK with the structural lift hitting pandoc's
1219 // RawBlock + Plain + RawBlock shape via `OnlyIfLast`
1220 // demotion. Inline-block additionally bails if the body
1221 // starts at a fresh-block position with a void block tag
1222 // (mirrors the non-bq matched-pair gate).
1223 //
1224 // Other bq-wrapped shapes (butted-close / open-trailing /
1225 // same-line) still fall through to the opaque path.
1226 let bq_lift_tag: Option<&str> = if bq_depth > 0
1227 && multiline_open_end.is_none()
1228 && pre_content.is_empty()
1229 {
1230 if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1231 Some("div")
1232 } else if wrapper_kind == SyntaxKind::HTML_BLOCK {
1233 match &block_type {
1234 HtmlBlockType::BlockTag {
1235 tag_name,
1236 is_verbatim: false,
1237 closed_by_blank_line: false,
1238 depth_aware: true,
1239 closes_at_open_tag: false,
1240 is_closing: false,
1241 } if is_pandoc_lift_eligible_block_tag(tag_name) => Some(tag_name.as_str()),
1242 _ => None,
1243 }
1244 } else {
1245 None
1246 }
1247 } else {
1248 None
1249 };
1250
1251 let bq_clean_lift = bq_lift_tag.is_some_and(|tag_name| {
1252 let (open_no_nl, _) = strip_newline(first_inner);
1253 if !open_no_nl.trim_end_matches([' ', '\t']).ends_with('>') {
1254 return false;
1255 }
1256 let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1257 let (close_no_nl, _) = strip_newline(close_stripped);
1258 if !close_no_nl
1259 .trim_start_matches([' ', '\t'])
1260 .starts_with("</")
1261 {
1262 return false;
1263 }
1264 if is_pandoc_inline_block_tag_name(tag_name)
1265 && inline_block_void_interior_abandons(
1266 first_inner,
1267 lines,
1268 start_pos,
1269 multiline_open_end,
1270 bq_depth,
1271 tag_name,
1272 )
1273 {
1274 return false;
1275 }
1276 true
1277 });
1278
1279 if bq_clean_lift {
1280 let demote_policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1281 LastParaDemote::Never
1282 } else {
1283 LastParaDemote::OnlyIfLast
1284 };
1285 emit_html_block_body_lifted_bq(
1286 builder,
1287 &content_lines,
1288 bq_depth,
1289 demote_policy,
1290 config,
1291 );
1292 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1293 emit_html_block_line(builder, line, bq_depth);
1294 builder.finish_node();
1295 current_pos += 1;
1296 break;
1297 }
1298
1299 // Bq messy-shape lift — single-line open with trailing or
1300 // butted-close (or both). `pre_content` already captures any
1301 // open-trailing bytes (open `HTML_BLOCK_TAG` ends at `>`);
1302 // strip the close line's bq markers before splitting so
1303 // `leading` and `close_part` are bq-prefix-free. Body parses
1304 // recursively from `pre_content + stripped(content_lines) +
1305 // leading`, with per-line bq prefixes re-injected so the CST
1306 // stays byte-equal to the source. Demote: div is keyed on
1307 // close-butted-ness (Plain when leading non-empty, Para
1308 // otherwise); non-div uses OnlyIfLast either way.
1309 if let Some(tag_name) = bq_messy_lift_tag {
1310 let close_stripped = strip_n_blockquote_markers(line, bq_depth);
1311 let close_prefix_len = line.len() - close_stripped.len();
1312 let close_prefix = &line[..close_prefix_len];
1313 if let Some((leading, close_part)) = try_split_close_line(close_stripped, tag_name)
1314 {
1315 let policy = if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1316 if leading.is_empty() {
1317 LastParaDemote::Never
1318 } else {
1319 LastParaDemote::SkipTrailingBlanks
1320 }
1321 } else {
1322 LastParaDemote::OnlyIfLast
1323 };
1324 emit_html_block_body_lifted_bq_messy(
1325 builder,
1326 &pre_content,
1327 &content_lines,
1328 leading,
1329 close_prefix,
1330 bq_depth,
1331 policy,
1332 config,
1333 );
1334 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1335 // When `leading` is empty, no recursive-parse output carries
1336 // the close line's bq prefix, so emit it here before the
1337 // close tag. When `leading` is non-empty,
1338 // `emit_html_block_body_lifted_bq_messy` already injected
1339 // the prefix at the start of the leading bytes (via the
1340 // BqPrefixState entry); emitting again would double the
1341 // prefix bytes and break losslessness.
1342 if leading.is_empty() {
1343 emit_bq_prefix_tokens(builder, close_prefix);
1344 }
1345 emit_html_block_line(builder, close_part, 0);
1346 builder.finish_node();
1347 current_pos += 1;
1348 break;
1349 }
1350 }
1351
1352 // Under lift mode, try to split the close line into a
1353 // leading "body content" prefix and a clean `</tag>...`
1354 // remainder. Lift only when the close line has exactly one
1355 // `</tag>` and no nested `<tag>` opens — depth-aware corner
1356 // cases (e.g. `<inner></inner></tag>` on the close line)
1357 // fall back to the non-lift path. For `<div>`, non-empty
1358 // `leading` propagates pandoc's `markdown_in_html_blocks`
1359 // Plain demotion rule. For non-div strict-block tags,
1360 // demotion follows pandoc's `OnlyIfLast` rule (demote the
1361 // trailing Para only when no blank line precedes the close).
1362 let close_split_tag = if lift_mode {
1363 if strict_block_lift {
1364 strict_block_tag_name
1365 } else if wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1366 Some("div")
1367 } else {
1368 None
1369 }
1370 } else {
1371 None
1372 };
1373 let close_split = close_split_tag.and_then(|name| try_split_close_line(line, name));
1374
1375 if let Some((leading, close_part)) = close_split {
1376 let policy = if strict_block_lift {
1377 LastParaDemote::OnlyIfLast
1378 } else if !leading.is_empty() {
1379 LastParaDemote::SkipTrailingBlanks
1380 } else {
1381 LastParaDemote::Never
1382 };
1383 emit_html_block_body_lifted(
1384 builder,
1385 &pre_content,
1386 &content_lines,
1387 leading,
1388 policy,
1389 config,
1390 );
1391 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1392 emit_html_block_line(builder, close_part, 0);
1393 builder.finish_node();
1394 } else {
1395 emit_html_block_body(
1396 builder,
1397 &pre_content,
1398 &content_lines,
1399 bq_depth,
1400 wrapper_kind,
1401 lift_mode,
1402 config,
1403 );
1404 builder.start_node(SyntaxKind::HTML_BLOCK_TAG.into());
1405 emit_html_block_line(builder, line, bq_depth);
1406 builder.finish_node();
1407 }
1408
1409 current_pos += 1;
1410 break;
1411 }
1412
1413 // Regular content line
1414 content_lines.push(line);
1415 current_pos += 1;
1416 }
1417
1418 // If we didn't find a closing marker, emit what we collected
1419 if !found_closing {
1420 log::trace!("HTML block at line {} has no closing marker", start_pos + 1);
1421 emit_html_block_body(
1422 builder,
1423 &pre_content,
1424 &content_lines,
1425 bq_depth,
1426 wrapper_kind,
1427 lift_mode,
1428 config,
1429 );
1430 }
1431
1432 builder.finish_node(); // HtmlBlock
1433 current_pos
1434}
1435
1436/// Emit the collected inner content lines for an HTML block.
1437///
1438/// For `HTML_BLOCK_DIV` under Pandoc with `lift_mode == true` (single-
1439/// line `<div>` open outside blockquote), recursively parse the inner
1440/// content (including any open-tag trailing) as Pandoc-flavored
1441/// markdown and graft the resulting top-level blocks as direct children
1442/// of the wrapper. This is the Phase 6 structural lift — the projector
1443/// and downstream consumers (linter, salsa, LSP) can walk the
1444/// structural children instead of re-tokenizing the body bytes.
1445///
1446/// All other shapes — opaque `HTML_BLOCK`, `HTML_BLOCK_DIV` inside a
1447/// blockquote, multi-line open, or no content at all — fall through to
1448/// the legacy `HTML_BLOCK_CONTENT`-with-TEXT capture.
1449///
1450/// CST bytes remain byte-identical to source: the recursive parser is
1451/// lossless on the same byte slice the legacy path would have captured
1452/// as TEXT.
1453fn emit_html_block_body(
1454 builder: &mut GreenNodeBuilder<'static>,
1455 pre_content: &str,
1456 content_lines: &[&str],
1457 bq_depth: usize,
1458 wrapper_kind: SyntaxKind,
1459 lift_mode: bool,
1460 config: &ParserOptions,
1461) {
1462 if pre_content.is_empty() && content_lines.is_empty() {
1463 return;
1464 }
1465 if lift_mode && wrapper_kind == SyntaxKind::HTML_BLOCK_DIV {
1466 // Reached when the parser walked to end-of-input without finding
1467 // `</div>` (unbalanced div) — no close tag, no Plain demotion.
1468 emit_html_block_body_lifted(
1469 builder,
1470 pre_content,
1471 content_lines,
1472 "",
1473 LastParaDemote::Never,
1474 config,
1475 );
1476 return;
1477 }
1478 // Legacy path: opaque TEXT capture. `pre_content` is always empty
1479 // here (lift_mode is the only path that populates it), but be
1480 // defensive — if a trailing prefix snuck in, emit it as TEXT so
1481 // bytes are preserved.
1482 builder.start_node(SyntaxKind::HTML_BLOCK_CONTENT.into());
1483 if !pre_content.is_empty() {
1484 builder.token(SyntaxKind::TEXT.into(), pre_content);
1485 }
1486 for content_line in content_lines {
1487 emit_html_block_line(builder, content_line, bq_depth);
1488 }
1489 builder.finish_node();
1490}
1491
/// Policy deciding whether the final `PARAGRAPH` of a lifted HTML-block
/// body is retagged as `PLAIN` while its children are grafted into the
/// structural CST.
#[derive(Clone, Copy, Debug)]
enum LastParaDemote {
    /// Leave every paragraph untouched — pandoc keeps the trailing
    /// `Para`.
    Never,
    /// Retag the last `PARAGRAPH` child, looking past any trailing
    /// `BLANK_LINE` children to find it. Applies to `<div>` shapes whose
    /// close tag is butted against the paragraph text on the same source
    /// line (pandoc's `markdown_in_html_blocks` Plain demotion).
    SkipTrailingBlanks,
    /// Retag only when the very last top-level child is itself a
    /// `PARAGRAPH`, i.e. no `BLANK_LINE` sits between it and the close
    /// tag. Applies to non-div strict-block tags, whose body is emitted
    /// at top level next to the close-tag `RawBlock`: pandoc turns the
    /// trailing `Para` into `Plain` unless a blank line separates them.
    OnlyIfLast,
}
1511
/// Structural body lift for an HTML block outside any blockquote-prefix
/// handling.
///
/// The body text is `pre_content`, then every entry of `content_lines`,
/// then `post_content`, concatenated in that order. It is recursively
/// parsed as Pandoc-flavored markdown and the resulting top-level blocks
/// are grafted into `builder`. `demote_policy` selects whether the
/// trailing paragraph is retagged as `PLAIN` (see [`LastParaDemote`]).
///
/// This is the prefix-free entry point: it forwards to the shared
/// implementation with no blockquote prefix state.
fn emit_html_block_body_lifted(
    builder: &mut GreenNodeBuilder<'static>,
    pre_content: &str,
    content_lines: &[&str],
    post_content: &str,
    demote_policy: LastParaDemote,
    config: &ParserOptions,
) {
    // No per-line prefix to re-inject for this variant.
    let mut no_prefix_state: Option<BqPrefixState> = None;
    emit_html_block_body_lifted_inner(
        builder,
        pre_content,
        content_lines,
        post_content,
        demote_policy,
        config,
        &mut no_prefix_state,
    );
}
1536
/// Structural body lift for the clean blockquote-wrapped shape, where
/// the open and close tags each stand alone on their source line (so
/// there is no open-trailing or close-leading content to account for).
///
/// Each entry of `content_lines` has `bq_depth` levels of blockquote
/// markers removed; the removed bytes are remembered per line and
/// re-emitted as prefix tokens while grafting the recursive parse, so
/// the resulting CST stays byte-equal to the source.
fn emit_html_block_body_lifted_bq(
    builder: &mut GreenNodeBuilder<'static>,
    content_lines: &[&str],
    bq_depth: usize,
    demote_policy: LastParaDemote,
    config: &ParserOptions,
) {
    // Split every source line into (blockquote prefix, remaining body).
    let (prefixes, stripped_lines): (Vec<String>, Vec<&str>) = content_lines
        .iter()
        .map(|&raw| {
            let body = strip_n_blockquote_markers(raw, bq_depth);
            let prefix = &raw[..raw.len() - body.len()];
            (prefix.to_string(), body)
        })
        .unzip();

    let mut prefix_state = Some(BqPrefixState {
        prefixes,
        line_idx: 0,
        at_line_start: true,
    });
    emit_html_block_body_lifted_inner(
        builder,
        "",
        &stripped_lines,
        "",
        demote_policy,
        config,
        &mut prefix_state,
    );
}
1574
/// Structural body lift for the blockquote "messy" shapes: open tag
/// with trailing content, close tag butted against body text, or both.
///
/// The body is assembled from three sources, each mapped to the prefix
/// that must be re-injected in front of it when grafting:
///
/// * `pre_content` (bytes after the open tag's `>`) — an empty prefix,
///   because the open line's `> ` belongs to the outer `BLOCK_QUOTE`;
/// * each entry of `content_lines` — whatever `bq_depth` levels of
///   marker stripping removed from that line;
/// * `leading` (body bytes before `</tag>` on the close line) —
///   `close_line_prefix`.
///
/// A prefix entry is only recorded for `pre_content` / `leading` when
/// that piece is non-empty, so the prefix list lines up one-to-one with
/// the lines of the text handed to the recursive parse.
#[allow(clippy::too_many_arguments)]
fn emit_html_block_body_lifted_bq_messy(
    builder: &mut GreenNodeBuilder<'static>,
    pre_content: &str,
    content_lines: &[&str],
    leading: &str,
    close_line_prefix: &str,
    bq_depth: usize,
    demote_policy: LastParaDemote,
    config: &ParserOptions,
) {
    let stripped_lines: Vec<&str> = content_lines
        .iter()
        .map(|&raw| strip_n_blockquote_markers(raw, bq_depth))
        .collect();

    // Optional first / last entries, present only when the matching
    // piece of body text exists.
    let open_line_prefix = (!pre_content.is_empty()).then(String::new);
    let close_entry = (!leading.is_empty()).then(|| close_line_prefix.to_string());
    let content_prefixes = content_lines
        .iter()
        .zip(&stripped_lines)
        .map(|(raw, body)| raw[..raw.len() - body.len()].to_string());

    let prefixes: Vec<String> = open_line_prefix
        .into_iter()
        .chain(content_prefixes)
        .chain(close_entry)
        .collect();

    let mut prefix_state = Some(BqPrefixState {
        prefixes,
        line_idx: 0,
        at_line_start: true,
    });
    emit_html_block_body_lifted_inner(
        builder,
        pre_content,
        &stripped_lines,
        leading,
        demote_policy,
        config,
        &mut prefix_state,
    );
}
1628
/// Shared implementation of the structural body lift.
///
/// Concatenates `pre_content`, every entry of `content_lines` and
/// `post_content` (in that order) into one text, parses it with a clone
/// of `config`, and grafts the parsed document's children into
/// `builder` via `graft_document_children`.
///
/// * `demote_policy` — forwarded to the graft step; see
///   [`LastParaDemote`].
/// * `bq` — optional per-line blockquote prefix state, forwarded to the
///   graft step so prefixes can be re-injected at line starts.
///
/// Emits nothing when all three text pieces are empty.
fn emit_html_block_body_lifted_inner(
    builder: &mut GreenNodeBuilder<'static>,
    pre_content: &str,
    content_lines: &[&str],
    post_content: &str,
    demote_policy: LastParaDemote,
    config: &ParserOptions,
    bq: &mut Option<BqPrefixState>,
) {
    if pre_content.is_empty() && content_lines.is_empty() && post_content.is_empty() {
        return;
    }
    let body_len = pre_content.len()
        + content_lines.iter().map(|line| line.len()).sum::<usize>()
        + post_content.len();
    let mut inner_text = String::with_capacity(body_len);
    inner_text.push_str(pre_content);
    inner_text.extend(content_lines.iter().copied());
    inner_text.push_str(post_content);

    // `config.clone()` has already cloned the refdef labels; move that
    // copy out instead of cloning them from `config` a second time. The
    // inner options always end up with `Some(labels)`, defaulting to an
    // empty set when the outer config carried none.
    let mut inner_options = config.clone();
    let refdefs = inner_options.refdef_labels.take().unwrap_or_default();
    inner_options.refdef_labels = Some(refdefs.clone());
    let inner_root = crate::parser::parse_with_refdefs(&inner_text, Some(inner_options), refdefs);
    graft_document_children(builder, &inner_root, demote_policy, bq);
}
1658
/// Bookkeeping for re-injecting blockquote prefixes while grafting a
/// lifted body that originally sat inside a `> …` blockquote.
///
/// The recursive parse only ever sees the marker-stripped text, so the
/// stripped bytes (emitted as `BLOCK_QUOTE_MARKER` / `WHITESPACE`
/// tokens) have to be put back at the start of every source line to
/// keep the CST byte-equal to the source.
struct BqPrefixState {
    /// Literal prefix bytes per body source line, in line order (for
    /// example `"> "` or `">"`; an empty string means no prefix).
    prefixes: Vec<String>,
    /// Index into `prefixes` of the line whose prefix is emitted next.
    line_idx: usize,
    /// `true` when the next emitted token starts a new source line and
    /// must therefore be preceded by that line's prefix.
    at_line_start: bool,
}
1674
/// Re-emit the top-level children of a parsed inner document into
/// `builder`, dropping the document wrapper node itself.
///
/// `demote_policy` picks at most one top-level `PARAGRAPH` to be
/// re-emitted as `PLAIN` — see [`LastParaDemote`]. Only node children
/// take part in that decision; top-level tokens are ignored when
/// searching for the paragraph.
///
/// `bq` carries the blockquote prefix state when the body came from
/// inside a blockquote, in which case prefix tokens are injected at line
/// starts during emission — see [`BqPrefixState`].
fn graft_document_children(
    builder: &mut GreenNodeBuilder<'static>,
    doc: &SyntaxNode,
    demote_policy: LastParaDemote,
    bq: &mut Option<BqPrefixState>,
) {
    let children: Vec<rowan::NodeOrToken<SyntaxNode, _>> = doc.children_with_tokens().collect();

    // `None` = never demote; `Some(skip)` = inspect the last node child,
    // optionally looking past trailing BLANK_LINE nodes first.
    let skip_trailing_blanks = match demote_policy {
        LastParaDemote::Never => None,
        LastParaDemote::SkipTrailingBlanks => Some(true),
        LastParaDemote::OnlyIfLast => Some(false),
    };
    let demote_idx: Option<usize> = skip_trailing_blanks.and_then(|skip_blanks| {
        children
            .iter()
            .enumerate()
            .rev()
            .filter_map(|(idx, element)| match element {
                rowan::NodeOrToken::Node(node) => Some((idx, node.kind())),
                rowan::NodeOrToken::Token(_) => None,
            })
            // First node from the end that is not a skippable blank line.
            .find(|&(_, kind)| !(skip_blanks && kind == SyntaxKind::BLANK_LINE))
            // It only qualifies when it is a paragraph.
            .and_then(|(idx, kind)| (kind == SyntaxKind::PARAGRAPH).then_some(idx))
    });

    for (idx, element) in children.into_iter().enumerate() {
        match element {
            rowan::NodeOrToken::Token(token) => {
                emit_grafted_token(builder, token.kind(), token.text(), bq);
            }
            rowan::NodeOrToken::Node(node) if demote_idx == Some(idx) => {
                graft_subtree_as(builder, &node, SyntaxKind::PLAIN, bq);
            }
            rowan::NodeOrToken::Node(node) => {
                graft_subtree(builder, &node, bq);
            }
        }
    }
}
1736
/// Copy `node` and everything beneath it into `builder`, keeping the
/// node's own kind.
///
/// Token text is reproduced verbatim, so the emitted span is
/// byte-identical to the input except for blockquote prefix tokens that
/// are injected at line starts when `bq` is `Some`.
fn graft_subtree(
    builder: &mut GreenNodeBuilder<'static>,
    node: &SyntaxNode,
    bq: &mut Option<BqPrefixState>,
) {
    let own_kind = node.kind();
    graft_subtree_as(builder, node, own_kind, bq);
}
1748
/// Copy `node` into `builder` with its outermost kind replaced by
/// `kind`; descendants keep their own kinds.
///
/// This is how a top-level `PARAGRAPH` gets re-emitted as `PLAIN` for
/// the close-butted demotion rule. Tokens go through
/// `emit_grafted_token`, so blockquote prefixes are injected when `bq`
/// is `Some`.
fn graft_subtree_as(
    builder: &mut GreenNodeBuilder<'static>,
    node: &SyntaxNode,
    kind: SyntaxKind,
    bq: &mut Option<BqPrefixState>,
) {
    builder.start_node(kind.into());
    for element in node.children_with_tokens() {
        let token = match element {
            rowan::NodeOrToken::Node(child) => {
                // Nested nodes are copied unchanged.
                graft_subtree(builder, &child, bq);
                continue;
            }
            rowan::NodeOrToken::Token(token) => token,
        };
        emit_grafted_token(builder, token.kind(), token.text(), bq);
    }
    builder.finish_node();
}
1769
1770/// Emit a single token while optionally injecting blockquote prefix
1771/// tokens at line starts. When `bq` is `None`, this is a plain
1772/// `builder.token()` passthrough.
1773fn emit_grafted_token(
1774 builder: &mut GreenNodeBuilder<'static>,
1775 kind: SyntaxKind,
1776 text: &str,
1777 bq: &mut Option<BqPrefixState>,
1778) {
1779 if let Some(state) = bq.as_mut() {
1780 if state.at_line_start {
1781 if let Some(prefix) = state.prefixes.get(state.line_idx) {
1782 emit_bq_prefix_tokens(builder, prefix);
1783 }
1784 state.at_line_start = false;
1785 }
1786 builder.token(kind.into(), text);
1787 // `BLANK_LINE` token represents an entirely blank source line —
1788 // its text is `\n`. Treat both `NEWLINE` and the `BLANK_LINE`
1789 // token as line-ending so the per-line prefix index advances
1790 // correctly.
1791 if kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE {
1792 state.line_idx += 1;
1793 state.at_line_start = true;
1794 }
1795 } else {
1796 builder.token(kind.into(), text);
1797 }
1798}
1799
1800/// Emit a captured per-line bq prefix as a stream of `BLOCK_QUOTE_MARKER`
1801/// (`>`) and `WHITESPACE` (everything else, byte-by-byte) tokens.
1802fn emit_bq_prefix_tokens(builder: &mut GreenNodeBuilder<'static>, prefix: &str) {
1803 for ch in prefix.chars() {
1804 if ch == '>' {
1805 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
1806 } else {
1807 let mut buf = [0u8; 4];
1808 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
1809 }
1810 }
1811}
1812
/// Find the byte index (relative to the start of `line`) of the `>`
/// that closes the open tag `<tag_name ATTRS>`.
///
/// The line may be indented with spaces/tabs. The tag name is matched
/// ASCII case-insensitively and at least one byte must follow it. The
/// scan is quote-aware: a `>` inside a `"..."` or `'...'` attribute
/// value is skipped. Returns `None` when the line does not start with
/// `<tag_name` or no unquoted `>` follows on this line.
///
/// Same scan as `probe_open_tag_line_has_close_gt`, but the position
/// is returned so the caller can slice off the bytes after the tag.
fn locate_open_tag_close_gt(line: &str, tag_name: &str) -> Option<usize> {
    let raw = line.as_bytes();
    let indent_len = raw
        .iter()
        .take_while(|&&b| b == b' ' || b == b'\t')
        .count();
    let body = &raw[indent_len..];
    let name_end = tag_name.len() + 1;

    // Need `<`, the full tag name, and at least one more byte.
    if body.len() <= name_end {
        return None;
    }
    if body[0] != b'<' || !body[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes()) {
        return None;
    }

    let mut open_quote: Option<u8> = None;
    for (offset, &byte) in body[name_end..].iter().enumerate() {
        match open_quote {
            // Inside a quoted value: only the matching quote matters.
            Some(q) => {
                if byte == q {
                    open_quote = None;
                }
            }
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                b'>' => return Some(indent_len + name_end + offset),
                _ => {}
            },
        }
    }
    None
}
1848
1849/// Whether `slice` begins (after leading ASCII whitespace) with an
1850/// open tag whose name is a Pandoc void block tag (`<source>`,
1851/// `<embed>`, `<area>`, `<track>`). Close tags (`</...>`) and non-void
1852/// open tags return false.
1853///
1854/// Used by the inline-block matched-pair lift gate: pandoc-native
1855/// abandons the lift when the body's first non-blank content is a
1856/// fresh-block void tag (e.g. `<video>\n<source ...>\n</video>`
1857/// projects as RawBlock+RawBlock+Plain[..,RawInline</video>], not a
1858/// matched-pair lift).
1859fn slice_starts_with_void_block_tag(slice: &str) -> bool {
1860 let trimmed = slice.trim_start_matches([' ', '\t', '\n', '\r']);
1861 if !trimmed.starts_with('<') || trimmed.starts_with("</") {
1862 return false;
1863 }
1864 let Some(tag_end) = parse_open_tag(trimmed) else {
1865 return false;
1866 };
1867 let bytes = trimmed.as_bytes();
1868 let mut name_end = 1usize;
1869 while name_end < tag_end && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-')
1870 {
1871 name_end += 1;
1872 }
1873 if name_end == 1 {
1874 return false;
1875 }
1876 is_pandoc_void_block_tag_name(&trimmed[1..name_end])
1877}
1878
1879/// Whether the body of an inline-block matched-pair (`<video>...`,
1880/// `<iframe>...`, `<button>...`) begins at a fresh-block position with
1881/// a void block tag — the condition under which pandoc-native abandons
1882/// the matched-pair lift. Probes three shapes:
1883///
1884/// - **Same-line** (`<video><source ...></video>`): trailing bytes
1885/// after the open `>` on `first_inner` start with `<source`.
1886/// - **Single-line open + multi-line body**: open-trailing on the open
1887/// line is empty/whitespace AND the first non-blank body line
1888/// (`lines[start_pos+1..]`) starts with a void tag.
1889/// - **Multi-line open**: same body-line scan starting at
1890/// `lines[multiline_open_end+1..]`.
1891///
1892/// Returns `false` when the body begins with text, with a close tag,
1893/// or with a non-void block tag — those cases all proceed with the
1894/// matched-pair lift.
1895fn inline_block_void_interior_abandons(
1896 first_inner: &str,
1897 lines: &[&str],
1898 start_pos: usize,
1899 multiline_open_end: Option<usize>,
1900 bq_depth: usize,
1901 tag_name: &str,
1902) -> bool {
1903 let (line_no_nl, _) = strip_newline(first_inner);
1904 let (body_start_line_idx, open_trailing) = match multiline_open_end {
1905 Some(end) => (end + 1, ""),
1906 None => {
1907 let gt = locate_open_tag_close_gt(line_no_nl, tag_name);
1908 let trailing = gt.map(|i| &line_no_nl[i + 1..]).unwrap_or("");
1909 (start_pos + 1, trailing)
1910 }
1911 };
1912 let trimmed = open_trailing.trim_start_matches([' ', '\t']);
1913 if !trimmed.is_empty() {
1914 return slice_starts_with_void_block_tag(trimmed);
1915 }
1916 for line in &lines[body_start_line_idx..] {
1917 let inner = if bq_depth > 0 {
1918 strip_n_blockquote_markers(line, bq_depth)
1919 } else {
1920 line
1921 };
1922 let trimmed = inner.trim_start_matches([' ', '\t', '\n', '\r']);
1923 if trimmed.is_empty() {
1924 continue;
1925 }
1926 return slice_starts_with_void_block_tag(trimmed);
1927 }
1928 false
1929}
1930
/// Check that `line` holds an open tag `<tag_name ...>` whose closing
/// `>` is present on this same line.
///
/// Leading spaces/tabs are ignored, the tag name is compared ASCII
/// case-insensitively, and a `>` inside a `"..."` / `'...'` attribute
/// value does not count. Content after the `>` is permitted (the
/// open-trailing shape `<form>foo`) — the caller is expected to
/// capture that trailing into the structural lift's `pre_content`.
fn probe_open_tag_line_has_close_gt(line: &str, tag_name: &str) -> bool {
    let candidate = line.trim_start_matches([' ', '\t']).as_bytes();
    let name_end = tag_name.len() + 1;

    // `<` + tag name + at least one further byte.
    let shape_ok = candidate.len() > name_end
        && candidate[0] == b'<'
        && candidate[1..name_end].eq_ignore_ascii_case(tag_name.as_bytes());
    if !shape_ok {
        return false;
    }

    let mut in_quote: Option<u8> = None;
    candidate[name_end..].iter().any(|&byte| match in_quote {
        Some(q) => {
            if byte == q {
                in_quote = None;
            }
            false
        }
        None if byte == b'"' || byte == b'\'' => {
            in_quote = Some(byte);
            false
        }
        None => byte == b'>',
    })
}
1965
1966/// Probe whether the same-line `<tag>BODY</tag>` shape on `line` can
1967/// be lifted structurally. Returns `true` only when:
1968/// - The line starts with `<tag_name` (modulo leading whitespace).
1969/// - The open tag's `>` exists with proper quote handling.
1970/// - The bytes after the open `>` end with `</tag_name>` (case-
1971/// insensitive, allowing trailing whitespace).
1972/// - The trailing has exactly one `</tag_name>` close and zero
1973/// `<tag_name>` opens (rejects nested same-line shapes).
1974///
1975/// Trailing non-whitespace content after `</tag_name>` (e.g.
1976/// `<form>foo</form>extra`) rejects the lift — pandoc projects that
1977/// shape as RawBlock + content + RawBlock + trailing-Para, which the
1978/// byte walker handles via `split_html_block_by_tags`.
1979fn probe_same_line_lift(line: &str, tag_name: &str) -> bool {
1980 let bytes = line.as_bytes();
1981 let indent_end = bytes
1982 .iter()
1983 .position(|&b| b != b' ' && b != b'\t')
1984 .unwrap_or(bytes.len());
1985 let rest = &line[indent_end..];
1986 let rest_bytes = rest.as_bytes();
1987 let prefix_len = 1 + tag_name.len();
1988 if rest_bytes.len() < prefix_len
1989 || rest_bytes[0] != b'<'
1990 || !rest_bytes[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
1991 {
1992 return false;
1993 }
1994 let after_name = &rest[prefix_len..];
1995 let after_name_bytes = after_name.as_bytes();
1996 let mut i = 0usize;
1997 let mut quote: Option<u8> = None;
1998 let mut gt_idx: Option<usize> = None;
1999 while i < after_name_bytes.len() {
2000 match (quote, after_name_bytes[i]) {
2001 (None, b'"') | (None, b'\'') => quote = Some(after_name_bytes[i]),
2002 (Some(q), b2) if b2 == q => quote = None,
2003 (None, b'>') => {
2004 gt_idx = Some(i);
2005 break;
2006 }
2007 _ => {}
2008 }
2009 i += 1;
2010 }
2011 let Some(gt_idx) = gt_idx else {
2012 return false;
2013 };
2014 let trailing = &after_name[gt_idx + 1..];
2015 let trimmed = trailing.trim_end_matches([' ', '\t']);
2016 let close_marker = format!("</{}>", tag_name);
2017 if !trimmed
2018 .to_ascii_lowercase()
2019 .ends_with(&close_marker.to_ascii_lowercase())
2020 {
2021 return false;
2022 }
2023 let (opens, closes) = count_tag_balance(trailing, tag_name);
2024 opens == 0 && closes == 1
2025}
2026
2027/// Try to split the close line of an HTML_BLOCK_DIV body into a
2028/// leading content prefix and a clean `</tag>...` remainder. Returns
2029/// `Some((leading, close_part))` only when the line contains exactly
2030/// one `</tag>` and no `<tag>` opens — the safe shape for the lift.
2031/// Returns `None` for nested closes (e.g. `<inner></inner></div>`),
2032/// for missing close tags, or for compound shapes the parser
2033/// shouldn't attempt to lift in this pass.
2034///
2035/// `leading` may be empty (close starts at column 0) or pure
2036/// whitespace (close on an indented line). Both count as "butted" per
2037/// pandoc's `markdown_in_html_blocks` rule — if leading is non-empty
2038/// the trailing paragraph inside the div demotes Para→Plain.
2039fn try_split_close_line<'a>(line: &'a str, tag_name: &str) -> Option<(&'a str, &'a str)> {
2040 let (opens, closes) = count_tag_balance(line, tag_name);
2041 if opens != 0 || closes != 1 {
2042 return None;
2043 }
2044 // Locate the close tag's opening `<` by lowercased substring search.
2045 // Safe because we've already established (above) that the line has
2046 // exactly one `</tag>` and no `<tag>` opens, so the first match is
2047 // THE close.
2048 let needle = format!("</{}", tag_name);
2049 let lower = line.to_ascii_lowercase();
2050 let close_lt = lower.find(&needle)?;
2051 Some((&line[..close_lt], &line[close_lt..]))
2052}
2053
/// Emit the open-tag line of a lift-eligible HTML block (div or non-div
/// strict-block tag), splitting the bytes `[ws]<tag[ ws ATTRS]>[trailing]`
/// into `WHITESPACE? + TEXT("<tag") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)})?
/// + TEXT(">") + TEXT(trailing)?`.
///
/// Bytes are byte-identical to the source — this only tokenizes at finer
/// granularity so `AttributeNode::cast(HTML_ATTRS)` can read the attribute
/// region structurally. Falls back to a single TEXT token if the line
/// doesn't fit the expected `<tag ...>` shape (defensive — the parser
/// only retags as the lift kind when this shape was matched).
///
/// A self-closing `/` directly before `>` (and the spaces/tabs in front
/// of it) is kept out of `HTML_ATTRS`: the whitespace is emitted as
/// WHITESPACE and the `/` as TEXT.
///
/// `line` is expected without its line terminator — NOTE(review): no
/// NEWLINE token is emitted here, so confirm callers strip it first.
///
/// `lift_trailing`: when true, bytes after `>` are NOT emitted as TEXT —
/// returned as `&str` instead so the caller can splice them into the
/// recursive-parse input for the structural body lift. When false
/// (legacy / non-lift path), trailing bytes are emitted as TEXT and an
/// empty slice is returned.
///
/// On either fallback path (prefix mismatch, or no unquoted `>` on the
/// line) the remainder of the line is emitted as one TEXT token and
/// `""` is returned regardless of `lift_trailing`.
fn emit_open_tag_tokens<'a>(
    builder: &mut GreenNodeBuilder<'static>,
    line: &'a str,
    tag_name: &str,
    lift_trailing: bool,
) -> &'a str {
    let bytes = line.as_bytes();
    // Leading indent (CommonMark allows up to 3 spaces). Only spaces are
    // counted here; a leading tab is left in `rest` and therefore takes
    // the TEXT fallback below.
    let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
    if indent_end > 0 {
        builder.token(SyntaxKind::WHITESPACE.into(), &line[..indent_end]);
    }
    let rest = &line[indent_end..];
    // Match the literal `<tag_name` prefix (ASCII case-insensitive on the tag name).
    let prefix_len = 1 + tag_name.len();
    if !rest.starts_with('<')
        || rest.len() < prefix_len
        || !rest.as_bytes()[1..prefix_len].eq_ignore_ascii_case(tag_name.as_bytes())
    {
        // Not the expected shape: keep the bytes, skip the structure.
        builder.token(SyntaxKind::TEXT.into(), rest);
        return "";
    }
    let after_name = &rest[prefix_len..];
    let after_name_bytes = after_name.as_bytes();
    // Find the closing `>` of the open tag, respecting quoted attribute values.
    // `tag_close` is an index into `after_name`.
    let mut i = 0usize;
    let mut quote: Option<u8> = None;
    let mut tag_close: Option<usize> = None;
    while i < after_name_bytes.len() {
        let b = after_name_bytes[i];
        match (quote, b) {
            (None, b'"') | (None, b'\'') => quote = Some(b),
            (Some(q), b2) if b2 == q => quote = None,
            (None, b'>') => {
                tag_close = Some(i);
                break;
            }
            _ => {}
        }
        i += 1;
    }
    let Some(tag_close) = tag_close else {
        // Open tag has no closing `>` on this line — defensive fallback.
        builder.token(SyntaxKind::TEXT.into(), rest);
        return "";
    };
    // Whitespace between the tag name and the attribute region.
    let attrs_inner = &after_name[..tag_close];
    let ws_end = attrs_inner
        .as_bytes()
        .iter()
        .position(|&b| !matches!(b, b' ' | b'\t'))
        .unwrap_or(attrs_inner.len());
    let leading_ws = &attrs_inner[..ws_end];
    // Strip a trailing self-closing slash and the whitespace before it
    // from the attribute region; emit them as TEXT outside the
    // HTML_ATTRS node so the structural region only holds attribute
    // bytes (not formatting punctuation).
    //
    // Without a trailing `/`, `attr_end` and `self_close_start` both
    // stay at the end of the region, so any whitespace before `>`
    // remains part of `attrs_text`.
    let attrs_after_ws = &attrs_inner[ws_end..];
    let mut attr_end = attrs_after_ws.len();
    let attr_bytes = attrs_after_ws.as_bytes();
    let mut self_close_start = attr_end;
    if attr_end > 0 && attr_bytes[attr_end - 1] == b'/' {
        self_close_start = attr_end - 1;
        attr_end = self_close_start;
        while attr_end > 0 && matches!(attr_bytes[attr_end - 1], b' ' | b'\t') {
            attr_end -= 1;
        }
    }
    // Three adjacent slices covering `attrs_after_ws` exactly:
    // attributes, whitespace before the slash, and the slash itself.
    let attrs_text = &attrs_after_ws[..attr_end];
    let trailing_text = &attrs_after_ws[attr_end..self_close_start.max(attr_end)];
    let after_self_close = &attrs_after_ws[self_close_start..];

    // Use the original source bytes for the `<tag` prefix (preserves
    // source casing — losslessness).
    builder.token(SyntaxKind::TEXT.into(), &rest[..prefix_len]);
    if !leading_ws.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
    }
    if !attrs_text.is_empty() {
        builder.start_node(SyntaxKind::HTML_ATTRS.into());
        builder.token(SyntaxKind::TEXT.into(), attrs_text);
        builder.finish_node();
    }
    if !trailing_text.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), trailing_text);
    }
    if !after_self_close.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), after_self_close);
    }
    builder.token(SyntaxKind::TEXT.into(), ">");
    let after_gt = &after_name[tag_close + 1..];
    if lift_trailing {
        // Return trailing bytes to the caller (will be spliced into the
        // recursive-parse input for the body lift).
        return after_gt;
    }
    if !after_gt.is_empty() {
        builder.token(SyntaxKind::TEXT.into(), after_gt);
    }
    ""
}
2172
2173/// Detect a multi-line HTML open tag for `tag_name`. Returns
2174/// `Some(end_line_idx)` when the open tag's closing `>` is on a line *after*
2175/// `start_pos` and within `lines`; `None` for single-line opens (handled by
2176/// the existing path) or when the `>` is missing entirely.
2177///
2178/// Quoted attribute values (`"..."`, `'...'`) are honored so a `>` inside an
2179/// attribute value doesn't terminate the open tag. Quote state carries
2180/// across line boundaries.
2181fn find_multiline_open_end(
2182 lines: &[&str],
2183 start_pos: usize,
2184 first_inner: &str,
2185 tag_name: &str,
2186) -> Option<usize> {
2187 // Locate the `<tag_name` literal in `first_inner` to start scanning past
2188 // it. Match is ASCII case-insensitive; the parser preserves source casing.
2189 let trimmed = strip_leading_spaces(first_inner);
2190 let prefix_len = 1 + tag_name.len();
2191 if !trimmed.starts_with('<')
2192 || trimmed.len() < prefix_len
2193 || !trimmed[1..prefix_len].eq_ignore_ascii_case(tag_name)
2194 {
2195 return None;
2196 }
2197 let leading_indent = first_inner.len() - trimmed.len();
2198 let mut i = leading_indent + prefix_len; // past `<tag_name`
2199 let mut quote: Option<u8> = None;
2200
2201 // Scan first line for an unquoted `>`.
2202 let line0_bytes = first_inner.as_bytes();
2203 while i < line0_bytes.len() {
2204 match (quote, line0_bytes[i]) {
2205 (None, b'"') | (None, b'\'') => quote = Some(line0_bytes[i]),
2206 (Some(q), x) if x == q => quote = None,
2207 (None, b'>') => return None, // single-line case
2208 _ => {}
2209 }
2210 i += 1;
2211 }
2212
2213 // No `>` on first line. Scan subsequent lines.
2214 let mut line_idx = start_pos + 1;
2215 while line_idx < lines.len() {
2216 let bytes = lines[line_idx].as_bytes();
2217 for &b in bytes {
2218 match (quote, b) {
2219 (None, b'"') | (None, b'\'') => quote = Some(b),
2220 (Some(q), x) if x == q => quote = None,
2221 (None, b'>') => return Some(line_idx),
2222 _ => {}
2223 }
2224 }
2225 line_idx += 1;
2226 }
2227
2228 None
2229}
2230
2231/// Pandoc-only: validate that the HTML open tag starting at `lines[start_pos]`
2232/// is syntactically complete — i.e. an unquoted `>` exists somewhere from the
2233/// `<` onward, possibly spanning subsequent lines. Pandoc treats an unclosed
2234/// open tag (no `>` in the remaining input) as paragraph text rather than
2235/// starting a `RawBlock`; recognizing it as an HTML block makes the projector
2236/// reparse the same content recursively, causing a stack overflow.
2237///
2238/// Quote state (`"..."` / `'...'`) is threaded across line boundaries so a
2239/// `>` inside an attribute value doesn't count. Blank lines do not stop the
2240/// scan — pandoc's `htmlTag` reads across them, just emitting a warning when
2241/// the tag eventually closes far away.
2242pub(crate) fn pandoc_html_open_tag_closes(
2243 lines: &[&str],
2244 start_pos: usize,
2245 bq_depth: usize,
2246) -> bool {
2247 if start_pos >= lines.len() {
2248 return false;
2249 }
2250 let mut quote: Option<u8> = None;
2251 for (offset, line) in lines.iter().enumerate().skip(start_pos) {
2252 let inner = if bq_depth > 0 {
2253 strip_n_blockquote_markers(line, bq_depth)
2254 } else {
2255 line
2256 };
2257 let bytes = inner.as_bytes();
2258 let mut i = 0usize;
2259 if offset == start_pos {
2260 while i < bytes.len() && bytes[i] == b' ' {
2261 i += 1;
2262 }
2263 if bytes.get(i) != Some(&b'<') {
2264 return false;
2265 }
2266 i += 1;
2267 }
2268 while i < bytes.len() {
2269 match (quote, bytes[i]) {
2270 (None, b'"') | (None, b'\'') => quote = Some(bytes[i]),
2271 (Some(q), x) if x == q => quote = None,
2272 (None, b'>') => return true,
2273 _ => {}
2274 }
2275 i += 1;
2276 }
2277 }
2278 false
2279}
2280
2281/// Emit a multi-line open tag spanning `lines[start_pos..=end_line_idx]` as
2282/// structural CST tokens, exposing the attribute region as `HTML_ATTRS` for
2283/// `AttributeNode::cast` to find. Bytes are byte-identical to the source —
2284/// only tokenization granularity changes. Used for `<div>` (Pandoc dialect)
2285/// and non-div strict-block tags (`<form>`, `<section>`, …) under the
2286/// Phase 6 structural lift.
2287///
2288/// Per-line layout (with `prefix_len = 1 + tag_name.len()`):
2289/// - Line 0: TEXT("<{tag_name}") + (optional WHITESPACE + HTML_ATTRS) + NEWLINE
2290/// - Lines 1..N-1: (optional WHITESPACE indent) + HTML_ATTRS + NEWLINE
2291/// - Line N (last): (optional WHITESPACE indent) + (HTML_ATTRS + WHITESPACE)?
2292/// + TEXT(">") + (TEXT(trailing))? + NEWLINE
2293///
2294/// Bytes inside HTML_ATTRS may include trailing whitespace before the next
2295/// newline; `parse_html_attribute_list` tolerates whitespace.
2296fn emit_multiline_open_tag_with_attrs(
2297 builder: &mut GreenNodeBuilder<'static>,
2298 lines: &[&str],
2299 start_pos: usize,
2300 end_line_idx: usize,
2301 tag_name: &str,
2302) {
2303 let prefix_len = 1 + tag_name.len();
2304 for (line_idx, line) in lines
2305 .iter()
2306 .enumerate()
2307 .take(end_line_idx + 1)
2308 .skip(start_pos)
2309 {
2310 let (line_no_nl, newline_str) = strip_newline(line);
2311
2312 if line_idx == start_pos {
2313 // Line 0: leading indent (if any) + "<{tag_name}" + (whitespace
2314 // + attrs)?. The closing `>` is on a later line, so any
2315 // remaining bytes after "<{tag_name}" on this line are the
2316 // start of the attribute region.
2317 let bytes = line_no_nl.as_bytes();
2318 let indent_end = bytes.iter().position(|&b| b != b' ').unwrap_or(bytes.len());
2319 if indent_end > 0 {
2320 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2321 }
2322 // Defensive: caller verified the line starts with `<{tag_name}`.
2323 let after_indent = &line_no_nl[indent_end..];
2324 if after_indent.len() >= prefix_len {
2325 builder.token(SyntaxKind::TEXT.into(), &after_indent[..prefix_len]);
2326 let rest = &after_indent[prefix_len..];
2327 emit_attr_region(builder, rest);
2328 } else {
2329 builder.token(SyntaxKind::TEXT.into(), after_indent);
2330 }
2331 } else if line_idx < end_line_idx {
2332 // Pure attribute line.
2333 let bytes = line_no_nl.as_bytes();
2334 let indent_end = bytes
2335 .iter()
2336 .position(|&b| !matches!(b, b' ' | b'\t'))
2337 .unwrap_or(bytes.len());
2338 if indent_end > 0 {
2339 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2340 }
2341 let attrs_text = &line_no_nl[indent_end..];
2342 if !attrs_text.is_empty() {
2343 builder.start_node(SyntaxKind::HTML_ATTRS.into());
2344 builder.token(SyntaxKind::TEXT.into(), attrs_text);
2345 builder.finish_node();
2346 }
2347 } else {
2348 // Last line: indent + attrs + ">" + trailing.
2349 let bytes = line_no_nl.as_bytes();
2350 let indent_end = bytes
2351 .iter()
2352 .position(|&b| !matches!(b, b' ' | b'\t'))
2353 .unwrap_or(bytes.len());
2354 if indent_end > 0 {
2355 builder.token(SyntaxKind::WHITESPACE.into(), &line_no_nl[..indent_end]);
2356 }
2357 // Find the unquoted `>` byte position in this line.
2358 let mut quote: Option<u8> = None;
2359 let mut gt_pos: Option<usize> = None;
2360 for (j, &b) in line_no_nl.as_bytes()[indent_end..].iter().enumerate() {
2361 let actual_j = indent_end + j;
2362 match (quote, b) {
2363 (None, b'"') | (None, b'\'') => quote = Some(b),
2364 (Some(q), x) if x == q => quote = None,
2365 (None, b'>') => {
2366 gt_pos = Some(actual_j);
2367 break;
2368 }
2369 _ => {}
2370 }
2371 }
2372 let Some(gt) = gt_pos else {
2373 // Defensive — caller said `>` is on this line.
2374 builder.token(SyntaxKind::TEXT.into(), &line_no_nl[indent_end..]);
2375 if !newline_str.is_empty() {
2376 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2377 }
2378 continue;
2379 };
2380 // Attribute region: between indent_end and gt, with possibly
2381 // trailing whitespace before `>`.
2382 let attrs_region = &line_no_nl[indent_end..gt];
2383 let region_bytes = attrs_region.as_bytes();
2384 // Strip trailing whitespace from attrs region; emit as
2385 // separate WHITESPACE so HTML_ATTRS only contains attribute
2386 // bytes.
2387 let mut attr_end = region_bytes.len();
2388 while attr_end > 0 && matches!(region_bytes[attr_end - 1], b' ' | b'\t') {
2389 attr_end -= 1;
2390 }
2391 let attrs_text = &attrs_region[..attr_end];
2392 let trailing_ws = &attrs_region[attr_end..];
2393 if !attrs_text.is_empty() {
2394 builder.start_node(SyntaxKind::HTML_ATTRS.into());
2395 builder.token(SyntaxKind::TEXT.into(), attrs_text);
2396 builder.finish_node();
2397 }
2398 if !trailing_ws.is_empty() {
2399 builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
2400 }
2401 builder.token(SyntaxKind::TEXT.into(), ">");
2402 let after_gt = &line_no_nl[gt + 1..];
2403 if !after_gt.is_empty() {
2404 builder.token(SyntaxKind::TEXT.into(), after_gt);
2405 }
2406 }
2407
2408 if !newline_str.is_empty() {
2409 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2410 }
2411 }
2412}
2413
2414/// Emit a multi-line HTML open tag spanning `lines[start_pos..=end_line_idx]`
2415/// for non-`<div>` tags (void tags `<embed>`/`<area>`/`<source>`/`<track>`).
2416/// Each line is emitted as plain TEXT + NEWLINE; no `HTML_ATTRS` structural
2417/// node is added. Pandoc's projector reads attributes only for `<div>` /
2418/// `<span>` lifts, so non-div multi-line opens just need byte preservation.
2419fn emit_multiline_open_tag_simple(
2420 builder: &mut GreenNodeBuilder<'static>,
2421 lines: &[&str],
2422 start_pos: usize,
2423 end_line_idx: usize,
2424) {
2425 for line in lines.iter().take(end_line_idx + 1).skip(start_pos) {
2426 let (line_no_nl, newline_str) = strip_newline(line);
2427 if !line_no_nl.is_empty() {
2428 builder.token(SyntaxKind::TEXT.into(), line_no_nl);
2429 }
2430 if !newline_str.is_empty() {
2431 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2432 }
2433 }
2434}
2435
2436/// Emit the trailing portion of `<div`'s line 0 — i.e. anything after the
2437/// `<div` literal up to end-of-line. Called only from
2438/// `emit_multiline_open_tag_with_attrs`. The `>` is on a later line, so this is
2439/// pure attribute (and possibly inter-attribute whitespace).
2440fn emit_attr_region(builder: &mut GreenNodeBuilder<'static>, region: &str) {
2441 if region.is_empty() {
2442 return;
2443 }
2444 let bytes = region.as_bytes();
2445 // Split a leading run of whitespace into a WHITESPACE token so the
2446 // HTML_ATTRS node holds only attribute bytes.
2447 let ws_end = bytes
2448 .iter()
2449 .position(|&b| !matches!(b, b' ' | b'\t'))
2450 .unwrap_or(bytes.len());
2451 if ws_end > 0 {
2452 builder.token(SyntaxKind::WHITESPACE.into(), ®ion[..ws_end]);
2453 }
2454 let attrs_text = ®ion[ws_end..];
2455 if !attrs_text.is_empty() {
2456 builder.start_node(SyntaxKind::HTML_ATTRS.into());
2457 builder.token(SyntaxKind::TEXT.into(), attrs_text);
2458 builder.finish_node();
2459 }
2460}
2461
2462/// Emit one continuation line of an HTML block, preserving any blockquote
2463/// markers as structural tokens (so the CST stays byte-equal to the source
2464/// and downstream consumers can strip them per-context).
2465fn emit_html_block_line(builder: &mut GreenNodeBuilder<'static>, line: &str, bq_depth: usize) {
2466 let inner = if bq_depth > 0 {
2467 let stripped = strip_n_blockquote_markers(line, bq_depth);
2468 let prefix_len = line.len() - stripped.len();
2469 if prefix_len > 0 {
2470 for ch in line[..prefix_len].chars() {
2471 if ch == '>' {
2472 builder.token(SyntaxKind::BLOCK_QUOTE_MARKER.into(), ">");
2473 } else {
2474 let mut buf = [0u8; 4];
2475 builder.token(SyntaxKind::WHITESPACE.into(), ch.encode_utf8(&mut buf));
2476 }
2477 }
2478 }
2479 stripped
2480 } else {
2481 line
2482 };
2483
2484 let (line_without_newline, newline_str) = strip_newline(inner);
2485 if !line_without_newline.is_empty() {
2486 builder.token(SyntaxKind::TEXT.into(), line_without_newline);
2487 }
2488 if !newline_str.is_empty() {
2489 builder.token(SyntaxKind::NEWLINE.into(), newline_str);
2490 }
2491}
2492
2493#[cfg(test)]
2494mod tests {
2495 use super::*;
2496
2497 #[test]
2498 fn test_try_parse_html_comment() {
2499 assert_eq!(
2500 try_parse_html_block_start("<!-- comment -->", false),
2501 Some(HtmlBlockType::Comment)
2502 );
2503 assert_eq!(
2504 try_parse_html_block_start(" <!-- comment -->", false),
2505 Some(HtmlBlockType::Comment)
2506 );
2507 }
2508
2509 #[test]
2510 fn test_try_parse_div_tag() {
2511 assert_eq!(
2512 try_parse_html_block_start("<div>", false),
2513 Some(HtmlBlockType::BlockTag {
2514 tag_name: "div".to_string(),
2515 is_verbatim: false,
2516 closed_by_blank_line: false,
2517 depth_aware: true,
2518 closes_at_open_tag: false,
2519 is_closing: false,
2520 })
2521 );
2522 assert_eq!(
2523 try_parse_html_block_start("<div class=\"test\">", false),
2524 Some(HtmlBlockType::BlockTag {
2525 tag_name: "div".to_string(),
2526 is_verbatim: false,
2527 closed_by_blank_line: false,
2528 depth_aware: true,
2529 closes_at_open_tag: false,
2530 is_closing: false,
2531 })
2532 );
2533 }
2534
2535 #[test]
2536 fn test_try_parse_script_tag() {
2537 assert_eq!(
2538 try_parse_html_block_start("<script>", false),
2539 Some(HtmlBlockType::BlockTag {
2540 tag_name: "script".to_string(),
2541 is_verbatim: true,
2542 closed_by_blank_line: false,
2543 depth_aware: true,
2544 closes_at_open_tag: false,
2545 is_closing: false,
2546 })
2547 );
2548 }
2549
2550 #[test]
2551 fn test_try_parse_processing_instruction() {
2552 assert_eq!(
2553 try_parse_html_block_start("<?xml version=\"1.0\"?>", false),
2554 Some(HtmlBlockType::ProcessingInstruction)
2555 );
2556 }
2557
2558 #[test]
2559 fn test_try_parse_declaration() {
2560 // CommonMark dialect recognizes declarations as type-4 HTML blocks.
2561 assert_eq!(
2562 try_parse_html_block_start("<!DOCTYPE html>", true),
2563 Some(HtmlBlockType::Declaration)
2564 );
2565 // CommonMark §4.6 type 4 accepts any ASCII letter after `<!`, not
2566 // just uppercase. Lowercase doctype must match too.
2567 assert_eq!(
2568 try_parse_html_block_start("<!doctype html>", true),
2569 Some(HtmlBlockType::Declaration)
2570 );
2571 // Pandoc dialect does not — bare declarations fall through to
2572 // paragraph parsing.
2573 assert_eq!(try_parse_html_block_start("<!DOCTYPE html>", false), None);
2574 assert_eq!(try_parse_html_block_start("<!doctype html>", false), None);
2575 }
2576
#[test]
fn test_dialect_specific_block_tag_membership() {
    // The CommonMark and Pandoc dialects disagree on which tag names open
    // an HTML block. Two groups are pinned here:
    //   * tags that are block-level only under CommonMark (Pandoc treats
    //     them as inline raw HTML): dialog, legend, menuitem, optgroup,
    //     option, frame, base, basefont, link, param
    //   * tags that are block-level only under Pandoc: canvas, hgroup,
    //     isindex, meta, output
    let cm_only_tags = [
        "<dialog>",
        "<legend>",
        "<menuitem>",
        "<optgroup>",
        "<option>",
        "<frame>",
        "<base>",
        "<basefont>",
        "<link>",
        "<param>",
    ];
    for cm_only in cm_only_tags {
        let under_commonmark = try_parse_html_block_start(cm_only, true);
        assert!(
            matches!(under_commonmark, Some(HtmlBlockType::BlockTag { .. })),
            "{cm_only} should be a block-tag start under CommonMark",
        );

        let under_pandoc = try_parse_html_block_start(cm_only, false);
        assert_eq!(
            under_pandoc, None,
            "{cm_only} should NOT be a block-tag start under Pandoc",
        );
    }

    let pandoc_only_tags = ["<canvas>", "<hgroup>", "<isindex>", "<meta>", "<output>"];
    for pandoc_only in pandoc_only_tags {
        // CommonMark may still accept these through type 7 (a complete
        // tag alone on a line), which has different semantics, so only
        // the `BlockTag` variant is ruled out here.
        let under_commonmark = try_parse_html_block_start(pandoc_only, true);
        assert!(
            !matches!(under_commonmark, Some(HtmlBlockType::BlockTag { .. })),
            "{pandoc_only} should NOT be a type-6 block-tag start under CommonMark",
        );

        let under_pandoc = try_parse_html_block_start(pandoc_only, false);
        assert!(
            matches!(under_pandoc, Some(HtmlBlockType::BlockTag { .. })),
            "{pandoc_only} should be a block-tag start under Pandoc",
        );
    }
}
2631
#[test]
fn test_pandoc_inline_block_tag_membership() {
    // Opening forms of Pandoc's `eitherBlockOrInline` tags must start a
    // depth-aware HTML block under the Pandoc dialect. Only the non-void,
    // non-script members are listed: `script` goes through the verbatim
    // path, and void elements are not part of this list (see the
    // PANDOC_INLINE_BLOCK_TAGS docs).
    let opening_tags = [
        "<button>",
        "<iframe>",
        "<video>",
        "<audio>",
        "<noscript>",
        "<object>",
        "<map>",
        "<progress>",
        "<del>",
        "<ins>",
        "<svg>",
        "<applet>",
    ];
    for tag in opening_tags {
        let start = try_parse_html_block_start(tag, false);
        assert!(
            matches!(
                start,
                Some(HtmlBlockType::BlockTag {
                    depth_aware: true,
                    ..
                })
            ),
            "{tag} should be a depth-aware block-tag start under Pandoc",
        );
    }

    // A stray closing form also starts a block under Pandoc
    // (pandoc-native pins a standalone `</button>` as a single-line
    // `RawBlock`). There is nothing to balance against, so the block is
    // expected to be non-depth-aware and to end on its own line
    // (`closes_at_open_tag: true`).
    let closing_tags = ["</button>", "</iframe>", "</video>", "</audio>"];
    for closing in closing_tags {
        let start = try_parse_html_block_start(closing, false);
        assert!(
            matches!(
                start,
                Some(HtmlBlockType::BlockTag {
                    depth_aware: false,
                    closes_at_open_tag: true,
                    ..
                })
            ),
            "{closing} (closing form) should be a single-line block-tag start under Pandoc",
        );
    }
}
2683
#[test]
fn test_pandoc_void_block_tag_membership() {
    // Void `eitherBlockOrInline` tags have no closing tag to match, so
    // under the Pandoc dialect the block must end on the open-tag line:
    // `depth_aware: false`, `closes_at_open_tag: true`. Attributes on the
    // tag must not change that.
    let void_tags = [
        "<area>",
        "<embed>",
        "<source>",
        "<track>",
        "<embed src=\"foo.swf\">",
        "<source src=\"foo.mp4\" type=\"video/mp4\">",
    ];
    for tag in void_tags {
        let start = try_parse_html_block_start(tag, false);
        assert!(
            matches!(
                start,
                Some(HtmlBlockType::BlockTag {
                    depth_aware: false,
                    closes_at_open_tag: true,
                    ..
                })
            ),
            "{tag} should be a void block-tag start under Pandoc",
        );
    }

    // Void elements have no closing tag in HTML, yet `</embed>` and
    // friends show up in the wild. pandoc-native still emits them as
    // `RawBlock`s at fresh-block positions, so they get the same
    // single-line shape as the opening forms.
    let stray_closers = ["</area>", "</embed>", "</source>", "</track>"];
    for closing in stray_closers {
        let start = try_parse_html_block_start(closing, false);
        assert!(
            matches!(
                start,
                Some(HtmlBlockType::BlockTag {
                    depth_aware: false,
                    closes_at_open_tag: true,
                    ..
                })
            ),
            "{closing} (closing form) should be a single-line void block-tag start under Pandoc",
        );
    }

    // The CommonMark dialect does not take the void-tag path.
    // `<embed>` and `<area>` are not CM type-6 tags, so they fall through
    // to type 7 (a complete tag on a line by itself).
    for type7_tag in ["<embed>", "<area>"] {
        assert_eq!(
            try_parse_html_block_start(type7_tag, true),
            Some(HtmlBlockType::Type7)
        );
    }

    // `<source>` and `<track>` ARE in the CM type-6 BLOCK_TAGS set, so
    // they start a block with type-6 semantics (blank line terminates,
    // does not end on the open-tag line).
    for type6_tag in ["<source src=\"x\">", "<track src=\"x\">"] {
        let start = try_parse_html_block_start(type6_tag, true);
        assert!(matches!(
            start,
            Some(HtmlBlockType::BlockTag {
                closed_by_blank_line: true,
                closes_at_open_tag: false,
                ..
            })
        ));
    }
}
2760
#[test]
fn test_find_multiline_open_end() {
    // Table of (lines, tag name to match, expected result). Every case
    // starts at line 0 and passes that first line as the open-tag line.
    let cases: [(&[&str], &str, Option<usize>); 8] = [
        // `>` is already on the first line: not a multi-line open, so
        // the caller is expected to take the regular path.
        (&["<div id=\"x\">"], "div", None),
        (&["<embed src=\"x\">"], "embed", None),
        // Multi-line opens report the index of the line holding `>`.
        (&["<embed", " src=\"x\">"], "embed", Some(1)),
        (
            &["<embed", " src=\"x\"", " type=\"video\">"],
            "embed",
            Some(2),
        ),
        // The tag name has to match...
        (&["<embed", " src=\"x\">"], "div", None),
        // ...but the comparison ignores case.
        (&["<EMBED", " src=\"x\">"], "embed", Some(1)),
        // A `>` inside a quoted attribute value does not end the tag,
        // and the open quote carries over onto the next line.
        (&["<embed title=\"a>b", " c\">"], "embed", Some(1)),
        // No terminating `>` on any line.
        (&["<embed", " src=\"x\""], "embed", None),
    ];

    for (lines, tag_name, expected) in cases {
        assert_eq!(
            find_multiline_open_end(lines, 0, lines[0], tag_name),
            expected,
            "lines: {lines:?}, tag: {tag_name}",
        );
    }
}
2812
#[test]
fn test_pandoc_html_open_tag_closes() {
    // Table of (lines, whether the open tag is expected to close).
    let cases: [(&[&str], bool); 9] = [
        // `>` found on the very first line.
        (&["<div>"], true),
        (&["<embed src=\"x\">"], true),
        // `>` found on a later line.
        (&["<div", " id=\"x\">", "body", "</div>"], true),
        (&["<embed", " src=\"x.png\" alt=\"y\">"], true),
        // A `>` inside a quoted value does not count; quote state is
        // carried across lines, so only an unquoted `>` closes the tag.
        (&["<div title=\"a>b", " c\""], false),
        (&["<div title=\"a>b", " c\">"], true),
        // No `>` anywhere: pandoc treats this as paragraph text.
        (&["<embed"], false),
        (&["<div", "foo", "bar"], false),
        // Pandoc's `htmlTag` reads across blank lines inside an open
        // tag, so the scan keeps going until it hits `>` or EOF.
        (&["<div", "", "id=\"x\">"], true),
    ];

    for (lines, closes) in cases {
        assert_eq!(
            pandoc_html_open_tag_closes(lines, 0, 0),
            closes,
            "lines: {lines:?}",
        );
    }
}
2851
#[test]
fn test_try_parse_cdata() {
    let cdata_line = "<![CDATA[content]]>";

    // Under CommonMark a CDATA section is a type-5 HTML block.
    let commonmark = try_parse_html_block_start(cdata_line, true);
    assert_eq!(commonmark, Some(HtmlBlockType::CData));

    // The Pandoc dialect does not recognize it as a block start.
    let pandoc = try_parse_html_block_start(cdata_line, false);
    assert_eq!(pandoc, None);
}
2865
#[test]
fn test_extract_block_tag_name_open_only() {
    // With closing tags disallowed (second argument `false`), plain,
    // attributed and self-closing opens all yield the bare tag name.
    for open_tag in ["<div>", "<div class=\"test\">", "<div/>"] {
        let name = extract_block_tag_name(open_tag, false);
        assert_eq!(name.as_deref(), Some("div"));
    }

    // A closing tag, an empty tag, and a tag with a space before the
    // name are all rejected.
    for rejected in ["</div>", "<>", "< div>"] {
        let name = extract_block_tag_name(rejected, false);
        assert_eq!(name, None);
    }
}
2884
#[test]
fn test_extract_block_tag_name_with_closing() {
    // CommonMark §4.6 type-6 starts accept closing tags too; with the
    // second argument `true` the name is extracted from `</div>`, with
    // or without whitespace before the `>`.
    for closing_tag in ["</div>", "</div >"] {
        let name = extract_block_tag_name(closing_tag, true);
        assert_eq!(name.as_deref(), Some("div"));
    }
}
2897
#[test]
fn test_commonmark_type6_closing_tag_start() {
    // A bare `</div>` under CommonMark must start a block tag flagged as
    // a closing form that is terminated by a blank line.
    let expected = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: true,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: true,
    };

    let actual = try_parse_html_block_start("</div>", true);
    assert_eq!(actual, Some(expected));
}
2912
#[test]
fn test_commonmark_type7_open_tag() {
    // `<a>` is not a type-6 tag. Alone on a line it is a type-7 start
    // under CommonMark and is not a block start otherwise.
    let anchor = "<a href=\"foo\">";

    let commonmark = try_parse_html_block_start(anchor, true);
    assert_eq!(commonmark, Some(HtmlBlockType::Type7));

    let non_commonmark = try_parse_html_block_start(anchor, false);
    assert_eq!(non_commonmark, None);
}
2923
#[test]
fn test_commonmark_type7_close_tag() {
    // A complete closing tag alone on a line is a type-7 start under
    // CommonMark.
    let start = try_parse_html_block_start("</ins>", true);
    assert_eq!(start, Some(HtmlBlockType::Type7));
}
2931
#[test]
fn test_commonmark_type7_rejects_with_trailing_text() {
    // Type 7 requires that only whitespace follows the complete tag;
    // trailing text disqualifies the line.
    let start = try_parse_html_block_start("<a> hi", true);
    assert_eq!(start, None);
}
2937
#[test]
fn test_is_closing_marker_comment() {
    // For a comment block, a line closes it when it contains `-->`,
    // whether or not other text precedes it; the opener `<!--` does not.
    let comment = HtmlBlockType::Comment;
    let expectations = [("-->", true), ("end -->", true), ("<!--", false)];

    for (line, should_close) in expectations {
        assert_eq!(
            is_closing_marker(line, &comment),
            should_close,
            "line: {line:?}",
        );
    }
}
2945
#[test]
fn test_is_closing_marker_tag() {
    let div_block = HtmlBlockType::BlockTag {
        tag_name: String::from("div"),
        is_verbatim: false,
        closed_by_blank_line: false,
        depth_aware: false,
        closes_at_open_tag: false,
        is_closing: false,
    };

    // The matching close tag is detected regardless of case and even
    // when content precedes it on the line; an opening tag is not a
    // closing marker.
    let expectations = [
        ("</div>", true),
        ("</DIV>", true),
        ("content</div>", true),
        ("<div>", false),
    ];

    for (line, should_close) in expectations {
        assert_eq!(
            is_closing_marker(line, &div_block),
            should_close,
            "line: {line:?}",
        );
    }
}
2961
#[test]
fn test_parse_html_comment_block() {
    // The whole input is a single line holding both `<!--` and `-->`;
    // the parser is expected to report position 1 afterwards.
    let source = "<!-- comment -->\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    assert_eq!(next_pos, 1);
}
2982
#[test]
fn test_parse_div_block() {
    // Open tag, one content line, close tag: three lines in total, and
    // the returned position is expected to be 3.
    let source = "<div>\ncontent\n</div>\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    assert_eq!(next_pos, 3);
}
3003
#[test]
fn test_parse_html_block_no_closing() {
    // The `<div>` is never closed in this input.
    let source = "<div>\ncontent\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // With no closing tag the block runs to the end of the input, so
    // both lines are consumed.
    assert_eq!(next_pos, 2);
}
3025
#[test]
fn test_parse_div_block_nested_pandoc() {
    // Under the Pandoc dialect a `<div>` nested inside another `<div>`
    // must not end the outer block at the inner `</div>`. A
    // "first close wins" scanner (the CommonMark style) gets this wrong;
    // Pandoc's div parser tracks depth (mirrors `htmlInBalanced`).
    //
    // Line layout (9 lines): outer-open, blank, inner-open, blank,
    // content, blank, inner-close, blank, outer-close.
    let source =
        "<div id=\"outer\">\n\n<div id=\"inner\">\n\ndeep content\n\n</div>\n\n</div>\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);

    // `false` selects the Pandoc dialect.
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // All nine lines belong to the outer block.
    assert_eq!(next_pos, 9);
}
3054
#[test]
fn test_parse_div_block_same_line_pandoc() {
    // `<div>foo</div>` on a single line: opens=1, closes=1, depth=0, so
    // the block closes on its first line. Depth-aware tracking must not
    // regress this.
    //
    // The trailing `after` line is what makes the regression observable:
    // an unclosed block consumes every remaining line, so with a one-line
    // input a scanner that never closes would also report position 1.
    let input = "<div>foo</div>\nafter\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
    let mut builder = GreenNodeBuilder::new();

    let block_type = try_parse_html_block_start(lines[0], false).unwrap();
    let opts = ParserOptions::default();
    let new_pos = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        block_type,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &opts,
    );

    // Only the `<div>foo</div>` line is part of the block; `after` is
    // left for the caller.
    assert_eq!(new_pos, 1);
}
3076
#[test]
fn test_commonmark_verbatim_first_close() {
    // CommonMark verbatim tag (`<script>`): per CommonMark §4.6 type-1,
    // the block ends at the first line containing a matching close and
    // is not depth-aware. A bogus inner `<script>` is stashed inside a
    // JS string; the block must still close at the first `</script>`.
    //
    // The trailing `after` line is what separates the two behaviors: a
    // depth-aware scanner would see two opens and one close, never
    // balance, and run to EOF. Without a line after `</script>` both
    // scanners would report position 3.
    let input = "<script>\nlet x = '<script>';\n</script>\nafter\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(input);
    let mut builder = GreenNodeBuilder::new();

    // is_commonmark = true.
    let block_type = try_parse_html_block_start(lines[0], true).unwrap();
    let opts = ParserOptions::default();
    let new_pos = parse_html_block_with_wrapper(
        &mut builder,
        &lines,
        0,
        block_type,
        0,
        SyntaxKind::HTML_BLOCK,
        &opts,
    );

    // Closed at the first `</script>` (line 2), so new_pos = 3 and the
    // `after` line is not consumed.
    assert_eq!(new_pos, 3);
}
3102
#[test]
fn test_parse_div_block_multiline_open_close_separate_line_pandoc() {
    // The open tag is spread over four lines, with the terminating `>`
    // alone on the last of them:
    //
    //   0: <div
    //   1:   id="x"
    //   2:   class="y"
    //   3: >
    //   4:
    //   5: foo
    //   6:
    //   7: </div>
    //
    // Lines 0..=3 form the open tag; content begins at line 4.
    let source = "<div\n  id=\"x\"\n  class=\"y\"\n>\n\nfoo\n\n</div>\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green_builder = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green_builder,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // All eight lines are consumed.
    assert_eq!(next_pos, 8);

    let root = crate::syntax::SyntaxNode::new_root(green_builder.finish());

    // The attribute bytes must be exposed as a structural HTML_ATTRS
    // region in the CST (so the salsa anchor walk picks up `id="x"`).
    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Losslessness: concatenating every token reproduces the input
    // byte for byte.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3155
#[test]
fn test_parse_div_block_multiline_open_close_inline_pandoc() {
    // The open tag is spread over three lines, with the terminating `>`
    // at the end of the last attribute line. Case 0262 already covers
    // this pattern; this test additionally pins that the attributes are
    // exposed as a structural HTML_ATTRS node.
    let source = "<div\n  id=\"x\"\n  class=\"y\">\nfoo\n</div>\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], false).unwrap();
    let options = ParserOptions::default();

    let mut green_builder = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green_builder,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK_DIV,
        &options,
    );

    // Five lines: three open-tag lines (the third carries `>`), `foo`,
    // and `</div>`.
    assert_eq!(next_pos, 5);

    let root = crate::syntax::SyntaxNode::new_root(green_builder.finish());

    let has_attrs = root
        .descendants()
        .any(|node| node.kind() == SyntaxKind::HTML_ATTRS);
    assert!(has_attrs, "expected at least one HTML_ATTRS node");

    // Concatenating every token must reproduce the input exactly.
    let mut rebuilt = String::new();
    for element in root.descendants_with_tokens() {
        if let Some(token) = element.into_token() {
            rebuilt.push_str(token.text());
        }
    }
    assert_eq!(rebuilt, source);
}
3196
#[test]
fn test_commonmark_type6_blank_line_terminates() {
    // Under CommonMark, a `<div>` block is ended by a blank line rather
    // than by a closing tag.
    let source = "<div>\nfoo\n\nbar\n";
    let lines: Vec<&str> = crate::parser::utils::helpers::split_lines_inclusive(source);
    let start = try_parse_html_block_start(lines[0], true).unwrap();
    let options = ParserOptions::default();

    let mut green = GreenNodeBuilder::new();
    let next_pos = parse_html_block_with_wrapper(
        &mut green,
        &lines,
        0,
        start,
        0,
        SyntaxKind::HTML_BLOCK,
        &options,
    );

    // The block holds `<div>` and `foo`; parsing stops at the blank
    // line (line 2), leaving `bar` unconsumed.
    assert_eq!(next_pos, 2);
}
3218}