Skip to main content

panache_parser/
pandoc_ast.rs

1//! CST → Pandoc-native AST text projector.
2//!
3//! Walks a panache [`SyntaxNode`] and emits a string in the textual shape of
4//! pandoc's `Pandoc [Block]` AST — the same format produced by
5//! `pandoc -f markdown -t native`. Exposed via [`to_pandoc_ast`] and the
6//! `panache parse --to pandoc-ast` CLI mode; also drives the pandoc
7//! conformance harness in `tests/pandoc.rs`.
8//!
9//! Coverage is intentionally narrow. Unsupported nodes emit
10//! `Unsupported "<KIND>"` so a failing case stays visibly failing rather
11//! than silently dropping content; expand coverage as the corpus grows.
12//!
13//! Output shape matches pandoc 3.9.0.2 with default-standalone-off behavior:
14//! the document is rendered as a bare block list `[ <block>, ... ]`. The
15//! comparison normalizer collapses whitespace runs, so ppShow's pretty-print
16//! line breaks/indentation are not load-bearing.
17
18use std::cell::RefCell;
19use std::collections::{HashMap, HashSet};
20
21use crate::SyntaxNode;
22use crate::syntax::SyntaxKind;
23use rowan::NodeOrToken;
24use serde_json::{Value, json};
25
/// Pinned `pandoc-api-version` reported in `to_pandoc_json` output. Mirrors
/// the version reported by pandoc 3.9.0.2 (the version pinned by the
/// conformance corpus — see
/// `tests/fixtures/pandoc-conformance/.panache-source`). Bump alongside
/// any pandoc-version bump in that corpus.
///
/// The four components are emitted, in order, as the value of the
/// `"pandoc-api-version"` key of the JSON document.
const PANDOC_API_VERSION: [u32; 4] = [1, 23, 1, 1];
32
/// Document-wide lookup tables filled by the pre-pass in
/// `build_refs_ctx_inherited` and consulted (through the `REFS_CTX`
/// thread-local) while blocks and inlines are projected.
#[derive(Default)]
struct RefsCtx {
    /// Reference-definition label (normalized with `normalize_ref_label`)
    /// → `(url, title)`. The first definition recorded for a label is kept.
    refs: HashMap<String, (String, String)>,
    /// Every final heading id recorded by the pre-pass. `lookup_heading_id`
    /// tests slugified labels for membership in this set.
    heading_ids: HashSet<String>,
    /// Heading text-range start → final disambiguated id. Lets
    /// `heading_block` look up the document-level id (with `section`
    /// fallback for empty slugs and `-1`/`-2` suffixes for duplicates)
    /// that was computed during the pre-pass.
    heading_id_by_offset: HashMap<u32, String>,
    /// Footnote label → parsed body blocks. Lookup keyed by the raw label
    /// id text (no normalization needed — pandoc footnote labels are
    /// case-sensitive and not whitespace-collapsed).
    footnotes: HashMap<String, Vec<Block>>,
    /// Example-list label (`@label`) → resolved item number. Pandoc
    /// numbers all `OrderedList(_, Example, _)` items across the entire
    /// document with one shared counter; labeled items also become
    /// referenceable so inline `@label` resolves to the item's number.
    example_label_to_num: HashMap<String, usize>,
    /// Example-list start number per `LIST` text-range start. Looked up
    /// in `ordered_list_attrs` so each Example list reports the first
    /// item's number — picking up where the previous Example list left
    /// off rather than restarting at 1.
    example_list_start_by_offset: HashMap<u32, usize>,
    /// Note number per `CITATION` text-range start. Pandoc assigns each
    /// inline-cite group (and each footnote, regardless of inner cites)
    /// a position-counter value; cites inside a footnote share its number.
    cite_note_num_by_offset: HashMap<u32, i64>,
}
61
thread_local! {
    // Per-thread projection context. The public entry points install a
    // freshly built `RefsCtx` before projecting and put a default (empty)
    // one back once the output has been produced.
    static REFS_CTX: RefCell<RefsCtx> = RefCell::new(RefsCtx::default());
}
65
66/// Render the given panache CST as pandoc-native AST text.
67///
68/// Output mirrors `pandoc -f markdown -t native` for supported constructs.
69/// Unsupported nodes emit a visible `Unsupported "<KIND>"` sentinel rather
70/// than silently dropping content. Pair with [`normalize_native`] when
71/// comparing against captured pandoc output to ignore pretty-print
72/// whitespace differences.
73pub fn to_pandoc_ast(tree: &SyntaxNode) -> String {
74    let ctx = build_refs_ctx(tree);
75    REFS_CTX.with(|c| *c.borrow_mut() = ctx);
76    let blocks = blocks_from_doc(tree);
77    let mut out = String::new();
78    out.push('[');
79    for (i, b) in blocks.iter().enumerate() {
80        if i > 0 {
81            out.push(',');
82        }
83        out.push(' ');
84        write_block(b, &mut out);
85    }
86    out.push_str(" ]");
87    REFS_CTX.with(|c| *c.borrow_mut() = RefsCtx::default());
88    out
89}
90
91/// Render the given panache CST as pandoc JSON-AST text.
92///
93/// Output mirrors `pandoc -f markdown -t json`: a single JSON object
94/// `{"pandoc-api-version": [...], "meta": {...}, "blocks": [...]}` where
95/// each AST node is `{"t": "Constructor", "c": <content>}` (nullary
96/// constructors omit `"c"`). The block tree is the same one used by
97/// [`to_pandoc_ast`] — the difference is the surface encoding only.
98///
99/// Output is compact (no whitespace), matching pandoc's default. The
100/// `pandoc-api-version` field is pinned to [`PANDOC_API_VERSION`].
101///
102/// Note: object keys are emitted in alphabetical order (e.g. `"c"` before
103/// `"t"`) rather than pandoc's insertion order. JSON objects are unordered
104/// by spec, so downstream tools (`jq`, `ascii2uni`, deserializers) treat
105/// the outputs as equivalent — but they are not byte-identical.
106///
107/// As with [`to_pandoc_ast`], unsupported nodes emit a panache-internal
108/// `{"t": "Unsupported", "c": "<KIND>"}` sentinel rather than being
109/// silently dropped. This sentinel is not emitted by real pandoc.
110pub fn to_pandoc_json(tree: &SyntaxNode) -> String {
111    let ctx = build_refs_ctx(tree);
112    REFS_CTX.with(|c| *c.borrow_mut() = ctx);
113    let blocks = blocks_from_doc(tree);
114    let blocks_json: Vec<Value> = blocks.iter().map(block_to_json).collect();
115    REFS_CTX.with(|c| *c.borrow_mut() = RefsCtx::default());
116    let doc = json!({
117        "pandoc-api-version": PANDOC_API_VERSION,
118        "meta": {},
119        "blocks": blocks_json,
120    });
121    serde_json::to_string(&doc).expect("pandoc-json serialization is infallible")
122}
123
/// Runs the reference/heading/footnote pre-pass over `tree` with no parent
/// context to inherit from.
fn build_refs_ctx(tree: &SyntaxNode) -> RefsCtx {
    build_refs_ctx_inherited(tree, None)
}
127
128fn build_refs_ctx_inherited(tree: &SyntaxNode, parent: Option<&RefsCtx>) -> RefsCtx {
129    let mut ctx = RefsCtx::default();
130    collect_cite_note_nums(tree, &mut ctx);
131    let mut example_counter: usize = 0;
132    collect_example_numbering(tree, &mut ctx, &mut example_counter);
133    REFS_CTX.with(|c| {
134        let mut borrowed = c.borrow_mut();
135        borrowed.cite_note_num_by_offset = ctx.cite_note_num_by_offset.clone();
136        borrowed.example_label_to_num = ctx.example_label_to_num.clone();
137        borrowed.example_list_start_by_offset = ctx.example_list_start_by_offset.clone();
138    });
139    // Seed seen_ids from parent's heading_ids so inner heading auto-ids
140    // disambiguate against outer's history. Reverse-engineer counts from
141    // final ids: id `base` implies count >= 1; `base-N` implies count >=
142    // N+1. Take max per base.
143    let mut seen_ids: HashMap<String, u32> = HashMap::new();
144    if let Some(p) = parent {
145        for id in &p.heading_ids {
146            if let Some(idx) = id.rfind('-')
147                && let Ok(n) = id[idx + 1..].parse::<u32>()
148            {
149                let base = id[..idx].to_string();
150                let entry = seen_ids.entry(base).or_insert(0);
151                *entry = (*entry).max(n + 1);
152            }
153            let entry = seen_ids.entry(id.clone()).or_insert(0);
154            *entry = (*entry).max(1);
155        }
156    }
157    collect_refs_and_headings(tree, &mut ctx, &mut seen_ids);
158    // Fold parent refs/footnotes/heading_ids into the inner ctx so lookups
159    // during projection see both halves. Inner-defined keys win on conflict
160    // (scoping semantics; pandoc's true rule is "first def in document
161    // order wins" but tracking that across the recursive boundary would
162    // require offset-aware merging that no current corpus case exercises).
163    if let Some(p) = parent {
164        for (k, v) in &p.refs {
165            ctx.refs.entry(k.clone()).or_insert_with(|| v.clone());
166        }
167        for (k, v) in &p.footnotes {
168            ctx.footnotes.entry(k.clone()).or_insert_with(|| v.clone());
169        }
170        for id in &p.heading_ids {
171            ctx.heading_ids.insert(id.clone());
172        }
173    }
174    ctx
175}
176
177/// Walk every inline tree under `tree` and assign a `citationNoteNum` to
178/// each `CITATION` node. Pandoc's rule: outside footnotes, each Cite group
179/// (one CITATION node, regardless of internal `;`-separated keys) gets a
180/// fresh counter value; footnotes increment the counter once on entry,
181/// then ALL cites inside the footnote share that value.
182fn collect_cite_note_nums(tree: &SyntaxNode, ctx: &mut RefsCtx) {
183    let mut footnote_def_nodes: HashMap<String, SyntaxNode> = HashMap::new();
184    for child in tree.descendants() {
185        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION
186            && let Some(label) = footnote_label(&child)
187        {
188            footnote_def_nodes.entry(label).or_insert(child);
189        }
190    }
191    let mut counter: i64 = 0;
192    for child in tree.children() {
193        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION {
194            continue;
195        }
196        visit_for_cite_nums(&child, &footnote_def_nodes, &mut counter, None, ctx);
197    }
198}
199
200fn visit_for_cite_nums(
201    node: &SyntaxNode,
202    fn_defs: &HashMap<String, SyntaxNode>,
203    counter: &mut i64,
204    in_fn: Option<i64>,
205    ctx: &mut RefsCtx,
206) {
207    for el in node.children_with_tokens() {
208        if let NodeOrToken::Node(n) = el {
209            match n.kind() {
210                SyntaxKind::CITATION => {
211                    let offset: u32 = n.text_range().start().into();
212                    let num = if let Some(fn_num) = in_fn {
213                        fn_num
214                    } else {
215                        *counter += 1;
216                        *counter
217                    };
218                    ctx.cite_note_num_by_offset.insert(offset, num);
219                }
220                SyntaxKind::FOOTNOTE_REFERENCE => {
221                    if in_fn.is_none() {
222                        *counter += 1;
223                        let fn_num = *counter;
224                        if let Some(label) = footnote_label(&n)
225                            && let Some(def) = fn_defs.get(&label)
226                        {
227                            visit_for_cite_nums(def, fn_defs, counter, Some(fn_num), ctx);
228                        }
229                    }
230                }
231                _ => visit_for_cite_nums(&n, fn_defs, counter, in_fn, ctx),
232            }
233        }
234    }
235}
236
237/// Walk every `LIST` in document order and assign Example-list numbers.
238/// Pandoc tracks one counter across all `OrderedList(_, Example, _)` lists
239/// in a document, so each subsequent Example list picks up where the prior
240/// one left off. Labeled items (`(@label)`) get a label → number mapping
241/// for inline `@label` reference resolution.
242fn collect_example_numbering(node: &SyntaxNode, ctx: &mut RefsCtx, counter: &mut usize) {
243    for child in node.children() {
244        if child.kind() == SyntaxKind::LIST && list_is_example(&child) {
245            let list_offset: u32 = child.text_range().start().into();
246            ctx.example_list_start_by_offset
247                .insert(list_offset, *counter + 1);
248            for item in child
249                .children()
250                .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
251            {
252                *counter += 1;
253                if let Some(label) = example_item_label(&item) {
254                    ctx.example_label_to_num.entry(label).or_insert(*counter);
255                }
256            }
257            // Recurse into the list's contents to pick up nested Example
258            // lists (rare but possible).
259            collect_example_numbering(&child, ctx, counter);
260        } else {
261            collect_example_numbering(&child, ctx, counter);
262        }
263    }
264}
265
266/// `(@)` / `(@label)` markers identify Example list items. Returns true
267/// iff the LIST's first item carries such a marker (pandoc decides the
268/// list style from the first marker only).
269fn list_is_example(list: &SyntaxNode) -> bool {
270    let Some(item) = list.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
271        return false;
272    };
273    let marker = list_item_marker_text(&item);
274    let trimmed = marker.trim();
275    let body = if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
276        inner
277    } else if let Some(inner) = trimmed.strip_suffix(')') {
278        inner
279    } else if let Some(inner) = trimmed.strip_suffix('.') {
280        inner
281    } else {
282        trimmed
283    };
284    body.starts_with('@')
285        && body[1..]
286            .chars()
287            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
288}
289
290fn list_item_marker_text(item: &SyntaxNode) -> String {
291    item.children_with_tokens()
292        .filter_map(|el| el.into_token())
293        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
294        .map(|t| t.text().to_string())
295        .unwrap_or_default()
296}
297
298/// Returns the `@label` text for an Example list item, or `None` for the
299/// unlabeled `(@)` form.
300fn example_item_label(item: &SyntaxNode) -> Option<String> {
301    let marker = list_item_marker_text(item);
302    let trimmed = marker.trim();
303    let body = trimmed
304        .strip_prefix('(')
305        .and_then(|s| s.strip_suffix(')'))
306        .or_else(|| trimmed.strip_suffix(')'))
307        .or_else(|| trimmed.strip_suffix('.'))
308        .unwrap_or(trimmed);
309    let label = body.strip_prefix('@')?;
310    if label.is_empty() {
311        None
312    } else {
313        Some(label.to_string())
314    }
315}
316
/// Recursive pre-pass that fills `ctx.refs`, `ctx.footnotes`,
/// `ctx.heading_ids` and `ctx.heading_id_by_offset` from every node under
/// `node`.
///
/// `seen_ids` maps an id base to the number of auto-generated headings
/// that have used it so far; it drives the `-1`, `-2`, … suffixes for
/// duplicate auto-ids. For reference and footnote definitions the first
/// definition of a label is kept. Reference and footnote definitions are
/// not descended into; headings and every other node kind are.
fn collect_refs_and_headings(
    node: &SyntaxNode,
    ctx: &mut RefsCtx,
    seen_ids: &mut HashMap<String, u32>,
) {
    for child in node.children() {
        match child.kind() {
            SyntaxKind::REFERENCE_DEFINITION => {
                if let Some((label, url, title)) = parse_reference_def(&child) {
                    ctx.refs
                        .entry(normalize_ref_label(&label))
                        .or_insert((url, title));
                }
            }
            SyntaxKind::FOOTNOTE_DEFINITION => {
                if let Some((label, blocks)) = parse_footnote_def(&child) {
                    ctx.footnotes.entry(label).or_insert(blocks);
                }
            }
            SyntaxKind::HEADING => {
                let (id, was_explicit) = heading_id_with_explicitness(&child);
                let final_id = if was_explicit {
                    // Explicit `{#x}` ids are kept verbatim; pandoc only
                    // warns on conflicts but does not auto-disambiguate.
                    // NOTE(review): the entry is created with count 0, so a
                    // later auto-id with the same slug is emitted without a
                    // suffix — confirm this matches pandoc's behavior.
                    seen_ids.entry(id.clone()).or_insert(0);
                    id
                } else {
                    let mut base = id;
                    if base.is_empty() {
                        base = "section".to_string();
                    }
                    // First use keeps the bare base; the k-th repeat gets
                    // `base-k`.
                    let count = seen_ids.entry(base.clone()).or_insert(0);
                    let id = if *count == 0 {
                        base
                    } else {
                        format!("{base}-{count}")
                    };
                    *count += 1;
                    id
                };
                if !final_id.is_empty() {
                    let offset: u32 = child.text_range().start().into();
                    ctx.heading_ids.insert(final_id.clone());
                    ctx.heading_id_by_offset.insert(offset, final_id);
                }
                collect_refs_and_headings(&child, ctx, seen_ids);
            }
            _ => collect_refs_and_headings(&child, ctx, seen_ids),
        }
    }
}
368
369/// Returns `(id, was_explicit)` for a HEADING node. Explicit ids come from
370/// `{#id}` attributes; the auto-id is the slugified plaintext (which may be
371/// empty for headings whose text contains no slug-eligible characters).
372fn heading_id_with_explicitness(node: &SyntaxNode) -> (String, bool) {
373    let inlines = node
374        .children()
375        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
376        .map(|c| coalesce_inlines(inlines_from(&c)))
377        .unwrap_or_default();
378    let parsed = extract_attr_from_node(node);
379    if !parsed.id.is_empty() {
380        return (parsed.id, true);
381    }
382    (pandoc_slugify(&inlines_to_plaintext(&inlines)), false)
383}
384
385fn parse_footnote_def(node: &SyntaxNode) -> Option<(String, Vec<Block>)> {
386    let label = footnote_label(node)?;
387    let mut blocks = Vec::new();
388    for child in node.children() {
389        // The CST keeps each footnote-body line at its full raw indentation
390        // (the 4-space body indent plus any nested-block indent). Most blocks
391        // recover transparently because `coalesce_inlines` trims leading
392        // spaces on paragraph content, but indented code blocks preserve all
393        // leading whitespace — strip the 4 footnote-body spaces in addition
394        // to the code block's own 4.
395        if child.kind() == SyntaxKind::CODE_BLOCK
396            && !child
397                .children()
398                .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
399        {
400            blocks.push(indented_code_block_with_extra_strip(&child, 4));
401        } else {
402            collect_block(&child, &mut blocks);
403        }
404    }
405    Some((label, blocks))
406}
407
408fn indented_code_block_with_extra_strip(node: &SyntaxNode, extra: usize) -> Block {
409    let raw_format = code_block_raw_format(node);
410    let attr = code_block_attr(node);
411    let is_fenced = node
412        .children()
413        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
414    let mut content = String::new();
415    for child in node.children() {
416        if child.kind() == SyntaxKind::CODE_CONTENT {
417            content.push_str(&child.text().to_string());
418        }
419    }
420    while content.ends_with('\n') {
421        content.pop();
422    }
423    // Pandoc expands tabs (4-col stops) on code-block bodies before any
424    // indent stripping, so a `:\t` marker followed by `\t\t\tcode` correctly
425    // becomes `"        code"` after the 4-col definition-content offset is
426    // stripped. Apply expansion first, then strip.
427    content = content
428        .split('\n')
429        .map(expand_tabs_to_4)
430        .collect::<Vec<_>>()
431        .join("\n");
432    content = strip_leading_spaces_per_line(&content, extra);
433    if !is_fenced {
434        content = strip_indented_code_indent(&content);
435    }
436    if let Some(fmt) = raw_format {
437        return Block::RawBlock(fmt, content);
438    }
439    Block::CodeBlock(attr, content)
440}
441
/// Removes up to `n` leading space characters from every `\n`-separated
/// line of `s`. Lines with fewer leading spaces lose only what they have;
/// tabs and other characters are never removed.
fn strip_leading_spaces_per_line(s: &str, n: usize) -> String {
    s.split('\n')
        .map(|line| {
            // A space is a single byte, so the count is a valid byte offset.
            let skipped = line.bytes().take(n).take_while(|&b| b == b' ').count();
            &line[skipped..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
453
454fn footnote_label(node: &SyntaxNode) -> Option<String> {
455    for el in node.children_with_tokens() {
456        if let NodeOrToken::Token(t) = el
457            && t.kind() == SyntaxKind::FOOTNOTE_LABEL_ID
458        {
459            return Some(t.text().to_string());
460        }
461    }
462    None
463}
464
465fn parse_reference_def(node: &SyntaxNode) -> Option<(String, String, String)> {
466    let link = node.children().find(|c| c.kind() == SyntaxKind::LINK)?;
467    let label_node = link
468        .children()
469        .find(|c| c.kind() == SyntaxKind::LINK_TEXT)?;
470    let label = label_node.text().to_string();
471
472    // Read the structured destination/title nodes the parser emits — no
473    // re-parsing of the post-`]` tail. Angle brackets, when present, are
474    // delimiter tokens inside REFERENCE_URL.
475    let url_node = node
476        .children()
477        .find(|c| c.kind() == SyntaxKind::REFERENCE_URL)?;
478    let url_raw = url_node.text().to_string();
479    let url = url_raw
480        .strip_prefix('<')
481        .and_then(|r| r.strip_suffix('>'))
482        .unwrap_or(&url_raw)
483        .to_string();
484
485    let title = node
486        .children()
487        .find(|c| c.kind() == SyntaxKind::REFERENCE_TITLE)
488        .map(|t| parse_dest_title(&t.text().to_string()))
489        .unwrap_or_default();
490
491    Some((unescape_label(&label), url, title))
492}
493
494fn unescape_label(label: &str) -> String {
495    let mut out = String::with_capacity(label.len());
496    let mut chars = label.chars().peekable();
497    while let Some(ch) = chars.next() {
498        if ch == '\\'
499            && let Some(&next) = chars.peek()
500            && is_ascii_punct(next)
501        {
502            out.push(next);
503            chars.next();
504        } else {
505            out.push(ch);
506        }
507    }
508    out
509}
510
/// True for ASCII punctuation characters (the set that may be
/// backslash-escaped). `char::is_ascii_punctuation` already rejects every
/// non-ASCII character, so no separate `is_ascii` check is needed.
fn is_ascii_punct(c: char) -> bool {
    c.is_ascii_punctuation()
}
514
515/// Pandoc/CommonMark reference-label normalization: case-fold and collapse
516/// runs of whitespace to a single space, with leading/trailing trimmed.
517fn normalize_ref_label(label: &str) -> String {
518    let unescaped = unescape_label(label);
519    let mut out = String::new();
520    let mut last_space = false;
521    for ch in unescaped.chars() {
522        if ch.is_whitespace() {
523            if !out.is_empty() && !last_space {
524                out.push(' ');
525                last_space = true;
526            }
527        } else {
528            for lc in ch.to_lowercase() {
529                out.push(lc);
530            }
531            last_space = false;
532        }
533    }
534    if last_space {
535        out.pop();
536    }
537    out
538}
539
540fn lookup_ref(label: &str) -> Option<(String, String)> {
541    let key = normalize_ref_label(label);
542    REFS_CTX.with(|c| c.borrow().refs.get(&key).cloned())
543}
544
545fn lookup_heading_id(label: &str) -> Option<String> {
546    let id = pandoc_slugify(&unescape_label(label));
547    if id.is_empty() {
548        return None;
549    }
550    REFS_CTX.with(|c| {
551        if c.borrow().heading_ids.contains(&id) {
552            Some(id)
553        } else {
554            None
555        }
556    })
557}
558
/// Canonical form of a Pandoc-native AST string. The input is split into
/// tokens — brackets, parentheses and commas, double-quoted string literals
/// (kept verbatim, escapes included), and runs of any other characters —
/// which are then joined with single spaces, so pretty-print line breaks
/// and indentation no longer affect equality.
pub fn normalize_native(s: &str) -> String {
    fn is_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\n' | b'\r')
    }
    fn is_bracket_or_comma(b: u8) -> bool {
        matches!(b, b'[' | b']' | b'(' | b')' | b',')
    }

    let bytes = s.as_bytes();
    let mut tokens: Vec<&str> = Vec::new();
    let mut pos = 0usize;
    while let Some(lead) = bytes.get(pos).copied() {
        if is_blank(lead) {
            pos += 1;
            continue;
        }
        let start = pos;
        pos += 1;
        if lead == b'"' {
            // String literal: run to the matching unescaped quote, or to
            // the end of input when the literal is unterminated.
            loop {
                match bytes.get(pos).copied() {
                    None => break,
                    Some(b'\\') if pos + 1 < bytes.len() => pos += 2,
                    Some(b'"') => {
                        pos += 1;
                        break;
                    }
                    Some(_) => pos += 1,
                }
            }
        } else if !is_bracket_or_comma(lead) {
            // Bare word: extends up to the next delimiter of any kind.
            while let Some(b) = bytes.get(pos).copied() {
                if is_blank(b) || is_bracket_or_comma(b) || b == b'"' {
                    break;
                }
                pos += 1;
            }
        }
        // Token edges always sit next to an ASCII byte or an end of the
        // input, so this slice is on char boundaries.
        tokens.push(&s[start..pos]);
    }
    tokens.join(" ")
}
616
// Variant names mirror Pandoc's `Text.Pandoc.Definition` constructors so the
// emission code reads 1:1 against pandoc-native — `BlockQuote`, `CodeBlock`,
// `BulletList`, `OrderedList` are not redundant here, they are the spec names.
/// Block-level node of the projected document.
#[derive(Debug, Clone)]
#[allow(clippy::enum_variant_names)]
enum Block {
    Para(Vec<Inline>),
    Plain(Vec<Inline>),
    /// `(level, attr, inlines)`.
    Header(usize, Attr, Vec<Inline>),
    BlockQuote(Vec<Block>),
    /// `(attr, code text)`.
    CodeBlock(Attr, String),
    HorizontalRule,
    /// One `Vec<Block>` per list item.
    BulletList(Vec<Vec<Block>>),
    /// Presumably `(start number, number style, delimiter, items)`,
    /// following pandoc's `ListAttributes` — confirm against `list_block`.
    OrderedList(usize, &'static str, &'static str, Vec<Vec<Block>>),
    /// `(format, raw text)`.
    RawBlock(String, String),
    Table(TableData),
    Div(Attr, Vec<Block>),
    /// One `Vec<Inline>` per line.
    LineBlock(Vec<Vec<Inline>>),
    /// Presumably one `(term, definitions)` pair per entry — confirm against
    /// `definition_list`.
    DefinitionList(Vec<(Vec<Inline>, Vec<Vec<Block>>)>),
    /// `Figure attr (Caption Nothing [caption-blocks]) [body-blocks]` —
    /// pandoc's implicit_figures wraps an image-only paragraph whose
    /// alt text becomes the caption and whose body re-includes the
    /// image as a Plain block.
    Figure(Attr, Vec<Block>, Vec<Block>),
    /// Sentinel for node kinds without a projection; carries the debug
    /// name of the syntax kind.
    Unsupported(String),
}
643
/// Payload of a `Block::Table`, shared by the pipe, simple, grid and
/// multiline table projections.
#[derive(Debug, Clone)]
struct TableData {
    /// Pandoc's `+caption_attributes` extension lifts a trailing
    /// `{#id .class kv=...}` from the caption text into the Table's outer
    /// attribute. Default-empty for tables without caption attributes.
    attr: Attr,
    /// Caption inlines.
    caption: Vec<Inline>,
    /// Per-column alignment — presumably the pandoc alignment constructor
    /// name for each column; confirm against the table writers.
    aligns: Vec<&'static str>,
    /// Per-column width. `None` → `ColWidthDefault`, `Some(f)` → `ColWidth f`.
    widths: Vec<Option<f64>>,
    /// Header rows, each a list of cells.
    head_rows: Vec<Vec<GridCell>>,
    /// Body rows, each a list of cells.
    body_rows: Vec<Vec<GridCell>>,
    /// Footer rows. Currently only populated for grid tables with a
    /// trailing `+===+===+` separator before the final body row(s).
    foot_rows: Vec<Vec<GridCell>>,
}
660
/// One cell in a `TableData` row. `row_span`/`col_span` default to 1 for
/// pipe/simple/multiline tables (which don't model spans). Grid tables
/// compute proper span counts via the layout algorithm in `grid_table`.
#[derive(Debug, Clone)]
struct GridCell {
    /// Number of rows the cell covers.
    row_span: u32,
    /// Number of columns the cell covers.
    col_span: u32,
    /// Cell content.
    blocks: Vec<Block>,
}

impl GridCell {
    /// Wraps `blocks` in a cell that spans exactly one row and one column.
    fn no_span(blocks: Vec<Block>) -> Self {
        Self {
            row_span: 1,
            col_span: 1,
            blocks,
        }
    }
}
680
/// Inline-level node of the projected document. As with `Block`, the
/// variants carry pandoc's constructor names.
#[derive(Debug, Clone)]
#[allow(clippy::enum_variant_names)]
enum Inline {
    Str(String),
    Space,
    SoftBreak,
    LineBreak,
    Emph(Vec<Inline>),
    Strong(Vec<Inline>),
    Strikeout(Vec<Inline>),
    Superscript(Vec<Inline>),
    Subscript(Vec<Inline>),
    Code(Attr, String),
    /// Presumably `(attr, link text, url, title)`, parallel to `Image` —
    /// confirm against the link renderer.
    Link(Attr, Vec<Inline>, String, String),
    /// `(attr, alt inlines, url, title)`.
    Image(Attr, Vec<Inline>, String, String),
    /// Presumably `(math type constructor name, TeX source)` — confirm
    /// against the math renderer.
    Math(&'static str, String),
    Span(Attr, Vec<Inline>),
    RawInline(String, String),
    /// Presumably `(quote type constructor name, quoted inlines)`.
    Quoted(&'static str, Vec<Inline>),
    /// Footnote body blocks.
    Note(Vec<Block>),
    Cite(Vec<Citation>, Vec<Inline>),
    /// Sentinel for inline kinds without a projection.
    Unsupported(String),
}
704
/// One citation inside an `Inline::Cite` group.
#[derive(Debug, Clone)]
struct Citation {
    /// Citation key.
    id: String,
    /// Inlines preceding the key.
    prefix: Vec<Inline>,
    /// Inlines following the key.
    suffix: Vec<Inline>,
    mode: CitationMode,
    /// Note number — presumably the value recorded per `CITATION` offset in
    /// `RefsCtx::cite_note_num_by_offset`; confirm against the cite renderer.
    note_num: i64,
    /// Presumably pandoc's `citationHash` field — confirm.
    hash: i64,
}
714
/// How a citation is rendered; the variants carry the names of pandoc's
/// `CitationMode` constructors.
#[derive(Debug, Clone, Copy)]
enum CitationMode {
    AuthorInText,
    NormalCitation,
    SuppressAuthor,
}
721
/// Attribute triple attached to headers, code, divs, images and similar
/// nodes. The default value is the empty attribute: no id, no classes, no
/// key-value pairs.
#[derive(Debug, Default, Clone)]
struct Attr {
    /// Identifier; empty when absent.
    id: String,
    /// Class names, in order.
    classes: Vec<String>,
    /// Key-value pairs, in order.
    kvs: Vec<(String, String)>,
}
728
729// ----- block-level walking ------------------------------------------------
730
731fn blocks_from_doc(doc: &SyntaxNode) -> Vec<Block> {
732    let mut out = Vec::new();
733    for child in doc.children() {
734        collect_block(&child, &mut out);
735    }
736    out
737}
738
739fn block_from(node: &SyntaxNode) -> Option<Block> {
740    match node.kind() {
741        SyntaxKind::PARAGRAPH => Some(Block::Para(coalesce_inlines(inlines_from(node)))),
742        SyntaxKind::PLAIN => Some(Block::Plain(coalesce_inlines(inlines_from(node)))),
743        SyntaxKind::HEADING => Some(heading_block(node)),
744        SyntaxKind::BLOCK_QUOTE => Some(Block::BlockQuote(blockquote_blocks(node))),
745        SyntaxKind::CODE_BLOCK => Some(code_block(node)),
746        SyntaxKind::HORIZONTAL_RULE => Some(Block::HorizontalRule),
747        SyntaxKind::LIST => Some(list_block(node)),
748        SyntaxKind::BLANK_LINE => None,
749        // Reference definitions don't appear in pandoc-native output (they
750        // resolve into the link they define).
751        SyntaxKind::REFERENCE_DEFINITION => None,
752        // Footnote definitions are pulled into Note inlines at the
753        // FOOTNOTE_REFERENCE site; the definition block itself is dropped.
754        SyntaxKind::FOOTNOTE_DEFINITION => None,
755        // YAML metadata becomes the document Meta wrapper, not a body block.
756        // The projector emits a bare block list, so just drop these.
757        SyntaxKind::YAML_METADATA => None,
758        // Pandoc title block (`% title\n% authors\n% date`) populates Meta
759        // and produces no body block.
760        SyntaxKind::PANDOC_TITLE_BLOCK => None,
761        SyntaxKind::HTML_BLOCK => Some(html_block(node)),
762        SyntaxKind::HTML_BLOCK_DIV => Some(html_div_block(node)),
763        SyntaxKind::PIPE_TABLE => pipe_table(node).map(Block::Table),
764        SyntaxKind::SIMPLE_TABLE => simple_table(node).map(Block::Table),
765        SyntaxKind::GRID_TABLE => grid_table(node).map(Block::Table),
766        SyntaxKind::MULTILINE_TABLE => multiline_table(node).map(Block::Table),
767        SyntaxKind::TEX_BLOCK => Some(tex_block(node)),
768        SyntaxKind::FENCED_DIV => Some(fenced_div(node)),
769        SyntaxKind::LINE_BLOCK => Some(line_block(node)),
770        SyntaxKind::DEFINITION_LIST => Some(definition_list(node)),
771        SyntaxKind::FIGURE => Some(figure_block(node)),
772        other => Some(Block::Unsupported(format!("{other:?}"))),
773    }
774}
775
776/// Pandoc's `implicit_figures` extension wraps a paragraph that is *only*
777/// an Image into a `Figure` block: `Figure (id, [], []) (Caption Nothing
778/// [Plain alt]) [Plain [Image]]`. The image's alt-text inlines become the
779/// caption; the body holds the image itself wrapped in a Plain. Any
780/// attribute attached to the Image migrates to the Figure attr (id only)
781/// — the Image keeps its classes/kvs.
782fn figure_block(node: &SyntaxNode) -> Block {
783    let mut alt: Vec<Inline> = Vec::new();
784    let mut image_inline: Option<Inline> = None;
785    if let Some(image) = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_LINK) {
786        let alt_node = image.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
787        if let Some(an) = alt_node {
788            alt = coalesce_inlines(inlines_from(&an));
789        }
790        let mut tmp = Vec::new();
791        render_image_inline(&image, &mut tmp);
792        if let Some(first) = tmp.into_iter().next() {
793            image_inline = Some(first);
794        }
795    }
796    // Pandoc's `implicit_figures` migrates only the image's id to the Figure
797    // attr; the image keeps its classes and key-value pairs but loses the id.
798    let (figure_attr, image_inline) = match image_inline {
799        Some(Inline::Image(mut attr, alt_inlines, url, title)) if !attr.id.is_empty() => {
800            let fig_attr = Attr::with_id(std::mem::take(&mut attr.id));
801            (fig_attr, Some(Inline::Image(attr, alt_inlines, url, title)))
802        }
803        other => (Attr::default(), other),
804    };
805    let caption = if alt.is_empty() {
806        Vec::new()
807    } else {
808        vec![Block::Plain(alt)]
809    };
810    let body = match image_inline {
811        Some(img) => vec![Block::Plain(vec![img])],
812        None => Vec::new(),
813    };
814    Block::Figure(figure_attr, caption, body)
815}
816
817fn heading_block(node: &SyntaxNode) -> Block {
818    let level = heading_level(node);
819    let inlines = node
820        .children()
821        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
822        .map(|c| coalesce_inlines(inlines_from(&c)))
823        .unwrap_or_default();
824    // Auto-id and disambiguation are computed in the `RefsCtx` pre-pass so
825    // duplicate slugs and `section`-fallbacks are document-wide consistent.
826    // Explicit attributes still need their classes/kvs parsed here.
827    let offset: u32 = node.text_range().start().into();
828    let final_id = REFS_CTX
829        .with(|c| c.borrow().heading_id_by_offset.get(&offset).cloned())
830        .unwrap_or_default();
831    let mut attr = extract_attr_from_node(node);
832    if attr.id.is_empty() {
833        attr.id = final_id;
834    }
835    Block::Header(level, attr, inlines)
836}
837
838fn heading_level(node: &SyntaxNode) -> usize {
839    for child in node.children() {
840        if child.kind() == SyntaxKind::ATX_HEADING_MARKER {
841            for tok in child.children_with_tokens() {
842                if let Some(t) = tok.as_token()
843                    && t.kind() == SyntaxKind::ATX_HEADING_MARKER
844                {
845                    return t.text().chars().filter(|&c| c == '#').count();
846                }
847            }
848        }
849    }
850    for el in node.descendants_with_tokens() {
851        if let NodeOrToken::Token(t) = el
852            && t.kind() == SyntaxKind::SETEXT_HEADING_UNDERLINE
853        {
854            return if t.text().trim_start().starts_with('=') {
855                1
856            } else {
857                2
858            };
859        }
860    }
861    1
862}
863
864fn blockquote_blocks(node: &SyntaxNode) -> Vec<Block> {
865    let mut out = Vec::new();
866    for child in node.children() {
867        collect_block(&child, &mut out);
868    }
869    out
870}
871
872fn code_block(node: &SyntaxNode) -> Block {
873    let raw_format = code_block_raw_format(node);
874    let attr = code_block_attr(node);
875    let is_fenced = node
876        .children()
877        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
878    let mut content = String::new();
879    for child in node.children() {
880        if child.kind() == SyntaxKind::CODE_CONTENT {
881            content.push_str(&child.text().to_string());
882        }
883    }
884    // Pandoc strips the trailing newline that closes the block.
885    while content.ends_with('\n') {
886        content.pop();
887    }
888    if is_fenced {
889        // Pandoc tab-expands code-block bodies before emission. For indented
890        // code, the expansion happens inside `strip_indented_code_indent`
891        // before the 4-col strip; for fenced code there is no strip, so do
892        // it directly here.
893        content = content
894            .split('\n')
895            .map(expand_tabs_to_4)
896            .collect::<Vec<_>>()
897            .join("\n");
898    } else {
899        content = strip_indented_code_indent(&content);
900    }
901    if let Some(fmt) = raw_format {
902        return Block::RawBlock(fmt, content);
903    }
904    Block::CodeBlock(attr, content)
905}
906
907/// Pandoc's raw-attribute syntax (`Ext_raw_attribute`) treats a fenced code
908/// block whose info string is exactly `{=format}` as a `RawBlock` of that
909/// format rather than a `CodeBlock`. The brace contents must start with `=`
910/// followed by a non-empty token, with no other classes/ids/key-value pairs.
911fn code_block_raw_format(node: &SyntaxNode) -> Option<String> {
912    let open = node
913        .children()
914        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)?;
915    let info = open
916        .children()
917        .find(|c| c.kind() == SyntaxKind::CODE_INFO)?;
918    let raw = info.text().to_string();
919    let trimmed = raw.trim();
920    let inner = trimmed
921        .strip_prefix('{')
922        .and_then(|s| s.strip_suffix('}'))?;
923    let inner = inner.trim();
924    let format = inner.strip_prefix('=')?.trim();
925    if format.is_empty() || format.contains(char::is_whitespace) {
926        return None;
927    }
928    Some(format.to_string())
929}
930
931fn code_block_attr(node: &SyntaxNode) -> Attr {
932    let Some(open) = node
933        .children()
934        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
935    else {
936        return Attr::default();
937    };
938    let Some(info) = open.children().find(|c| c.kind() == SyntaxKind::CODE_INFO) else {
939        return Attr::default();
940    };
941    // Structured DisplayExplicit/DisplayShortcut: the parser emits bare `ATTR_*`
942    // children (plus a `CODE_LANGUAGE` token) for `{.python #id key=val}` and
943    // `lang {.cls}` forms. Read them directly. Executable chunks instead carry a
944    // `CHUNK_OPTIONS` node and stay on the text path below, as do opaque
945    // Plain/Raw info strings.
946    let has_bare_attrs = info.children_with_tokens().any(|el| {
947        matches!(
948            el.kind(),
949            SyntaxKind::ATTR_ID | SyntaxKind::ATTR_CLASS | SyntaxKind::ATTR_KEY_VALUE
950        )
951    });
952    let has_chunk_options = info
953        .children()
954        .any(|c| c.kind() == SyntaxKind::CHUNK_OPTIONS);
955    if has_bare_attrs && !has_chunk_options {
956        let mut attr = read_bare_attr_children(&info);
957        // Pandoc concatenates the language as the first class.
958        if let Some(lang) = info
959            .children_with_tokens()
960            .find(|el| el.kind() == SyntaxKind::CODE_LANGUAGE)
961            .and_then(|el| el.as_token().map(|t| t.text().to_string()))
962        {
963            attr.classes.insert(0, normalize_lang_id(&lang));
964        }
965        return attr;
966    }
967
968    let raw = info.text().to_string();
969    let trimmed = raw.trim();
970    if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
971        return parse_attr_block(inner);
972    }
973    // Shortcut form: `lang {.cls #id key=value}` — language followed by an
974    // attribute block. Pandoc concatenates the language as the first class.
975    if let Some(brace) = trimmed.find('{')
976        && trimmed.ends_with('}')
977    {
978        let lang = trimmed[..brace].trim();
979        let attr_inner = &trimmed[brace + 1..trimmed.len() - 1];
980        let mut attr = parse_attr_block(attr_inner);
981        if !lang.is_empty() {
982            attr.classes.insert(0, normalize_lang_id(lang));
983        }
984        return attr;
985    }
986    if !trimmed.is_empty() {
987        return Attr {
988            id: String::new(),
989            classes: vec![normalize_lang_id(trimmed)],
990            kvs: Vec::new(),
991        };
992    }
993    Attr::default()
994}
995
/// Normalizes a code-block language identifier, modelled on pandoc's
/// `toLanguageId` (Markdown reader): the identifier is ASCII-lowercased and
/// two GitHub-style spellings are rewritten (`c++` → `cpp`,
/// `objective-c` → `objectivec`).
///
/// NOTE(review): lowercasing happens *before* the alias lookup here, so
/// `C++` also maps to `cpp` — confirm this matches the pinned pandoc.
fn normalize_lang_id(lang: &str) -> String {
    let lowered = lang.to_ascii_lowercase();
    if lowered == "c++" {
        "cpp".to_string()
    } else if lowered == "objective-c" {
        "objectivec".to_string()
    } else {
        lowered
    }
}
1007
1008/// Pandoc strips up to four leading spaces (or one tab) from each line of an
1009/// indented code block. The CST keeps the indent as part of CODE_CONTENT, so
1010/// we remove it here.
1011fn strip_indented_code_indent(s: &str) -> String {
1012    let mut out = String::with_capacity(s.len());
1013    for (i, line) in s.split('\n').enumerate() {
1014        if i > 0 {
1015            out.push('\n');
1016        }
1017        // Pandoc expands tabs to 4-column tab stops *before* stripping the
1018        // 4-column indent. Mixed `  \tfoo` therefore becomes `    foo` →
1019        // `foo` after strip, which is what `pandoc -t native` emits.
1020        let expanded = expand_tabs_to_4(line);
1021        let stripped = if let Some(rest) = expanded.strip_prefix("    ") {
1022            rest.to_string()
1023        } else if let Some(rest) = expanded.strip_prefix('\t') {
1024            rest.to_string()
1025        } else {
1026            // Strip up to 3 leading spaces if present (pandoc tolerates short
1027            // indentation only on blank lines, which we don't try to detect
1028            // here — safer to leave non-conforming lines alone).
1029            expanded
1030        };
1031        out.push_str(&stripped);
1032    }
1033    out
1034}
1035
/// Replaces every `\t` in `line` with the spaces needed to reach the next
/// 4-column tab stop, counting columns from 0 at the start of `line`.
///
/// Every non-tab character advances the column by one (columns are counted
/// in `char`s, not bytes or display width). Code-block bodies are passed
/// through this so their text matches what pandoc emits.
fn expand_tabs_to_4(line: &str) -> String {
    const TAB_STOP: usize = 4;

    let mut expanded = String::with_capacity(line.len());
    let mut column = 0usize;
    for ch in line.chars() {
        match ch {
            '\t' => {
                // Always at least one space: a tab sitting exactly on a tab
                // stop advances a full stop.
                let pad = TAB_STOP - column % TAB_STOP;
                expanded.extend(std::iter::repeat(' ').take(pad));
                column += pad;
            }
            other => {
                expanded.push(other);
                column += 1;
            }
        }
    }
    expanded
}
1056
1057/// Single-block projection of an opaque `HTML_BLOCK`. Used when a non-
1058/// structural caller (e.g. grid-table cell reparse via `block_from`)
1059/// needs one `Block` rather than a stream. Emits a single `RawBlock`
1060/// — no structural lift (the lifted shape projects as multiple blocks
1061/// and is handled by `emit_html_block` via `collect_block`).
1062fn html_block(node: &SyntaxNode) -> Block {
1063    let mut content = node.text().to_string();
1064    while content.ends_with('\n') {
1065        content.pop();
1066    }
1067    Block::RawBlock("html".to_string(), content)
1068}
1069
1070/// Project an `HTML_BLOCK_DIV` node (a Pandoc-dialect-lifted
1071/// `<div ...>...</div>` block) into a `Block::Div`.
1072///
1073/// Walks the structural CST: attributes come from the open
1074/// `HTML_BLOCK_TAG`'s `HTML_ATTRS` descendant (Phase 1's structural
1075/// lift) and inner blocks from any non-tag CST children (Phase 6's
1076/// structural lift — `PARAGRAPH`, `HEADING`, nested `HTML_BLOCK_DIV`,
1077/// etc., produced when the parser recursively parses the inner
1078/// content of a `<div>...</div>` body).
1079///
1080/// All currently exercised `<div>` shapes lift structurally (clean
1081/// multi-line, open-trailing, butted-close, indented-close, same-
1082/// line, empty / blank-only, and bq-wrapped shapes). The defensive
1083/// fallback below emits an empty `Div` if a future parser change
1084/// somehow yields an unlifted `HTML_BLOCK_DIV` — that would be a
1085/// parser bug, not something the projector should silently reparse.
1086fn html_div_block(node: &SyntaxNode) -> Block {
1087    let attr = cst_div_open_tag_attr(node);
1088    if div_has_structural_inner(node) {
1089        let mut blocks = Vec::new();
1090        for child in node.children() {
1091            match child.kind() {
1092                SyntaxKind::HTML_BLOCK_TAG | SyntaxKind::BLANK_LINE => {}
1093                _ => collect_block(&child, &mut blocks),
1094            }
1095        }
1096        return Block::Div(attr, blocks);
1097    }
1098    debug_assert!(
1099        false,
1100        "HTML_BLOCK_DIV without structural inner shape — parser regression"
1101    );
1102    Block::Div(attr, Vec::new())
1103}
1104
1105/// Concatenate the node's token text, dropping prefix tokens injected
1106/// by the parser for container nesting:
1107/// - Every `BLOCK_QUOTE_MARKER` and the immediately-following
1108///   `WHITESPACE` token (bq-wrapped HTML lift —
1109///   `> <div>\n> foo\n> </div>` becomes `<div>\nfoo\n</div>`).
1110/// - A leading `WHITESPACE` token at the start of each source line
1111///   when it is NOT preceded by a `BLOCK_QUOTE_MARKER` on the same
1112///   line (list-item content_col stripped by
1113///   `parser/utils/list_item_buffer.rs::strip_list_item_indent` and
1114///   re-injected as a structural `WHITESPACE` token during graft —
1115///   `- <pre>\n  foo\n  </pre>` becomes `<pre>\nfoo\n</pre>` in the
1116///   RawBlock text). The parser never emits a leading line-start
1117///   `WHITESPACE` inside `HTML_BLOCK_CONTENT` or `HTML_BLOCK_TAG`
1118///   outside this lift path — top-level indented HTML keeps the
1119///   leading indent inside a single `TEXT` token — so the rule is
1120///   unambiguous.
1121fn collect_html_block_text_skip_bq_markers(node: &SyntaxNode) -> String {
1122    let mut out = String::new();
1123    let mut skip_next_ws = false;
1124    let mut at_line_start = true;
1125    walk_skip_bq_markers(node, &mut out, &mut skip_next_ws, &mut at_line_start);
1126    out
1127}
1128
1129fn walk_skip_bq_markers(
1130    node: &SyntaxNode,
1131    out: &mut String,
1132    skip_next_ws: &mut bool,
1133    at_line_start: &mut bool,
1134) {
1135    for child in node.children_with_tokens() {
1136        match child {
1137            NodeOrToken::Node(n) => walk_skip_bq_markers(&n, out, skip_next_ws, at_line_start),
1138            NodeOrToken::Token(t) => {
1139                if t.kind() == SyntaxKind::BLOCK_QUOTE_MARKER {
1140                    *skip_next_ws = true;
1141                    *at_line_start = false;
1142                    continue;
1143                }
1144                if *skip_next_ws && t.kind() == SyntaxKind::WHITESPACE {
1145                    *skip_next_ws = false;
1146                    *at_line_start = false;
1147                    continue;
1148                }
1149                if *at_line_start && t.kind() == SyntaxKind::WHITESPACE {
1150                    *at_line_start = false;
1151                    continue;
1152                }
1153                *skip_next_ws = false;
1154                let kind = t.kind();
1155                out.push_str(t.text());
1156                *at_line_start = kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE;
1157            }
1158        }
1159    }
1160}
1161
1162/// True when the parser has lifted the `<div>` body into structural
1163/// CST children AND both the open and close `HTML_BLOCK_TAG`s are
1164/// "clean" (carry no inner content): the open tag ends with the `>`
1165/// token followed only by a NEWLINE, and the close tag's first text
1166/// starts with `</`. "Messy" shapes — same-line `<div>foo</div>`,
1167/// trailing content on the open tag (`<div>foo\nbar\n</div>`),
1168/// butted close (`<div>\nfoo\nbar</div>`) — fall through to the byte
1169/// reparse path, which is the source of truth for those cases until
1170/// follow-up parser work lifts them too.
1171///
1172/// Presence of an `HTML_BLOCK_CONTENT` child signals an unlifted body
1173/// (parser kept body lines as opaque TEXT) — bq-wrapped divs are the
1174/// current example. Those still need the byte-reparse path. Empty
1175/// and blank-only bodies have no `HTML_BLOCK_CONTENT` child and can
1176/// be lifted structurally to `Div ("",[],[]) []`.
1177fn div_has_structural_inner(node: &SyntaxNode) -> bool {
1178    let mut tags = node
1179        .children()
1180        .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG);
1181    let Some(open_tag) = tags.next() else {
1182        return false;
1183    };
1184    // Close tag is optional: pandoc emits an implicit close at EOF
1185    // for an unclosed `<div>` (warning: "Div ... unclosed ... closing
1186    // implicitly"). The body lift still produces structural children
1187    // (or none, for empty `<div>`), which we project as `Block::Div`.
1188    let close_tag = tags.next();
1189    if tags.next().is_some() {
1190        return false;
1191    }
1192    if !html_block_open_tag_is_clean(&open_tag) {
1193        return false;
1194    }
1195    if let Some(close_tag) = close_tag.as_ref()
1196        && !html_block_close_tag_is_clean(close_tag)
1197    {
1198        return false;
1199    }
1200    !node
1201        .children()
1202        .any(|c| c.kind() == SyntaxKind::HTML_BLOCK_CONTENT)
1203}
1204
1205/// True when the open `HTML_BLOCK_TAG` carries no inner content after
1206/// its closing `>`: the tag's children, in order, end with a TEXT
1207/// token whose last byte is `>` (either the dedicated `>` token used
1208/// by the structural `<div>` emission, or the whole-line TEXT used by
1209/// non-div strict-block emission like `<form>` / `<section>`),
1210/// followed only by zero or more NEWLINE tokens. Trailing content
1211/// (e.g. `<div id="x">foo\n`) returns false.
1212fn html_block_open_tag_is_clean(open_tag: &SyntaxNode) -> bool {
1213    let mut seen_gt = false;
1214    for child in open_tag.children_with_tokens() {
1215        let NodeOrToken::Token(t) = child else {
1216            // Structural HTML_ATTRS nodes are part of the open tag;
1217            // ignore them — they belong before `>`.
1218            continue;
1219        };
1220        if !seen_gt {
1221            if t.kind() == SyntaxKind::TEXT && t.text().ends_with('>') {
1222                seen_gt = true;
1223            }
1224        } else if t.kind() != SyntaxKind::NEWLINE {
1225            return false;
1226        }
1227    }
1228    seen_gt
1229}
1230
1231/// True when the close `HTML_BLOCK_TAG`'s first TEXT token begins with
1232/// `</`. A butted-close shape (`bar</div>`) starts with content text
1233/// and returns false.
1234fn html_block_close_tag_is_clean(close_tag: &SyntaxNode) -> bool {
1235    for child in close_tag.children_with_tokens() {
1236        if let NodeOrToken::Token(t) = child
1237            && t.kind() == SyntaxKind::TEXT
1238        {
1239            return t.text().starts_with("</");
1240        }
1241    }
1242    false
1243}
1244
1245/// Read the `<div>` open tag's attributes from the structural CST.
1246/// `HTML_BLOCK_DIV` always has an open `HTML_BLOCK_TAG` as its first
1247/// `HTML_BLOCK_TAG` child. The open tag may contain multiple
1248/// `HTML_ATTRS` regions when the source spans multiple attribute lines
1249/// (e.g. `<div\n  id="x"\n  class="y">`); join their text with spaces
1250/// before parsing so attributes from every line contribute. Empty
1251/// attributes (`<div>`) produce `Attr::default()`.
1252fn cst_div_open_tag_attr(node: &SyntaxNode) -> Attr {
1253    let Some(open_tag) = node
1254        .children()
1255        .find(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG)
1256    else {
1257        return Attr::default();
1258    };
1259    // A multi-line `<div>` open tag emits one `HTML_ATTRS` region per line;
1260    // merge the structured attrs from each (first non-empty id wins, classes
1261    // and key-values concatenated in source order).
1262    let mut attr = Attr::default();
1263    for region in open_tag
1264        .children()
1265        .filter(|c| c.kind() == SyntaxKind::HTML_ATTRS)
1266    {
1267        let part = attr_from_html_attrs_node(&region);
1268        if attr.id.is_empty() && !part.id.is_empty() {
1269            attr.id = part.id;
1270        }
1271        attr.classes.extend(part.classes);
1272        attr.kvs.extend(part.kvs);
1273    }
1274    attr
1275}
1276
1277/// Project an `HTML_BLOCK` node into one or more `Block`s.
1278///
1279/// Pandoc's `markdown_in_html_blocks` extension (default-on under `markdown`
1280/// flavor) splits an HTML block at every complete *block-level* HTML tag:
1281/// each open or close tag emits its own `RawBlock`, and intervening
1282/// non-tag bytes are parsed as fresh markdown and emitted as `Plain` (or
1283/// `Para` for chunks separated by blank lines). Inline-only tags
1284/// (`<em>`, `<a>`, `<input>`, `<br>`, …) are not splitters — they pass
1285/// through as `RawInline` inside the surrounding `Plain` content.
1286///
1287/// Verbatim constructs are preserved as a single `RawBlock`: comments,
1288/// `<script>` / `<style>` / `<pre>` / `<textarea>`, processing
1289/// instructions, declarations, and CDATA. Balanced `<div>...</div>` is
1290/// handled at parse time (HTML_BLOCK_DIV lift) and routed through
1291/// `html_div_block`, not the splitter.
1292fn emit_html_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1293    // Fix #4 / Phase 6 structural lift: when the parser has lifted the
1294    // body into structural CST children (open `HTML_BLOCK_TAG` + body
1295    // blocks + close `HTML_BLOCK_TAG`, no `HTML_BLOCK_CONTENT`), walk
1296    // the children directly. This avoids the byte-reparse path that
1297    // would re-disambiguate heading auto-ids against a fresh inner
1298    // `RefsCtx` (producing `heading-1` instead of `heading` when the
1299    // outer ctx already saw the heading).
1300    if html_block_has_structural_lift(node) {
1301        emit_html_block_structural(node, out);
1302        return;
1303    }
1304    // Strip BLOCK_QUOTE_MARKER + WHITESPACE prefix tokens so the
1305    // byte-level walkers below see clean HTML — the parser keeps bq
1306    // markers as structural tokens inside HTML_BLOCK for verbatim-tag
1307    // content (e.g. `> <pre>\n> code\n> </pre>`). Outside a blockquote
1308    // this returns the same bytes as `node.text()`.
1309    let mut content = collect_html_block_text_skip_bq_markers(node);
1310    // Pandoc trims trailing ASCII whitespace (newlines, spaces, tabs)
1311    // from RawBlock text — `<!-- hi -->   \n` emits `RawBlock
1312    // "<!-- hi -->"`, not `"<!-- hi -->   "`. Interior whitespace is
1313    // preserved (e.g. `<pre>foo\n   </pre>` keeps the indented close).
1314    while content
1315        .as_bytes()
1316        .last()
1317        .is_some_and(|b| matches!(b, b'\n' | b'\r' | b' ' | b'\t'))
1318    {
1319        content.pop();
1320    }
1321    let leading_ws = content
1322        .as_bytes()
1323        .iter()
1324        .position(|&b| b != b' ' && b != b'\t')
1325        .unwrap_or(content.len());
1326    let trimmed = &content[leading_ws..];
1327    // Pandoc strips leading 1-3 spaces of indent from the first line
1328    // of an HTML block's RawBlock text — `  <pre>foo</pre>\n` emits
1329    // `RawBlock "<pre>foo</pre>"`. Subsequent lines keep their
1330    // indent. The HTML-block scanner only recognizes 0-3 leading
1331    // spaces of indent, so leading_ws is bounded; tabs aren't part
1332    // of an HTML-block opener and shouldn't be stripped.
1333    let strip_first_line_indent = leading_ws > 0
1334        && leading_ws <= 3
1335        && content.as_bytes()[..leading_ws].iter().all(|&b| b == b' ');
1336    if trimmed.starts_with("<!--")
1337        || trimmed.starts_with("<?")
1338        || trimmed.starts_with("<![CDATA[")
1339        || trimmed.starts_with("<!")
1340        || is_raw_text_element_open(trimmed)
1341    {
1342        let raw = if strip_first_line_indent {
1343            content[leading_ws..].to_string()
1344        } else {
1345            content
1346        };
1347        out.push(Block::RawBlock("html".to_string(), raw));
1348        return;
1349    }
1350    let walker_input = if strip_first_line_indent {
1351        &content[leading_ws..]
1352    } else {
1353        content.as_str()
1354    };
1355    split_html_block_by_tags(walker_input, out);
1356}
1357
1358/// True when an `HTML_BLOCK` carries the Fix #4 structural lift shape:
1359/// exactly two `HTML_BLOCK_TAG` children (open + close), both "clean"
1360/// (open ends at `>`, close starts with `</`), and no
1361/// `HTML_BLOCK_CONTENT` (which would mark an unlifted opaque body).
1362/// Empty bodies (only the two tags, with optional `BLANK_LINE` in
1363/// between) still count as lifted — they project as RawBlock +
1364/// RawBlock with nothing in between, matching pandoc.
1365fn html_block_has_structural_lift(node: &SyntaxNode) -> bool {
1366    let mut tags = node
1367        .children()
1368        .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG);
1369    let Some(open_tag) = tags.next() else {
1370        return false;
1371    };
1372    let Some(close_tag) = tags.next() else {
1373        return false;
1374    };
1375    if tags.next().is_some() {
1376        return false;
1377    }
1378    if !html_block_open_tag_is_clean(&open_tag) {
1379        return false;
1380    }
1381    if !html_block_close_tag_is_clean(&close_tag) {
1382        return false;
1383    }
1384    !node
1385        .children()
1386        .any(|c| c.kind() == SyntaxKind::HTML_BLOCK_CONTENT)
1387}
1388
1389/// Emit an `HTML_BLOCK` whose body has been structurally lifted: walk
1390/// its CST children, projecting the open/close `HTML_BLOCK_TAG`s as
1391/// `RawBlock` (one each, trailing newlines trimmed to match pandoc-
1392/// native's tag-only emission) and inner block children through
1393/// `collect_block`. `BLANK_LINE` children are skipped (they don't
1394/// project to anything in pandoc-native).
1395fn emit_html_block_structural(node: &SyntaxNode, out: &mut Vec<Block>) {
1396    for child in node.children() {
1397        match child.kind() {
1398            SyntaxKind::HTML_BLOCK_TAG => {
1399                let text = open_tag_raw_block_text(&child);
1400                out.push(Block::RawBlock("html".to_string(), text));
1401            }
1402            SyntaxKind::BLANK_LINE => {}
1403            _ => collect_block(&child, out),
1404        }
1405    }
1406}
1407
1408/// Produce the `RawBlock` text for an `HTML_BLOCK_TAG` (open or close)
1409/// under the structural lift. Trailing newlines are always trimmed
1410/// (pandoc emits the tag bytes only). When the tag contains an
1411/// `HTML_ATTRS` structural region, the text is canonicalized to
1412/// pandoc's single-line form `<tagname attr1 attr2 ...>`:
1413/// multi-line opens collapse to one line, inter-attribute whitespace
1414/// normalizes to a single space, and any trailing whitespace before
1415/// `>` is dropped. Open tags without structural HTML_ATTRS (e.g.
1416/// `<form>`) and close tags (`</form>`) keep their literal text.
1417fn open_tag_raw_block_text(tag: &SyntaxNode) -> String {
1418    let has_attrs = tag.children().any(|c| c.kind() == SyntaxKind::HTML_ATTRS);
1419    if has_attrs {
1420        let mut name_prefix: Option<String> = None;
1421        let mut attrs: Vec<String> = Vec::new();
1422        for child in tag.children_with_tokens() {
1423            match child {
1424                NodeOrToken::Token(t) if t.kind() == SyntaxKind::TEXT => {
1425                    let text = t.text();
1426                    if name_prefix.is_none() && text.starts_with('<') {
1427                        if let Some(gt_idx) = text.find('>') {
1428                            // Whole-line shape (`<form>` etc., shouldn't
1429                            // reach here because has_attrs would be
1430                            // false). Defensive: emit literal prefix.
1431                            return text[..=gt_idx].to_string();
1432                        }
1433                        name_prefix = Some(text.to_string());
1434                    }
1435                }
1436                NodeOrToken::Node(n) if n.kind() == SyntaxKind::HTML_ATTRS => {
1437                    let attr_text = n.text().to_string();
1438                    let trimmed = attr_text.trim();
1439                    if !trimmed.is_empty() {
1440                        attrs.push(trimmed.to_string());
1441                    }
1442                }
1443                _ => {}
1444            }
1445        }
1446        let mut result = name_prefix.unwrap_or_default();
1447        for attr in &attrs {
1448            result.push(' ');
1449            result.push_str(attr);
1450        }
1451        result.push('>');
1452        return result;
1453    }
1454    // Blockquote-wrapped close tags (`> </form>`, `> </video>`) carry
1455    // their leading `BLOCK_QUOTE_MARKER + WHITESPACE` tokens inside the
1456    // close `HTML_BLOCK_TAG` for losslessness. Pandoc-native's RawBlock
1457    // text is the tag bytes only — strip those prefix tokens. Leading
1458    // 1-3 space indent (captured as a WHITESPACE token before the tag
1459    // name TEXT) is likewise stripped: pandoc's HTML block scanner
1460    // accepts ≤ 3 leading spaces on the open/close line but doesn't
1461    // round-trip them into the RawBlock text.
1462    let mut text = String::new();
1463    let mut skip_next_ws = false;
1464    for child in tag.children_with_tokens() {
1465        if let NodeOrToken::Token(t) = child {
1466            if t.kind() == SyntaxKind::BLOCK_QUOTE_MARKER {
1467                skip_next_ws = true;
1468                continue;
1469            }
1470            if skip_next_ws && t.kind() == SyntaxKind::WHITESPACE {
1471                skip_next_ws = false;
1472                continue;
1473            }
1474            if text.is_empty() && t.kind() == SyntaxKind::WHITESPACE {
1475                continue;
1476            }
1477            skip_next_ws = false;
1478            text.push_str(t.text());
1479        }
1480    }
1481    while text.ends_with('\n') {
1482        text.pop();
1483    }
1484    text
1485}
1486
1487/// Walk `content`'s bytes and split at every complete block-level HTML tag.
1488/// Each tag emits its own `RawBlock`; intervening text is flushed via
1489/// [`flush_html_block_text`]. Balanced `<div>...</div>` pairs (depth-aware)
1490/// project to `Block::Div`.
1491///
1492/// Tag classification follows pandoc:
1493/// - **Strict block tags** (pandoc's `blockHtmlTags`) always split.
1494/// - **Inline-block tags** (pandoc's `eitherBlockOrInline` set —
1495///   `<iframe>`, `<button>`, `<video>`, …) split with a matched-pair
1496///   3-way lift only at fresh-block positions; inside an existing
1497///   inline run they pass through as raw inline HTML. The
1498///   `inline_pending` flag tracks whether any non-whitespace content
1499///   (text bytes or non-splitting tags) has appeared since the last
1500///   splitter — when true, we don't split on inline-block tags.
1501fn split_html_block_by_tags(content: &str, out: &mut Vec<Block>) {
1502    use crate::parser::blocks::html_blocks::{
1503        is_pandoc_block_tag_name, is_pandoc_inline_block_tag_name, is_pandoc_void_block_tag_name,
1504    };
1505    use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
1506
1507    let bytes = content.as_bytes();
1508    let mut i = 0usize;
1509    let mut text_start = 0usize;
1510    let mut inline_pending = false;
1511    let mut consecutive_newlines = 0usize;
1512    while i < bytes.len() {
1513        let b = bytes[i];
1514        if b == b'\n' {
1515            consecutive_newlines += 1;
1516            // A blank line resets the inline-pending state — pandoc
1517            // restarts block parsing after a blank line, so subsequent
1518            // inline-block tags become eligible to split again.
1519            if consecutive_newlines >= 2 {
1520                inline_pending = false;
1521            }
1522            i += 1;
1523            continue;
1524        }
1525        consecutive_newlines = 0;
1526        if b != b'<' {
1527            if !b.is_ascii_whitespace() {
1528                inline_pending = true;
1529            }
1530            i += 1;
1531            continue;
1532        }
1533        let rest = &content[i..];
1534        let open_end = parse_open_tag(rest);
1535        let close_end = parse_close_tag(rest);
1536        let Some((tag_end, is_close)) = open_end
1537            .map(|n| (n, false))
1538            .or_else(|| close_end.map(|n| (n, true)))
1539        else {
1540            inline_pending = true;
1541            i += 1;
1542            continue;
1543        };
1544        let tag_text = &rest[..tag_end];
1545        let Some(name) = extract_html_tag_name(tag_text) else {
1546            inline_pending = true;
1547            i += 1;
1548            continue;
1549        };
1550        if is_pandoc_block_tag_name(name) {
1551            // Strict block tags (incl. `<div>`) inside an opaque
1552            // HTML_BLOCK split into RawBlocks per tag. Matched
1553            // `<div>...</div>` is handled at parse time (HTML_BLOCK_DIV
1554            // lift); we only reach the splitter for unbalanced or
1555            // multi-tag content (e.g. `</section>` standalone), so
1556            // emit each tag as its own RawBlock.
1557            if i > text_start {
1558                flush_html_block_text(&content[text_start..i], out);
1559            }
1560            out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1561            i += tag_end;
1562            text_start = i;
1563            inline_pending = false;
1564            continue;
1565        }
1566        if is_pandoc_inline_block_tag_name(name) {
1567            // At a fresh-block position (!inline_pending):
1568            //
1569            // - Open tag with matched close, interior not opening with a
1570            //   void block tag: lift `<tag>...</tag>` into RawBlock +
1571            //   interior + RawBlock.
1572            // - Open tag with no matched close, or open tag whose interior
1573            //   begins (after any indent) with a void block tag at column
1574            //   0: emit the open tag as a single RawBlock and continue
1575            //   scanning. Pandoc-native treats `<video>\n<source>...` as
1576            //   per-tag emission rather than a balanced span; once
1577            //   `<source>` interrupts the run, the closing `</video>` ends
1578            //   up as `RawInline` inside the trailing paragraph.
1579            // - Close tag at fresh-block: emit as a single RawBlock.
1580            //   Pandoc-native pins `</video>` standalone as a RawBlock.
1581            //
1582            // Inside an existing inline run (`inline_pending == true`),
1583            // pass through as inline raw HTML (pandoc's `cannot_interrupt`
1584            // semantics for `eitherBlockOrInline` tags).
1585            if !inline_pending {
1586                if !is_close
1587                    && let Some((close_start, close_end)) =
1588                        find_matching_html_close_with_start(content, i, name)
1589                    && !interior_starts_with_void_block_tag(content, i + tag_end)
1590                {
1591                    if i > text_start {
1592                        flush_html_block_text(&content[text_start..i], out);
1593                    }
1594                    out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1595                    let interior = &content[i + tag_end..close_start];
1596                    flush_html_block_text(interior, out);
1597                    let close_text = &content[close_start..close_end];
1598                    out.push(Block::RawBlock("html".to_string(), close_text.to_string()));
1599                    i = close_end;
1600                    text_start = i;
1601                    inline_pending = false;
1602                    continue;
1603                }
1604                if i > text_start {
1605                    flush_html_block_text(&content[text_start..i], out);
1606                }
1607                out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1608                i += tag_end;
1609                text_start = i;
1610                inline_pending = false;
1611                continue;
1612            }
1613            inline_pending = true;
1614            i += tag_end;
1615            continue;
1616        }
1617        if is_pandoc_void_block_tag_name(name) {
1618            // Void `eitherBlockOrInline` tags (`<embed>`, `<area>`,
1619            // `<source>`, `<track>`) emit as a single `RawBlock` per
1620            // instance at fresh-block positions; inside inline content
1621            // (`inline_pending == true`) they pass through as raw
1622            // inline HTML. The closing form (`</embed>` etc.) is not
1623            // valid HTML for void elements, but if it appears in the
1624            // wild pandoc still emits it as a `RawBlock` at fresh-block
1625            // positions — mirror that.
1626            if !inline_pending {
1627                if i > text_start {
1628                    flush_html_block_text(&content[text_start..i], out);
1629                }
1630                out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1631                i += tag_end;
1632                text_start = i;
1633                inline_pending = false;
1634                continue;
1635            }
1636            inline_pending = true;
1637            i += tag_end;
1638            continue;
1639        }
1640        // Non-splitting tag (truly inline-only HTML). Mark that an
1641        // inline run has started so subsequent `eitherBlockOrInline`
1642        // tags don't split mid-paragraph.
1643        inline_pending = true;
1644        i += tag_end;
1645    }
1646    if text_start < bytes.len() {
1647        // Tail text — no further tag follows in this HTML block, so the
1648        // final `Para` should NOT be demoted to `Plain`. Pandoc only
1649        // promotes a paragraph to `Plain` when it is butted up against
1650        // the next HTML tag in the same block.
1651        flush_html_block_tail_text(&content[text_start..], out);
1652    }
1653}
1654
1655/// Reparse inter-tag text as fresh Pandoc markdown. The final `Para`
1656/// becomes a `Plain` when the text has no trailing blank line (i.e. a
1657/// closing tag follows immediately): pandoc promotes the last paragraph
1658/// to `Plain` whenever it is butted up against the next HTML tag.
1659///
1660/// Use [`flush_html_block_tail_text`] for text at the END of the HTML
1661/// block (no tag follows) — the demotion would be wrong there.
1662fn flush_html_block_text(text: &str, out: &mut Vec<Block>) {
1663    if text.trim().is_empty() {
1664        return;
1665    }
1666    let trailing_blank = trailing_newlines(text) >= 2;
1667    let mut blocks = parse_pandoc_blocks(text);
1668    if blocks.is_empty() {
1669        return;
1670    }
1671    if !trailing_blank
1672        && let Some(Block::Para(_)) = blocks.last()
1673        && let Some(Block::Para(inlines)) = blocks.pop()
1674    {
1675        blocks.push(Block::Plain(inlines));
1676    }
1677    out.extend(blocks);
1678}
1679
1680/// Reparse trailing text at the end of an HTML block (no tag follows).
1681/// Unlike [`flush_html_block_text`], the final `Para` is preserved —
1682/// pandoc only demotes to `Plain` when butted up against the next tag.
1683fn flush_html_block_tail_text(text: &str, out: &mut Vec<Block>) {
1684    if text.trim().is_empty() {
1685        return;
1686    }
1687    let blocks = parse_pandoc_blocks(text);
1688    out.extend(blocks);
1689}
1690
/// Number of `\n` bytes at the very end of `s` (zero when `s` does not end
/// in a newline).
fn trailing_newlines(s: &str) -> usize {
    s.len() - s.trim_end_matches('\n').len()
}
1694
1695/// Whether the slice of `content` starting at `interior_start` (the byte
1696/// just after an inline-block open tag like `<video>`) begins on its first
1697/// non-blank line with a void block tag (`<source>`, `<embed>`, `<area>`,
1698/// `<track>`). When true, pandoc-native abandons the matched-pair lift —
1699/// the void tag emits as its own `RawBlock` and the closing `</video>`
1700/// ends up inline inside the trailing paragraph rather than as a
1701/// matched-pair close. Leading indentation is allowed before the void tag
1702/// (pandoc still abandons even when the void tag is indented).
1703fn interior_starts_with_void_block_tag(content: &str, interior_start: usize) -> bool {
1704    use crate::parser::blocks::html_blocks::is_pandoc_void_block_tag_name;
1705    use crate::parser::inlines::inline_html::parse_open_tag;
1706
1707    let bytes = content.as_bytes();
1708    let mut i = interior_start;
1709    while i < bytes.len() && matches!(bytes[i], b'\n' | b' ' | b'\t') {
1710        i += 1;
1711    }
1712    if i >= bytes.len() || bytes[i] != b'<' {
1713        return false;
1714    }
1715    let rest = &content[i..];
1716    let Some(end) = parse_open_tag(rest) else {
1717        return false;
1718    };
1719    extract_html_tag_name(&rest[..end]).is_some_and(is_pandoc_void_block_tag_name)
1720}
1721
/// Pull the tag name out of a complete HTML tag (`<name ...>` or
/// `</name>`). The name is the run of ASCII alphanumerics and `-` that
/// follows `<` (or `</`); `None` is returned when the text does not start
/// with `<` or that run is empty. Callers use the name to decide whether a
/// tag belongs to a block-level tag set.
fn extract_html_tag_name(tag_text: &str) -> Option<&str> {
    let after_lt = tag_text.strip_prefix('<')?;
    let body = after_lt.strip_prefix('/').unwrap_or(after_lt);
    let name_len = body
        .bytes()
        .take_while(|b| b.is_ascii_alphanumeric() || *b == b'-')
        .count();
    (name_len > 0).then(|| &body[..name_len])
}
1740
1741/// Depth-aware scan for the matching closing tag of `name` starting at
1742/// byte position `start` (the `<` of the opening tag) in `content`.
1743/// Returns `(close_start, close_end)` — the bounds of the matching
1744/// `</name>` tag — or `None` when no balanced close exists in `content`.
1745fn find_matching_html_close_with_start(
1746    content: &str,
1747    start: usize,
1748    name: &str,
1749) -> Option<(usize, usize)> {
1750    use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
1751
1752    let bytes = content.as_bytes();
1753    let opener_end = parse_open_tag(&content[start..])?;
1754    let mut i = start + opener_end;
1755    let mut depth = 1usize;
1756    while i < bytes.len() {
1757        if bytes[i] != b'<' {
1758            i += 1;
1759            continue;
1760        }
1761        let rest = &content[i..];
1762        if let Some(end) = parse_open_tag(rest) {
1763            let tag = &rest[..end];
1764            if extract_html_tag_name(tag).is_some_and(|n| n.eq_ignore_ascii_case(name)) {
1765                depth += 1;
1766            }
1767            i += end;
1768            continue;
1769        }
1770        if let Some(end) = parse_close_tag(rest) {
1771            let tag = &rest[..end];
1772            if extract_html_tag_name(tag).is_some_and(|n| n.eq_ignore_ascii_case(name)) {
1773                depth -= 1;
1774                if depth == 0 {
1775                    return Some((i, i + end));
1776                }
1777            }
1778            i += end;
1779            continue;
1780        }
1781        i += 1;
1782    }
1783    None
1784}
1785
/// Return true if `s` (with leading `<`) opens a raw-text HTML element where
/// pandoc keeps the entire block verbatim — no markdown parsing inside.
///
/// The recognised elements are `script`, `style`, `pre` and `textarea`,
/// matched ASCII-case-insensitively. A match requires the tag name to be
/// followed by a space, tab, newline, `>`, `/`, or end-of-string, so
/// `<prefix>` is not mistaken for `<pre>`.
///
/// The comparison is done on bytes rather than by slicing the `str` at the
/// tag length: slicing would panic whenever that offset lands inside a
/// multi-byte character (for example `<préfix>`), whereas such input can
/// simply never match an ASCII tag name.
fn is_raw_text_element_open(s: &str) -> bool {
    const RAW_TEXT_TAGS: [&str; 4] = ["script", "style", "pre", "textarea"];

    let Some(rest) = s.strip_prefix('<') else {
        return false;
    };
    let rest = rest.as_bytes();
    RAW_TEXT_TAGS.iter().any(|tag| {
        // `get` yields `None` when `rest` is shorter than the tag name.
        let Some(candidate) = rest.get(..tag.len()) else {
            return false;
        };
        candidate.eq_ignore_ascii_case(tag.as_bytes())
            && matches!(
                rest.get(tag.len()),
                None | Some(b' ' | b'\t' | b'\n' | b'>' | b'/')
            )
    })
}
1813
1814/// Iterate `node`'s block-level emission, handling `HTML_BLOCK` splitting
1815/// (one HTML block can project as several pandoc-native blocks under
1816/// `markdown_in_html_blocks`) while keeping every other kind one-block.
1817fn collect_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1818    if node.kind() == SyntaxKind::HTML_BLOCK_DIV {
1819        // `HTML_BLOCK_DIV` is the parser's explicit `<div>` retag. The
1820        // structural projector walks lifted CST children directly —
1821        // all balanced `<div>` shapes lift at parse time.
1822        out.push(html_div_block(node));
1823        return;
1824    }
1825    if node.kind() == SyntaxKind::HTML_BLOCK {
1826        // Opaque HTML block — comments, PI, verbatim (`<pre>`, `<style>`,
1827        // `<script>`, `<textarea>`), void inline-block tags, and any
1828        // strict/inline-block tag the parser couldn't lift. The byte
1829        // walker splits these into per-tag RawBlocks plus interior text.
1830        emit_html_block(node, out);
1831        return;
1832    }
1833    if let Some(b) = block_from(node) {
1834        out.push(b);
1835    }
1836}
1837
1838/// Reparse `text` as Pandoc-flavored markdown and return its top-level
1839/// blocks. Unlike `parse_cell_text_blocks`, leaves `Para` as `Para` — the
1840/// caller decides whether the surrounding context demands `Plain`.
1841fn parse_pandoc_blocks(text: &str) -> Vec<Block> {
1842    if text.trim().is_empty() {
1843        return Vec::new();
1844    }
1845    let opts = crate::ParserOptions {
1846        flavor: crate::Flavor::Pandoc,
1847        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
1848        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
1849        ..crate::ParserOptions::default()
1850    };
1851    let doc = crate::parse(text, Some(opts));
1852    // Swap REFS_CTX with one built from the inner CST so heading auto-ids,
1853    // reference-link defs, and footnote defs inside the recursive parse
1854    // resolve against inner offsets/labels rather than the outer document's.
1855    // Outer refs/footnotes/heading-id history are inherited so a `<div>`
1856    // body can use a label/footnote defined outside, and inner heading
1857    // slugs disambiguate against outer headings. Pandoc parses
1858    // `<div>...</div>` natively in one pass, so this approximation
1859    // matches the common case (outer-def-before-inner-use, inner-loses
1860    // for shared keys); offset-aware document-order resolution would be
1861    // needed for full parity but is not exercised by current corpus.
1862    let outer = REFS_CTX.with(|c| std::mem::take(&mut *c.borrow_mut()));
1863    let inner_ctx = build_refs_ctx_inherited(&doc, Some(&outer));
1864    REFS_CTX.with(|c| *c.borrow_mut() = inner_ctx);
1865    let mut out = Vec::new();
1866    for child in doc.children() {
1867        collect_block(&child, &mut out);
1868    }
1869    REFS_CTX.with(|c| *c.borrow_mut() = outer);
1870    out
1871}
1872
1873fn tex_block(node: &SyntaxNode) -> Block {
1874    let mut content = node.text().to_string();
1875    while content.ends_with('\n') {
1876        content.pop();
1877    }
1878    Block::RawBlock("tex".to_string(), content)
1879}
1880
1881fn fenced_div(node: &SyntaxNode) -> Block {
1882    let attr = node
1883        .children()
1884        .find(|c| c.kind() == SyntaxKind::DIV_FENCE_OPEN)
1885        .and_then(|open| open.children().find(|c| c.kind() == SyntaxKind::DIV_INFO))
1886        .map(|info| {
1887            // Structured `{...}` bodies are read straight from the CST; bare-word
1888            // shorthand and opaque/empty bodies still go through `parse_div_info`.
1889            if attr_node_is_structured(&info) {
1890                attr_from_attribute_node(&info)
1891            } else {
1892                parse_div_info(info.text().to_string().trim())
1893            }
1894        })
1895        .unwrap_or_default();
1896    let mut blocks = Vec::new();
1897    for child in node.children() {
1898        match child.kind() {
1899            SyntaxKind::DIV_FENCE_OPEN | SyntaxKind::DIV_FENCE_CLOSE => {}
1900            _ => collect_block(&child, &mut blocks),
1901        }
1902    }
1903    Block::Div(attr, blocks)
1904}
1905
1906/// Parse pandoc div info: either `{#id .class1 .class2 key=value}` or a single
1907/// bare class name like `Warning`.
1908fn parse_div_info(info: &str) -> Attr {
1909    if info.starts_with('{') && info.ends_with('}') {
1910        return parse_attr_block(&info[1..info.len() - 1]);
1911    }
1912    if !info.is_empty() {
1913        return Attr {
1914            id: String::new(),
1915            classes: vec![info.to_string()],
1916            kvs: Vec::new(),
1917        };
1918    }
1919    Attr::default()
1920}
1921
1922/// Whether an attribute-bearing node (`ATTRIBUTE`, `DIV_INFO`, …) carries the
1923/// structured `ATTR_*` children the parser emits for a Pandoc `{...}` body. When
1924/// false the node is an opaque single token (bare-word, raw-inline `{=format}`,
1925/// MMD `[#id]`, malformed/empty body) and callers fall back to reparsing.
1926fn attr_node_is_structured(node: &SyntaxNode) -> bool {
1927    node.children_with_tokens().any(|el| {
1928        matches!(
1929            el.kind(),
1930            SyntaxKind::ATTR_ID | SyntaxKind::ATTR_CLASS | SyntaxKind::ATTR_KEY_VALUE
1931        )
1932    })
1933}
1934
1935/// Build an `Attr` from an `ATTRIBUTE` node, reading structured `ATTR_*`
1936/// children when the parser emitted them and otherwise reparsing the opaque
1937/// `{...}` body. The structured path mirrors [`parse_attr_block`] semantics
1938/// (only `.`-prefixed tokens are classes; values strip a `"` pair) so projector
1939/// output is unchanged.
1940fn attr_from_attribute_node(attr_node: &SyntaxNode) -> Attr {
1941    if !attr_node_is_structured(attr_node) {
1942        let raw = attr_node.text().to_string();
1943        return raw
1944            .trim()
1945            .strip_prefix('{')
1946            .and_then(|s| s.strip_suffix('}'))
1947            .map(parse_attr_block)
1948            .unwrap_or_default();
1949    }
1950
1951    read_bare_attr_children(attr_node)
1952}
1953
1954/// Walk the bare `ATTR_ID` / `ATTR_CLASS` / `ATTR_KEY_VALUE` children of a node
1955/// (as emitted by `emit_attribute_node` and `emit_code_info_attrs`) into an
1956/// `Attr`, mirroring [`parse_attr_block`] semantics: `#id` strips its `#`; only
1957/// `.`-prefixed ATTR_CLASS tokens are classes (`=format` pseudo-classes are
1958/// dropped); ATTR_VALUE strips a `"` pair. Callers handle any `CODE_LANGUAGE`
1959/// token separately.
1960fn read_bare_attr_children(node: &SyntaxNode) -> Attr {
1961    let mut attr = Attr::default();
1962    for el in node.children_with_tokens() {
1963        match el.kind() {
1964            SyntaxKind::ATTR_ID => {
1965                if let Some(t) = el.as_token() {
1966                    attr.id = t.text().strip_prefix('#').unwrap_or(t.text()).to_string();
1967                }
1968            }
1969            SyntaxKind::ATTR_CLASS => {
1970                // `=format` pseudo-classes are not `.`-prefixed; `parse_attr_block`
1971                // never produced them, so drop them here for output parity.
1972                if let Some(c) = el.as_token().and_then(|t| t.text().strip_prefix('.')) {
1973                    attr.classes.push(c.to_string());
1974                }
1975            }
1976            SyntaxKind::ATTR_KEY_VALUE => {
1977                if let Some(kv) = el.as_node() {
1978                    let key = attr_kv_child_text(kv, SyntaxKind::ATTR_KEY);
1979                    if !key.is_empty() {
1980                        let value = strip_attr_value_quotes(&attr_kv_child_text(
1981                            kv,
1982                            SyntaxKind::ATTR_VALUE,
1983                        ));
1984                        attr.kvs.push((key, value));
1985                    }
1986                }
1987            }
1988            _ => {}
1989        }
1990    }
1991    attr
1992}
1993
1994/// Text of the first child token of `kv` with the given kind, or empty.
1995fn attr_kv_child_text(kv: &SyntaxNode, kind: SyntaxKind) -> String {
1996    kv.children_with_tokens()
1997        .find(|el| el.kind() == kind)
1998        .and_then(|el| el.as_token().map(|t| t.text().to_string()))
1999        .unwrap_or_default()
2000}
2001
/// Remove one surrounding pair of double quotes from an attribute value,
/// as [`parse_attr_block`] does. Values without a leading and trailing `"`
/// — including single-quoted ones — are returned unchanged.
fn strip_attr_value_quotes(raw: &str) -> String {
    raw.strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(raw)
        .to_string()
}
2011
/// Remove one surrounding pair of matching quotes — either `"` or `'` —
/// from an HTML attribute value. HTML accepts both quote styles and both
/// are syntax, not content. Values that are not wrapped in a matching pair
/// come back unchanged.
fn strip_any_quotes(raw: &str) -> String {
    for quote in ['"', '\''] {
        if let Some(inner) = raw
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner.to_string();
        }
    }
    raw.to_string()
}
2024
2025/// Build an `Attr` from a structural `HTML_ATTRS` node (or the legacy
2026/// native-span `SPAN_ATTRIBUTES` node, which carries the same HTML syntax),
2027/// reading the bare `ATTR_*` children the parser emits. HTML ids/classes carry
2028/// no `#`/`.` marker, and values may use either quote style (both stripped). An
2029/// opaque node (no recognized attributes) yields `Attr::default()`.
2030fn attr_from_html_attrs_node(node: &SyntaxNode) -> Attr {
2031    let mut attr = Attr::default();
2032    for el in node.children_with_tokens() {
2033        match el.kind() {
2034            SyntaxKind::ATTR_ID => {
2035                if attr.id.is_empty()
2036                    && let Some(t) = el.as_token()
2037                {
2038                    attr.id = t.text().to_string();
2039                }
2040            }
2041            SyntaxKind::ATTR_CLASS => {
2042                if let Some(t) = el.as_token() {
2043                    attr.classes.push(t.text().to_string());
2044                }
2045            }
2046            SyntaxKind::ATTR_KEY_VALUE => {
2047                if let Some(kv) = el.as_node() {
2048                    let key = attr_kv_child_text(kv, SyntaxKind::ATTR_KEY);
2049                    if !key.is_empty() {
2050                        let value =
2051                            strip_any_quotes(&attr_kv_child_text(kv, SyntaxKind::ATTR_VALUE));
2052                        attr.kvs.push((key, value));
2053                    }
2054                }
2055            }
2056            _ => {}
2057        }
2058    }
2059    attr
2060}
2061
2062/// Read a child `ATTRIBUTE` (node or token) on `parent` into an `Attr`. Returns
2063/// `Attr::default()` if no attribute is attached or the body isn't
2064/// `{...}`-shaped.
2065fn extract_attr_from_node(parent: &SyntaxNode) -> Attr {
2066    parent
2067        .children_with_tokens()
2068        .find(|el| el.kind() == SyntaxKind::ATTRIBUTE)
2069        .map(|el| match el {
2070            NodeOrToken::Node(n) => attr_from_attribute_node(&n),
2071            NodeOrToken::Token(t) => t
2072                .text()
2073                .trim()
2074                .strip_prefix('{')
2075                .and_then(|s| s.strip_suffix('}'))
2076                .map(parse_attr_block)
2077                .unwrap_or_default(),
2078        })
2079        .unwrap_or_default()
2080}
2081
2082/// Parse the body of an attribute block like `#my-id .class1 .class2 key=value`.
2083/// Whitespace-separated. Tokens starting with `#` are id, `.` are classes,
2084/// `key=value` (optionally quoted value) are kvs.
2085fn parse_attr_block(s: &str) -> Attr {
2086    let mut id = String::new();
2087    let mut classes: Vec<String> = Vec::new();
2088    let mut kvs: Vec<(String, String)> = Vec::new();
2089    let bytes = s.as_bytes();
2090    let mut i = 0usize;
2091    while i < bytes.len() {
2092        match bytes[i] {
2093            b' ' | b'\t' | b'\n' | b'\r' => {
2094                i += 1;
2095            }
2096            b'#' => {
2097                let start = i + 1;
2098                let mut j = start;
2099                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
2100                    j += 1;
2101                }
2102                id = s[start..j].to_string();
2103                i = j;
2104            }
2105            b'.' => {
2106                let start = i + 1;
2107                let mut j = start;
2108                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
2109                    j += 1;
2110                }
2111                classes.push(s[start..j].to_string());
2112                i = j;
2113            }
2114            _ => {
2115                // Read key up to `=` or whitespace.
2116                let key_start = i;
2117                while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=') {
2118                    i += 1;
2119                }
2120                let key = s[key_start..i].to_string();
2121                if i < bytes.len() && bytes[i] == b'=' {
2122                    i += 1;
2123                    let value = if i < bytes.len() && bytes[i] == b'"' {
2124                        i += 1;
2125                        let v_start = i;
2126                        while i < bytes.len() && bytes[i] != b'"' {
2127                            i += 1;
2128                        }
2129                        let v = s[v_start..i].to_string();
2130                        if i < bytes.len() {
2131                            i += 1;
2132                        }
2133                        v
2134                    } else {
2135                        let v_start = i;
2136                        while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
2137                            i += 1;
2138                        }
2139                        s[v_start..i].to_string()
2140                    };
2141                    kvs.push((key, value));
2142                } else if !key.is_empty() {
2143                    // Bare token (legacy class form).
2144                    classes.push(key);
2145                }
2146            }
2147        }
2148    }
2149    Attr { id, classes, kvs }
2150}
2151
2152fn definition_list(node: &SyntaxNode) -> Block {
2153    let items: Vec<(Vec<Inline>, Vec<Vec<Block>>)> = node
2154        .children()
2155        .filter(|c| c.kind() == SyntaxKind::DEFINITION_ITEM)
2156        .map(|item| {
2157            let term = item
2158                .children()
2159                .find(|c| c.kind() == SyntaxKind::TERM)
2160                .map(|t| coalesce_inlines(inlines_from(&t)))
2161                .unwrap_or_default();
2162            let loose = is_loose_definition_item(&item);
2163            let defs: Vec<Vec<Block>> = item
2164                .children()
2165                .filter(|c| c.kind() == SyntaxKind::DEFINITION)
2166                .map(|d| definition_blocks(&d, loose))
2167                .collect();
2168            (term, defs)
2169        })
2170        .collect();
2171    Block::DefinitionList(items)
2172}
2173
2174/// A `DEFINITION_ITEM` is "loose" iff there is a `BLANK_LINE` between the
2175/// `TERM` (or its preceding term continuations) and the first `DEFINITION`.
2176/// Pandoc renders loose definitions with `Para` blocks; tight ones use
2177/// `Plain`. The looseness is per-item (per-term group), not per-definition,
2178/// and applies to *all* definitions in the item — see pandoc's behavior.
2179fn is_loose_definition_item(item: &SyntaxNode) -> bool {
2180    let mut saw_term = false;
2181    for child in item.children_with_tokens() {
2182        if let NodeOrToken::Node(n) = child {
2183            match n.kind() {
2184                SyntaxKind::TERM => {
2185                    saw_term = true;
2186                }
2187                SyntaxKind::BLANK_LINE if saw_term => {
2188                    return true;
2189                }
2190                SyntaxKind::DEFINITION => {
2191                    return false;
2192                }
2193                _ => {}
2194            }
2195        }
2196    }
2197    false
2198}
2199
2200fn definition_blocks(def_node: &SyntaxNode, loose: bool) -> Vec<Block> {
2201    // Definition body content lives at the marker's content offset (`: ` →
2202    // 2 columns by default). The CST keeps that indent on each line, so any
2203    // CODE_BLOCK descendant needs the offset stripped before pandoc-native
2204    // projection.
2205    let extra = definition_content_offset(def_node);
2206    let mut out = Vec::new();
2207    for child in def_node.children() {
2208        match child.kind() {
2209            SyntaxKind::PLAIN => {
2210                let inlines = coalesce_inlines(inlines_from(&child));
2211                if loose {
2212                    out.push(Block::Para(inlines));
2213                } else {
2214                    out.push(Block::Plain(inlines));
2215                }
2216            }
2217            SyntaxKind::PARAGRAPH => {
2218                out.push(Block::Para(coalesce_inlines(inlines_from(&child))));
2219            }
2220            SyntaxKind::CODE_BLOCK if extra > 0 => {
2221                out.push(indented_code_block_with_extra_strip(&child, extra));
2222            }
2223            _ => collect_block(&child, &mut out),
2224        }
2225    }
2226    out
2227}
2228
2229/// Visual column where definition body content starts. The strip later runs
2230/// against the *tab-expanded* body, so this offset must be measured in
2231/// columns (tabs round to the next 4-col stop), not raw chars: `:\t` reaches
2232/// col 4, which is the column the body's strip should remove.
2233fn definition_content_offset(def_node: &SyntaxNode) -> usize {
2234    let mut col = 0usize;
2235    let mut saw_marker = false;
2236    for el in def_node.children_with_tokens() {
2237        if let NodeOrToken::Token(t) = el {
2238            match t.kind() {
2239                SyntaxKind::DEFINITION_MARKER => {
2240                    col = advance_col(col, t.text());
2241                    saw_marker = true;
2242                }
2243                SyntaxKind::WHITESPACE if saw_marker => {
2244                    return advance_col(col, t.text());
2245                }
2246                _ if saw_marker => return col,
2247                _ => {}
2248            }
2249        } else if saw_marker {
2250            return col;
2251        }
2252    }
2253    col
2254}
2255
/// Move a column counter across `s`, starting from `start`. A `\t` jumps to
/// the next multiple of 4; every other character counts as one column.
fn advance_col(start: usize, s: &str) -> usize {
    s.chars().fold(start, |col, ch| match ch {
        '\t' => (col / 4 + 1) * 4,
        _ => col + 1,
    })
}
2269
2270fn line_block(node: &SyntaxNode) -> Block {
2271    let lines: Vec<Vec<Inline>> = node
2272        .children()
2273        .filter(|c| c.kind() == SyntaxKind::LINE_BLOCK_LINE)
2274        .map(|line| {
2275            let mut out = Vec::new();
2276            for el in line.children_with_tokens() {
2277                match el {
2278                    NodeOrToken::Token(t) => match t.kind() {
2279                        SyntaxKind::LINE_BLOCK_MARKER | SyntaxKind::NEWLINE => {}
2280                        _ => push_token_inline(&t, &mut out),
2281                    },
2282                    NodeOrToken::Node(n) => out.push(inline_from_node(&n)),
2283                }
2284            }
2285            coalesce_inlines(out)
2286        })
2287        .collect();
2288    Block::LineBlock(lines)
2289}
2290
2291fn latex_command_inline(node: &SyntaxNode) -> Inline {
2292    let content = node.text().to_string();
2293    Inline::RawInline("tex".to_string(), content)
2294}
2295
2296fn bracketed_span_inline(node: &SyntaxNode) -> Inline {
2297    let is_html = node
2298        .children_with_tokens()
2299        .any(|el| matches!(&el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::SPAN_BRACKET_OPEN && t.text().starts_with('<')));
2300    let attr = node
2301        .children()
2302        .find(|n| n.kind() == SyntaxKind::SPAN_ATTRIBUTES)
2303        .map(|n| {
2304            if is_html {
2305                // Legacy native-span path: `SPAN_ATTRIBUTES` carries HTML
2306                // `class="..."` syntax, structured into bare ATTR_* children.
2307                attr_from_html_attrs_node(&n)
2308            } else {
2309                // Pandoc bracketed span: `SPAN_ATTRIBUTES` is structured into
2310                // ATTR_* children; `attr_from_attribute_node` reads them (and
2311                // reparses an opaque/empty body via its own fallback).
2312                attr_from_attribute_node(&n)
2313            }
2314        })
2315        .unwrap_or_default();
2316    let content = node
2317        .children()
2318        .find(|c| c.kind() == SyntaxKind::SPAN_CONTENT)
2319        .map(|n| coalesce_inlines(inlines_from(&n)))
2320        .unwrap_or_default();
2321    Inline::Span(attr, content)
2322}
2323
2324fn inline_html_span_inline(node: &SyntaxNode) -> Inline {
2325    let attr = node
2326        .children()
2327        .find(|c| c.kind() == SyntaxKind::HTML_ATTRS)
2328        .map(|n| attr_from_html_attrs_node(&n))
2329        .unwrap_or_default();
2330    let content = node
2331        .children()
2332        .find(|c| c.kind() == SyntaxKind::SPAN_CONTENT)
2333        .map(|n| coalesce_inlines(inlines_from(&n)))
2334        .unwrap_or_default();
2335    Inline::Span(attr, content)
2336}
2337
2338fn pipe_table(node: &SyntaxNode) -> Option<TableData> {
2339    let mut header_cells: Vec<Vec<Inline>> = Vec::new();
2340    let mut body_rows: Vec<Vec<Vec<Inline>>> = Vec::new();
2341    let mut aligns: Vec<&'static str> = Vec::new();
2342    let mut caption_inlines: Vec<Inline> = Vec::new();
2343    let mut caption_attr_from_node: Option<Attr> = None;
2344    for child in node.children() {
2345        match child.kind() {
2346            SyntaxKind::TABLE_HEADER => {
2347                header_cells = pipe_table_cells(&child);
2348            }
2349            SyntaxKind::TABLE_SEPARATOR => {
2350                let raw = child.text().to_string();
2351                aligns = pipe_separator_aligns(&raw);
2352            }
2353            SyntaxKind::TABLE_ROW => {
2354                body_rows.push(pipe_table_cells(&child));
2355            }
2356            SyntaxKind::TABLE_CAPTION => {
2357                let (inlines, attr) = pipe_table_caption(&child);
2358                caption_inlines = inlines;
2359                caption_attr_from_node = attr;
2360            }
2361            _ => {}
2362        }
2363    }
2364    let cols = header_cells
2365        .len()
2366        .max(body_rows.iter().map(Vec::len).max().unwrap_or(0))
2367        .max(aligns.len());
2368    if cols == 0 {
2369        return None;
2370    }
2371    while aligns.len() < cols {
2372        aligns.push("AlignDefault");
2373    }
2374    let head_rows = if header_cells.is_empty() {
2375        Vec::new()
2376    } else {
2377        vec![cells_to_plain_blocks(header_cells, cols)]
2378    };
2379    let body_rows: Vec<Vec<GridCell>> = body_rows
2380        .into_iter()
2381        .map(|cells| cells_to_plain_blocks(cells, cols))
2382        .collect();
2383    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2384    Some(TableData {
2385        attr,
2386        caption: caption_inlines,
2387        aligns,
2388        widths: vec![None; cols],
2389        head_rows,
2390        body_rows,
2391        foot_rows: Vec::new(),
2392    })
2393}
2394
2395fn pipe_table_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
2396    row.children()
2397        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2398        .map(|cell| coalesce_inlines(inlines_from(&cell)))
2399        .collect()
2400}
2401
2402/// Pandoc's `+caption_attributes` extension lifts a trailing `{...}` from a
2403/// table caption into the Table's outer attribute. Walk the caption inlines
2404/// from the right looking for a balanced trailing `{...}` span: a Str
2405/// ending with `}` plus zero or more (Space, Str) pairs back until a Str
2406/// starts with `{`. If found, parse the brace contents as an attribute
2407/// block and drop those inlines (plus any preceding Space) from the caption
2408/// text.
2409fn extract_caption_attrs(mut inlines: Vec<Inline>) -> (Attr, Vec<Inline>) {
2410    let last_str_end = inlines
2411        .iter()
2412        .rposition(|i| matches!(i, Inline::Str(s) if s.ends_with('}')));
2413    let Some(end_idx) = last_str_end else {
2414        return (Attr::default(), inlines);
2415    };
2416    // Walk back to find the Str starting with `{`. Allow only Str/Space
2417    // between (no structural inlines like Emph), since attribute blocks
2418    // are plain text.
2419    let mut start_idx = end_idx;
2420    let mut found_open = false;
2421    loop {
2422        match &inlines[start_idx] {
2423            Inline::Str(s) => {
2424                if s.starts_with('{') {
2425                    found_open = true;
2426                    break;
2427                }
2428            }
2429            Inline::Space => {}
2430            _ => return (Attr::default(), inlines),
2431        }
2432        if start_idx == 0 {
2433            break;
2434        }
2435        start_idx -= 1;
2436    }
2437    if !found_open {
2438        return (Attr::default(), inlines);
2439    }
2440    // Concatenate the Str/Space slice into a flat string, then strip the
2441    // outer braces.
2442    let mut raw = String::new();
2443    for el in &inlines[start_idx..=end_idx] {
2444        match el {
2445            Inline::Str(s) => raw.push_str(s),
2446            Inline::Space => raw.push(' '),
2447            _ => return (Attr::default(), inlines),
2448        }
2449    }
2450    if !(raw.starts_with('{') && raw.ends_with('}')) {
2451        return (Attr::default(), inlines);
2452    }
2453    let inner = &raw[1..raw.len() - 1];
2454    let attr = parse_attr_block(inner);
2455    inlines.truncate(start_idx);
2456    if matches!(inlines.last(), Some(Inline::Space)) {
2457        inlines.pop();
2458    }
2459    (attr, inlines)
2460}
2461
2462/// Resolve `(Attr, caption_inlines)` for a table whose caption has already
2463/// been projected. Prefers a structural ATTRIBUTE node when the parser
2464/// captured one (`+caption_attributes` lift); falls back to the legacy
2465/// trailing-Str scan for older paths.
2466fn resolve_caption_attr(
2467    caption_inlines: Vec<Inline>,
2468    caption_attr_from_node: Option<Attr>,
2469) -> (Attr, Vec<Inline>) {
2470    match caption_attr_from_node {
2471        Some(attr) => (attr, caption_inlines),
2472        None => extract_caption_attrs(caption_inlines),
2473    }
2474}
2475
2476/// Run `pipe_table_caption` over the table node's TABLE_CAPTION child if any,
2477/// returning collected inlines and a structurally-extracted attr (None when
2478/// the parser didn't lift one).
2479fn project_table_caption_from(node: &SyntaxNode) -> (Vec<Inline>, Option<Attr>) {
2480    node.children()
2481        .find(|c| c.kind() == SyntaxKind::TABLE_CAPTION)
2482        .map(|n| pipe_table_caption(&n))
2483        .unwrap_or_else(|| (Vec::new(), None))
2484}
2485
2486fn pipe_table_caption(node: &SyntaxNode) -> (Vec<Inline>, Option<Attr>) {
2487    // Walk all tokens after TABLE_CAPTION_PREFIX and collect inline content.
2488    // The parser lifts a trailing `{...}` attribute block (Pandoc's
2489    // `+caption_attributes`) into a structural ATTRIBUTE node — surface it as
2490    // the table's outer attr instead of projecting it as an inline.
2491    let mut out = Vec::new();
2492    let mut caption_attr: Option<Attr> = None;
2493    let mut after_prefix = false;
2494    for el in node.children_with_tokens() {
2495        match el {
2496            NodeOrToken::Node(n) => {
2497                if n.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
2498                    after_prefix = true;
2499                    continue;
2500                }
2501                if !after_prefix {
2502                    continue;
2503                }
2504                if n.kind() == SyntaxKind::ATTRIBUTE {
2505                    caption_attr = Some(attr_from_attribute_node(&n));
2506                    // Drop any trailing whitespace inline pushed before the attribute.
2507                    if matches!(out.last(), Some(Inline::Space)) {
2508                        out.pop();
2509                    }
2510                    continue;
2511                }
2512                out.push(inline_from_node(&n));
2513            }
2514            NodeOrToken::Token(t) => {
2515                if t.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
2516                    after_prefix = true;
2517                    continue;
2518                }
2519                if !after_prefix {
2520                    continue;
2521                }
2522                if t.kind() == SyntaxKind::ATTRIBUTE {
2523                    let raw = t.text();
2524                    let inner = raw.trim().trim_start_matches('{').trim_end_matches('}');
2525                    caption_attr = Some(parse_attr_block(inner));
2526                    if matches!(out.last(), Some(Inline::Space)) {
2527                        out.pop();
2528                    }
2529                    continue;
2530                }
2531                push_token_inline(&t, &mut out);
2532            }
2533        }
2534    }
2535    (coalesce_inlines(out), caption_attr)
2536}
2537
/// Derive per-column alignments from the raw text of a pipe-table separator
/// line such as `|:--|:-:|--:|---|`.
///
/// Each segment maps to a pandoc alignment name: a colon on both ends is
/// `AlignCenter`, leading only `AlignLeft`, trailing only `AlignRight`,
/// none `AlignDefault`. Columns may be delimited by `|` or, as pandoc allows
/// for orgtbl-mode compatibility, by `+` (`|---+---|`).
fn pipe_separator_aligns(raw: &str) -> Vec<&'static str> {
    // Strip surrounding whitespace before pipe-stripping so an indented
    // pipe-table separator (e.g. fenced-div content at column ≥1) doesn't
    // leave a leading whitespace segment that then counts as a phantom
    // column.
    let trimmed = raw.trim();
    let inner = trimmed.trim_start_matches('|').trim_end_matches('|');
    inner
        // `+` is an alternative interior column delimiter in pandoc's pipe
        // table separator; treating it like `|` keeps the column count right.
        .split(['|', '+'])
        .map(|seg| {
            let s = seg.trim();
            match (s.starts_with(':'), s.ends_with(':')) {
                (true, true) => "AlignCenter",
                (true, false) => "AlignLeft",
                (false, true) => "AlignRight",
                (false, false) => "AlignDefault",
            }
        })
        .collect()
}
2560
2561fn cells_to_plain_blocks(cells: Vec<Vec<Inline>>, cols: usize) -> Vec<GridCell> {
2562    let mut out: Vec<GridCell> = cells
2563        .into_iter()
2564        .map(|inlines| {
2565            let blocks = if inlines.is_empty() {
2566                Vec::new()
2567            } else {
2568                vec![Block::Plain(inlines)]
2569            };
2570            GridCell::no_span(blocks)
2571        })
2572        .collect();
2573    while out.len() < cols {
2574        out.push(GridCell::no_span(Vec::new()));
2575    }
2576    out
2577}
2578
/// Render an `f64` the way pandoc's native output shows a `Double`.
///
/// Magnitudes in `[0.1, 1e7)` print in plain decimal, everything else in
/// scientific notation; zero prints as `0.0`. A fractional part is always
/// present (`1.0`, not `1`; `1.0e7`, not `1e7`). Used for `ColWidth N`
/// rendering, where N is in `(0.0, 1.0)` for our cases.
fn show_double(x: f64) -> String {
    if x == 0.0 {
        return String::from("0.0");
    }

    if !(0.1..1e7).contains(&x.abs()) {
        // Rust's `{:e}` already matches Haskell's mantissa/exponent shape
        // (`8.333333333333333e-2`); only a whole-number mantissa needs `.0`.
        let scientific = format!("{x:e}");
        return match scientific.split_once('e') {
            Some((mantissa, exponent)) if !mantissa.contains('.') => {
                format!("{mantissa}.0e{exponent}")
            }
            _ => scientific,
        };
    }

    let plain = x.to_string();
    if plain.contains(['.', 'e']) {
        plain
    } else {
        plain + ".0"
    }
}
2609
2610// ----- simple table -------------------------------------------------------
2611
2612/// Project a `SIMPLE_TABLE` node. Pandoc's "simple" table form:
2613///
2614/// ```text
2615///    Col1     Col2
2616/// -------- --------    ← TABLE_SEPARATOR (dash runs define columns)
2617///   data1    data2
2618///
2619/// Table: optional caption
2620/// ```
2621///
2622/// Headerless variant skips the header row and uses dash runs both above
2623/// and below the data. Alignment is derived from each header cell's
2624/// position relative to its column's dash run boundaries. For headerless
2625/// tables, alignment derives from the *first data row*.
2626fn simple_table(node: &SyntaxNode) -> Option<TableData> {
2627    let separator = node
2628        .children()
2629        .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)?;
2630    let cols = simple_table_dash_runs(&separator);
2631    if cols.is_empty() {
2632        return None;
2633    }
2634    let header = node
2635        .children()
2636        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
2637    // Body rows: every TABLE_ROW. Drop a trailing all-dashes row — that is
2638    // the closing `---` separator of a headerless table that the parser
2639    // currently emits as a TABLE_ROW of dash cells.
2640    let mut body_rows_nodes: Vec<SyntaxNode> = node
2641        .children()
2642        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
2643        .collect();
2644    if header.is_none()
2645        && body_rows_nodes
2646            .last()
2647            .map(simple_table_row_is_all_dashes)
2648            .unwrap_or(false)
2649    {
2650        body_rows_nodes.pop();
2651    }
2652    // Alignment: from header if present, else from the first data row.
2653    let aligns = if let Some(h) = &header {
2654        simple_table_aligns(h, &cols)
2655    } else if let Some(r0) = body_rows_nodes.first() {
2656        simple_table_aligns(r0, &cols)
2657    } else {
2658        vec!["AlignDefault"; cols.len()]
2659    };
2660    let head_rows = match &header {
2661        Some(h) => {
2662            let cells: Vec<Vec<Inline>> = simple_table_row_cells(h);
2663            vec![cells_to_plain_blocks(cells, cols.len())]
2664        }
2665        None => Vec::new(),
2666    };
2667    let body_rows: Vec<Vec<GridCell>> = body_rows_nodes
2668        .iter()
2669        .map(|r| cells_to_plain_blocks(simple_table_row_cells(r), cols.len()))
2670        .collect();
2671    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
2672    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2673    Some(TableData {
2674        attr,
2675        caption: caption_inlines,
2676        aligns,
2677        widths: vec![None; cols.len()],
2678        head_rows,
2679        body_rows,
2680        foot_rows: Vec::new(),
2681    })
2682}
2683
2684/// Return the `(start_col, end_col)` (inclusive) of each dash run in a
2685/// `TABLE_SEPARATOR` node, where columns are 0-based offsets within the
2686/// separator's line.
2687fn simple_table_dash_runs(separator: &SyntaxNode) -> Vec<(usize, usize)> {
2688    let raw = separator.text().to_string();
2689    let line = raw.trim_end_matches(['\n', '\r']);
2690    let mut runs = Vec::new();
2691    let mut start: Option<usize> = None;
2692    for (i, ch) in line.char_indices() {
2693        if ch == '-' {
2694            if start.is_none() {
2695                start = Some(i);
2696            }
2697        } else if let Some(s) = start.take() {
2698            runs.push((s, i - 1));
2699        }
2700    }
2701    if let Some(s) = start.take() {
2702        runs.push((s, line.len() - 1));
2703    }
2704    runs
2705}
2706
2707fn simple_table_row_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
2708    // Zero-width TABLE_CELL nodes represent positionally-empty columns
2709    // (e.g. case 0094, where header words land in only some of the
2710    // dash-defined columns). Keep them as empty cells so the row's
2711    // column ordering matches the dash separator.
2712    row.children()
2713        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2714        .map(|cell| coalesce_inlines(inlines_from(&cell)))
2715        .collect()
2716}
2717
2718fn simple_table_row_is_all_dashes(row: &SyntaxNode) -> bool {
2719    let mut had_cell = false;
2720    for cell in row
2721        .children()
2722        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2723    {
2724        let text = cell.text().to_string();
2725        let trimmed = text.trim();
2726        if trimmed.is_empty() {
2727            continue;
2728        }
2729        had_cell = true;
2730        if !trimmed.chars().all(|c| c == '-') {
2731            return false;
2732        }
2733    }
2734    had_cell
2735}
2736
2737/// Derive alignments for a simple-table header (or first data row) by
2738/// comparing each cell's *visible* (whitespace-trimmed) column range to
2739/// the corresponding dash run. Multiline-table TABLE_CELL nodes include
2740/// the padding whitespace within the column slice, so we have to peel
2741/// off leading/trailing whitespace before applying the flushness rule.
2742/// (Single-line simple-table cells already exclude padding whitespace,
2743/// but the trim is a no-op there.)
2744fn simple_table_aligns(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<&'static str> {
2745    let row_start: u32 = row.text_range().start().into();
2746    let mut cell_ranges: Vec<(usize, usize)> = Vec::new();
2747    for cell in row
2748        .children()
2749        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2750    {
2751        if cell.text_range().is_empty() {
2752            continue;
2753        }
2754        let text = cell.text().to_string();
2755        let lstrip = text.chars().take_while(|c| *c == ' ' || *c == '\t').count();
2756        let rstrip = text
2757            .chars()
2758            .rev()
2759            .take_while(|c| *c == ' ' || *c == '\t')
2760            .count();
2761        let trimmed_len = text.chars().count().saturating_sub(lstrip + rstrip);
2762        if trimmed_len == 0 {
2763            continue;
2764        }
2765        let start: u32 = cell.text_range().start().into();
2766        let s = (start - row_start) as usize;
2767        let visible_start = s + lstrip;
2768        let visible_end = visible_start + trimmed_len - 1;
2769        cell_ranges.push((visible_start, visible_end));
2770    }
2771    cols.iter()
2772        .map(|(col_start, col_end)| {
2773            let cell = cell_ranges
2774                .iter()
2775                .find(|(cs, ce)| ce >= col_start && cs <= col_end);
2776            match cell {
2777                Some((cs, ce)) => {
2778                    let left_flush = cs == col_start;
2779                    let right_flush = ce == col_end;
2780                    match (left_flush, right_flush) {
2781                        (true, true) => "AlignDefault",
2782                        (true, false) => "AlignLeft",
2783                        (false, true) => "AlignRight",
2784                        (false, false) => "AlignCenter",
2785                    }
2786                }
2787                None => "AlignDefault",
2788            }
2789        })
2790        .collect()
2791}
2792
2793// ----- grid table ---------------------------------------------------------
2794
2795/// Project a `GRID_TABLE` node into pandoc-native shape. Implements a
2796/// `gridtables`-style 2D layout pass:
2797///
2798/// 1. Collect every line of the table (excluding caption) into a padded
2799///    char grid, tracking which `TABLE_HEADER` / `TABLE_ROW` /
2800///    `TABLE_FOOTER` parent each line came from.
2801/// 2. The canonical column boundaries are the union of `+` positions
2802///    across every "sep-style" line (lines made of `+`/`-`/`=`/`:`/`|`/`
2803///    `). The canonical row boundaries are the indices of those
2804///    sep-style lines. So a partial separator like
2805///    `|        +----+----+` contributes both to canonical column
2806///    positions and to row block boundaries (it ends some cells and
2807///    starts others mid-row).
2808/// 3. Cells are detected by walking `(row_block, col)` in scan order and,
2809///    at each unoccupied position whose top-left `+` is real, finding the
2810///    smallest valid bounding rectangle: top/bottom edges in
2811///    `{-,=,:,+}`, left/right edges in `{|,+}`, no fully-spanning
2812///    interior separator that would split it. RowSpan/ColSpan are
2813///    derived from the canonical row/col indices of the cell's corners.
2814///
2815/// Column widths use the alignment separator (the one carrying `:`s) if
2816/// present, else the first separator — both via `grid_dash_widths`. The
2817/// alignment row also drives per-column alignment via
2818/// `grid_separator_aligns`.
2819#[allow(clippy::needless_range_loop)]
2820fn grid_table(node: &SyntaxNode) -> Option<TableData> {
2821    // Collect all lines except the caption, tagged with their parent kind.
2822    let mut tagged: Vec<(SyntaxKind, String)> = Vec::new();
2823    for child in node.children() {
2824        if child.kind() == SyntaxKind::TABLE_CAPTION {
2825            continue;
2826        }
2827        let text = child.text().to_string();
2828        for line in text.split_inclusive('\n') {
2829            let trimmed = line.trim_end_matches('\n');
2830            tagged.push((child.kind(), trimmed.to_string()));
2831        }
2832    }
2833    if tagged.is_empty() {
2834        return None;
2835    }
2836
2837    // Pad lines into a 2D char grid.
2838    let max_width = tagged
2839        .iter()
2840        .map(|(_, l)| l.chars().count())
2841        .max()
2842        .unwrap_or(0);
2843    let grid: Vec<Vec<char>> = tagged
2844        .iter()
2845        .map(|(_, l)| {
2846            let mut chars: Vec<char> = l.chars().collect();
2847            chars.resize(max_width, ' ');
2848            chars
2849        })
2850        .collect();
2851    let nlines = grid.len();
2852
2853    // A line is "sep-style" if it contains at least one `+` and no chars
2854    // outside `+`/`-`/`=`/`:`/`|`/` `. Partial separators (lines mixing
2855    // `|` and `+`) qualify; content lines do not.
2856    let is_sep_line: Vec<bool> = grid
2857        .iter()
2858        .map(|row| {
2859            row.contains(&'+')
2860                && row
2861                    .iter()
2862                    .all(|&c| matches!(c, '+' | '-' | '=' | ':' | '|' | ' '))
2863        })
2864        .collect();
2865
2866    // Canonical column boundaries: union of `+` columns across all sep-style lines.
2867    let mut col_set: std::collections::BTreeSet<usize> = std::collections::BTreeSet::new();
2868    for (i, row) in grid.iter().enumerate() {
2869        if !is_sep_line[i] {
2870            continue;
2871        }
2872        for (j, &c) in row.iter().enumerate() {
2873            if c == '+' {
2874                col_set.insert(j);
2875            }
2876        }
2877    }
2878    let cols_pos: Vec<usize> = col_set.into_iter().collect();
2879    if cols_pos.len() < 2 {
2880        return None;
2881    }
2882    let ncols = cols_pos.len() - 1;
2883
2884    // Canonical row boundaries: line indices of sep-style lines.
2885    let row_seps: Vec<usize> = (0..nlines).filter(|&i| is_sep_line[i]).collect();
2886    if row_seps.len() < 2 {
2887        return None;
2888    }
2889    let nrows = row_seps.len() - 1;
2890
2891    // Block kind per row block: head if any non-sep line in the block came
2892    // from a TABLE_HEADER, foot if from TABLE_FOOTER, else body.
2893    let mut block_kind: Vec<&'static str> = vec!["body"; nrows];
2894    for r in 0..nrows {
2895        let start = row_seps[r];
2896        let end = row_seps[r + 1];
2897        for i in (start + 1)..end {
2898            match tagged[i].0 {
2899                SyntaxKind::TABLE_HEADER => block_kind[r] = "head",
2900                SyntaxKind::TABLE_FOOTER => block_kind[r] = "foot",
2901                _ => {}
2902            }
2903        }
2904    }
2905
2906    // Detect cells.
2907    let mut occupied = vec![vec![false; ncols]; nrows];
2908    // (start_row, start_col, row_span, col_span, content_text)
2909    let mut cells: Vec<(usize, usize, u32, u32, String)> = Vec::new();
2910    for sr in 0..nrows {
2911        for sc in 0..ncols {
2912            if occupied[sr][sc] {
2913                continue;
2914            }
2915            let i = row_seps[sr];
2916            let j = cols_pos[sc];
2917            if grid[i][j] != '+' {
2918                // No corner here — the canonical column is missing on this
2919                // sep line, meaning the cell that owns this position must
2920                // have been emitted earlier and `occupied` should already be
2921                // set. If not, the table is malformed; skip.
2922                continue;
2923            }
2924            let Some((er, ec, content)) = find_grid_cell(&grid, i, j, sr, sc, &cols_pos, &row_seps)
2925            else {
2926                continue;
2927            };
2928            let row_span = (er - sr) as u32;
2929            let col_span = (ec - sc) as u32;
2930            for r in sr..er {
2931                for c in sc..ec {
2932                    occupied[r][c] = true;
2933                }
2934            }
2935            cells.push((sr, sc, row_span, col_span, content));
2936        }
2937    }
2938
2939    // Group cells by row block and convert to GridCells. Within each block,
2940    // emit cells in canonical column order.
2941    let mut head_rows: Vec<Vec<GridCell>> = Vec::new();
2942    let mut body_rows: Vec<Vec<GridCell>> = Vec::new();
2943    let mut foot_rows: Vec<Vec<GridCell>> = Vec::new();
2944    for r in 0..nrows {
2945        let mut row_cells: Vec<&(usize, usize, u32, u32, String)> =
2946            cells.iter().filter(|(sr, _, _, _, _)| *sr == r).collect();
2947        row_cells.sort_by_key(|(_, sc, _, _, _)| *sc);
2948        let row: Vec<GridCell> = row_cells
2949            .into_iter()
2950            .map(|(_, _, rs, cs, text)| {
2951                let blocks = parse_grid_cell_text(text);
2952                GridCell {
2953                    row_span: *rs,
2954                    col_span: *cs,
2955                    blocks,
2956                }
2957            })
2958            .collect();
2959        match block_kind[r] {
2960            "head" => head_rows.push(row),
2961            "foot" => foot_rows.push(row),
2962            _ => body_rows.push(row),
2963        }
2964    }
2965
2966    // Column widths and alignments. Pick the alignment-bearing separator
2967    // for both (or fall back to the first separator).
2968    let alignment_sep = node
2969        .children()
2970        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2971        .find(|c| c.text().to_string().contains(':'))
2972        .or_else(|| {
2973            node.children()
2974                .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2975        })?;
2976    let widths = grid_dash_widths(&alignment_sep);
2977    let aligns_raw = alignment_sep.text().to_string();
2978    let aligns = if aligns_raw.contains(':') {
2979        grid_separator_aligns(&aligns_raw, ncols)
2980    } else {
2981        vec!["AlignDefault"; ncols]
2982    };
2983
2984    // Caption.
2985    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
2986    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2987
2988    Some(TableData {
2989        attr,
2990        caption: caption_inlines,
2991        aligns,
2992        widths: widths.into_iter().map(Some).collect(),
2993        head_rows,
2994        body_rows,
2995        foot_rows,
2996    })
2997}
2998
/// Locate the smallest valid grid-table cell whose top-left `+` sits at
/// grid position `(i, j)`; `(sr, sc)` are the canonical row / column
/// indices of that corner.
///
/// On success returns `(end_row_idx, end_col_idx, content_text)`: the cell
/// spans canonical rows `sr..end_row_idx` and canonical columns
/// `sc..end_col_idx`. The content is the text between the cell's borders
/// with one leading space of padding removed per line, trailing whitespace
/// trimmed, leading/trailing blank lines dropped, and the remaining lines
/// joined with `\n`. Returns `None` if no candidate rectangle is valid.
#[allow(clippy::needless_range_loop)]
fn find_grid_cell(
    grid: &[Vec<char>],
    i: usize,
    j: usize,
    sr: usize,
    sc: usize,
    cols_pos: &[usize],
    row_seps: &[usize],
) -> Option<(usize, usize, String)> {
    let is_horizontal = |ch: char| matches!(ch, '-' | '=' | ':' | '+');
    let is_vertical = |ch: char| matches!(ch, '|' | '+');

    let last_row = row_seps.len() - 1;
    let last_col = cols_pos.len() - 1;

    for end_col in (sc + 1)..=last_col {
        let right = cols_pos[end_col];
        // True when `line` is a horizontal border strictly between the
        // cell's left and right columns (intermediate `+`s are fine).
        let horizontal_edge = |line: usize| (j + 1..right).all(|c| is_horizontal(grid[line][c]));

        if !horizontal_edge(i) {
            // The top edge hit a `|` or ` `; wider candidates fail too.
            break;
        }

        for end_row in (sr + 1)..=last_row {
            let bottom = row_seps[end_row];
            // True when column `col` is a vertical border on every line
            // strictly between the top and bottom edges.
            let vertical_edge = |col: usize| (i + 1..bottom).all(|r| is_vertical(grid[r][col]));

            if !vertical_edge(j) {
                // Left edge is broken; taller candidates are broken as well.
                break;
            }
            if !vertical_edge(right) || !horizontal_edge(bottom) {
                continue;
            }
            if grid[bottom][j] != '+' || grid[bottom][right] != '+' {
                continue;
            }

            // Reject the rectangle if an interior line splits it: `+` at
            // both side columns with only border characters in between.
            let split_inside = (i + 1..bottom)
                .any(|m| grid[m][j] == '+' && grid[m][right] == '+' && horizontal_edge(m));
            if split_inside {
                continue;
            }

            // Extract the interior text line by line: drop one space of
            // left padding, then trailing whitespace.
            let lines: Vec<String> = (i + 1..bottom)
                .map(|r| {
                    let raw: String = grid[r][j + 1..right].iter().collect();
                    raw.strip_prefix(' ').unwrap_or(&raw).trim_end().to_string()
                })
                .collect();

            // Keep only the span from the first to the last non-empty line.
            let content = match lines.iter().position(|line| !line.is_empty()) {
                Some(first) => {
                    let last = lines
                        .iter()
                        .rposition(|line| !line.is_empty())
                        .unwrap_or(first);
                    lines[first..=last].join("\n")
                }
                None => String::new(),
            };
            return Some((end_row, end_col, content));
        }
    }
    None
}
3084
3085/// Parse a grid-table cell's extracted text as block-level markdown via
3086/// panache, then convert top-level `Para`s to `Plain` (pandoc's
3087/// grid-table cell rule).
3088fn parse_grid_cell_text(text: &str) -> Vec<Block> {
3089    if text.trim().is_empty() {
3090        return Vec::new();
3091    }
3092    let opts = crate::ParserOptions {
3093        flavor: crate::Flavor::Pandoc,
3094        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
3095        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
3096        ..crate::ParserOptions::default()
3097    };
3098    let doc = crate::parse(text, Some(opts));
3099    let mut out = Vec::new();
3100    for child in doc.children() {
3101        if let Some(block) = block_from(&child) {
3102            let block = match block {
3103                Block::Para(inlines) => Block::Plain(inlines),
3104                other => other,
3105            };
3106            out.push(block);
3107        }
3108    }
3109    out
3110}
3111
3112/// Compute per-column widths from a grid-table separator like
3113/// `+--------+----------+----------+`. The `+` characters delimit
3114/// columns; each run of dashes/equals/colons between two `+` is one
3115/// column. Pandoc's formula (`Text/Pandoc/Parsing/GridTable.hs::
3116/// fractionalColumnWidths`):
3117/// ```text
3118/// raw[i] = dashes[i] + 1       (include separator width)
3119/// norm   = max(sum(raw) + count - 2, 72)   (72 = readerColumns)
3120/// width[i] = raw[i] / norm
3121/// ```
3122fn grid_dash_widths(separator: &SyntaxNode) -> Vec<f64> {
3123    let raw_text = separator.text().to_string();
3124    let line = raw_text.trim_end_matches(['\n', '\r']);
3125    let mut raw: Vec<usize> = Vec::new();
3126    let mut count: usize = 0;
3127    let mut in_col = false;
3128    for ch in line.chars() {
3129        match ch {
3130            '+' => {
3131                if in_col {
3132                    raw.push(count + 1);
3133                    count = 0;
3134                }
3135                in_col = true;
3136            }
3137            _ => {
3138                if in_col {
3139                    count += 1;
3140                }
3141            }
3142        }
3143    }
3144    if raw.is_empty() {
3145        return Vec::new();
3146    }
3147    let total: usize = raw.iter().sum();
3148    let count = raw.len();
3149    let norm = (total + count).saturating_sub(2).max(72) as f64;
3150    raw.into_iter().map(|w| w as f64 / norm).collect()
3151}
3152
3153fn grid_separator_aligns(raw: &str, cols: usize) -> Vec<&'static str> {
3154    let line = raw.trim_end_matches(['\n', '\r']);
3155    let mut aligns: Vec<&'static str> = Vec::with_capacity(cols);
3156    let mut col_start: Option<usize> = None;
3157    for (i, ch) in line.char_indices() {
3158        if ch == '+' {
3159            if let Some(s) = col_start.take() {
3160                let seg = &line[s..i];
3161                aligns.push(grid_segment_align(seg));
3162            }
3163            col_start = Some(i + 1);
3164        }
3165    }
3166    while aligns.len() < cols {
3167        aligns.push("AlignDefault");
3168    }
3169    aligns.truncate(cols);
3170    aligns
3171}
3172
/// Classify one separator segment (the text between two `+` signs) by its
/// colon markers: a colon at both ends is centered, leading-only is left,
/// trailing-only is right, and anything else is the default alignment.
fn grid_segment_align(seg: &str) -> &'static str {
    let leading_colon = seg.starts_with(':');
    let trailing_colon = seg.ends_with(':');
    if leading_colon && trailing_colon {
        "AlignCenter"
    } else if leading_colon {
        "AlignLeft"
    } else if trailing_colon {
        "AlignRight"
    } else {
        "AlignDefault"
    }
}
3184
3185// ----- multiline table ----------------------------------------------------
3186
3187/// Project a `MULTILINE_TABLE` node. Multi-line tables have an opening
3188/// `-----` border, an optional header (one or more lines), a
3189/// `----- ----- -----` column separator, body rows (each row possibly
3190/// spans multiple lines, separated from the next row by a blank line),
3191/// and a closing `-----` border. Cell content within a row is joined with
3192/// `SoftBreak` between source lines. Column widths are
3193/// `(dash_count + 1) / 72`.
3194fn multiline_table(node: &SyntaxNode) -> Option<TableData> {
3195    // The column-separator (the dashes between header and body) is the
3196    // *second* TABLE_SEPARATOR if there is a header, else the first.
3197    let separators: Vec<SyntaxNode> = node
3198        .children()
3199        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
3200        .collect();
3201    let header = node
3202        .children()
3203        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
3204    let column_sep = if header.is_some() {
3205        separators.get(1).cloned()
3206    } else {
3207        separators.first().cloned()
3208    }?;
3209    let cols = simple_table_dash_runs(&column_sep);
3210    if cols.is_empty() {
3211        return None;
3212    }
3213    // Per pandoc `widthsFromIndices`: each non-last column's width is
3214    // `dashes + spaces_after` (= start of next column - start of this); the
3215    // last column's width is `dashes + 1` (the indices' bump). Normalize
3216    // by `max(total, 72)`.
3217    let raw: Vec<usize> = cols
3218        .iter()
3219        .enumerate()
3220        .map(|(i, (s, e))| {
3221            if i + 1 < cols.len() {
3222                cols[i + 1].0 - s
3223            } else {
3224                e - s + 2
3225            }
3226        })
3227        .collect();
3228    let total: usize = raw.iter().sum();
3229    let norm = (total.max(72)) as f64;
3230    let widths: Vec<f64> = raw.into_iter().map(|w| w as f64 / norm).collect();
3231    // Alignment from header (if present) or first data row, using the
3232    // simple-table flushness rule against the column-separator dash runs.
3233    let aligns = if let Some(h) = &header {
3234        simple_table_aligns(h, &cols)
3235    } else if let Some(r0) = node.children().find(|c| c.kind() == SyntaxKind::TABLE_ROW) {
3236        simple_table_aligns(&r0, &cols)
3237    } else {
3238        vec!["AlignDefault"; cols.len()]
3239    };
3240    let head_rows = match &header {
3241        Some(h) => vec![
3242            multiline_row_cells_blocks(h, &cols)
3243                .into_iter()
3244                .map(GridCell::no_span)
3245                .collect(),
3246        ],
3247        None => Vec::new(),
3248    };
3249    let body_rows: Vec<Vec<GridCell>> = node
3250        .children()
3251        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
3252        .map(|r| {
3253            multiline_row_cells_blocks(&r, &cols)
3254                .into_iter()
3255                .map(GridCell::no_span)
3256                .collect()
3257        })
3258        .collect();
3259    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
3260    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
3261    Some(TableData {
3262        attr,
3263        caption: caption_inlines,
3264        aligns,
3265        widths: widths.into_iter().map(Some).collect(),
3266        head_rows,
3267        body_rows,
3268        foot_rows: Vec::new(),
3269    })
3270}
3271
3272/// Slice each line of a multiline-table row by column ranges, then merge
3273/// each column's per-line text into a single Plain block with `SoftBreak`s
3274/// between source lines.
3275fn multiline_row_cells_blocks(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<Vec<Block>> {
3276    let row_start: u32 = row.text_range().start().into();
3277    let raw = row.text().to_string();
3278    // Re-construct the row's per-line text. Tokens give us byte offsets, but
3279    // plain `.text()` is enough — split on '\n', then for each line, slice by
3280    // column ranges.
3281    let lines: Vec<&str> = raw.split_inclusive('\n').collect();
3282    let mut col_lines: Vec<Vec<String>> = vec![Vec::new(); cols.len()];
3283    let mut line_start_offset: usize = 0;
3284    for line in lines {
3285        let line_no_nl = line.trim_end_matches('\n');
3286        if line_no_nl.trim().is_empty() {
3287            line_start_offset += line.len();
3288            continue;
3289        }
3290        for (i, &(cs, ce)) in cols.iter().enumerate() {
3291            // Slice [cs..=ce] in chars from the line. Lines may be shorter.
3292            let slice = char_slice(line_no_nl, cs, ce + 1);
3293            let trimmed = slice.trim();
3294            if !trimmed.is_empty() {
3295                col_lines[i].push(trimmed.to_string());
3296            }
3297        }
3298        line_start_offset += line.len();
3299    }
3300    let _ = (row_start, line_start_offset);
3301    cols.iter()
3302        .enumerate()
3303        .map(|(i, _)| {
3304            let segments = &col_lines[i];
3305            if segments.is_empty() {
3306                return Vec::new();
3307            }
3308            // Re-parse the cell's joined text through panache's inline parser
3309            // so that `**bold**`, `` `code` ``, `[link](url)` etc. inside
3310            // multiline-table cells project as Strong/Code/Link rather than
3311            // raw Str (matches pandoc's `multilineTableHeader` behavior of
3312            // joining lines per column and parsing as Markdown).
3313            let joined = segments.join("\n");
3314            let inlines = parse_cell_text_inlines(&joined);
3315            if inlines.is_empty() {
3316                return Vec::new();
3317            }
3318            vec![Block::Plain(coalesce_inlines(inlines))]
3319        })
3320        .collect()
3321}
3322
3323/// Parse a cell text fragment through panache's inline parser and return its
3324/// inline content. Used for multiline-table cells whose per-line slices are
3325/// not seen by the outer parser as inline-bearing TABLE_CELLs (the parser
3326/// holds raw TEXT for lines past the first). Empty or whitespace-only input
3327/// returns an empty vec.
3328fn parse_cell_text_inlines(text: &str) -> Vec<Inline> {
3329    if text.trim().is_empty() {
3330        return Vec::new();
3331    }
3332    let opts = crate::ParserOptions {
3333        flavor: crate::Flavor::Pandoc,
3334        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
3335        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
3336        ..crate::ParserOptions::default()
3337    };
3338    let doc = crate::parse(text, Some(opts));
3339    for node in doc.descendants() {
3340        if matches!(node.kind(), SyntaxKind::PARAGRAPH | SyntaxKind::PLAIN) {
3341            return inlines_from(&node);
3342        }
3343    }
3344    Vec::new()
3345}
3346
/// Return the substring of `s` covering characters `start_char..end_char`
/// (character indices, end exclusive).
///
/// An index at or past the end of `s` is clamped to the end, so an
/// over-long range returns what is available. An empty or inverted range
/// returns `""`.
fn char_slice(s: &str, start_char: usize, end_char: usize) -> &str {
    // Byte offset of the n-th character, or the string length if `s` has
    // fewer than n + 1 characters.
    let byte_offset = |n: usize| s.char_indices().nth(n).map_or(s.len(), |(byte, _)| byte);
    let from = byte_offset(start_char);
    let to = byte_offset(end_char);
    if from > to { "" } else { &s[from..to] }
}
3364
3365fn list_block(node: &SyntaxNode) -> Block {
3366    let loose = is_loose_list(node);
3367    let items: Vec<Vec<Block>> = node
3368        .children()
3369        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
3370        .map(|item| list_item_blocks(&item, loose))
3371        .collect();
3372    if list_is_ordered(node) {
3373        let (start, style, delim) = ordered_list_attrs(node);
3374        Block::OrderedList(start, style, delim, items)
3375    } else {
3376        Block::BulletList(items)
3377    }
3378}
3379
3380fn list_is_ordered(node: &SyntaxNode) -> bool {
3381    let Some(item) = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
3382        return false;
3383    };
3384    let marker = item
3385        .children_with_tokens()
3386        .filter_map(|el| el.into_token())
3387        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
3388        .map(|t| t.text().to_string())
3389        .unwrap_or_default();
3390    let trimmed = marker.trim();
3391    !trimmed.starts_with(['-', '+', '*'])
3392}
3393
3394fn ordered_list_attrs(node: &SyntaxNode) -> (usize, &'static str, &'static str) {
3395    let item = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM);
3396    let marker = item
3397        .as_ref()
3398        .and_then(|i| {
3399            i.children_with_tokens()
3400                .filter_map(|el| el.into_token())
3401                .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
3402                .map(|t| t.text().to_string())
3403        })
3404        .unwrap_or_default();
3405    let (mut start, style, delim) = classify_ordered_marker(marker.trim());
3406    if style == "Example" {
3407        let offset: u32 = node.text_range().start().into();
3408        if let Some(s) = REFS_CTX.with(|c| {
3409            c.borrow()
3410                .example_list_start_by_offset
3411                .get(&offset)
3412                .copied()
3413        }) {
3414            start = s;
3415        }
3416    }
3417    (start, style, delim)
3418}
3419
3420/// Map a list-marker token (e.g. `1.`, `iv)`, `(A)`, `#.`, `(@)`) to the
3421/// pandoc-native `(start, style, delim)` tuple. Mirrors pandoc's parser logic
3422/// in `Text/Pandoc/Parsing/Lists.hs`: try `decimal`, then `exampleNum` (`@`),
3423/// then `defaultNum` (`#`), then `romanOne` (single `i`/`I`), then alpha,
3424/// then multi-char roman, in that order; the first matching form wins. The
3425/// start value for Example lists is left at 1 — pandoc tracks numbering
3426/// across lists at the document level, which we don't model.
3427fn classify_ordered_marker(trimmed: &str) -> (usize, &'static str, &'static str) {
3428    // Strip surrounding parens / trailing period or paren to get (body, delim).
3429    let (body, delim) =
3430        if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
3431            (inner, "TwoParens")
3432        } else if let Some(inner) = trimmed.strip_suffix(')') {
3433            (inner, "OneParen")
3434        } else if let Some(inner) = trimmed.strip_suffix('.') {
3435            (inner, "Period")
3436        } else {
3437            (trimmed, "DefaultDelim")
3438        };
3439
3440    // All-digit body → Decimal.
3441    if !body.is_empty() && body.chars().all(|c| c.is_ascii_digit()) {
3442        let start: usize = body.parse().unwrap_or(1);
3443        return (start, "Decimal", delim);
3444    }
3445
3446    // `#` (DefaultStyle) — when style is DefaultStyle pandoc forces
3447    // DefaultDelim regardless of the actual punctuation.
3448    if body == "#" {
3449        return (1, "DefaultStyle", "DefaultDelim");
3450    }
3451
3452    // `@` or `@label` (Example list).
3453    if let Some(rest) = body.strip_prefix('@')
3454        && rest
3455            .chars()
3456            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
3457    {
3458        return (1, "Example", delim);
3459    }
3460
3461    // Single `i`/`I` is romanOne (tried before alpha, so `i.`/`I.` is Roman 1).
3462    if body == "i" {
3463        return (1, "LowerRoman", delim);
3464    }
3465    if body == "I" {
3466        return (1, "UpperRoman", delim);
3467    }
3468
3469    // Single lowercase / uppercase letter → alpha.
3470    if body.len() == 1
3471        && let Some(c) = body.chars().next()
3472    {
3473        if c.is_ascii_lowercase() {
3474            return ((c as u8 - b'a') as usize + 1, "LowerAlpha", delim);
3475        }
3476        if c.is_ascii_uppercase() {
3477            return ((c as u8 - b'A') as usize + 1, "UpperAlpha", delim);
3478        }
3479    }
3480
3481    // Multi-char roman lowercase/uppercase.
3482    if body
3483        .chars()
3484        .all(|c| matches!(c, 'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'))
3485        && let Some(n) = roman_to_int(body, false)
3486    {
3487        return (n, "LowerRoman", delim);
3488    }
3489    if body
3490        .chars()
3491        .all(|c| matches!(c, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
3492        && let Some(n) = roman_to_int(body, true)
3493    {
3494        return (n, "UpperRoman", delim);
3495    }
3496
3497    // Fallback — the parser accepted some marker we don't classify; emit
3498    // Decimal/Period so the list renders rather than dropping coverage.
3499    (1, "Decimal", delim)
3500}
3501
/// Convert a roman-numeral string to its integer value.
///
/// With `upper == true` only the uppercase letters `IVXLCDM` are accepted;
/// with `upper == false` each character is ASCII-uppercased first. Returns
/// `None` for an empty string or when any character is not a roman digit.
///
/// Evaluation is greedy, left to right: whenever a digit is smaller than
/// the one right after it, the pair is consumed together as a subtractive
/// pair (`next - current`); otherwise the digit is simply added. No further
/// well-formedness checks are made, so non-canonical spellings such as
/// `IIII` (4) or `IM` (999) still produce a value.
fn roman_to_int(s: &str, upper: bool) -> Option<usize> {
    fn digit(c: char) -> Option<usize> {
        match c {
            'I' => Some(1),
            'V' => Some(5),
            'X' => Some(10),
            'L' => Some(50),
            'C' => Some(100),
            'D' => Some(500),
            'M' => Some(1000),
            _ => None,
        }
    }

    let mut digits: Vec<usize> = Vec::with_capacity(s.len());
    for c in s.chars() {
        let c = if upper { c } else { c.to_ascii_uppercase() };
        digits.push(digit(c)?);
    }
    if digits.is_empty() {
        return None;
    }

    let mut total = 0usize;
    let mut rest = digits.as_slice();
    loop {
        match rest {
            [] => return Some(total),
            [small, big, tail @ ..] if small < big => {
                total += big - small;
                rest = tail;
            }
            [value, tail @ ..] => {
                total += value;
                rest = tail;
            }
        }
    }
}
3539
3540fn list_item_blocks(item: &SyntaxNode, loose: bool) -> Vec<Block> {
3541    let mut out = Vec::new();
3542    let item_indent = list_item_content_offset(item);
3543    let task_checkbox = task_checkbox_for_item(item);
3544    let mut checkbox_emitted = false;
3545    for child in item.children() {
3546        match child.kind() {
3547            SyntaxKind::PLAIN => {
3548                let mut inlines = coalesce_inlines(inlines_from(&child));
3549                // Skip empty Plain blocks. The parser emits a PLAIN node for
3550                // any line under a list item, including the bare-marker line
3551                // (`-` followed by blank then indented content); pandoc only
3552                // counts blocks with actual inline content.
3553                if inlines.is_empty() {
3554                    continue;
3555                }
3556                if !checkbox_emitted && let Some(glyph) = task_checkbox {
3557                    inlines.insert(0, Inline::Space);
3558                    inlines.insert(0, Inline::Str(glyph.to_string()));
3559                    checkbox_emitted = true;
3560                }
3561                if loose {
3562                    out.push(Block::Para(inlines));
3563                } else {
3564                    out.push(Block::Plain(inlines));
3565                }
3566            }
3567            SyntaxKind::CODE_BLOCK => {
3568                // Both fenced and indented code blocks inside list items
3569                // carry the item-content indent on every body line in the
3570                // CST. Strip that offset so pandoc sees the same body it
3571                // would in a flat document. (For indented code, the helper
3572                // also strips the 4-space code-block indent on top of the
3573                // item offset; for fenced code, the offset strip alone is
3574                // sufficient.)
3575                out.push(indented_code_block_with_extra_strip(&child, item_indent));
3576            }
3577            _ => collect_block(&child, &mut out),
3578        }
3579    }
3580    out
3581}
3582
3583/// Pandoc renders `- [ ] foo` as `Plain [Str "\u{2610}", Space, Str "foo"]`
3584/// (and `[x]`/`[X]` as `\u{2612}`). The parser keeps `[ ]`/`[x]`/`[X]` as a
3585/// dedicated `TASK_CHECKBOX` token on the `LIST_ITEM`; this helper returns
3586/// the matching ballot-box glyph if one is present.
3587fn task_checkbox_for_item(item: &SyntaxNode) -> Option<&'static str> {
3588    item.children_with_tokens()
3589        .filter_map(|el| el.into_token())
3590        .find(|t| t.kind() == SyntaxKind::TASK_CHECKBOX)
3591        .map(|t| {
3592            let text = t.text();
3593            if text.contains('x') || text.contains('X') {
3594                "\u{2612}"
3595            } else {
3596                "\u{2610}"
3597            }
3598        })
3599}
3600
3601/// Number of leading-space columns each body-content line of `item` carries
3602/// in the CST. Mirrors pandoc's list-item content offset:
3603///   - bare-marker line (no WHITESPACE after LIST_MARKER): offset = marker
3604///     width (e.g. `1` for `-`, `2` for `1.`).
3605///   - marker followed by space(s): offset = marker width + WS width (the
3606///     visual column where content starts on the marker's line).
3607///
3608/// Nested list items also carry leading WHITESPACE *before* the LIST_MARKER
3609/// (the outer item's content offset). Include that so the cumulative depth
3610/// is captured — required for correctly stripping nested fenced/indented
3611/// code blocks.
3612///
3613/// When the LIST is itself a child of an outer container (e.g. a DEFINITION
3614/// body where the `- item` line is indented to the def-content column), the
3615/// per-item leading indent lives on the parent LIST as a WHITESPACE token
3616/// preceding each LIST_ITEM rather than inside the item. Pick that up too —
3617/// without it, code blocks nested inside such items would only have the
3618/// item-local indent stripped, leaving the outer-container offset behind.
3619fn list_item_content_offset(item: &SyntaxNode) -> usize {
3620    let parent_ws = parent_list_leading_ws(item);
3621    let mut marker_width = 0usize;
3622    let mut leading_ws = 0usize;
3623    let mut saw_marker = false;
3624    for el in item.children_with_tokens() {
3625        if let NodeOrToken::Token(t) = el {
3626            match t.kind() {
3627                SyntaxKind::WHITESPACE if !saw_marker => {
3628                    leading_ws += t.text().chars().count();
3629                }
3630                SyntaxKind::LIST_MARKER => {
3631                    marker_width += t.text().chars().count();
3632                    saw_marker = true;
3633                }
3634                SyntaxKind::WHITESPACE if saw_marker => {
3635                    return parent_ws + leading_ws + marker_width + t.text().chars().count();
3636                }
3637                _ if saw_marker => {
3638                    return parent_ws + leading_ws + marker_width;
3639                }
3640                _ => {}
3641            }
3642        } else if saw_marker {
3643            return parent_ws + leading_ws + marker_width;
3644        }
3645    }
3646    parent_ws + leading_ws + marker_width
3647}
3648
3649/// WHITESPACE token immediately preceding `item` on its parent LIST node, if
3650/// any. Used to recover the outer-container indent when the parser stores it
3651/// on the parent LIST (e.g. LIST inside DEFINITION) rather than as the item's
3652/// own leading WHITESPACE.
3653fn parent_list_leading_ws(item: &SyntaxNode) -> usize {
3654    let prev = item.prev_sibling_or_token();
3655    match prev {
3656        Some(NodeOrToken::Token(t)) if t.kind() == SyntaxKind::WHITESPACE => {
3657            t.text().chars().count()
3658        }
3659        _ => 0,
3660    }
3661}
3662
3663fn is_loose_list(node: &SyntaxNode) -> bool {
3664    let mut prev_was_item = false;
3665    for child in node.children_with_tokens() {
3666        if let NodeOrToken::Node(n) = child {
3667            if n.kind() == SyntaxKind::LIST_ITEM {
3668                prev_was_item = true;
3669            } else if n.kind() == SyntaxKind::BLANK_LINE
3670                && prev_was_item
3671                && n.next_sibling()
3672                    .map(|s| s.kind() == SyntaxKind::LIST_ITEM)
3673                    .unwrap_or(false)
3674            {
3675                return true;
3676            }
3677        }
3678    }
3679    for item in node
3680        .children()
3681        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
3682    {
3683        if item.children().any(|c| c.kind() == SyntaxKind::PARAGRAPH) {
3684            return true;
3685        }
3686        // Per CommonMark/pandoc: a list is loose if any item directly
3687        // contains a blank line between two block-level children. The
3688        // single-item form (`- a\n\n  b`) only manifests as a BLANK_LINE
3689        // sandwiched between non-blank block children inside the item.
3690        if has_internal_blank_between_blocks(&item) {
3691            return true;
3692        }
3693    }
3694    false
3695}
3696
3697fn has_internal_blank_between_blocks(item: &SyntaxNode) -> bool {
3698    let mut saw_block_before = false;
3699    let mut pending_blank = false;
3700    for child in item.children() {
3701        match child.kind() {
3702            SyntaxKind::BLANK_LINE => {
3703                if saw_block_before {
3704                    pending_blank = true;
3705                }
3706            }
3707            // Bare-marker line emits an empty PLAIN (NEWLINE only); pandoc
3708            // doesn't count that as a block — its first real block is what
3709            // comes after the blank line.
3710            SyntaxKind::PLAIN if child_is_empty_plain(&child) => {}
3711            _ => {
3712                if pending_blank {
3713                    return true;
3714                }
3715                saw_block_before = true;
3716            }
3717        }
3718    }
3719    false
3720}
3721
3722fn child_is_empty_plain(node: &SyntaxNode) -> bool {
3723    !node.children_with_tokens().any(|el| match el {
3724        NodeOrToken::Token(t) => !matches!(t.kind(), SyntaxKind::NEWLINE | SyntaxKind::WHITESPACE),
3725        NodeOrToken::Node(_) => true,
3726    })
3727}
3728
3729// ----- inline walking -----------------------------------------------------
3730
3731fn inlines_from(parent: &SyntaxNode) -> Vec<Inline> {
3732    let mut out = Vec::new();
3733    let mut iter = parent.children_with_tokens().peekable();
3734    while let Some(el) = iter.next() {
3735        match el {
3736            NodeOrToken::Token(t) => push_token_inline(&t, &mut out),
3737            NodeOrToken::Node(n) if n.kind() == SyntaxKind::LATEX_COMMAND => {
3738                emit_latex_command_with_absorb(&n, &mut iter, &mut out);
3739            }
3740            NodeOrToken::Node(n) if n.kind() == SyntaxKind::CITATION => {
3741                emit_citation_with_absorb(&n, &mut iter, &mut out);
3742            }
3743            NodeOrToken::Node(n) => push_inline_node(&n, &mut out),
3744        }
3745    }
3746    // Trailing NEWLINE inside paragraphs/headings is structural. Strip a
3747    // single trailing SoftBreak so the inline list ends on Str/Space, matching
3748    // pandoc's "trim trailing line endings" rule.
3749    while matches!(out.last(), Some(Inline::SoftBreak)) {
3750        out.pop();
3751    }
3752    out
3753}
3754
3755/// Pandoc absorbs `@key [locator]` into a single AuthorInText `Cite` with
3756/// the bracketed text becoming the citation's suffix. The parser emits two
3757/// separate nodes: `CITATION` (bare `@key`, no surrounding brackets) and an
3758/// adjacent `LINK` whose bracketed text has no destination. When the
3759/// CITATION is bare and we can verify both the next siblings (a single
3760/// `TEXT` whitespace token followed by a `LINK` node lacking
3761/// `LINK_DEST_START`), consume both and absorb the link's text as suffix.
3762fn emit_citation_with_absorb<I>(
3763    node: &SyntaxNode,
3764    iter: &mut std::iter::Peekable<I>,
3765    out: &mut Vec<Inline>,
3766) where
3767    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
3768{
3769    let bracketed = node
3770        .children_with_tokens()
3771        .filter_map(|el| el.into_token())
3772        .any(|t| t.kind() == SyntaxKind::LINK_START);
3773    if bracketed {
3774        render_citation_inline(node, out, None);
3775        return;
3776    }
3777    // Bare AuthorInText form. Use rowan's sibling navigation (not the iter
3778    // peek) to verify the absorption pattern without consuming anything we
3779    // can't put back. Then if confirmed, advance the iter to skip both.
3780    let next_sibling_pair = node.next_sibling_or_token().and_then(|el1| {
3781        let t = el1.as_token().cloned()?;
3782        if t.kind() != SyntaxKind::TEXT || !t.text().starts_with(' ') {
3783            return None;
3784        }
3785        let space_text = t.text().to_string();
3786        let link_el = t.next_sibling_or_token()?;
3787        let link = link_el.as_node().cloned()?;
3788        // Pandoc absorbs `[locator]` after `@key` whether the brackets
3789        // resolve as a link or not; under the new IR, an unresolved
3790        // bracket-shape pattern is `UNRESOLVED_REFERENCE` rather than
3791        // shape-only `LINK`. Both shapes are valid locator candidates.
3792        if link.kind() != SyntaxKind::LINK && link.kind() != SyntaxKind::UNRESOLVED_REFERENCE {
3793            return None;
3794        }
3795        let has_dest = link
3796            .children_with_tokens()
3797            .filter_map(|el| el.into_token())
3798            .any(|tok| tok.kind() == SyntaxKind::LINK_DEST_START);
3799        if has_dest {
3800            return None;
3801        }
3802        let link_text = link
3803            .children()
3804            .find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3805            .map(|tt| tt.text().to_string())
3806            .unwrap_or_default();
3807        Some((space_text, link_text))
3808    });
3809    if let Some((_space_text, locator_text)) = next_sibling_pair {
3810        // Advance the iter past the consumed TEXT and LINK.
3811        iter.next();
3812        iter.next();
3813        render_citation_inline(node, out, Some(&locator_text));
3814    } else {
3815        render_citation_inline(node, out, None);
3816    }
3817}
3818
3819/// Pandoc's tex inline reader absorbs trailing horizontal whitespace into the
3820/// raw command when (and only when) the command is `\letters` with no brace
3821/// arguments — `\foo bar` becomes `RawInline tex "\\foo "` + `Str "bar"`,
3822/// while `\frac{a}{b} bar` keeps the space outside (`RawInline tex
3823/// "\\frac{a}{b}"` + `Space` + `Str "bar"`). The discriminator is the last
3824/// byte of the command text: ASCII letter → absorb, otherwise → don't.
3825fn emit_latex_command_with_absorb<I>(
3826    node: &SyntaxNode,
3827    iter: &mut std::iter::Peekable<I>,
3828    out: &mut Vec<Inline>,
3829) where
3830    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
3831{
3832    let mut content = node.text().to_string();
3833    let ends_in_letter = content
3834        .chars()
3835        .next_back()
3836        .is_some_and(|c| c.is_ascii_alphabetic());
3837    if ends_in_letter
3838        && let Some(NodeOrToken::Token(t)) = iter.peek()
3839        && t.kind() == SyntaxKind::TEXT
3840    {
3841        let text = t.text().to_string();
3842        let bytes = text.as_bytes();
3843        let mut absorbed = 0;
3844        while absorbed < bytes.len() && (bytes[absorbed] == b' ' || bytes[absorbed] == b'\t') {
3845            absorbed += 1;
3846        }
3847        if absorbed > 0 {
3848            content.push_str(&text[..absorbed]);
3849            out.push(Inline::RawInline("tex".to_string(), content));
3850            iter.next();
3851            let remainder = &text[absorbed..];
3852            if !remainder.is_empty() {
3853                push_text(remainder, out);
3854            }
3855            return;
3856        }
3857    }
3858    out.push(Inline::RawInline("tex".to_string(), content));
3859}
3860
3861fn push_inline_node(node: &SyntaxNode, out: &mut Vec<Inline>) {
3862    match node.kind() {
3863        SyntaxKind::LINK => render_link_inline(node, out),
3864        SyntaxKind::IMAGE_LINK => render_image_inline(node, out),
3865        SyntaxKind::CITATION => render_citation_inline(node, out, None),
3866        // Pandoc-native treats unresolved bracket-shape patterns as
3867        // literal text — the bracket bytes themselves are `Str "["`
3868        // and `Str "]"`, but inner inline structure (emphasis, math,
3869        // raw spans, etc.) survives. The Panache `UNRESOLVED_REFERENCE`
3870        // wrapper is a tooling concession; emit the bracket bytes as
3871        // `Str` and recurse into structural children so inner content
3872        // is preserved.
3873        SyntaxKind::UNRESOLVED_REFERENCE => render_unresolved_reference_inline(node, out),
3874        _ => out.push(inline_from_node(node)),
3875    }
3876}
3877
3878/// Project an UNRESOLVED_REFERENCE node as pandoc-native inlines.
3879///
3880/// Mirrors the unresolved fall-through of `render_link_inline`: try
3881/// `lookup_heading_id` for implicit-heading shortcut/full-reference
3882/// resolution at projection time (pandoc resolves heading IDs *during
3883/// inline rendering*; the parser's refdef map only carries explicit
3884/// `[label]: url` definitions). On miss, emit the original bracket
3885/// pattern as `Str "["`, inner inline structure (preserved via
3886/// `coalesce_inlines_keep_edges` so leading/trailing whitespace
3887/// survives, matching pandoc's `[ foo ]` → `Str "[", Space, Str "foo",
3888/// Space, Str "]"` behavior), then `Str "]"` (or `Str "][ref]"` for
3889/// full-reference form).
3890fn render_unresolved_reference_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
3891    let is_image = node
3892        .children()
3893        .any(|c| c.kind() == SyntaxKind::IMAGE_LINK_START);
3894    let text_node = if is_image {
3895        node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT)
3896    } else {
3897        node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3898    };
3899    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
3900
3901    let text_label = text_node
3902        .as_ref()
3903        .map(|n| n.text().to_string())
3904        .unwrap_or_default();
3905    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
3906        Some(rn) => {
3907            let inner = rn.text().to_string();
3908            if inner.is_empty() {
3909                (text_label.clone(), true, String::new())
3910            } else {
3911                (inner.clone(), true, inner)
3912            }
3913        }
3914        None => (text_label.clone(), false, String::new()),
3915    };
3916
3917    // Implicit-heading-id resolution at projection time. Only for
3918    // link-shape (not image-shape) shortcut/full-ref/collapsed forms.
3919    if !is_image && let Some(id) = lookup_heading_id(&label) {
3920        let url = format!("#{id}");
3921        let resolved_text_inlines = text_node
3922            .as_ref()
3923            .map(|n| coalesce_inlines(inlines_from(n)))
3924            .unwrap_or_default();
3925        out.push(Inline::Link(
3926            extract_attr_from_node(node),
3927            resolved_text_inlines,
3928            url,
3929            String::new(),
3930        ));
3931        return;
3932    }
3933
3934    // Inherited reference resolution. The parser emits UNRESOLVED_REFERENCE
3935    // when the corresponding `[label]: url` def isn't in the same CST, but
3936    // when projecting recursively-reparsed content (e.g. a `<div>` body)
3937    // the outer document's refs are folded into REFS_CTX. Resolve here so
3938    // an outer-defined ref used inside `<div>...</div>` becomes a Link.
3939    if let Some((url, title)) = lookup_ref(&label) {
3940        let resolved_text_inlines = text_node
3941            .as_ref()
3942            .map(|n| coalesce_inlines(inlines_from(n)))
3943            .unwrap_or_default();
3944        let kind = if is_image {
3945            Inline::Image
3946        } else {
3947            Inline::Link
3948        };
3949        out.push(kind(
3950            extract_attr_from_node(node),
3951            resolved_text_inlines,
3952            url,
3953            title,
3954        ));
3955        return;
3956    }
3957
3958    // Unresolved: emit the original markdown bytes, preserving inner
3959    // inline structure.
3960    let unresolved_text_inlines = text_node
3961        .as_ref()
3962        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
3963        .unwrap_or_default();
3964    let opener = if is_image { "![" } else { "[" };
3965    out.push(Inline::Str(opener.to_string()));
3966    out.extend(unresolved_text_inlines);
3967    let suffix = if has_second_brackets {
3968        format!("][{second_inner}]")
3969    } else {
3970        "]".to_string()
3971    };
3972    out.push(Inline::Str(suffix));
3973}
3974
3975/// Pandoc treats `(@label)` and bare `@label` as Example-list references
3976/// when the label was defined as an Example item; the inline becomes
3977/// `Str "N"` (just the digits — surrounding parens come from adjacent
3978/// source bytes which our coalesce pass merges back in). Otherwise we
3979/// project the CITATION node as a proper `Cite [Citation, ...] [Inline,
3980/// ...]` per pandoc's citation reader. `extra_suffix_text` carries an
3981/// absorbed `[locator]` (pandoc absorbs `@key [locator]` into the Cite as
3982/// the citation's suffix); the literal text reflects the absorbed bytes.
3983fn render_citation_inline(
3984    node: &SyntaxNode,
3985    out: &mut Vec<Inline>,
3986    extra_suffix_text: Option<&str>,
3987) {
3988    // Example-list resolution short-circuit (legacy carve-out).
3989    let first_key = node
3990        .children_with_tokens()
3991        .filter_map(|el| el.into_token())
3992        .find(|t| t.kind() == SyntaxKind::CITATION_KEY)
3993        .map(|t| t.text().to_string())
3994        .unwrap_or_default();
3995    let example_resolution =
3996        REFS_CTX.with(|c| c.borrow().example_label_to_num.get(&first_key).copied());
3997    if let Some(n) = example_resolution {
3998        out.push(Inline::Str(n.to_string()));
3999        return;
4000    }
4001
4002    let bracketed = node
4003        .children_with_tokens()
4004        .filter_map(|el| el.into_token())
4005        .any(|t| t.kind() == SyntaxKind::LINK_START);
4006
4007    let mut builders: Vec<CitationBuilder> = Vec::new();
4008    let mut current: Option<CitationBuilder> = None;
4009    let mut pending_prefix = String::new();
4010    for el in node.children_with_tokens() {
4011        let token = match el {
4012            NodeOrToken::Token(t) => t,
4013            _ => continue,
4014        };
4015        match token.kind() {
4016            SyntaxKind::LINK_START | SyntaxKind::LINK_DEST => {}
4017            SyntaxKind::CITATION_BRACE_OPEN | SyntaxKind::CITATION_BRACE_CLOSE => {}
4018            SyntaxKind::CITATION_MARKER => {
4019                if let Some(c) = current.take() {
4020                    builders.push(c);
4021                }
4022                let mode = if token.text() == "-@" {
4023                    CitationMode::SuppressAuthor
4024                } else if bracketed {
4025                    CitationMode::NormalCitation
4026                } else {
4027                    CitationMode::AuthorInText
4028                };
4029                current = Some(CitationBuilder::new(
4030                    std::mem::take(&mut pending_prefix),
4031                    mode,
4032                ));
4033            }
4034            SyntaxKind::CITATION_KEY => {
4035                if let Some(c) = &mut current {
4036                    c.id.push_str(token.text());
4037                }
4038            }
4039            SyntaxKind::CITATION_CONTENT => {
4040                if let Some(c) = &mut current {
4041                    c.suffix_raw.push_str(token.text());
4042                } else {
4043                    pending_prefix.push_str(token.text());
4044                }
4045            }
4046            SyntaxKind::CITATION_SEPARATOR => {
4047                if let Some(c) = current.take() {
4048                    builders.push(c);
4049                }
4050            }
4051            _ => {}
4052        }
4053    }
4054    if let Some(c) = current.take() {
4055        builders.push(c);
4056    }
4057
4058    // Absorbed `[locator]` text becomes additional suffix on the LAST
4059    // citation in the group (pandoc only absorbs into AuthorInText cites
4060    // anyway, which always have one citation in the group).
4061    if let Some(extra) = extra_suffix_text
4062        && let Some(last) = builders.last_mut()
4063    {
4064        if !last.suffix_raw.is_empty() && !extra.starts_with(' ') {
4065            last.suffix_raw.push(' ');
4066        }
4067        last.suffix_raw.push_str(extra);
4068    }
4069
4070    let note_offset: u32 = node.text_range().start().into();
4071    let note_num = REFS_CTX
4072        .with(|c| {
4073            c.borrow()
4074                .cite_note_num_by_offset
4075                .get(&note_offset)
4076                .copied()
4077        })
4078        .unwrap_or(1);
4079
4080    let projected: Vec<Citation> = builders
4081        .into_iter()
4082        .map(|b| b.into_citation(note_num))
4083        .collect();
4084
4085    // Build literal text from CITATION node text + any absorbed suffix.
4086    let mut literal = node.text().to_string();
4087    if let Some(extra) = extra_suffix_text {
4088        literal.push(' ');
4089        literal.push('[');
4090        literal.push_str(extra);
4091        literal.push(']');
4092    }
4093    let text_inlines = literal_inlines(&literal);
4094
4095    out.push(Inline::Cite(projected, text_inlines));
4096}
4097
4098/// Internal builder for a single Citation while walking the CITATION node's
4099/// tokens. `prefix_raw` and `suffix_raw` capture the raw `CITATION_CONTENT`
4100/// text segments before / after the key; they are inline-parsed (with smart
4101/// transformations applied via `coalesce_inlines`) once the builder is
4102/// finalized.
4103struct CitationBuilder {
4104    id: String,
4105    prefix_raw: String,
4106    suffix_raw: String,
4107    mode: CitationMode,
4108}
4109
4110impl CitationBuilder {
4111    fn new(prefix_raw: String, mode: CitationMode) -> Self {
4112        Self {
4113            id: String::new(),
4114            prefix_raw,
4115            suffix_raw: String::new(),
4116            mode,
4117        }
4118    }
4119
4120    fn into_citation(self, note_num: i64) -> Citation {
4121        let prefix = parse_cite_affix_inlines(self.prefix_raw.trim_end(), true);
4122        let suffix = parse_cite_affix_inlines(&self.suffix_raw, false);
4123        Citation {
4124            id: self.id,
4125            prefix,
4126            suffix,
4127            mode: self.mode,
4128            note_num,
4129            hash: 0,
4130        }
4131    }
4132}
4133
4134/// Parse a citation prefix or suffix raw-text fragment as inlines, applying
4135/// pandoc's smart transformations (NBSP after abbreviations, en-dash for
4136/// `--`, smart apostrophes/quotes). For prefixes, we trim leading whitespace
4137/// (pandoc's prefix never starts with Space). For suffixes, leading whitespace
4138/// is preserved so `[@key, suffix]` produces `[Str ",", Space, Str "suffix"]`.
4139///
4140/// We wrap the raw text with a benign `Z ` prefix before reparsing, then
4141/// strip the resulting leading `Str "Z"` + `Space`. This is necessary because
4142/// panache's block parser would otherwise misclassify text starting with
4143/// (e.g.) `p. ` as an alphabetical list marker, dropping the `p.` from the
4144/// resulting inline stream entirely.
4145fn parse_cite_affix_inlines(raw: &str, is_prefix: bool) -> Vec<Inline> {
4146    if raw.is_empty() {
4147        return Vec::new();
4148    }
4149    let trimmed = if is_prefix { raw.trim_start() } else { raw };
4150    if trimmed.is_empty() {
4151        return Vec::new();
4152    }
4153    let leading_space = !is_prefix && trimmed.starts_with([' ', '\t']);
4154    let work = trimmed.trim_start_matches([' ', '\t']);
4155    if work.is_empty() {
4156        return if leading_space {
4157            vec![Inline::Space]
4158        } else {
4159            Vec::new()
4160        };
4161    }
4162    let wrapped = format!("Z {work}");
4163    let inlines = parse_cell_text_inlines(&wrapped);
4164    let mut coalesced = coalesce_inlines(inlines);
4165    // Strip the leading `Z` sentinel + Space.
4166    if matches!(coalesced.first(), Some(Inline::Str(s)) if s == "Z") {
4167        coalesced.remove(0);
4168        if matches!(coalesced.first(), Some(Inline::Space)) {
4169            coalesced.remove(0);
4170        }
4171    }
4172    if leading_space {
4173        coalesced.insert(0, Inline::Space);
4174    }
4175    coalesced
4176}
4177
4178/// Tokenize raw input into the literal `[Inline]` payload that pandoc emits
4179/// as the second argument of `Cite`. This is a lossless representation of
4180/// the original bytes (including brackets, semicolons, `*`, `**`, etc.) —
4181/// no markup parsing, no smart-typography. Newlines become `SoftBreak`,
4182/// runs of spaces/tabs become a single `Space`.
4183fn literal_inlines(text: &str) -> Vec<Inline> {
4184    let mut out: Vec<Inline> = Vec::new();
4185    let mut buf = String::new();
4186    for ch in text.chars() {
4187        match ch {
4188            ' ' | '\t' => {
4189                if !buf.is_empty() {
4190                    out.push(Inline::Str(std::mem::take(&mut buf)));
4191                }
4192                if !matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4193                    out.push(Inline::Space);
4194                }
4195            }
4196            '\n' => {
4197                if !buf.is_empty() {
4198                    out.push(Inline::Str(std::mem::take(&mut buf)));
4199                }
4200                if matches!(out.last(), Some(Inline::Space)) {
4201                    out.pop();
4202                }
4203                out.push(Inline::SoftBreak);
4204            }
4205            _ => buf.push(ch),
4206        }
4207    }
4208    if !buf.is_empty() {
4209        out.push(Inline::Str(buf));
4210    }
4211    out
4212}
4213
4214fn push_token_inline(
4215    t: &rowan::SyntaxToken<crate::syntax::PanacheLanguage>,
4216    out: &mut Vec<Inline>,
4217) {
4218    match t.kind() {
4219        SyntaxKind::TEXT => push_text(t.text(), out),
4220        SyntaxKind::WHITESPACE => out.push(Inline::Space),
4221        SyntaxKind::NEWLINE => out.push(Inline::SoftBreak),
4222        SyntaxKind::HARD_LINE_BREAK => out.push(Inline::LineBreak),
4223        SyntaxKind::ESCAPED_CHAR => {
4224            // \x — keep just the escaped character as a Str
4225            let s: String = t.text().chars().skip(1).collect();
4226            out.push(Inline::Str(s));
4227        }
4228        SyntaxKind::NONBREAKING_SPACE => out.push(Inline::Str("\u{a0}".to_string())),
4229        // Skip structural tokens (markers, brackets, fence bytes) that don't
4230        // contribute to the inline stream.
4231        _ => {}
4232    }
4233}
4234
4235fn push_text(text: &str, out: &mut Vec<Inline>) {
4236    let mut buf = String::new();
4237    for ch in text.chars() {
4238        if ch == ' ' || ch == '\t' {
4239            if !buf.is_empty() {
4240                out.push(Inline::Str(std::mem::take(&mut buf)));
4241            }
4242            out.push(Inline::Space);
4243        } else if ch == '\n' {
4244            if !buf.is_empty() {
4245                out.push(Inline::Str(std::mem::take(&mut buf)));
4246            }
4247            out.push(Inline::SoftBreak);
4248        } else {
4249            buf.push(ch);
4250        }
4251    }
4252    if !buf.is_empty() {
4253        out.push(Inline::Str(buf));
4254    }
4255}
4256
4257fn inline_from_node(node: &SyntaxNode) -> Inline {
4258    match node.kind() {
4259        SyntaxKind::EMPHASIS => {
4260            Inline::Emph(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4261        }
4262        SyntaxKind::STRONG => {
4263            Inline::Strong(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4264        }
4265        SyntaxKind::STRIKEOUT => {
4266            Inline::Strikeout(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4267        }
4268        SyntaxKind::SUPERSCRIPT => {
4269            Inline::Superscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4270        }
4271        SyntaxKind::SUBSCRIPT => {
4272            Inline::Subscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4273        }
4274        SyntaxKind::INLINE_CODE => {
4275            let content: String = node
4276                .children_with_tokens()
4277                .filter_map(|el| el.into_token())
4278                .filter(|t| t.kind() == SyntaxKind::INLINE_CODE_CONTENT)
4279                .map(|t| t.text().to_string())
4280                .collect();
4281            Inline::Code(
4282                extract_attr_from_node(node),
4283                strip_inline_code_padding(&content),
4284            )
4285        }
4286        SyntaxKind::LINK | SyntaxKind::IMAGE_LINK | SyntaxKind::UNRESOLVED_REFERENCE => {
4287            // LINK / IMAGE_LINK / UNRESOLVED_REFERENCE render through
4288            // `push_inline_node` so reference resolution can emit
4289            // multiple inlines (resolved Link, or unresolved Str
4290            // fragments). This single-Inline path is unreachable;
4291            // emit Unsupported as a guard rather than silently
4292            // dropping.
4293            Inline::Unsupported(format!("{:?}", node.kind()))
4294        }
4295        SyntaxKind::AUTO_LINK => autolink_inline(node),
4296        SyntaxKind::INLINE_MATH => math_inline(node, "InlineMath"),
4297        SyntaxKind::DISPLAY_MATH => math_inline(node, "DisplayMath"),
4298        SyntaxKind::LATEX_COMMAND => latex_command_inline(node),
4299        SyntaxKind::BRACKETED_SPAN => bracketed_span_inline(node),
4300        SyntaxKind::INLINE_HTML_SPAN => inline_html_span_inline(node),
4301        SyntaxKind::INLINE_HTML => Inline::RawInline("html".to_string(), node.text().to_string()),
4302        SyntaxKind::FOOTNOTE_REFERENCE => footnote_reference_inline(node),
4303        SyntaxKind::INLINE_FOOTNOTE => inline_footnote_inline(node),
4304        other => Inline::Unsupported(format!("{other:?}")),
4305    }
4306}
4307
4308/// Inlines from a wrapper (Emph/Strong/...) where the structural markers are
4309/// child *nodes* (e.g. EMPHASIS_MARKER) rather than child tokens. We descend
4310/// through such marker children but skip their bytes.
4311fn inlines_from_marked(parent: &SyntaxNode) -> Vec<Inline> {
4312    let mut out = Vec::new();
4313    let mut iter = parent.children_with_tokens().peekable();
4314    while let Some(el) = iter.next() {
4315        match el {
4316            NodeOrToken::Token(t) => match t.kind() {
4317                SyntaxKind::EMPHASIS_MARKER
4318                | SyntaxKind::STRONG_MARKER
4319                | SyntaxKind::STRIKEOUT_MARKER
4320                | SyntaxKind::SUPERSCRIPT_MARKER
4321                | SyntaxKind::SUBSCRIPT_MARKER
4322                | SyntaxKind::MARK_MARKER => {}
4323                _ => push_token_inline(&t, &mut out),
4324            },
4325            NodeOrToken::Node(n) => match n.kind() {
4326                SyntaxKind::EMPHASIS_MARKER
4327                | SyntaxKind::STRONG_MARKER
4328                | SyntaxKind::STRIKEOUT_MARKER
4329                | SyntaxKind::SUPERSCRIPT_MARKER
4330                | SyntaxKind::SUBSCRIPT_MARKER
4331                | SyntaxKind::MARK_MARKER => {}
4332                _ if n.kind() == SyntaxKind::LATEX_COMMAND => {
4333                    emit_latex_command_with_absorb(&n, &mut iter, &mut out);
4334                }
4335                _ => push_inline_node(&n, &mut out),
4336            },
4337        }
4338    }
4339    out
4340}
4341
4342fn render_link_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
4343    let text_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT);
4344    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
4345    let has_dest_paren = node
4346        .children_with_tokens()
4347        .any(|el| matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::LINK_DEST_START));
4348
4349    if has_dest_paren {
4350        let text = text_node
4351            .as_ref()
4352            .map(|n| coalesce_inlines(inlines_from(n)))
4353            .unwrap_or_default();
4354        let (url, title) = dest_node
4355            .as_ref()
4356            .map(parse_link_dest)
4357            .unwrap_or((String::new(), String::new()));
4358        out.push(Inline::Link(extract_attr_from_node(node), text, url, title));
4359        return;
4360    }
4361
4362    // Reference-style link: shortcut [label], implicit [label][], or full
4363    // [text][ref]. Distinguish by presence/contents of LINK_REF.
4364    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
4365    let resolved_text_inlines = text_node
4366        .as_ref()
4367        .map(|n| coalesce_inlines(inlines_from(n)))
4368        .unwrap_or_default();
4369    let text_label = text_node
4370        .as_ref()
4371        .map(|n| n.text().to_string())
4372        .unwrap_or_default();
4373
4374    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
4375        Some(rn) => {
4376            let inner = rn.text().to_string();
4377            if inner.is_empty() {
4378                (text_label.clone(), true, String::new())
4379            } else {
4380                (inner.clone(), true, inner)
4381            }
4382        }
4383        None => (text_label.clone(), false, String::new()),
4384    };
4385
4386    if let Some((url, title)) = lookup_ref(&label) {
4387        out.push(Inline::Link(
4388            extract_attr_from_node(node),
4389            resolved_text_inlines,
4390            url,
4391            title,
4392        ));
4393        return;
4394    }
4395
4396    if let Some(id) = lookup_heading_id(&label) {
4397        let url = format!("#{id}");
4398        out.push(Inline::Link(
4399            extract_attr_from_node(node),
4400            resolved_text_inlines,
4401            url,
4402            String::new(),
4403        ));
4404        return;
4405    }
4406
4407    // Unresolved: emit the original markdown bytes as plain text. The reader
4408    // assembles `[<text>]`, optionally followed by `[<ref>]` for a full or
4409    // implicit reference. Using Str inlines here (rather than Link with empty
4410    // dest) matches pandoc's behavior of leaving unresolved references as raw
4411    // text in the output stream. Use keep_edges so leading/trailing whitespace
4412    // inside `[ ... ]` survives — pandoc preserves source whitespace for
4413    // unresolved references (`[ foo ]` → `Str "[", Space, Str "foo", Space,
4414    // Str "]"`), unlike resolved Links which strip edges.
4415    let unresolved_text_inlines = text_node
4416        .as_ref()
4417        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
4418        .unwrap_or_default();
4419    out.push(Inline::Str("[".to_string()));
4420    out.extend(unresolved_text_inlines);
4421    let suffix = if has_second_brackets {
4422        format!("][{second_inner}]")
4423    } else {
4424        "]".to_string()
4425    };
4426    out.push(Inline::Str(suffix));
4427}
4428
4429fn render_image_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
4430    let alt_node = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
4431    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
4432    let has_dest_paren = node.children_with_tokens().any(|el| {
4433        matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::IMAGE_DEST_START
4434            || t.kind() == SyntaxKind::LINK_DEST_START)
4435    });
4436
4437    if has_dest_paren {
4438        let alt = alt_node
4439            .as_ref()
4440            .map(|n| coalesce_inlines(inlines_from(n)))
4441            .unwrap_or_default();
4442        let (url, title) = dest_node
4443            .as_ref()
4444            .map(parse_link_dest)
4445            .unwrap_or((String::new(), String::new()));
4446        out.push(Inline::Image(extract_attr_from_node(node), alt, url, title));
4447        return;
4448    }
4449
4450    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
4451    let alt_inlines = alt_node
4452        .as_ref()
4453        .map(|n| coalesce_inlines(inlines_from(n)))
4454        .unwrap_or_default();
4455    let alt_label = alt_node
4456        .as_ref()
4457        .map(|n| n.text().to_string())
4458        .unwrap_or_default();
4459
4460    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
4461        Some(rn) => {
4462            let inner = rn.text().to_string();
4463            if inner.is_empty() {
4464                (alt_label.clone(), true, String::new())
4465            } else {
4466                (inner.clone(), true, inner)
4467            }
4468        }
4469        None => (alt_label.clone(), false, String::new()),
4470    };
4471
4472    if let Some((url, title)) = lookup_ref(&label) {
4473        out.push(Inline::Image(
4474            extract_attr_from_node(node),
4475            alt_inlines,
4476            url,
4477            title,
4478        ));
4479        return;
4480    }
4481
4482    if let Some(id) = lookup_heading_id(&label) {
4483        let url = format!("#{id}");
4484        out.push(Inline::Image(
4485            extract_attr_from_node(node),
4486            alt_inlines,
4487            url,
4488            String::new(),
4489        ));
4490        return;
4491    }
4492
4493    out.push(Inline::Str("![".to_string()));
4494    out.extend(alt_inlines);
4495    let suffix = if has_second_brackets {
4496        format!("][{second_inner}]")
4497    } else {
4498        "]".to_string()
4499    };
4500    out.push(Inline::Str(suffix));
4501}
4502
/// Normalize the raw content of an inline code span the way pandoc's
/// inline code reader (`Markdown.hs::code`) does.
///
/// Each internal `\n` is replaced by one space, then the result is trimmed
/// at both ends. Internal whitespace runs are preserved.
///
/// The trim removes only space, tab, CR and LF — the character set of
/// pandoc's `trim`. `str::trim` would be wrong here: it strips every
/// Unicode `White_Space` character, so a code span padded with e.g. U+00A0
/// (no-break space) would lose content that pandoc keeps.
fn strip_inline_code_padding(s: &str) -> String {
    s.replace('\n', " ")
        .trim_matches([' ', '\t', '\r', '\n'])
        .to_string()
}
4511
4512fn math_inline(node: &SyntaxNode, kind: &'static str) -> Inline {
4513    // The raw math content lives in a `MATH_CONTENT` subtree (a structural TeX
4514    // CST); reconstruct it excluding any host container prefixes interleaved on
4515    // continuation lines (e.g. blockquote `>`).
4516    let content = crate::syntax::math::math_content_text(node);
4517    Inline::Math(kind, content)
4518}
4519
4520fn autolink_inline(node: &SyntaxNode) -> Inline {
4521    let mut url = String::new();
4522    for el in node.children_with_tokens() {
4523        if let NodeOrToken::Token(t) = el
4524            && t.kind() == SyntaxKind::TEXT
4525        {
4526            url.push_str(t.text());
4527        }
4528    }
4529    // Pandoc treats `<foo@bar>` as an email autolink (class "email", `mailto:`
4530    // dest) when the body has no scheme but contains an `@`.
4531    let is_email = !url.contains("://") && !url.starts_with("mailto:") && url.contains('@');
4532    if is_email {
4533        let attr = Attr {
4534            id: String::new(),
4535            classes: vec!["email".to_string()],
4536            kvs: Vec::new(),
4537        };
4538        let dest = format!("mailto:{url}");
4539        return Inline::Link(attr, vec![Inline::Str(url)], dest, String::new());
4540    }
4541    // Pandoc only treats `<scheme:body>` as a URI autolink when `scheme` is
4542    // in its known-schemes allowlist (see pandoc/src/Text/Pandoc/URI.hs).
4543    // Otherwise the original `<...>` bytes are emitted as raw HTML.
4544    if !is_known_uri_scheme(&url) {
4545        return Inline::RawInline("html".to_string(), node.text().to_string());
4546    }
4547    let attr = Attr {
4548        id: String::new(),
4549        classes: vec!["uri".to_string()],
4550        kvs: Vec::new(),
4551    };
4552    Inline::Link(attr, vec![Inline::Str(url.clone())], url, String::new())
4553}
4554
4555/// Pandoc's URI scheme allowlist (IANA + a few unofficial ones). Mirrors
4556/// `pandoc/src/Text/Pandoc/URI.hs`. Lowercase comparison.
4557fn is_known_uri_scheme(url: &str) -> bool {
4558    let scheme_end = url.find(':');
4559    let Some(end) = scheme_end else {
4560        return false;
4561    };
4562    let scheme = url[..end].to_ascii_lowercase();
4563    PANDOC_KNOWN_SCHEMES.binary_search(&scheme.as_str()).is_ok()
4564}
4565
/// Pandoc-known URI schemes, mirroring the `schemes` set in
/// `pandoc/src/Text/Pandoc/URI.hs`.
///
/// The slice is searched with `binary_search`, so it must stay sorted in
/// `str` comparison order (byte-wise: `-` < `.` < digits < lowercase
/// letters) and all entries must be lowercase. Entries are laid out by
/// initial letter; the `ms-*` family gets its own rows.
#[rustfmt::skip]
const PANDOC_KNOWN_SCHEMES: &[&str] = &[
    // a
    "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra",
    "afp", "afs", "aim", "appdata", "apt", "attachment", "aw",
    // b
    "barion", "beshare", "bitcoin", "blob", "bolo", "browserext",
    // c
    "callto", "cap", "chrome", "chrome-extension", "cid", "coap",
    "coaps", "com-eventbrite-attendee", "content", "crid", "cvs",
    // d
    "data", "dav", "dict", "dis", "dlna-playcontainer", "dlna-playsingle",
    "dns", "dntp", "doi", "dtn", "dvb",
    // e
    "ed2k", "example",
    // f
    "facetime", "fax", "feed", "feedready", "file", "filesystem",
    "finger", "fish", "ftp",
    // g
    "gemini", "geo", "gg", "git", "gizmoproject", "go",
    "gopher", "graph", "gtalk",
    // h
    "h323", "ham", "hcp", "http", "https", "hxxp", "hxxps", "hydrazone",
    // i
    "iax", "icap", "icon", "im", "imap", "info", "iotdisco",
    "ipn", "ipp", "ipps", "irc", "irc6", "ircs", "iris",
    "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs", "isbn", "isostore", "itms",
    // j
    "jabber", "jar", "javascript", "jms",
    // k
    "keyparc",
    // l
    "lastfm", "ldap", "ldaps", "lvlt",
    // m (before the `ms-*` family)
    "magnet", "mailserver", "mailto", "maps", "market", "message",
    "mid", "mms", "modem", "mongodb", "moz",
    // ms-*
    "ms-access", "ms-browser-extension", "ms-drive-to", "ms-enrollment",
    "ms-excel", "ms-gamebarservices", "ms-getoffice", "ms-help",
    "ms-infopath", "ms-media-stream-id", "ms-officeapp", "ms-powerpoint",
    "ms-project", "ms-publisher", "ms-search-repair",
    "ms-secondary-screen-controller", "ms-secondary-screen-setup",
    "ms-settings", "ms-settings-airplanemode", "ms-settings-bluetooth",
    "ms-settings-camera", "ms-settings-cellular", "ms-settings-cloudstorage",
    "ms-settings-connectabledevices", "ms-settings-displays-topology",
    "ms-settings-emailandaccounts", "ms-settings-language",
    "ms-settings-location", "ms-settings-lock", "ms-settings-nfctransactions",
    "ms-settings-notifications", "ms-settings-power", "ms-settings-privacy",
    "ms-settings-proximity", "ms-settings-screenrotation", "ms-settings-wifi",
    "ms-settings-workplace", "ms-spd", "ms-sttoverlay", "ms-transit-to",
    "ms-virtualtouchpad", "ms-visio", "ms-walk-to", "ms-whiteboard",
    "ms-whiteboard-cmd", "ms-word",
    // m (after the `ms-*` family)
    "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn",
    // n
    "news", "nfs", "ni", "nih", "nntp", "notes",
    // o
    "ocf", "oid", "onenote", "onenote-cmd", "opaquelocktoken",
    // p
    "pack", "palm", "paparazzi", "pkcs11", "platform", "pmid",
    "pop", "pres", "prospero", "proxy", "psyc", "pwid",
    // q
    "qb", "query",
    // r
    "redis", "rediss", "reload", "res", "resource", "rmi",
    "rsync", "rtmfp", "rtmp", "rtsp", "rtsps", "rtspu",
    // s
    "secondlife", "service", "session", "sftp", "sgn", "shttp",
    "sieve", "sip", "sips", "skype", "smb", "sms", "smtp",
    "snews", "snmp", "soap.beep", "soap.beeps", "soldat", "spotify",
    "ssh", "steam", "stun", "stuns", "submit", "svn",
    // t
    "tag", "teamspeak", "tel", "teliaeid", "telnet", "tftp", "things",
    "thismessage", "tip", "tn3270", "tool", "turn", "turns", "tv",
    // u
    "udp", "unreal", "urn", "ut2004",
    // v
    "v-event", "vemmi", "ventrilo", "videotex", "view-source", "vnc",
    // w
    "wais", "webcal", "wpid", "ws", "wss", "wtai", "wyciwyg",
    // x
    "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri",
    // y
    "ymsgr",
    // z
    "z39.50", "z39.50r", "z39.50s",
];
4616
4617fn footnote_reference_inline(node: &SyntaxNode) -> Inline {
4618    let Some(label) = footnote_label(node) else {
4619        return Inline::Unsupported("FOOTNOTE_REFERENCE".to_string());
4620    };
4621    let blocks = REFS_CTX.with(|c| {
4622        c.borrow()
4623            .footnotes
4624            .get(&label)
4625            .map(|bs| bs.iter().map(clone_block).collect::<Vec<_>>())
4626    });
4627    match blocks {
4628        Some(bs) => Inline::Note(bs),
4629        // Unresolved footnote reference: pandoc emits the original bytes as
4630        // text rather than a `Note []`. Keep the raw token text for now.
4631        None => Inline::Str(node.text().to_string()),
4632    }
4633}
4634
4635fn inline_footnote_inline(node: &SyntaxNode) -> Inline {
4636    let inlines = coalesce_inlines(inlines_from(node));
4637    if inlines.is_empty() {
4638        Inline::Note(Vec::new())
4639    } else {
4640        Inline::Note(vec![Block::Para(inlines)])
4641    }
4642}
4643
4644fn parse_link_dest(node: &SyntaxNode) -> (String, String) {
4645    // LINK_DEST holds the raw bytes between `(` and `)`. Split into URL and
4646    // optional quoted title, then percent-escape unsafe characters in the URL
4647    // to match pandoc's `escapeURI`.
4648    let raw = node.text().to_string();
4649    let trimmed = raw.trim();
4650    // `<URL>` form: pandoc strips the angle brackets, even if the URL
4651    // contains otherwise-ambiguous characters like spaces or parens.
4652    if let Some(rest) = trimmed.strip_prefix('<')
4653        && let Some(end) = rest.find('>')
4654    {
4655        let url = &rest[..end];
4656        let after = rest[end + 1..].trim();
4657        let title = parse_dest_title(after);
4658        return (escape_link_dest(url), title);
4659    }
4660    // URL/title boundary: a title starts with `"`, `'`, or `(` after
4661    // whitespace. Without one, the entire string is the URL — internal
4662    // spaces still get percent-escaped.
4663    let bytes = trimmed.as_bytes();
4664    let mut url_end = trimmed.len();
4665    let mut i = 0;
4666    while i < bytes.len() {
4667        if matches!(bytes[i], b' ' | b'\t' | b'\n') {
4668            let mut j = i;
4669            while j < bytes.len() && matches!(bytes[j], b' ' | b'\t' | b'\n') {
4670                j += 1;
4671            }
4672            if j < bytes.len() && matches!(bytes[j], b'"' | b'\'' | b'(') {
4673                url_end = i;
4674                break;
4675            }
4676            i = j;
4677        } else {
4678            i += 1;
4679        }
4680    }
4681    let url_raw = &trimmed[..url_end];
4682    let title = parse_dest_title(trimmed[url_end..].trim());
4683    (escape_link_dest(url_raw), title)
4684}
4685
/// Percent-escapes the characters of a link destination that must not appear
/// bare, intended to mirror pandoc's `escapeURI`.
///
/// Escaped characters:
/// - anything `char::is_whitespace` accepts — this includes non-ASCII
///   whitespace such as U+00A0, not just ASCII space/tab/newline;
/// - the punctuation `<>|"{}[]^` and the backtick.
///
/// Each escaped character is emitted as one `%XX` triple (uppercase hex) per
/// byte of its UTF-8 encoding. Every other character, including non-ASCII
/// non-whitespace, is copied through unchanged.
fn escape_link_dest(s: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        let needs_escape = ch.is_whitespace()
            || matches!(
                ch,
                '<' | '>' | '|' | '"' | '{' | '}' | '[' | ']' | '^' | '`'
            );
        if !needs_escape {
            out.push(ch);
            continue;
        }
        let mut buf = [0u8; 4];
        for &byte in ch.encode_utf8(&mut buf).as_bytes() {
            // Push the two hex digits directly instead of allocating a
            // temporary `String` through `format!` for every byte.
            out.push('%');
            out.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            out.push(char::from(HEX_UPPER[usize::from(byte & 0x0f)]));
        }
    }
    out
}
4708
/// Extracts the title from the text that follows a link URL.
///
/// `s` is expected to be already trimmed. The first byte selects the
/// delimiter pair: `"…"`, `'…'`, or `(…)`. The title is everything between
/// that opener and the *last* occurrence of the matching closer, so closers
/// embedded in the title are kept and any text after the final closer is
/// ignored.
///
/// Returns an empty string when `s` is empty, does not start with a title
/// opener, or has no closer after the opener.
fn parse_dest_title(s: &str) -> String {
    let close = match s.as_bytes().first() {
        Some(b'"') => '"',
        Some(b'\'') => '\'',
        Some(b'(') => ')',
        _ => return String::new(),
    };
    // The opener is a single ASCII byte, so index 1 is a char boundary.
    let inner = &s[1..];
    inner
        .rfind(close)
        .map(|end| inner[..end].to_string())
        .unwrap_or_default()
}
4728
4729// ----- coalescing & helpers ----------------------------------------------
4730
4731fn coalesce_inlines(input: Vec<Inline>) -> Vec<Inline> {
4732    coalesce_inlines_inner(input, true)
4733}
4734
4735/// Inside markup atoms (Emph/Strong/Strikeout/Sup/Sub), pandoc preserves
4736/// leading/trailing whitespace inside the wrapper — e.g. `*foo bar *` projects
4737/// as `Emph [Str "foo", Space, Str "bar", Space]`. Block-level paragraphs and
4738/// headers strip edge whitespace, but inline markup wrappers do not.
4739fn coalesce_inlines_keep_edges(input: Vec<Inline>) -> Vec<Inline> {
4740    coalesce_inlines_inner(input, false)
4741}
4742
4743fn coalesce_inlines_inner(input: Vec<Inline>, trim_edges: bool) -> Vec<Inline> {
4744    let mut out: Vec<Inline> = Vec::with_capacity(input.len());
4745    for inline in input {
4746        if let Inline::Str(s) = inline {
4747            if let Some(Inline::Str(prev)) = out.last_mut() {
4748                prev.push_str(&s);
4749            } else {
4750                out.push(Inline::Str(s));
4751            }
4752        } else if let Inline::Space = inline {
4753            // Collapse runs of Space into a single Space; pandoc never emits
4754            // two consecutive Space tokens.
4755            if matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4756                continue;
4757            }
4758            out.push(Inline::Space);
4759        } else if let Inline::SoftBreak = inline {
4760            // SoftBreak after Space: drop the trailing Space to match pandoc
4761            // (line-end whitespace is not preserved as Space).
4762            if matches!(out.last(), Some(Inline::Space)) {
4763                out.pop();
4764            }
4765            out.push(Inline::SoftBreak);
4766        } else {
4767            out.push(inline);
4768        }
4769    }
4770    if trim_edges {
4771        // Trim leading/trailing Space/SoftBreak — pandoc does not emit edge
4772        // whitespace inside a paragraph or header.
4773        while matches!(out.first(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4774            out.remove(0);
4775        }
4776        while matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4777            out.pop();
4778        }
4779    }
4780    // Pandoc's `smart` extension is on by default for markdown. Apply the
4781    // simple in-Str substitutions here (apostrophe, dashes, ellipsis), then
4782    // restructure paired straight quotes into `Quoted` nodes.
4783    for inline in out.iter_mut() {
4784        if let Inline::Str(s) = inline {
4785            let mut t = smart_intraword_apostrophe(s);
4786            t = smart_dashes_and_ellipsis(&t);
4787            *s = t;
4788        }
4789    }
4790    let out = smart_quote_pairs(out);
4791    apply_abbreviations(out)
4792}
4793
/// Pandoc's default abbreviation list (from `pandoc/data/abbreviations`).
///
/// Matching is by *suffix*, not whole-token equality: a `Str` matches when it
/// ends with one of these entries and the character just before the entry is
/// either absent (the entry is the whole `Str`) or is neither alphanumeric
/// nor `.` — see `matches_abbreviation_suffix`. When such a `Str` is followed
/// by a `Space`, `apply_abbreviations` replaces the space with a non-breaking
/// space appended to the `Str`.
///
/// The list is kept in ascending byte order (uppercase entries first). The
/// suffix matcher scans it linearly, so the ordering is a convention that
/// would permit `binary_search` for exact lookups rather than something the
/// matcher depends on.
const PANDOC_ABBREVIATIONS: &[&str] = &[
    "Apr.", "Aug.", "Bros.", "Capt.", "Co.", "Corp.", "Dec.", "Dr.", "Feb.", "Fr.", "Gen.", "Gov.",
    "Hon.", "Inc.", "Jan.", "Jr.", "Jul.", "Jun.", "Ltd.", "M.A.", "M.D.", "Mar.", "Mr.", "Mrs.",
    "Ms.", "No.", "Nov.", "Oct.", "Ph.D.", "Pres.", "Prof.", "Rep.", "Rev.", "Sen.", "Sep.",
    "Sept.", "Sgt.", "Sr.", "St.", "aet.", "aetat.", "al.", "bk.", "c.", "cf.", "ch.", "chap.",
    "chs.", "col.", "cp.", "d.", "e.g.", "ed.", "eds.", "esp.", "f.", "fasc.", "ff.", "fig.",
    "fl.", "fol.", "fols.", "i.e.", "ill.", "incl.", "n.", "n.b.", "nn.", "p.", "pp.", "pt.",
    "q.v.", "s.v.", "s.vv.", "saec.", "sec.", "univ.", "viz.", "vol.", "vs.",
];
4808
4809fn matches_abbreviation_suffix(s: &str) -> bool {
4810    for &abbr in PANDOC_ABBREVIATIONS {
4811        if let Some(prefix) = s.strip_suffix(abbr) {
4812            if prefix.is_empty() {
4813                return true;
4814            }
4815            let last = prefix.chars().next_back().unwrap();
4816            if !last.is_alphanumeric() && last != '.' {
4817                return true;
4818            }
4819        }
4820    }
4821    false
4822}
4823
4824/// Apply pandoc's `+abbreviations` extension as a post-pass over a flat inline
4825/// list. For each `Str` ending in a known abbreviation followed by `Space`,
4826/// drop the `Space`, append `\u{a0}` (NBSP) to the `Str`, and merge the
4827/// following `Str` (if any) into it. Recurses into `Quoted` content because
4828/// `Quoted` is built inside `smart_quote_pairs` after the parent
4829/// `coalesce_inlines_inner` already ran on its source list, so its content
4830/// won't have been abbreviation-processed yet. Other inline wrappers (`Emph`,
4831/// `Strong`, `Link`, `Image`, `Note`, …) are constructed via their own
4832/// `coalesce_inlines_*` call, so their contents are already processed.
4833fn apply_abbreviations(inlines: Vec<Inline>) -> Vec<Inline> {
4834    let inlines: Vec<Inline> = inlines
4835        .into_iter()
4836        .map(|inline| match inline {
4837            Inline::Quoted(kind, content) => Inline::Quoted(kind, apply_abbreviations(content)),
4838            other => other,
4839        })
4840        .collect();
4841    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
4842    let mut iter = inlines.into_iter().peekable();
4843    while let Some(inline) = iter.next() {
4844        if let Inline::Str(ref s) = inline
4845            && matches_abbreviation_suffix(s)
4846            && matches!(iter.peek(), Some(Inline::Space))
4847        {
4848            // Drop the Space.
4849            iter.next();
4850            let Inline::Str(mut new_s) = inline else {
4851                unreachable!()
4852            };
4853            new_s.push('\u{a0}');
4854            // Merge with the following Str if present.
4855            if let Some(Inline::Str(_)) = iter.peek()
4856                && let Some(Inline::Str(next_s)) = iter.next()
4857            {
4858                new_s.push_str(&next_s);
4859            }
4860            out.push(Inline::Str(new_s));
4861        } else {
4862            out.push(inline);
4863        }
4864    }
4865    out
4866}
4867
/// Restructures paired straight quotes in a flat inline list into
/// `Inline::Quoted` nodes.
///
/// Returns a new list in which every detected open/close pair — together with
/// the inlines between them — is replaced by one `Quoted` whose kind is
/// `"DoubleQuote"` for `"` and `"SingleQuote"` for `'`. The quote characters
/// themselves are stripped from the first and last `Str` of the span, and a
/// `Str` left empty by that stripping is dropped. Inlines that are not part
/// of a pair are copied through with `clone_inline`.
fn smart_quote_pairs(inlines: Vec<Inline>) -> Vec<Inline> {
    // Walk left-to-right, when a Str starts with a straight quote and the
    // previous element is a "boundary" (None/Space/SoftBreak/LineBreak), look
    // ahead for a matching close quote (Str ending with same quote char,
    // followed by a boundary). Wrap the inlines in between in a `Quoted` node.
    // Only handle quotes at Str boundaries; embedded or interleaved quotes are
    // not restructured (kept as-is) — pandoc has more nuanced rules but this
    // covers the common natural-text patterns in the corpus.

    // A quote may open after: nothing, a whitespace/break inline, or a Str
    // whose last character is not alphanumeric. Any other preceding inline
    // (including an already-built `Quoted`) is not a boundary.
    fn is_boundary(prev: Option<&Inline>) -> bool {
        match prev {
            None => true,
            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
            Some(Inline::Str(s)) => s.chars().last().is_some_and(|c| !c.is_alphanumeric()),
            _ => false,
        }
    }
    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
    let n = inlines.len();
    // `consumed[k]` is set once `inlines[k]` has been emitted, either copied
    // through or folded into a `Quoted`.
    let mut consumed = vec![false; n];
    for i in 0..n {
        if consumed[i] {
            continue;
        }
        // Try to detect an open quote at position i.
        let Inline::Str(s) = &inlines[i] else {
            out.push(clone_inline(&inlines[i]));
            consumed[i] = true;
            continue;
        };
        let first = s.chars().next();
        let quote = match first {
            Some('"') => Some('"'),
            Some('\'') => Some('\''),
            _ => None,
        };
        // Open quote condition: previous inline is boundary, AND either
        // (a) the Str has more chars after the quote and the next char is
        //     non-space (open quote attaches to a word in the same Str), or
        // (b) the Str is *only* the quote and the next inline is a markup
        //     atom (Emph/Strong/...), so the quote attaches across atoms.
        // The boundary test looks at `out`, i.e. at what has already been
        // emitted, not at the raw input.
        let prev_is_boundary = is_boundary(out.last());
        let str_has_more = s.chars().count() > 1;
        let next_char_is_word = s.chars().nth(1).is_some_and(|c| !c.is_whitespace());
        let next_is_markup_atom = matches!(
            inlines.get(i + 1),
            Some(
                Inline::Emph(_)
                    | Inline::Strong(_)
                    | Inline::Strikeout(_)
                    | Inline::Superscript(_)
                    | Inline::Subscript(_)
                    | Inline::Code(_, _)
            )
        );
        let attaches =
            (str_has_more && next_char_is_word) || (!str_has_more && next_is_markup_atom);
        if let Some(q) = quote
            && prev_is_boundary
            && attaches
        {
            // Find the matching close.
            if let Some(close_idx) = find_matching_close(&inlines, i, q, &consumed) {
                // Build content: inlines from i to close_idx (inclusive),
                // strip the leading quote from inlines[i] and trailing quote
                // from inlines[close_idx].
                let kind = if q == '"' {
                    "DoubleQuote"
                } else {
                    "SingleQuote"
                };
                let mut content: Vec<Inline> = Vec::new();
                for j in i..=close_idx {
                    if consumed[j] {
                        continue;
                    }
                    let inline = &inlines[j];
                    if j == i && j == close_idx {
                        // Open and close in the same Str — strip both ends.
                        if let Inline::Str(s) = inline {
                            let mut chars: Vec<char> = s.chars().collect();
                            if chars.len() >= 2 {
                                chars.remove(0);
                                chars.pop();
                            }
                            let stripped: String = chars.into_iter().collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == i {
                        // Opening Str: drop the leading quote character.
                        if let Inline::Str(s) = inline {
                            let stripped: String = s.chars().skip(1).collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == close_idx {
                        // Closing Str: drop the trailing quote character.
                        if let Inline::Str(s) = inline {
                            let mut stripped: String = s.chars().collect();
                            stripped.pop();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else {
                        content.push(clone_inline(inline));
                    }
                    consumed[j] = true;
                }
                out.push(Inline::Quoted(kind, content));
                continue;
            }
        }
        // Not an opening quote (or no matching close found): copy through.
        out.push(clone_inline(&inlines[i]));
        consumed[i] = true;
    }
    out
}
4986
4987fn find_matching_close(
4988    inlines: &[Inline],
4989    open_idx: usize,
4990    quote: char,
4991    consumed: &[bool],
4992) -> Option<usize> {
4993    // First check: same Str ends with the matching quote (close in same Str).
4994    if let Inline::Str(s) = &inlines[open_idx]
4995        && s.chars().count() >= 3
4996        && s.ends_with(quote)
4997    {
4998        // Need to confirm the next inline (after this Str) is a boundary.
4999        let next = inlines.get(open_idx + 1);
5000        let after_is_boundary = match next {
5001            None => true,
5002            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
5003            Some(Inline::Str(s)) => s.chars().next().is_some_and(|c| !c.is_alphanumeric()),
5004            _ => false,
5005        };
5006        if after_is_boundary {
5007            return Some(open_idx);
5008        }
5009    }
5010    // Otherwise, scan forward for a Str ending with the quote and followed by
5011    // a boundary.
5012    let n = inlines.len();
5013    let mut j = open_idx + 1;
5014    while j < n {
5015        if consumed[j] {
5016            return None;
5017        }
5018        match &inlines[j] {
5019            Inline::Str(s) => {
5020                if s.ends_with(quote) {
5021                    let next = inlines.get(j + 1);
5022                    let after_is_boundary = match next {
5023                        None => true,
5024                        Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
5025                        Some(Inline::Str(s)) => {
5026                            s.chars().next().is_some_and(|c| !c.is_alphanumeric())
5027                        }
5028                        _ => false,
5029                    };
5030                    if after_is_boundary {
5031                        return Some(j);
5032                    }
5033                }
5034            }
5035            Inline::Space | Inline::SoftBreak | Inline::LineBreak => {}
5036            // Don't span over markup atoms — keep search cheap and predictable.
5037            _ => {}
5038        }
5039        j += 1;
5040        // Cap search range — natural quoted spans are short.
5041        if j - open_idx > 32 {
5042            return None;
5043        }
5044    }
5045    None
5046}
5047
/// Builds a deep copy of an inline by reconstructing it variant by variant.
///
/// Nested inline lists are copied recursively through `clone_inline`, and the
/// blocks inside a `Note` through `clone_block`. Owned payloads (strings,
/// attributes, …) are duplicated with `.clone()`; the `Math` and `Quoted`
/// kinds and the `Citation` scalar fields (`mode`, `note_num`, `hash`) are
/// passed through as-is. The match is exhaustive on purpose, so adding an
/// `Inline` variant forces this function to be updated.
fn clone_inline(inline: &Inline) -> Inline {
    match inline {
        Inline::Str(s) => Inline::Str(s.clone()),
        Inline::Space => Inline::Space,
        Inline::SoftBreak => Inline::SoftBreak,
        Inline::LineBreak => Inline::LineBreak,
        Inline::Emph(c) => Inline::Emph(c.iter().map(clone_inline).collect()),
        Inline::Strong(c) => Inline::Strong(c.iter().map(clone_inline).collect()),
        Inline::Strikeout(c) => Inline::Strikeout(c.iter().map(clone_inline).collect()),
        Inline::Superscript(c) => Inline::Superscript(c.iter().map(clone_inline).collect()),
        Inline::Subscript(c) => Inline::Subscript(c.iter().map(clone_inline).collect()),
        Inline::Code(a, s) => Inline::Code(a.clone(), s.clone()),
        Inline::Link(a, t, u, ti) => Inline::Link(
            a.clone(),
            t.iter().map(clone_inline).collect(),
            u.clone(),
            ti.clone(),
        ),
        Inline::Image(a, t, u, ti) => Inline::Image(
            a.clone(),
            t.iter().map(clone_inline).collect(),
            u.clone(),
            ti.clone(),
        ),
        Inline::Math(k, c) => Inline::Math(k, c.clone()),
        Inline::Span(a, c) => Inline::Span(a.clone(), c.iter().map(clone_inline).collect()),
        Inline::RawInline(f, c) => Inline::RawInline(f.clone(), c.clone()),
        Inline::Quoted(k, c) => Inline::Quoted(k, c.iter().map(clone_inline).collect()),
        Inline::Note(blocks) => Inline::Note(blocks.iter().map(clone_block).collect()),
        // Citations are rebuilt field by field; prefix and suffix are inline
        // lists and therefore copied recursively.
        Inline::Cite(citations, text) => Inline::Cite(
            citations
                .iter()
                .map(|c| Citation {
                    id: c.id.clone(),
                    prefix: c.prefix.iter().map(clone_inline).collect(),
                    suffix: c.suffix.iter().map(clone_inline).collect(),
                    mode: c.mode,
                    note_num: c.note_num,
                    hash: c.hash,
                })
                .collect(),
            text.iter().map(clone_inline).collect(),
        ),
        Inline::Unsupported(s) => Inline::Unsupported(s.clone()),
    }
}
5094
/// Builds a deep copy of a block by reconstructing it variant by variant.
///
/// Inline content is copied through `clone_inline` and nested blocks through
/// `clone_block`; attributes and strings are duplicated with `.clone()`.
///
/// NOTE: `Block::Table` is *not* copied. A table is replaced by
/// `Block::Unsupported("Table")`, so any caller that clones blocks (for
/// example when expanding a footnote body) loses table content.
fn clone_block(b: &Block) -> Block {
    match b {
        Block::Para(c) => Block::Para(c.iter().map(clone_inline).collect()),
        Block::Plain(c) => Block::Plain(c.iter().map(clone_inline).collect()),
        Block::Header(lvl, a, c) => {
            Block::Header(*lvl, a.clone(), c.iter().map(clone_inline).collect())
        }
        Block::BlockQuote(blocks) => Block::BlockQuote(blocks.iter().map(clone_block).collect()),
        Block::CodeBlock(a, s) => Block::CodeBlock(a.clone(), s.clone()),
        Block::HorizontalRule => Block::HorizontalRule,
        // Lists: each item is itself a list of blocks.
        Block::BulletList(items) => Block::BulletList(
            items
                .iter()
                .map(|item| item.iter().map(clone_block).collect())
                .collect(),
        ),
        Block::OrderedList(start, style, delim, items) => Block::OrderedList(
            *start,
            style,
            delim,
            items
                .iter()
                .map(|item| item.iter().map(clone_block).collect())
                .collect(),
        ),
        Block::RawBlock(f, c) => Block::RawBlock(f.clone(), c.clone()),
        // Table data is dropped rather than copied; see the function docs.
        Block::Table(_) => Block::Unsupported("Table".to_string()),
        Block::Div(a, blocks) => Block::Div(a.clone(), blocks.iter().map(clone_block).collect()),
        Block::LineBlock(lines) => Block::LineBlock(
            lines
                .iter()
                .map(|line| line.iter().map(clone_inline).collect())
                .collect(),
        ),
        // Each definition-list item pairs a term (inlines) with a list of
        // definitions, each definition being a list of blocks.
        Block::DefinitionList(items) => Block::DefinitionList(
            items
                .iter()
                .map(|(term, defs)| {
                    (
                        term.iter().map(clone_inline).collect(),
                        defs.iter()
                            .map(|d| d.iter().map(clone_block).collect())
                            .collect(),
                    )
                })
                .collect(),
        ),
        Block::Figure(a, caption, body) => Block::Figure(
            a.clone(),
            caption.iter().map(clone_block).collect(),
            body.iter().map(clone_block).collect(),
        ),
        Block::Unsupported(s) => Block::Unsupported(s.clone()),
    }
}
5150
5151fn smart_dashes_and_ellipsis(s: &str) -> String {
5152    if !s.contains(['-', '.']) {
5153        return s.to_string();
5154    }
5155    let bytes = s.as_bytes();
5156    let mut out = String::with_capacity(s.len());
5157    let mut i = 0usize;
5158    while i < bytes.len() {
5159        if bytes[i] == b'-' {
5160            if i + 2 < bytes.len() && bytes[i + 1] == b'-' && bytes[i + 2] == b'-' {
5161                out.push('\u{2014}');
5162                i += 3;
5163                continue;
5164            }
5165            if i + 1 < bytes.len() && bytes[i + 1] == b'-' {
5166                out.push('\u{2013}');
5167                i += 2;
5168                continue;
5169            }
5170        }
5171        if bytes[i] == b'.' && i + 2 < bytes.len() && bytes[i + 1] == b'.' && bytes[i + 2] == b'.' {
5172            out.push('\u{2026}');
5173            i += 3;
5174            continue;
5175        }
5176        // Read one UTF-8 char.
5177        let len = utf8_char_len(bytes[i]);
5178        out.push_str(&s[i..i + len]);
5179        i += len;
5180    }
5181    out
5182}
5183
/// Length in bytes of the UTF-8 sequence introduced by lead byte `b`.
///
/// ASCII bytes give 1. Continuation bytes (`0x80..0xc0`), which are not valid
/// lead bytes, also give 1 so that a caller can step over them one byte at a
/// time and recover.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0xbf => 1,
        0xc0..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
5196
5197fn smart_intraword_apostrophe(s: &str) -> String {
5198    if !s.contains('\'') {
5199        return s.to_string();
5200    }
5201    let chars: Vec<char> = s.chars().collect();
5202    let mut out = String::with_capacity(s.len());
5203    for (i, &c) in chars.iter().enumerate() {
5204        if c == '\'' {
5205            let prev = i.checked_sub(1).map(|j| chars[j]);
5206            let next = chars.get(i + 1).copied();
5207            let prev_word = prev.is_some_and(is_word_char);
5208            let next_word = next.is_some_and(is_word_char);
5209            if prev_word && next_word {
5210                out.push('\u{2019}');
5211                continue;
5212            }
5213        }
5214        out.push(c);
5215    }
5216    out
5217}
5218
/// Word-character test used by the smart-apostrophe pass: true exactly for
/// Unicode alphanumeric characters.
fn is_word_char(c: char) -> bool {
    char::is_alphanumeric(c)
}
5222
5223fn inlines_to_plaintext(inlines: &[Inline]) -> String {
5224    let mut s = String::new();
5225    for i in inlines {
5226        match i {
5227            Inline::Str(t) => s.push_str(t),
5228            Inline::Space | Inline::SoftBreak => s.push(' '),
5229            Inline::LineBreak => s.push(' '),
5230            Inline::Emph(children)
5231            | Inline::Strong(children)
5232            | Inline::Strikeout(children)
5233            | Inline::Superscript(children)
5234            | Inline::Subscript(children) => s.push_str(&inlines_to_plaintext(children)),
5235            Inline::Code(_, c) => s.push_str(c),
5236            Inline::Link(_, alt, _, _) | Inline::Image(_, alt, _, _) => {
5237                s.push_str(&inlines_to_plaintext(alt))
5238            }
5239            Inline::Math(_, c) => s.push_str(c),
5240            Inline::Span(_, children) => s.push_str(&inlines_to_plaintext(children)),
5241            Inline::RawInline(_, _) => {}
5242            Inline::Quoted(_, children) => s.push_str(&inlines_to_plaintext(children)),
5243            Inline::Note(_) => {}
5244            Inline::Cite(_, text) => s.push_str(&inlines_to_plaintext(text)),
5245            Inline::Unsupported(_) => {}
5246        }
5247    }
5248    s
5249}
5250
/// Turns heading text into an identifier slug.
///
/// Mirrors `crates/panache-formatter::utils::pandoc_slugify` so the
/// parser-side projector doesn't need to depend on the formatter crate; the
/// two must be kept in sync.
///
/// Rules, applied per character:
/// - whitespace becomes a single `-`, unless the slug is still empty or the
///   previous emitted character was already `-`;
/// - other characters are lowercased, and kept only if alphanumeric or one of
///   `_`, `-`, `.`;
/// - trailing `-` characters are removed at the end.
fn pandoc_slugify(text: &str) -> String {
    let mut slug = String::new();
    let mut last_was_dash = false;
    for ch in text.chars() {
        if !ch.is_whitespace() {
            let kept = ch
                .to_lowercase()
                .filter(|c| c.is_alphanumeric() || matches!(c, '_' | '-' | '.'));
            for lowered in kept {
                slug.push(lowered);
                last_was_dash = lowered == '-';
            }
        } else if !slug.is_empty() && !last_was_dash {
            slug.push('-');
            last_was_dash = true;
        }
    }
    let kept_len = slug.trim_end_matches('-').len();
    slug.truncate(kept_len);
    slug
}
5276
5277impl Attr {
5278    fn with_id(id: String) -> Self {
5279        Self {
5280            id,
5281            classes: Vec::new(),
5282            kvs: Vec::new(),
5283        }
5284    }
5285}
5286
5287// ----- text emission ------------------------------------------------------
5288
5289fn write_block(b: &Block, out: &mut String) {
5290    match b {
5291        Block::Para(inlines) => {
5292            out.push_str("Para [");
5293            write_inline_list(inlines, out);
5294            out.push_str(" ]");
5295        }
5296        Block::Plain(inlines) => {
5297            out.push_str("Plain [");
5298            write_inline_list(inlines, out);
5299            out.push_str(" ]");
5300        }
5301        Block::Header(level, attr, inlines) => {
5302            out.push_str(&format!("Header {level} ("));
5303            write_attr(attr, out);
5304            out.push_str(") [");
5305            write_inline_list(inlines, out);
5306            out.push_str(" ]");
5307        }
5308        Block::BlockQuote(blocks) => {
5309            out.push_str("BlockQuote [");
5310            write_block_list(blocks, out);
5311            out.push_str(" ]");
5312        }
5313        Block::CodeBlock(attr, content) => {
5314            out.push_str("CodeBlock (");
5315            write_attr(attr, out);
5316            out.push_str(") ");
5317            write_haskell_string(content, out);
5318        }
5319        Block::HorizontalRule => out.push_str("HorizontalRule"),
5320        Block::BulletList(items) => {
5321            out.push_str("BulletList [");
5322            for (i, item) in items.iter().enumerate() {
5323                if i > 0 {
5324                    out.push(',');
5325                }
5326                out.push_str(" [");
5327                write_block_list(item, out);
5328                out.push_str(" ]");
5329            }
5330            out.push_str(" ]");
5331        }
5332        Block::OrderedList(start, style, delim, items) => {
5333            out.push_str(&format!("OrderedList ( {start} , {style} , {delim} ) ["));
5334            for (i, item) in items.iter().enumerate() {
5335                if i > 0 {
5336                    out.push(',');
5337                }
5338                out.push_str(" [");
5339                write_block_list(item, out);
5340                out.push_str(" ]");
5341            }
5342            out.push_str(" ]");
5343        }
5344        Block::RawBlock(format, content) => {
5345            out.push_str("RawBlock ( Format ");
5346            write_haskell_string(format, out);
5347            out.push_str(" ) ");
5348            write_haskell_string(content, out);
5349        }
5350        Block::Table(data) => {
5351            write_table(data, out);
5352        }
5353        Block::Div(attr, blocks) => {
5354            out.push_str("Div (");
5355            write_attr(attr, out);
5356            out.push_str(") [");
5357            write_block_list(blocks, out);
5358            out.push_str(" ]");
5359        }
5360        Block::LineBlock(lines) => {
5361            out.push_str("LineBlock [");
5362            for (i, line) in lines.iter().enumerate() {
5363                if i > 0 {
5364                    out.push(',');
5365                }
5366                out.push_str(" [");
5367                write_inline_list(line, out);
5368                out.push_str(" ]");
5369            }
5370            out.push_str(" ]");
5371        }
5372        Block::DefinitionList(items) => {
5373            out.push_str("DefinitionList [");
5374            for (i, (term, defs)) in items.iter().enumerate() {
5375                if i > 0 {
5376                    out.push(',');
5377                }
5378                out.push_str(" ( [");
5379                write_inline_list(term, out);
5380                out.push_str(" ] , [");
5381                for (j, def) in defs.iter().enumerate() {
5382                    if j > 0 {
5383                        out.push(',');
5384                    }
5385                    out.push_str(" [");
5386                    write_block_list(def, out);
5387                    out.push_str(" ]");
5388                }
5389                out.push_str(" ] )");
5390            }
5391            out.push_str(" ]");
5392        }
5393        Block::Figure(attr, caption, body) => {
5394            out.push_str("Figure (");
5395            write_attr(attr, out);
5396            out.push_str(") ( Caption Nothing [");
5397            write_block_list(caption, out);
5398            out.push_str(" ] ) [");
5399            write_block_list(body, out);
5400            out.push_str(" ]");
5401        }
5402        Block::Unsupported(name) => {
5403            out.push_str(&format!("Unsupported {name:?}"));
5404        }
5405    }
5406}
5407
5408fn write_table(data: &TableData, out: &mut String) {
5409    out.push_str("Table (");
5410    write_attr(&data.attr, out);
5411    out.push_str(") ( Caption Nothing [");
5412    if !data.caption.is_empty() {
5413        out.push_str(" Plain [");
5414        write_inline_list(&data.caption, out);
5415        out.push_str(" ]");
5416    }
5417    out.push_str(" ] ) [");
5418    for (i, align) in data.aligns.iter().enumerate() {
5419        if i > 0 {
5420            out.push(',');
5421        }
5422        let width = data.widths.get(i).copied().unwrap_or(None);
5423        match width {
5424            None => out.push_str(&format!(" ( {align} , ColWidthDefault )")),
5425            Some(w) => out.push_str(&format!(" ( {align} , ColWidth {} )", show_double(w))),
5426        }
5427    }
5428    out.push_str(" ] ( TableHead ( \"\" , [ ] , [ ] ) [");
5429    for (i, row) in data.head_rows.iter().enumerate() {
5430        if i > 0 {
5431            out.push(',');
5432        }
5433        out.push(' ');
5434        write_table_row(row, out);
5435    }
5436    out.push_str(" ] ) [ TableBody ( \"\" , [ ] , [ ] ) ( RowHeadColumns 0 ) [ ] [");
5437    for (i, row) in data.body_rows.iter().enumerate() {
5438        if i > 0 {
5439            out.push(',');
5440        }
5441        out.push(' ');
5442        write_table_row(row, out);
5443    }
5444    out.push_str(" ] ] ( TableFoot ( \"\" , [ ] , [ ] ) [");
5445    for (i, row) in data.foot_rows.iter().enumerate() {
5446        if i > 0 {
5447            out.push(',');
5448        }
5449        out.push(' ');
5450        write_table_row(row, out);
5451    }
5452    out.push_str(" ] )");
5453}
5454
5455fn write_table_row(cells: &[GridCell], out: &mut String) {
5456    out.push_str("Row ( \"\" , [ ] , [ ] ) [");
5457    for (i, cell) in cells.iter().enumerate() {
5458        if i > 0 {
5459            out.push(',');
5460        }
5461        out.push_str(&format!(
5462            " Cell ( \"\" , [ ] , [ ] ) AlignDefault ( RowSpan {} ) ( ColSpan {} ) [",
5463            cell.row_span, cell.col_span
5464        ));
5465        if !cell.blocks.is_empty() {
5466            write_block_list(&cell.blocks, out);
5467        }
5468        out.push_str(" ]");
5469    }
5470    out.push_str(" ]");
5471}
5472
5473fn write_block_list(blocks: &[Block], out: &mut String) {
5474    for (i, b) in blocks.iter().enumerate() {
5475        if i > 0 {
5476            out.push(',');
5477        }
5478        out.push(' ');
5479        write_block(b, out);
5480    }
5481}
5482
5483fn write_inline_list(inlines: &[Inline], out: &mut String) {
5484    for (i, inline) in inlines.iter().enumerate() {
5485        if i > 0 {
5486            out.push(',');
5487        }
5488        out.push(' ');
5489        write_inline(inline, out);
5490    }
5491}
5492
5493fn write_inline(inline: &Inline, out: &mut String) {
5494    match inline {
5495        Inline::Str(s) => {
5496            out.push_str("Str ");
5497            write_haskell_string(s, out);
5498        }
5499        Inline::Space => out.push_str("Space"),
5500        Inline::SoftBreak => out.push_str("SoftBreak"),
5501        Inline::LineBreak => out.push_str("LineBreak"),
5502        Inline::Emph(children) => {
5503            out.push_str("Emph [");
5504            write_inline_list(children, out);
5505            out.push_str(" ]");
5506        }
5507        Inline::Strong(children) => {
5508            out.push_str("Strong [");
5509            write_inline_list(children, out);
5510            out.push_str(" ]");
5511        }
5512        Inline::Strikeout(children) => {
5513            out.push_str("Strikeout [");
5514            write_inline_list(children, out);
5515            out.push_str(" ]");
5516        }
5517        Inline::Superscript(children) => {
5518            out.push_str("Superscript [");
5519            write_inline_list(children, out);
5520            out.push_str(" ]");
5521        }
5522        Inline::Subscript(children) => {
5523            out.push_str("Subscript [");
5524            write_inline_list(children, out);
5525            out.push_str(" ]");
5526        }
5527        Inline::Code(attr, content) => {
5528            out.push_str("Code (");
5529            write_attr(attr, out);
5530            out.push_str(") ");
5531            write_haskell_string(content, out);
5532        }
5533        Inline::Link(attr, text, url, title) => {
5534            out.push_str("Link (");
5535            write_attr(attr, out);
5536            out.push_str(") [");
5537            write_inline_list(text, out);
5538            out.push_str(" ] ( ");
5539            write_haskell_string(url, out);
5540            out.push_str(" , ");
5541            write_haskell_string(title, out);
5542            out.push_str(" )");
5543        }
5544        Inline::Image(attr, alt, url, title) => {
5545            out.push_str("Image (");
5546            write_attr(attr, out);
5547            out.push_str(") [");
5548            write_inline_list(alt, out);
5549            out.push_str(" ] ( ");
5550            write_haskell_string(url, out);
5551            out.push_str(" , ");
5552            write_haskell_string(title, out);
5553            out.push_str(" )");
5554        }
5555        Inline::Math(kind, content) => {
5556            out.push_str("Math ");
5557            out.push_str(kind);
5558            out.push(' ');
5559            write_haskell_string(content, out);
5560        }
5561        Inline::Span(attr, children) => {
5562            out.push_str("Span (");
5563            write_attr(attr, out);
5564            out.push_str(") [");
5565            write_inline_list(children, out);
5566            out.push_str(" ]");
5567        }
5568        Inline::RawInline(format, content) => {
5569            out.push_str("RawInline ( Format ");
5570            write_haskell_string(format, out);
5571            out.push_str(" ) ");
5572            write_haskell_string(content, out);
5573        }
5574        Inline::Quoted(kind, children) => {
5575            out.push_str("Quoted ");
5576            out.push_str(kind);
5577            out.push_str(" [");
5578            write_inline_list(children, out);
5579            out.push_str(" ]");
5580        }
5581        Inline::Note(blocks) => {
5582            out.push_str("Note [");
5583            write_block_list(blocks, out);
5584            out.push_str(" ]");
5585        }
5586        Inline::Cite(citations, text) => {
5587            out.push_str("Cite [");
5588            for (i, c) in citations.iter().enumerate() {
5589                if i > 0 {
5590                    out.push(',');
5591                }
5592                out.push_str(" Citation { citationId = ");
5593                write_haskell_string(&c.id, out);
5594                out.push_str(" , citationPrefix = [");
5595                write_inline_list(&c.prefix, out);
5596                out.push_str(" ] , citationSuffix = [");
5597                write_inline_list(&c.suffix, out);
5598                out.push_str(" ] , citationMode = ");
5599                out.push_str(match c.mode {
5600                    CitationMode::AuthorInText => "AuthorInText",
5601                    CitationMode::NormalCitation => "NormalCitation",
5602                    CitationMode::SuppressAuthor => "SuppressAuthor",
5603                });
5604                out.push_str(&format!(
5605                    " , citationNoteNum = {} , citationHash = {} }}",
5606                    c.note_num, c.hash
5607                ));
5608            }
5609            out.push_str(" ] [");
5610            write_inline_list(text, out);
5611            out.push_str(" ]");
5612        }
5613        Inline::Unsupported(name) => {
5614            out.push_str(&format!("Unsupported {name:?}"));
5615        }
5616    }
5617}
5618
5619fn write_attr(attr: &Attr, out: &mut String) {
5620    out.push(' ');
5621    write_haskell_string(&attr.id, out);
5622    out.push_str(" , [");
5623    for (i, c) in attr.classes.iter().enumerate() {
5624        if i > 0 {
5625            out.push(',');
5626        }
5627        out.push(' ');
5628        write_haskell_string(c, out);
5629    }
5630    if !attr.classes.is_empty() {
5631        out.push(' ');
5632    }
5633    out.push_str("] , [");
5634    for (i, (k, v)) in attr.kvs.iter().enumerate() {
5635        if i > 0 {
5636            out.push(',');
5637        }
5638        out.push_str(" ( ");
5639        write_haskell_string(k, out);
5640        out.push_str(" , ");
5641        write_haskell_string(v, out);
5642        out.push_str(" )");
5643    }
5644    if !attr.kvs.is_empty() {
5645        out.push(' ');
5646    }
5647    out.push_str("] ");
5648}
5649
/// Appends `s` to `out` as a double-quoted Haskell string literal, following
/// the escaping rules of Haskell's `show` for `String`/`Text`:
///
/// * `"` and `\` are backslash-escaped.
/// * Printable ASCII (0x20..=0x7E) is copied verbatim.
/// * ASCII control characters use their mnemonic escapes (`\n`, `\t`, `\a`,
///   `\SOH`, `\ESC`, …) and DEL is written as `\DEL`.
/// * Everything above 0x7F is written as a decimal escape (`\160`).
/// * `\&` is inserted where the next character would otherwise be read as
///   part of the escape: a digit after a decimal escape, or `H` after `\SO`
///   (which would otherwise read as `\SOH`).
fn write_haskell_string(s: &str, out: &mut String) {
    // Mnemonic escape names for code points 0x00..=0x1F, in code-point order.
    const CONTROL_NAMES: [&str; 32] = [
        "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS", "HT", "LF", "VT", "FF",
        "CR", "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM",
        "SUB", "ESC", "FS", "GS", "RS", "US",
    ];

    out.reserve(s.len() + 2);
    out.push('"');
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\u{07}' => out.push_str("\\a"),
            '\u{08}' => out.push_str("\\b"),
            '\t' => out.push_str("\\t"),
            '\n' => out.push_str("\\n"),
            '\u{0B}' => out.push_str("\\v"),
            '\u{0C}' => out.push_str("\\f"),
            '\r' => out.push_str("\\r"),
            '\u{0E}' => {
                out.push_str("\\SO");
                // `\SO` directly followed by `H` would lex as `\SOH`.
                if chars.peek() == Some(&'H') {
                    out.push_str("\\&");
                }
            }
            '\u{7F}' => out.push_str("\\DEL"),
            c if (c as u32) < 0x20 => {
                out.push('\\');
                // The guard keeps the index within the 32-entry table.
                out.push_str(CONTROL_NAMES[c as usize]);
            }
            c if c.is_ascii() => out.push(c),
            c => {
                // Non-ASCII → decimal escape.
                out.push('\\');
                out.push_str(&(c as u32).to_string());
                // Disambiguate a digit right after the escape: `\160\&33`
                // versus `\16033`.
                if matches!(chars.peek(), Some(next) if next.is_ascii_digit()) {
                    out.push_str("\\&");
                }
            }
        }
    }
    out.push('"');
}
5696
5697// ----- pandoc JSON projection ---------------------------------------------
5698//
5699// Walks the same `Block`/`Inline` tree as `write_block`/`write_inline` but
5700// emits pandoc's JSON shape — `{"t": "Constructor", "c": <content>}`, with
5701// nullary constructors omitting `"c"`. See pandoc's
5702// `Text.Pandoc.Definition` ToJSON instances for the source of truth.
5703
5704fn attr_to_json(attr: &Attr) -> Value {
5705    let kvs: Vec<Value> = attr.kvs.iter().map(|(k, v)| json!([k, v])).collect();
5706    json!([attr.id, attr.classes, kvs])
5707}
5708
5709fn target_to_json(url: &str, title: &str) -> Value {
5710    json!([url, title])
5711}
5712
5713fn inlines_to_json(inlines: &[Inline]) -> Vec<Value> {
5714    inlines.iter().map(inline_to_json).collect()
5715}
5716
5717fn blocks_to_json(blocks: &[Block]) -> Vec<Value> {
5718    blocks.iter().map(block_to_json).collect()
5719}
5720
5721fn citation_to_json(c: &Citation) -> Value {
5722    let mode = match c.mode {
5723        CitationMode::AuthorInText => "AuthorInText",
5724        CitationMode::NormalCitation => "NormalCitation",
5725        CitationMode::SuppressAuthor => "SuppressAuthor",
5726    };
5727    json!({
5728        "citationId": c.id,
5729        "citationPrefix": inlines_to_json(&c.prefix),
5730        "citationSuffix": inlines_to_json(&c.suffix),
5731        "citationMode": { "t": mode },
5732        "citationNoteNum": c.note_num,
5733        "citationHash": c.hash,
5734    })
5735}
5736
5737fn inline_to_json(inline: &Inline) -> Value {
5738    match inline {
5739        Inline::Str(s) => json!({ "t": "Str", "c": s }),
5740        Inline::Space => json!({ "t": "Space" }),
5741        Inline::SoftBreak => json!({ "t": "SoftBreak" }),
5742        Inline::LineBreak => json!({ "t": "LineBreak" }),
5743        Inline::Emph(children) => json!({ "t": "Emph", "c": inlines_to_json(children) }),
5744        Inline::Strong(children) => json!({ "t": "Strong", "c": inlines_to_json(children) }),
5745        Inline::Strikeout(children) => {
5746            json!({ "t": "Strikeout", "c": inlines_to_json(children) })
5747        }
5748        Inline::Superscript(children) => {
5749            json!({ "t": "Superscript", "c": inlines_to_json(children) })
5750        }
5751        Inline::Subscript(children) => {
5752            json!({ "t": "Subscript", "c": inlines_to_json(children) })
5753        }
5754        Inline::Code(attr, content) => {
5755            json!({ "t": "Code", "c": [attr_to_json(attr), content] })
5756        }
5757        Inline::Link(attr, text, url, title) => json!({
5758            "t": "Link",
5759            "c": [attr_to_json(attr), inlines_to_json(text), target_to_json(url, title)],
5760        }),
5761        Inline::Image(attr, alt, url, title) => json!({
5762            "t": "Image",
5763            "c": [attr_to_json(attr), inlines_to_json(alt), target_to_json(url, title)],
5764        }),
5765        Inline::Math(kind, content) => json!({
5766            "t": "Math",
5767            "c": [{ "t": kind }, content],
5768        }),
5769        Inline::Span(attr, children) => json!({
5770            "t": "Span",
5771            "c": [attr_to_json(attr), inlines_to_json(children)],
5772        }),
5773        Inline::RawInline(format, content) => json!({
5774            "t": "RawInline",
5775            "c": [format, content],
5776        }),
5777        Inline::Quoted(kind, children) => json!({
5778            "t": "Quoted",
5779            "c": [{ "t": kind }, inlines_to_json(children)],
5780        }),
5781        Inline::Note(blocks) => json!({ "t": "Note", "c": blocks_to_json(blocks) }),
5782        Inline::Cite(citations, text) => json!({
5783            "t": "Cite",
5784            "c": [
5785                citations.iter().map(citation_to_json).collect::<Vec<_>>(),
5786                inlines_to_json(text),
5787            ],
5788        }),
5789        Inline::Unsupported(name) => json!({ "t": "Unsupported", "c": name }),
5790    }
5791}
5792
5793fn block_to_json(b: &Block) -> Value {
5794    match b {
5795        Block::Para(inlines) => json!({ "t": "Para", "c": inlines_to_json(inlines) }),
5796        Block::Plain(inlines) => json!({ "t": "Plain", "c": inlines_to_json(inlines) }),
5797        Block::Header(level, attr, inlines) => json!({
5798            "t": "Header",
5799            "c": [level, attr_to_json(attr), inlines_to_json(inlines)],
5800        }),
5801        Block::BlockQuote(blocks) => {
5802            json!({ "t": "BlockQuote", "c": blocks_to_json(blocks) })
5803        }
5804        Block::CodeBlock(attr, content) => json!({
5805            "t": "CodeBlock",
5806            "c": [attr_to_json(attr), content],
5807        }),
5808        Block::HorizontalRule => json!({ "t": "HorizontalRule" }),
5809        Block::BulletList(items) => {
5810            let items_json: Vec<Vec<Value>> = items.iter().map(|it| blocks_to_json(it)).collect();
5811            json!({ "t": "BulletList", "c": items_json })
5812        }
5813        Block::OrderedList(start, style, delim, items) => {
5814            let items_json: Vec<Vec<Value>> = items.iter().map(|it| blocks_to_json(it)).collect();
5815            json!({
5816                "t": "OrderedList",
5817                "c": [
5818                    [json!(start), json!({ "t": style }), json!({ "t": delim })],
5819                    items_json,
5820                ],
5821            })
5822        }
5823        Block::RawBlock(format, content) => json!({
5824            "t": "RawBlock",
5825            "c": [format, content],
5826        }),
5827        Block::Table(data) => table_to_json(data),
5828        Block::Div(attr, blocks) => json!({
5829            "t": "Div",
5830            "c": [attr_to_json(attr), blocks_to_json(blocks)],
5831        }),
5832        Block::LineBlock(lines) => {
5833            let lines_json: Vec<Vec<Value>> =
5834                lines.iter().map(|line| inlines_to_json(line)).collect();
5835            json!({ "t": "LineBlock", "c": lines_json })
5836        }
5837        Block::DefinitionList(items) => {
5838            let items_json: Vec<Value> = items
5839                .iter()
5840                .map(|(term, defs)| {
5841                    let defs_json: Vec<Vec<Value>> =
5842                        defs.iter().map(|d| blocks_to_json(d)).collect();
5843                    json!([inlines_to_json(term), defs_json])
5844                })
5845                .collect();
5846            json!({ "t": "DefinitionList", "c": items_json })
5847        }
5848        Block::Figure(attr, caption, body) => {
5849            // Pandoc's Caption shape: `[shortCaption_or_null, [blocks]]`.
5850            // panache stores the caption as a Vec<Block> directly; wrap it.
5851            let caption_json = json!([Value::Null, blocks_to_json(caption)]);
5852            json!({
5853                "t": "Figure",
5854                "c": [attr_to_json(attr), caption_json, blocks_to_json(body)],
5855            })
5856        }
5857        Block::Unsupported(name) => json!({ "t": "Unsupported", "c": name }),
5858    }
5859}
5860
5861fn table_to_json(data: &TableData) -> Value {
5862    // Caption: `[null, [Plain inlines]]` when non-empty, `[null, []]` when empty.
5863    let caption_blocks: Vec<Value> = if data.caption.is_empty() {
5864        Vec::new()
5865    } else {
5866        vec![json!({ "t": "Plain", "c": inlines_to_json(&data.caption) })]
5867    };
5868    let caption_json = json!([Value::Null, caption_blocks]);
5869
5870    // Column specs: pair each align constructor with its column-width
5871    // constructor — `ColWidthDefault` (nullary) or `ColWidth f` (with value).
5872    let colspecs: Vec<Value> = data
5873        .aligns
5874        .iter()
5875        .enumerate()
5876        .map(|(i, align)| {
5877            let width = data.widths.get(i).copied().unwrap_or(None);
5878            let width_json = match width {
5879                None => json!({ "t": "ColWidthDefault" }),
5880                Some(w) => json!({ "t": "ColWidth", "c": w }),
5881            };
5882            json!([{ "t": align }, width_json])
5883        })
5884        .collect();
5885
5886    let empty_attr = json!(["", Vec::<Value>::new(), Vec::<Value>::new()]);
5887
5888    let head_rows: Vec<Value> = data
5889        .head_rows
5890        .iter()
5891        .map(|r| table_row_to_json(r))
5892        .collect();
5893    let body_rows: Vec<Value> = data
5894        .body_rows
5895        .iter()
5896        .map(|r| table_row_to_json(r))
5897        .collect();
5898    let foot_rows: Vec<Value> = data
5899        .foot_rows
5900        .iter()
5901        .map(|r| table_row_to_json(r))
5902        .collect();
5903
5904    let table_head = json!([empty_attr, head_rows]);
5905    let table_bodies = json!([[empty_attr, 0, Vec::<Value>::new(), body_rows,]]);
5906    let table_foot = json!([empty_attr, foot_rows]);
5907
5908    json!({
5909        "t": "Table",
5910        "c": [
5911            attr_to_json(&data.attr),
5912            caption_json,
5913            colspecs,
5914            table_head,
5915            table_bodies,
5916            table_foot,
5917        ],
5918    })
5919}
5920
5921fn table_row_to_json(cells: &[GridCell]) -> Value {
5922    let empty_attr = json!(["", Vec::<Value>::new(), Vec::<Value>::new()]);
5923    let cells_json: Vec<Value> = cells
5924        .iter()
5925        .map(|cell| {
5926            json!([
5927                empty_attr,
5928                { "t": "AlignDefault" },
5929                cell.row_span,
5930                cell.col_span,
5931                blocks_to_json(&cell.blocks),
5932            ])
5933        })
5934        .collect();
5935    json!([empty_attr, cells_json])
5936}
5937
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse;
    use serde_json::Value;

    /// Parses `input`, renders it with `to_pandoc_json`, and decodes the
    /// result so tests can index into it.
    fn parse_to_json(input: &str) -> Value {
        let rendered = to_pandoc_json(&parse(input, None));
        serde_json::from_str(&rendered).expect("to_pandoc_json must emit valid JSON")
    }

    #[test]
    fn empty_doc_emits_envelope_with_no_blocks() {
        let doc = parse_to_json("");
        assert_eq!(doc["pandoc-api-version"], serde_json::json!([1, 23, 1, 1]));
        assert_eq!(doc["meta"], serde_json::json!({}));
        assert_eq!(doc["blocks"], serde_json::json!([]));
    }

    #[test]
    fn paragraph_with_str_emits_para_str_shape() {
        let doc = parse_to_json("hello");
        let blocks = doc["blocks"].as_array().expect("blocks is array");
        assert_eq!(blocks.len(), 1);

        let para = &blocks[0];
        assert_eq!(para["t"], "Para");

        let inlines = para["c"].as_array().expect("Para.c is array");
        assert_eq!(inlines.len(), 1);
        let only = &inlines[0];
        assert_eq!(only["t"], "Str");
        assert_eq!(only["c"], "hello");
    }

    #[test]
    fn nullary_constructors_omit_c_key() {
        // "a b" is expected to yield [Str "a", Space, Str "b"]; the `Space`
        // in the middle is the nullary constructor under test.
        let doc = parse_to_json("a b");
        let inlines = doc["blocks"][0]["c"].as_array().expect("Para.c is array");
        let space = inlines
            .iter()
            .find(|inline| inline["t"] == "Space")
            .expect("Space inline present");
        let space_obj = space.as_object().expect("Space is JSON object");
        assert!(
            space_obj.get("c").is_none(),
            "nullary constructors must omit the \"c\" key, got {space:?}",
        );
    }

    #[test]
    fn header_attr_shape_matches_pandoc_tuple() {
        // `# Hi {#foo .bar key=val}` → Header 1 ("foo", ["bar"], [("key","val")]) [Str "Hi"]
        let doc = parse_to_json("# Hi {#foo .bar key=val}");
        let header = &doc["blocks"][0];
        assert_eq!(header["t"], "Header");

        let content = header["c"].as_array().expect("Header.c is array");
        assert_eq!(content.len(), 3);
        assert_eq!(content[0], 1, "level");

        // attr tuple: [id, [classes], [[k, v], ...]]
        let attr = content[1].as_array().expect("attr tuple");
        assert_eq!(attr[0], "foo");
        assert_eq!(attr[1], serde_json::json!(["bar"]));
        assert_eq!(attr[2], serde_json::json!([["key", "val"]]));
    }
}