panache_parser/
pandoc_ast.rs

1//! CST → Pandoc-native AST text projector.
2//!
3//! Walks a panache [`SyntaxNode`] and emits a string in the textual shape of
4//! pandoc's `Pandoc [Block]` AST — the same format produced by
5//! `pandoc -f markdown -t native`. Exposed via [`to_pandoc_ast`] and the
6//! `panache parse --to pandoc-ast` CLI mode; also drives the pandoc
7//! conformance harness in `tests/pandoc.rs`.
8//!
9//! Coverage is intentionally narrow. Unsupported nodes emit
10//! `Unsupported "<KIND>"` so a failing case stays visibly failing rather
11//! than silently dropping content; expand coverage as the corpus grows.
12//!
13//! Output shape matches pandoc 3.9.0.2 with default-standalone-off behavior:
14//! the document is rendered as a bare block list `[ <block>, ... ]`. The
15//! comparison normalizer collapses whitespace runs, so ppShow's pretty-print
16//! line breaks/indentation are not load-bearing.
17
18use std::cell::RefCell;
19use std::collections::{HashMap, HashSet};
20
21use crate::SyntaxNode;
22use crate::syntax::SyntaxKind;
23use rowan::NodeOrToken;
24use serde_json::{Value, json};
25
/// Pinned `pandoc-api-version` reported in [`to_pandoc_json`] output, as the
/// four version components in order.
///
/// Mirrors the version reported by pandoc 3.9.0.2 (the version pinned by the
/// conformance corpus — see
/// `tests/fixtures/pandoc-conformance/.panache-source`). Bump alongside
/// any pandoc-version bump in that corpus.
const PANDOC_API_VERSION: [u32; 4] = [1, 23, 1, 1];
32
/// Document-wide lookup tables gathered by a pre-pass over the CST
/// (`build_refs_ctx`) and consulted during projection through the
/// `REFS_CTX` thread-local.
#[derive(Default)]
struct RefsCtx {
    /// Normalized reference label (see `normalize_ref_label`) →
    /// `(url, title)`. The first definition recorded for a label is kept.
    refs: HashMap<String, (String, String)>,
    /// Final ids of every heading recorded by the pre-pass; consulted by
    /// `lookup_heading_id` to check whether a slugified label names a
    /// heading.
    heading_ids: HashSet<String>,
    /// Heading text-range start → final disambiguated id. Lets
    /// `heading_block` look up the document-level id (with `section`
    /// fallback for empty slugs and `-1`/`-2` suffixes for duplicates)
    /// that was computed during the pre-pass.
    heading_id_by_offset: HashMap<u32, String>,
    /// Footnote label → parsed body blocks. Lookup keyed by the raw label
    /// id text (no normalization needed — pandoc footnote labels are
    /// case-sensitive and not whitespace-collapsed).
    footnotes: HashMap<String, Vec<Block>>,
    /// Example-list label (`@label`) → resolved item number. Pandoc
    /// numbers all `OrderedList(_, Example, _)` items across the entire
    /// document with one shared counter; labeled items also become
    /// referenceable so inline `@label` resolves to the item's number.
    example_label_to_num: HashMap<String, usize>,
    /// Example-list start number per `LIST` text-range start. Looked up
    /// in `ordered_list_attrs` so each Example list reports the first
    /// item's number — picking up where the previous Example list left
    /// off rather than restarting at 1.
    example_list_start_by_offset: HashMap<u32, usize>,
    /// Note number per `CITATION` text-range start. Pandoc assigns each
    /// inline-cite group (and each footnote, regardless of inner cites)
    /// a position-counter value; cites inside a footnote share its number.
    cite_note_num_by_offset: HashMap<u32, i64>,
}
61
// Per-thread context shared between the pre-pass and the projection helpers.
// `to_pandoc_ast` / `to_pandoc_json` install a freshly built `RefsCtx` before
// projecting and reset it to the default afterwards; lookups such as
// `lookup_ref` and `lookup_heading_id` read from it.
thread_local! {
    static REFS_CTX: RefCell<RefsCtx> = RefCell::new(RefsCtx::default());
}
65
66/// Render the given panache CST as pandoc-native AST text.
67///
68/// Output mirrors `pandoc -f markdown -t native` for supported constructs.
69/// Unsupported nodes emit a visible `Unsupported "<KIND>"` sentinel rather
70/// than silently dropping content. Pair with [`normalize_native`] when
71/// comparing against captured pandoc output to ignore pretty-print
72/// whitespace differences.
73pub fn to_pandoc_ast(tree: &SyntaxNode) -> String {
74    let ctx = build_refs_ctx(tree);
75    REFS_CTX.with(|c| *c.borrow_mut() = ctx);
76    let blocks = blocks_from_doc(tree);
77    let mut out = String::new();
78    out.push('[');
79    for (i, b) in blocks.iter().enumerate() {
80        if i > 0 {
81            out.push(',');
82        }
83        out.push(' ');
84        write_block(b, &mut out);
85    }
86    out.push_str(" ]");
87    REFS_CTX.with(|c| *c.borrow_mut() = RefsCtx::default());
88    out
89}
90
91/// Render the given panache CST as pandoc JSON-AST text.
92///
93/// Output mirrors `pandoc -f markdown -t json`: a single JSON object
94/// `{"pandoc-api-version": [...], "meta": {...}, "blocks": [...]}` where
95/// each AST node is `{"t": "Constructor", "c": <content>}` (nullary
96/// constructors omit `"c"`). The block tree is the same one used by
97/// [`to_pandoc_ast`] — the difference is the surface encoding only.
98///
99/// Output is compact (no whitespace), matching pandoc's default. The
100/// `pandoc-api-version` field is pinned to [`PANDOC_API_VERSION`].
101///
102/// Note: object keys are emitted in alphabetical order (e.g. `"c"` before
103/// `"t"`) rather than pandoc's insertion order. JSON objects are unordered
104/// by spec, so downstream tools (`jq`, `ascii2uni`, deserializers) treat
105/// the outputs as equivalent — but they are not byte-identical.
106///
107/// As with [`to_pandoc_ast`], unsupported nodes emit a panache-internal
108/// `{"t": "Unsupported", "c": "<KIND>"}` sentinel rather than being
109/// silently dropped. This sentinel is not emitted by real pandoc.
110pub fn to_pandoc_json(tree: &SyntaxNode) -> String {
111    let ctx = build_refs_ctx(tree);
112    REFS_CTX.with(|c| *c.borrow_mut() = ctx);
113    let blocks = blocks_from_doc(tree);
114    let blocks_json: Vec<Value> = blocks.iter().map(block_to_json).collect();
115    REFS_CTX.with(|c| *c.borrow_mut() = RefsCtx::default());
116    let doc = json!({
117        "pandoc-api-version": PANDOC_API_VERSION,
118        "meta": {},
119        "blocks": blocks_json,
120    });
121    serde_json::to_string(&doc).expect("pandoc-json serialization is infallible")
122}
123
/// Build the lookup context for `tree` with no parent context to inherit
/// from; shorthand for `build_refs_ctx_inherited(tree, None)`.
fn build_refs_ctx(tree: &SyntaxNode) -> RefsCtx {
    build_refs_ctx_inherited(tree, None)
}
127
128fn build_refs_ctx_inherited(tree: &SyntaxNode, parent: Option<&RefsCtx>) -> RefsCtx {
129    let mut ctx = RefsCtx::default();
130    collect_cite_note_nums(tree, &mut ctx);
131    let mut example_counter: usize = 0;
132    collect_example_numbering(tree, &mut ctx, &mut example_counter);
133    REFS_CTX.with(|c| {
134        let mut borrowed = c.borrow_mut();
135        borrowed.cite_note_num_by_offset = ctx.cite_note_num_by_offset.clone();
136        borrowed.example_label_to_num = ctx.example_label_to_num.clone();
137        borrowed.example_list_start_by_offset = ctx.example_list_start_by_offset.clone();
138    });
139    // Seed seen_ids from parent's heading_ids so inner heading auto-ids
140    // disambiguate against outer's history. Reverse-engineer counts from
141    // final ids: id `base` implies count >= 1; `base-N` implies count >=
142    // N+1. Take max per base.
143    let mut seen_ids: HashMap<String, u32> = HashMap::new();
144    if let Some(p) = parent {
145        for id in &p.heading_ids {
146            if let Some(idx) = id.rfind('-')
147                && let Ok(n) = id[idx + 1..].parse::<u32>()
148            {
149                let base = id[..idx].to_string();
150                let entry = seen_ids.entry(base).or_insert(0);
151                *entry = (*entry).max(n + 1);
152            }
153            let entry = seen_ids.entry(id.clone()).or_insert(0);
154            *entry = (*entry).max(1);
155        }
156    }
157    collect_refs_and_headings(tree, &mut ctx, &mut seen_ids);
158    // Fold parent refs/footnotes/heading_ids into the inner ctx so lookups
159    // during projection see both halves. Inner-defined keys win on conflict
160    // (scoping semantics; pandoc's true rule is "first def in document
161    // order wins" but tracking that across the recursive boundary would
162    // require offset-aware merging that no current corpus case exercises).
163    if let Some(p) = parent {
164        for (k, v) in &p.refs {
165            ctx.refs.entry(k.clone()).or_insert_with(|| v.clone());
166        }
167        for (k, v) in &p.footnotes {
168            ctx.footnotes.entry(k.clone()).or_insert_with(|| v.clone());
169        }
170        for id in &p.heading_ids {
171            ctx.heading_ids.insert(id.clone());
172        }
173    }
174    ctx
175}
176
177/// Walk every inline tree under `tree` and assign a `citationNoteNum` to
178/// each `CITATION` node. Pandoc's rule: outside footnotes, each Cite group
179/// (one CITATION node, regardless of internal `;`-separated keys) gets a
180/// fresh counter value; footnotes increment the counter once on entry,
181/// then ALL cites inside the footnote share that value.
182fn collect_cite_note_nums(tree: &SyntaxNode, ctx: &mut RefsCtx) {
183    let mut footnote_def_nodes: HashMap<String, SyntaxNode> = HashMap::new();
184    for child in tree.descendants() {
185        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION
186            && let Some(label) = footnote_label(&child)
187        {
188            footnote_def_nodes.entry(label).or_insert(child);
189        }
190    }
191    let mut counter: i64 = 0;
192    for child in tree.children() {
193        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION {
194            continue;
195        }
196        visit_for_cite_nums(&child, &footnote_def_nodes, &mut counter, None, ctx);
197    }
198}
199
200fn visit_for_cite_nums(
201    node: &SyntaxNode,
202    fn_defs: &HashMap<String, SyntaxNode>,
203    counter: &mut i64,
204    in_fn: Option<i64>,
205    ctx: &mut RefsCtx,
206) {
207    for el in node.children_with_tokens() {
208        if let NodeOrToken::Node(n) = el {
209            match n.kind() {
210                SyntaxKind::CITATION => {
211                    let offset: u32 = n.text_range().start().into();
212                    let num = if let Some(fn_num) = in_fn {
213                        fn_num
214                    } else {
215                        *counter += 1;
216                        *counter
217                    };
218                    ctx.cite_note_num_by_offset.insert(offset, num);
219                }
220                SyntaxKind::FOOTNOTE_REFERENCE => {
221                    if in_fn.is_none() {
222                        *counter += 1;
223                        let fn_num = *counter;
224                        if let Some(label) = footnote_label(&n)
225                            && let Some(def) = fn_defs.get(&label)
226                        {
227                            visit_for_cite_nums(def, fn_defs, counter, Some(fn_num), ctx);
228                        }
229                    }
230                }
231                _ => visit_for_cite_nums(&n, fn_defs, counter, in_fn, ctx),
232            }
233        }
234    }
235}
236
237/// Walk every `LIST` in document order and assign Example-list numbers.
238/// Pandoc tracks one counter across all `OrderedList(_, Example, _)` lists
239/// in a document, so each subsequent Example list picks up where the prior
240/// one left off. Labeled items (`(@label)`) get a label → number mapping
241/// for inline `@label` reference resolution.
242fn collect_example_numbering(node: &SyntaxNode, ctx: &mut RefsCtx, counter: &mut usize) {
243    for child in node.children() {
244        if child.kind() == SyntaxKind::LIST && list_is_example(&child) {
245            let list_offset: u32 = child.text_range().start().into();
246            ctx.example_list_start_by_offset
247                .insert(list_offset, *counter + 1);
248            for item in child
249                .children()
250                .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
251            {
252                *counter += 1;
253                if let Some(label) = example_item_label(&item) {
254                    ctx.example_label_to_num.entry(label).or_insert(*counter);
255                }
256            }
257            // Recurse into the list's contents to pick up nested Example
258            // lists (rare but possible).
259            collect_example_numbering(&child, ctx, counter);
260        } else {
261            collect_example_numbering(&child, ctx, counter);
262        }
263    }
264}
265
266/// `(@)` / `(@label)` markers identify Example list items. Returns true
267/// iff the LIST's first item carries such a marker (pandoc decides the
268/// list style from the first marker only).
269fn list_is_example(list: &SyntaxNode) -> bool {
270    let Some(item) = list.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
271        return false;
272    };
273    let marker = list_item_marker_text(&item);
274    let trimmed = marker.trim();
275    let body = if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
276        inner
277    } else if let Some(inner) = trimmed.strip_suffix(')') {
278        inner
279    } else if let Some(inner) = trimmed.strip_suffix('.') {
280        inner
281    } else {
282        trimmed
283    };
284    body.starts_with('@')
285        && body[1..]
286            .chars()
287            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
288}
289
290fn list_item_marker_text(item: &SyntaxNode) -> String {
291    item.children_with_tokens()
292        .filter_map(|el| el.into_token())
293        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
294        .map(|t| t.text().to_string())
295        .unwrap_or_default()
296}
297
298/// Returns the `@label` text for an Example list item, or `None` for the
299/// unlabeled `(@)` form.
300fn example_item_label(item: &SyntaxNode) -> Option<String> {
301    let marker = list_item_marker_text(item);
302    let trimmed = marker.trim();
303    let body = trimmed
304        .strip_prefix('(')
305        .and_then(|s| s.strip_suffix(')'))
306        .or_else(|| trimmed.strip_suffix(')'))
307        .or_else(|| trimmed.strip_suffix('.'))
308        .unwrap_or(trimmed);
309    let label = body.strip_prefix('@')?;
310    if label.is_empty() {
311        None
312    } else {
313        Some(label.to_string())
314    }
315}
316
317fn collect_refs_and_headings(
318    node: &SyntaxNode,
319    ctx: &mut RefsCtx,
320    seen_ids: &mut HashMap<String, u32>,
321) {
322    for child in node.children() {
323        match child.kind() {
324            SyntaxKind::REFERENCE_DEFINITION => {
325                if let Some((label, url, title)) = parse_reference_def(&child) {
326                    ctx.refs
327                        .entry(normalize_ref_label(&label))
328                        .or_insert((url, title));
329                }
330            }
331            SyntaxKind::FOOTNOTE_DEFINITION => {
332                if let Some((label, blocks)) = parse_footnote_def(&child) {
333                    ctx.footnotes.entry(label).or_insert(blocks);
334                }
335            }
336            SyntaxKind::HEADING => {
337                let (id, was_explicit) = heading_id_with_explicitness(&child);
338                let final_id = if was_explicit {
339                    // Explicit `{#x}` ids are kept verbatim; pandoc only
340                    // warns on conflicts but does not auto-disambiguate.
341                    seen_ids.entry(id.clone()).or_insert(0);
342                    id
343                } else {
344                    let mut base = id;
345                    if base.is_empty() {
346                        base = "section".to_string();
347                    }
348                    let count = seen_ids.entry(base.clone()).or_insert(0);
349                    let id = if *count == 0 {
350                        base
351                    } else {
352                        format!("{base}-{count}")
353                    };
354                    *count += 1;
355                    id
356                };
357                if !final_id.is_empty() {
358                    let offset: u32 = child.text_range().start().into();
359                    ctx.heading_ids.insert(final_id.clone());
360                    ctx.heading_id_by_offset.insert(offset, final_id);
361                }
362                collect_refs_and_headings(&child, ctx, seen_ids);
363            }
364            _ => collect_refs_and_headings(&child, ctx, seen_ids),
365        }
366    }
367}
368
369/// Returns `(id, was_explicit)` for a HEADING node. Explicit ids come from
370/// `{#id}` attributes; the auto-id is the slugified plaintext (which may be
371/// empty for headings whose text contains no slug-eligible characters).
372fn heading_id_with_explicitness(node: &SyntaxNode) -> (String, bool) {
373    let inlines = node
374        .children()
375        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
376        .map(|c| coalesce_inlines(inlines_from(&c)))
377        .unwrap_or_default();
378    let attr = node.children_with_tokens().find_map(|el| match el {
379        NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
380        NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => Some(t.text().to_string()),
381        _ => None,
382    });
383    if let Some(raw) = attr {
384        let trimmed = raw.trim();
385        if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
386            let parsed = parse_attr_block(inner);
387            if !parsed.id.is_empty() {
388                return (parsed.id, true);
389            }
390        }
391    }
392    (pandoc_slugify(&inlines_to_plaintext(&inlines)), false)
393}
394
395fn parse_footnote_def(node: &SyntaxNode) -> Option<(String, Vec<Block>)> {
396    let label = footnote_label(node)?;
397    let mut blocks = Vec::new();
398    for child in node.children() {
399        // The CST keeps each footnote-body line at its full raw indentation
400        // (the 4-space body indent plus any nested-block indent). Most blocks
401        // recover transparently because `coalesce_inlines` trims leading
402        // spaces on paragraph content, but indented code blocks preserve all
403        // leading whitespace — strip the 4 footnote-body spaces in addition
404        // to the code block's own 4.
405        if child.kind() == SyntaxKind::CODE_BLOCK
406            && !child
407                .children()
408                .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
409        {
410            blocks.push(indented_code_block_with_extra_strip(&child, 4));
411        } else {
412            collect_block(&child, &mut blocks);
413        }
414    }
415    Some((label, blocks))
416}
417
418fn indented_code_block_with_extra_strip(node: &SyntaxNode, extra: usize) -> Block {
419    let raw_format = code_block_raw_format(node);
420    let attr = code_block_attr(node);
421    let is_fenced = node
422        .children()
423        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
424    let mut content = String::new();
425    for child in node.children() {
426        if child.kind() == SyntaxKind::CODE_CONTENT {
427            content.push_str(&child.text().to_string());
428        }
429    }
430    while content.ends_with('\n') {
431        content.pop();
432    }
433    // Pandoc expands tabs (4-col stops) on code-block bodies before any
434    // indent stripping, so a `:\t` marker followed by `\t\t\tcode` correctly
435    // becomes `"        code"` after the 4-col definition-content offset is
436    // stripped. Apply expansion first, then strip.
437    content = content
438        .split('\n')
439        .map(expand_tabs_to_4)
440        .collect::<Vec<_>>()
441        .join("\n");
442    content = strip_leading_spaces_per_line(&content, extra);
443    if !is_fenced {
444        content = strip_indented_code_indent(&content);
445    }
446    if let Some(fmt) = raw_format {
447        return Block::RawBlock(fmt, content);
448    }
449    Block::CodeBlock(attr, content)
450}
451
/// Remove up to `n` leading space characters from every `\n`-separated line
/// of `s`. Only spaces count: a tab or any other character ends the run, and
/// lines with fewer than `n` leading spaces just lose the ones they have.
fn strip_leading_spaces_per_line(s: &str, n: usize) -> String {
    s.split('\n')
        .map(|line| {
            // A space is a single byte, so counting bytes equals counting chars.
            let indent = line.bytes().take(n).take_while(|&b| b == b' ').count();
            &line[indent..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
463
464fn footnote_label(node: &SyntaxNode) -> Option<String> {
465    for el in node.children_with_tokens() {
466        if let NodeOrToken::Token(t) = el
467            && t.kind() == SyntaxKind::FOOTNOTE_LABEL_ID
468        {
469            return Some(t.text().to_string());
470        }
471    }
472    None
473}
474
475fn parse_reference_def(node: &SyntaxNode) -> Option<(String, String, String)> {
476    let link = node.children().find(|c| c.kind() == SyntaxKind::LINK)?;
477    let label_node = link
478        .children()
479        .find(|c| c.kind() == SyntaxKind::LINK_TEXT)?;
480    let label = label_node.text().to_string();
481
482    let mut tail = String::new();
483    let mut after_link = false;
484    for el in node.children_with_tokens() {
485        if after_link {
486            match el {
487                NodeOrToken::Token(t) => tail.push_str(t.text()),
488                NodeOrToken::Node(n) => tail.push_str(&n.text().to_string()),
489            }
490        } else if let NodeOrToken::Node(n) = &el
491            && n.kind() == SyntaxKind::LINK
492        {
493            after_link = true;
494        }
495    }
496
497    let trimmed = tail.trim_start();
498    let rest = trimmed.strip_prefix(':')?;
499    let after_colon = rest.trim_start();
500    let (url, after_url) = parse_ref_url(after_colon);
501    let title = parse_dest_title(after_url.trim());
502    Some((unescape_label(&label), url, title))
503}
504
/// Split the destination off the front of `s`, returning
/// `(url, remainder)`.
///
/// Leading whitespace is skipped. A destination of the form `<...>` yields
/// the text between the angle brackets, with the remainder starting right
/// after the `>`. Otherwise (including a `<` that is never closed) the
/// destination runs up to the first whitespace character and the remainder
/// starts at that whitespace.
fn parse_ref_url(s: &str) -> (String, &str) {
    let s = s.trim_start();
    let bracketed = s.strip_prefix('<').and_then(|rest| rest.split_once('>'));
    if let Some((url, remainder)) = bracketed {
        return (url.to_string(), remainder);
    }
    let url_len = s.find(char::is_whitespace).unwrap_or(s.len());
    let (url, remainder) = s.split_at(url_len);
    (url.to_string(), remainder)
}
515
/// Resolve backslash escapes in a label: a backslash directly followed by an
/// ASCII punctuation character is replaced by that character. Any other
/// backslash — including one at the very end — is kept as is.
fn unescape_label(label: &str) -> String {
    let mut out = String::with_capacity(label.len());
    let mut rest = label.chars();
    while let Some(ch) = rest.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        // Look one character ahead without consuming it.
        match rest.clone().next() {
            Some(next) if is_ascii_punct(next) => {
                out.push(next);
                rest.next();
            }
            _ => out.push('\\'),
        }
    }
    out
}

/// Whether `c` is an ASCII punctuation character.
fn is_ascii_punct(c: char) -> bool {
    c.is_ascii_punctuation()
}
536
537/// Pandoc/CommonMark reference-label normalization: case-fold and collapse
538/// runs of whitespace to a single space, with leading/trailing trimmed.
539fn normalize_ref_label(label: &str) -> String {
540    let unescaped = unescape_label(label);
541    let mut out = String::new();
542    let mut last_space = false;
543    for ch in unescaped.chars() {
544        if ch.is_whitespace() {
545            if !out.is_empty() && !last_space {
546                out.push(' ');
547                last_space = true;
548            }
549        } else {
550            for lc in ch.to_lowercase() {
551                out.push(lc);
552            }
553            last_space = false;
554        }
555    }
556    if last_space {
557        out.pop();
558    }
559    out
560}
561
562fn lookup_ref(label: &str) -> Option<(String, String)> {
563    let key = normalize_ref_label(label);
564    REFS_CTX.with(|c| c.borrow().refs.get(&key).cloned())
565}
566
567fn lookup_heading_id(label: &str) -> Option<String> {
568    let id = pandoc_slugify(&unescape_label(label));
569    if id.is_empty() {
570        return None;
571    }
572    REFS_CTX.with(|c| {
573        if c.borrow().heading_ids.contains(&id) {
574            Some(id)
575        } else {
576            None
577        }
578    })
579}
580
/// Canonical form of a pandoc-native AST string.
///
/// The input is split into tokens and re-joined with single spaces, so
/// pretty-print line breaks and indentation no longer affect equality.
/// Tokens are:
/// - each of `[`, `]`, `(`, `)`, `,` on its own;
/// - a double-quoted string literal, copied verbatim up to and including
///   its closing unescaped quote (or to the end of input when unterminated);
/// - any other run of bytes up to the next whitespace, bracket, comma or
///   double quote.
///
/// Spaces, tabs, `\n` and `\r` between tokens are discarded.
pub fn normalize_native(s: &str) -> String {
    fn is_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\n' | b'\r')
    }
    fn is_punct(b: u8) -> bool {
        matches!(b, b'[' | b']' | b'(' | b')' | b',')
    }

    let bytes = s.as_bytes();
    let mut tokens: Vec<&str> = Vec::new();
    let mut pos = 0usize;
    while let Some(&first) = bytes.get(pos) {
        if is_blank(first) {
            pos += 1;
            continue;
        }
        let start = pos;
        if is_punct(first) {
            pos += 1;
        } else if first == b'"' {
            // String literal: advance to just past the matching unescaped quote.
            pos += 1;
            while let Some(&b) = bytes.get(pos) {
                // A backslash swallows the byte after it, if there is one.
                let escaped_pair = b == b'\\' && pos + 1 < bytes.len();
                pos += if escaped_pair { 2 } else { 1 };
                if b == b'"' {
                    break;
                }
            }
        } else {
            let remaining = &bytes[pos..];
            pos += remaining
                .iter()
                .position(|&b| is_blank(b) || is_punct(b) || b == b'"')
                .unwrap_or(remaining.len());
        }
        tokens.push(&s[start..pos]);
    }
    tokens.join(" ")
}
638
// Variant names mirror Pandoc's `Text.Pandoc.Definition` constructors so the
// emission code reads 1:1 against pandoc-native — `BlockQuote`, `CodeBlock`,
// `BulletList`, `OrderedList` are not redundant here, they are the spec names.
//
// Block-level node of the projected tree: built from the CST by
// `block_from` and rendered by `to_pandoc_ast` / `to_pandoc_json`.
#[derive(Debug, Clone)]
#[allow(clippy::enum_variant_names)]
enum Block {
    Para(Vec<Inline>),
    Plain(Vec<Inline>),
    // NOTE(review): the `usize` is presumably the heading level — confirm
    // against `heading_block`.
    Header(usize, Attr, Vec<Inline>),
    BlockQuote(Vec<Block>),
    CodeBlock(Attr, String),
    HorizontalRule,
    // One `Vec<Block>` per list item.
    BulletList(Vec<Vec<Block>>),
    // NOTE(review): presumably (start number, number style, delimiter,
    // items), following pandoc's `ListAttributes` — confirm against
    // `list_block`.
    OrderedList(usize, &'static str, &'static str, Vec<Vec<Block>>),
    // NOTE(review): presumably (format, raw text); cf. the `fmt` value
    // passed by `indented_code_block_with_extra_strip`.
    RawBlock(String, String),
    Table(TableData),
    Div(Attr, Vec<Block>),
    LineBlock(Vec<Vec<Inline>>),
    DefinitionList(Vec<(Vec<Inline>, Vec<Vec<Block>>)>),
    /// `Figure attr (Caption Nothing [caption-blocks]) [body-blocks]` —
    /// pandoc's implicit_figures wraps an image-only paragraph whose
    /// alt text becomes the caption and whose body re-includes the
    /// image as a Plain block.
    Figure(Attr, Vec<Block>, Vec<Block>),
    // Sentinel for CST nodes the projector does not handle; `block_from`
    // fills it with the Debug rendering of the node's syntax kind.
    Unsupported(String),
}
665
/// Payload of `Block::Table`, shared by the pipe, simple, grid and
/// multiline table projections.
#[derive(Debug, Clone)]
struct TableData {
    /// Pandoc's `+caption_attributes` extension lifts a trailing
    /// `{#id .class kv=...}` from the caption text into the Table's outer
    /// attribute. Default-empty for tables without caption attributes.
    attr: Attr,
    /// Caption inlines.
    caption: Vec<Inline>,
    /// Per-column alignment.
    // NOTE(review): presumably pandoc alignment constructor names such as
    // `AlignDefault` — confirm against the table builders.
    aligns: Vec<&'static str>,
    /// Per-column width. `None` → `ColWidthDefault`, `Some(f)` → `ColWidth f`.
    widths: Vec<Option<f64>>,
    /// Header rows, one `Vec<GridCell>` per row.
    head_rows: Vec<Vec<GridCell>>,
    /// Body rows, one `Vec<GridCell>` per row.
    body_rows: Vec<Vec<GridCell>>,
    /// Footer rows. Currently only populated for grid tables with a
    /// trailing `+===+===+` separator before the final body row(s).
    foot_rows: Vec<Vec<GridCell>>,
}
682
/// One cell in a `TableData` row. `row_span`/`col_span` default to 1 for
/// pipe/simple/multiline tables (which don't model spans). Grid tables
/// compute proper span counts via the layout algorithm in `grid_table`.
#[derive(Debug, Clone)]
struct GridCell {
    /// Number of rows the cell spans.
    row_span: u32,
    /// Number of columns the cell spans.
    col_span: u32,
    /// Block content of the cell.
    blocks: Vec<Block>,
}
692
693impl GridCell {
694    fn no_span(blocks: Vec<Block>) -> Self {
695        Self {
696            row_span: 1,
697            col_span: 1,
698            blocks,
699        }
700    }
701}
702
// Inline-level node of the projected tree; the counterpart of `Block` for
// inline content.
#[derive(Debug, Clone)]
#[allow(clippy::enum_variant_names)]
enum Inline {
    Str(String),
    Space,
    SoftBreak,
    LineBreak,
    Emph(Vec<Inline>),
    Strong(Vec<Inline>),
    Strikeout(Vec<Inline>),
    Superscript(Vec<Inline>),
    Subscript(Vec<Inline>),
    Code(Attr, String),
    // Attribute, link text, then two strings that `figure_block` binds as
    // `url` and `title` for the identically shaped `Image`.
    Link(Attr, Vec<Inline>, String, String),
    // Attribute, alt-text inlines, url, title (names as bound in
    // `figure_block`).
    Image(Attr, Vec<Inline>, String, String),
    // NOTE(review): the `&'static str` is presumably the math kind
    // (inline vs display) — confirm against the emitter.
    Math(&'static str, String),
    Span(Attr, Vec<Inline>),
    RawInline(String, String),
    // NOTE(review): the `&'static str` is presumably the quote kind
    // (single vs double) — confirm against the emitter.
    Quoted(&'static str, Vec<Inline>),
    // Footnote body blocks.
    Note(Vec<Block>),
    Cite(Vec<Citation>, Vec<Inline>),
    // Inline-level sentinel for constructs the projector does not handle.
    Unsupported(String),
}
726
// One entry of an `Inline::Cite` group.
// NOTE(review): the fields appear to follow pandoc's `Citation` record
// (`citationId`, `citationPrefix`, `citationSuffix`, `citationMode`,
// `citationNoteNum`, `citationHash`) — confirm against the emitter.
#[derive(Debug, Clone)]
struct Citation {
    id: String,
    prefix: Vec<Inline>,
    suffix: Vec<Inline>,
    mode: CitationMode,
    // Note number; see `RefsCtx::cite_note_num_by_offset` for how these
    // values are assigned.
    note_num: i64,
    hash: i64,
}
736
// Mode of a single `Citation`.
// NOTE(review): variant names appear to match pandoc's `CitationMode`
// constructors — confirm against the emitter.
#[derive(Debug, Clone, Copy)]
enum CitationMode {
    AuthorInText,
    NormalCitation,
    SuppressAuthor,
}
743
// Attribute triple attached to blocks and inlines: an id, a list of classes
// and a list of key/value pairs (cf. the `{#id .class kv=...}` syntax
// mentioned on `TableData::attr`). `Attr::default()` is the empty attribute.
#[derive(Debug, Default, Clone)]
struct Attr {
    // Identifier; empty when absent.
    id: String,
    classes: Vec<String>,
    // Key/value pairs.
    kvs: Vec<(String, String)>,
}
750
751// ----- block-level walking ------------------------------------------------
752
753fn blocks_from_doc(doc: &SyntaxNode) -> Vec<Block> {
754    let mut out = Vec::new();
755    for child in doc.children() {
756        collect_block(&child, &mut out);
757    }
758    out
759}
760
761fn block_from(node: &SyntaxNode) -> Option<Block> {
762    match node.kind() {
763        SyntaxKind::PARAGRAPH => Some(Block::Para(coalesce_inlines(inlines_from(node)))),
764        SyntaxKind::PLAIN => Some(Block::Plain(coalesce_inlines(inlines_from(node)))),
765        SyntaxKind::HEADING => Some(heading_block(node)),
766        SyntaxKind::BLOCK_QUOTE => Some(Block::BlockQuote(blockquote_blocks(node))),
767        SyntaxKind::CODE_BLOCK => Some(code_block(node)),
768        SyntaxKind::HORIZONTAL_RULE => Some(Block::HorizontalRule),
769        SyntaxKind::LIST => Some(list_block(node)),
770        SyntaxKind::BLANK_LINE => None,
771        // Reference definitions don't appear in pandoc-native output (they
772        // resolve into the link they define).
773        SyntaxKind::REFERENCE_DEFINITION => None,
774        // Footnote definitions are pulled into Note inlines at the
775        // FOOTNOTE_REFERENCE site; the definition block itself is dropped.
776        SyntaxKind::FOOTNOTE_DEFINITION => None,
777        // YAML metadata becomes the document Meta wrapper, not a body block.
778        // The projector emits a bare block list, so just drop these.
779        SyntaxKind::YAML_METADATA => None,
780        // Pandoc title block (`% title\n% authors\n% date`) populates Meta
781        // and produces no body block.
782        SyntaxKind::PANDOC_TITLE_BLOCK => None,
783        SyntaxKind::HTML_BLOCK => Some(html_block(node)),
784        SyntaxKind::HTML_BLOCK_DIV => Some(html_div_block(node)),
785        SyntaxKind::PIPE_TABLE => pipe_table(node).map(Block::Table),
786        SyntaxKind::SIMPLE_TABLE => simple_table(node).map(Block::Table),
787        SyntaxKind::GRID_TABLE => grid_table(node).map(Block::Table),
788        SyntaxKind::MULTILINE_TABLE => multiline_table(node).map(Block::Table),
789        SyntaxKind::TEX_BLOCK => Some(tex_block(node)),
790        SyntaxKind::FENCED_DIV => Some(fenced_div(node)),
791        SyntaxKind::LINE_BLOCK => Some(line_block(node)),
792        SyntaxKind::DEFINITION_LIST => Some(definition_list(node)),
793        SyntaxKind::FIGURE => Some(figure_block(node)),
794        other => Some(Block::Unsupported(format!("{other:?}"))),
795    }
796}
797
798/// Pandoc's `implicit_figures` extension wraps a paragraph that is *only*
799/// an Image into a `Figure` block: `Figure (id, [], []) (Caption Nothing
800/// [Plain alt]) [Plain [Image]]`. The image's alt-text inlines become the
801/// caption; the body holds the image itself wrapped in a Plain. Any
802/// attribute attached to the Image migrates to the Figure attr (id only)
803/// — the Image keeps its classes/kvs.
804fn figure_block(node: &SyntaxNode) -> Block {
805    let mut alt: Vec<Inline> = Vec::new();
806    let mut image_inline: Option<Inline> = None;
807    if let Some(image) = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_LINK) {
808        let alt_node = image.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
809        if let Some(an) = alt_node {
810            alt = coalesce_inlines(inlines_from(&an));
811        }
812        let mut tmp = Vec::new();
813        render_image_inline(&image, &mut tmp);
814        if let Some(first) = tmp.into_iter().next() {
815            image_inline = Some(first);
816        }
817    }
818    // Pandoc's `implicit_figures` migrates only the image's id to the Figure
819    // attr; the image keeps its classes and key-value pairs but loses the id.
820    let (figure_attr, image_inline) = match image_inline {
821        Some(Inline::Image(mut attr, alt_inlines, url, title)) if !attr.id.is_empty() => {
822            let fig_attr = Attr::with_id(std::mem::take(&mut attr.id));
823            (fig_attr, Some(Inline::Image(attr, alt_inlines, url, title)))
824        }
825        other => (Attr::default(), other),
826    };
827    let caption = if alt.is_empty() {
828        Vec::new()
829    } else {
830        vec![Block::Plain(alt)]
831    };
832    let body = match image_inline {
833        Some(img) => vec![Block::Plain(vec![img])],
834        None => Vec::new(),
835    };
836    Block::Figure(figure_attr, caption, body)
837}
838
839fn heading_block(node: &SyntaxNode) -> Block {
840    let level = heading_level(node);
841    let inlines = node
842        .children()
843        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
844        .map(|c| coalesce_inlines(inlines_from(&c)))
845        .unwrap_or_default();
846    // Auto-id and disambiguation are computed in the `RefsCtx` pre-pass so
847    // duplicate slugs and `section`-fallbacks are document-wide consistent.
848    // Explicit attributes still need their classes/kvs parsed here.
849    let offset: u32 = node.text_range().start().into();
850    let final_id = REFS_CTX
851        .with(|c| c.borrow().heading_id_by_offset.get(&offset).cloned())
852        .unwrap_or_default();
853    let attr = node
854        .children_with_tokens()
855        .find_map(|el| match el {
856            NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
857            NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => {
858                Some(t.text().to_string())
859            }
860            _ => None,
861        })
862        .map(|raw| {
863            let trimmed = raw.trim();
864            if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
865                let mut attr = parse_attr_block(inner);
866                if attr.id.is_empty() {
867                    attr.id = final_id.clone();
868                }
869                attr
870            } else {
871                Attr::with_id(final_id.clone())
872            }
873        })
874        .unwrap_or_else(|| Attr::with_id(final_id));
875    Block::Header(level, attr, inlines)
876}
877
878fn heading_level(node: &SyntaxNode) -> usize {
879    for child in node.children() {
880        if child.kind() == SyntaxKind::ATX_HEADING_MARKER {
881            for tok in child.children_with_tokens() {
882                if let Some(t) = tok.as_token()
883                    && t.kind() == SyntaxKind::ATX_HEADING_MARKER
884                {
885                    return t.text().chars().filter(|&c| c == '#').count();
886                }
887            }
888        }
889    }
890    for el in node.descendants_with_tokens() {
891        if let NodeOrToken::Token(t) = el
892            && t.kind() == SyntaxKind::SETEXT_HEADING_UNDERLINE
893        {
894            return if t.text().trim_start().starts_with('=') {
895                1
896            } else {
897                2
898            };
899        }
900    }
901    1
902}
903
904fn blockquote_blocks(node: &SyntaxNode) -> Vec<Block> {
905    let mut out = Vec::new();
906    for child in node.children() {
907        collect_block(&child, &mut out);
908    }
909    out
910}
911
912fn code_block(node: &SyntaxNode) -> Block {
913    let raw_format = code_block_raw_format(node);
914    let attr = code_block_attr(node);
915    let is_fenced = node
916        .children()
917        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
918    let mut content = String::new();
919    for child in node.children() {
920        if child.kind() == SyntaxKind::CODE_CONTENT {
921            content.push_str(&child.text().to_string());
922        }
923    }
924    // Pandoc strips the trailing newline that closes the block.
925    while content.ends_with('\n') {
926        content.pop();
927    }
928    if is_fenced {
929        // Pandoc tab-expands code-block bodies before emission. For indented
930        // code, the expansion happens inside `strip_indented_code_indent`
931        // before the 4-col strip; for fenced code there is no strip, so do
932        // it directly here.
933        content = content
934            .split('\n')
935            .map(expand_tabs_to_4)
936            .collect::<Vec<_>>()
937            .join("\n");
938    } else {
939        content = strip_indented_code_indent(&content);
940    }
941    if let Some(fmt) = raw_format {
942        return Block::RawBlock(fmt, content);
943    }
944    Block::CodeBlock(attr, content)
945}
946
947/// Pandoc's raw-attribute syntax (`Ext_raw_attribute`) treats a fenced code
948/// block whose info string is exactly `{=format}` as a `RawBlock` of that
949/// format rather than a `CodeBlock`. The brace contents must start with `=`
950/// followed by a non-empty token, with no other classes/ids/key-value pairs.
951fn code_block_raw_format(node: &SyntaxNode) -> Option<String> {
952    let open = node
953        .children()
954        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)?;
955    let info = open
956        .children()
957        .find(|c| c.kind() == SyntaxKind::CODE_INFO)?;
958    let raw = info.text().to_string();
959    let trimmed = raw.trim();
960    let inner = trimmed
961        .strip_prefix('{')
962        .and_then(|s| s.strip_suffix('}'))?;
963    let inner = inner.trim();
964    let format = inner.strip_prefix('=')?.trim();
965    if format.is_empty() || format.contains(char::is_whitespace) {
966        return None;
967    }
968    Some(format.to_string())
969}
970
971fn code_block_attr(node: &SyntaxNode) -> Attr {
972    let Some(open) = node
973        .children()
974        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
975    else {
976        return Attr::default();
977    };
978    let Some(info) = open.children().find(|c| c.kind() == SyntaxKind::CODE_INFO) else {
979        return Attr::default();
980    };
981    let raw = info.text().to_string();
982    let trimmed = raw.trim();
983    if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
984        return parse_attr_block(inner);
985    }
986    // Shortcut form: `lang {.cls #id key=value}` — language followed by an
987    // attribute block. Pandoc concatenates the language as the first class.
988    if let Some(brace) = trimmed.find('{')
989        && trimmed.ends_with('}')
990    {
991        let lang = trimmed[..brace].trim();
992        let attr_inner = &trimmed[brace + 1..trimmed.len() - 1];
993        let mut attr = parse_attr_block(attr_inner);
994        if !lang.is_empty() {
995            attr.classes.insert(0, normalize_lang_id(lang));
996        }
997        return attr;
998    }
999    if !trimmed.is_empty() {
1000        return Attr {
1001            id: String::new(),
1002            classes: vec![normalize_lang_id(trimmed)],
1003            kvs: Vec::new(),
1004        };
1005    }
1006    Attr::default()
1007}
1008
/// Normalize a code-block language identifier in the manner of pandoc's
/// `toLanguageId` (Markdown reader): ASCII-lowercase it and map the
/// GitHub-style spellings `c++` → `cpp` and `objective-c` → `objectivec`.
/// The alias comparison ignores ASCII case.
fn normalize_lang_id(lang: &str) -> String {
    if lang.eq_ignore_ascii_case("c++") {
        "cpp".to_string()
    } else if lang.eq_ignore_ascii_case("objective-c") {
        "objectivec".to_string()
    } else {
        lang.to_ascii_lowercase()
    }
}
1020
1021/// Pandoc strips up to four leading spaces (or one tab) from each line of an
1022/// indented code block. The CST keeps the indent as part of CODE_CONTENT, so
1023/// we remove it here.
1024fn strip_indented_code_indent(s: &str) -> String {
1025    let mut out = String::with_capacity(s.len());
1026    for (i, line) in s.split('\n').enumerate() {
1027        if i > 0 {
1028            out.push('\n');
1029        }
1030        // Pandoc expands tabs to 4-column tab stops *before* stripping the
1031        // 4-column indent. Mixed `  \tfoo` therefore becomes `    foo` →
1032        // `foo` after strip, which is what `pandoc -t native` emits.
1033        let expanded = expand_tabs_to_4(line);
1034        let stripped = if let Some(rest) = expanded.strip_prefix("    ") {
1035            rest.to_string()
1036        } else if let Some(rest) = expanded.strip_prefix('\t') {
1037            rest.to_string()
1038        } else {
1039            // Strip up to 3 leading spaces if present (pandoc tolerates short
1040            // indentation only on blank lines, which we don't try to detect
1041            // here — safer to leave non-conforming lines alone).
1042            expanded
1043        };
1044        out.push_str(&stripped);
1045    }
1046    out
1047}
1048
/// Replace every `\t` in `line` with the spaces needed to reach the next
/// 4-column tab stop, counting columns from 0 at the start of `line` and
/// advancing one column per non-tab character. Used so code-block bodies
/// byte-equal what pandoc emits.
fn expand_tabs_to_4(line: &str) -> String {
    const TAB_STOP: usize = 4;
    let mut expanded = String::with_capacity(line.len());
    let mut column = 0usize;
    for ch in line.chars() {
        match ch {
            '\t' => {
                // Distance to the next multiple of TAB_STOP (always 1..=4).
                let pad = TAB_STOP - column % TAB_STOP;
                expanded.extend(std::iter::repeat(' ').take(pad));
                column += pad;
            }
            other => {
                expanded.push(other);
                column += 1;
            }
        }
    }
    expanded
}
1069
1070/// Single-block projection of an opaque `HTML_BLOCK`. Used when a non-
1071/// structural caller (e.g. grid-table cell reparse via `block_from`)
1072/// needs one `Block` rather than a stream. Emits a single `RawBlock`
1073/// — no structural lift (the lifted shape projects as multiple blocks
1074/// and is handled by `emit_html_block` via `collect_block`).
1075fn html_block(node: &SyntaxNode) -> Block {
1076    let mut content = node.text().to_string();
1077    while content.ends_with('\n') {
1078        content.pop();
1079    }
1080    Block::RawBlock("html".to_string(), content)
1081}
1082
1083/// Project an `HTML_BLOCK_DIV` node (a Pandoc-dialect-lifted
1084/// `<div ...>...</div>` block) into a `Block::Div`.
1085///
1086/// Walks the structural CST: attributes come from the open
1087/// `HTML_BLOCK_TAG`'s `HTML_ATTRS` descendant (Phase 1's structural
1088/// lift) and inner blocks from any non-tag CST children (Phase 6's
1089/// structural lift — `PARAGRAPH`, `HEADING`, nested `HTML_BLOCK_DIV`,
1090/// etc., produced when the parser recursively parses the inner
1091/// content of a `<div>...</div>` body).
1092///
1093/// All currently exercised `<div>` shapes lift structurally (clean
1094/// multi-line, open-trailing, butted-close, indented-close, same-
1095/// line, empty / blank-only, and bq-wrapped shapes). The defensive
1096/// fallback below emits an empty `Div` if a future parser change
1097/// somehow yields an unlifted `HTML_BLOCK_DIV` — that would be a
1098/// parser bug, not something the projector should silently reparse.
1099fn html_div_block(node: &SyntaxNode) -> Block {
1100    let attr = cst_div_open_tag_attr(node);
1101    if div_has_structural_inner(node) {
1102        let mut blocks = Vec::new();
1103        for child in node.children() {
1104            match child.kind() {
1105                SyntaxKind::HTML_BLOCK_TAG | SyntaxKind::BLANK_LINE => {}
1106                _ => collect_block(&child, &mut blocks),
1107            }
1108        }
1109        return Block::Div(attr, blocks);
1110    }
1111    debug_assert!(
1112        false,
1113        "HTML_BLOCK_DIV without structural inner shape — parser regression"
1114    );
1115    Block::Div(attr, Vec::new())
1116}
1117
1118/// Concatenate the node's token text, dropping prefix tokens injected
1119/// by the parser for container nesting:
1120/// - Every `BLOCK_QUOTE_MARKER` and the immediately-following
1121///   `WHITESPACE` token (bq-wrapped HTML lift —
1122///   `> <div>\n> foo\n> </div>` becomes `<div>\nfoo\n</div>`).
1123/// - A leading `WHITESPACE` token at the start of each source line
1124///   when it is NOT preceded by a `BLOCK_QUOTE_MARKER` on the same
1125///   line (list-item content_col stripped by
1126///   `parser/utils/list_item_buffer.rs::strip_list_item_indent` and
1127///   re-injected as a structural `WHITESPACE` token during graft —
1128///   `- <pre>\n  foo\n  </pre>` becomes `<pre>\nfoo\n</pre>` in the
1129///   RawBlock text). The parser never emits a leading line-start
1130///   `WHITESPACE` inside `HTML_BLOCK_CONTENT` or `HTML_BLOCK_TAG`
1131///   outside this lift path — top-level indented HTML keeps the
1132///   leading indent inside a single `TEXT` token — so the rule is
1133///   unambiguous.
1134fn collect_html_block_text_skip_bq_markers(node: &SyntaxNode) -> String {
1135    let mut out = String::new();
1136    let mut skip_next_ws = false;
1137    let mut at_line_start = true;
1138    walk_skip_bq_markers(node, &mut out, &mut skip_next_ws, &mut at_line_start);
1139    out
1140}
1141
1142fn walk_skip_bq_markers(
1143    node: &SyntaxNode,
1144    out: &mut String,
1145    skip_next_ws: &mut bool,
1146    at_line_start: &mut bool,
1147) {
1148    for child in node.children_with_tokens() {
1149        match child {
1150            NodeOrToken::Node(n) => walk_skip_bq_markers(&n, out, skip_next_ws, at_line_start),
1151            NodeOrToken::Token(t) => {
1152                if t.kind() == SyntaxKind::BLOCK_QUOTE_MARKER {
1153                    *skip_next_ws = true;
1154                    *at_line_start = false;
1155                    continue;
1156                }
1157                if *skip_next_ws && t.kind() == SyntaxKind::WHITESPACE {
1158                    *skip_next_ws = false;
1159                    *at_line_start = false;
1160                    continue;
1161                }
1162                if *at_line_start && t.kind() == SyntaxKind::WHITESPACE {
1163                    *at_line_start = false;
1164                    continue;
1165                }
1166                *skip_next_ws = false;
1167                let kind = t.kind();
1168                out.push_str(t.text());
1169                *at_line_start = kind == SyntaxKind::NEWLINE || kind == SyntaxKind::BLANK_LINE;
1170            }
1171        }
1172    }
1173}
1174
1175/// True when the parser has lifted the `<div>` body into structural
1176/// CST children AND both the open and close `HTML_BLOCK_TAG`s are
1177/// "clean" (carry no inner content): the open tag ends with the `>`
1178/// token followed only by a NEWLINE, and the close tag's first text
1179/// starts with `</`. "Messy" shapes — same-line `<div>foo</div>`,
1180/// trailing content on the open tag (`<div>foo\nbar\n</div>`),
1181/// butted close (`<div>\nfoo\nbar</div>`) — fall through to the byte
1182/// reparse path, which is the source of truth for those cases until
1183/// follow-up parser work lifts them too.
1184///
1185/// Presence of an `HTML_BLOCK_CONTENT` child signals an unlifted body
1186/// (parser kept body lines as opaque TEXT) — bq-wrapped divs are the
1187/// current example. Those still need the byte-reparse path. Empty
1188/// and blank-only bodies have no `HTML_BLOCK_CONTENT` child and can
1189/// be lifted structurally to `Div ("",[],[]) []`.
1190fn div_has_structural_inner(node: &SyntaxNode) -> bool {
1191    let mut tags = node
1192        .children()
1193        .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG);
1194    let Some(open_tag) = tags.next() else {
1195        return false;
1196    };
1197    // Close tag is optional: pandoc emits an implicit close at EOF
1198    // for an unclosed `<div>` (warning: "Div ... unclosed ... closing
1199    // implicitly"). The body lift still produces structural children
1200    // (or none, for empty `<div>`), which we project as `Block::Div`.
1201    let close_tag = tags.next();
1202    if tags.next().is_some() {
1203        return false;
1204    }
1205    if !html_block_open_tag_is_clean(&open_tag) {
1206        return false;
1207    }
1208    if let Some(close_tag) = close_tag.as_ref()
1209        && !html_block_close_tag_is_clean(close_tag)
1210    {
1211        return false;
1212    }
1213    !node
1214        .children()
1215        .any(|c| c.kind() == SyntaxKind::HTML_BLOCK_CONTENT)
1216}
1217
1218/// True when the open `HTML_BLOCK_TAG` carries no inner content after
1219/// its closing `>`: the tag's children, in order, end with a TEXT
1220/// token whose last byte is `>` (either the dedicated `>` token used
1221/// by the structural `<div>` emission, or the whole-line TEXT used by
1222/// non-div strict-block emission like `<form>` / `<section>`),
1223/// followed only by zero or more NEWLINE tokens. Trailing content
1224/// (e.g. `<div id="x">foo\n`) returns false.
1225fn html_block_open_tag_is_clean(open_tag: &SyntaxNode) -> bool {
1226    let mut seen_gt = false;
1227    for child in open_tag.children_with_tokens() {
1228        let NodeOrToken::Token(t) = child else {
1229            // Structural HTML_ATTRS nodes are part of the open tag;
1230            // ignore them — they belong before `>`.
1231            continue;
1232        };
1233        if !seen_gt {
1234            if t.kind() == SyntaxKind::TEXT && t.text().ends_with('>') {
1235                seen_gt = true;
1236            }
1237        } else if t.kind() != SyntaxKind::NEWLINE {
1238            return false;
1239        }
1240    }
1241    seen_gt
1242}
1243
1244/// True when the close `HTML_BLOCK_TAG`'s first TEXT token begins with
1245/// `</`. A butted-close shape (`bar</div>`) starts with content text
1246/// and returns false.
1247fn html_block_close_tag_is_clean(close_tag: &SyntaxNode) -> bool {
1248    for child in close_tag.children_with_tokens() {
1249        if let NodeOrToken::Token(t) = child
1250            && t.kind() == SyntaxKind::TEXT
1251        {
1252            return t.text().starts_with("</");
1253        }
1254    }
1255    false
1256}
1257
1258/// Read the `<div>` open tag's attributes from the structural CST.
1259/// `HTML_BLOCK_DIV` always has an open `HTML_BLOCK_TAG` as its first
1260/// `HTML_BLOCK_TAG` child. The open tag may contain multiple
1261/// `HTML_ATTRS` regions when the source spans multiple attribute lines
1262/// (e.g. `<div\n  id="x"\n  class="y">`); join their text with spaces
1263/// before parsing so attributes from every line contribute. Empty
1264/// attributes (`<div>`) produce `Attr::default()`.
1265fn cst_div_open_tag_attr(node: &SyntaxNode) -> Attr {
1266    let Some(open_tag) = node
1267        .children()
1268        .find(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG)
1269    else {
1270        return Attr::default();
1271    };
1272    let mut parts: Vec<String> = Vec::new();
1273    for child in open_tag.children() {
1274        if child.kind() == SyntaxKind::HTML_ATTRS {
1275            parts.push(child.text().to_string());
1276        }
1277    }
1278    if parts.is_empty() {
1279        return Attr::default();
1280    }
1281    parse_html_attrs(parts.join(" ").trim())
1282}
1283
1284/// Project an `HTML_BLOCK` node into one or more `Block`s.
1285///
1286/// Pandoc's `markdown_in_html_blocks` extension (default-on under `markdown`
1287/// flavor) splits an HTML block at every complete *block-level* HTML tag:
1288/// each open or close tag emits its own `RawBlock`, and intervening
1289/// non-tag bytes are parsed as fresh markdown and emitted as `Plain` (or
1290/// `Para` for chunks separated by blank lines). Inline-only tags
1291/// (`<em>`, `<a>`, `<input>`, `<br>`, …) are not splitters — they pass
1292/// through as `RawInline` inside the surrounding `Plain` content.
1293///
1294/// Verbatim constructs are preserved as a single `RawBlock`: comments,
1295/// `<script>` / `<style>` / `<pre>` / `<textarea>`, processing
1296/// instructions, declarations, and CDATA. Balanced `<div>...</div>` is
1297/// handled at parse time (HTML_BLOCK_DIV lift) and routed through
1298/// `html_div_block`, not the splitter.
1299fn emit_html_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1300    // Fix #4 / Phase 6 structural lift: when the parser has lifted the
1301    // body into structural CST children (open `HTML_BLOCK_TAG` + body
1302    // blocks + close `HTML_BLOCK_TAG`, no `HTML_BLOCK_CONTENT`), walk
1303    // the children directly. This avoids the byte-reparse path that
1304    // would re-disambiguate heading auto-ids against a fresh inner
1305    // `RefsCtx` (producing `heading-1` instead of `heading` when the
1306    // outer ctx already saw the heading).
1307    if html_block_has_structural_lift(node) {
1308        emit_html_block_structural(node, out);
1309        return;
1310    }
1311    // Strip BLOCK_QUOTE_MARKER + WHITESPACE prefix tokens so the
1312    // byte-level walkers below see clean HTML — the parser keeps bq
1313    // markers as structural tokens inside HTML_BLOCK for verbatim-tag
1314    // content (e.g. `> <pre>\n> code\n> </pre>`). Outside a blockquote
1315    // this returns the same bytes as `node.text()`.
1316    let mut content = collect_html_block_text_skip_bq_markers(node);
1317    // Pandoc trims trailing ASCII whitespace (newlines, spaces, tabs)
1318    // from RawBlock text — `<!-- hi -->   \n` emits `RawBlock
1319    // "<!-- hi -->"`, not `"<!-- hi -->   "`. Interior whitespace is
1320    // preserved (e.g. `<pre>foo\n   </pre>` keeps the indented close).
1321    while content
1322        .as_bytes()
1323        .last()
1324        .is_some_and(|b| matches!(b, b'\n' | b'\r' | b' ' | b'\t'))
1325    {
1326        content.pop();
1327    }
1328    let leading_ws = content
1329        .as_bytes()
1330        .iter()
1331        .position(|&b| b != b' ' && b != b'\t')
1332        .unwrap_or(content.len());
1333    let trimmed = &content[leading_ws..];
1334    // Pandoc strips leading 1-3 spaces of indent from the first line
1335    // of an HTML block's RawBlock text — `  <pre>foo</pre>\n` emits
1336    // `RawBlock "<pre>foo</pre>"`. Subsequent lines keep their
1337    // indent. The HTML-block scanner only recognizes 0-3 leading
1338    // spaces of indent, so leading_ws is bounded; tabs aren't part
1339    // of an HTML-block opener and shouldn't be stripped.
1340    let strip_first_line_indent = leading_ws > 0
1341        && leading_ws <= 3
1342        && content.as_bytes()[..leading_ws].iter().all(|&b| b == b' ');
1343    if trimmed.starts_with("<!--")
1344        || trimmed.starts_with("<?")
1345        || trimmed.starts_with("<![CDATA[")
1346        || trimmed.starts_with("<!")
1347        || is_raw_text_element_open(trimmed)
1348    {
1349        let raw = if strip_first_line_indent {
1350            content[leading_ws..].to_string()
1351        } else {
1352            content
1353        };
1354        out.push(Block::RawBlock("html".to_string(), raw));
1355        return;
1356    }
1357    let walker_input = if strip_first_line_indent {
1358        &content[leading_ws..]
1359    } else {
1360        content.as_str()
1361    };
1362    split_html_block_by_tags(walker_input, out);
1363}
1364
1365/// True when an `HTML_BLOCK` carries the Fix #4 structural lift shape:
1366/// exactly two `HTML_BLOCK_TAG` children (open + close), both "clean"
1367/// (open ends at `>`, close starts with `</`), and no
1368/// `HTML_BLOCK_CONTENT` (which would mark an unlifted opaque body).
1369/// Empty bodies (only the two tags, with optional `BLANK_LINE` in
1370/// between) still count as lifted — they project as RawBlock +
1371/// RawBlock with nothing in between, matching pandoc.
1372fn html_block_has_structural_lift(node: &SyntaxNode) -> bool {
1373    let mut tags = node
1374        .children()
1375        .filter(|c| c.kind() == SyntaxKind::HTML_BLOCK_TAG);
1376    let Some(open_tag) = tags.next() else {
1377        return false;
1378    };
1379    let Some(close_tag) = tags.next() else {
1380        return false;
1381    };
1382    if tags.next().is_some() {
1383        return false;
1384    }
1385    if !html_block_open_tag_is_clean(&open_tag) {
1386        return false;
1387    }
1388    if !html_block_close_tag_is_clean(&close_tag) {
1389        return false;
1390    }
1391    !node
1392        .children()
1393        .any(|c| c.kind() == SyntaxKind::HTML_BLOCK_CONTENT)
1394}
1395
1396/// Emit an `HTML_BLOCK` whose body has been structurally lifted: walk
1397/// its CST children, projecting the open/close `HTML_BLOCK_TAG`s as
1398/// `RawBlock` (one each, trailing newlines trimmed to match pandoc-
1399/// native's tag-only emission) and inner block children through
1400/// `collect_block`. `BLANK_LINE` children are skipped (they don't
1401/// project to anything in pandoc-native).
1402fn emit_html_block_structural(node: &SyntaxNode, out: &mut Vec<Block>) {
1403    for child in node.children() {
1404        match child.kind() {
1405            SyntaxKind::HTML_BLOCK_TAG => {
1406                let text = open_tag_raw_block_text(&child);
1407                out.push(Block::RawBlock("html".to_string(), text));
1408            }
1409            SyntaxKind::BLANK_LINE => {}
1410            _ => collect_block(&child, out),
1411        }
1412    }
1413}
1414
1415/// Produce the `RawBlock` text for an `HTML_BLOCK_TAG` (open or close)
1416/// under the structural lift. Trailing newlines are always trimmed
1417/// (pandoc emits the tag bytes only). When the tag contains an
1418/// `HTML_ATTRS` structural region, the text is canonicalized to
1419/// pandoc's single-line form `<tagname attr1 attr2 ...>`:
1420/// multi-line opens collapse to one line, inter-attribute whitespace
1421/// normalizes to a single space, and any trailing whitespace before
1422/// `>` is dropped. Open tags without structural HTML_ATTRS (e.g.
1423/// `<form>`) and close tags (`</form>`) keep their literal text.
1424fn open_tag_raw_block_text(tag: &SyntaxNode) -> String {
1425    let has_attrs = tag.children().any(|c| c.kind() == SyntaxKind::HTML_ATTRS);
1426    if has_attrs {
1427        let mut name_prefix: Option<String> = None;
1428        let mut attrs: Vec<String> = Vec::new();
1429        for child in tag.children_with_tokens() {
1430            match child {
1431                NodeOrToken::Token(t) if t.kind() == SyntaxKind::TEXT => {
1432                    let text = t.text();
1433                    if name_prefix.is_none() && text.starts_with('<') {
1434                        if let Some(gt_idx) = text.find('>') {
1435                            // Whole-line shape (`<form>` etc., shouldn't
1436                            // reach here because has_attrs would be
1437                            // false). Defensive: emit literal prefix.
1438                            return text[..=gt_idx].to_string();
1439                        }
1440                        name_prefix = Some(text.to_string());
1441                    }
1442                }
1443                NodeOrToken::Node(n) if n.kind() == SyntaxKind::HTML_ATTRS => {
1444                    let attr_text = n.text().to_string();
1445                    let trimmed = attr_text.trim();
1446                    if !trimmed.is_empty() {
1447                        attrs.push(trimmed.to_string());
1448                    }
1449                }
1450                _ => {}
1451            }
1452        }
1453        let mut result = name_prefix.unwrap_or_default();
1454        for attr in &attrs {
1455            result.push(' ');
1456            result.push_str(attr);
1457        }
1458        result.push('>');
1459        return result;
1460    }
1461    // Blockquote-wrapped close tags (`> </form>`, `> </video>`) carry
1462    // their leading `BLOCK_QUOTE_MARKER + WHITESPACE` tokens inside the
1463    // close `HTML_BLOCK_TAG` for losslessness. Pandoc-native's RawBlock
1464    // text is the tag bytes only — strip those prefix tokens. Leading
1465    // 1-3 space indent (captured as a WHITESPACE token before the tag
1466    // name TEXT) is likewise stripped: pandoc's HTML block scanner
1467    // accepts ≤ 3 leading spaces on the open/close line but doesn't
1468    // round-trip them into the RawBlock text.
1469    let mut text = String::new();
1470    let mut skip_next_ws = false;
1471    for child in tag.children_with_tokens() {
1472        if let NodeOrToken::Token(t) = child {
1473            if t.kind() == SyntaxKind::BLOCK_QUOTE_MARKER {
1474                skip_next_ws = true;
1475                continue;
1476            }
1477            if skip_next_ws && t.kind() == SyntaxKind::WHITESPACE {
1478                skip_next_ws = false;
1479                continue;
1480            }
1481            if text.is_empty() && t.kind() == SyntaxKind::WHITESPACE {
1482                continue;
1483            }
1484            skip_next_ws = false;
1485            text.push_str(t.text());
1486        }
1487    }
1488    while text.ends_with('\n') {
1489        text.pop();
1490    }
1491    text
1492}
1493
1494/// Walk `content`'s bytes and split at every complete block-level HTML tag.
1495/// Each tag emits its own `RawBlock`; intervening text is flushed via
1496/// [`flush_html_block_text`]. Balanced `<div>...</div>` pairs (depth-aware)
1497/// project to `Block::Div`.
1498///
1499/// Tag classification follows pandoc:
1500/// - **Strict block tags** (pandoc's `blockHtmlTags`) always split.
1501/// - **Inline-block tags** (pandoc's `eitherBlockOrInline` set —
1502///   `<iframe>`, `<button>`, `<video>`, …) split with a matched-pair
1503///   3-way lift only at fresh-block positions; inside an existing
1504///   inline run they pass through as raw inline HTML. The
1505///   `inline_pending` flag tracks whether any non-whitespace content
1506///   (text bytes or non-splitting tags) has appeared since the last
1507///   splitter — when true, we don't split on inline-block tags.
1508fn split_html_block_by_tags(content: &str, out: &mut Vec<Block>) {
1509    use crate::parser::blocks::html_blocks::{
1510        is_pandoc_block_tag_name, is_pandoc_inline_block_tag_name, is_pandoc_void_block_tag_name,
1511    };
1512    use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
1513
1514    let bytes = content.as_bytes();
1515    let mut i = 0usize;
1516    let mut text_start = 0usize;
1517    let mut inline_pending = false;
1518    let mut consecutive_newlines = 0usize;
1519    while i < bytes.len() {
1520        let b = bytes[i];
1521        if b == b'\n' {
1522            consecutive_newlines += 1;
1523            // A blank line resets the inline-pending state — pandoc
1524            // restarts block parsing after a blank line, so subsequent
1525            // inline-block tags become eligible to split again.
1526            if consecutive_newlines >= 2 {
1527                inline_pending = false;
1528            }
1529            i += 1;
1530            continue;
1531        }
1532        consecutive_newlines = 0;
1533        if b != b'<' {
1534            if !b.is_ascii_whitespace() {
1535                inline_pending = true;
1536            }
1537            i += 1;
1538            continue;
1539        }
1540        let rest = &content[i..];
1541        let open_end = parse_open_tag(rest);
1542        let close_end = parse_close_tag(rest);
1543        let Some((tag_end, is_close)) = open_end
1544            .map(|n| (n, false))
1545            .or_else(|| close_end.map(|n| (n, true)))
1546        else {
1547            inline_pending = true;
1548            i += 1;
1549            continue;
1550        };
1551        let tag_text = &rest[..tag_end];
1552        let Some(name) = extract_html_tag_name(tag_text) else {
1553            inline_pending = true;
1554            i += 1;
1555            continue;
1556        };
1557        if is_pandoc_block_tag_name(name) {
1558            // Strict block tags (incl. `<div>`) inside an opaque
1559            // HTML_BLOCK split into RawBlocks per tag. Matched
1560            // `<div>...</div>` is handled at parse time (HTML_BLOCK_DIV
1561            // lift); we only reach the splitter for unbalanced or
1562            // multi-tag content (e.g. `</section>` standalone), so
1563            // emit each tag as its own RawBlock.
1564            if i > text_start {
1565                flush_html_block_text(&content[text_start..i], out);
1566            }
1567            out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1568            i += tag_end;
1569            text_start = i;
1570            inline_pending = false;
1571            continue;
1572        }
1573        if is_pandoc_inline_block_tag_name(name) {
1574            // At a fresh-block position (!inline_pending):
1575            //
1576            // - Open tag with matched close, interior not opening with a
1577            //   void block tag: lift `<tag>...</tag>` into RawBlock +
1578            //   interior + RawBlock.
1579            // - Open tag with no matched close, or open tag whose interior
1580            //   begins (after any indent) with a void block tag at column
1581            //   0: emit the open tag as a single RawBlock and continue
1582            //   scanning. Pandoc-native treats `<video>\n<source>...` as
1583            //   per-tag emission rather than a balanced span; once
1584            //   `<source>` interrupts the run, the closing `</video>` ends
1585            //   up as `RawInline` inside the trailing paragraph.
1586            // - Close tag at fresh-block: emit as a single RawBlock.
1587            //   Pandoc-native pins `</video>` standalone as a RawBlock.
1588            //
1589            // Inside an existing inline run (`inline_pending == true`),
1590            // pass through as inline raw HTML (pandoc's `cannot_interrupt`
1591            // semantics for `eitherBlockOrInline` tags).
1592            if !inline_pending {
1593                if !is_close
1594                    && let Some((close_start, close_end)) =
1595                        find_matching_html_close_with_start(content, i, name)
1596                    && !interior_starts_with_void_block_tag(content, i + tag_end)
1597                {
1598                    if i > text_start {
1599                        flush_html_block_text(&content[text_start..i], out);
1600                    }
1601                    out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1602                    let interior = &content[i + tag_end..close_start];
1603                    flush_html_block_text(interior, out);
1604                    let close_text = &content[close_start..close_end];
1605                    out.push(Block::RawBlock("html".to_string(), close_text.to_string()));
1606                    i = close_end;
1607                    text_start = i;
1608                    inline_pending = false;
1609                    continue;
1610                }
1611                if i > text_start {
1612                    flush_html_block_text(&content[text_start..i], out);
1613                }
1614                out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1615                i += tag_end;
1616                text_start = i;
1617                inline_pending = false;
1618                continue;
1619            }
1620            inline_pending = true;
1621            i += tag_end;
1622            continue;
1623        }
1624        if is_pandoc_void_block_tag_name(name) {
1625            // Void `eitherBlockOrInline` tags (`<embed>`, `<area>`,
1626            // `<source>`, `<track>`) emit as a single `RawBlock` per
1627            // instance at fresh-block positions; inside inline content
1628            // (`inline_pending == true`) they pass through as raw
1629            // inline HTML. The closing form (`</embed>` etc.) is not
1630            // valid HTML for void elements, but if it appears in the
1631            // wild pandoc still emits it as a `RawBlock` at fresh-block
1632            // positions — mirror that.
1633            if !inline_pending {
1634                if i > text_start {
1635                    flush_html_block_text(&content[text_start..i], out);
1636                }
1637                out.push(Block::RawBlock("html".to_string(), tag_text.to_string()));
1638                i += tag_end;
1639                text_start = i;
1640                inline_pending = false;
1641                continue;
1642            }
1643            inline_pending = true;
1644            i += tag_end;
1645            continue;
1646        }
1647        // Non-splitting tag (truly inline-only HTML). Mark that an
1648        // inline run has started so subsequent `eitherBlockOrInline`
1649        // tags don't split mid-paragraph.
1650        inline_pending = true;
1651        i += tag_end;
1652    }
1653    if text_start < bytes.len() {
1654        // Tail text — no further tag follows in this HTML block, so the
1655        // final `Para` should NOT be demoted to `Plain`. Pandoc only
1656        // promotes a paragraph to `Plain` when it is butted up against
1657        // the next HTML tag in the same block.
1658        flush_html_block_tail_text(&content[text_start..], out);
1659    }
1660}
1661
1662/// Reparse inter-tag text as fresh Pandoc markdown. The final `Para`
1663/// becomes a `Plain` when the text has no trailing blank line (i.e. a
1664/// closing tag follows immediately): pandoc promotes the last paragraph
1665/// to `Plain` whenever it is butted up against the next HTML tag.
1666///
1667/// Use [`flush_html_block_tail_text`] for text at the END of the HTML
1668/// block (no tag follows) — the demotion would be wrong there.
1669fn flush_html_block_text(text: &str, out: &mut Vec<Block>) {
1670    if text.trim().is_empty() {
1671        return;
1672    }
1673    let trailing_blank = trailing_newlines(text) >= 2;
1674    let mut blocks = parse_pandoc_blocks(text);
1675    if blocks.is_empty() {
1676        return;
1677    }
1678    if !trailing_blank
1679        && let Some(Block::Para(_)) = blocks.last()
1680        && let Some(Block::Para(inlines)) = blocks.pop()
1681    {
1682        blocks.push(Block::Plain(inlines));
1683    }
1684    out.extend(blocks);
1685}
1686
1687/// Reparse trailing text at the end of an HTML block (no tag follows).
1688/// Unlike [`flush_html_block_text`], the final `Para` is preserved —
1689/// pandoc only demotes to `Plain` when butted up against the next tag.
1690fn flush_html_block_tail_text(text: &str, out: &mut Vec<Block>) {
1691    if text.trim().is_empty() {
1692        return;
1693    }
1694    let blocks = parse_pandoc_blocks(text);
1695    out.extend(blocks);
1696}
1697
/// Number of `\n` bytes at the very end of `s` (0 when `s` does not end
/// in a newline). Only `\n` counts; `\r` or spaces stop the run.
fn trailing_newlines(s: &str) -> usize {
    s.len() - s.trim_end_matches('\n').len()
}
1701
1702/// Whether the slice of `content` starting at `interior_start` (the byte
1703/// just after an inline-block open tag like `<video>`) begins on its first
1704/// non-blank line with a void block tag (`<source>`, `<embed>`, `<area>`,
1705/// `<track>`). When true, pandoc-native abandons the matched-pair lift —
1706/// the void tag emits as its own `RawBlock` and the closing `</video>`
1707/// ends up inline inside the trailing paragraph rather than as a
1708/// matched-pair close. Leading indentation is allowed before the void tag
1709/// (pandoc still abandons even when the void tag is indented).
1710fn interior_starts_with_void_block_tag(content: &str, interior_start: usize) -> bool {
1711    use crate::parser::blocks::html_blocks::is_pandoc_void_block_tag_name;
1712    use crate::parser::inlines::inline_html::parse_open_tag;
1713
1714    let bytes = content.as_bytes();
1715    let mut i = interior_start;
1716    while i < bytes.len() && matches!(bytes[i], b'\n' | b' ' | b'\t') {
1717        i += 1;
1718    }
1719    if i >= bytes.len() || bytes[i] != b'<' {
1720        return false;
1721    }
1722    let rest = &content[i..];
1723    let Some(end) = parse_open_tag(rest) else {
1724        return false;
1725    };
1726    extract_html_tag_name(&rest[..end]).is_some_and(is_pandoc_void_block_tag_name)
1727}
1728
/// Pull the element name out of a complete HTML tag (`<name ...>` or
/// `</name>`). The name is the run of ASCII alphanumerics and `-` that
/// follows the `<` (and one optional `/`). Returns `None` when the text
/// does not start with `<` or when that run is empty. Callers use the
/// name to decide whether the tag is block-level.
fn extract_html_tag_name(tag_text: &str) -> Option<&str> {
    let after_bracket = tag_text.strip_prefix('<')?;
    let body = after_bracket.strip_prefix('/').unwrap_or(after_bracket);
    let name_len = body
        .bytes()
        .take_while(|b| b.is_ascii_alphanumeric() || *b == b'-')
        .count();
    if name_len == 0 {
        return None;
    }
    Some(&body[..name_len])
}
1747
1748/// Depth-aware scan for the matching closing tag of `name` starting at
1749/// byte position `start` (the `<` of the opening tag) in `content`.
1750/// Returns `(close_start, close_end)` — the bounds of the matching
1751/// `</name>` tag — or `None` when no balanced close exists in `content`.
1752fn find_matching_html_close_with_start(
1753    content: &str,
1754    start: usize,
1755    name: &str,
1756) -> Option<(usize, usize)> {
1757    use crate::parser::inlines::inline_html::{parse_close_tag, parse_open_tag};
1758
1759    let bytes = content.as_bytes();
1760    let opener_end = parse_open_tag(&content[start..])?;
1761    let mut i = start + opener_end;
1762    let mut depth = 1usize;
1763    while i < bytes.len() {
1764        if bytes[i] != b'<' {
1765            i += 1;
1766            continue;
1767        }
1768        let rest = &content[i..];
1769        if let Some(end) = parse_open_tag(rest) {
1770            let tag = &rest[..end];
1771            if extract_html_tag_name(tag).is_some_and(|n| n.eq_ignore_ascii_case(name)) {
1772                depth += 1;
1773            }
1774            i += end;
1775            continue;
1776        }
1777        if let Some(end) = parse_close_tag(rest) {
1778            let tag = &rest[..end];
1779            if extract_html_tag_name(tag).is_some_and(|n| n.eq_ignore_ascii_case(name)) {
1780                depth -= 1;
1781                if depth == 0 {
1782                    return Some((i, i + end));
1783                }
1784            }
1785            i += end;
1786            continue;
1787        }
1788        i += 1;
1789    }
1790    None
1791}
1792
/// Return true if `s` (with leading `<`) opens a raw-text HTML element
/// where pandoc keeps the entire block verbatim — no markdown parsing
/// inside. The recognised elements are `script`, `style`, `pre` and
/// `textarea`, matched ASCII case-insensitively.
///
/// The tag name must be followed by a space, tab, newline, `>`, `/`, or
/// the end of the string, so `<prefix>` does not count as `<pre>`.
///
/// The comparison runs on bytes rather than on `str` slices: cutting the
/// input at `tag.len()` could otherwise land inside a multi-byte
/// character (e.g. `<pré>`) and panic.
fn is_raw_text_element_open(s: &str) -> bool {
    const RAW_TEXT_TAGS: [&str; 4] = ["script", "style", "pre", "textarea"];

    let bytes = s.as_bytes();
    if bytes.first() != Some(&b'<') {
        return false;
    }
    let rest = &bytes[1..];
    RAW_TEXT_TAGS.iter().any(|tag| {
        // `get` yields `None` when the input is shorter than the tag name.
        let name_matches = rest
            .get(..tag.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(tag.as_bytes()));
        name_matches
            && matches!(
                rest.get(tag.len()),
                None | Some(b' ' | b'\t' | b'\n' | b'>' | b'/')
            )
    })
}
1820
1821/// Iterate `node`'s block-level emission, handling `HTML_BLOCK` splitting
1822/// (one HTML block can project as several pandoc-native blocks under
1823/// `markdown_in_html_blocks`) while keeping every other kind one-block.
1824fn collect_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1825    if node.kind() == SyntaxKind::HTML_BLOCK_DIV {
1826        // `HTML_BLOCK_DIV` is the parser's explicit `<div>` retag. The
1827        // structural projector walks lifted CST children directly —
1828        // all balanced `<div>` shapes lift at parse time.
1829        out.push(html_div_block(node));
1830        return;
1831    }
1832    if node.kind() == SyntaxKind::HTML_BLOCK {
1833        // Opaque HTML block — comments, PI, verbatim (`<pre>`, `<style>`,
1834        // `<script>`, `<textarea>`), void inline-block tags, and any
1835        // strict/inline-block tag the parser couldn't lift. The byte
1836        // walker splits these into per-tag RawBlocks plus interior text.
1837        emit_html_block(node, out);
1838        return;
1839    }
1840    if let Some(b) = block_from(node) {
1841        out.push(b);
1842    }
1843}
1844
1845/// Reparse `text` as Pandoc-flavored markdown and return its top-level
1846/// blocks. Unlike `parse_cell_text_blocks`, leaves `Para` as `Para` — the
1847/// caller decides whether the surrounding context demands `Plain`.
1848fn parse_pandoc_blocks(text: &str) -> Vec<Block> {
1849    if text.trim().is_empty() {
1850        return Vec::new();
1851    }
1852    let opts = crate::ParserOptions {
1853        flavor: crate::Flavor::Pandoc,
1854        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
1855        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
1856        ..crate::ParserOptions::default()
1857    };
1858    let doc = crate::parse(text, Some(opts));
1859    // Swap REFS_CTX with one built from the inner CST so heading auto-ids,
1860    // reference-link defs, and footnote defs inside the recursive parse
1861    // resolve against inner offsets/labels rather than the outer document's.
1862    // Outer refs/footnotes/heading-id history are inherited so a `<div>`
1863    // body can use a label/footnote defined outside, and inner heading
1864    // slugs disambiguate against outer headings. Pandoc parses
1865    // `<div>...</div>` natively in one pass, so this approximation
1866    // matches the common case (outer-def-before-inner-use, inner-loses
1867    // for shared keys); offset-aware document-order resolution would be
1868    // needed for full parity but is not exercised by current corpus.
1869    let outer = REFS_CTX.with(|c| std::mem::take(&mut *c.borrow_mut()));
1870    let inner_ctx = build_refs_ctx_inherited(&doc, Some(&outer));
1871    REFS_CTX.with(|c| *c.borrow_mut() = inner_ctx);
1872    let mut out = Vec::new();
1873    for child in doc.children() {
1874        collect_block(&child, &mut out);
1875    }
1876    REFS_CTX.with(|c| *c.borrow_mut() = outer);
1877    out
1878}
1879
1880fn tex_block(node: &SyntaxNode) -> Block {
1881    let mut content = node.text().to_string();
1882    while content.ends_with('\n') {
1883        content.pop();
1884    }
1885    Block::RawBlock("tex".to_string(), content)
1886}
1887
1888fn fenced_div(node: &SyntaxNode) -> Block {
1889    let attr = node
1890        .children()
1891        .find(|c| c.kind() == SyntaxKind::DIV_FENCE_OPEN)
1892        .map(|open| {
1893            let info = open
1894                .children()
1895                .find(|c| c.kind() == SyntaxKind::DIV_INFO)
1896                .map(|n| n.text().to_string())
1897                .unwrap_or_default();
1898            parse_div_info(info.trim())
1899        })
1900        .unwrap_or_default();
1901    let mut blocks = Vec::new();
1902    for child in node.children() {
1903        match child.kind() {
1904            SyntaxKind::DIV_FENCE_OPEN | SyntaxKind::DIV_FENCE_CLOSE => {}
1905            _ => collect_block(&child, &mut blocks),
1906        }
1907    }
1908    Block::Div(attr, blocks)
1909}
1910
1911/// Parse pandoc div info: either `{#id .class1 .class2 key=value}` or a single
1912/// bare class name like `Warning`.
1913fn parse_div_info(info: &str) -> Attr {
1914    if info.starts_with('{') && info.ends_with('}') {
1915        return parse_attr_block(&info[1..info.len() - 1]);
1916    }
1917    if !info.is_empty() {
1918        return Attr {
1919            id: String::new(),
1920            classes: vec![info.to_string()],
1921            kvs: Vec::new(),
1922        };
1923    }
1924    Attr::default()
1925}
1926
1927/// Read a child `ATTRIBUTE` (node or token) on `parent` and parse its
1928/// `{...}` body into an `Attr`. Returns `Attr::default()` if no attribute
1929/// is attached or the body isn't `{...}`-shaped.
1930fn extract_attr_from_node(parent: &SyntaxNode) -> Attr {
1931    let raw = parent.children_with_tokens().find_map(|el| match el {
1932        NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
1933        NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => Some(t.text().to_string()),
1934        _ => None,
1935    });
1936    let Some(raw) = raw else {
1937        return Attr::default();
1938    };
1939    let trimmed = raw.trim();
1940    if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
1941        parse_attr_block(inner)
1942    } else {
1943        Attr::default()
1944    }
1945}
1946
1947/// Parse the body of an attribute block like `#my-id .class1 .class2 key=value`.
1948/// Whitespace-separated. Tokens starting with `#` are id, `.` are classes,
1949/// `key=value` (optionally quoted value) are kvs.
1950fn parse_attr_block(s: &str) -> Attr {
1951    let mut id = String::new();
1952    let mut classes: Vec<String> = Vec::new();
1953    let mut kvs: Vec<(String, String)> = Vec::new();
1954    let bytes = s.as_bytes();
1955    let mut i = 0usize;
1956    while i < bytes.len() {
1957        match bytes[i] {
1958            b' ' | b'\t' | b'\n' | b'\r' => {
1959                i += 1;
1960            }
1961            b'#' => {
1962                let start = i + 1;
1963                let mut j = start;
1964                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
1965                    j += 1;
1966                }
1967                id = s[start..j].to_string();
1968                i = j;
1969            }
1970            b'.' => {
1971                let start = i + 1;
1972                let mut j = start;
1973                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
1974                    j += 1;
1975                }
1976                classes.push(s[start..j].to_string());
1977                i = j;
1978            }
1979            _ => {
1980                // Read key up to `=` or whitespace.
1981                let key_start = i;
1982                while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=') {
1983                    i += 1;
1984                }
1985                let key = s[key_start..i].to_string();
1986                if i < bytes.len() && bytes[i] == b'=' {
1987                    i += 1;
1988                    let value = if i < bytes.len() && bytes[i] == b'"' {
1989                        i += 1;
1990                        let v_start = i;
1991                        while i < bytes.len() && bytes[i] != b'"' {
1992                            i += 1;
1993                        }
1994                        let v = s[v_start..i].to_string();
1995                        if i < bytes.len() {
1996                            i += 1;
1997                        }
1998                        v
1999                    } else {
2000                        let v_start = i;
2001                        while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
2002                            i += 1;
2003                        }
2004                        s[v_start..i].to_string()
2005                    };
2006                    kvs.push((key, value));
2007                } else if !key.is_empty() {
2008                    // Bare token (legacy class form).
2009                    classes.push(key);
2010                }
2011            }
2012        }
2013    }
2014    Attr { id, classes, kvs }
2015}
2016
2017/// Parse HTML-style attributes `class="x" id="y" key="z"` into `Attr`,
2018/// mapping `class` (whitespace-split) → classes, `id` → id, others → kvs.
2019fn parse_html_attrs(s: &str) -> Attr {
2020    let mut id = String::new();
2021    let mut classes: Vec<String> = Vec::new();
2022    let mut kvs: Vec<(String, String)> = Vec::new();
2023    let bytes = s.as_bytes();
2024    let mut i = 0usize;
2025    while i < bytes.len() {
2026        match bytes[i] {
2027            b' ' | b'\t' | b'\n' | b'\r' => {
2028                i += 1;
2029            }
2030            _ => {
2031                let key_start = i;
2032                while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=') {
2033                    i += 1;
2034                }
2035                let key = s[key_start..i].to_string();
2036                let value = if i < bytes.len() && bytes[i] == b'=' {
2037                    i += 1;
2038                    if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
2039                        let quote = bytes[i];
2040                        i += 1;
2041                        let v_start = i;
2042                        while i < bytes.len() && bytes[i] != quote {
2043                            i += 1;
2044                        }
2045                        let v = s[v_start..i].to_string();
2046                        if i < bytes.len() {
2047                            i += 1;
2048                        }
2049                        v
2050                    } else {
2051                        let v_start = i;
2052                        while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
2053                            i += 1;
2054                        }
2055                        s[v_start..i].to_string()
2056                    }
2057                } else {
2058                    String::new()
2059                };
2060                if key.is_empty() {
2061                    continue;
2062                }
2063                match key.as_str() {
2064                    "class" => {
2065                        for c in value.split_ascii_whitespace() {
2066                            classes.push(c.to_string());
2067                        }
2068                    }
2069                    "id" => id = value,
2070                    _ => kvs.push((key, value)),
2071                }
2072            }
2073        }
2074    }
2075    Attr { id, classes, kvs }
2076}
2077
2078fn definition_list(node: &SyntaxNode) -> Block {
2079    let items: Vec<(Vec<Inline>, Vec<Vec<Block>>)> = node
2080        .children()
2081        .filter(|c| c.kind() == SyntaxKind::DEFINITION_ITEM)
2082        .map(|item| {
2083            let term = item
2084                .children()
2085                .find(|c| c.kind() == SyntaxKind::TERM)
2086                .map(|t| coalesce_inlines(inlines_from(&t)))
2087                .unwrap_or_default();
2088            let loose = is_loose_definition_item(&item);
2089            let defs: Vec<Vec<Block>> = item
2090                .children()
2091                .filter(|c| c.kind() == SyntaxKind::DEFINITION)
2092                .map(|d| definition_blocks(&d, loose))
2093                .collect();
2094            (term, defs)
2095        })
2096        .collect();
2097    Block::DefinitionList(items)
2098}
2099
2100/// A `DEFINITION_ITEM` is "loose" iff there is a `BLANK_LINE` between the
2101/// `TERM` (or its preceding term continuations) and the first `DEFINITION`.
2102/// Pandoc renders loose definitions with `Para` blocks; tight ones use
2103/// `Plain`. The looseness is per-item (per-term group), not per-definition,
2104/// and applies to *all* definitions in the item — see pandoc's behavior.
2105fn is_loose_definition_item(item: &SyntaxNode) -> bool {
2106    let mut saw_term = false;
2107    for child in item.children_with_tokens() {
2108        if let NodeOrToken::Node(n) = child {
2109            match n.kind() {
2110                SyntaxKind::TERM => {
2111                    saw_term = true;
2112                }
2113                SyntaxKind::BLANK_LINE if saw_term => {
2114                    return true;
2115                }
2116                SyntaxKind::DEFINITION => {
2117                    return false;
2118                }
2119                _ => {}
2120            }
2121        }
2122    }
2123    false
2124}
2125
2126fn definition_blocks(def_node: &SyntaxNode, loose: bool) -> Vec<Block> {
2127    // Definition body content lives at the marker's content offset (`: ` →
2128    // 2 columns by default). The CST keeps that indent on each line, so any
2129    // CODE_BLOCK descendant needs the offset stripped before pandoc-native
2130    // projection.
2131    let extra = definition_content_offset(def_node);
2132    let mut out = Vec::new();
2133    for child in def_node.children() {
2134        match child.kind() {
2135            SyntaxKind::PLAIN => {
2136                let inlines = coalesce_inlines(inlines_from(&child));
2137                if loose {
2138                    out.push(Block::Para(inlines));
2139                } else {
2140                    out.push(Block::Plain(inlines));
2141                }
2142            }
2143            SyntaxKind::PARAGRAPH => {
2144                out.push(Block::Para(coalesce_inlines(inlines_from(&child))));
2145            }
2146            SyntaxKind::CODE_BLOCK if extra > 0 => {
2147                out.push(indented_code_block_with_extra_strip(&child, extra));
2148            }
2149            _ => collect_block(&child, &mut out),
2150        }
2151    }
2152    out
2153}
2154
2155/// Visual column where definition body content starts. The strip later runs
2156/// against the *tab-expanded* body, so this offset must be measured in
2157/// columns (tabs round to the next 4-col stop), not raw chars: `:\t` reaches
2158/// col 4, which is the column the body's strip should remove.
2159fn definition_content_offset(def_node: &SyntaxNode) -> usize {
2160    let mut col = 0usize;
2161    let mut saw_marker = false;
2162    for el in def_node.children_with_tokens() {
2163        if let NodeOrToken::Token(t) = el {
2164            match t.kind() {
2165                SyntaxKind::DEFINITION_MARKER => {
2166                    col = advance_col(col, t.text());
2167                    saw_marker = true;
2168                }
2169                SyntaxKind::WHITESPACE if saw_marker => {
2170                    return advance_col(col, t.text());
2171                }
2172                _ if saw_marker => return col,
2173                _ => {}
2174            }
2175        } else if saw_marker {
2176            return col;
2177        }
2178    }
2179    col
2180}
2181
/// Return the column reached after laying out `s` starting at column
/// `start`. A `\t` jumps to the next multiple of 4; every other character
/// occupies exactly one column.
fn advance_col(start: usize, s: &str) -> usize {
    s.chars().fold(start, |col, ch| match ch {
        '\t' => col - col % 4 + 4,
        _ => col + 1,
    })
}
2195
2196fn line_block(node: &SyntaxNode) -> Block {
2197    let lines: Vec<Vec<Inline>> = node
2198        .children()
2199        .filter(|c| c.kind() == SyntaxKind::LINE_BLOCK_LINE)
2200        .map(|line| {
2201            let mut out = Vec::new();
2202            for el in line.children_with_tokens() {
2203                match el {
2204                    NodeOrToken::Token(t) => match t.kind() {
2205                        SyntaxKind::LINE_BLOCK_MARKER | SyntaxKind::NEWLINE => {}
2206                        _ => push_token_inline(&t, &mut out),
2207                    },
2208                    NodeOrToken::Node(n) => out.push(inline_from_node(&n)),
2209                }
2210            }
2211            coalesce_inlines(out)
2212        })
2213        .collect();
2214    Block::LineBlock(lines)
2215}
2216
2217fn latex_command_inline(node: &SyntaxNode) -> Inline {
2218    let content = node.text().to_string();
2219    Inline::RawInline("tex".to_string(), content)
2220}
2221
2222fn bracketed_span_inline(node: &SyntaxNode) -> Inline {
2223    let is_html = node
2224        .children_with_tokens()
2225        .any(|el| matches!(&el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::SPAN_BRACKET_OPEN && t.text().starts_with('<')));
2226    let attr_text = node.children_with_tokens().find_map(|el| match el {
2227        NodeOrToken::Token(t) if t.kind() == SyntaxKind::SPAN_ATTRIBUTES => {
2228            Some(t.text().to_string())
2229        }
2230        NodeOrToken::Node(n) if n.kind() == SyntaxKind::SPAN_ATTRIBUTES => {
2231            Some(n.text().to_string())
2232        }
2233        _ => None,
2234    });
2235    let attr = attr_text
2236        .map(|raw| {
2237            let trimmed = raw.trim();
2238            if is_html {
2239                parse_html_attrs(trimmed)
2240            } else if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}'))
2241            {
2242                parse_attr_block(inner)
2243            } else {
2244                Attr::default()
2245            }
2246        })
2247        .unwrap_or_default();
2248    let content = node
2249        .children()
2250        .find(|c| c.kind() == SyntaxKind::SPAN_CONTENT)
2251        .map(|n| coalesce_inlines(inlines_from(&n)))
2252        .unwrap_or_default();
2253    Inline::Span(attr, content)
2254}
2255
2256fn inline_html_span_inline(node: &SyntaxNode) -> Inline {
2257    let attr_text = node
2258        .children()
2259        .find(|c| c.kind() == SyntaxKind::HTML_ATTRS)
2260        .map(|n| n.text().to_string());
2261    let attr = attr_text
2262        .map(|raw| parse_html_attrs(raw.trim()))
2263        .unwrap_or_default();
2264    let content = node
2265        .children()
2266        .find(|c| c.kind() == SyntaxKind::SPAN_CONTENT)
2267        .map(|n| coalesce_inlines(inlines_from(&n)))
2268        .unwrap_or_default();
2269    Inline::Span(attr, content)
2270}
2271
2272fn pipe_table(node: &SyntaxNode) -> Option<TableData> {
2273    let mut header_cells: Vec<Vec<Inline>> = Vec::new();
2274    let mut body_rows: Vec<Vec<Vec<Inline>>> = Vec::new();
2275    let mut aligns: Vec<&'static str> = Vec::new();
2276    let mut caption_inlines: Vec<Inline> = Vec::new();
2277    let mut caption_attr_from_node: Option<Attr> = None;
2278    for child in node.children() {
2279        match child.kind() {
2280            SyntaxKind::TABLE_HEADER => {
2281                header_cells = pipe_table_cells(&child);
2282            }
2283            SyntaxKind::TABLE_SEPARATOR => {
2284                let raw = child.text().to_string();
2285                aligns = pipe_separator_aligns(&raw);
2286            }
2287            SyntaxKind::TABLE_ROW => {
2288                body_rows.push(pipe_table_cells(&child));
2289            }
2290            SyntaxKind::TABLE_CAPTION => {
2291                let (inlines, attr) = pipe_table_caption(&child);
2292                caption_inlines = inlines;
2293                caption_attr_from_node = attr;
2294            }
2295            _ => {}
2296        }
2297    }
2298    let cols = header_cells
2299        .len()
2300        .max(body_rows.iter().map(Vec::len).max().unwrap_or(0))
2301        .max(aligns.len());
2302    if cols == 0 {
2303        return None;
2304    }
2305    while aligns.len() < cols {
2306        aligns.push("AlignDefault");
2307    }
2308    let head_rows = if header_cells.is_empty() {
2309        Vec::new()
2310    } else {
2311        vec![cells_to_plain_blocks(header_cells, cols)]
2312    };
2313    let body_rows: Vec<Vec<GridCell>> = body_rows
2314        .into_iter()
2315        .map(|cells| cells_to_plain_blocks(cells, cols))
2316        .collect();
2317    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2318    Some(TableData {
2319        attr,
2320        caption: caption_inlines,
2321        aligns,
2322        widths: vec![None; cols],
2323        head_rows,
2324        body_rows,
2325        foot_rows: Vec::new(),
2326    })
2327}
2328
2329fn pipe_table_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
2330    row.children()
2331        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2332        .map(|cell| coalesce_inlines(inlines_from(&cell)))
2333        .collect()
2334}
2335
2336/// Pandoc's `+caption_attributes` extension lifts a trailing `{...}` from a
2337/// table caption into the Table's outer attribute. Walk the caption inlines
2338/// from the right looking for a balanced trailing `{...}` span: a Str
2339/// ending with `}` plus zero or more (Space, Str) pairs back until a Str
2340/// starts with `{`. If found, parse the brace contents as an attribute
2341/// block and drop those inlines (plus any preceding Space) from the caption
2342/// text.
2343fn extract_caption_attrs(mut inlines: Vec<Inline>) -> (Attr, Vec<Inline>) {
2344    let last_str_end = inlines
2345        .iter()
2346        .rposition(|i| matches!(i, Inline::Str(s) if s.ends_with('}')));
2347    let Some(end_idx) = last_str_end else {
2348        return (Attr::default(), inlines);
2349    };
2350    // Walk back to find the Str starting with `{`. Allow only Str/Space
2351    // between (no structural inlines like Emph), since attribute blocks
2352    // are plain text.
2353    let mut start_idx = end_idx;
2354    let mut found_open = false;
2355    loop {
2356        match &inlines[start_idx] {
2357            Inline::Str(s) => {
2358                if s.starts_with('{') {
2359                    found_open = true;
2360                    break;
2361                }
2362            }
2363            Inline::Space => {}
2364            _ => return (Attr::default(), inlines),
2365        }
2366        if start_idx == 0 {
2367            break;
2368        }
2369        start_idx -= 1;
2370    }
2371    if !found_open {
2372        return (Attr::default(), inlines);
2373    }
2374    // Concatenate the Str/Space slice into a flat string, then strip the
2375    // outer braces.
2376    let mut raw = String::new();
2377    for el in &inlines[start_idx..=end_idx] {
2378        match el {
2379            Inline::Str(s) => raw.push_str(s),
2380            Inline::Space => raw.push(' '),
2381            _ => return (Attr::default(), inlines),
2382        }
2383    }
2384    if !(raw.starts_with('{') && raw.ends_with('}')) {
2385        return (Attr::default(), inlines);
2386    }
2387    let inner = &raw[1..raw.len() - 1];
2388    let attr = parse_attr_block(inner);
2389    inlines.truncate(start_idx);
2390    if matches!(inlines.last(), Some(Inline::Space)) {
2391        inlines.pop();
2392    }
2393    (attr, inlines)
2394}
2395
2396/// Resolve `(Attr, caption_inlines)` for a table whose caption has already
2397/// been projected. Prefers a structural ATTRIBUTE node when the parser
2398/// captured one (`+caption_attributes` lift); falls back to the legacy
2399/// trailing-Str scan for older paths.
2400fn resolve_caption_attr(
2401    caption_inlines: Vec<Inline>,
2402    caption_attr_from_node: Option<Attr>,
2403) -> (Attr, Vec<Inline>) {
2404    match caption_attr_from_node {
2405        Some(attr) => (attr, caption_inlines),
2406        None => extract_caption_attrs(caption_inlines),
2407    }
2408}
2409
2410/// Run `pipe_table_caption` over the table node's TABLE_CAPTION child if any,
2411/// returning collected inlines and a structurally-extracted attr (None when
2412/// the parser didn't lift one).
2413fn project_table_caption_from(node: &SyntaxNode) -> (Vec<Inline>, Option<Attr>) {
2414    node.children()
2415        .find(|c| c.kind() == SyntaxKind::TABLE_CAPTION)
2416        .map(|n| pipe_table_caption(&n))
2417        .unwrap_or_else(|| (Vec::new(), None))
2418}
2419
2420fn pipe_table_caption(node: &SyntaxNode) -> (Vec<Inline>, Option<Attr>) {
2421    // Walk all tokens after TABLE_CAPTION_PREFIX and collect inline content.
2422    // The parser lifts a trailing `{...}` attribute block (Pandoc's
2423    // `+caption_attributes`) into a structural ATTRIBUTE node — surface it as
2424    // the table's outer attr instead of projecting it as an inline.
2425    let mut out = Vec::new();
2426    let mut caption_attr: Option<Attr> = None;
2427    let mut after_prefix = false;
2428    for el in node.children_with_tokens() {
2429        match el {
2430            NodeOrToken::Node(n) => {
2431                if n.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
2432                    after_prefix = true;
2433                    continue;
2434                }
2435                if !after_prefix {
2436                    continue;
2437                }
2438                if n.kind() == SyntaxKind::ATTRIBUTE {
2439                    let raw = n.text().to_string();
2440                    let inner = raw.trim().trim_start_matches('{').trim_end_matches('}');
2441                    caption_attr = Some(parse_attr_block(inner));
2442                    // Drop any trailing whitespace inline pushed before the attribute.
2443                    if matches!(out.last(), Some(Inline::Space)) {
2444                        out.pop();
2445                    }
2446                    continue;
2447                }
2448                out.push(inline_from_node(&n));
2449            }
2450            NodeOrToken::Token(t) => {
2451                if t.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
2452                    after_prefix = true;
2453                    continue;
2454                }
2455                if !after_prefix {
2456                    continue;
2457                }
2458                if t.kind() == SyntaxKind::ATTRIBUTE {
2459                    let raw = t.text();
2460                    let inner = raw.trim().trim_start_matches('{').trim_end_matches('}');
2461                    caption_attr = Some(parse_attr_block(inner));
2462                    if matches!(out.last(), Some(Inline::Space)) {
2463                        out.pop();
2464                    }
2465                    continue;
2466                }
2467                push_token_inline(&t, &mut out);
2468            }
2469        }
2470    }
2471    (coalesce_inlines(out), caption_attr)
2472}
2473
/// Derive one pandoc alignment name per column from a pipe-table separator
/// line such as `|:--|--:|:-:|---|`.
///
/// The line is trimmed first so that an indented separator (e.g. inside a
/// fenced div) does not produce a phantom leading column, then the outer
/// pipes are removed and the rest is split on `|`. A leading colon means
/// left, a trailing colon right, both centre, neither default.
fn pipe_separator_aligns(raw: &str) -> Vec<&'static str> {
    let body = raw
        .trim()
        .trim_start_matches('|')
        .trim_end_matches('|');
    let mut aligns = Vec::new();
    for segment in body.split('|') {
        let marker = segment.trim();
        let align = if marker.starts_with(':') {
            if marker.ends_with(':') {
                "AlignCenter"
            } else {
                "AlignLeft"
            }
        } else if marker.ends_with(':') {
            "AlignRight"
        } else {
            "AlignDefault"
        };
        aligns.push(align);
    }
    aligns
}
2496
2497fn cells_to_plain_blocks(cells: Vec<Vec<Inline>>, cols: usize) -> Vec<GridCell> {
2498    let mut out: Vec<GridCell> = cells
2499        .into_iter()
2500        .map(|inlines| {
2501            let blocks = if inlines.is_empty() {
2502                Vec::new()
2503            } else {
2504                vec![Block::Plain(inlines)]
2505            };
2506            GridCell::no_span(blocks)
2507        })
2508        .collect();
2509    while out.len() < cols {
2510        out.push(GridCell::no_span(Vec::new()));
2511    }
2512    out
2513}
2514
/// Render an `f64` the way pandoc's native output prints a `Double`.
///
/// Magnitudes in `[0.1, 1e7)` use plain decimal notation; everything else
/// uses scientific notation (`8.333333333333333e-2`). A fractional part is
/// always present, so `1` is printed as `1.0` and `1e7` as `1.0e7`. Zero is
/// printed as `0.0`. Used for `ColWidth N` rendering.
fn show_double(x: f64) -> String {
    if x == 0.0 {
        return String::from("0.0");
    }
    let magnitude = x.abs();
    if !(0.1..1e7).contains(&magnitude) {
        // `{:e}` already has the right mantissa/exponent shape; only a
        // whole-number mantissa lacks the `.0`.
        let scientific = format!("{x:e}");
        let whole_mantissa = scientific
            .split_once('e')
            .map_or(false, |(mantissa, _)| !mantissa.contains('.'));
        return if whole_mantissa {
            scientific.replacen('e', ".0e", 1)
        } else {
            scientific
        };
    }
    let mut plain = x.to_string();
    if !plain.contains(['.', 'e']) {
        plain.push_str(".0");
    }
    plain
}
2545
2546// ----- simple table -------------------------------------------------------
2547
2548/// Project a `SIMPLE_TABLE` node. Pandoc's "simple" table form:
2549///
2550/// ```text
2551///    Col1     Col2
2552/// -------- --------    ← TABLE_SEPARATOR (dash runs define columns)
2553///   data1    data2
2554///
2555/// Table: optional caption
2556/// ```
2557///
2558/// Headerless variant skips the header row and uses dash runs both above
2559/// and below the data. Alignment is derived from each header cell's
2560/// position relative to its column's dash run boundaries. For headerless
2561/// tables, alignment derives from the *first data row*.
2562fn simple_table(node: &SyntaxNode) -> Option<TableData> {
2563    let separator = node
2564        .children()
2565        .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)?;
2566    let cols = simple_table_dash_runs(&separator);
2567    if cols.is_empty() {
2568        return None;
2569    }
2570    let header = node
2571        .children()
2572        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
2573    // Body rows: every TABLE_ROW. Drop a trailing all-dashes row — that is
2574    // the closing `---` separator of a headerless table that the parser
2575    // currently emits as a TABLE_ROW of dash cells.
2576    let mut body_rows_nodes: Vec<SyntaxNode> = node
2577        .children()
2578        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
2579        .collect();
2580    if header.is_none()
2581        && body_rows_nodes
2582            .last()
2583            .map(simple_table_row_is_all_dashes)
2584            .unwrap_or(false)
2585    {
2586        body_rows_nodes.pop();
2587    }
2588    // Alignment: from header if present, else from the first data row.
2589    let aligns = if let Some(h) = &header {
2590        simple_table_aligns(h, &cols)
2591    } else if let Some(r0) = body_rows_nodes.first() {
2592        simple_table_aligns(r0, &cols)
2593    } else {
2594        vec!["AlignDefault"; cols.len()]
2595    };
2596    let head_rows = match &header {
2597        Some(h) => {
2598            let cells: Vec<Vec<Inline>> = simple_table_row_cells(h);
2599            vec![cells_to_plain_blocks(cells, cols.len())]
2600        }
2601        None => Vec::new(),
2602    };
2603    let body_rows: Vec<Vec<GridCell>> = body_rows_nodes
2604        .iter()
2605        .map(|r| cells_to_plain_blocks(simple_table_row_cells(r), cols.len()))
2606        .collect();
2607    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
2608    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2609    Some(TableData {
2610        attr,
2611        caption: caption_inlines,
2612        aligns,
2613        widths: vec![None; cols.len()],
2614        head_rows,
2615        body_rows,
2616        foot_rows: Vec::new(),
2617    })
2618}
2619
2620/// Return the `(start_col, end_col)` (inclusive) of each dash run in a
2621/// `TABLE_SEPARATOR` node, where columns are 0-based offsets within the
2622/// separator's line.
2623fn simple_table_dash_runs(separator: &SyntaxNode) -> Vec<(usize, usize)> {
2624    let raw = separator.text().to_string();
2625    let line = raw.trim_end_matches(['\n', '\r']);
2626    let mut runs = Vec::new();
2627    let mut start: Option<usize> = None;
2628    for (i, ch) in line.char_indices() {
2629        if ch == '-' {
2630            if start.is_none() {
2631                start = Some(i);
2632            }
2633        } else if let Some(s) = start.take() {
2634            runs.push((s, i - 1));
2635        }
2636    }
2637    if let Some(s) = start.take() {
2638        runs.push((s, line.len() - 1));
2639    }
2640    runs
2641}
2642
2643fn simple_table_row_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
2644    // Zero-width TABLE_CELL nodes represent positionally-empty columns
2645    // (e.g. case 0094, where header words land in only some of the
2646    // dash-defined columns). Keep them as empty cells so the row's
2647    // column ordering matches the dash separator.
2648    row.children()
2649        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2650        .map(|cell| coalesce_inlines(inlines_from(&cell)))
2651        .collect()
2652}
2653
2654fn simple_table_row_is_all_dashes(row: &SyntaxNode) -> bool {
2655    let mut had_cell = false;
2656    for cell in row
2657        .children()
2658        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2659    {
2660        let text = cell.text().to_string();
2661        let trimmed = text.trim();
2662        if trimmed.is_empty() {
2663            continue;
2664        }
2665        had_cell = true;
2666        if !trimmed.chars().all(|c| c == '-') {
2667            return false;
2668        }
2669    }
2670    had_cell
2671}
2672
2673/// Derive alignments for a simple-table header (or first data row) by
2674/// comparing each cell's *visible* (whitespace-trimmed) column range to
2675/// the corresponding dash run. Multiline-table TABLE_CELL nodes include
2676/// the padding whitespace within the column slice, so we have to peel
2677/// off leading/trailing whitespace before applying the flushness rule.
2678/// (Single-line simple-table cells already exclude padding whitespace,
2679/// but the trim is a no-op there.)
2680fn simple_table_aligns(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<&'static str> {
2681    let row_start: u32 = row.text_range().start().into();
2682    let mut cell_ranges: Vec<(usize, usize)> = Vec::new();
2683    for cell in row
2684        .children()
2685        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
2686    {
2687        if cell.text_range().is_empty() {
2688            continue;
2689        }
2690        let text = cell.text().to_string();
2691        let lstrip = text.chars().take_while(|c| *c == ' ' || *c == '\t').count();
2692        let rstrip = text
2693            .chars()
2694            .rev()
2695            .take_while(|c| *c == ' ' || *c == '\t')
2696            .count();
2697        let trimmed_len = text.chars().count().saturating_sub(lstrip + rstrip);
2698        if trimmed_len == 0 {
2699            continue;
2700        }
2701        let start: u32 = cell.text_range().start().into();
2702        let s = (start - row_start) as usize;
2703        let visible_start = s + lstrip;
2704        let visible_end = visible_start + trimmed_len - 1;
2705        cell_ranges.push((visible_start, visible_end));
2706    }
2707    cols.iter()
2708        .map(|(col_start, col_end)| {
2709            let cell = cell_ranges
2710                .iter()
2711                .find(|(cs, ce)| ce >= col_start && cs <= col_end);
2712            match cell {
2713                Some((cs, ce)) => {
2714                    let left_flush = cs == col_start;
2715                    let right_flush = ce == col_end;
2716                    match (left_flush, right_flush) {
2717                        (true, true) => "AlignDefault",
2718                        (true, false) => "AlignLeft",
2719                        (false, true) => "AlignRight",
2720                        (false, false) => "AlignCenter",
2721                    }
2722                }
2723                None => "AlignDefault",
2724            }
2725        })
2726        .collect()
2727}
2728
2729// ----- grid table ---------------------------------------------------------
2730
2731/// Project a `GRID_TABLE` node into pandoc-native shape. Implements a
2732/// `gridtables`-style 2D layout pass:
2733///
2734/// 1. Collect every line of the table (excluding caption) into a padded
2735///    char grid, tracking which `TABLE_HEADER` / `TABLE_ROW` /
2736///    `TABLE_FOOTER` parent each line came from.
2737/// 2. The canonical column boundaries are the union of `+` positions
2738///    across every "sep-style" line (lines made of `+`/`-`/`=`/`:`/`|`/`
2739///    `). The canonical row boundaries are the indices of those
2740///    sep-style lines. So a partial separator like
2741///    `|        +----+----+` contributes both to canonical column
2742///    positions and to row block boundaries (it ends some cells and
2743///    starts others mid-row).
2744/// 3. Cells are detected by walking `(row_block, col)` in scan order and,
2745///    at each unoccupied position whose top-left `+` is real, finding the
2746///    smallest valid bounding rectangle: top/bottom edges in
2747///    `{-,=,:,+}`, left/right edges in `{|,+}`, no fully-spanning
2748///    interior separator that would split it. RowSpan/ColSpan are
2749///    derived from the canonical row/col indices of the cell's corners.
2750///
2751/// Column widths use the alignment separator (the one carrying `:`s) if
2752/// present, else the first separator — both via `grid_dash_widths`. The
2753/// alignment row also drives per-column alignment via
2754/// `grid_separator_aligns`.
2755#[allow(clippy::needless_range_loop)]
2756fn grid_table(node: &SyntaxNode) -> Option<TableData> {
2757    // Collect all lines except the caption, tagged with their parent kind.
2758    let mut tagged: Vec<(SyntaxKind, String)> = Vec::new();
2759    for child in node.children() {
2760        if child.kind() == SyntaxKind::TABLE_CAPTION {
2761            continue;
2762        }
2763        let text = child.text().to_string();
2764        for line in text.split_inclusive('\n') {
2765            let trimmed = line.trim_end_matches('\n');
2766            tagged.push((child.kind(), trimmed.to_string()));
2767        }
2768    }
2769    if tagged.is_empty() {
2770        return None;
2771    }
2772
2773    // Pad lines into a 2D char grid.
2774    let max_width = tagged
2775        .iter()
2776        .map(|(_, l)| l.chars().count())
2777        .max()
2778        .unwrap_or(0);
2779    let grid: Vec<Vec<char>> = tagged
2780        .iter()
2781        .map(|(_, l)| {
2782            let mut chars: Vec<char> = l.chars().collect();
2783            chars.resize(max_width, ' ');
2784            chars
2785        })
2786        .collect();
2787    let nlines = grid.len();
2788
2789    // A line is "sep-style" if it contains at least one `+` and no chars
2790    // outside `+`/`-`/`=`/`:`/`|`/` `. Partial separators (lines mixing
2791    // `|` and `+`) qualify; content lines do not.
2792    let is_sep_line: Vec<bool> = grid
2793        .iter()
2794        .map(|row| {
2795            row.contains(&'+')
2796                && row
2797                    .iter()
2798                    .all(|&c| matches!(c, '+' | '-' | '=' | ':' | '|' | ' '))
2799        })
2800        .collect();
2801
2802    // Canonical column boundaries: union of `+` columns across all sep-style lines.
2803    let mut col_set: std::collections::BTreeSet<usize> = std::collections::BTreeSet::new();
2804    for (i, row) in grid.iter().enumerate() {
2805        if !is_sep_line[i] {
2806            continue;
2807        }
2808        for (j, &c) in row.iter().enumerate() {
2809            if c == '+' {
2810                col_set.insert(j);
2811            }
2812        }
2813    }
2814    let cols_pos: Vec<usize> = col_set.into_iter().collect();
2815    if cols_pos.len() < 2 {
2816        return None;
2817    }
2818    let ncols = cols_pos.len() - 1;
2819
2820    // Canonical row boundaries: line indices of sep-style lines.
2821    let row_seps: Vec<usize> = (0..nlines).filter(|&i| is_sep_line[i]).collect();
2822    if row_seps.len() < 2 {
2823        return None;
2824    }
2825    let nrows = row_seps.len() - 1;
2826
2827    // Block kind per row block: head if any non-sep line in the block came
2828    // from a TABLE_HEADER, foot if from TABLE_FOOTER, else body.
2829    let mut block_kind: Vec<&'static str> = vec!["body"; nrows];
2830    for r in 0..nrows {
2831        let start = row_seps[r];
2832        let end = row_seps[r + 1];
2833        for i in (start + 1)..end {
2834            match tagged[i].0 {
2835                SyntaxKind::TABLE_HEADER => block_kind[r] = "head",
2836                SyntaxKind::TABLE_FOOTER => block_kind[r] = "foot",
2837                _ => {}
2838            }
2839        }
2840    }
2841
2842    // Detect cells.
2843    let mut occupied = vec![vec![false; ncols]; nrows];
2844    // (start_row, start_col, row_span, col_span, content_text)
2845    let mut cells: Vec<(usize, usize, u32, u32, String)> = Vec::new();
2846    for sr in 0..nrows {
2847        for sc in 0..ncols {
2848            if occupied[sr][sc] {
2849                continue;
2850            }
2851            let i = row_seps[sr];
2852            let j = cols_pos[sc];
2853            if grid[i][j] != '+' {
2854                // No corner here — the canonical column is missing on this
2855                // sep line, meaning the cell that owns this position must
2856                // have been emitted earlier and `occupied` should already be
2857                // set. If not, the table is malformed; skip.
2858                continue;
2859            }
2860            let Some((er, ec, content)) = find_grid_cell(&grid, i, j, sr, sc, &cols_pos, &row_seps)
2861            else {
2862                continue;
2863            };
2864            let row_span = (er - sr) as u32;
2865            let col_span = (ec - sc) as u32;
2866            for r in sr..er {
2867                for c in sc..ec {
2868                    occupied[r][c] = true;
2869                }
2870            }
2871            cells.push((sr, sc, row_span, col_span, content));
2872        }
2873    }
2874
2875    // Group cells by row block and convert to GridCells. Within each block,
2876    // emit cells in canonical column order.
2877    let mut head_rows: Vec<Vec<GridCell>> = Vec::new();
2878    let mut body_rows: Vec<Vec<GridCell>> = Vec::new();
2879    let mut foot_rows: Vec<Vec<GridCell>> = Vec::new();
2880    for r in 0..nrows {
2881        let mut row_cells: Vec<&(usize, usize, u32, u32, String)> =
2882            cells.iter().filter(|(sr, _, _, _, _)| *sr == r).collect();
2883        row_cells.sort_by_key(|(_, sc, _, _, _)| *sc);
2884        let row: Vec<GridCell> = row_cells
2885            .into_iter()
2886            .map(|(_, _, rs, cs, text)| {
2887                let blocks = parse_grid_cell_text(text);
2888                GridCell {
2889                    row_span: *rs,
2890                    col_span: *cs,
2891                    blocks,
2892                }
2893            })
2894            .collect();
2895        match block_kind[r] {
2896            "head" => head_rows.push(row),
2897            "foot" => foot_rows.push(row),
2898            _ => body_rows.push(row),
2899        }
2900    }
2901
2902    // Column widths and alignments. Pick the alignment-bearing separator
2903    // for both (or fall back to the first separator).
2904    let alignment_sep = node
2905        .children()
2906        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2907        .find(|c| c.text().to_string().contains(':'))
2908        .or_else(|| {
2909            node.children()
2910                .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2911        })?;
2912    let widths = grid_dash_widths(&alignment_sep);
2913    let aligns_raw = alignment_sep.text().to_string();
2914    let aligns = if aligns_raw.contains(':') {
2915        grid_separator_aligns(&aligns_raw, ncols)
2916    } else {
2917        vec!["AlignDefault"; ncols]
2918    };
2919
2920    // Caption.
2921    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
2922    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
2923
2924    Some(TableData {
2925        attr,
2926        caption: caption_inlines,
2927        aligns,
2928        widths: widths.into_iter().map(Some).collect(),
2929        head_rows,
2930        body_rows,
2931        foot_rows,
2932    })
2933}
2934
/// Locate the smallest well-formed grid-table cell whose top-left `+`
/// sits at line `i`, column `j` of the char grid. `(sr, sc)` are the
/// canonical row / column indices of that corner; `cols_pos` and
/// `row_seps` map canonical indices to grid columns and lines.
///
/// Candidate right edges are tried from nearest to farthest, and for each
/// one candidate bottom edges from nearest to farthest. A candidate is
/// accepted when its top and bottom edges consist of `-`, `=`, `:` or `+`,
/// its left and right edges of `|` or `+`, both bottom corners are `+`,
/// and no interior line separates it across its full width.
///
/// Returns `(end_row_idx, end_col_idx, content_text)`: the cell covers
/// canonical rows `sr..end_row_idx` and columns `sc..end_col_idx`. The
/// content is the interior text with one leading pad space removed per
/// line, trailing whitespace trimmed, leading/trailing blank lines
/// dropped, and lines joined with `\n`. `None` means no rectangle fits.
#[allow(clippy::needless_range_loop)]
fn find_grid_cell(
    grid: &[Vec<char>],
    i: usize,
    j: usize,
    sr: usize,
    sc: usize,
    cols_pos: &[usize],
    row_seps: &[usize],
) -> Option<(usize, usize, String)> {
    let is_horizontal = |ch: char| matches!(ch, '-' | '=' | ':' | '+');
    let is_vertical = |ch: char| matches!(ch, '|' | '+');
    // Every char strictly between column `j` and `right` on `line` is a
    // horizontal border char.
    let horizontal_run =
        |line: usize, right: usize| (j + 1..right).all(|col| is_horizontal(grid[line][col]));
    // Every char of column `col` strictly between line `i` and `bottom`
    // is a vertical border char.
    let vertical_run =
        |col: usize, bottom: usize| (i + 1..bottom).all(|line| is_vertical(grid[line][col]));

    let last_row = row_seps.len() - 1;
    let last_col = cols_pos.len() - 1;

    for ec in (sc + 1)..=last_col {
        let right = cols_pos[ec];
        if !horizontal_run(i, right) {
            // The top edge hit a `|` or a space; wider candidates share
            // this stretch, so none of them can work either.
            break;
        }
        for er in (sr + 1)..=last_row {
            let bottom = row_seps[er];
            if !vertical_run(j, bottom) {
                break;
            }
            let closed = vertical_run(right, bottom)
                && horizontal_run(bottom, right)
                && grid[bottom][j] == '+'
                && grid[bottom][right] == '+';
            if !closed {
                continue;
            }
            // An interior line with `+` at both side edges and border
            // chars all the way across splits the rectangle in two, so
            // this candidate is not a single cell.
            let split_inside = (i + 1..bottom).any(|line| {
                grid[line][j] == '+' && grid[line][right] == '+' && horizontal_run(line, right)
            });
            if split_inside {
                continue;
            }

            let lines: Vec<String> = (i + 1..bottom)
                .map(|line| {
                    let inner: String = grid[line][j + 1..right].iter().collect();
                    inner
                        .strip_prefix(' ')
                        .unwrap_or(&inner)
                        .trim_end()
                        .to_string()
                })
                .collect();
            let first_filled = lines.iter().position(|text| !text.is_empty());
            let last_filled = lines.iter().rposition(|text| !text.is_empty());
            let content = match (first_filled, last_filled) {
                (Some(from), Some(to)) => lines[from..=to].join("\n"),
                _ => String::new(),
            };
            return Some((er, ec, content));
        }
    }
    None
}
3020
3021/// Parse a grid-table cell's extracted text as block-level markdown via
3022/// panache, then convert top-level `Para`s to `Plain` (pandoc's
3023/// grid-table cell rule).
3024fn parse_grid_cell_text(text: &str) -> Vec<Block> {
3025    if text.trim().is_empty() {
3026        return Vec::new();
3027    }
3028    let opts = crate::ParserOptions {
3029        flavor: crate::Flavor::Pandoc,
3030        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
3031        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
3032        ..crate::ParserOptions::default()
3033    };
3034    let doc = crate::parse(text, Some(opts));
3035    let mut out = Vec::new();
3036    for child in doc.children() {
3037        if let Some(block) = block_from(&child) {
3038            let block = match block {
3039                Block::Para(inlines) => Block::Plain(inlines),
3040                other => other,
3041            };
3042            out.push(block);
3043        }
3044    }
3045    out
3046}
3047
3048/// Compute per-column widths from a grid-table separator like
3049/// `+--------+----------+----------+`. The `+` characters delimit
3050/// columns; each run of dashes/equals/colons between two `+` is one
3051/// column. Pandoc's formula (`Text/Pandoc/Parsing/GridTable.hs::
3052/// fractionalColumnWidths`):
3053/// ```text
3054/// raw[i] = dashes[i] + 1       (include separator width)
3055/// norm   = max(sum(raw) + count - 2, 72)   (72 = readerColumns)
3056/// width[i] = raw[i] / norm
3057/// ```
3058fn grid_dash_widths(separator: &SyntaxNode) -> Vec<f64> {
3059    let raw_text = separator.text().to_string();
3060    let line = raw_text.trim_end_matches(['\n', '\r']);
3061    let mut raw: Vec<usize> = Vec::new();
3062    let mut count: usize = 0;
3063    let mut in_col = false;
3064    for ch in line.chars() {
3065        match ch {
3066            '+' => {
3067                if in_col {
3068                    raw.push(count + 1);
3069                    count = 0;
3070                }
3071                in_col = true;
3072            }
3073            _ => {
3074                if in_col {
3075                    count += 1;
3076                }
3077            }
3078        }
3079    }
3080    if raw.is_empty() {
3081        return Vec::new();
3082    }
3083    let total: usize = raw.iter().sum();
3084    let count = raw.len();
3085    let norm = (total + count).saturating_sub(2).max(72) as f64;
3086    raw.into_iter().map(|w| w as f64 / norm).collect()
3087}
3088
3089fn grid_separator_aligns(raw: &str, cols: usize) -> Vec<&'static str> {
3090    let line = raw.trim_end_matches(['\n', '\r']);
3091    let mut aligns: Vec<&'static str> = Vec::with_capacity(cols);
3092    let mut col_start: Option<usize> = None;
3093    for (i, ch) in line.char_indices() {
3094        if ch == '+' {
3095            if let Some(s) = col_start.take() {
3096                let seg = &line[s..i];
3097                aligns.push(grid_segment_align(seg));
3098            }
3099            col_start = Some(i + 1);
3100        }
3101    }
3102    while aligns.len() < cols {
3103        aligns.push("AlignDefault");
3104    }
3105    aligns.truncate(cols);
3106    aligns
3107}
3108
/// Classify one separator segment by its colon markers: a colon at both
/// ends is `AlignCenter`, only at the start `AlignLeft`, only at the end
/// `AlignRight`, and none `AlignDefault`. A segment consisting of a single
/// `:` counts as having a colon at both ends.
fn grid_segment_align(seg: &str) -> &'static str {
    let opens_with_colon = seg.starts_with(':');
    let closes_with_colon = seg.ends_with(':');
    if opens_with_colon && closes_with_colon {
        "AlignCenter"
    } else if opens_with_colon {
        "AlignLeft"
    } else if closes_with_colon {
        "AlignRight"
    } else {
        "AlignDefault"
    }
}
3120
3121// ----- multiline table ----------------------------------------------------
3122
3123/// Project a `MULTILINE_TABLE` node. Multi-line tables have an opening
3124/// `-----` border, an optional header (one or more lines), a
3125/// `----- ----- -----` column separator, body rows (each row possibly
3126/// spans multiple lines, separated from the next row by a blank line),
3127/// and a closing `-----` border. Cell content within a row is joined with
3128/// `SoftBreak` between source lines. Column widths are
3129/// `(dash_count + 1) / 72`.
3130fn multiline_table(node: &SyntaxNode) -> Option<TableData> {
3131    // The column-separator (the dashes between header and body) is the
3132    // *second* TABLE_SEPARATOR if there is a header, else the first.
3133    let separators: Vec<SyntaxNode> = node
3134        .children()
3135        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
3136        .collect();
3137    let header = node
3138        .children()
3139        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
3140    let column_sep = if header.is_some() {
3141        separators.get(1).cloned()
3142    } else {
3143        separators.first().cloned()
3144    }?;
3145    let cols = simple_table_dash_runs(&column_sep);
3146    if cols.is_empty() {
3147        return None;
3148    }
3149    // Per pandoc `widthsFromIndices`: each non-last column's width is
3150    // `dashes + spaces_after` (= start of next column - start of this); the
3151    // last column's width is `dashes + 1` (the indices' bump). Normalize
3152    // by `max(total, 72)`.
3153    let raw: Vec<usize> = cols
3154        .iter()
3155        .enumerate()
3156        .map(|(i, (s, e))| {
3157            if i + 1 < cols.len() {
3158                cols[i + 1].0 - s
3159            } else {
3160                e - s + 2
3161            }
3162        })
3163        .collect();
3164    let total: usize = raw.iter().sum();
3165    let norm = (total.max(72)) as f64;
3166    let widths: Vec<f64> = raw.into_iter().map(|w| w as f64 / norm).collect();
3167    // Alignment from header (if present) or first data row, using the
3168    // simple-table flushness rule against the column-separator dash runs.
3169    let aligns = if let Some(h) = &header {
3170        simple_table_aligns(h, &cols)
3171    } else if let Some(r0) = node.children().find(|c| c.kind() == SyntaxKind::TABLE_ROW) {
3172        simple_table_aligns(&r0, &cols)
3173    } else {
3174        vec!["AlignDefault"; cols.len()]
3175    };
3176    let head_rows = match &header {
3177        Some(h) => vec![
3178            multiline_row_cells_blocks(h, &cols)
3179                .into_iter()
3180                .map(GridCell::no_span)
3181                .collect(),
3182        ],
3183        None => Vec::new(),
3184    };
3185    let body_rows: Vec<Vec<GridCell>> = node
3186        .children()
3187        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
3188        .map(|r| {
3189            multiline_row_cells_blocks(&r, &cols)
3190                .into_iter()
3191                .map(GridCell::no_span)
3192                .collect()
3193        })
3194        .collect();
3195    let (caption_inlines, caption_attr_from_node) = project_table_caption_from(node);
3196    let (attr, caption_inlines) = resolve_caption_attr(caption_inlines, caption_attr_from_node);
3197    Some(TableData {
3198        attr,
3199        caption: caption_inlines,
3200        aligns,
3201        widths: widths.into_iter().map(Some).collect(),
3202        head_rows,
3203        body_rows,
3204        foot_rows: Vec::new(),
3205    })
3206}
3207
3208/// Slice each line of a multiline-table row by column ranges, then merge
3209/// each column's per-line text into a single Plain block with `SoftBreak`s
3210/// between source lines.
3211fn multiline_row_cells_blocks(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<Vec<Block>> {
3212    let row_start: u32 = row.text_range().start().into();
3213    let raw = row.text().to_string();
3214    // Re-construct the row's per-line text. Tokens give us byte offsets, but
3215    // plain `.text()` is enough — split on '\n', then for each line, slice by
3216    // column ranges.
3217    let lines: Vec<&str> = raw.split_inclusive('\n').collect();
3218    let mut col_lines: Vec<Vec<String>> = vec![Vec::new(); cols.len()];
3219    let mut line_start_offset: usize = 0;
3220    for line in lines {
3221        let line_no_nl = line.trim_end_matches('\n');
3222        if line_no_nl.trim().is_empty() {
3223            line_start_offset += line.len();
3224            continue;
3225        }
3226        for (i, &(cs, ce)) in cols.iter().enumerate() {
3227            // Slice [cs..=ce] in chars from the line. Lines may be shorter.
3228            let slice = char_slice(line_no_nl, cs, ce + 1);
3229            let trimmed = slice.trim();
3230            if !trimmed.is_empty() {
3231                col_lines[i].push(trimmed.to_string());
3232            }
3233        }
3234        line_start_offset += line.len();
3235    }
3236    let _ = (row_start, line_start_offset);
3237    cols.iter()
3238        .enumerate()
3239        .map(|(i, _)| {
3240            let segments = &col_lines[i];
3241            if segments.is_empty() {
3242                return Vec::new();
3243            }
3244            // Re-parse the cell's joined text through panache's inline parser
3245            // so that `**bold**`, `` `code` ``, `[link](url)` etc. inside
3246            // multiline-table cells project as Strong/Code/Link rather than
3247            // raw Str (matches pandoc's `multilineTableHeader` behavior of
3248            // joining lines per column and parsing as Markdown).
3249            let joined = segments.join("\n");
3250            let inlines = parse_cell_text_inlines(&joined);
3251            if inlines.is_empty() {
3252                return Vec::new();
3253            }
3254            vec![Block::Plain(coalesce_inlines(inlines))]
3255        })
3256        .collect()
3257}
3258
3259/// Parse a cell text fragment through panache's inline parser and return its
3260/// inline content. Used for multiline-table cells whose per-line slices are
3261/// not seen by the outer parser as inline-bearing TABLE_CELLs (the parser
3262/// holds raw TEXT for lines past the first). Empty or whitespace-only input
3263/// returns an empty vec.
3264fn parse_cell_text_inlines(text: &str) -> Vec<Inline> {
3265    if text.trim().is_empty() {
3266        return Vec::new();
3267    }
3268    let opts = crate::ParserOptions {
3269        flavor: crate::Flavor::Pandoc,
3270        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
3271        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
3272        ..crate::ParserOptions::default()
3273    };
3274    let doc = crate::parse(text, Some(opts));
3275    for node in doc.descendants() {
3276        if matches!(node.kind(), SyntaxKind::PARAGRAPH | SyntaxKind::PLAIN) {
3277            return inlines_from(&node);
3278        }
3279    }
3280    Vec::new()
3281}
3282
/// Return the sub-slice of `s` covering characters `start_char` (inclusive)
/// to `end_char` (exclusive), counted in `char`s rather than bytes.
///
/// An index at or past the end of `s` is clamped to the end, so a short
/// string yields a shorter (possibly empty) slice. An inverted range
/// (`start_char > end_char`) yields `""`.
fn char_slice(s: &str, start_char: usize, end_char: usize) -> &str {
    // Byte offset of the n-th char, or the string length when out of range.
    let byte_offset = |n: usize| s.char_indices().nth(n).map_or(s.len(), |(b, _)| b);
    let from = byte_offset(start_char);
    let to = byte_offset(end_char);
    if from > to { "" } else { &s[from..to] }
}
3300
3301fn list_block(node: &SyntaxNode) -> Block {
3302    let loose = is_loose_list(node);
3303    let items: Vec<Vec<Block>> = node
3304        .children()
3305        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
3306        .map(|item| list_item_blocks(&item, loose))
3307        .collect();
3308    if list_is_ordered(node) {
3309        let (start, style, delim) = ordered_list_attrs(node);
3310        Block::OrderedList(start, style, delim, items)
3311    } else {
3312        Block::BulletList(items)
3313    }
3314}
3315
3316fn list_is_ordered(node: &SyntaxNode) -> bool {
3317    let Some(item) = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
3318        return false;
3319    };
3320    let marker = item
3321        .children_with_tokens()
3322        .filter_map(|el| el.into_token())
3323        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
3324        .map(|t| t.text().to_string())
3325        .unwrap_or_default();
3326    let trimmed = marker.trim();
3327    !trimmed.starts_with(['-', '+', '*'])
3328}
3329
3330fn ordered_list_attrs(node: &SyntaxNode) -> (usize, &'static str, &'static str) {
3331    let item = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM);
3332    let marker = item
3333        .as_ref()
3334        .and_then(|i| {
3335            i.children_with_tokens()
3336                .filter_map(|el| el.into_token())
3337                .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
3338                .map(|t| t.text().to_string())
3339        })
3340        .unwrap_or_default();
3341    let (mut start, style, delim) = classify_ordered_marker(marker.trim());
3342    if style == "Example" {
3343        let offset: u32 = node.text_range().start().into();
3344        if let Some(s) = REFS_CTX.with(|c| {
3345            c.borrow()
3346                .example_list_start_by_offset
3347                .get(&offset)
3348                .copied()
3349        }) {
3350            start = s;
3351        }
3352    }
3353    (start, style, delim)
3354}
3355
3356/// Map a list-marker token (e.g. `1.`, `iv)`, `(A)`, `#.`, `(@)`) to the
3357/// pandoc-native `(start, style, delim)` tuple. Mirrors pandoc's parser logic
3358/// in `Text/Pandoc/Parsing/Lists.hs`: try `decimal`, then `exampleNum` (`@`),
3359/// then `defaultNum` (`#`), then `romanOne` (single `i`/`I`), then alpha,
3360/// then multi-char roman, in that order; the first matching form wins. The
3361/// start value for Example lists is left at 1 — pandoc tracks numbering
3362/// across lists at the document level, which we don't model.
3363fn classify_ordered_marker(trimmed: &str) -> (usize, &'static str, &'static str) {
3364    // Strip surrounding parens / trailing period or paren to get (body, delim).
3365    let (body, delim) =
3366        if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
3367            (inner, "TwoParens")
3368        } else if let Some(inner) = trimmed.strip_suffix(')') {
3369            (inner, "OneParen")
3370        } else if let Some(inner) = trimmed.strip_suffix('.') {
3371            (inner, "Period")
3372        } else {
3373            (trimmed, "DefaultDelim")
3374        };
3375
3376    // All-digit body → Decimal.
3377    if !body.is_empty() && body.chars().all(|c| c.is_ascii_digit()) {
3378        let start: usize = body.parse().unwrap_or(1);
3379        return (start, "Decimal", delim);
3380    }
3381
3382    // `#` (DefaultStyle) — when style is DefaultStyle pandoc forces
3383    // DefaultDelim regardless of the actual punctuation.
3384    if body == "#" {
3385        return (1, "DefaultStyle", "DefaultDelim");
3386    }
3387
3388    // `@` or `@label` (Example list).
3389    if let Some(rest) = body.strip_prefix('@')
3390        && rest
3391            .chars()
3392            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
3393    {
3394        return (1, "Example", delim);
3395    }
3396
3397    // Single `i`/`I` is romanOne (tried before alpha, so `i.`/`I.` is Roman 1).
3398    if body == "i" {
3399        return (1, "LowerRoman", delim);
3400    }
3401    if body == "I" {
3402        return (1, "UpperRoman", delim);
3403    }
3404
3405    // Single lowercase / uppercase letter → alpha.
3406    if body.len() == 1
3407        && let Some(c) = body.chars().next()
3408    {
3409        if c.is_ascii_lowercase() {
3410            return ((c as u8 - b'a') as usize + 1, "LowerAlpha", delim);
3411        }
3412        if c.is_ascii_uppercase() {
3413            return ((c as u8 - b'A') as usize + 1, "UpperAlpha", delim);
3414        }
3415    }
3416
3417    // Multi-char roman lowercase/uppercase.
3418    if body
3419        .chars()
3420        .all(|c| matches!(c, 'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'))
3421        && let Some(n) = roman_to_int(body, false)
3422    {
3423        return (n, "LowerRoman", delim);
3424    }
3425    if body
3426        .chars()
3427        .all(|c| matches!(c, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
3428        && let Some(n) = roman_to_int(body, true)
3429    {
3430        return (n, "UpperRoman", delim);
3431    }
3432
3433    // Fallback — the parser accepted some marker we don't classify; emit
3434    // Decimal/Period so the list renders rather than dropping coverage.
3435    (1, "Decimal", delim)
3436}
3437
/// Convert a roman numeral string to its integer value.
///
/// When `upper` is true only uppercase digits (`IVXLCDM`) are recognized;
/// otherwise every character is ASCII-uppercased first, so either case is
/// accepted. Returns `None` for an empty string or when any character is
/// not a roman digit.
///
/// Evaluation is greedy left-to-right: a digit smaller than its successor
/// forms a subtractive pair (`IV` = 4) and both are consumed; any other
/// digit is simply added. Digit order is not validated beyond that, so
/// forms like `IIII` (4) or `IM` (999) are accepted.
fn roman_to_int(s: &str, upper: bool) -> Option<usize> {
    fn digit_value(c: char) -> Option<usize> {
        match c {
            'I' => Some(1),
            'V' => Some(5),
            'X' => Some(10),
            'L' => Some(50),
            'C' => Some(100),
            'D' => Some(500),
            'M' => Some(1000),
            _ => None,
        }
    }

    // Any unrecognized character rejects the whole string.
    let digits = s
        .chars()
        .map(|c| digit_value(if upper { c } else { c.to_ascii_uppercase() }))
        .collect::<Option<Vec<usize>>>()?;
    if digits.is_empty() {
        return None;
    }

    let mut sum = 0usize;
    let mut pos = 0usize;
    while let Some(&current) = digits.get(pos) {
        match digits.get(pos + 1) {
            // Subtractive pair: consume both digits at once.
            Some(&following) if current < following => {
                sum += following - current;
                pos += 2;
            }
            _ => {
                sum += current;
                pos += 1;
            }
        }
    }
    Some(sum)
}
3475
3476fn list_item_blocks(item: &SyntaxNode, loose: bool) -> Vec<Block> {
3477    let mut out = Vec::new();
3478    let item_indent = list_item_content_offset(item);
3479    let task_checkbox = task_checkbox_for_item(item);
3480    let mut checkbox_emitted = false;
3481    for child in item.children() {
3482        match child.kind() {
3483            SyntaxKind::PLAIN => {
3484                let mut inlines = coalesce_inlines(inlines_from(&child));
3485                // Skip empty Plain blocks. The parser emits a PLAIN node for
3486                // any line under a list item, including the bare-marker line
3487                // (`-` followed by blank then indented content); pandoc only
3488                // counts blocks with actual inline content.
3489                if inlines.is_empty() {
3490                    continue;
3491                }
3492                if !checkbox_emitted && let Some(glyph) = task_checkbox {
3493                    inlines.insert(0, Inline::Space);
3494                    inlines.insert(0, Inline::Str(glyph.to_string()));
3495                    checkbox_emitted = true;
3496                }
3497                if loose {
3498                    out.push(Block::Para(inlines));
3499                } else {
3500                    out.push(Block::Plain(inlines));
3501                }
3502            }
3503            SyntaxKind::CODE_BLOCK => {
3504                // Both fenced and indented code blocks inside list items
3505                // carry the item-content indent on every body line in the
3506                // CST. Strip that offset so pandoc sees the same body it
3507                // would in a flat document. (For indented code, the helper
3508                // also strips the 4-space code-block indent on top of the
3509                // item offset; for fenced code, the offset strip alone is
3510                // sufficient.)
3511                out.push(indented_code_block_with_extra_strip(&child, item_indent));
3512            }
3513            _ => collect_block(&child, &mut out),
3514        }
3515    }
3516    out
3517}
3518
3519/// Pandoc renders `- [ ] foo` as `Plain [Str "\u{2610}", Space, Str "foo"]`
3520/// (and `[x]`/`[X]` as `\u{2612}`). The parser keeps `[ ]`/`[x]`/`[X]` as a
3521/// dedicated `TASK_CHECKBOX` token on the `LIST_ITEM`; this helper returns
3522/// the matching ballot-box glyph if one is present.
3523fn task_checkbox_for_item(item: &SyntaxNode) -> Option<&'static str> {
3524    item.children_with_tokens()
3525        .filter_map(|el| el.into_token())
3526        .find(|t| t.kind() == SyntaxKind::TASK_CHECKBOX)
3527        .map(|t| {
3528            let text = t.text();
3529            if text.contains('x') || text.contains('X') {
3530                "\u{2612}"
3531            } else {
3532                "\u{2610}"
3533            }
3534        })
3535}
3536
3537/// Number of leading-space columns each body-content line of `item` carries
3538/// in the CST. Mirrors pandoc's list-item content offset:
3539///   - bare-marker line (no WHITESPACE after LIST_MARKER): offset = marker
3540///     width (e.g. `1` for `-`, `2` for `1.`).
3541///   - marker followed by space(s): offset = marker width + WS width (the
3542///     visual column where content starts on the marker's line).
3543///
3544/// Nested list items also carry leading WHITESPACE *before* the LIST_MARKER
3545/// (the outer item's content offset). Include that so the cumulative depth
3546/// is captured — required for correctly stripping nested fenced/indented
3547/// code blocks.
3548///
3549/// When the LIST is itself a child of an outer container (e.g. a DEFINITION
3550/// body where the `- item` line is indented to the def-content column), the
3551/// per-item leading indent lives on the parent LIST as a WHITESPACE token
3552/// preceding each LIST_ITEM rather than inside the item. Pick that up too —
3553/// without it, code blocks nested inside such items would only have the
3554/// item-local indent stripped, leaving the outer-container offset behind.
3555fn list_item_content_offset(item: &SyntaxNode) -> usize {
3556    let parent_ws = parent_list_leading_ws(item);
3557    let mut marker_width = 0usize;
3558    let mut leading_ws = 0usize;
3559    let mut saw_marker = false;
3560    for el in item.children_with_tokens() {
3561        if let NodeOrToken::Token(t) = el {
3562            match t.kind() {
3563                SyntaxKind::WHITESPACE if !saw_marker => {
3564                    leading_ws += t.text().chars().count();
3565                }
3566                SyntaxKind::LIST_MARKER => {
3567                    marker_width += t.text().chars().count();
3568                    saw_marker = true;
3569                }
3570                SyntaxKind::WHITESPACE if saw_marker => {
3571                    return parent_ws + leading_ws + marker_width + t.text().chars().count();
3572                }
3573                _ if saw_marker => {
3574                    return parent_ws + leading_ws + marker_width;
3575                }
3576                _ => {}
3577            }
3578        } else if saw_marker {
3579            return parent_ws + leading_ws + marker_width;
3580        }
3581    }
3582    parent_ws + leading_ws + marker_width
3583}
3584
3585/// WHITESPACE token immediately preceding `item` on its parent LIST node, if
3586/// any. Used to recover the outer-container indent when the parser stores it
3587/// on the parent LIST (e.g. LIST inside DEFINITION) rather than as the item's
3588/// own leading WHITESPACE.
3589fn parent_list_leading_ws(item: &SyntaxNode) -> usize {
3590    let prev = item.prev_sibling_or_token();
3591    match prev {
3592        Some(NodeOrToken::Token(t)) if t.kind() == SyntaxKind::WHITESPACE => {
3593            t.text().chars().count()
3594        }
3595        _ => 0,
3596    }
3597}
3598
3599fn is_loose_list(node: &SyntaxNode) -> bool {
3600    let mut prev_was_item = false;
3601    for child in node.children_with_tokens() {
3602        if let NodeOrToken::Node(n) = child {
3603            if n.kind() == SyntaxKind::LIST_ITEM {
3604                prev_was_item = true;
3605            } else if n.kind() == SyntaxKind::BLANK_LINE
3606                && prev_was_item
3607                && n.next_sibling()
3608                    .map(|s| s.kind() == SyntaxKind::LIST_ITEM)
3609                    .unwrap_or(false)
3610            {
3611                return true;
3612            }
3613        }
3614    }
3615    for item in node
3616        .children()
3617        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
3618    {
3619        if item.children().any(|c| c.kind() == SyntaxKind::PARAGRAPH) {
3620            return true;
3621        }
3622        // Per CommonMark/pandoc: a list is loose if any item directly
3623        // contains a blank line between two block-level children. The
3624        // single-item form (`- a\n\n  b`) only manifests as a BLANK_LINE
3625        // sandwiched between non-blank block children inside the item.
3626        if has_internal_blank_between_blocks(&item) {
3627            return true;
3628        }
3629    }
3630    false
3631}
3632
3633fn has_internal_blank_between_blocks(item: &SyntaxNode) -> bool {
3634    let mut saw_block_before = false;
3635    let mut pending_blank = false;
3636    for child in item.children() {
3637        match child.kind() {
3638            SyntaxKind::BLANK_LINE => {
3639                if saw_block_before {
3640                    pending_blank = true;
3641                }
3642            }
3643            // Bare-marker line emits an empty PLAIN (NEWLINE only); pandoc
3644            // doesn't count that as a block — its first real block is what
3645            // comes after the blank line.
3646            SyntaxKind::PLAIN if child_is_empty_plain(&child) => {}
3647            _ => {
3648                if pending_blank {
3649                    return true;
3650                }
3651                saw_block_before = true;
3652            }
3653        }
3654    }
3655    false
3656}
3657
3658fn child_is_empty_plain(node: &SyntaxNode) -> bool {
3659    !node.children_with_tokens().any(|el| match el {
3660        NodeOrToken::Token(t) => !matches!(t.kind(), SyntaxKind::NEWLINE | SyntaxKind::WHITESPACE),
3661        NodeOrToken::Node(_) => true,
3662    })
3663}
3664
3665// ----- inline walking -----------------------------------------------------
3666
3667fn inlines_from(parent: &SyntaxNode) -> Vec<Inline> {
3668    let mut out = Vec::new();
3669    let mut iter = parent.children_with_tokens().peekable();
3670    while let Some(el) = iter.next() {
3671        match el {
3672            NodeOrToken::Token(t) => push_token_inline(&t, &mut out),
3673            NodeOrToken::Node(n) if n.kind() == SyntaxKind::LATEX_COMMAND => {
3674                emit_latex_command_with_absorb(&n, &mut iter, &mut out);
3675            }
3676            NodeOrToken::Node(n) if n.kind() == SyntaxKind::CITATION => {
3677                emit_citation_with_absorb(&n, &mut iter, &mut out);
3678            }
3679            NodeOrToken::Node(n) => push_inline_node(&n, &mut out),
3680        }
3681    }
3682    // Trailing NEWLINE inside paragraphs/headings is structural. Strip a
3683    // single trailing SoftBreak so the inline list ends on Str/Space, matching
3684    // pandoc's "trim trailing line endings" rule.
3685    while matches!(out.last(), Some(Inline::SoftBreak)) {
3686        out.pop();
3687    }
3688    out
3689}
3690
3691/// Pandoc absorbs `@key [locator]` into a single AuthorInText `Cite` with
3692/// the bracketed text becoming the citation's suffix. The parser emits two
3693/// separate nodes: `CITATION` (bare `@key`, no surrounding brackets) and an
3694/// adjacent `LINK` whose bracketed text has no destination. When the
3695/// CITATION is bare and we can verify both the next siblings (a single
3696/// `TEXT` whitespace token followed by a `LINK` node lacking
3697/// `LINK_DEST_START`), consume both and absorb the link's text as suffix.
3698fn emit_citation_with_absorb<I>(
3699    node: &SyntaxNode,
3700    iter: &mut std::iter::Peekable<I>,
3701    out: &mut Vec<Inline>,
3702) where
3703    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
3704{
3705    let bracketed = node
3706        .children_with_tokens()
3707        .filter_map(|el| el.into_token())
3708        .any(|t| t.kind() == SyntaxKind::LINK_START);
3709    if bracketed {
3710        render_citation_inline(node, out, None);
3711        return;
3712    }
3713    // Bare AuthorInText form. Use rowan's sibling navigation (not the iter
3714    // peek) to verify the absorption pattern without consuming anything we
3715    // can't put back. Then if confirmed, advance the iter to skip both.
3716    let next_sibling_pair = node.next_sibling_or_token().and_then(|el1| {
3717        let t = el1.as_token().cloned()?;
3718        if t.kind() != SyntaxKind::TEXT || !t.text().starts_with(' ') {
3719            return None;
3720        }
3721        let space_text = t.text().to_string();
3722        let link_el = t.next_sibling_or_token()?;
3723        let link = link_el.as_node().cloned()?;
3724        // Pandoc absorbs `[locator]` after `@key` whether the brackets
3725        // resolve as a link or not; under the new IR, an unresolved
3726        // bracket-shape pattern is `UNRESOLVED_REFERENCE` rather than
3727        // shape-only `LINK`. Both shapes are valid locator candidates.
3728        if link.kind() != SyntaxKind::LINK && link.kind() != SyntaxKind::UNRESOLVED_REFERENCE {
3729            return None;
3730        }
3731        let has_dest = link
3732            .children_with_tokens()
3733            .filter_map(|el| el.into_token())
3734            .any(|tok| tok.kind() == SyntaxKind::LINK_DEST_START);
3735        if has_dest {
3736            return None;
3737        }
3738        let link_text = link
3739            .children()
3740            .find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3741            .map(|tt| tt.text().to_string())
3742            .unwrap_or_default();
3743        Some((space_text, link_text))
3744    });
3745    if let Some((_space_text, locator_text)) = next_sibling_pair {
3746        // Advance the iter past the consumed TEXT and LINK.
3747        iter.next();
3748        iter.next();
3749        render_citation_inline(node, out, Some(&locator_text));
3750    } else {
3751        render_citation_inline(node, out, None);
3752    }
3753}
3754
3755/// Pandoc's tex inline reader absorbs trailing horizontal whitespace into the
3756/// raw command when (and only when) the command is `\letters` with no brace
3757/// arguments — `\foo bar` becomes `RawInline tex "\\foo "` + `Str "bar"`,
3758/// while `\frac{a}{b} bar` keeps the space outside (`RawInline tex
3759/// "\\frac{a}{b}"` + `Space` + `Str "bar"`). The discriminator is the last
3760/// byte of the command text: ASCII letter → absorb, otherwise → don't.
3761fn emit_latex_command_with_absorb<I>(
3762    node: &SyntaxNode,
3763    iter: &mut std::iter::Peekable<I>,
3764    out: &mut Vec<Inline>,
3765) where
3766    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
3767{
3768    let mut content = node.text().to_string();
3769    let ends_in_letter = content
3770        .chars()
3771        .next_back()
3772        .is_some_and(|c| c.is_ascii_alphabetic());
3773    if ends_in_letter
3774        && let Some(NodeOrToken::Token(t)) = iter.peek()
3775        && t.kind() == SyntaxKind::TEXT
3776    {
3777        let text = t.text().to_string();
3778        let bytes = text.as_bytes();
3779        let mut absorbed = 0;
3780        while absorbed < bytes.len() && (bytes[absorbed] == b' ' || bytes[absorbed] == b'\t') {
3781            absorbed += 1;
3782        }
3783        if absorbed > 0 {
3784            content.push_str(&text[..absorbed]);
3785            out.push(Inline::RawInline("tex".to_string(), content));
3786            iter.next();
3787            let remainder = &text[absorbed..];
3788            if !remainder.is_empty() {
3789                push_text(remainder, out);
3790            }
3791            return;
3792        }
3793    }
3794    out.push(Inline::RawInline("tex".to_string(), content));
3795}
3796
3797fn push_inline_node(node: &SyntaxNode, out: &mut Vec<Inline>) {
3798    match node.kind() {
3799        SyntaxKind::LINK => render_link_inline(node, out),
3800        SyntaxKind::IMAGE_LINK => render_image_inline(node, out),
3801        SyntaxKind::CITATION => render_citation_inline(node, out, None),
3802        // Pandoc-native treats unresolved bracket-shape patterns as
3803        // literal text — the bracket bytes themselves are `Str "["`
3804        // and `Str "]"`, but inner inline structure (emphasis, math,
3805        // raw spans, etc.) survives. The Panache `UNRESOLVED_REFERENCE`
3806        // wrapper is a tooling concession; emit the bracket bytes as
3807        // `Str` and recurse into structural children so inner content
3808        // is preserved.
3809        SyntaxKind::UNRESOLVED_REFERENCE => render_unresolved_reference_inline(node, out),
3810        _ => out.push(inline_from_node(node)),
3811    }
3812}
3813
3814/// Project an UNRESOLVED_REFERENCE node as pandoc-native inlines.
3815///
3816/// Mirrors the unresolved fall-through of `render_link_inline`: try
3817/// `lookup_heading_id` for implicit-heading shortcut/full-reference
3818/// resolution at projection time (pandoc resolves heading IDs *during
3819/// inline rendering*; the parser's refdef map only carries explicit
3820/// `[label]: url` definitions). On miss, emit the original bracket
3821/// pattern as `Str "["`, inner inline structure (preserved via
3822/// `coalesce_inlines_keep_edges` so leading/trailing whitespace
3823/// survives, matching pandoc's `[ foo ]` → `Str "[", Space, Str "foo",
3824/// Space, Str "]"` behavior), then `Str "]"` (or `Str "][ref]"` for
3825/// full-reference form).
3826fn render_unresolved_reference_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
3827    let is_image = node
3828        .children()
3829        .any(|c| c.kind() == SyntaxKind::IMAGE_LINK_START);
3830    let text_node = if is_image {
3831        node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT)
3832    } else {
3833        node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3834    };
3835    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
3836
3837    let text_label = text_node
3838        .as_ref()
3839        .map(|n| n.text().to_string())
3840        .unwrap_or_default();
3841    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
3842        Some(rn) => {
3843            let inner = rn.text().to_string();
3844            if inner.is_empty() {
3845                (text_label.clone(), true, String::new())
3846            } else {
3847                (inner.clone(), true, inner)
3848            }
3849        }
3850        None => (text_label.clone(), false, String::new()),
3851    };
3852
3853    // Implicit-heading-id resolution at projection time. Only for
3854    // link-shape (not image-shape) shortcut/full-ref/collapsed forms.
3855    if !is_image && let Some(id) = lookup_heading_id(&label) {
3856        let url = format!("#{id}");
3857        let resolved_text_inlines = text_node
3858            .as_ref()
3859            .map(|n| coalesce_inlines(inlines_from(n)))
3860            .unwrap_or_default();
3861        out.push(Inline::Link(
3862            extract_attr_from_node(node),
3863            resolved_text_inlines,
3864            url,
3865            String::new(),
3866        ));
3867        return;
3868    }
3869
3870    // Inherited reference resolution. The parser emits UNRESOLVED_REFERENCE
3871    // when the corresponding `[label]: url` def isn't in the same CST, but
3872    // when projecting recursively-reparsed content (e.g. a `<div>` body)
3873    // the outer document's refs are folded into REFS_CTX. Resolve here so
3874    // an outer-defined ref used inside `<div>...</div>` becomes a Link.
3875    if let Some((url, title)) = lookup_ref(&label) {
3876        let resolved_text_inlines = text_node
3877            .as_ref()
3878            .map(|n| coalesce_inlines(inlines_from(n)))
3879            .unwrap_or_default();
3880        let kind = if is_image {
3881            Inline::Image
3882        } else {
3883            Inline::Link
3884        };
3885        out.push(kind(
3886            extract_attr_from_node(node),
3887            resolved_text_inlines,
3888            url,
3889            title,
3890        ));
3891        return;
3892    }
3893
3894    // Unresolved: emit the original markdown bytes, preserving inner
3895    // inline structure.
3896    let unresolved_text_inlines = text_node
3897        .as_ref()
3898        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
3899        .unwrap_or_default();
3900    let opener = if is_image { "![" } else { "[" };
3901    out.push(Inline::Str(opener.to_string()));
3902    out.extend(unresolved_text_inlines);
3903    let suffix = if has_second_brackets {
3904        format!("][{second_inner}]")
3905    } else {
3906        "]".to_string()
3907    };
3908    out.push(Inline::Str(suffix));
3909}
3910
3911/// Pandoc treats `(@label)` and bare `@label` as Example-list references
3912/// when the label was defined as an Example item; the inline becomes
3913/// `Str "N"` (just the digits — surrounding parens come from adjacent
3914/// source bytes which our coalesce pass merges back in). Otherwise we
3915/// project the CITATION node as a proper `Cite [Citation, ...] [Inline,
3916/// ...]` per pandoc's citation reader. `extra_suffix_text` carries an
3917/// absorbed `[locator]` (pandoc absorbs `@key [locator]` into the Cite as
3918/// the citation's suffix); the literal text reflects the absorbed bytes.
3919fn render_citation_inline(
3920    node: &SyntaxNode,
3921    out: &mut Vec<Inline>,
3922    extra_suffix_text: Option<&str>,
3923) {
3924    // Example-list resolution short-circuit (legacy carve-out).
3925    let first_key = node
3926        .children_with_tokens()
3927        .filter_map(|el| el.into_token())
3928        .find(|t| t.kind() == SyntaxKind::CITATION_KEY)
3929        .map(|t| t.text().to_string())
3930        .unwrap_or_default();
3931    let example_resolution =
3932        REFS_CTX.with(|c| c.borrow().example_label_to_num.get(&first_key).copied());
3933    if let Some(n) = example_resolution {
3934        out.push(Inline::Str(n.to_string()));
3935        return;
3936    }
3937
3938    let bracketed = node
3939        .children_with_tokens()
3940        .filter_map(|el| el.into_token())
3941        .any(|t| t.kind() == SyntaxKind::LINK_START);
3942
3943    let mut builders: Vec<CitationBuilder> = Vec::new();
3944    let mut current: Option<CitationBuilder> = None;
3945    let mut pending_prefix = String::new();
3946    for el in node.children_with_tokens() {
3947        let token = match el {
3948            NodeOrToken::Token(t) => t,
3949            _ => continue,
3950        };
3951        match token.kind() {
3952            SyntaxKind::LINK_START | SyntaxKind::LINK_DEST => {}
3953            SyntaxKind::CITATION_BRACE_OPEN | SyntaxKind::CITATION_BRACE_CLOSE => {}
3954            SyntaxKind::CITATION_MARKER => {
3955                if let Some(c) = current.take() {
3956                    builders.push(c);
3957                }
3958                let mode = if token.text() == "-@" {
3959                    CitationMode::SuppressAuthor
3960                } else if bracketed {
3961                    CitationMode::NormalCitation
3962                } else {
3963                    CitationMode::AuthorInText
3964                };
3965                current = Some(CitationBuilder::new(
3966                    std::mem::take(&mut pending_prefix),
3967                    mode,
3968                ));
3969            }
3970            SyntaxKind::CITATION_KEY => {
3971                if let Some(c) = &mut current {
3972                    c.id.push_str(token.text());
3973                }
3974            }
3975            SyntaxKind::CITATION_CONTENT => {
3976                if let Some(c) = &mut current {
3977                    c.suffix_raw.push_str(token.text());
3978                } else {
3979                    pending_prefix.push_str(token.text());
3980                }
3981            }
3982            SyntaxKind::CITATION_SEPARATOR => {
3983                if let Some(c) = current.take() {
3984                    builders.push(c);
3985                }
3986            }
3987            _ => {}
3988        }
3989    }
3990    if let Some(c) = current.take() {
3991        builders.push(c);
3992    }
3993
3994    // Absorbed `[locator]` text becomes additional suffix on the LAST
3995    // citation in the group (pandoc only absorbs into AuthorInText cites
3996    // anyway, which always have one citation in the group).
3997    if let Some(extra) = extra_suffix_text
3998        && let Some(last) = builders.last_mut()
3999    {
4000        if !last.suffix_raw.is_empty() && !extra.starts_with(' ') {
4001            last.suffix_raw.push(' ');
4002        }
4003        last.suffix_raw.push_str(extra);
4004    }
4005
4006    let note_offset: u32 = node.text_range().start().into();
4007    let note_num = REFS_CTX
4008        .with(|c| {
4009            c.borrow()
4010                .cite_note_num_by_offset
4011                .get(&note_offset)
4012                .copied()
4013        })
4014        .unwrap_or(1);
4015
4016    let projected: Vec<Citation> = builders
4017        .into_iter()
4018        .map(|b| b.into_citation(note_num))
4019        .collect();
4020
4021    // Build literal text from CITATION node text + any absorbed suffix.
4022    let mut literal = node.text().to_string();
4023    if let Some(extra) = extra_suffix_text {
4024        literal.push(' ');
4025        literal.push('[');
4026        literal.push_str(extra);
4027        literal.push(']');
4028    }
4029    let text_inlines = literal_inlines(&literal);
4030
4031    out.push(Inline::Cite(projected, text_inlines));
4032}
4033
4034/// Internal builder for a single Citation while walking the CITATION node's
4035/// tokens. `prefix_raw` and `suffix_raw` capture the raw `CITATION_CONTENT`
4036/// text segments before / after the key; they are inline-parsed (with smart
4037/// transformations applied via `coalesce_inlines`) once the builder is
4038/// finalized.
4039struct CitationBuilder {
4040    id: String,
4041    prefix_raw: String,
4042    suffix_raw: String,
4043    mode: CitationMode,
4044}
4045
4046impl CitationBuilder {
4047    fn new(prefix_raw: String, mode: CitationMode) -> Self {
4048        Self {
4049            id: String::new(),
4050            prefix_raw,
4051            suffix_raw: String::new(),
4052            mode,
4053        }
4054    }
4055
4056    fn into_citation(self, note_num: i64) -> Citation {
4057        let prefix = parse_cite_affix_inlines(self.prefix_raw.trim_end(), true);
4058        let suffix = parse_cite_affix_inlines(&self.suffix_raw, false);
4059        Citation {
4060            id: self.id,
4061            prefix,
4062            suffix,
4063            mode: self.mode,
4064            note_num,
4065            hash: 0,
4066        }
4067    }
4068}
4069
4070/// Parse a citation prefix or suffix raw-text fragment as inlines, applying
4071/// pandoc's smart transformations (NBSP after abbreviations, en-dash for
4072/// `--`, smart apostrophes/quotes). For prefixes, we trim leading whitespace
4073/// (pandoc's prefix never starts with Space). For suffixes, leading whitespace
4074/// is preserved so `[@key, suffix]` produces `[Str ",", Space, Str "suffix"]`.
4075///
4076/// We wrap the raw text with a benign `Z ` prefix before reparsing, then
4077/// strip the resulting leading `Str "Z"` + `Space`. This is necessary because
4078/// panache's block parser would otherwise misclassify text starting with
4079/// (e.g.) `p. ` as an alphabetical list marker, dropping the `p.` from the
4080/// resulting inline stream entirely.
4081fn parse_cite_affix_inlines(raw: &str, is_prefix: bool) -> Vec<Inline> {
4082    if raw.is_empty() {
4083        return Vec::new();
4084    }
4085    let trimmed = if is_prefix { raw.trim_start() } else { raw };
4086    if trimmed.is_empty() {
4087        return Vec::new();
4088    }
4089    let leading_space = !is_prefix && trimmed.starts_with([' ', '\t']);
4090    let work = trimmed.trim_start_matches([' ', '\t']);
4091    if work.is_empty() {
4092        return if leading_space {
4093            vec![Inline::Space]
4094        } else {
4095            Vec::new()
4096        };
4097    }
4098    let wrapped = format!("Z {work}");
4099    let inlines = parse_cell_text_inlines(&wrapped);
4100    let mut coalesced = coalesce_inlines(inlines);
4101    // Strip the leading `Z` sentinel + Space.
4102    if matches!(coalesced.first(), Some(Inline::Str(s)) if s == "Z") {
4103        coalesced.remove(0);
4104        if matches!(coalesced.first(), Some(Inline::Space)) {
4105            coalesced.remove(0);
4106        }
4107    }
4108    if leading_space {
4109        coalesced.insert(0, Inline::Space);
4110    }
4111    coalesced
4112}
4113
4114/// Tokenize raw input into the literal `[Inline]` payload that pandoc emits
4115/// as the second argument of `Cite`. This is a lossless representation of
4116/// the original bytes (including brackets, semicolons, `*`, `**`, etc.) —
4117/// no markup parsing, no smart-typography. Newlines become `SoftBreak`,
4118/// runs of spaces/tabs become a single `Space`.
4119fn literal_inlines(text: &str) -> Vec<Inline> {
4120    let mut out: Vec<Inline> = Vec::new();
4121    let mut buf = String::new();
4122    for ch in text.chars() {
4123        match ch {
4124            ' ' | '\t' => {
4125                if !buf.is_empty() {
4126                    out.push(Inline::Str(std::mem::take(&mut buf)));
4127                }
4128                if !matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4129                    out.push(Inline::Space);
4130                }
4131            }
4132            '\n' => {
4133                if !buf.is_empty() {
4134                    out.push(Inline::Str(std::mem::take(&mut buf)));
4135                }
4136                if matches!(out.last(), Some(Inline::Space)) {
4137                    out.pop();
4138                }
4139                out.push(Inline::SoftBreak);
4140            }
4141            _ => buf.push(ch),
4142        }
4143    }
4144    if !buf.is_empty() {
4145        out.push(Inline::Str(buf));
4146    }
4147    out
4148}
4149
4150fn push_token_inline(
4151    t: &rowan::SyntaxToken<crate::syntax::PanacheLanguage>,
4152    out: &mut Vec<Inline>,
4153) {
4154    match t.kind() {
4155        SyntaxKind::TEXT => push_text(t.text(), out),
4156        SyntaxKind::WHITESPACE => out.push(Inline::Space),
4157        SyntaxKind::NEWLINE => out.push(Inline::SoftBreak),
4158        SyntaxKind::HARD_LINE_BREAK => out.push(Inline::LineBreak),
4159        SyntaxKind::ESCAPED_CHAR => {
4160            // \x — keep just the escaped character as a Str
4161            let s: String = t.text().chars().skip(1).collect();
4162            out.push(Inline::Str(s));
4163        }
4164        SyntaxKind::NONBREAKING_SPACE => out.push(Inline::Str("\u{a0}".to_string())),
4165        // Skip structural tokens (markers, brackets, fence bytes) that don't
4166        // contribute to the inline stream.
4167        _ => {}
4168    }
4169}
4170
4171fn push_text(text: &str, out: &mut Vec<Inline>) {
4172    let mut buf = String::new();
4173    for ch in text.chars() {
4174        if ch == ' ' || ch == '\t' {
4175            if !buf.is_empty() {
4176                out.push(Inline::Str(std::mem::take(&mut buf)));
4177            }
4178            out.push(Inline::Space);
4179        } else if ch == '\n' {
4180            if !buf.is_empty() {
4181                out.push(Inline::Str(std::mem::take(&mut buf)));
4182            }
4183            out.push(Inline::SoftBreak);
4184        } else {
4185            buf.push(ch);
4186        }
4187    }
4188    if !buf.is_empty() {
4189        out.push(Inline::Str(buf));
4190    }
4191}
4192
4193fn inline_from_node(node: &SyntaxNode) -> Inline {
4194    match node.kind() {
4195        SyntaxKind::EMPHASIS => {
4196            Inline::Emph(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4197        }
4198        SyntaxKind::STRONG => {
4199            Inline::Strong(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4200        }
4201        SyntaxKind::STRIKEOUT => {
4202            Inline::Strikeout(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4203        }
4204        SyntaxKind::SUPERSCRIPT => {
4205            Inline::Superscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4206        }
4207        SyntaxKind::SUBSCRIPT => {
4208            Inline::Subscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
4209        }
4210        SyntaxKind::INLINE_CODE => {
4211            let content: String = node
4212                .children_with_tokens()
4213                .filter_map(|el| el.into_token())
4214                .filter(|t| t.kind() == SyntaxKind::INLINE_CODE_CONTENT)
4215                .map(|t| t.text().to_string())
4216                .collect();
4217            Inline::Code(
4218                extract_attr_from_node(node),
4219                strip_inline_code_padding(&content),
4220            )
4221        }
4222        SyntaxKind::LINK | SyntaxKind::IMAGE_LINK | SyntaxKind::UNRESOLVED_REFERENCE => {
4223            // LINK / IMAGE_LINK / UNRESOLVED_REFERENCE render through
4224            // `push_inline_node` so reference resolution can emit
4225            // multiple inlines (resolved Link, or unresolved Str
4226            // fragments). This single-Inline path is unreachable;
4227            // emit Unsupported as a guard rather than silently
4228            // dropping.
4229            Inline::Unsupported(format!("{:?}", node.kind()))
4230        }
4231        SyntaxKind::AUTO_LINK => autolink_inline(node),
4232        SyntaxKind::INLINE_MATH => math_inline(node, "InlineMath"),
4233        SyntaxKind::DISPLAY_MATH => math_inline(node, "DisplayMath"),
4234        SyntaxKind::LATEX_COMMAND => latex_command_inline(node),
4235        SyntaxKind::BRACKETED_SPAN => bracketed_span_inline(node),
4236        SyntaxKind::INLINE_HTML_SPAN => inline_html_span_inline(node),
4237        SyntaxKind::INLINE_HTML => Inline::RawInline("html".to_string(), node.text().to_string()),
4238        SyntaxKind::FOOTNOTE_REFERENCE => footnote_reference_inline(node),
4239        SyntaxKind::INLINE_FOOTNOTE => inline_footnote_inline(node),
4240        other => Inline::Unsupported(format!("{other:?}")),
4241    }
4242}
4243
4244/// Inlines from a wrapper (Emph/Strong/...) where the structural markers are
4245/// child *nodes* (e.g. EMPHASIS_MARKER) rather than child tokens. We descend
4246/// through such marker children but skip their bytes.
4247fn inlines_from_marked(parent: &SyntaxNode) -> Vec<Inline> {
4248    let mut out = Vec::new();
4249    let mut iter = parent.children_with_tokens().peekable();
4250    while let Some(el) = iter.next() {
4251        match el {
4252            NodeOrToken::Token(t) => match t.kind() {
4253                SyntaxKind::EMPHASIS_MARKER
4254                | SyntaxKind::STRONG_MARKER
4255                | SyntaxKind::STRIKEOUT_MARKER
4256                | SyntaxKind::SUPERSCRIPT_MARKER
4257                | SyntaxKind::SUBSCRIPT_MARKER
4258                | SyntaxKind::MARK_MARKER => {}
4259                _ => push_token_inline(&t, &mut out),
4260            },
4261            NodeOrToken::Node(n) => match n.kind() {
4262                SyntaxKind::EMPHASIS_MARKER
4263                | SyntaxKind::STRONG_MARKER
4264                | SyntaxKind::STRIKEOUT_MARKER
4265                | SyntaxKind::SUPERSCRIPT_MARKER
4266                | SyntaxKind::SUBSCRIPT_MARKER
4267                | SyntaxKind::MARK_MARKER => {}
4268                _ if n.kind() == SyntaxKind::LATEX_COMMAND => {
4269                    emit_latex_command_with_absorb(&n, &mut iter, &mut out);
4270                }
4271                _ => push_inline_node(&n, &mut out),
4272            },
4273        }
4274    }
4275    out
4276}
4277
4278fn render_link_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
4279    let text_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT);
4280    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
4281    let has_dest_paren = node
4282        .children_with_tokens()
4283        .any(|el| matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::LINK_DEST_START));
4284
4285    if has_dest_paren {
4286        let text = text_node
4287            .as_ref()
4288            .map(|n| coalesce_inlines(inlines_from(n)))
4289            .unwrap_or_default();
4290        let (url, title) = dest_node
4291            .as_ref()
4292            .map(parse_link_dest)
4293            .unwrap_or((String::new(), String::new()));
4294        out.push(Inline::Link(extract_attr_from_node(node), text, url, title));
4295        return;
4296    }
4297
4298    // Reference-style link: shortcut [label], implicit [label][], or full
4299    // [text][ref]. Distinguish by presence/contents of LINK_REF.
4300    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
4301    let resolved_text_inlines = text_node
4302        .as_ref()
4303        .map(|n| coalesce_inlines(inlines_from(n)))
4304        .unwrap_or_default();
4305    let text_label = text_node
4306        .as_ref()
4307        .map(|n| n.text().to_string())
4308        .unwrap_or_default();
4309
4310    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
4311        Some(rn) => {
4312            let inner = rn.text().to_string();
4313            if inner.is_empty() {
4314                (text_label.clone(), true, String::new())
4315            } else {
4316                (inner.clone(), true, inner)
4317            }
4318        }
4319        None => (text_label.clone(), false, String::new()),
4320    };
4321
4322    if let Some((url, title)) = lookup_ref(&label) {
4323        out.push(Inline::Link(
4324            extract_attr_from_node(node),
4325            resolved_text_inlines,
4326            url,
4327            title,
4328        ));
4329        return;
4330    }
4331
4332    if let Some(id) = lookup_heading_id(&label) {
4333        let url = format!("#{id}");
4334        out.push(Inline::Link(
4335            extract_attr_from_node(node),
4336            resolved_text_inlines,
4337            url,
4338            String::new(),
4339        ));
4340        return;
4341    }
4342
4343    // Unresolved: emit the original markdown bytes as plain text. The reader
4344    // assembles `[<text>]`, optionally followed by `[<ref>]` for a full or
4345    // implicit reference. Using Str inlines here (rather than Link with empty
4346    // dest) matches pandoc's behavior of leaving unresolved references as raw
4347    // text in the output stream. Use keep_edges so leading/trailing whitespace
4348    // inside `[ ... ]` survives — pandoc preserves source whitespace for
4349    // unresolved references (`[ foo ]` → `Str "[", Space, Str "foo", Space,
4350    // Str "]"`), unlike resolved Links which strip edges.
4351    let unresolved_text_inlines = text_node
4352        .as_ref()
4353        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
4354        .unwrap_or_default();
4355    out.push(Inline::Str("[".to_string()));
4356    out.extend(unresolved_text_inlines);
4357    let suffix = if has_second_brackets {
4358        format!("][{second_inner}]")
4359    } else {
4360        "]".to_string()
4361    };
4362    out.push(Inline::Str(suffix));
4363}
4364
4365fn render_image_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
4366    let alt_node = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
4367    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
4368    let has_dest_paren = node.children_with_tokens().any(|el| {
4369        matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::IMAGE_DEST_START
4370            || t.kind() == SyntaxKind::LINK_DEST_START)
4371    });
4372
4373    if has_dest_paren {
4374        let alt = alt_node
4375            .as_ref()
4376            .map(|n| coalesce_inlines(inlines_from(n)))
4377            .unwrap_or_default();
4378        let (url, title) = dest_node
4379            .as_ref()
4380            .map(parse_link_dest)
4381            .unwrap_or((String::new(), String::new()));
4382        out.push(Inline::Image(extract_attr_from_node(node), alt, url, title));
4383        return;
4384    }
4385
4386    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
4387    let alt_inlines = alt_node
4388        .as_ref()
4389        .map(|n| coalesce_inlines(inlines_from(n)))
4390        .unwrap_or_default();
4391    let alt_label = alt_node
4392        .as_ref()
4393        .map(|n| n.text().to_string())
4394        .unwrap_or_default();
4395
4396    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
4397        Some(rn) => {
4398            let inner = rn.text().to_string();
4399            if inner.is_empty() {
4400                (alt_label.clone(), true, String::new())
4401            } else {
4402                (inner.clone(), true, inner)
4403            }
4404        }
4405        None => (alt_label.clone(), false, String::new()),
4406    };
4407
4408    if let Some((url, title)) = lookup_ref(&label) {
4409        out.push(Inline::Image(
4410            extract_attr_from_node(node),
4411            alt_inlines,
4412            url,
4413            title,
4414        ));
4415        return;
4416    }
4417
4418    if let Some(id) = lookup_heading_id(&label) {
4419        let url = format!("#{id}");
4420        out.push(Inline::Image(
4421            extract_attr_from_node(node),
4422            alt_inlines,
4423            url,
4424            String::new(),
4425        ));
4426        return;
4427    }
4428
4429    out.push(Inline::Str("![".to_string()));
4430    out.extend(alt_inlines);
4431    let suffix = if has_second_brackets {
4432        format!("][{second_inner}]")
4433    } else {
4434        "]".to_string()
4435    };
4436    out.push(Inline::Str(suffix));
4437}
4438
/// Normalize the raw content of an inline code span the way pandoc's
/// inline code reader (`Markdown.hs::code`) does: internal newlines
/// become spaces (each `\n` → one space) and the result is `trim`med.
/// Internal whitespace runs are preserved.
///
/// Pandoc's `trim` removes only spaces, tabs, carriage returns and
/// newlines. `str::trim` would additionally strip every other Unicode
/// whitespace character (e.g. a leading U+00A0 NO-BREAK SPACE), so the
/// trim set is spelled out explicitly here.
fn strip_inline_code_padding(s: &str) -> String {
    // Trimming first is equivalent to replacing newlines first, because
    // both `' '` and `'\n'` are in the trim set.
    s.trim_matches(|c: char| matches!(c, ' ' | '\t' | '\n' | '\r'))
        .chars()
        .map(|c| if c == '\n' { ' ' } else { c })
        .collect()
}
4447
4448fn math_inline(node: &SyntaxNode, kind: &'static str) -> Inline {
4449    let mut content = String::new();
4450    for el in node.children_with_tokens() {
4451        if let NodeOrToken::Token(t) = el {
4452            match t.kind() {
4453                SyntaxKind::INLINE_MATH_MARKER | SyntaxKind::DISPLAY_MATH_MARKER => {}
4454                _ => content.push_str(t.text()),
4455            }
4456        }
4457    }
4458    Inline::Math(kind, content)
4459}
4460
4461fn autolink_inline(node: &SyntaxNode) -> Inline {
4462    let mut url = String::new();
4463    for el in node.children_with_tokens() {
4464        if let NodeOrToken::Token(t) = el
4465            && t.kind() == SyntaxKind::TEXT
4466        {
4467            url.push_str(t.text());
4468        }
4469    }
4470    // Pandoc treats `<foo@bar>` as an email autolink (class "email", `mailto:`
4471    // dest) when the body has no scheme but contains an `@`.
4472    let is_email = !url.contains("://") && !url.starts_with("mailto:") && url.contains('@');
4473    if is_email {
4474        let attr = Attr {
4475            id: String::new(),
4476            classes: vec!["email".to_string()],
4477            kvs: Vec::new(),
4478        };
4479        let dest = format!("mailto:{url}");
4480        return Inline::Link(attr, vec![Inline::Str(url)], dest, String::new());
4481    }
4482    // Pandoc only treats `<scheme:body>` as a URI autolink when `scheme` is
4483    // in its known-schemes allowlist (see pandoc/src/Text/Pandoc/URI.hs).
4484    // Otherwise the original `<...>` bytes are emitted as raw HTML.
4485    if !is_known_uri_scheme(&url) {
4486        return Inline::RawInline("html".to_string(), node.text().to_string());
4487    }
4488    let attr = Attr {
4489        id: String::new(),
4490        classes: vec!["uri".to_string()],
4491        kvs: Vec::new(),
4492    };
4493    Inline::Link(attr, vec![Inline::Str(url.clone())], url, String::new())
4494}
4495
4496/// Pandoc's URI scheme allowlist (IANA + a few unofficial ones). Mirrors
4497/// `pandoc/src/Text/Pandoc/URI.hs`. Lowercase comparison.
4498fn is_known_uri_scheme(url: &str) -> bool {
4499    let scheme_end = url.find(':');
4500    let Some(end) = scheme_end else {
4501        return false;
4502    };
4503    let scheme = url[..end].to_ascii_lowercase();
4504    PANDOC_KNOWN_SCHEMES.binary_search(&scheme.as_str()).is_ok()
4505}
4506
/// URI schemes pandoc recognizes, mirroring the `schemes` set in
/// `pandoc/src/Text/Pandoc/URI.hs`.
///
/// The entries are all lowercase and MUST stay in ascending byte order:
/// lookups go through `binary_search`. Entries are grouped by initial
/// letter purely for readability.
#[rustfmt::skip]
const PANDOC_KNOWN_SCHEMES: &[&str] = &[
    // a
    "aaa", "aaas", "about", "acap", "acct", "acr", "adiumxtra", "afp", "afs", "aim", "appdata",
    "apt", "attachment", "aw",
    // b
    "barion", "beshare", "bitcoin", "blob", "bolo", "browserext",
    // c
    "callto", "cap", "chrome", "chrome-extension", "cid", "coap", "coaps",
    "com-eventbrite-attendee", "content", "crid", "cvs",
    // d
    "data", "dav", "dict", "dis", "dlna-playcontainer", "dlna-playsingle", "dns", "dntp", "doi",
    "dtn", "dvb",
    // e
    "ed2k", "example",
    // f
    "facetime", "fax", "feed", "feedready", "file", "filesystem", "finger", "fish", "ftp",
    // g
    "gemini", "geo", "gg", "git", "gizmoproject", "go", "gopher", "graph", "gtalk",
    // h
    "h323", "ham", "hcp", "http", "https", "hxxp", "hxxps", "hydrazone",
    // i
    "iax", "icap", "icon", "im", "imap", "info", "iotdisco", "ipn", "ipp", "ipps", "irc", "irc6",
    "ircs", "iris", "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs", "isbn", "isostore", "itms",
    // j
    "jabber", "jar", "javascript", "jms",
    // k
    "keyparc",
    // l
    "lastfm", "ldap", "ldaps", "lvlt",
    // m
    "magnet", "mailserver", "mailto", "maps", "market", "message", "mid", "mms", "modem",
    "mongodb", "moz",
    "ms-access", "ms-browser-extension", "ms-drive-to", "ms-enrollment", "ms-excel",
    "ms-gamebarservices", "ms-getoffice", "ms-help", "ms-infopath", "ms-media-stream-id",
    "ms-officeapp", "ms-powerpoint", "ms-project", "ms-publisher", "ms-search-repair",
    "ms-secondary-screen-controller", "ms-secondary-screen-setup",
    "ms-settings", "ms-settings-airplanemode", "ms-settings-bluetooth", "ms-settings-camera",
    "ms-settings-cellular", "ms-settings-cloudstorage", "ms-settings-connectabledevices",
    "ms-settings-displays-topology", "ms-settings-emailandaccounts", "ms-settings-language",
    "ms-settings-location", "ms-settings-lock", "ms-settings-nfctransactions",
    "ms-settings-notifications", "ms-settings-power", "ms-settings-privacy",
    "ms-settings-proximity", "ms-settings-screenrotation", "ms-settings-wifi",
    "ms-settings-workplace",
    "ms-spd", "ms-sttoverlay", "ms-transit-to", "ms-virtualtouchpad", "ms-visio", "ms-walk-to",
    "ms-whiteboard", "ms-whiteboard-cmd", "ms-word",
    "msnim", "msrp", "msrps", "mtqp", "mumble", "mupdate", "mvn",
    // n
    "news", "nfs", "ni", "nih", "nntp", "notes",
    // o
    "ocf", "oid", "onenote", "onenote-cmd", "opaquelocktoken",
    // p
    "pack", "palm", "paparazzi", "pkcs11", "platform", "pmid", "pop", "pres", "prospero",
    "proxy", "psyc", "pwid",
    // q
    "qb", "query",
    // r
    "redis", "rediss", "reload", "res", "resource", "rmi", "rsync", "rtmfp", "rtmp", "rtsp",
    "rtsps", "rtspu",
    // s
    "secondlife", "service", "session", "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype",
    "smb", "sms", "smtp", "snews", "snmp", "soap.beep", "soap.beeps", "soldat", "spotify", "ssh",
    "steam", "stun", "stuns", "submit", "svn",
    // t
    "tag", "teamspeak", "tel", "teliaeid", "telnet", "tftp", "things", "thismessage", "tip",
    "tn3270", "tool", "turn", "turns", "tv",
    // u
    "udp", "unreal", "urn", "ut2004",
    // v
    "v-event", "vemmi", "ventrilo", "videotex", "view-source", "vnc",
    // w
    "wais", "webcal", "wpid", "ws", "wss", "wtai", "wyciwyg",
    // x
    "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri",
    // y
    "ymsgr",
    // z
    "z39.50", "z39.50r", "z39.50s",
];
4557
4558fn footnote_reference_inline(node: &SyntaxNode) -> Inline {
4559    let Some(label) = footnote_label(node) else {
4560        return Inline::Unsupported("FOOTNOTE_REFERENCE".to_string());
4561    };
4562    let blocks = REFS_CTX.with(|c| {
4563        c.borrow()
4564            .footnotes
4565            .get(&label)
4566            .map(|bs| bs.iter().map(clone_block).collect::<Vec<_>>())
4567    });
4568    match blocks {
4569        Some(bs) => Inline::Note(bs),
4570        // Unresolved footnote reference: pandoc emits the original bytes as
4571        // text rather than a `Note []`. Keep the raw token text for now.
4572        None => Inline::Str(node.text().to_string()),
4573    }
4574}
4575
4576fn inline_footnote_inline(node: &SyntaxNode) -> Inline {
4577    let inlines = coalesce_inlines(inlines_from(node));
4578    if inlines.is_empty() {
4579        Inline::Note(Vec::new())
4580    } else {
4581        Inline::Note(vec![Block::Para(inlines)])
4582    }
4583}
4584
4585fn parse_link_dest(node: &SyntaxNode) -> (String, String) {
4586    // LINK_DEST holds the raw bytes between `(` and `)`. Split into URL and
4587    // optional quoted title, then percent-escape unsafe characters in the URL
4588    // to match pandoc's `escapeURI`.
4589    let raw = node.text().to_string();
4590    let trimmed = raw.trim();
4591    // `<URL>` form: pandoc strips the angle brackets, even if the URL
4592    // contains otherwise-ambiguous characters like spaces or parens.
4593    if let Some(rest) = trimmed.strip_prefix('<')
4594        && let Some(end) = rest.find('>')
4595    {
4596        let url = &rest[..end];
4597        let after = rest[end + 1..].trim();
4598        let title = parse_dest_title(after);
4599        return (escape_link_dest(url), title);
4600    }
4601    // URL/title boundary: a title starts with `"`, `'`, or `(` after
4602    // whitespace. Without one, the entire string is the URL — internal
4603    // spaces still get percent-escaped.
4604    let bytes = trimmed.as_bytes();
4605    let mut url_end = trimmed.len();
4606    let mut i = 0;
4607    while i < bytes.len() {
4608        if matches!(bytes[i], b' ' | b'\t' | b'\n') {
4609            let mut j = i;
4610            while j < bytes.len() && matches!(bytes[j], b' ' | b'\t' | b'\n') {
4611                j += 1;
4612            }
4613            if j < bytes.len() && matches!(bytes[j], b'"' | b'\'' | b'(') {
4614                url_end = i;
4615                break;
4616            }
4617            i = j;
4618        } else {
4619            i += 1;
4620        }
4621    }
4622    let url_raw = &trimmed[..url_end];
4623    let title = parse_dest_title(trimmed[url_end..].trim());
4624    (escape_link_dest(url_raw), title)
4625}
4626
/// Percent-escapes a link destination in the manner of pandoc's `escapeURI`.
///
/// Every character for which [`char::is_whitespace`] holds, plus the
/// punctuation `<>|"{}[]^` and the backtick, is replaced by the `%XX`
/// (uppercase hex) escape of each of its UTF-8 bytes. All other characters,
/// ASCII or not, are copied through unchanged.
fn escape_link_dest(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";
    let mut escaped = String::with_capacity(s.len());
    let mut utf8 = [0u8; 4];
    for ch in s.chars() {
        let reserved = matches!(
            ch,
            '<' | '>' | '|' | '"' | '{' | '}' | '[' | ']' | '^' | '`'
        );
        if !(reserved || ch.is_whitespace()) {
            escaped.push(ch);
            continue;
        }
        // Escape byte-by-byte so multi-byte whitespace yields one `%XX` per
        // UTF-8 byte.
        for &byte in ch.encode_utf8(&mut utf8).as_bytes() {
            escaped.push('%');
            escaped.push(char::from(HEX[usize::from(byte >> 4)]));
            escaped.push(char::from(HEX[usize::from(byte & 0x0f)]));
        }
    }
    escaped
}
4649
/// Extracts the title from the (already trimmed) text that follows a link
/// URL.
///
/// The title must open with `"`, `'`, or `(`; it runs up to the *last*
/// occurrence of the matching closer (`"`, `'`, or `)` respectively), so
/// closer characters inside the title are kept. Returns an empty string when
/// `s` is empty, does not start with an opener, or has no closer after the
/// opener. Anything after the final closer is discarded.
fn parse_dest_title(s: &str) -> String {
    let mut chars = s.chars();
    let close = match chars.next() {
        Some('"') => '"',
        Some('\'') => '\'',
        Some('(') => ')',
        _ => return String::new(),
    };
    // Everything after the one-character opener.
    let rest = chars.as_str();
    rest.rfind(close)
        .map(|end| rest[..end].to_string())
        .unwrap_or_default()
}
4669
4670// ----- coalescing & helpers ----------------------------------------------
4671
4672fn coalesce_inlines(input: Vec<Inline>) -> Vec<Inline> {
4673    coalesce_inlines_inner(input, true)
4674}
4675
4676/// Inside markup atoms (Emph/Strong/Strikeout/Sup/Sub), pandoc preserves
4677/// leading/trailing whitespace inside the wrapper — e.g. `*foo bar *` projects
4678/// as `Emph [Str "foo", Space, Str "bar", Space]`. Block-level paragraphs and
4679/// headers strip edge whitespace, but inline markup wrappers do not.
4680fn coalesce_inlines_keep_edges(input: Vec<Inline>) -> Vec<Inline> {
4681    coalesce_inlines_inner(input, false)
4682}
4683
4684fn coalesce_inlines_inner(input: Vec<Inline>, trim_edges: bool) -> Vec<Inline> {
4685    let mut out: Vec<Inline> = Vec::with_capacity(input.len());
4686    for inline in input {
4687        if let Inline::Str(s) = inline {
4688            if let Some(Inline::Str(prev)) = out.last_mut() {
4689                prev.push_str(&s);
4690            } else {
4691                out.push(Inline::Str(s));
4692            }
4693        } else if let Inline::Space = inline {
4694            // Collapse runs of Space into a single Space; pandoc never emits
4695            // two consecutive Space tokens.
4696            if matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4697                continue;
4698            }
4699            out.push(Inline::Space);
4700        } else if let Inline::SoftBreak = inline {
4701            // SoftBreak after Space: drop the trailing Space to match pandoc
4702            // (line-end whitespace is not preserved as Space).
4703            if matches!(out.last(), Some(Inline::Space)) {
4704                out.pop();
4705            }
4706            out.push(Inline::SoftBreak);
4707        } else {
4708            out.push(inline);
4709        }
4710    }
4711    if trim_edges {
4712        // Trim leading/trailing Space/SoftBreak — pandoc does not emit edge
4713        // whitespace inside a paragraph or header.
4714        while matches!(out.first(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4715            out.remove(0);
4716        }
4717        while matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
4718            out.pop();
4719        }
4720    }
4721    // Pandoc's `smart` extension is on by default for markdown. Apply the
4722    // simple in-Str substitutions here (apostrophe, dashes, ellipsis), then
4723    // restructure paired straight quotes into `Quoted` nodes.
4724    for inline in out.iter_mut() {
4725        if let Inline::Str(s) = inline {
4726            let mut t = smart_intraword_apostrophe(s);
4727            t = smart_dashes_and_ellipsis(&t);
4728            *s = t;
4729        }
4730    }
4731    let out = smart_quote_pairs(out);
4732    apply_abbreviations(out)
4733}
4734
/// Pandoc's default abbreviation list (from `pandoc/data/abbreviations`).
///
/// A projected `Str` counts as an abbreviation when it *ends with* one of
/// these entries and the entry is either the whole `Str` or is preceded by a
/// character that is neither alphanumeric nor `.` (see
/// `matches_abbreviation_suffix`). When such a `Str` is followed by a
/// `Space`, pandoc replaces the space with a non-breaking space appended to
/// the `Str`. Entries are kept sorted (byte order) so that `binary_search`
/// remains usable for exact-match lookups.
const PANDOC_ABBREVIATIONS: &[&str] = &[
    "Apr.", "Aug.", "Bros.", "Capt.", "Co.", "Corp.", "Dec.", "Dr.", "Feb.", "Fr.", "Gen.", "Gov.",
    "Hon.", "Inc.", "Jan.", "Jr.", "Jul.", "Jun.", "Ltd.", "M.A.", "M.D.", "Mar.", "Mr.", "Mrs.",
    "Ms.", "No.", "Nov.", "Oct.", "Ph.D.", "Pres.", "Prof.", "Rep.", "Rev.", "Sen.", "Sep.",
    "Sept.", "Sgt.", "Sr.", "St.", "aet.", "aetat.", "al.", "bk.", "c.", "cf.", "ch.", "chap.",
    "chs.", "col.", "cp.", "d.", "e.g.", "ed.", "eds.", "esp.", "f.", "fasc.", "ff.", "fig.",
    "fl.", "fol.", "fols.", "i.e.", "ill.", "incl.", "n.", "n.b.", "nn.", "p.", "pp.", "pt.",
    "q.v.", "s.v.", "s.vv.", "saec.", "sec.", "univ.", "viz.", "vol.", "vs.",
];
4749
4750fn matches_abbreviation_suffix(s: &str) -> bool {
4751    for &abbr in PANDOC_ABBREVIATIONS {
4752        if let Some(prefix) = s.strip_suffix(abbr) {
4753            if prefix.is_empty() {
4754                return true;
4755            }
4756            let last = prefix.chars().next_back().unwrap();
4757            if !last.is_alphanumeric() && last != '.' {
4758                return true;
4759            }
4760        }
4761    }
4762    false
4763}
4764
4765/// Apply pandoc's `+abbreviations` extension as a post-pass over a flat inline
4766/// list. For each `Str` ending in a known abbreviation followed by `Space`,
4767/// drop the `Space`, append `\u{a0}` (NBSP) to the `Str`, and merge the
4768/// following `Str` (if any) into it. Recurses into `Quoted` content because
4769/// `Quoted` is built inside `smart_quote_pairs` after the parent
4770/// `coalesce_inlines_inner` already ran on its source list, so its content
4771/// won't have been abbreviation-processed yet. Other inline wrappers (`Emph`,
4772/// `Strong`, `Link`, `Image`, `Note`, …) are constructed via their own
4773/// `coalesce_inlines_*` call, so their contents are already processed.
4774fn apply_abbreviations(inlines: Vec<Inline>) -> Vec<Inline> {
4775    let inlines: Vec<Inline> = inlines
4776        .into_iter()
4777        .map(|inline| match inline {
4778            Inline::Quoted(kind, content) => Inline::Quoted(kind, apply_abbreviations(content)),
4779            other => other,
4780        })
4781        .collect();
4782    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
4783    let mut iter = inlines.into_iter().peekable();
4784    while let Some(inline) = iter.next() {
4785        if let Inline::Str(ref s) = inline
4786            && matches_abbreviation_suffix(s)
4787            && matches!(iter.peek(), Some(Inline::Space))
4788        {
4789            // Drop the Space.
4790            iter.next();
4791            let Inline::Str(mut new_s) = inline else {
4792                unreachable!()
4793            };
4794            new_s.push('\u{a0}');
4795            // Merge with the following Str if present.
4796            if let Some(Inline::Str(_)) = iter.peek()
4797                && let Some(Inline::Str(next_s)) = iter.next()
4798            {
4799                new_s.push_str(&next_s);
4800            }
4801            out.push(Inline::Str(new_s));
4802        } else {
4803            out.push(inline);
4804        }
4805    }
4806    out
4807}
4808
/// Restructures paired straight quotes in a flat inline list into
/// `Inline::Quoted` nodes (`"DoubleQuote"` / `"SingleQuote"`).
///
/// Returns a new list; inlines that are not part of a recognized quote pair
/// are copied through with `clone_inline`. An opening quote that has no
/// matching close (per `find_matching_close`) is left in its `Str` unchanged.
fn smart_quote_pairs(inlines: Vec<Inline>) -> Vec<Inline> {
    // Walk left-to-right, when a Str starts with a straight quote and the
    // previous element is a "boundary" (nothing, Space/SoftBreak/LineBreak, or
    // a Str ending in a non-alphanumeric char), look ahead for a matching
    // close quote (Str ending with same quote char, followed by a boundary).
    // Wrap the inlines in between in a `Quoted` node.
    // Only handle quotes at Str boundaries; embedded or interleaved quotes are
    // not restructured (kept as-is) — pandoc has more nuanced rules but this
    // covers the common natural-text patterns in the corpus.
    fn is_boundary(prev: Option<&Inline>) -> bool {
        match prev {
            None => true,
            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
            Some(Inline::Str(s)) => s.chars().last().is_some_and(|c| !c.is_alphanumeric()),
            _ => false,
        }
    }
    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
    let n = inlines.len();
    // consumed[k] is set once inlines[k] has been emitted, either directly or
    // as part of a Quoted node's content.
    let mut consumed = vec![false; n];
    for i in 0..n {
        if consumed[i] {
            continue;
        }
        // Try to detect an open quote at position i.
        let Inline::Str(s) = &inlines[i] else {
            out.push(clone_inline(&inlines[i]));
            consumed[i] = true;
            continue;
        };
        let first = s.chars().next();
        let quote = match first {
            Some('"') => Some('"'),
            Some('\'') => Some('\''),
            _ => None,
        };
        // Open quote condition: previous inline is boundary, AND either
        // (a) the Str has more chars after the quote and the next char is
        //     non-space (open quote attaches to a word in the same Str), or
        // (b) the Str is *only* the quote and the next inline is a markup
        //     atom (Emph/Strong/...), so the quote attaches across atoms.
        // Note the boundary test looks at the last inline already *emitted*
        // to `out`, not at inlines[i - 1].
        let prev_is_boundary = is_boundary(out.last());
        let str_has_more = s.chars().count() > 1;
        let next_char_is_word = s.chars().nth(1).is_some_and(|c| !c.is_whitespace());
        let next_is_markup_atom = matches!(
            inlines.get(i + 1),
            Some(
                Inline::Emph(_)
                    | Inline::Strong(_)
                    | Inline::Strikeout(_)
                    | Inline::Superscript(_)
                    | Inline::Subscript(_)
                    | Inline::Code(_, _)
            )
        );
        let attaches =
            (str_has_more && next_char_is_word) || (!str_has_more && next_is_markup_atom);
        if let Some(q) = quote
            && prev_is_boundary
            && attaches
        {
            // Find the matching close.
            if let Some(close_idx) = find_matching_close(&inlines, i, q, &consumed) {
                // Build content: inlines from i to close_idx (inclusive),
                // strip the leading quote from inlines[i] and trailing quote
                // from inlines[close_idx].
                let kind = if q == '"' {
                    "DoubleQuote"
                } else {
                    "SingleQuote"
                };
                let mut content: Vec<Inline> = Vec::new();
                for j in i..=close_idx {
                    if consumed[j] {
                        continue;
                    }
                    let inline = &inlines[j];
                    if j == i && j == close_idx {
                        // Open and close in the same Str — strip both ends.
                        if let Inline::Str(s) = inline {
                            let mut chars: Vec<char> = s.chars().collect();
                            if chars.len() >= 2 {
                                chars.remove(0);
                                chars.pop();
                            }
                            let stripped: String = chars.into_iter().collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == i {
                        // Opening Str: drop the leading quote char; a Str
                        // that was only the quote contributes nothing.
                        if let Inline::Str(s) = inline {
                            let stripped: String = s.chars().skip(1).collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == close_idx {
                        // Closing Str: drop the trailing quote char.
                        if let Inline::Str(s) = inline {
                            let mut stripped: String = s.chars().collect();
                            stripped.pop();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else {
                        // Interior inline: copied through unchanged.
                        content.push(clone_inline(inline));
                    }
                    consumed[j] = true;
                }
                out.push(Inline::Quoted(kind, content));
                continue;
            }
        }
        // Not an opening quote, or no matching close: emit as-is.
        out.push(clone_inline(&inlines[i]));
        consumed[i] = true;
    }
    out
}
4927
4928fn find_matching_close(
4929    inlines: &[Inline],
4930    open_idx: usize,
4931    quote: char,
4932    consumed: &[bool],
4933) -> Option<usize> {
4934    // First check: same Str ends with the matching quote (close in same Str).
4935    if let Inline::Str(s) = &inlines[open_idx]
4936        && s.chars().count() >= 3
4937        && s.ends_with(quote)
4938    {
4939        // Need to confirm the next inline (after this Str) is a boundary.
4940        let next = inlines.get(open_idx + 1);
4941        let after_is_boundary = match next {
4942            None => true,
4943            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
4944            Some(Inline::Str(s)) => s.chars().next().is_some_and(|c| !c.is_alphanumeric()),
4945            _ => false,
4946        };
4947        if after_is_boundary {
4948            return Some(open_idx);
4949        }
4950    }
4951    // Otherwise, scan forward for a Str ending with the quote and followed by
4952    // a boundary.
4953    let n = inlines.len();
4954    let mut j = open_idx + 1;
4955    while j < n {
4956        if consumed[j] {
4957            return None;
4958        }
4959        match &inlines[j] {
4960            Inline::Str(s) => {
4961                if s.ends_with(quote) {
4962                    let next = inlines.get(j + 1);
4963                    let after_is_boundary = match next {
4964                        None => true,
4965                        Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
4966                        Some(Inline::Str(s)) => {
4967                            s.chars().next().is_some_and(|c| !c.is_alphanumeric())
4968                        }
4969                        _ => false,
4970                    };
4971                    if after_is_boundary {
4972                        return Some(j);
4973                    }
4974                }
4975            }
4976            Inline::Space | Inline::SoftBreak | Inline::LineBreak => {}
4977            // Don't span over markup atoms — keep search cheap and predictable.
4978            _ => {}
4979        }
4980        j += 1;
4981        // Cap search range — natural quoted spans are short.
4982        if j - open_idx > 32 {
4983            return None;
4984        }
4985    }
4986    None
4987}
4988
/// Produces a deep copy of an `Inline`.
///
/// Every variant is rebuilt explicitly: owned payloads are `.clone()`d,
/// nested inline lists are copied element-wise through `clone_inline`, and
/// `Note` bodies through `clone_block`. The match is exhaustive with no
/// catch-all arm, so adding an `Inline` variant forces an update here.
fn clone_inline(inline: &Inline) -> Inline {
    match inline {
        Inline::Str(s) => Inline::Str(s.clone()),
        Inline::Space => Inline::Space,
        Inline::SoftBreak => Inline::SoftBreak,
        Inline::LineBreak => Inline::LineBreak,
        Inline::Emph(c) => Inline::Emph(c.iter().map(clone_inline).collect()),
        Inline::Strong(c) => Inline::Strong(c.iter().map(clone_inline).collect()),
        Inline::Strikeout(c) => Inline::Strikeout(c.iter().map(clone_inline).collect()),
        Inline::Superscript(c) => Inline::Superscript(c.iter().map(clone_inline).collect()),
        Inline::Subscript(c) => Inline::Subscript(c.iter().map(clone_inline).collect()),
        Inline::Code(a, s) => Inline::Code(a.clone(), s.clone()),
        Inline::Link(a, t, u, ti) => Inline::Link(
            a.clone(),
            t.iter().map(clone_inline).collect(),
            u.clone(),
            ti.clone(),
        ),
        Inline::Image(a, t, u, ti) => Inline::Image(
            a.clone(),
            t.iter().map(clone_inline).collect(),
            u.clone(),
            ti.clone(),
        ),
        // The kind tag is passed through without `.clone()` (same for
        // `Quoted` below).
        Inline::Math(k, c) => Inline::Math(k, c.clone()),
        Inline::Span(a, c) => Inline::Span(a.clone(), c.iter().map(clone_inline).collect()),
        Inline::RawInline(f, c) => Inline::RawInline(f.clone(), c.clone()),
        Inline::Quoted(k, c) => Inline::Quoted(k, c.iter().map(clone_inline).collect()),
        Inline::Note(blocks) => Inline::Note(blocks.iter().map(clone_block).collect()),
        // Citations are rebuilt field by field; `mode`, `note_num`, and
        // `hash` are copied by value.
        Inline::Cite(citations, text) => Inline::Cite(
            citations
                .iter()
                .map(|c| Citation {
                    id: c.id.clone(),
                    prefix: c.prefix.iter().map(clone_inline).collect(),
                    suffix: c.suffix.iter().map(clone_inline).collect(),
                    mode: c.mode,
                    note_num: c.note_num,
                    hash: c.hash,
                })
                .collect(),
            text.iter().map(clone_inline).collect(),
        ),
        Inline::Unsupported(s) => Inline::Unsupported(s.clone()),
    }
}
5035
/// Produces a copy of a `Block`, recursing through nested blocks with
/// `clone_block` and nested inlines with `clone_inline`.
///
/// NOTE: `Block::Table` is *not* preserved — it is replaced by
/// `Block::Unsupported("Table")`, so a table inside any cloned structure
/// (for example a footnote body copied by `footnote_reference_inline`) is
/// lost in the copy.
fn clone_block(b: &Block) -> Block {
    match b {
        Block::Para(c) => Block::Para(c.iter().map(clone_inline).collect()),
        Block::Plain(c) => Block::Plain(c.iter().map(clone_inline).collect()),
        Block::Header(lvl, a, c) => {
            Block::Header(*lvl, a.clone(), c.iter().map(clone_inline).collect())
        }
        Block::BlockQuote(blocks) => Block::BlockQuote(blocks.iter().map(clone_block).collect()),
        Block::CodeBlock(a, s) => Block::CodeBlock(a.clone(), s.clone()),
        Block::HorizontalRule => Block::HorizontalRule,
        // List items are themselves block lists; copy each item in turn.
        Block::BulletList(items) => Block::BulletList(
            items
                .iter()
                .map(|item| item.iter().map(clone_block).collect())
                .collect(),
        ),
        Block::OrderedList(start, style, delim, items) => Block::OrderedList(
            *start,
            style,
            delim,
            items
                .iter()
                .map(|item| item.iter().map(clone_block).collect())
                .collect(),
        ),
        Block::RawBlock(f, c) => Block::RawBlock(f.clone(), c.clone()),
        // Lossy: the table payload is dropped rather than copied.
        Block::Table(_) => Block::Unsupported("Table".to_string()),
        Block::Div(a, blocks) => Block::Div(a.clone(), blocks.iter().map(clone_block).collect()),
        Block::LineBlock(lines) => Block::LineBlock(
            lines
                .iter()
                .map(|line| line.iter().map(clone_inline).collect())
                .collect(),
        ),
        // Each entry pairs a term (inlines) with its definitions (a list of
        // block lists).
        Block::DefinitionList(items) => Block::DefinitionList(
            items
                .iter()
                .map(|(term, defs)| {
                    (
                        term.iter().map(clone_inline).collect(),
                        defs.iter()
                            .map(|d| d.iter().map(clone_block).collect())
                            .collect(),
                    )
                })
                .collect(),
        ),
        Block::Figure(a, caption, body) => Block::Figure(
            a.clone(),
            caption.iter().map(clone_block).collect(),
            body.iter().map(clone_block).collect(),
        ),
        Block::Unsupported(s) => Block::Unsupported(s.clone()),
    }
}
5091
/// Applies the `smart` dash and ellipsis substitutions to one string.
///
/// Scanning left to right, `---` becomes an em dash (U+2014), `--` an en dash
/// (U+2013), and `...` a horizontal ellipsis (U+2026); the longer dash form
/// is tried first. Everything else is copied through unchanged.
fn smart_dashes_and_ellipsis(s: &str) -> String {
    if !s.contains(['-', '.']) {
        return s.to_string();
    }
    // Tried in order at every position, so `---` wins over `--`.
    const RULES: [(&str, char); 3] = [
        ("---", '\u{2014}'),
        ("--", '\u{2013}'),
        ("...", '\u{2026}'),
    ];
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    'scan: while let Some(ch) = rest.chars().next() {
        for (pattern, replacement) in RULES {
            if let Some(tail) = rest.strip_prefix(pattern) {
                out.push(replacement);
                rest = tail;
                continue 'scan;
            }
        }
        // No rule applies here: copy one whole character.
        out.push(ch);
        rest = &rest[ch.len_utf8()..];
    }
    out
}
5124
/// Length in bytes of the UTF-8 sequence introduced by lead byte `b`.
///
/// ASCII bytes and bytes that are not valid lead bytes (`0x80..0xc0`) report
/// 1, so a scanner advances a single byte to recover.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0xbf => 1,
        0xc0..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
5137
5138fn smart_intraword_apostrophe(s: &str) -> String {
5139    if !s.contains('\'') {
5140        return s.to_string();
5141    }
5142    let chars: Vec<char> = s.chars().collect();
5143    let mut out = String::with_capacity(s.len());
5144    for (i, &c) in chars.iter().enumerate() {
5145        if c == '\'' {
5146            let prev = i.checked_sub(1).map(|j| chars[j]);
5147            let next = chars.get(i + 1).copied();
5148            let prev_word = prev.is_some_and(is_word_char);
5149            let next_word = next.is_some_and(is_word_char);
5150            if prev_word && next_word {
5151                out.push('\u{2019}');
5152                continue;
5153            }
5154        }
5155        out.push(c);
5156    }
5157    out
5158}
5159
/// A "word" character for smart-apostrophe purposes: any character that
/// [`char::is_alphanumeric`] accepts.
fn is_word_char(c: char) -> bool {
    char::is_alphanumeric(c)
}
5163
5164fn inlines_to_plaintext(inlines: &[Inline]) -> String {
5165    let mut s = String::new();
5166    for i in inlines {
5167        match i {
5168            Inline::Str(t) => s.push_str(t),
5169            Inline::Space | Inline::SoftBreak => s.push(' '),
5170            Inline::LineBreak => s.push(' '),
5171            Inline::Emph(children)
5172            | Inline::Strong(children)
5173            | Inline::Strikeout(children)
5174            | Inline::Superscript(children)
5175            | Inline::Subscript(children) => s.push_str(&inlines_to_plaintext(children)),
5176            Inline::Code(_, c) => s.push_str(c),
5177            Inline::Link(_, alt, _, _) | Inline::Image(_, alt, _, _) => {
5178                s.push_str(&inlines_to_plaintext(alt))
5179            }
5180            Inline::Math(_, c) => s.push_str(c),
5181            Inline::Span(_, children) => s.push_str(&inlines_to_plaintext(children)),
5182            Inline::RawInline(_, _) => {}
5183            Inline::Quoted(_, children) => s.push_str(&inlines_to_plaintext(children)),
5184            Inline::Note(_) => {}
5185            Inline::Cite(_, text) => s.push_str(&inlines_to_plaintext(text)),
5186            Inline::Unsupported(_) => {}
5187        }
5188    }
5189    s
5190}
5191
/// Derives an identifier slug from heading text.
///
/// Mirrors crates/panache-formatter::utils::pandoc_slugify so the parser-side
/// projector doesn't need to depend on the formatter crate. Each character is
/// lowercased; alphanumerics, `_`, `-`, and `.` are kept and everything else
/// is dropped. A whitespace character becomes a single `-` unless the slug is
/// still empty or the last character written was already a dash. Trailing
/// dashes are removed at the end.
fn pandoc_slugify(text: &str) -> String {
    let mut slug = String::new();
    let mut last_was_dash = false;
    for ch in text.chars() {
        if ch.is_whitespace() {
            if !(slug.is_empty() || last_was_dash) {
                slug.push('-');
                last_was_dash = true;
            }
        } else {
            let kept = ch
                .to_lowercase()
                .filter(|&c| c.is_alphanumeric() || matches!(c, '_' | '-' | '.'));
            // Dropped characters leave `last_was_dash` untouched.
            for lower in kept {
                slug.push(lower);
                last_was_dash = lower == '-';
            }
        }
    }
    let trimmed_len = slug.trim_end_matches('-').len();
    slug.truncate(trimmed_len);
    slug
}
5217
5218impl Attr {
5219    fn with_id(id: String) -> Self {
5220        Self {
5221            id,
5222            classes: Vec::new(),
5223            kvs: Vec::new(),
5224        }
5225    }
5226}
5227
5228// ----- text emission ------------------------------------------------------
5229
5230fn write_block(b: &Block, out: &mut String) {
5231    match b {
5232        Block::Para(inlines) => {
5233            out.push_str("Para [");
5234            write_inline_list(inlines, out);
5235            out.push_str(" ]");
5236        }
5237        Block::Plain(inlines) => {
5238            out.push_str("Plain [");
5239            write_inline_list(inlines, out);
5240            out.push_str(" ]");
5241        }
5242        Block::Header(level, attr, inlines) => {
5243            out.push_str(&format!("Header {level} ("));
5244            write_attr(attr, out);
5245            out.push_str(") [");
5246            write_inline_list(inlines, out);
5247            out.push_str(" ]");
5248        }
5249        Block::BlockQuote(blocks) => {
5250            out.push_str("BlockQuote [");
5251            write_block_list(blocks, out);
5252            out.push_str(" ]");
5253        }
5254        Block::CodeBlock(attr, content) => {
5255            out.push_str("CodeBlock (");
5256            write_attr(attr, out);
5257            out.push_str(") ");
5258            write_haskell_string(content, out);
5259        }
5260        Block::HorizontalRule => out.push_str("HorizontalRule"),
5261        Block::BulletList(items) => {
5262            out.push_str("BulletList [");
5263            for (i, item) in items.iter().enumerate() {
5264                if i > 0 {
5265                    out.push(',');
5266                }
5267                out.push_str(" [");
5268                write_block_list(item, out);
5269                out.push_str(" ]");
5270            }
5271            out.push_str(" ]");
5272        }
5273        Block::OrderedList(start, style, delim, items) => {
5274            out.push_str(&format!("OrderedList ( {start} , {style} , {delim} ) ["));
5275            for (i, item) in items.iter().enumerate() {
5276                if i > 0 {
5277                    out.push(',');
5278                }
5279                out.push_str(" [");
5280                write_block_list(item, out);
5281                out.push_str(" ]");
5282            }
5283            out.push_str(" ]");
5284        }
5285        Block::RawBlock(format, content) => {
5286            out.push_str("RawBlock ( Format ");
5287            write_haskell_string(format, out);
5288            out.push_str(" ) ");
5289            write_haskell_string(content, out);
5290        }
5291        Block::Table(data) => {
5292            write_table(data, out);
5293        }
5294        Block::Div(attr, blocks) => {
5295            out.push_str("Div (");
5296            write_attr(attr, out);
5297            out.push_str(") [");
5298            write_block_list(blocks, out);
5299            out.push_str(" ]");
5300        }
5301        Block::LineBlock(lines) => {
5302            out.push_str("LineBlock [");
5303            for (i, line) in lines.iter().enumerate() {
5304                if i > 0 {
5305                    out.push(',');
5306                }
5307                out.push_str(" [");
5308                write_inline_list(line, out);
5309                out.push_str(" ]");
5310            }
5311            out.push_str(" ]");
5312        }
5313        Block::DefinitionList(items) => {
5314            out.push_str("DefinitionList [");
5315            for (i, (term, defs)) in items.iter().enumerate() {
5316                if i > 0 {
5317                    out.push(',');
5318                }
5319                out.push_str(" ( [");
5320                write_inline_list(term, out);
5321                out.push_str(" ] , [");
5322                for (j, def) in defs.iter().enumerate() {
5323                    if j > 0 {
5324                        out.push(',');
5325                    }
5326                    out.push_str(" [");
5327                    write_block_list(def, out);
5328                    out.push_str(" ]");
5329                }
5330                out.push_str(" ] )");
5331            }
5332            out.push_str(" ]");
5333        }
5334        Block::Figure(attr, caption, body) => {
5335            out.push_str("Figure (");
5336            write_attr(attr, out);
5337            out.push_str(") ( Caption Nothing [");
5338            write_block_list(caption, out);
5339            out.push_str(" ] ) [");
5340            write_block_list(body, out);
5341            out.push_str(" ]");
5342        }
5343        Block::Unsupported(name) => {
5344            out.push_str(&format!("Unsupported {name:?}"));
5345        }
5346    }
5347}
5348
5349fn write_table(data: &TableData, out: &mut String) {
5350    out.push_str("Table (");
5351    write_attr(&data.attr, out);
5352    out.push_str(") ( Caption Nothing [");
5353    if !data.caption.is_empty() {
5354        out.push_str(" Plain [");
5355        write_inline_list(&data.caption, out);
5356        out.push_str(" ]");
5357    }
5358    out.push_str(" ] ) [");
5359    for (i, align) in data.aligns.iter().enumerate() {
5360        if i > 0 {
5361            out.push(',');
5362        }
5363        let width = data.widths.get(i).copied().unwrap_or(None);
5364        match width {
5365            None => out.push_str(&format!(" ( {align} , ColWidthDefault )")),
5366            Some(w) => out.push_str(&format!(" ( {align} , ColWidth {} )", show_double(w))),
5367        }
5368    }
5369    out.push_str(" ] ( TableHead ( \"\" , [ ] , [ ] ) [");
5370    for (i, row) in data.head_rows.iter().enumerate() {
5371        if i > 0 {
5372            out.push(',');
5373        }
5374        out.push(' ');
5375        write_table_row(row, out);
5376    }
5377    out.push_str(" ] ) [ TableBody ( \"\" , [ ] , [ ] ) ( RowHeadColumns 0 ) [ ] [");
5378    for (i, row) in data.body_rows.iter().enumerate() {
5379        if i > 0 {
5380            out.push(',');
5381        }
5382        out.push(' ');
5383        write_table_row(row, out);
5384    }
5385    out.push_str(" ] ] ( TableFoot ( \"\" , [ ] , [ ] ) [");
5386    for (i, row) in data.foot_rows.iter().enumerate() {
5387        if i > 0 {
5388            out.push(',');
5389        }
5390        out.push(' ');
5391        write_table_row(row, out);
5392    }
5393    out.push_str(" ] )");
5394}
5395
5396fn write_table_row(cells: &[GridCell], out: &mut String) {
5397    out.push_str("Row ( \"\" , [ ] , [ ] ) [");
5398    for (i, cell) in cells.iter().enumerate() {
5399        if i > 0 {
5400            out.push(',');
5401        }
5402        out.push_str(&format!(
5403            " Cell ( \"\" , [ ] , [ ] ) AlignDefault ( RowSpan {} ) ( ColSpan {} ) [",
5404            cell.row_span, cell.col_span
5405        ));
5406        if !cell.blocks.is_empty() {
5407            write_block_list(&cell.blocks, out);
5408        }
5409        out.push_str(" ]");
5410    }
5411    out.push_str(" ]");
5412}
5413
5414fn write_block_list(blocks: &[Block], out: &mut String) {
5415    for (i, b) in blocks.iter().enumerate() {
5416        if i > 0 {
5417            out.push(',');
5418        }
5419        out.push(' ');
5420        write_block(b, out);
5421    }
5422}
5423
5424fn write_inline_list(inlines: &[Inline], out: &mut String) {
5425    for (i, inline) in inlines.iter().enumerate() {
5426        if i > 0 {
5427            out.push(',');
5428        }
5429        out.push(' ');
5430        write_inline(inline, out);
5431    }
5432}
5433
5434fn write_inline(inline: &Inline, out: &mut String) {
5435    match inline {
5436        Inline::Str(s) => {
5437            out.push_str("Str ");
5438            write_haskell_string(s, out);
5439        }
5440        Inline::Space => out.push_str("Space"),
5441        Inline::SoftBreak => out.push_str("SoftBreak"),
5442        Inline::LineBreak => out.push_str("LineBreak"),
5443        Inline::Emph(children) => {
5444            out.push_str("Emph [");
5445            write_inline_list(children, out);
5446            out.push_str(" ]");
5447        }
5448        Inline::Strong(children) => {
5449            out.push_str("Strong [");
5450            write_inline_list(children, out);
5451            out.push_str(" ]");
5452        }
5453        Inline::Strikeout(children) => {
5454            out.push_str("Strikeout [");
5455            write_inline_list(children, out);
5456            out.push_str(" ]");
5457        }
5458        Inline::Superscript(children) => {
5459            out.push_str("Superscript [");
5460            write_inline_list(children, out);
5461            out.push_str(" ]");
5462        }
5463        Inline::Subscript(children) => {
5464            out.push_str("Subscript [");
5465            write_inline_list(children, out);
5466            out.push_str(" ]");
5467        }
5468        Inline::Code(attr, content) => {
5469            out.push_str("Code (");
5470            write_attr(attr, out);
5471            out.push_str(") ");
5472            write_haskell_string(content, out);
5473        }
5474        Inline::Link(attr, text, url, title) => {
5475            out.push_str("Link (");
5476            write_attr(attr, out);
5477            out.push_str(") [");
5478            write_inline_list(text, out);
5479            out.push_str(" ] ( ");
5480            write_haskell_string(url, out);
5481            out.push_str(" , ");
5482            write_haskell_string(title, out);
5483            out.push_str(" )");
5484        }
5485        Inline::Image(attr, alt, url, title) => {
5486            out.push_str("Image (");
5487            write_attr(attr, out);
5488            out.push_str(") [");
5489            write_inline_list(alt, out);
5490            out.push_str(" ] ( ");
5491            write_haskell_string(url, out);
5492            out.push_str(" , ");
5493            write_haskell_string(title, out);
5494            out.push_str(" )");
5495        }
5496        Inline::Math(kind, content) => {
5497            out.push_str("Math ");
5498            out.push_str(kind);
5499            out.push(' ');
5500            write_haskell_string(content, out);
5501        }
5502        Inline::Span(attr, children) => {
5503            out.push_str("Span (");
5504            write_attr(attr, out);
5505            out.push_str(") [");
5506            write_inline_list(children, out);
5507            out.push_str(" ]");
5508        }
5509        Inline::RawInline(format, content) => {
5510            out.push_str("RawInline ( Format ");
5511            write_haskell_string(format, out);
5512            out.push_str(" ) ");
5513            write_haskell_string(content, out);
5514        }
5515        Inline::Quoted(kind, children) => {
5516            out.push_str("Quoted ");
5517            out.push_str(kind);
5518            out.push_str(" [");
5519            write_inline_list(children, out);
5520            out.push_str(" ]");
5521        }
5522        Inline::Note(blocks) => {
5523            out.push_str("Note [");
5524            write_block_list(blocks, out);
5525            out.push_str(" ]");
5526        }
5527        Inline::Cite(citations, text) => {
5528            out.push_str("Cite [");
5529            for (i, c) in citations.iter().enumerate() {
5530                if i > 0 {
5531                    out.push(',');
5532                }
5533                out.push_str(" Citation { citationId = ");
5534                write_haskell_string(&c.id, out);
5535                out.push_str(" , citationPrefix = [");
5536                write_inline_list(&c.prefix, out);
5537                out.push_str(" ] , citationSuffix = [");
5538                write_inline_list(&c.suffix, out);
5539                out.push_str(" ] , citationMode = ");
5540                out.push_str(match c.mode {
5541                    CitationMode::AuthorInText => "AuthorInText",
5542                    CitationMode::NormalCitation => "NormalCitation",
5543                    CitationMode::SuppressAuthor => "SuppressAuthor",
5544                });
5545                out.push_str(&format!(
5546                    " , citationNoteNum = {} , citationHash = {} }}",
5547                    c.note_num, c.hash
5548                ));
5549            }
5550            out.push_str(" ] [");
5551            write_inline_list(text, out);
5552            out.push_str(" ]");
5553        }
5554        Inline::Unsupported(name) => {
5555            out.push_str(&format!("Unsupported {name:?}"));
5556        }
5557    }
5558}
5559
5560fn write_attr(attr: &Attr, out: &mut String) {
5561    out.push(' ');
5562    write_haskell_string(&attr.id, out);
5563    out.push_str(" , [");
5564    for (i, c) in attr.classes.iter().enumerate() {
5565        if i > 0 {
5566            out.push(',');
5567        }
5568        out.push(' ');
5569        write_haskell_string(c, out);
5570    }
5571    if !attr.classes.is_empty() {
5572        out.push(' ');
5573    }
5574    out.push_str("] , [");
5575    for (i, (k, v)) in attr.kvs.iter().enumerate() {
5576        if i > 0 {
5577            out.push(',');
5578        }
5579        out.push_str(" ( ");
5580        write_haskell_string(k, out);
5581        out.push_str(" , ");
5582        write_haskell_string(v, out);
5583        out.push_str(" )");
5584    }
5585    if !attr.kvs.is_empty() {
5586        out.push(' ');
5587    }
5588    out.push_str("] ");
5589}
5590
/// Appends `s` to `out` as a double-quoted Haskell string literal, following
/// the escaping rules of GHC's `show` for `String` (`showLitChar`):
///
/// - `"` and `\` are backslash-escaped;
/// - the control characters with a single-letter escape use it
///   (`\a \b \t \n \v \f \r`);
/// - every other character below U+0020 uses its ASCII mnemonic
///   (`\NUL`, `\SOH`, ..., `\US`), and U+007F is `\DEL`;
/// - printable ASCII (U+0020..=U+007E) is copied through;
/// - everything above U+007F becomes a decimal escape (`\160`).
///
/// The empty escape `\&` is inserted where the next character would
/// otherwise be absorbed into the preceding escape: a decimal digit after a
/// decimal escape (`\160\&33`), or `H` after `\SO` (which would read as
/// `\SOH`).
fn write_haskell_string(s: &str, out: &mut String) {
    // ASCII mnemonics for U+0000..=U+001F, indexed by code point.
    const CONTROL_NAMES: [&str; 32] = [
        "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS", "HT", "LF", "VT", "FF", "CR",
        "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM", "SUB",
        "ESC", "FS", "GS", "RS", "US",
    ];

    // Which kind of escape was written last, if it is one that can swallow
    // the following character.
    #[derive(Clone, Copy)]
    enum Pending {
        Nothing,
        Numeric,
        ShiftOut,
    }

    out.push('"');
    let mut pending = Pending::Nothing;
    for ch in s.chars() {
        let needs_gap = match pending {
            Pending::Nothing => false,
            Pending::Numeric => ch.is_ascii_digit(),
            Pending::ShiftOut => ch == 'H',
        };
        if needs_gap {
            out.push_str("\\&");
        }
        pending = Pending::Nothing;

        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\u{07}' => out.push_str("\\a"),
            '\u{08}' => out.push_str("\\b"),
            '\t' => out.push_str("\\t"),
            '\n' => out.push_str("\\n"),
            '\u{0b}' => out.push_str("\\v"),
            '\u{0c}' => out.push_str("\\f"),
            '\r' => out.push_str("\\r"),
            '\u{0e}' => {
                out.push_str("\\SO");
                pending = Pending::ShiftOut;
            }
            '\u{7f}' => out.push_str("\\DEL"),
            ' '..='~' => out.push(ch),
            '\u{00}'..='\u{1f}' => {
                // The range pattern bounds the index to 0..32.
                out.push('\\');
                out.push_str(CONTROL_NAMES[ch as usize]);
            }
            _ => {
                // Non-ASCII → decimal code point.
                out.push('\\');
                out.push_str(&(ch as u32).to_string());
                pending = Pending::Numeric;
            }
        }
    }
    out.push('"');
}
5637
5638// ----- pandoc JSON projection ---------------------------------------------
5639//
5640// Walks the same `Block`/`Inline` tree as `write_block`/`write_inline` but
5641// emits pandoc's JSON shape — `{"t": "Constructor", "c": <content>}`, with
5642// nullary constructors omitting `"c"`. See pandoc's
5643// `Text.Pandoc.Definition` ToJSON instances for the source of truth.
5644
5645fn attr_to_json(attr: &Attr) -> Value {
5646    let kvs: Vec<Value> = attr.kvs.iter().map(|(k, v)| json!([k, v])).collect();
5647    json!([attr.id, attr.classes, kvs])
5648}
5649
5650fn target_to_json(url: &str, title: &str) -> Value {
5651    json!([url, title])
5652}
5653
5654fn inlines_to_json(inlines: &[Inline]) -> Vec<Value> {
5655    inlines.iter().map(inline_to_json).collect()
5656}
5657
5658fn blocks_to_json(blocks: &[Block]) -> Vec<Value> {
5659    blocks.iter().map(block_to_json).collect()
5660}
5661
5662fn citation_to_json(c: &Citation) -> Value {
5663    let mode = match c.mode {
5664        CitationMode::AuthorInText => "AuthorInText",
5665        CitationMode::NormalCitation => "NormalCitation",
5666        CitationMode::SuppressAuthor => "SuppressAuthor",
5667    };
5668    json!({
5669        "citationId": c.id,
5670        "citationPrefix": inlines_to_json(&c.prefix),
5671        "citationSuffix": inlines_to_json(&c.suffix),
5672        "citationMode": { "t": mode },
5673        "citationNoteNum": c.note_num,
5674        "citationHash": c.hash,
5675    })
5676}
5677
5678fn inline_to_json(inline: &Inline) -> Value {
5679    match inline {
5680        Inline::Str(s) => json!({ "t": "Str", "c": s }),
5681        Inline::Space => json!({ "t": "Space" }),
5682        Inline::SoftBreak => json!({ "t": "SoftBreak" }),
5683        Inline::LineBreak => json!({ "t": "LineBreak" }),
5684        Inline::Emph(children) => json!({ "t": "Emph", "c": inlines_to_json(children) }),
5685        Inline::Strong(children) => json!({ "t": "Strong", "c": inlines_to_json(children) }),
5686        Inline::Strikeout(children) => {
5687            json!({ "t": "Strikeout", "c": inlines_to_json(children) })
5688        }
5689        Inline::Superscript(children) => {
5690            json!({ "t": "Superscript", "c": inlines_to_json(children) })
5691        }
5692        Inline::Subscript(children) => {
5693            json!({ "t": "Subscript", "c": inlines_to_json(children) })
5694        }
5695        Inline::Code(attr, content) => {
5696            json!({ "t": "Code", "c": [attr_to_json(attr), content] })
5697        }
5698        Inline::Link(attr, text, url, title) => json!({
5699            "t": "Link",
5700            "c": [attr_to_json(attr), inlines_to_json(text), target_to_json(url, title)],
5701        }),
5702        Inline::Image(attr, alt, url, title) => json!({
5703            "t": "Image",
5704            "c": [attr_to_json(attr), inlines_to_json(alt), target_to_json(url, title)],
5705        }),
5706        Inline::Math(kind, content) => json!({
5707            "t": "Math",
5708            "c": [{ "t": kind }, content],
5709        }),
5710        Inline::Span(attr, children) => json!({
5711            "t": "Span",
5712            "c": [attr_to_json(attr), inlines_to_json(children)],
5713        }),
5714        Inline::RawInline(format, content) => json!({
5715            "t": "RawInline",
5716            "c": [format, content],
5717        }),
5718        Inline::Quoted(kind, children) => json!({
5719            "t": "Quoted",
5720            "c": [{ "t": kind }, inlines_to_json(children)],
5721        }),
5722        Inline::Note(blocks) => json!({ "t": "Note", "c": blocks_to_json(blocks) }),
5723        Inline::Cite(citations, text) => json!({
5724            "t": "Cite",
5725            "c": [
5726                citations.iter().map(citation_to_json).collect::<Vec<_>>(),
5727                inlines_to_json(text),
5728            ],
5729        }),
5730        Inline::Unsupported(name) => json!({ "t": "Unsupported", "c": name }),
5731    }
5732}
5733
5734fn block_to_json(b: &Block) -> Value {
5735    match b {
5736        Block::Para(inlines) => json!({ "t": "Para", "c": inlines_to_json(inlines) }),
5737        Block::Plain(inlines) => json!({ "t": "Plain", "c": inlines_to_json(inlines) }),
5738        Block::Header(level, attr, inlines) => json!({
5739            "t": "Header",
5740            "c": [level, attr_to_json(attr), inlines_to_json(inlines)],
5741        }),
5742        Block::BlockQuote(blocks) => {
5743            json!({ "t": "BlockQuote", "c": blocks_to_json(blocks) })
5744        }
5745        Block::CodeBlock(attr, content) => json!({
5746            "t": "CodeBlock",
5747            "c": [attr_to_json(attr), content],
5748        }),
5749        Block::HorizontalRule => json!({ "t": "HorizontalRule" }),
5750        Block::BulletList(items) => {
5751            let items_json: Vec<Vec<Value>> = items.iter().map(|it| blocks_to_json(it)).collect();
5752            json!({ "t": "BulletList", "c": items_json })
5753        }
5754        Block::OrderedList(start, style, delim, items) => {
5755            let items_json: Vec<Vec<Value>> = items.iter().map(|it| blocks_to_json(it)).collect();
5756            json!({
5757                "t": "OrderedList",
5758                "c": [
5759                    [json!(start), json!({ "t": style }), json!({ "t": delim })],
5760                    items_json,
5761                ],
5762            })
5763        }
5764        Block::RawBlock(format, content) => json!({
5765            "t": "RawBlock",
5766            "c": [format, content],
5767        }),
5768        Block::Table(data) => table_to_json(data),
5769        Block::Div(attr, blocks) => json!({
5770            "t": "Div",
5771            "c": [attr_to_json(attr), blocks_to_json(blocks)],
5772        }),
5773        Block::LineBlock(lines) => {
5774            let lines_json: Vec<Vec<Value>> =
5775                lines.iter().map(|line| inlines_to_json(line)).collect();
5776            json!({ "t": "LineBlock", "c": lines_json })
5777        }
5778        Block::DefinitionList(items) => {
5779            let items_json: Vec<Value> = items
5780                .iter()
5781                .map(|(term, defs)| {
5782                    let defs_json: Vec<Vec<Value>> =
5783                        defs.iter().map(|d| blocks_to_json(d)).collect();
5784                    json!([inlines_to_json(term), defs_json])
5785                })
5786                .collect();
5787            json!({ "t": "DefinitionList", "c": items_json })
5788        }
5789        Block::Figure(attr, caption, body) => {
5790            // Pandoc's Caption shape: `[shortCaption_or_null, [blocks]]`.
5791            // panache stores the caption as a Vec<Block> directly; wrap it.
5792            let caption_json = json!([Value::Null, blocks_to_json(caption)]);
5793            json!({
5794                "t": "Figure",
5795                "c": [attr_to_json(attr), caption_json, blocks_to_json(body)],
5796            })
5797        }
5798        Block::Unsupported(name) => json!({ "t": "Unsupported", "c": name }),
5799    }
5800}
5801
5802fn table_to_json(data: &TableData) -> Value {
5803    // Caption: `[null, [Plain inlines]]` when non-empty, `[null, []]` when empty.
5804    let caption_blocks: Vec<Value> = if data.caption.is_empty() {
5805        Vec::new()
5806    } else {
5807        vec![json!({ "t": "Plain", "c": inlines_to_json(&data.caption) })]
5808    };
5809    let caption_json = json!([Value::Null, caption_blocks]);
5810
5811    // Column specs: pair each align constructor with its column-width
5812    // constructor — `ColWidthDefault` (nullary) or `ColWidth f` (with value).
5813    let colspecs: Vec<Value> = data
5814        .aligns
5815        .iter()
5816        .enumerate()
5817        .map(|(i, align)| {
5818            let width = data.widths.get(i).copied().unwrap_or(None);
5819            let width_json = match width {
5820                None => json!({ "t": "ColWidthDefault" }),
5821                Some(w) => json!({ "t": "ColWidth", "c": w }),
5822            };
5823            json!([{ "t": align }, width_json])
5824        })
5825        .collect();
5826
5827    let empty_attr = json!(["", Vec::<Value>::new(), Vec::<Value>::new()]);
5828
5829    let head_rows: Vec<Value> = data
5830        .head_rows
5831        .iter()
5832        .map(|r| table_row_to_json(r))
5833        .collect();
5834    let body_rows: Vec<Value> = data
5835        .body_rows
5836        .iter()
5837        .map(|r| table_row_to_json(r))
5838        .collect();
5839    let foot_rows: Vec<Value> = data
5840        .foot_rows
5841        .iter()
5842        .map(|r| table_row_to_json(r))
5843        .collect();
5844
5845    let table_head = json!([empty_attr, head_rows]);
5846    let table_bodies = json!([[empty_attr, 0, Vec::<Value>::new(), body_rows,]]);
5847    let table_foot = json!([empty_attr, foot_rows]);
5848
5849    json!({
5850        "t": "Table",
5851        "c": [
5852            attr_to_json(&data.attr),
5853            caption_json,
5854            colspecs,
5855            table_head,
5856            table_bodies,
5857            table_foot,
5858        ],
5859    })
5860}
5861
5862fn table_row_to_json(cells: &[GridCell]) -> Value {
5863    let empty_attr = json!(["", Vec::<Value>::new(), Vec::<Value>::new()]);
5864    let cells_json: Vec<Value> = cells
5865        .iter()
5866        .map(|cell| {
5867            json!([
5868                empty_attr,
5869                { "t": "AlignDefault" },
5870                cell.row_span,
5871                cell.col_span,
5872                blocks_to_json(&cell.blocks),
5873            ])
5874        })
5875        .collect();
5876    json!([empty_attr, cells_json])
5877}
5878
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse;
    use serde_json::Value;

    /// Parses `input`, renders it with `to_pandoc_json`, and decodes the
    /// rendered text back into a JSON value. Panics if the text is not
    /// valid JSON.
    fn parse_to_json(input: &str) -> Value {
        let syntax_tree = parse(input, None);
        let rendered = to_pandoc_json(&syntax_tree);
        serde_json::from_str(&rendered).expect("to_pandoc_json must emit valid JSON")
    }

    #[test]
    fn empty_doc_emits_envelope_with_no_blocks() {
        let doc = parse_to_json("");
        assert_eq!(
            doc["pandoc-api-version"],
            serde_json::json!([1, 23, 1, 1])
        );
        assert_eq!(doc["meta"], serde_json::json!({}));
        assert_eq!(doc["blocks"], serde_json::json!([]));
    }

    #[test]
    fn paragraph_with_str_emits_para_str_shape() {
        let doc = parse_to_json("hello");
        let top_level = doc["blocks"].as_array().expect("blocks is array");
        assert_eq!(top_level.len(), 1);

        let paragraph = &top_level[0];
        assert_eq!(paragraph["t"], "Para");

        let children = paragraph["c"].as_array().expect("Para.c is array");
        assert_eq!(children.len(), 1);
        let only_child = &children[0];
        assert_eq!(only_child["t"], "Str");
        assert_eq!(only_child["c"], "hello");
    }

    #[test]
    fn nullary_constructors_omit_c_key() {
        // "a b" yields [Str "a", Space, Str "b"]; `Space` is the nullary
        // constructor under test.
        let doc = parse_to_json("a b");
        let children = doc["blocks"][0]["c"]
            .as_array()
            .expect("Para.c is array");
        let space = children
            .iter()
            .find(|node| node["t"] == "Space")
            .expect("Space inline present");
        let space_obj = space.as_object().expect("Space is JSON object");
        assert!(
            !space_obj.contains_key("c"),
            "nullary constructors must omit the \"c\" key, got {space:?}",
        );
    }

    #[test]
    fn header_attr_shape_matches_pandoc_tuple() {
        // Expected native form:
        // Header 1 ("foo", ["bar"], [("key","val")]) [Str "Hi"]
        let doc = parse_to_json("# Hi {#foo .bar key=val}");
        let header = &doc["blocks"][0];
        assert_eq!(header["t"], "Header");

        let payload = header["c"].as_array().expect("Header.c is array");
        assert_eq!(payload.len(), 3);
        assert_eq!(payload[0], 1, "level");

        // The attr is a triple: [id, [classes], [[key, value], ...]].
        let attr = payload[1].as_array().expect("attr tuple");
        assert_eq!(attr[0], "foo");
        assert_eq!(attr[1], serde_json::json!(["bar"]));
        assert_eq!(attr[2], serde_json::json!([["key", "val"]]));
    }
}
panache_parser/pandoc_ast.rs

panache_parser/
pandoc_ast.rs