panache_parser/
pandoc_ast.rs

1//! CST → Pandoc-native AST text projector.
2//!
3//! Walks a panache [`SyntaxNode`] and emits a string in the textual shape of
4//! pandoc's `Pandoc [Block]` AST — the same format produced by
5//! `pandoc -f markdown -t native`. Exposed via [`to_pandoc_ast`] and the
6//! `panache parse --to pandoc-ast` CLI mode; also drives the pandoc
7//! conformance harness in `tests/pandoc.rs`.
8//!
9//! Coverage is intentionally narrow. Unsupported nodes emit
10//! `Unsupported "<KIND>"` so a failing case stays visibly failing rather
11//! than silently dropping content; expand coverage as the corpus grows.
12//!
13//! Output shape matches pandoc 3.9.0.2 with default-standalone-off behavior:
14//! the document is rendered as a bare block list `[ <block>, ... ]`. The
15//! comparison normalizer collapses whitespace runs, so ppShow's pretty-print
16//! line breaks/indentation are not load-bearing.
17
18use std::cell::RefCell;
19use std::collections::{HashMap, HashSet};
20
21use crate::SyntaxNode;
22use crate::syntax::SyntaxKind;
23use rowan::NodeOrToken;
24
/// Document-wide lookup tables computed by the `build_refs_ctx` pre-pass and
/// read back through the `REFS_CTX` thread-local while blocks and inlines
/// are rendered.
#[derive(Default)]
struct RefsCtx {
    /// Reference-definition label (normalized with `normalize_ref_label`)
    /// → `(url, title)`. When a label is defined more than once, the first
    /// definition encountered is kept.
    refs: HashMap<String, (String, String)>,
    /// Every final heading id recorded by the pre-pass. `lookup_heading_id`
    /// consults this set to decide whether a slug names a heading.
    heading_ids: HashSet<String>,
    /// Heading text-range start → final disambiguated id. Lets
    /// `heading_block` look up the document-level id (with `section`
    /// fallback for empty slugs and `-1`/`-2` suffixes for duplicates)
    /// that was computed during the pre-pass.
    heading_id_by_offset: HashMap<u32, String>,
    /// Footnote label → parsed body blocks. Lookup keyed by the raw label
    /// id text (no normalization needed — pandoc footnote labels are
    /// case-sensitive and not whitespace-collapsed).
    footnotes: HashMap<String, Vec<Block>>,
    /// Example-list label (`@label`) → resolved item number. Pandoc
    /// numbers all `OrderedList(_, Example, _)` items across the entire
    /// document with one shared counter; labeled items also become
    /// referenceable so inline `@label` resolves to the item's number.
    example_label_to_num: HashMap<String, usize>,
    /// Example-list start number per `LIST` text-range start. Looked up
    /// in `ordered_list_attrs` so each Example list reports the first
    /// item's number — picking up where the previous Example list left
    /// off rather than restarting at 1.
    example_list_start_by_offset: HashMap<u32, usize>,
    /// Note number per `CITATION` text-range start. Pandoc assigns each
    /// inline-cite group (and each footnote, regardless of inner cites)
    /// a position-counter value; cites inside a footnote share its number.
    cite_note_num_by_offset: HashMap<u32, i64>,
}
53
thread_local! {
    // Per-thread rendering context. `to_pandoc_ast` installs a freshly built
    // `RefsCtx` here before rendering and resets it to the default afterwards;
    // lookup helpers such as `lookup_ref` and `lookup_heading_id` read it.
    static REFS_CTX: RefCell<RefsCtx> = RefCell::new(RefsCtx::default());
}
57
58/// Render the given panache CST as pandoc-native AST text.
59///
60/// Output mirrors `pandoc -f markdown -t native` for supported constructs.
61/// Unsupported nodes emit a visible `Unsupported "<KIND>"` sentinel rather
62/// than silently dropping content. Pair with [`normalize_native`] when
63/// comparing against captured pandoc output to ignore pretty-print
64/// whitespace differences.
65pub fn to_pandoc_ast(tree: &SyntaxNode) -> String {
66    let ctx = build_refs_ctx(tree);
67    REFS_CTX.with(|c| *c.borrow_mut() = ctx);
68    let blocks = blocks_from_doc(tree);
69    let mut out = String::new();
70    out.push('[');
71    for (i, b) in blocks.iter().enumerate() {
72        if i > 0 {
73            out.push(',');
74        }
75        out.push(' ');
76        write_block(b, &mut out);
77    }
78    out.push_str(" ]");
79    REFS_CTX.with(|c| *c.borrow_mut() = RefsCtx::default());
80    out
81}
82
/// Runs the document pre-pass over `tree` and returns the populated
/// [`RefsCtx`].
///
/// The order of the passes is significant: cite note numbers and example
/// numbering are computed first and copied into the `REFS_CTX` thread-local
/// before `collect_refs_and_headings` runs, because that last pass parses
/// footnote bodies. The returned context is not installed here; the caller
/// is responsible for publishing it.
fn build_refs_ctx(tree: &SyntaxNode) -> RefsCtx {
    let mut ctx = RefsCtx::default();
    // Cite note-num assignment runs first so it is populated before footnote
    // bodies are parsed (which would otherwise call `render_citation_inline`
    // with the lookup map empty and fall back to noteNum=1).
    collect_cite_note_nums(tree, &mut ctx);
    // Same reason: example-list numbering is referenced from `inlines_from`
    // paths that run during `parse_footnote_def` below — populate it
    // up-front. (The reference and heading-id maps are NOT available at this
    // point; they are only filled by `collect_refs_and_headings`.)
    let mut example_counter: usize = 0;
    collect_example_numbering(tree, &mut ctx, &mut example_counter);
    // Promoting the in-progress ctx into REFS_CTX lets the footnote-body
    // parser see the cite-note and example-numbering maps that were just
    // computed. Without this, `parse_footnote_def` (called transitively from
    // `collect_refs_and_headings` below) reads an empty thread-local.
    // Only these three maps are copied; every other field of the
    // thread-local keeps whatever value it already had.
    // NOTE(review): footnote bodies parsed below therefore cannot see `refs`
    // or `heading_ids` from this document — confirm whether reference links
    // inside footnotes are resolved at this stage or later.
    REFS_CTX.with(|c| {
        let mut borrowed = c.borrow_mut();
        borrowed.cite_note_num_by_offset = ctx.cite_note_num_by_offset.clone();
        borrowed.example_label_to_num = ctx.example_label_to_num.clone();
        borrowed.example_list_start_by_offset = ctx.example_list_start_by_offset.clone();
    });
    // Tracks how many times each auto-generated heading slug has been used
    // so that duplicates can be suffixed.
    let mut seen_ids: HashMap<String, u32> = HashMap::new();
    collect_refs_and_headings(tree, &mut ctx, &mut seen_ids);
    ctx
}
108
109/// Walk every inline tree under `tree` and assign a `citationNoteNum` to
110/// each `CITATION` node. Pandoc's rule: outside footnotes, each Cite group
111/// (one CITATION node, regardless of internal `;`-separated keys) gets a
112/// fresh counter value; footnotes increment the counter once on entry,
113/// then ALL cites inside the footnote share that value.
114fn collect_cite_note_nums(tree: &SyntaxNode, ctx: &mut RefsCtx) {
115    let mut footnote_def_nodes: HashMap<String, SyntaxNode> = HashMap::new();
116    for child in tree.descendants() {
117        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION
118            && let Some(label) = footnote_label(&child)
119        {
120            footnote_def_nodes.entry(label).or_insert(child);
121        }
122    }
123    let mut counter: i64 = 0;
124    for child in tree.children() {
125        if child.kind() == SyntaxKind::FOOTNOTE_DEFINITION {
126            continue;
127        }
128        visit_for_cite_nums(&child, &footnote_def_nodes, &mut counter, None, ctx);
129    }
130}
131
132fn visit_for_cite_nums(
133    node: &SyntaxNode,
134    fn_defs: &HashMap<String, SyntaxNode>,
135    counter: &mut i64,
136    in_fn: Option<i64>,
137    ctx: &mut RefsCtx,
138) {
139    for el in node.children_with_tokens() {
140        if let NodeOrToken::Node(n) = el {
141            match n.kind() {
142                SyntaxKind::CITATION => {
143                    let offset: u32 = n.text_range().start().into();
144                    let num = if let Some(fn_num) = in_fn {
145                        fn_num
146                    } else {
147                        *counter += 1;
148                        *counter
149                    };
150                    ctx.cite_note_num_by_offset.insert(offset, num);
151                }
152                SyntaxKind::FOOTNOTE_REFERENCE => {
153                    if in_fn.is_none() {
154                        *counter += 1;
155                        let fn_num = *counter;
156                        if let Some(label) = footnote_label(&n)
157                            && let Some(def) = fn_defs.get(&label)
158                        {
159                            visit_for_cite_nums(def, fn_defs, counter, Some(fn_num), ctx);
160                        }
161                    }
162                }
163                _ => visit_for_cite_nums(&n, fn_defs, counter, in_fn, ctx),
164            }
165        }
166    }
167}
168
169/// Walk every `LIST` in document order and assign Example-list numbers.
170/// Pandoc tracks one counter across all `OrderedList(_, Example, _)` lists
171/// in a document, so each subsequent Example list picks up where the prior
172/// one left off. Labeled items (`(@label)`) get a label → number mapping
173/// for inline `@label` reference resolution.
174fn collect_example_numbering(node: &SyntaxNode, ctx: &mut RefsCtx, counter: &mut usize) {
175    for child in node.children() {
176        if child.kind() == SyntaxKind::LIST && list_is_example(&child) {
177            let list_offset: u32 = child.text_range().start().into();
178            ctx.example_list_start_by_offset
179                .insert(list_offset, *counter + 1);
180            for item in child
181                .children()
182                .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
183            {
184                *counter += 1;
185                if let Some(label) = example_item_label(&item) {
186                    ctx.example_label_to_num.entry(label).or_insert(*counter);
187                }
188            }
189            // Recurse into the list's contents to pick up nested Example
190            // lists (rare but possible).
191            collect_example_numbering(&child, ctx, counter);
192        } else {
193            collect_example_numbering(&child, ctx, counter);
194        }
195    }
196}
197
198/// `(@)` / `(@label)` markers identify Example list items. Returns true
199/// iff the LIST's first item carries such a marker (pandoc decides the
200/// list style from the first marker only).
201fn list_is_example(list: &SyntaxNode) -> bool {
202    let Some(item) = list.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
203        return false;
204    };
205    let marker = list_item_marker_text(&item);
206    let trimmed = marker.trim();
207    let body = if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
208        inner
209    } else if let Some(inner) = trimmed.strip_suffix(')') {
210        inner
211    } else if let Some(inner) = trimmed.strip_suffix('.') {
212        inner
213    } else {
214        trimmed
215    };
216    body.starts_with('@')
217        && body[1..]
218            .chars()
219            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
220}
221
222fn list_item_marker_text(item: &SyntaxNode) -> String {
223    item.children_with_tokens()
224        .filter_map(|el| el.into_token())
225        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
226        .map(|t| t.text().to_string())
227        .unwrap_or_default()
228}
229
230/// Returns the `@label` text for an Example list item, or `None` for the
231/// unlabeled `(@)` form.
232fn example_item_label(item: &SyntaxNode) -> Option<String> {
233    let marker = list_item_marker_text(item);
234    let trimmed = marker.trim();
235    let body = trimmed
236        .strip_prefix('(')
237        .and_then(|s| s.strip_suffix(')'))
238        .or_else(|| trimmed.strip_suffix(')'))
239        .or_else(|| trimmed.strip_suffix('.'))
240        .unwrap_or(trimmed);
241    let label = body.strip_prefix('@')?;
242    if label.is_empty() {
243        None
244    } else {
245        Some(label.to_string())
246    }
247}
248
249fn collect_refs_and_headings(
250    node: &SyntaxNode,
251    ctx: &mut RefsCtx,
252    seen_ids: &mut HashMap<String, u32>,
253) {
254    for child in node.children() {
255        match child.kind() {
256            SyntaxKind::REFERENCE_DEFINITION => {
257                if let Some((label, url, title)) = parse_reference_def(&child) {
258                    ctx.refs
259                        .entry(normalize_ref_label(&label))
260                        .or_insert((url, title));
261                }
262            }
263            SyntaxKind::FOOTNOTE_DEFINITION => {
264                if let Some((label, blocks)) = parse_footnote_def(&child) {
265                    ctx.footnotes.entry(label).or_insert(blocks);
266                }
267            }
268            SyntaxKind::HEADING => {
269                let (id, was_explicit) = heading_id_with_explicitness(&child);
270                let final_id = if was_explicit {
271                    // Explicit `{#x}` ids are kept verbatim; pandoc only
272                    // warns on conflicts but does not auto-disambiguate.
273                    seen_ids.entry(id.clone()).or_insert(0);
274                    id
275                } else {
276                    let mut base = id;
277                    if base.is_empty() {
278                        base = "section".to_string();
279                    }
280                    let count = seen_ids.entry(base.clone()).or_insert(0);
281                    let id = if *count == 0 {
282                        base
283                    } else {
284                        format!("{base}-{count}")
285                    };
286                    *count += 1;
287                    id
288                };
289                if !final_id.is_empty() {
290                    let offset: u32 = child.text_range().start().into();
291                    ctx.heading_ids.insert(final_id.clone());
292                    ctx.heading_id_by_offset.insert(offset, final_id);
293                }
294                collect_refs_and_headings(&child, ctx, seen_ids);
295            }
296            _ => collect_refs_and_headings(&child, ctx, seen_ids),
297        }
298    }
299}
300
301/// Returns `(id, was_explicit)` for a HEADING node. Explicit ids come from
302/// `{#id}` attributes; the auto-id is the slugified plaintext (which may be
303/// empty for headings whose text contains no slug-eligible characters).
304fn heading_id_with_explicitness(node: &SyntaxNode) -> (String, bool) {
305    let inlines = node
306        .children()
307        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
308        .map(|c| coalesce_inlines(inlines_from(&c)))
309        .unwrap_or_default();
310    let attr = node.children_with_tokens().find_map(|el| match el {
311        NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
312        NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => Some(t.text().to_string()),
313        _ => None,
314    });
315    if let Some(raw) = attr {
316        let trimmed = raw.trim();
317        if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
318            let parsed = parse_attr_block(inner);
319            if !parsed.id.is_empty() {
320                return (parsed.id, true);
321            }
322        }
323    }
324    (pandoc_slugify(&inlines_to_plaintext(&inlines)), false)
325}
326
327fn parse_footnote_def(node: &SyntaxNode) -> Option<(String, Vec<Block>)> {
328    let label = footnote_label(node)?;
329    let mut blocks = Vec::new();
330    for child in node.children() {
331        // The CST keeps each footnote-body line at its full raw indentation
332        // (the 4-space body indent plus any nested-block indent). Most blocks
333        // recover transparently because `coalesce_inlines` trims leading
334        // spaces on paragraph content, but indented code blocks preserve all
335        // leading whitespace — strip the 4 footnote-body spaces in addition
336        // to the code block's own 4.
337        if child.kind() == SyntaxKind::CODE_BLOCK
338            && !child
339                .children()
340                .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
341        {
342            blocks.push(indented_code_block_with_extra_strip(&child, 4));
343        } else {
344            collect_block(&child, &mut blocks);
345        }
346    }
347    Some((label, blocks))
348}
349
350fn indented_code_block_with_extra_strip(node: &SyntaxNode, extra: usize) -> Block {
351    let raw_format = code_block_raw_format(node);
352    let attr = code_block_attr(node);
353    let is_fenced = node
354        .children()
355        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
356    let mut content = String::new();
357    for child in node.children() {
358        if child.kind() == SyntaxKind::CODE_CONTENT {
359            content.push_str(&child.text().to_string());
360        }
361    }
362    while content.ends_with('\n') {
363        content.pop();
364    }
365    // Pandoc expands tabs (4-col stops) on code-block bodies before any
366    // indent stripping, so a `:\t` marker followed by `\t\t\tcode` correctly
367    // becomes `"        code"` after the 4-col definition-content offset is
368    // stripped. Apply expansion first, then strip.
369    content = content
370        .split('\n')
371        .map(expand_tabs_to_4)
372        .collect::<Vec<_>>()
373        .join("\n");
374    content = strip_leading_spaces_per_line(&content, extra);
375    if !is_fenced {
376        content = strip_indented_code_indent(&content);
377    }
378    if let Some(fmt) = raw_format {
379        return Block::RawBlock(fmt, content);
380    }
381    Block::CodeBlock(attr, content)
382}
383
/// Removes at most `n` leading space characters from every `\n`-separated
/// line of `s`. Only U+0020 is stripped; tabs and other whitespace are left
/// in place, and stripping stops at the first non-space character.
fn strip_leading_spaces_per_line(s: &str, n: usize) -> String {
    s.split('\n')
        .map(|line| {
            // A space is a single byte, so the count is also a valid byte
            // offset into the line.
            let spaces = line.bytes().take(n).take_while(|&b| b == b' ').count();
            &line[spaces..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
395
396fn footnote_label(node: &SyntaxNode) -> Option<String> {
397    for el in node.children_with_tokens() {
398        if let NodeOrToken::Token(t) = el
399            && t.kind() == SyntaxKind::FOOTNOTE_LABEL_ID
400        {
401            return Some(t.text().to_string());
402        }
403    }
404    None
405}
406
407fn parse_reference_def(node: &SyntaxNode) -> Option<(String, String, String)> {
408    let link = node.children().find(|c| c.kind() == SyntaxKind::LINK)?;
409    let label_node = link
410        .children()
411        .find(|c| c.kind() == SyntaxKind::LINK_TEXT)?;
412    let label = label_node.text().to_string();
413
414    let mut tail = String::new();
415    let mut after_link = false;
416    for el in node.children_with_tokens() {
417        if after_link {
418            match el {
419                NodeOrToken::Token(t) => tail.push_str(t.text()),
420                NodeOrToken::Node(n) => tail.push_str(&n.text().to_string()),
421            }
422        } else if let NodeOrToken::Node(n) = &el
423            && n.kind() == SyntaxKind::LINK
424        {
425            after_link = true;
426        }
427    }
428
429    let trimmed = tail.trim_start();
430    let rest = trimmed.strip_prefix(':')?;
431    let after_colon = rest.trim_start();
432    let (url, after_url) = parse_ref_url(after_colon);
433    let title = parse_dest_title(after_url.trim());
434    Some((unescape_label(&label), url, title))
435}
436
/// Splits a reference destination off the front of `s`.
///
/// Leading whitespace is skipped. A destination of the form `<...>` yields
/// the text between the brackets and the remainder after the first `>`.
/// Otherwise the destination runs up to the first whitespace character (or
/// the end of input), and the remainder starts at that whitespace.
fn parse_ref_url(s: &str) -> (String, &str) {
    let input = s.trim_start();

    let bracketed = input
        .strip_prefix('<')
        .and_then(|inner| inner.split_once('>'));
    if let Some((url, remainder)) = bracketed {
        return (url.to_string(), remainder);
    }

    let cut = input.find(char::is_whitespace).unwrap_or(input.len());
    let (url, remainder) = input.split_at(cut);
    (url.to_string(), remainder)
}
447
448fn unescape_label(label: &str) -> String {
449    let mut out = String::with_capacity(label.len());
450    let mut chars = label.chars().peekable();
451    while let Some(ch) = chars.next() {
452        if ch == '\\'
453            && let Some(&next) = chars.peek()
454            && is_ascii_punct(next)
455        {
456            out.push(next);
457            chars.next();
458        } else {
459            out.push(ch);
460        }
461    }
462    out
463}
464
/// Reports whether `c` is an ASCII punctuation character.
fn is_ascii_punct(c: char) -> bool {
    // `is_ascii_punctuation` is already false for every non-ASCII char.
    c.is_ascii_punctuation()
}
468
469/// Pandoc/CommonMark reference-label normalization: case-fold and collapse
470/// runs of whitespace to a single space, with leading/trailing trimmed.
471fn normalize_ref_label(label: &str) -> String {
472    let unescaped = unescape_label(label);
473    let mut out = String::new();
474    let mut last_space = false;
475    for ch in unescaped.chars() {
476        if ch.is_whitespace() {
477            if !out.is_empty() && !last_space {
478                out.push(' ');
479                last_space = true;
480            }
481        } else {
482            for lc in ch.to_lowercase() {
483                out.push(lc);
484            }
485            last_space = false;
486        }
487    }
488    if last_space {
489        out.pop();
490    }
491    out
492}
493
494fn lookup_ref(label: &str) -> Option<(String, String)> {
495    let key = normalize_ref_label(label);
496    REFS_CTX.with(|c| c.borrow().refs.get(&key).cloned())
497}
498
499fn lookup_heading_id(label: &str) -> Option<String> {
500    let id = pandoc_slugify(&unescape_label(label));
501    if id.is_empty() {
502        return None;
503    }
504    REFS_CTX.with(|c| {
505        if c.borrow().heading_ids.contains(&id) {
506            Some(id)
507        } else {
508            None
509        }
510    })
511}
512
/// Canonical form of a Pandoc-native AST string.
///
/// The input is split into tokens — the delimiters `[ ] ( ) ,`, double
/// quoted string literals (backslash escapes respected, copied verbatim),
/// and bare words — and the tokens are re-joined with single spaces, so
/// pretty-print line breaks and indentation no longer affect equality.
/// An unterminated string literal extends to the end of the input.
pub fn normalize_native(s: &str) -> String {
    const fn is_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t' | b'\n' | b'\r')
    }
    const fn is_delimiter(b: u8) -> bool {
        matches!(b, b'[' | b']' | b'(' | b')' | b',')
    }

    let raw = s.as_bytes();
    let mut pieces: Vec<&str> = Vec::new();
    let mut pos = 0usize;

    while let Some(&lead) = raw.get(pos) {
        if is_blank(lead) {
            pos += 1;
            continue;
        }

        let begin = pos;
        if is_delimiter(lead) {
            pos += 1;
        } else if lead == b'"' {
            // String literal: advance to just past the closing unescaped
            // quote, treating `\x` as an opaque two-byte unit.
            pos += 1;
            while let Some(&b) = raw.get(pos) {
                match b {
                    b'\\' if pos + 1 < raw.len() => pos += 2,
                    b'"' => {
                        pos += 1;
                        break;
                    }
                    _ => pos += 1,
                }
            }
        } else {
            // Bare word: runs until whitespace, a delimiter, or a quote.
            pos += raw[pos..]
                .iter()
                .take_while(|&&b| !(is_blank(b) || is_delimiter(b) || b == b'"'))
                .count();
        }
        pieces.push(&s[begin..pos]);
    }

    pieces.join(" ")
}
570
// Variant names mirror Pandoc's `Text.Pandoc.Definition` constructors so the
// emission code reads 1:1 against pandoc-native — `BlockQuote`, `CodeBlock`,
// `BulletList`, `OrderedList` are not redundant here, they are the spec names.
/// Block-level node of the projected pandoc AST.
#[derive(Debug)]
#[allow(clippy::enum_variant_names)]
enum Block {
    /// Paragraph; built from `PARAGRAPH` nodes.
    Para(Vec<Inline>),
    /// Plain inline content; built from `PLAIN` nodes.
    Plain(Vec<Inline>),
    /// Heading: `(level, attr, inlines)`.
    Header(usize, Attr, Vec<Inline>),
    /// Block quote holding its child blocks.
    BlockQuote(Vec<Block>),
    /// Code block: `(attr, content)`.
    CodeBlock(Attr, String),
    HorizontalRule,
    /// Bullet list; one `Vec<Block>` per item.
    BulletList(Vec<Vec<Block>>),
    /// Ordered list; one `Vec<Block>` per item.
    /// NOTE(review): the leading fields are presumably
    /// `(start number, number style, delimiter)` — confirm against the
    /// list-building code.
    OrderedList(usize, &'static str, &'static str, Vec<Vec<Block>>),
    /// Raw block: `(format, content)`.
    RawBlock(String, String),
    Table(TableData),
    /// Div: `(attr, child blocks)`.
    Div(Attr, Vec<Block>),
    /// Line block; one `Vec<Inline>` per line.
    LineBlock(Vec<Vec<Inline>>),
    /// Definition list; each entry pairs a term's inlines with its
    /// definitions (each definition being a list of blocks).
    DefinitionList(Vec<(Vec<Inline>, Vec<Vec<Block>>)>),
    /// `Figure attr (Caption Nothing [caption-blocks]) [body-blocks]` —
    /// pandoc's implicit_figures wraps an image-only paragraph whose
    /// alt text becomes the caption and whose body re-includes the
    /// image as a Plain block.
    Figure(Attr, Vec<Block>, Vec<Block>),
    /// Sentinel for node kinds the projector does not handle; carries the
    /// `Debug` rendering of the syntax kind.
    Unsupported(String),
}
597
/// Payload of a `Block::Table`, shared by the pipe, simple, grid and
/// multiline table builders.
#[derive(Debug)]
struct TableData {
    /// Pandoc's `+caption_attributes` extension lifts a trailing
    /// `{#id .class kv=...}` from the caption text into the Table's outer
    /// attribute. Default-empty for tables without caption attributes.
    attr: Attr,
    /// Caption inlines.
    caption: Vec<Inline>,
    /// Per-column alignment.
    /// NOTE(review): presumably pandoc alignment constructor names —
    /// confirm against the table builders.
    aligns: Vec<&'static str>,
    /// Per-column width. `None` → `ColWidthDefault`, `Some(f)` → `ColWidth f`.
    widths: Vec<Option<f64>>,
    /// Header rows; each row is a list of cells.
    head_rows: Vec<Vec<GridCell>>,
    /// Body rows; each row is a list of cells.
    body_rows: Vec<Vec<GridCell>>,
    /// Footer rows. Currently only populated for grid tables with a
    /// trailing `+===+===+` separator before the final body row(s).
    foot_rows: Vec<Vec<GridCell>>,
}
614
/// One cell in a `TableData` row. `row_span`/`col_span` default to 1 for
/// pipe/simple/multiline tables (which don't model spans). Grid tables
/// compute proper span counts via the layout algorithm in `grid_table`.
#[derive(Debug)]
struct GridCell {
    /// Number of rows the cell covers.
    row_span: u32,
    /// Number of columns the cell covers.
    col_span: u32,
    /// Block content of the cell.
    blocks: Vec<Block>,
}
624
625impl GridCell {
626    fn no_span(blocks: Vec<Block>) -> Self {
627        Self {
628            row_span: 1,
629            col_span: 1,
630            blocks,
631        }
632    }
633}
634
/// Inline-level node of the projected pandoc AST.
/// NOTE(review): variant names appear to follow pandoc's `Inline`
/// constructors, like `Block` does — confirm against the emission code.
#[derive(Debug)]
#[allow(clippy::enum_variant_names)]
enum Inline {
    Str(String),
    Space,
    SoftBreak,
    LineBreak,
    Emph(Vec<Inline>),
    Strong(Vec<Inline>),
    Strikeout(Vec<Inline>),
    Superscript(Vec<Inline>),
    Subscript(Vec<Inline>),
    /// Inline code: attribute plus a string (presumably the code text).
    Code(Attr, String),
    /// Link; presumably `(attr, text inlines, url, title)`, parallel to
    /// `Image` — TODO confirm.
    Link(Attr, Vec<Inline>, String, String),
    /// Image: `(attr, alt inlines, url, title)`.
    Image(Attr, Vec<Inline>, String, String),
    /// Math; the static string presumably names the math type — TODO confirm.
    Math(&'static str, String),
    Span(Attr, Vec<Inline>),
    /// Raw inline; presumably `(format, content)` like `Block::RawBlock`.
    RawInline(String, String),
    /// Quoted inlines; the static string presumably names the quote type.
    Quoted(&'static str, Vec<Inline>),
    /// Footnote content as blocks.
    Note(Vec<Block>),
    /// Citation group plus a list of inlines.
    Cite(Vec<Citation>, Vec<Inline>),
    /// Sentinel for unsupported inline content.
    Unsupported(String),
}
658
/// One citation inside an `Inline::Cite` group.
/// NOTE(review): field names appear to follow pandoc's `Citation` record —
/// confirm against the emission code.
#[derive(Debug)]
struct Citation {
    /// Citation key.
    id: String,
    /// Inlines before the key.
    prefix: Vec<Inline>,
    /// Inlines after the key.
    suffix: Vec<Inline>,
    mode: CitationMode,
    /// Note number; presumably taken from `RefsCtx::cite_note_num_by_offset`.
    note_num: i64,
    hash: i64,
}
668
/// How a citation is presented.
/// NOTE(review): variant names appear to follow pandoc's `CitationMode`
/// constructors — confirm against the emission code.
#[derive(Debug, Clone, Copy)]
enum CitationMode {
    AuthorInText,
    NormalCitation,
    SuppressAuthor,
}
675
/// Attribute triple attached to headers, code, divs, images and similar
/// nodes: an identifier, a list of classes, and key-value pairs. The
/// default value has an empty id and no classes or pairs.
#[derive(Debug, Default, Clone)]
struct Attr {
    /// Identifier; empty when the element has none.
    id: String,
    /// Class names, in order.
    classes: Vec<String>,
    /// Key-value pairs, in order.
    kvs: Vec<(String, String)>,
}
682
683// ----- block-level walking ------------------------------------------------
684
685fn blocks_from_doc(doc: &SyntaxNode) -> Vec<Block> {
686    let mut out = Vec::new();
687    for child in doc.children() {
688        collect_block(&child, &mut out);
689    }
690    out
691}
692
693fn block_from(node: &SyntaxNode) -> Option<Block> {
694    match node.kind() {
695        SyntaxKind::PARAGRAPH => Some(Block::Para(coalesce_inlines(inlines_from(node)))),
696        SyntaxKind::PLAIN => Some(Block::Plain(coalesce_inlines(inlines_from(node)))),
697        SyntaxKind::HEADING => Some(heading_block(node)),
698        SyntaxKind::BLOCK_QUOTE => Some(Block::BlockQuote(blockquote_blocks(node))),
699        SyntaxKind::CODE_BLOCK => Some(code_block(node)),
700        SyntaxKind::HORIZONTAL_RULE => Some(Block::HorizontalRule),
701        SyntaxKind::LIST => Some(list_block(node)),
702        SyntaxKind::BLANK_LINE => None,
703        // Reference definitions don't appear in pandoc-native output (they
704        // resolve into the link they define).
705        SyntaxKind::REFERENCE_DEFINITION => None,
706        // Footnote definitions are pulled into Note inlines at the
707        // FOOTNOTE_REFERENCE site; the definition block itself is dropped.
708        SyntaxKind::FOOTNOTE_DEFINITION => None,
709        // YAML metadata becomes the document Meta wrapper, not a body block.
710        // The projector emits a bare block list, so just drop these.
711        SyntaxKind::YAML_METADATA => None,
712        // Pandoc title block (`% title\n% authors\n% date`) populates Meta
713        // and produces no body block.
714        SyntaxKind::PANDOC_TITLE_BLOCK => None,
715        SyntaxKind::HTML_BLOCK => Some(html_block(node)),
716        SyntaxKind::PIPE_TABLE => pipe_table(node).map(Block::Table),
717        SyntaxKind::SIMPLE_TABLE => simple_table(node).map(Block::Table),
718        SyntaxKind::GRID_TABLE => grid_table(node).map(Block::Table),
719        SyntaxKind::MULTILINE_TABLE => multiline_table(node).map(Block::Table),
720        SyntaxKind::TEX_BLOCK => Some(tex_block(node)),
721        SyntaxKind::FENCED_DIV => Some(fenced_div(node)),
722        SyntaxKind::LINE_BLOCK => Some(line_block(node)),
723        SyntaxKind::DEFINITION_LIST => Some(definition_list(node)),
724        SyntaxKind::FIGURE => Some(figure_block(node)),
725        other => Some(Block::Unsupported(format!("{other:?}"))),
726    }
727}
728
729/// Pandoc's `implicit_figures` extension wraps a paragraph that is *only*
730/// an Image into a `Figure` block: `Figure (id, [], []) (Caption Nothing
731/// [Plain alt]) [Plain [Image]]`. The image's alt-text inlines become the
732/// caption; the body holds the image itself wrapped in a Plain. Any
733/// attribute attached to the Image migrates to the Figure attr (id only)
734/// — the Image keeps its classes/kvs.
735fn figure_block(node: &SyntaxNode) -> Block {
736    let mut alt: Vec<Inline> = Vec::new();
737    let mut image_inline: Option<Inline> = None;
738    if let Some(image) = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_LINK) {
739        let alt_node = image.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
740        if let Some(an) = alt_node {
741            alt = coalesce_inlines(inlines_from(&an));
742        }
743        let mut tmp = Vec::new();
744        render_image_inline(&image, &mut tmp);
745        if let Some(first) = tmp.into_iter().next() {
746            image_inline = Some(first);
747        }
748    }
749    // Pandoc's `implicit_figures` migrates only the image's id to the Figure
750    // attr; the image keeps its classes and key-value pairs but loses the id.
751    let (figure_attr, image_inline) = match image_inline {
752        Some(Inline::Image(mut attr, alt_inlines, url, title)) if !attr.id.is_empty() => {
753            let fig_attr = Attr::with_id(std::mem::take(&mut attr.id));
754            (fig_attr, Some(Inline::Image(attr, alt_inlines, url, title)))
755        }
756        other => (Attr::default(), other),
757    };
758    let caption = if alt.is_empty() {
759        Vec::new()
760    } else {
761        vec![Block::Plain(alt)]
762    };
763    let body = match image_inline {
764        Some(img) => vec![Block::Plain(vec![img])],
765        None => Vec::new(),
766    };
767    Block::Figure(figure_attr, caption, body)
768}
769
770fn heading_block(node: &SyntaxNode) -> Block {
771    let level = heading_level(node);
772    let inlines = node
773        .children()
774        .find(|c| c.kind() == SyntaxKind::HEADING_CONTENT)
775        .map(|c| coalesce_inlines(inlines_from(&c)))
776        .unwrap_or_default();
777    // Auto-id and disambiguation are computed in the `RefsCtx` pre-pass so
778    // duplicate slugs and `section`-fallbacks are document-wide consistent.
779    // Explicit attributes still need their classes/kvs parsed here.
780    let offset: u32 = node.text_range().start().into();
781    let final_id = REFS_CTX
782        .with(|c| c.borrow().heading_id_by_offset.get(&offset).cloned())
783        .unwrap_or_default();
784    let attr = node
785        .children_with_tokens()
786        .find_map(|el| match el {
787            NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
788            NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => {
789                Some(t.text().to_string())
790            }
791            _ => None,
792        })
793        .map(|raw| {
794            let trimmed = raw.trim();
795            if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
796                let mut attr = parse_attr_block(inner);
797                if attr.id.is_empty() {
798                    attr.id = final_id.clone();
799                }
800                attr
801            } else {
802                Attr::with_id(final_id.clone())
803            }
804        })
805        .unwrap_or_else(|| Attr::with_id(final_id));
806    Block::Header(level, attr, inlines)
807}
808
809fn heading_level(node: &SyntaxNode) -> usize {
810    for child in node.children() {
811        if child.kind() == SyntaxKind::ATX_HEADING_MARKER {
812            for tok in child.children_with_tokens() {
813                if let Some(t) = tok.as_token()
814                    && t.kind() == SyntaxKind::ATX_HEADING_MARKER
815                {
816                    return t.text().chars().filter(|&c| c == '#').count();
817                }
818            }
819        }
820    }
821    for el in node.descendants_with_tokens() {
822        if let NodeOrToken::Token(t) = el
823            && t.kind() == SyntaxKind::SETEXT_HEADING_UNDERLINE
824        {
825            return if t.text().trim_start().starts_with('=') {
826                1
827            } else {
828                2
829            };
830        }
831    }
832    1
833}
834
835fn blockquote_blocks(node: &SyntaxNode) -> Vec<Block> {
836    let mut out = Vec::new();
837    for child in node.children() {
838        collect_block(&child, &mut out);
839    }
840    out
841}
842
843fn code_block(node: &SyntaxNode) -> Block {
844    let raw_format = code_block_raw_format(node);
845    let attr = code_block_attr(node);
846    let is_fenced = node
847        .children()
848        .any(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN);
849    let mut content = String::new();
850    for child in node.children() {
851        if child.kind() == SyntaxKind::CODE_CONTENT {
852            content.push_str(&child.text().to_string());
853        }
854    }
855    // Pandoc strips the trailing newline that closes the block.
856    while content.ends_with('\n') {
857        content.pop();
858    }
859    if is_fenced {
860        // Pandoc tab-expands code-block bodies before emission. For indented
861        // code, the expansion happens inside `strip_indented_code_indent`
862        // before the 4-col strip; for fenced code there is no strip, so do
863        // it directly here.
864        content = content
865            .split('\n')
866            .map(expand_tabs_to_4)
867            .collect::<Vec<_>>()
868            .join("\n");
869    } else {
870        content = strip_indented_code_indent(&content);
871    }
872    if let Some(fmt) = raw_format {
873        return Block::RawBlock(fmt, content);
874    }
875    Block::CodeBlock(attr, content)
876}
877
878/// Pandoc's raw-attribute syntax (`Ext_raw_attribute`) treats a fenced code
879/// block whose info string is exactly `{=format}` as a `RawBlock` of that
880/// format rather than a `CodeBlock`. The brace contents must start with `=`
881/// followed by a non-empty token, with no other classes/ids/key-value pairs.
882fn code_block_raw_format(node: &SyntaxNode) -> Option<String> {
883    let open = node
884        .children()
885        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)?;
886    let info = open
887        .children()
888        .find(|c| c.kind() == SyntaxKind::CODE_INFO)?;
889    let raw = info.text().to_string();
890    let trimmed = raw.trim();
891    let inner = trimmed
892        .strip_prefix('{')
893        .and_then(|s| s.strip_suffix('}'))?;
894    let inner = inner.trim();
895    let format = inner.strip_prefix('=')?.trim();
896    if format.is_empty() || format.contains(char::is_whitespace) {
897        return None;
898    }
899    Some(format.to_string())
900}
901
902fn code_block_attr(node: &SyntaxNode) -> Attr {
903    let Some(open) = node
904        .children()
905        .find(|c| c.kind() == SyntaxKind::CODE_FENCE_OPEN)
906    else {
907        return Attr::default();
908    };
909    let Some(info) = open.children().find(|c| c.kind() == SyntaxKind::CODE_INFO) else {
910        return Attr::default();
911    };
912    let raw = info.text().to_string();
913    let trimmed = raw.trim();
914    if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
915        return parse_attr_block(inner);
916    }
917    // Shortcut form: `lang {.cls #id key=value}` — language followed by an
918    // attribute block. Pandoc concatenates the language as the first class.
919    if let Some(brace) = trimmed.find('{')
920        && trimmed.ends_with('}')
921    {
922        let lang = trimmed[..brace].trim();
923        let attr_inner = &trimmed[brace + 1..trimmed.len() - 1];
924        let mut attr = parse_attr_block(attr_inner);
925        if !lang.is_empty() {
926            attr.classes.insert(0, normalize_lang_id(lang));
927        }
928        return attr;
929    }
930    if !trimmed.is_empty() {
931        return Attr {
932            id: String::new(),
933            classes: vec![normalize_lang_id(trimmed)],
934            kvs: Vec::new(),
935        };
936    }
937    Attr::default()
938}
939
/// Normalise a code-block language identifier, modelled on pandoc's
/// `toLanguageId` in the Markdown reader: the identifier is ASCII-lowercased,
/// and the two GitHub-style spellings `c++` and `objective-c` (matched
/// case-insensitively) map to `cpp` and `objectivec`.
fn normalize_lang_id(lang: &str) -> String {
    if lang.eq_ignore_ascii_case("c++") {
        return String::from("cpp");
    }
    if lang.eq_ignore_ascii_case("objective-c") {
        return String::from("objectivec");
    }
    lang.to_ascii_lowercase()
}
951
952/// Pandoc strips up to four leading spaces (or one tab) from each line of an
953/// indented code block. The CST keeps the indent as part of CODE_CONTENT, so
954/// we remove it here.
955fn strip_indented_code_indent(s: &str) -> String {
956    let mut out = String::with_capacity(s.len());
957    for (i, line) in s.split('\n').enumerate() {
958        if i > 0 {
959            out.push('\n');
960        }
961        // Pandoc expands tabs to 4-column tab stops *before* stripping the
962        // 4-column indent. Mixed `  \tfoo` therefore becomes `    foo` →
963        // `foo` after strip, which is what `pandoc -t native` emits.
964        let expanded = expand_tabs_to_4(line);
965        let stripped = if let Some(rest) = expanded.strip_prefix("    ") {
966            rest.to_string()
967        } else if let Some(rest) = expanded.strip_prefix('\t') {
968            rest.to_string()
969        } else {
970            // Strip up to 3 leading spaces if present (pandoc tolerates short
971            // indentation only on blank lines, which we don't try to detect
972            // here — safer to leave non-conforming lines alone).
973            expanded
974        };
975        out.push_str(&stripped);
976    }
977    out
978}
979
/// Replace every `\t` in `line` with the spaces needed to reach the next
/// 4-column tab stop, counting columns from 0 at the start of `line` and one
/// column per non-tab character. Pandoc applies this expansion to code-block
/// bodies before stripping the indent, so the result lines up with its output.
fn expand_tabs_to_4(line: &str) -> String {
    const TAB_STOP: usize = 4;
    let mut expanded = String::with_capacity(line.len());
    let mut column = 0usize;
    for ch in line.chars() {
        let width = if ch == '\t' {
            // A tab always advances by at least one column.
            let pad = TAB_STOP - column % TAB_STOP;
            expanded.extend(std::iter::repeat(' ').take(pad));
            pad
        } else {
            expanded.push(ch);
            1
        };
        column += width;
    }
    expanded
}
1000
1001fn html_block(node: &SyntaxNode) -> Block {
1002    let mut content = node.text().to_string();
1003    while content.ends_with('\n') {
1004        content.pop();
1005    }
1006    if let Some(div) = try_div_html_block(&content) {
1007        return div;
1008    }
1009    Block::RawBlock("html".to_string(), content)
1010}
1011
1012/// Project an `HTML_BLOCK` node into one or more `Block`s. Pandoc emits each
1013/// "complete tag" line of a block-level HTML_BLOCK as a separate `RawBlock`,
1014/// with text content lines parsed as Markdown into `Plain` blocks (the
1015/// `markdown_in_html_blocks` behavior, default-on under `markdown` flavor).
1016/// Verbatim constructs (comments, `<script>` / `<style>` / `<pre>` /
1017/// `<textarea>`, processing instructions, declarations, CDATA) keep their
1018/// content as a single `RawBlock` with newlines preserved.
1019fn emit_html_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1020    let mut content = node.text().to_string();
1021    while content.ends_with('\n') {
1022        content.pop();
1023    }
1024    if let Some(div) = try_div_html_block(&content) {
1025        out.push(div);
1026        return;
1027    }
1028    let leading_ws = content
1029        .as_bytes()
1030        .iter()
1031        .position(|&b| b != b' ' && b != b'\t')
1032        .unwrap_or(content.len());
1033    let trimmed = &content[leading_ws..];
1034    if trimmed.starts_with("<!--")
1035        || trimmed.starts_with("<?")
1036        || trimmed.starts_with("<![CDATA[")
1037        || trimmed.starts_with("<!")
1038        || is_raw_text_element_open(trimmed)
1039    {
1040        out.push(Block::RawBlock("html".to_string(), content));
1041        return;
1042    }
1043    if !content.contains('\n') {
1044        out.push(Block::RawBlock("html".to_string(), content));
1045        return;
1046    }
1047    for line in content.split('\n') {
1048        let line_trimmed = line.trim();
1049        if line_trimmed.is_empty() {
1050            continue;
1051        }
1052        if is_complete_html_tag_line(line_trimmed) {
1053            out.push(Block::RawBlock(
1054                "html".to_string(),
1055                line_trimmed.to_string(),
1056            ));
1057        } else {
1058            let inlines = coalesce_inlines(parse_cell_text_inlines(line_trimmed));
1059            if !inlines.is_empty() {
1060                out.push(Block::Plain(inlines));
1061            }
1062        }
1063    }
1064}
1065
/// Return true if `s` (with leading `<`) opens a raw-text HTML element where
/// pandoc keeps the entire block verbatim — no markdown parsing inside.
///
/// The recognised elements are `script`, `style`, `pre` and `textarea`,
/// matched ASCII case-insensitively. The tag name must be followed by a
/// space, tab, newline, `>`, `/`, or the end of the string, so `<prefix>` is
/// not mistaken for `<pre>`.
///
/// The comparison works on bytes rather than `str` slices, so input with
/// multi-byte characters right after `<` (for example `<abé>`) is rejected
/// instead of panicking on a non-char-boundary slice.
fn is_raw_text_element_open(s: &str) -> bool {
    let rest = match s.as_bytes().split_first() {
        Some((&b'<', rest)) => rest,
        _ => return false,
    };
    ["script", "style", "pre", "textarea"].iter().any(|tag| {
        let tag = tag.as_bytes();
        rest.len() >= tag.len()
            && rest[..tag.len()].eq_ignore_ascii_case(tag)
            && matches!(
                rest.get(tag.len()).copied(),
                None | Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'>') | Some(b'/')
            )
    })
}
1093
/// Return true if `s` is a single complete HTML tag (or comment /
/// declaration): it starts with `<` and the first `>` outside a quoted
/// attribute value is the very last byte.
///
/// Single- and double-quoted runs are skipped as a unit so an embedded `>`
/// does not end the scan early; an unterminated quote, a missing `>`, or any
/// content after the closing `>` makes the result false.
fn is_complete_html_tag_line(s: &str) -> bool {
    let mut rest = match s.as_bytes().split_first() {
        Some((&b'<', tail)) => tail.iter(),
        _ => return false,
    };
    while let Some(&byte) = rest.next() {
        match byte {
            // The tag is complete only if nothing follows its closing `>`.
            b'>' => return rest.as_slice().is_empty(),
            quote @ (b'"' | b'\'') => {
                // Consume through the matching quote; bail if there is none.
                if !rest.any(|&b| b == quote) {
                    return false;
                }
            }
            _ => {}
        }
    }
    false
}
1131
1132/// Iterate `node`'s block-level emission, handling `HTML_BLOCK` splitting
1133/// (one HTML block can project as several pandoc-native blocks under
1134/// `markdown_in_html_blocks`) while keeping every other kind one-block.
1135fn collect_block(node: &SyntaxNode, out: &mut Vec<Block>) {
1136    if node.kind() == SyntaxKind::HTML_BLOCK {
1137        emit_html_block(node, out);
1138        return;
1139    }
1140    if let Some(b) = block_from(node) {
1141        out.push(b);
1142    }
1143}
1144
1145/// Detect a `<div ...>...</div>` HTML block and project it as
1146/// `Div(attr, blocks)` with the inner content reparsed as Pandoc markdown.
1147/// Pandoc's `markdown_in_html_blocks` extension (default-on under `markdown`
1148/// flavor) treats every `<div>` block this way, regardless of whether it
1149/// has attributes. Returns `None` for any HTML block whose outer tag is not
1150/// `<div>` (so other block tags keep falling through to the RawBlock path).
1151fn try_div_html_block(content: &str) -> Option<Block> {
1152    let bytes = content.as_bytes();
1153    let leading_ws = bytes
1154        .iter()
1155        .position(|&b| b != b' ' && b != b'\t')
1156        .unwrap_or(bytes.len());
1157    let head = &content[leading_ws..];
1158    let head_bytes = head.as_bytes();
1159    if head_bytes.len() < 4 || !head_bytes[..4].eq_ignore_ascii_case(b"<div") {
1160        return None;
1161    }
1162    let after_div = head_bytes.get(4).copied();
1163    match after_div {
1164        Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'>') | Some(b'/') => {}
1165        _ => return None,
1166    }
1167    let close_gt_rel = head[4..].find('>')?;
1168    let open_attrs_raw = &head[4..4 + close_gt_rel];
1169    let open_attrs = open_attrs_raw.trim_matches(|c: char| c.is_whitespace() || c == '/');
1170    let attr = parse_html_attrs(open_attrs);
1171    let after_open_tag = leading_ws + 4 + close_gt_rel + 1;
1172    let multiline = content.as_bytes().get(after_open_tag).copied() == Some(b'\n');
1173    let trailing_ws = content.as_bytes()[after_open_tag..]
1174        .iter()
1175        .rev()
1176        .position(|&b| b != b' ' && b != b'\t' && b != b'\n')
1177        .unwrap_or(0);
1178    let close_end = content.len() - trailing_ws;
1179    let close_search = &content[after_open_tag..close_end];
1180    if !close_search.to_ascii_lowercase().ends_with("</div>") {
1181        return None;
1182    }
1183    let close_start = after_open_tag + close_search.len() - "</div>".len();
1184    let inner = content[after_open_tag..close_start].trim_matches('\n');
1185    let mut blocks = parse_pandoc_blocks(inner);
1186    if !multiline
1187        && blocks.len() == 1
1188        && let Block::Para(inlines) = blocks.remove(0)
1189    {
1190        blocks.push(Block::Plain(inlines));
1191    }
1192    Some(Block::Div(attr, blocks))
1193}
1194
1195/// Reparse `text` as Pandoc-flavored markdown and return its top-level
1196/// blocks. Unlike `parse_cell_text_blocks`, leaves `Para` as `Para` — the
1197/// caller decides whether the surrounding context demands `Plain`.
1198fn parse_pandoc_blocks(text: &str) -> Vec<Block> {
1199    if text.trim().is_empty() {
1200        return Vec::new();
1201    }
1202    let opts = crate::ParserOptions {
1203        flavor: crate::Flavor::Pandoc,
1204        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
1205        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
1206        ..crate::ParserOptions::default()
1207    };
1208    let doc = crate::parse(text, Some(opts));
1209    let mut out = Vec::new();
1210    for child in doc.children() {
1211        collect_block(&child, &mut out);
1212    }
1213    out
1214}
1215
1216fn tex_block(node: &SyntaxNode) -> Block {
1217    let mut content = node.text().to_string();
1218    while content.ends_with('\n') {
1219        content.pop();
1220    }
1221    Block::RawBlock("tex".to_string(), content)
1222}
1223
1224fn fenced_div(node: &SyntaxNode) -> Block {
1225    let attr = node
1226        .children()
1227        .find(|c| c.kind() == SyntaxKind::DIV_FENCE_OPEN)
1228        .map(|open| {
1229            let info = open
1230                .children()
1231                .find(|c| c.kind() == SyntaxKind::DIV_INFO)
1232                .map(|n| n.text().to_string())
1233                .unwrap_or_default();
1234            parse_div_info(info.trim())
1235        })
1236        .unwrap_or_default();
1237    let mut blocks = Vec::new();
1238    for child in node.children() {
1239        match child.kind() {
1240            SyntaxKind::DIV_FENCE_OPEN | SyntaxKind::DIV_FENCE_CLOSE => {}
1241            _ => collect_block(&child, &mut blocks),
1242        }
1243    }
1244    Block::Div(attr, blocks)
1245}
1246
1247/// Parse pandoc div info: either `{#id .class1 .class2 key=value}` or a single
1248/// bare class name like `Warning`.
1249fn parse_div_info(info: &str) -> Attr {
1250    if info.starts_with('{') && info.ends_with('}') {
1251        return parse_attr_block(&info[1..info.len() - 1]);
1252    }
1253    if !info.is_empty() {
1254        return Attr {
1255            id: String::new(),
1256            classes: vec![info.to_string()],
1257            kvs: Vec::new(),
1258        };
1259    }
1260    Attr::default()
1261}
1262
1263/// Read a child `ATTRIBUTE` (node or token) on `parent` and parse its
1264/// `{...}` body into an `Attr`. Returns `Attr::default()` if no attribute
1265/// is attached or the body isn't `{...}`-shaped.
1266fn extract_attr_from_node(parent: &SyntaxNode) -> Attr {
1267    let raw = parent.children_with_tokens().find_map(|el| match el {
1268        NodeOrToken::Node(n) if n.kind() == SyntaxKind::ATTRIBUTE => Some(n.text().to_string()),
1269        NodeOrToken::Token(t) if t.kind() == SyntaxKind::ATTRIBUTE => Some(t.text().to_string()),
1270        _ => None,
1271    });
1272    let Some(raw) = raw else {
1273        return Attr::default();
1274    };
1275    let trimmed = raw.trim();
1276    if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}')) {
1277        parse_attr_block(inner)
1278    } else {
1279        Attr::default()
1280    }
1281}
1282
1283/// Parse the body of an attribute block like `#my-id .class1 .class2 key=value`.
1284/// Whitespace-separated. Tokens starting with `#` are id, `.` are classes,
1285/// `key=value` (optionally quoted value) are kvs.
1286fn parse_attr_block(s: &str) -> Attr {
1287    let mut id = String::new();
1288    let mut classes: Vec<String> = Vec::new();
1289    let mut kvs: Vec<(String, String)> = Vec::new();
1290    let bytes = s.as_bytes();
1291    let mut i = 0usize;
1292    while i < bytes.len() {
1293        match bytes[i] {
1294            b' ' | b'\t' | b'\n' | b'\r' => {
1295                i += 1;
1296            }
1297            b'#' => {
1298                let start = i + 1;
1299                let mut j = start;
1300                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
1301                    j += 1;
1302                }
1303                id = s[start..j].to_string();
1304                i = j;
1305            }
1306            b'.' => {
1307                let start = i + 1;
1308                let mut j = start;
1309                while j < bytes.len() && !matches!(bytes[j], b' ' | b'\t' | b'\n' | b'\r') {
1310                    j += 1;
1311                }
1312                classes.push(s[start..j].to_string());
1313                i = j;
1314            }
1315            _ => {
1316                // Read key up to `=` or whitespace.
1317                let key_start = i;
1318                while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=') {
1319                    i += 1;
1320                }
1321                let key = s[key_start..i].to_string();
1322                if i < bytes.len() && bytes[i] == b'=' {
1323                    i += 1;
1324                    let value = if i < bytes.len() && bytes[i] == b'"' {
1325                        i += 1;
1326                        let v_start = i;
1327                        while i < bytes.len() && bytes[i] != b'"' {
1328                            i += 1;
1329                        }
1330                        let v = s[v_start..i].to_string();
1331                        if i < bytes.len() {
1332                            i += 1;
1333                        }
1334                        v
1335                    } else {
1336                        let v_start = i;
1337                        while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
1338                            i += 1;
1339                        }
1340                        s[v_start..i].to_string()
1341                    };
1342                    kvs.push((key, value));
1343                } else if !key.is_empty() {
1344                    // Bare token (legacy class form).
1345                    classes.push(key);
1346                }
1347            }
1348        }
1349    }
1350    Attr { id, classes, kvs }
1351}
1352
1353/// Parse HTML-style attributes `class="x" id="y" key="z"` into `Attr`,
1354/// mapping `class` (whitespace-split) → classes, `id` → id, others → kvs.
1355fn parse_html_attrs(s: &str) -> Attr {
1356    let mut id = String::new();
1357    let mut classes: Vec<String> = Vec::new();
1358    let mut kvs: Vec<(String, String)> = Vec::new();
1359    let bytes = s.as_bytes();
1360    let mut i = 0usize;
1361    while i < bytes.len() {
1362        match bytes[i] {
1363            b' ' | b'\t' | b'\n' | b'\r' => {
1364                i += 1;
1365            }
1366            _ => {
1367                let key_start = i;
1368                while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=') {
1369                    i += 1;
1370                }
1371                let key = s[key_start..i].to_string();
1372                let value = if i < bytes.len() && bytes[i] == b'=' {
1373                    i += 1;
1374                    if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
1375                        let quote = bytes[i];
1376                        i += 1;
1377                        let v_start = i;
1378                        while i < bytes.len() && bytes[i] != quote {
1379                            i += 1;
1380                        }
1381                        let v = s[v_start..i].to_string();
1382                        if i < bytes.len() {
1383                            i += 1;
1384                        }
1385                        v
1386                    } else {
1387                        let v_start = i;
1388                        while i < bytes.len() && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r') {
1389                            i += 1;
1390                        }
1391                        s[v_start..i].to_string()
1392                    }
1393                } else {
1394                    String::new()
1395                };
1396                if key.is_empty() {
1397                    continue;
1398                }
1399                match key.as_str() {
1400                    "class" => {
1401                        for c in value.split_ascii_whitespace() {
1402                            classes.push(c.to_string());
1403                        }
1404                    }
1405                    "id" => id = value,
1406                    _ => kvs.push((key, value)),
1407                }
1408            }
1409        }
1410    }
1411    Attr { id, classes, kvs }
1412}
1413
1414fn definition_list(node: &SyntaxNode) -> Block {
1415    let items: Vec<(Vec<Inline>, Vec<Vec<Block>>)> = node
1416        .children()
1417        .filter(|c| c.kind() == SyntaxKind::DEFINITION_ITEM)
1418        .map(|item| {
1419            let term = item
1420                .children()
1421                .find(|c| c.kind() == SyntaxKind::TERM)
1422                .map(|t| coalesce_inlines(inlines_from(&t)))
1423                .unwrap_or_default();
1424            let loose = is_loose_definition_item(&item);
1425            let defs: Vec<Vec<Block>> = item
1426                .children()
1427                .filter(|c| c.kind() == SyntaxKind::DEFINITION)
1428                .map(|d| definition_blocks(&d, loose))
1429                .collect();
1430            (term, defs)
1431        })
1432        .collect();
1433    Block::DefinitionList(items)
1434}
1435
1436/// A `DEFINITION_ITEM` is "loose" iff there is a `BLANK_LINE` between the
1437/// `TERM` (or its preceding term continuations) and the first `DEFINITION`.
1438/// Pandoc renders loose definitions with `Para` blocks; tight ones use
1439/// `Plain`. The looseness is per-item (per-term group), not per-definition,
1440/// and applies to *all* definitions in the item — see pandoc's behavior.
1441fn is_loose_definition_item(item: &SyntaxNode) -> bool {
1442    let mut saw_term = false;
1443    for child in item.children_with_tokens() {
1444        if let NodeOrToken::Node(n) = child {
1445            match n.kind() {
1446                SyntaxKind::TERM => {
1447                    saw_term = true;
1448                }
1449                SyntaxKind::BLANK_LINE if saw_term => {
1450                    return true;
1451                }
1452                SyntaxKind::DEFINITION => {
1453                    return false;
1454                }
1455                _ => {}
1456            }
1457        }
1458    }
1459    false
1460}
1461
1462fn definition_blocks(def_node: &SyntaxNode, loose: bool) -> Vec<Block> {
1463    // Definition body content lives at the marker's content offset (`: ` →
1464    // 2 columns by default). The CST keeps that indent on each line, so any
1465    // CODE_BLOCK descendant needs the offset stripped before pandoc-native
1466    // projection.
1467    let extra = definition_content_offset(def_node);
1468    let mut out = Vec::new();
1469    for child in def_node.children() {
1470        match child.kind() {
1471            SyntaxKind::PLAIN => {
1472                let inlines = coalesce_inlines(inlines_from(&child));
1473                if loose {
1474                    out.push(Block::Para(inlines));
1475                } else {
1476                    out.push(Block::Plain(inlines));
1477                }
1478            }
1479            SyntaxKind::PARAGRAPH => {
1480                out.push(Block::Para(coalesce_inlines(inlines_from(&child))));
1481            }
1482            SyntaxKind::CODE_BLOCK if extra > 0 => {
1483                out.push(indented_code_block_with_extra_strip(&child, extra));
1484            }
1485            _ => collect_block(&child, &mut out),
1486        }
1487    }
1488    out
1489}
1490
1491/// Visual column where definition body content starts. The strip later runs
1492/// against the *tab-expanded* body, so this offset must be measured in
1493/// columns (tabs round to the next 4-col stop), not raw chars: `:\t` reaches
1494/// col 4, which is the column the body's strip should remove.
1495fn definition_content_offset(def_node: &SyntaxNode) -> usize {
1496    let mut col = 0usize;
1497    let mut saw_marker = false;
1498    for el in def_node.children_with_tokens() {
1499        if let NodeOrToken::Token(t) = el {
1500            match t.kind() {
1501                SyntaxKind::DEFINITION_MARKER => {
1502                    col = advance_col(col, t.text());
1503                    saw_marker = true;
1504                }
1505                SyntaxKind::WHITESPACE if saw_marker => {
1506                    return advance_col(col, t.text());
1507                }
1508                _ if saw_marker => return col,
1509                _ => {}
1510            }
1511        } else if saw_marker {
1512            return col;
1513        }
1514    }
1515    col
1516}
1517
/// Return the column reached after laying out `s` starting at column
/// `start`: a `\t` jumps to the next 4-column tab stop, every other
/// character occupies one column.
fn advance_col(start: usize, s: &str) -> usize {
    s.chars().fold(start, |col, ch| match ch {
        '\t' => col - col % 4 + 4,
        _ => col + 1,
    })
}
1531
1532fn line_block(node: &SyntaxNode) -> Block {
1533    let lines: Vec<Vec<Inline>> = node
1534        .children()
1535        .filter(|c| c.kind() == SyntaxKind::LINE_BLOCK_LINE)
1536        .map(|line| {
1537            let mut out = Vec::new();
1538            for el in line.children_with_tokens() {
1539                match el {
1540                    NodeOrToken::Token(t) => match t.kind() {
1541                        SyntaxKind::LINE_BLOCK_MARKER | SyntaxKind::NEWLINE => {}
1542                        _ => push_token_inline(&t, &mut out),
1543                    },
1544                    NodeOrToken::Node(n) => out.push(inline_from_node(&n)),
1545                }
1546            }
1547            coalesce_inlines(out)
1548        })
1549        .collect();
1550    Block::LineBlock(lines)
1551}
1552
1553fn latex_command_inline(node: &SyntaxNode) -> Inline {
1554    let content = node.text().to_string();
1555    Inline::RawInline("tex".to_string(), content)
1556}
1557
1558fn bracketed_span_inline(node: &SyntaxNode) -> Inline {
1559    let is_html = node
1560        .children_with_tokens()
1561        .any(|el| matches!(&el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::SPAN_BRACKET_OPEN && t.text().starts_with('<')));
1562    let attr_text = node.children_with_tokens().find_map(|el| match el {
1563        NodeOrToken::Token(t) if t.kind() == SyntaxKind::SPAN_ATTRIBUTES => {
1564            Some(t.text().to_string())
1565        }
1566        NodeOrToken::Node(n) if n.kind() == SyntaxKind::SPAN_ATTRIBUTES => {
1567            Some(n.text().to_string())
1568        }
1569        _ => None,
1570    });
1571    let attr = attr_text
1572        .map(|raw| {
1573            let trimmed = raw.trim();
1574            if is_html {
1575                parse_html_attrs(trimmed)
1576            } else if let Some(inner) = trimmed.strip_prefix('{').and_then(|s| s.strip_suffix('}'))
1577            {
1578                parse_attr_block(inner)
1579            } else {
1580                Attr::default()
1581            }
1582        })
1583        .unwrap_or_default();
1584    let content = node
1585        .children()
1586        .find(|c| c.kind() == SyntaxKind::SPAN_CONTENT)
1587        .map(|n| coalesce_inlines(inlines_from(&n)))
1588        .unwrap_or_default();
1589    Inline::Span(attr, content)
1590}
1591
1592fn pipe_table(node: &SyntaxNode) -> Option<TableData> {
1593    let mut header_cells: Vec<Vec<Inline>> = Vec::new();
1594    let mut body_rows: Vec<Vec<Vec<Inline>>> = Vec::new();
1595    let mut aligns: Vec<&'static str> = Vec::new();
1596    let mut caption_inlines: Vec<Inline> = Vec::new();
1597    for child in node.children() {
1598        match child.kind() {
1599            SyntaxKind::TABLE_HEADER => {
1600                header_cells = pipe_table_cells(&child);
1601            }
1602            SyntaxKind::TABLE_SEPARATOR => {
1603                let raw = child.text().to_string();
1604                aligns = pipe_separator_aligns(&raw);
1605            }
1606            SyntaxKind::TABLE_ROW => {
1607                body_rows.push(pipe_table_cells(&child));
1608            }
1609            SyntaxKind::TABLE_CAPTION => {
1610                caption_inlines = pipe_table_caption(&child);
1611            }
1612            _ => {}
1613        }
1614    }
1615    let cols = header_cells
1616        .len()
1617        .max(body_rows.iter().map(Vec::len).max().unwrap_or(0))
1618        .max(aligns.len());
1619    if cols == 0 {
1620        return None;
1621    }
1622    while aligns.len() < cols {
1623        aligns.push("AlignDefault");
1624    }
1625    let head_rows = if header_cells.is_empty() {
1626        Vec::new()
1627    } else {
1628        vec![cells_to_plain_blocks(header_cells, cols)]
1629    };
1630    let body_rows: Vec<Vec<GridCell>> = body_rows
1631        .into_iter()
1632        .map(|cells| cells_to_plain_blocks(cells, cols))
1633        .collect();
1634    let (attr, caption_inlines) = extract_caption_attrs(caption_inlines);
1635    Some(TableData {
1636        attr,
1637        caption: caption_inlines,
1638        aligns,
1639        widths: vec![None; cols],
1640        head_rows,
1641        body_rows,
1642        foot_rows: Vec::new(),
1643    })
1644}
1645
1646fn pipe_table_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
1647    row.children()
1648        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
1649        .map(|cell| coalesce_inlines(inlines_from(&cell)))
1650        .collect()
1651}
1652
1653/// Pandoc's `+caption_attributes` extension lifts a trailing `{...}` from a
1654/// table caption into the Table's outer attribute. Walk the caption inlines
1655/// from the right looking for a balanced trailing `{...}` span: a Str
1656/// ending with `}` plus zero or more (Space, Str) pairs back until a Str
1657/// starts with `{`. If found, parse the brace contents as an attribute
1658/// block and drop those inlines (plus any preceding Space) from the caption
1659/// text.
1660fn extract_caption_attrs(mut inlines: Vec<Inline>) -> (Attr, Vec<Inline>) {
1661    let last_str_end = inlines
1662        .iter()
1663        .rposition(|i| matches!(i, Inline::Str(s) if s.ends_with('}')));
1664    let Some(end_idx) = last_str_end else {
1665        return (Attr::default(), inlines);
1666    };
1667    // Walk back to find the Str starting with `{`. Allow only Str/Space
1668    // between (no structural inlines like Emph), since attribute blocks
1669    // are plain text.
1670    let mut start_idx = end_idx;
1671    let mut found_open = false;
1672    loop {
1673        match &inlines[start_idx] {
1674            Inline::Str(s) => {
1675                if s.starts_with('{') {
1676                    found_open = true;
1677                    break;
1678                }
1679            }
1680            Inline::Space => {}
1681            _ => return (Attr::default(), inlines),
1682        }
1683        if start_idx == 0 {
1684            break;
1685        }
1686        start_idx -= 1;
1687    }
1688    if !found_open {
1689        return (Attr::default(), inlines);
1690    }
1691    // Concatenate the Str/Space slice into a flat string, then strip the
1692    // outer braces.
1693    let mut raw = String::new();
1694    for el in &inlines[start_idx..=end_idx] {
1695        match el {
1696            Inline::Str(s) => raw.push_str(s),
1697            Inline::Space => raw.push(' '),
1698            _ => return (Attr::default(), inlines),
1699        }
1700    }
1701    if !(raw.starts_with('{') && raw.ends_with('}')) {
1702        return (Attr::default(), inlines);
1703    }
1704    let inner = &raw[1..raw.len() - 1];
1705    let attr = parse_attr_block(inner);
1706    inlines.truncate(start_idx);
1707    if matches!(inlines.last(), Some(Inline::Space)) {
1708        inlines.pop();
1709    }
1710    (attr, inlines)
1711}
1712
1713fn pipe_table_caption(node: &SyntaxNode) -> Vec<Inline> {
1714    // Walk all tokens after TABLE_CAPTION_PREFIX and collect inline content.
1715    let mut out = Vec::new();
1716    let mut after_prefix = false;
1717    for el in node.children_with_tokens() {
1718        match el {
1719            NodeOrToken::Node(n) => {
1720                if n.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
1721                    after_prefix = true;
1722                    continue;
1723                }
1724                if after_prefix {
1725                    out.push(inline_from_node(&n));
1726                }
1727            }
1728            NodeOrToken::Token(t) => {
1729                if t.kind() == SyntaxKind::TABLE_CAPTION_PREFIX {
1730                    after_prefix = true;
1731                    continue;
1732                }
1733                if after_prefix {
1734                    push_token_inline(&t, &mut out);
1735                }
1736            }
1737        }
1738    }
1739    coalesce_inlines(out)
1740}
1741
/// Derive per-column alignments from a pipe-table separator line such as
/// `|:---|:---:|---:|`.
///
/// Columns are delimited by `|` or, for the Emacs orgtbl-mode form pandoc
/// also accepts (`|:---+---:|`), by `+`. A leading colon means left, a
/// trailing colon right, both centre, neither default. The returned names
/// are pandoc's `Alignment` constructors.
fn pipe_separator_aligns(raw: &str) -> Vec<&'static str> {
    // Strip surrounding whitespace before pipe-stripping so an indented
    // pipe-table separator (e.g. fenced-div content at column ≥1) doesn't
    // leave a leading whitespace segment that then counts as a phantom
    // column.
    let inner = raw
        .trim()
        .trim_start_matches('|')
        .trim_end_matches('|');
    inner
        // `+` is an interior column delimiter in orgtbl-style separators;
        // splitting only on `|` would merge those columns into one.
        .split(['|', '+'])
        .map(|seg| {
            let seg = seg.trim();
            match (seg.starts_with(':'), seg.ends_with(':')) {
                (true, true) => "AlignCenter",
                (true, false) => "AlignLeft",
                (false, true) => "AlignRight",
                (false, false) => "AlignDefault",
            }
        })
        .collect()
}
1764
1765fn cells_to_plain_blocks(cells: Vec<Vec<Inline>>, cols: usize) -> Vec<GridCell> {
1766    let mut out: Vec<GridCell> = cells
1767        .into_iter()
1768        .map(|inlines| {
1769            let blocks = if inlines.is_empty() {
1770                Vec::new()
1771            } else {
1772                vec![Block::Plain(inlines)]
1773            };
1774            GridCell::no_span(blocks)
1775        })
1776        .collect();
1777    while out.len() < cols {
1778        out.push(GridCell::no_span(Vec::new()));
1779    }
1780    out
1781}
1782
/// Render an `f64` the way Haskell's `show` renders a `Double`, as used for
/// `ColWidth N` output.
///
/// Magnitudes in `[0.1, 1e7)` are written in plain decimal notation; every
/// other non-zero value uses scientific notation. The result always has a
/// fractional part (`1.0`, `1.0e7`), and zero is rendered as `0.0`.
fn show_double(x: f64) -> String {
    if x == 0.0 {
        return String::from("0.0");
    }
    if (0.1..1e7).contains(&x.abs()) {
        let plain = x.to_string();
        return if plain.contains(['.', 'e']) {
            plain
        } else {
            plain + ".0"
        };
    }
    // Rust's `{:e}` already has Haskell's mantissa/exponent shape
    // (`8.333333333333333e-2`); only a whole-number mantissa needs `.0`.
    let sci = format!("{x:e}");
    match sci.split_once('e') {
        Some((mantissa, exponent)) if !mantissa.contains('.') => {
            format!("{mantissa}.0e{exponent}")
        }
        _ => sci,
    }
}
1813
1814// ----- simple table -------------------------------------------------------
1815
1816/// Project a `SIMPLE_TABLE` node. Pandoc's "simple" table form:
1817///
1818/// ```text
1819///    Col1     Col2
1820/// -------- --------    ← TABLE_SEPARATOR (dash runs define columns)
1821///   data1    data2
1822///
1823/// Table: optional caption
1824/// ```
1825///
1826/// Headerless variant skips the header row and uses dash runs both above
1827/// and below the data. Alignment is derived from each header cell's
1828/// position relative to its column's dash run boundaries. For headerless
1829/// tables, alignment derives from the *first data row*.
1830fn simple_table(node: &SyntaxNode) -> Option<TableData> {
1831    let separator = node
1832        .children()
1833        .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)?;
1834    let cols = simple_table_dash_runs(&separator);
1835    if cols.is_empty() {
1836        return None;
1837    }
1838    let header = node
1839        .children()
1840        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
1841    // Body rows: every TABLE_ROW. Drop a trailing all-dashes row — that is
1842    // the closing `---` separator of a headerless table that the parser
1843    // currently emits as a TABLE_ROW of dash cells.
1844    let mut body_rows_nodes: Vec<SyntaxNode> = node
1845        .children()
1846        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
1847        .collect();
1848    if header.is_none()
1849        && body_rows_nodes
1850            .last()
1851            .map(simple_table_row_is_all_dashes)
1852            .unwrap_or(false)
1853    {
1854        body_rows_nodes.pop();
1855    }
1856    // Alignment: from header if present, else from the first data row.
1857    let aligns = if let Some(h) = &header {
1858        simple_table_aligns(h, &cols)
1859    } else if let Some(r0) = body_rows_nodes.first() {
1860        simple_table_aligns(r0, &cols)
1861    } else {
1862        vec!["AlignDefault"; cols.len()]
1863    };
1864    let head_rows = match &header {
1865        Some(h) => {
1866            let cells: Vec<Vec<Inline>> = simple_table_row_cells(h);
1867            vec![cells_to_plain_blocks(cells, cols.len())]
1868        }
1869        None => Vec::new(),
1870    };
1871    let body_rows: Vec<Vec<GridCell>> = body_rows_nodes
1872        .iter()
1873        .map(|r| cells_to_plain_blocks(simple_table_row_cells(r), cols.len()))
1874        .collect();
1875    let caption_inlines = node
1876        .children()
1877        .find(|c| c.kind() == SyntaxKind::TABLE_CAPTION)
1878        .map(|n| pipe_table_caption(&n))
1879        .unwrap_or_default();
1880    let (attr, caption_inlines) = extract_caption_attrs(caption_inlines);
1881    Some(TableData {
1882        attr,
1883        caption: caption_inlines,
1884        aligns,
1885        widths: vec![None; cols.len()],
1886        head_rows,
1887        body_rows,
1888        foot_rows: Vec::new(),
1889    })
1890}
1891
1892/// Return the `(start_col, end_col)` (inclusive) of each dash run in a
1893/// `TABLE_SEPARATOR` node, where columns are 0-based offsets within the
1894/// separator's line.
1895fn simple_table_dash_runs(separator: &SyntaxNode) -> Vec<(usize, usize)> {
1896    let raw = separator.text().to_string();
1897    let line = raw.trim_end_matches(['\n', '\r']);
1898    let mut runs = Vec::new();
1899    let mut start: Option<usize> = None;
1900    for (i, ch) in line.char_indices() {
1901        if ch == '-' {
1902            if start.is_none() {
1903                start = Some(i);
1904            }
1905        } else if let Some(s) = start.take() {
1906            runs.push((s, i - 1));
1907        }
1908    }
1909    if let Some(s) = start.take() {
1910        runs.push((s, line.len() - 1));
1911    }
1912    runs
1913}
1914
1915fn simple_table_row_cells(row: &SyntaxNode) -> Vec<Vec<Inline>> {
1916    // Zero-width TABLE_CELL nodes represent positionally-empty columns
1917    // (e.g. case 0094, where header words land in only some of the
1918    // dash-defined columns). Keep them as empty cells so the row's
1919    // column ordering matches the dash separator.
1920    row.children()
1921        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
1922        .map(|cell| coalesce_inlines(inlines_from(&cell)))
1923        .collect()
1924}
1925
1926fn simple_table_row_is_all_dashes(row: &SyntaxNode) -> bool {
1927    let mut had_cell = false;
1928    for cell in row
1929        .children()
1930        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
1931    {
1932        let text = cell.text().to_string();
1933        let trimmed = text.trim();
1934        if trimmed.is_empty() {
1935            continue;
1936        }
1937        had_cell = true;
1938        if !trimmed.chars().all(|c| c == '-') {
1939            return false;
1940        }
1941    }
1942    had_cell
1943}
1944
1945/// Derive alignments for a simple-table header (or first data row) by
1946/// comparing each cell's *visible* (whitespace-trimmed) column range to
1947/// the corresponding dash run. Multiline-table TABLE_CELL nodes include
1948/// the padding whitespace within the column slice, so we have to peel
1949/// off leading/trailing whitespace before applying the flushness rule.
1950/// (Single-line simple-table cells already exclude padding whitespace,
1951/// but the trim is a no-op there.)
1952fn simple_table_aligns(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<&'static str> {
1953    let row_start: u32 = row.text_range().start().into();
1954    let mut cell_ranges: Vec<(usize, usize)> = Vec::new();
1955    for cell in row
1956        .children()
1957        .filter(|c| c.kind() == SyntaxKind::TABLE_CELL)
1958    {
1959        if cell.text_range().is_empty() {
1960            continue;
1961        }
1962        let text = cell.text().to_string();
1963        let lstrip = text.chars().take_while(|c| *c == ' ' || *c == '\t').count();
1964        let rstrip = text
1965            .chars()
1966            .rev()
1967            .take_while(|c| *c == ' ' || *c == '\t')
1968            .count();
1969        let trimmed_len = text.chars().count().saturating_sub(lstrip + rstrip);
1970        if trimmed_len == 0 {
1971            continue;
1972        }
1973        let start: u32 = cell.text_range().start().into();
1974        let s = (start - row_start) as usize;
1975        let visible_start = s + lstrip;
1976        let visible_end = visible_start + trimmed_len - 1;
1977        cell_ranges.push((visible_start, visible_end));
1978    }
1979    cols.iter()
1980        .map(|(col_start, col_end)| {
1981            let cell = cell_ranges
1982                .iter()
1983                .find(|(cs, ce)| ce >= col_start && cs <= col_end);
1984            match cell {
1985                Some((cs, ce)) => {
1986                    let left_flush = cs == col_start;
1987                    let right_flush = ce == col_end;
1988                    match (left_flush, right_flush) {
1989                        (true, true) => "AlignDefault",
1990                        (true, false) => "AlignLeft",
1991                        (false, true) => "AlignRight",
1992                        (false, false) => "AlignCenter",
1993                    }
1994                }
1995                None => "AlignDefault",
1996            }
1997        })
1998        .collect()
1999}
2000
2001// ----- grid table ---------------------------------------------------------
2002
2003/// Project a `GRID_TABLE` node into pandoc-native shape. Implements a
2004/// `gridtables`-style 2D layout pass:
2005///
2006/// 1. Collect every line of the table (excluding caption) into a padded
2007///    char grid, tracking which `TABLE_HEADER` / `TABLE_ROW` /
2008///    `TABLE_FOOTER` parent each line came from.
2009/// 2. The canonical column boundaries are the union of `+` positions
2010///    across every "sep-style" line (lines made of `+`/`-`/`=`/`:`/`|`/`
2011///    `). The canonical row boundaries are the indices of those
2012///    sep-style lines. So a partial separator like
2013///    `|        +----+----+` contributes both to canonical column
2014///    positions and to row block boundaries (it ends some cells and
2015///    starts others mid-row).
2016/// 3. Cells are detected by walking `(row_block, col)` in scan order and,
2017///    at each unoccupied position whose top-left `+` is real, finding the
2018///    smallest valid bounding rectangle: top/bottom edges in
2019///    `{-,=,:,+}`, left/right edges in `{|,+}`, no fully-spanning
2020///    interior separator that would split it. RowSpan/ColSpan are
2021///    derived from the canonical row/col indices of the cell's corners.
2022///
2023/// Column widths use the alignment separator (the one carrying `:`s) if
2024/// present, else the first separator — both via `grid_dash_widths`. The
2025/// alignment row also drives per-column alignment via
2026/// `grid_separator_aligns`.
2027#[allow(clippy::needless_range_loop)]
2028fn grid_table(node: &SyntaxNode) -> Option<TableData> {
2029    // Collect all lines except the caption, tagged with their parent kind.
2030    let mut tagged: Vec<(SyntaxKind, String)> = Vec::new();
2031    for child in node.children() {
2032        if child.kind() == SyntaxKind::TABLE_CAPTION {
2033            continue;
2034        }
2035        let text = child.text().to_string();
2036        for line in text.split_inclusive('\n') {
2037            let trimmed = line.trim_end_matches('\n');
2038            tagged.push((child.kind(), trimmed.to_string()));
2039        }
2040    }
2041    if tagged.is_empty() {
2042        return None;
2043    }
2044
2045    // Pad lines into a 2D char grid.
2046    let max_width = tagged
2047        .iter()
2048        .map(|(_, l)| l.chars().count())
2049        .max()
2050        .unwrap_or(0);
2051    let grid: Vec<Vec<char>> = tagged
2052        .iter()
2053        .map(|(_, l)| {
2054            let mut chars: Vec<char> = l.chars().collect();
2055            chars.resize(max_width, ' ');
2056            chars
2057        })
2058        .collect();
2059    let nlines = grid.len();
2060
2061    // A line is "sep-style" if it contains at least one `+` and no chars
2062    // outside `+`/`-`/`=`/`:`/`|`/` `. Partial separators (lines mixing
2063    // `|` and `+`) qualify; content lines do not.
2064    let is_sep_line: Vec<bool> = grid
2065        .iter()
2066        .map(|row| {
2067            row.contains(&'+')
2068                && row
2069                    .iter()
2070                    .all(|&c| matches!(c, '+' | '-' | '=' | ':' | '|' | ' '))
2071        })
2072        .collect();
2073
2074    // Canonical column boundaries: union of `+` columns across all sep-style lines.
2075    let mut col_set: std::collections::BTreeSet<usize> = std::collections::BTreeSet::new();
2076    for (i, row) in grid.iter().enumerate() {
2077        if !is_sep_line[i] {
2078            continue;
2079        }
2080        for (j, &c) in row.iter().enumerate() {
2081            if c == '+' {
2082                col_set.insert(j);
2083            }
2084        }
2085    }
2086    let cols_pos: Vec<usize> = col_set.into_iter().collect();
2087    if cols_pos.len() < 2 {
2088        return None;
2089    }
2090    let ncols = cols_pos.len() - 1;
2091
2092    // Canonical row boundaries: line indices of sep-style lines.
2093    let row_seps: Vec<usize> = (0..nlines).filter(|&i| is_sep_line[i]).collect();
2094    if row_seps.len() < 2 {
2095        return None;
2096    }
2097    let nrows = row_seps.len() - 1;
2098
2099    // Block kind per row block: head if any non-sep line in the block came
2100    // from a TABLE_HEADER, foot if from TABLE_FOOTER, else body.
2101    let mut block_kind: Vec<&'static str> = vec!["body"; nrows];
2102    for r in 0..nrows {
2103        let start = row_seps[r];
2104        let end = row_seps[r + 1];
2105        for i in (start + 1)..end {
2106            match tagged[i].0 {
2107                SyntaxKind::TABLE_HEADER => block_kind[r] = "head",
2108                SyntaxKind::TABLE_FOOTER => block_kind[r] = "foot",
2109                _ => {}
2110            }
2111        }
2112    }
2113
2114    // Detect cells.
2115    let mut occupied = vec![vec![false; ncols]; nrows];
2116    // (start_row, start_col, row_span, col_span, content_text)
2117    let mut cells: Vec<(usize, usize, u32, u32, String)> = Vec::new();
2118    for sr in 0..nrows {
2119        for sc in 0..ncols {
2120            if occupied[sr][sc] {
2121                continue;
2122            }
2123            let i = row_seps[sr];
2124            let j = cols_pos[sc];
2125            if grid[i][j] != '+' {
2126                // No corner here — the canonical column is missing on this
2127                // sep line, meaning the cell that owns this position must
2128                // have been emitted earlier and `occupied` should already be
2129                // set. If not, the table is malformed; skip.
2130                continue;
2131            }
2132            let Some((er, ec, content)) = find_grid_cell(&grid, i, j, sr, sc, &cols_pos, &row_seps)
2133            else {
2134                continue;
2135            };
2136            let row_span = (er - sr) as u32;
2137            let col_span = (ec - sc) as u32;
2138            for r in sr..er {
2139                for c in sc..ec {
2140                    occupied[r][c] = true;
2141                }
2142            }
2143            cells.push((sr, sc, row_span, col_span, content));
2144        }
2145    }
2146
2147    // Group cells by row block and convert to GridCells. Within each block,
2148    // emit cells in canonical column order.
2149    let mut head_rows: Vec<Vec<GridCell>> = Vec::new();
2150    let mut body_rows: Vec<Vec<GridCell>> = Vec::new();
2151    let mut foot_rows: Vec<Vec<GridCell>> = Vec::new();
2152    for r in 0..nrows {
2153        let mut row_cells: Vec<&(usize, usize, u32, u32, String)> =
2154            cells.iter().filter(|(sr, _, _, _, _)| *sr == r).collect();
2155        row_cells.sort_by_key(|(_, sc, _, _, _)| *sc);
2156        let row: Vec<GridCell> = row_cells
2157            .into_iter()
2158            .map(|(_, _, rs, cs, text)| {
2159                let blocks = parse_grid_cell_text(text);
2160                GridCell {
2161                    row_span: *rs,
2162                    col_span: *cs,
2163                    blocks,
2164                }
2165            })
2166            .collect();
2167        match block_kind[r] {
2168            "head" => head_rows.push(row),
2169            "foot" => foot_rows.push(row),
2170            _ => body_rows.push(row),
2171        }
2172    }
2173
2174    // Column widths and alignments. Pick the alignment-bearing separator
2175    // for both (or fall back to the first separator).
2176    let alignment_sep = node
2177        .children()
2178        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2179        .find(|c| c.text().to_string().contains(':'))
2180        .or_else(|| {
2181            node.children()
2182                .find(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2183        })?;
2184    let widths = grid_dash_widths(&alignment_sep);
2185    let aligns_raw = alignment_sep.text().to_string();
2186    let aligns = if aligns_raw.contains(':') {
2187        grid_separator_aligns(&aligns_raw, ncols)
2188    } else {
2189        vec!["AlignDefault"; ncols]
2190    };
2191
2192    // Caption.
2193    let caption_inlines = node
2194        .children()
2195        .find(|c| c.kind() == SyntaxKind::TABLE_CAPTION)
2196        .map(|n| pipe_table_caption(&n))
2197        .unwrap_or_default();
2198    let (attr, caption_inlines) = extract_caption_attrs(caption_inlines);
2199
2200    Some(TableData {
2201        attr,
2202        caption: caption_inlines,
2203        aligns,
2204        widths: widths.into_iter().map(Some).collect(),
2205        head_rows,
2206        body_rows,
2207        foot_rows,
2208    })
2209}
2210
/// Find the smallest valid grid-table cell whose top-left `+` sits at
/// `(i, j)` in the char grid; `(sr, sc)` are the canonical row / column
/// indices of that corner.
///
/// Candidate right edges are tried left to right and, for each, candidate
/// bottom edges top to bottom. The first rectangle whose four edges are
/// drawn, whose bottom corners are `+`, and which is not cut in two by an
/// interior separator wins.
///
/// Returns `(end_row_idx, end_col_idx, content_text)`: the cell occupies
/// canonical rows `sr..end_row_idx` and canonical columns
/// `sc..end_col_idx`. The content is the text inside the cell with one
/// leading pad space removed per line, trailing whitespace trimmed,
/// leading/trailing blank lines dropped, and lines joined with `\n`.
/// Returns `None` when no rectangle closes.
#[allow(clippy::needless_range_loop)]
fn find_grid_cell(
    grid: &[Vec<char>],
    i: usize,
    j: usize,
    sr: usize,
    sc: usize,
    cols_pos: &[usize],
    row_seps: &[usize],
) -> Option<(usize, usize, String)> {
    // Characters that may form a horizontal / vertical cell border.
    let is_horizontal = |row: usize, col: usize| matches!(grid[row][col], '-' | '=' | ':' | '+');
    let is_vertical = |row: usize, col: usize| matches!(grid[row][col], '|' | '+');

    let last_row_idx = row_seps.len() - 1;
    let last_col_idx = cols_pos.len() - 1;

    for end_col in (sc + 1)..=last_col_idx {
        let right = cols_pos[end_col];
        // Columns strictly between the left and right border.
        let span = (j + 1)..right;

        // The top border has to be drawn all the way to `right`
        // (intermediate `+`s are fine). A `|` or blank means the cell
        // cannot reach this column or any column beyond it.
        if !span.clone().all(|c| is_horizontal(i, c)) {
            break;
        }

        for end_row in (sr + 1)..=last_row_idx {
            let bottom = row_seps[end_row];
            // Lines strictly between the top and bottom border.
            let inner_rows = (i + 1)..bottom;

            // A gap in the left border rules out this and every lower
            // bottom edge.
            if !inner_rows.clone().all(|r| is_vertical(r, j)) {
                break;
            }
            // Right border, bottom border and both bottom corners must
            // be present; otherwise try the next bottom edge.
            let closed = inner_rows.clone().all(|r| is_vertical(r, right))
                && span.clone().all(|c| is_horizontal(bottom, c))
                && grid[bottom][j] == '+'
                && grid[bottom][right] == '+';
            if !closed {
                continue;
            }
            // An interior line with `+` on both borders and border
            // characters all the way across would split the rectangle
            // horizontally, so it is not a single cell.
            let split_inside = inner_rows.clone().any(|r| {
                grid[r][j] == '+'
                    && grid[r][right] == '+'
                    && span.clone().all(|c| is_horizontal(r, c))
            });
            if split_inside {
                continue;
            }

            // Cell text: per interior line, the characters between the
            // borders minus one pad space and trailing whitespace.
            let lines: Vec<String> = inner_rows
                .map(|r| {
                    let raw: String = grid[r][j + 1..right].iter().collect();
                    raw.strip_prefix(' ')
                        .unwrap_or(&raw)
                        .trim_end()
                        .to_string()
                })
                .collect();
            // Keep only the span from the first to the last non-blank line.
            let content = match lines.iter().position(|s| !s.is_empty()) {
                Some(first) => {
                    let last = lines
                        .iter()
                        .rposition(|s| !s.is_empty())
                        .unwrap_or(first);
                    lines[first..=last].join("\n")
                }
                None => String::new(),
            };
            return Some((end_row, end_col, content));
        }
    }
    None
}
2296
2297/// Parse a grid-table cell's extracted text as block-level markdown via
2298/// panache, then convert top-level `Para`s to `Plain` (pandoc's
2299/// grid-table cell rule).
2300fn parse_grid_cell_text(text: &str) -> Vec<Block> {
2301    if text.trim().is_empty() {
2302        return Vec::new();
2303    }
2304    let opts = crate::ParserOptions {
2305        flavor: crate::Flavor::Pandoc,
2306        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
2307        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
2308        ..crate::ParserOptions::default()
2309    };
2310    let doc = crate::parse(text, Some(opts));
2311    let mut out = Vec::new();
2312    for child in doc.children() {
2313        if let Some(block) = block_from(&child) {
2314            let block = match block {
2315                Block::Para(inlines) => Block::Plain(inlines),
2316                other => other,
2317            };
2318            out.push(block);
2319        }
2320    }
2321    out
2322}
2323
2324/// Compute per-column widths from a grid-table separator like
2325/// `+--------+----------+----------+`. The `+` characters delimit
2326/// columns; each run of dashes/equals/colons between two `+` is one
2327/// column. Pandoc's formula (`Text/Pandoc/Parsing/GridTable.hs::
2328/// fractionalColumnWidths`):
2329/// ```text
2330/// raw[i] = dashes[i] + 1       (include separator width)
2331/// norm   = max(sum(raw) + count - 2, 72)   (72 = readerColumns)
2332/// width[i] = raw[i] / norm
2333/// ```
2334fn grid_dash_widths(separator: &SyntaxNode) -> Vec<f64> {
2335    let raw_text = separator.text().to_string();
2336    let line = raw_text.trim_end_matches(['\n', '\r']);
2337    let mut raw: Vec<usize> = Vec::new();
2338    let mut count: usize = 0;
2339    let mut in_col = false;
2340    for ch in line.chars() {
2341        match ch {
2342            '+' => {
2343                if in_col {
2344                    raw.push(count + 1);
2345                    count = 0;
2346                }
2347                in_col = true;
2348            }
2349            _ => {
2350                if in_col {
2351                    count += 1;
2352                }
2353            }
2354        }
2355    }
2356    if raw.is_empty() {
2357        return Vec::new();
2358    }
2359    let total: usize = raw.iter().sum();
2360    let count = raw.len();
2361    let norm = (total + count).saturating_sub(2).max(72) as f64;
2362    raw.into_iter().map(|w| w as f64 / norm).collect()
2363}
2364
/// Read per-column alignments off a grid-table separator line.
///
/// Every stretch between two consecutive `+` is one column segment and is
/// classified by `grid_segment_align`. The result always has exactly
/// `cols` entries: missing columns are filled with `AlignDefault`, surplus
/// ones are dropped.
fn grid_separator_aligns(raw: &str, cols: usize) -> Vec<&'static str> {
    let line = raw.trim_end_matches(['\n', '\r']);
    let pieces: Vec<&str> = line.split('+').collect();
    // The first piece precedes the first `+` and the last piece follows
    // the final `+`; only the pieces in between are enclosed columns.
    let mut aligns: Vec<&'static str> = if pieces.len() > 2 {
        pieces[1..pieces.len() - 1]
            .iter()
            .map(|seg| grid_segment_align(seg))
            .collect()
    } else {
        Vec::new()
    };
    aligns.resize(cols, "AlignDefault");
    aligns
}

/// Classify one separator segment by its colon markers: a colon at both
/// ends is centred, only at the start left, only at the end right, and
/// anything else (including an empty segment) default.
fn grid_segment_align(seg: &str) -> &'static str {
    match (seg.starts_with(':'), seg.ends_with(':')) {
        (true, true) => "AlignCenter",
        (true, false) => "AlignLeft",
        (false, true) => "AlignRight",
        (false, false) => "AlignDefault",
    }
}
2396
2397// ----- multiline table ----------------------------------------------------
2398
2399/// Project a `MULTILINE_TABLE` node. Multi-line tables have an opening
2400/// `-----` border, an optional header (one or more lines), a
2401/// `----- ----- -----` column separator, body rows (each row possibly
2402/// spans multiple lines, separated from the next row by a blank line),
2403/// and a closing `-----` border. Cell content within a row is joined with
2404/// `SoftBreak` between source lines. Column widths are
2405/// `(dash_count + 1) / 72`.
2406fn multiline_table(node: &SyntaxNode) -> Option<TableData> {
2407    // The column-separator (the dashes between header and body) is the
2408    // *second* TABLE_SEPARATOR if there is a header, else the first.
2409    let separators: Vec<SyntaxNode> = node
2410        .children()
2411        .filter(|c| c.kind() == SyntaxKind::TABLE_SEPARATOR)
2412        .collect();
2413    let header = node
2414        .children()
2415        .find(|c| c.kind() == SyntaxKind::TABLE_HEADER);
2416    let column_sep = if header.is_some() {
2417        separators.get(1).cloned()
2418    } else {
2419        separators.first().cloned()
2420    }?;
2421    let cols = simple_table_dash_runs(&column_sep);
2422    if cols.is_empty() {
2423        return None;
2424    }
2425    // Per pandoc `widthsFromIndices`: each non-last column's width is
2426    // `dashes + spaces_after` (= start of next column - start of this); the
2427    // last column's width is `dashes + 1` (the indices' bump). Normalize
2428    // by `max(total, 72)`.
2429    let raw: Vec<usize> = cols
2430        .iter()
2431        .enumerate()
2432        .map(|(i, (s, e))| {
2433            if i + 1 < cols.len() {
2434                cols[i + 1].0 - s
2435            } else {
2436                e - s + 2
2437            }
2438        })
2439        .collect();
2440    let total: usize = raw.iter().sum();
2441    let norm = (total.max(72)) as f64;
2442    let widths: Vec<f64> = raw.into_iter().map(|w| w as f64 / norm).collect();
2443    // Alignment from header (if present) or first data row, using the
2444    // simple-table flushness rule against the column-separator dash runs.
2445    let aligns = if let Some(h) = &header {
2446        simple_table_aligns(h, &cols)
2447    } else if let Some(r0) = node.children().find(|c| c.kind() == SyntaxKind::TABLE_ROW) {
2448        simple_table_aligns(&r0, &cols)
2449    } else {
2450        vec!["AlignDefault"; cols.len()]
2451    };
2452    let head_rows = match &header {
2453        Some(h) => vec![
2454            multiline_row_cells_blocks(h, &cols)
2455                .into_iter()
2456                .map(GridCell::no_span)
2457                .collect(),
2458        ],
2459        None => Vec::new(),
2460    };
2461    let body_rows: Vec<Vec<GridCell>> = node
2462        .children()
2463        .filter(|c| c.kind() == SyntaxKind::TABLE_ROW)
2464        .map(|r| {
2465            multiline_row_cells_blocks(&r, &cols)
2466                .into_iter()
2467                .map(GridCell::no_span)
2468                .collect()
2469        })
2470        .collect();
2471    let caption_inlines = node
2472        .children()
2473        .find(|c| c.kind() == SyntaxKind::TABLE_CAPTION)
2474        .map(|n| pipe_table_caption(&n))
2475        .unwrap_or_default();
2476    let (attr, caption_inlines) = extract_caption_attrs(caption_inlines);
2477    Some(TableData {
2478        attr,
2479        caption: caption_inlines,
2480        aligns,
2481        widths: widths.into_iter().map(Some).collect(),
2482        head_rows,
2483        body_rows,
2484        foot_rows: Vec::new(),
2485    })
2486}
2487
2488/// Slice each line of a multiline-table row by column ranges, then merge
2489/// each column's per-line text into a single Plain block with `SoftBreak`s
2490/// between source lines.
2491fn multiline_row_cells_blocks(row: &SyntaxNode, cols: &[(usize, usize)]) -> Vec<Vec<Block>> {
2492    let row_start: u32 = row.text_range().start().into();
2493    let raw = row.text().to_string();
2494    // Re-construct the row's per-line text. Tokens give us byte offsets, but
2495    // plain `.text()` is enough — split on '\n', then for each line, slice by
2496    // column ranges.
2497    let lines: Vec<&str> = raw.split_inclusive('\n').collect();
2498    let mut col_lines: Vec<Vec<String>> = vec![Vec::new(); cols.len()];
2499    let mut line_start_offset: usize = 0;
2500    for line in lines {
2501        let line_no_nl = line.trim_end_matches('\n');
2502        if line_no_nl.trim().is_empty() {
2503            line_start_offset += line.len();
2504            continue;
2505        }
2506        for (i, &(cs, ce)) in cols.iter().enumerate() {
2507            // Slice [cs..=ce] in chars from the line. Lines may be shorter.
2508            let slice = char_slice(line_no_nl, cs, ce + 1);
2509            let trimmed = slice.trim();
2510            if !trimmed.is_empty() {
2511                col_lines[i].push(trimmed.to_string());
2512            }
2513        }
2514        line_start_offset += line.len();
2515    }
2516    let _ = (row_start, line_start_offset);
2517    cols.iter()
2518        .enumerate()
2519        .map(|(i, _)| {
2520            let segments = &col_lines[i];
2521            if segments.is_empty() {
2522                return Vec::new();
2523            }
2524            // Re-parse the cell's joined text through panache's inline parser
2525            // so that `**bold**`, `` `code` ``, `[link](url)` etc. inside
2526            // multiline-table cells project as Strong/Code/Link rather than
2527            // raw Str (matches pandoc's `multilineTableHeader` behavior of
2528            // joining lines per column and parsing as Markdown).
2529            let joined = segments.join("\n");
2530            let inlines = parse_cell_text_inlines(&joined);
2531            if inlines.is_empty() {
2532                return Vec::new();
2533            }
2534            vec![Block::Plain(coalesce_inlines(inlines))]
2535        })
2536        .collect()
2537}
2538
2539/// Parse a cell text fragment through panache's inline parser and return its
2540/// inline content. Used for multiline-table cells whose per-line slices are
2541/// not seen by the outer parser as inline-bearing TABLE_CELLs (the parser
2542/// holds raw TEXT for lines past the first). Empty or whitespace-only input
2543/// returns an empty vec.
2544fn parse_cell_text_inlines(text: &str) -> Vec<Inline> {
2545    if text.trim().is_empty() {
2546        return Vec::new();
2547    }
2548    let opts = crate::ParserOptions {
2549        flavor: crate::Flavor::Pandoc,
2550        dialect: crate::Dialect::for_flavor(crate::Flavor::Pandoc),
2551        extensions: crate::Extensions::for_flavor(crate::Flavor::Pandoc),
2552        ..crate::ParserOptions::default()
2553    };
2554    let doc = crate::parse(text, Some(opts));
2555    for node in doc.descendants() {
2556        if matches!(node.kind(), SyntaxKind::PARAGRAPH | SyntaxKind::PLAIN) {
2557            return inlines_from(&node);
2558        }
2559    }
2560    Vec::new()
2561}
2562
/// Sub-slice of `s` covering characters `start_char..end_char` (end
/// exclusive), measured in `char`s rather than bytes. Either bound past the
/// end of `s` is clamped to the end; an inverted or empty range yields `""`.
fn char_slice(s: &str, start_char: usize, end_char: usize) -> &str {
    if start_char > end_char {
        return "";
    }
    // Byte offset at which the n-th char begins, or the string length when
    // `s` has fewer than n + 1 chars.
    let byte_offset = |n: usize| s.char_indices().nth(n).map_or(s.len(), |(b, _)| b);
    let from = byte_offset(start_char);
    let to = byte_offset(end_char);
    &s[from..to]
}
2580
2581fn list_block(node: &SyntaxNode) -> Block {
2582    let loose = is_loose_list(node);
2583    let items: Vec<Vec<Block>> = node
2584        .children()
2585        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
2586        .map(|item| list_item_blocks(&item, loose))
2587        .collect();
2588    if list_is_ordered(node) {
2589        let (start, style, delim) = ordered_list_attrs(node);
2590        Block::OrderedList(start, style, delim, items)
2591    } else {
2592        Block::BulletList(items)
2593    }
2594}
2595
2596fn list_is_ordered(node: &SyntaxNode) -> bool {
2597    let Some(item) = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM) else {
2598        return false;
2599    };
2600    let marker = item
2601        .children_with_tokens()
2602        .filter_map(|el| el.into_token())
2603        .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
2604        .map(|t| t.text().to_string())
2605        .unwrap_or_default();
2606    let trimmed = marker.trim();
2607    !trimmed.starts_with(['-', '+', '*'])
2608}
2609
2610fn ordered_list_attrs(node: &SyntaxNode) -> (usize, &'static str, &'static str) {
2611    let item = node.children().find(|c| c.kind() == SyntaxKind::LIST_ITEM);
2612    let marker = item
2613        .as_ref()
2614        .and_then(|i| {
2615            i.children_with_tokens()
2616                .filter_map(|el| el.into_token())
2617                .find(|t| t.kind() == SyntaxKind::LIST_MARKER)
2618                .map(|t| t.text().to_string())
2619        })
2620        .unwrap_or_default();
2621    let (mut start, style, delim) = classify_ordered_marker(marker.trim());
2622    if style == "Example" {
2623        let offset: u32 = node.text_range().start().into();
2624        if let Some(s) = REFS_CTX.with(|c| {
2625            c.borrow()
2626                .example_list_start_by_offset
2627                .get(&offset)
2628                .copied()
2629        }) {
2630            start = s;
2631        }
2632    }
2633    (start, style, delim)
2634}
2635
2636/// Map a list-marker token (e.g. `1.`, `iv)`, `(A)`, `#.`, `(@)`) to the
2637/// pandoc-native `(start, style, delim)` tuple. Mirrors pandoc's parser logic
2638/// in `Text/Pandoc/Parsing/Lists.hs`: try `decimal`, then `exampleNum` (`@`),
2639/// then `defaultNum` (`#`), then `romanOne` (single `i`/`I`), then alpha,
2640/// then multi-char roman, in that order; the first matching form wins. The
2641/// start value for Example lists is left at 1 — pandoc tracks numbering
2642/// across lists at the document level, which we don't model.
2643fn classify_ordered_marker(trimmed: &str) -> (usize, &'static str, &'static str) {
2644    // Strip surrounding parens / trailing period or paren to get (body, delim).
2645    let (body, delim) =
2646        if let Some(inner) = trimmed.strip_prefix('(').and_then(|s| s.strip_suffix(')')) {
2647            (inner, "TwoParens")
2648        } else if let Some(inner) = trimmed.strip_suffix(')') {
2649            (inner, "OneParen")
2650        } else if let Some(inner) = trimmed.strip_suffix('.') {
2651            (inner, "Period")
2652        } else {
2653            (trimmed, "DefaultDelim")
2654        };
2655
2656    // All-digit body → Decimal.
2657    if !body.is_empty() && body.chars().all(|c| c.is_ascii_digit()) {
2658        let start: usize = body.parse().unwrap_or(1);
2659        return (start, "Decimal", delim);
2660    }
2661
2662    // `#` (DefaultStyle) — when style is DefaultStyle pandoc forces
2663    // DefaultDelim regardless of the actual punctuation.
2664    if body == "#" {
2665        return (1, "DefaultStyle", "DefaultDelim");
2666    }
2667
2668    // `@` or `@label` (Example list).
2669    if let Some(rest) = body.strip_prefix('@')
2670        && rest
2671            .chars()
2672            .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
2673    {
2674        return (1, "Example", delim);
2675    }
2676
2677    // Single `i`/`I` is romanOne (tried before alpha, so `i.`/`I.` is Roman 1).
2678    if body == "i" {
2679        return (1, "LowerRoman", delim);
2680    }
2681    if body == "I" {
2682        return (1, "UpperRoman", delim);
2683    }
2684
2685    // Single lowercase / uppercase letter → alpha.
2686    if body.len() == 1
2687        && let Some(c) = body.chars().next()
2688    {
2689        if c.is_ascii_lowercase() {
2690            return ((c as u8 - b'a') as usize + 1, "LowerAlpha", delim);
2691        }
2692        if c.is_ascii_uppercase() {
2693            return ((c as u8 - b'A') as usize + 1, "UpperAlpha", delim);
2694        }
2695    }
2696
2697    // Multi-char roman lowercase/uppercase.
2698    if body
2699        .chars()
2700        .all(|c| matches!(c, 'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'))
2701        && let Some(n) = roman_to_int(body, false)
2702    {
2703        return (n, "LowerRoman", delim);
2704    }
2705    if body
2706        .chars()
2707        .all(|c| matches!(c, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'))
2708        && let Some(n) = roman_to_int(body, true)
2709    {
2710        return (n, "UpperRoman", delim);
2711    }
2712
2713    // Fallback — the parser accepted some marker we don't classify; emit
2714    // Decimal/Period so the list renders rather than dropping coverage.
2715    (1, "Decimal", delim)
2716}
2717
/// Convert a roman numeral string to its integer value, reading greedily
/// left to right and treating a smaller digit directly before a larger one
/// as a subtractive pair (in the spirit of pandoc's `romanNumeral`).
///
/// With `upper == false` each character is ASCII-uppercased before lookup;
/// with `upper == true` characters are looked up as written, so lowercase
/// digits are rejected. Returns `None` for an empty string or when any
/// character is not one of `I V X L C D M`. The digit sequence is not
/// otherwise checked for canonical form (`IIII` evaluates to 4).
fn roman_to_int(s: &str, upper: bool) -> Option<usize> {
    fn digit_value(c: char) -> Option<usize> {
        match c {
            'I' => Some(1),
            'V' => Some(5),
            'X' => Some(10),
            'L' => Some(50),
            'C' => Some(100),
            'D' => Some(500),
            'M' => Some(1000),
            _ => None,
        }
    }

    // Any unknown character invalidates the whole numeral.
    let digits = s
        .chars()
        .map(|c| if upper { c } else { c.to_ascii_uppercase() })
        .map(digit_value)
        .collect::<Option<Vec<usize>>>()?;
    if digits.is_empty() {
        return None;
    }

    let mut sum = 0usize;
    let mut idx = 0usize;
    while let Some(&current) = digits.get(idx) {
        match digits.get(idx + 1) {
            // Subtractive pair: consume both digits at once.
            Some(&following) if current < following => {
                sum += following - current;
                idx += 2;
            }
            _ => {
                sum += current;
                idx += 1;
            }
        }
    }
    Some(sum)
}
2755
2756fn list_item_blocks(item: &SyntaxNode, loose: bool) -> Vec<Block> {
2757    let mut out = Vec::new();
2758    let item_indent = list_item_content_offset(item);
2759    let task_checkbox = task_checkbox_for_item(item);
2760    let mut checkbox_emitted = false;
2761    for child in item.children() {
2762        match child.kind() {
2763            SyntaxKind::PLAIN => {
2764                let mut inlines = coalesce_inlines(inlines_from(&child));
2765                // Skip empty Plain blocks. The parser emits a PLAIN node for
2766                // any line under a list item, including the bare-marker line
2767                // (`-` followed by blank then indented content); pandoc only
2768                // counts blocks with actual inline content.
2769                if inlines.is_empty() {
2770                    continue;
2771                }
2772                if !checkbox_emitted && let Some(glyph) = task_checkbox {
2773                    inlines.insert(0, Inline::Space);
2774                    inlines.insert(0, Inline::Str(glyph.to_string()));
2775                    checkbox_emitted = true;
2776                }
2777                if loose {
2778                    out.push(Block::Para(inlines));
2779                } else {
2780                    out.push(Block::Plain(inlines));
2781                }
2782            }
2783            SyntaxKind::CODE_BLOCK => {
2784                // Both fenced and indented code blocks inside list items
2785                // carry the item-content indent on every body line in the
2786                // CST. Strip that offset so pandoc sees the same body it
2787                // would in a flat document. (For indented code, the helper
2788                // also strips the 4-space code-block indent on top of the
2789                // item offset; for fenced code, the offset strip alone is
2790                // sufficient.)
2791                out.push(indented_code_block_with_extra_strip(&child, item_indent));
2792            }
2793            _ => collect_block(&child, &mut out),
2794        }
2795    }
2796    out
2797}
2798
2799/// Pandoc renders `- [ ] foo` as `Plain [Str "\u{2610}", Space, Str "foo"]`
2800/// (and `[x]`/`[X]` as `\u{2612}`). The parser keeps `[ ]`/`[x]`/`[X]` as a
2801/// dedicated `TASK_CHECKBOX` token on the `LIST_ITEM`; this helper returns
2802/// the matching ballot-box glyph if one is present.
2803fn task_checkbox_for_item(item: &SyntaxNode) -> Option<&'static str> {
2804    item.children_with_tokens()
2805        .filter_map(|el| el.into_token())
2806        .find(|t| t.kind() == SyntaxKind::TASK_CHECKBOX)
2807        .map(|t| {
2808            let text = t.text();
2809            if text.contains('x') || text.contains('X') {
2810                "\u{2612}"
2811            } else {
2812                "\u{2610}"
2813            }
2814        })
2815}
2816
2817/// Number of leading-space columns each body-content line of `item` carries
2818/// in the CST. Mirrors pandoc's list-item content offset:
2819///   - bare-marker line (no WHITESPACE after LIST_MARKER): offset = marker
2820///     width (e.g. `1` for `-`, `2` for `1.`).
2821///   - marker followed by space(s): offset = marker width + WS width (the
2822///     visual column where content starts on the marker's line).
2823///
2824/// Nested list items also carry leading WHITESPACE *before* the LIST_MARKER
2825/// (the outer item's content offset). Include that so the cumulative depth
2826/// is captured — required for correctly stripping nested fenced/indented
2827/// code blocks.
2828///
2829/// When the LIST is itself a child of an outer container (e.g. a DEFINITION
2830/// body where the `- item` line is indented to the def-content column), the
2831/// per-item leading indent lives on the parent LIST as a WHITESPACE token
2832/// preceding each LIST_ITEM rather than inside the item. Pick that up too —
2833/// without it, code blocks nested inside such items would only have the
2834/// item-local indent stripped, leaving the outer-container offset behind.
2835fn list_item_content_offset(item: &SyntaxNode) -> usize {
2836    let parent_ws = parent_list_leading_ws(item);
2837    let mut marker_width = 0usize;
2838    let mut leading_ws = 0usize;
2839    let mut saw_marker = false;
2840    for el in item.children_with_tokens() {
2841        if let NodeOrToken::Token(t) = el {
2842            match t.kind() {
2843                SyntaxKind::WHITESPACE if !saw_marker => {
2844                    leading_ws += t.text().chars().count();
2845                }
2846                SyntaxKind::LIST_MARKER => {
2847                    marker_width += t.text().chars().count();
2848                    saw_marker = true;
2849                }
2850                SyntaxKind::WHITESPACE if saw_marker => {
2851                    return parent_ws + leading_ws + marker_width + t.text().chars().count();
2852                }
2853                _ if saw_marker => {
2854                    return parent_ws + leading_ws + marker_width;
2855                }
2856                _ => {}
2857            }
2858        } else if saw_marker {
2859            return parent_ws + leading_ws + marker_width;
2860        }
2861    }
2862    parent_ws + leading_ws + marker_width
2863}
2864
2865/// WHITESPACE token immediately preceding `item` on its parent LIST node, if
2866/// any. Used to recover the outer-container indent when the parser stores it
2867/// on the parent LIST (e.g. LIST inside DEFINITION) rather than as the item's
2868/// own leading WHITESPACE.
2869fn parent_list_leading_ws(item: &SyntaxNode) -> usize {
2870    let prev = item.prev_sibling_or_token();
2871    match prev {
2872        Some(NodeOrToken::Token(t)) if t.kind() == SyntaxKind::WHITESPACE => {
2873            t.text().chars().count()
2874        }
2875        _ => 0,
2876    }
2877}
2878
2879fn is_loose_list(node: &SyntaxNode) -> bool {
2880    let mut prev_was_item = false;
2881    for child in node.children_with_tokens() {
2882        if let NodeOrToken::Node(n) = child {
2883            if n.kind() == SyntaxKind::LIST_ITEM {
2884                prev_was_item = true;
2885            } else if n.kind() == SyntaxKind::BLANK_LINE
2886                && prev_was_item
2887                && n.next_sibling()
2888                    .map(|s| s.kind() == SyntaxKind::LIST_ITEM)
2889                    .unwrap_or(false)
2890            {
2891                return true;
2892            }
2893        }
2894    }
2895    for item in node
2896        .children()
2897        .filter(|c| c.kind() == SyntaxKind::LIST_ITEM)
2898    {
2899        if item.children().any(|c| c.kind() == SyntaxKind::PARAGRAPH) {
2900            return true;
2901        }
2902        // Per CommonMark/pandoc: a list is loose if any item directly
2903        // contains a blank line between two block-level children. The
2904        // single-item form (`- a\n\n  b`) only manifests as a BLANK_LINE
2905        // sandwiched between non-blank block children inside the item.
2906        if has_internal_blank_between_blocks(&item) {
2907            return true;
2908        }
2909    }
2910    false
2911}
2912
2913fn has_internal_blank_between_blocks(item: &SyntaxNode) -> bool {
2914    let mut saw_block_before = false;
2915    let mut pending_blank = false;
2916    for child in item.children() {
2917        match child.kind() {
2918            SyntaxKind::BLANK_LINE => {
2919                if saw_block_before {
2920                    pending_blank = true;
2921                }
2922            }
2923            // Bare-marker line emits an empty PLAIN (NEWLINE only); pandoc
2924            // doesn't count that as a block — its first real block is what
2925            // comes after the blank line.
2926            SyntaxKind::PLAIN if child_is_empty_plain(&child) => {}
2927            _ => {
2928                if pending_blank {
2929                    return true;
2930                }
2931                saw_block_before = true;
2932            }
2933        }
2934    }
2935    false
2936}
2937
2938fn child_is_empty_plain(node: &SyntaxNode) -> bool {
2939    !node.children_with_tokens().any(|el| match el {
2940        NodeOrToken::Token(t) => !matches!(t.kind(), SyntaxKind::NEWLINE | SyntaxKind::WHITESPACE),
2941        NodeOrToken::Node(_) => true,
2942    })
2943}
2944
2945// ----- inline walking -----------------------------------------------------
2946
2947fn inlines_from(parent: &SyntaxNode) -> Vec<Inline> {
2948    let mut out = Vec::new();
2949    let mut iter = parent.children_with_tokens().peekable();
2950    while let Some(el) = iter.next() {
2951        match el {
2952            NodeOrToken::Token(t) => push_token_inline(&t, &mut out),
2953            NodeOrToken::Node(n) if n.kind() == SyntaxKind::LATEX_COMMAND => {
2954                emit_latex_command_with_absorb(&n, &mut iter, &mut out);
2955            }
2956            NodeOrToken::Node(n) if n.kind() == SyntaxKind::CITATION => {
2957                emit_citation_with_absorb(&n, &mut iter, &mut out);
2958            }
2959            NodeOrToken::Node(n) => push_inline_node(&n, &mut out),
2960        }
2961    }
2962    // Trailing NEWLINE inside paragraphs/headings is structural. Strip a
2963    // single trailing SoftBreak so the inline list ends on Str/Space, matching
2964    // pandoc's "trim trailing line endings" rule.
2965    while matches!(out.last(), Some(Inline::SoftBreak)) {
2966        out.pop();
2967    }
2968    out
2969}
2970
2971/// Pandoc absorbs `@key [locator]` into a single AuthorInText `Cite` with
2972/// the bracketed text becoming the citation's suffix. The parser emits two
2973/// separate nodes: `CITATION` (bare `@key`, no surrounding brackets) and an
2974/// adjacent `LINK` whose bracketed text has no destination. When the
2975/// CITATION is bare and we can verify both the next siblings (a single
2976/// `TEXT` whitespace token followed by a `LINK` node lacking
2977/// `LINK_DEST_START`), consume both and absorb the link's text as suffix.
2978fn emit_citation_with_absorb<I>(
2979    node: &SyntaxNode,
2980    iter: &mut std::iter::Peekable<I>,
2981    out: &mut Vec<Inline>,
2982) where
2983    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
2984{
2985    let bracketed = node
2986        .children_with_tokens()
2987        .filter_map(|el| el.into_token())
2988        .any(|t| t.kind() == SyntaxKind::LINK_START);
2989    if bracketed {
2990        render_citation_inline(node, out, None);
2991        return;
2992    }
2993    // Bare AuthorInText form. Use rowan's sibling navigation (not the iter
2994    // peek) to verify the absorption pattern without consuming anything we
2995    // can't put back. Then if confirmed, advance the iter to skip both.
2996    let next_sibling_pair = node.next_sibling_or_token().and_then(|el1| {
2997        let t = el1.as_token().cloned()?;
2998        if t.kind() != SyntaxKind::TEXT || !t.text().starts_with(' ') {
2999            return None;
3000        }
3001        let space_text = t.text().to_string();
3002        let link_el = t.next_sibling_or_token()?;
3003        let link = link_el.as_node().cloned()?;
3004        // Pandoc absorbs `[locator]` after `@key` whether the brackets
3005        // resolve as a link or not; under the new IR, an unresolved
3006        // bracket-shape pattern is `UNRESOLVED_REFERENCE` rather than
3007        // shape-only `LINK`. Both shapes are valid locator candidates.
3008        if link.kind() != SyntaxKind::LINK && link.kind() != SyntaxKind::UNRESOLVED_REFERENCE {
3009            return None;
3010        }
3011        let has_dest = link
3012            .children_with_tokens()
3013            .filter_map(|el| el.into_token())
3014            .any(|tok| tok.kind() == SyntaxKind::LINK_DEST_START);
3015        if has_dest {
3016            return None;
3017        }
3018        let link_text = link
3019            .children()
3020            .find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3021            .map(|tt| tt.text().to_string())
3022            .unwrap_or_default();
3023        Some((space_text, link_text))
3024    });
3025    if let Some((_space_text, locator_text)) = next_sibling_pair {
3026        // Advance the iter past the consumed TEXT and LINK.
3027        iter.next();
3028        iter.next();
3029        render_citation_inline(node, out, Some(&locator_text));
3030    } else {
3031        render_citation_inline(node, out, None);
3032    }
3033}
3034
3035/// Pandoc's tex inline reader absorbs trailing horizontal whitespace into the
3036/// raw command when (and only when) the command is `\letters` with no brace
3037/// arguments — `\foo bar` becomes `RawInline tex "\\foo "` + `Str "bar"`,
3038/// while `\frac{a}{b} bar` keeps the space outside (`RawInline tex
3039/// "\\frac{a}{b}"` + `Space` + `Str "bar"`). The discriminator is the last
3040/// byte of the command text: ASCII letter → absorb, otherwise → don't.
3041fn emit_latex_command_with_absorb<I>(
3042    node: &SyntaxNode,
3043    iter: &mut std::iter::Peekable<I>,
3044    out: &mut Vec<Inline>,
3045) where
3046    I: Iterator<Item = rowan::SyntaxElement<crate::syntax::PanacheLanguage>>,
3047{
3048    let mut content = node.text().to_string();
3049    let ends_in_letter = content
3050        .chars()
3051        .next_back()
3052        .is_some_and(|c| c.is_ascii_alphabetic());
3053    if ends_in_letter
3054        && let Some(NodeOrToken::Token(t)) = iter.peek()
3055        && t.kind() == SyntaxKind::TEXT
3056    {
3057        let text = t.text().to_string();
3058        let bytes = text.as_bytes();
3059        let mut absorbed = 0;
3060        while absorbed < bytes.len() && (bytes[absorbed] == b' ' || bytes[absorbed] == b'\t') {
3061            absorbed += 1;
3062        }
3063        if absorbed > 0 {
3064            content.push_str(&text[..absorbed]);
3065            out.push(Inline::RawInline("tex".to_string(), content));
3066            iter.next();
3067            let remainder = &text[absorbed..];
3068            if !remainder.is_empty() {
3069                push_text(remainder, out);
3070            }
3071            return;
3072        }
3073    }
3074    out.push(Inline::RawInline("tex".to_string(), content));
3075}
3076
/// Append the inline projection of a single child node to `out`.
///
/// Dispatches on the node kind: links, images, citations and unresolved
/// references each have a dedicated renderer (which may push more than one
/// inline); every other kind is converted by `inline_from_node` into
/// exactly one inline. Citations rendered through this entry point never
/// carry an absorbed locator (`None` is passed as the extra suffix).
fn push_inline_node(node: &SyntaxNode, out: &mut Vec<Inline>) {
    match node.kind() {
        SyntaxKind::LINK => render_link_inline(node, out),
        SyntaxKind::IMAGE_LINK => render_image_inline(node, out),
        SyntaxKind::CITATION => render_citation_inline(node, out, None),
        // Pandoc-native treats unresolved bracket-shape patterns as
        // literal text — the bracket bytes themselves are `Str "["`
        // and `Str "]"`, but inner inline structure (emphasis, math,
        // raw spans, etc.) survives. The Panache `UNRESOLVED_REFERENCE`
        // wrapper is a tooling concession; emit the bracket bytes as
        // `Str` and recurse into structural children so inner content
        // is preserved.
        SyntaxKind::UNRESOLVED_REFERENCE => render_unresolved_reference_inline(node, out),
        _ => out.push(inline_from_node(node)),
    }
}
3093
3094/// Project an UNRESOLVED_REFERENCE node as pandoc-native inlines.
3095///
3096/// Mirrors the unresolved fall-through of `render_link_inline`: try
3097/// `lookup_heading_id` for implicit-heading shortcut/full-reference
3098/// resolution at projection time (pandoc resolves heading IDs *during
3099/// inline rendering*; the parser's refdef map only carries explicit
3100/// `[label]: url` definitions). On miss, emit the original bracket
3101/// pattern as `Str "["`, inner inline structure (preserved via
3102/// `coalesce_inlines_keep_edges` so leading/trailing whitespace
3103/// survives, matching pandoc's `[ foo ]` → `Str "[", Space, Str "foo",
3104/// Space, Str "]"` behavior), then `Str "]"` (or `Str "][ref]"` for
3105/// full-reference form).
3106fn render_unresolved_reference_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
3107    let is_image = node
3108        .children()
3109        .any(|c| c.kind() == SyntaxKind::IMAGE_LINK_START);
3110    let text_node = if is_image {
3111        node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT)
3112    } else {
3113        node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT)
3114    };
3115    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
3116
3117    let text_label = text_node
3118        .as_ref()
3119        .map(|n| n.text().to_string())
3120        .unwrap_or_default();
3121    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
3122        Some(rn) => {
3123            let inner = rn.text().to_string();
3124            if inner.is_empty() {
3125                (text_label.clone(), true, String::new())
3126            } else {
3127                (inner.clone(), true, inner)
3128            }
3129        }
3130        None => (text_label.clone(), false, String::new()),
3131    };
3132
3133    // Implicit-heading-id resolution at projection time. Only for
3134    // link-shape (not image-shape) shortcut/full-ref/collapsed forms.
3135    if !is_image && let Some(id) = lookup_heading_id(&label) {
3136        let url = format!("#{id}");
3137        let resolved_text_inlines = text_node
3138            .as_ref()
3139            .map(|n| coalesce_inlines(inlines_from(n)))
3140            .unwrap_or_default();
3141        out.push(Inline::Link(
3142            extract_attr_from_node(node),
3143            resolved_text_inlines,
3144            url,
3145            String::new(),
3146        ));
3147        return;
3148    }
3149
3150    // Unresolved: emit the original markdown bytes, preserving inner
3151    // inline structure.
3152    let unresolved_text_inlines = text_node
3153        .as_ref()
3154        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
3155        .unwrap_or_default();
3156    let opener = if is_image { "![" } else { "[" };
3157    out.push(Inline::Str(opener.to_string()));
3158    out.extend(unresolved_text_inlines);
3159    let suffix = if has_second_brackets {
3160        format!("][{second_inner}]")
3161    } else {
3162        "]".to_string()
3163    };
3164    out.push(Inline::Str(suffix));
3165}
3166
3167/// Pandoc treats `(@label)` and bare `@label` as Example-list references
3168/// when the label was defined as an Example item; the inline becomes
3169/// `Str "N"` (just the digits — surrounding parens come from adjacent
3170/// source bytes which our coalesce pass merges back in). Otherwise we
3171/// project the CITATION node as a proper `Cite [Citation, ...] [Inline,
3172/// ...]` per pandoc's citation reader. `extra_suffix_text` carries an
3173/// absorbed `[locator]` (pandoc absorbs `@key [locator]` into the Cite as
3174/// the citation's suffix); the literal text reflects the absorbed bytes.
3175fn render_citation_inline(
3176    node: &SyntaxNode,
3177    out: &mut Vec<Inline>,
3178    extra_suffix_text: Option<&str>,
3179) {
3180    // Example-list resolution short-circuit (legacy carve-out).
3181    let first_key = node
3182        .children_with_tokens()
3183        .filter_map(|el| el.into_token())
3184        .find(|t| t.kind() == SyntaxKind::CITATION_KEY)
3185        .map(|t| t.text().to_string())
3186        .unwrap_or_default();
3187    let example_resolution =
3188        REFS_CTX.with(|c| c.borrow().example_label_to_num.get(&first_key).copied());
3189    if let Some(n) = example_resolution {
3190        out.push(Inline::Str(n.to_string()));
3191        return;
3192    }
3193
3194    let bracketed = node
3195        .children_with_tokens()
3196        .filter_map(|el| el.into_token())
3197        .any(|t| t.kind() == SyntaxKind::LINK_START);
3198
3199    let mut builders: Vec<CitationBuilder> = Vec::new();
3200    let mut current: Option<CitationBuilder> = None;
3201    let mut pending_prefix = String::new();
3202    for el in node.children_with_tokens() {
3203        let token = match el {
3204            NodeOrToken::Token(t) => t,
3205            _ => continue,
3206        };
3207        match token.kind() {
3208            SyntaxKind::LINK_START | SyntaxKind::LINK_DEST => {}
3209            SyntaxKind::CITATION_BRACE_OPEN | SyntaxKind::CITATION_BRACE_CLOSE => {}
3210            SyntaxKind::CITATION_MARKER => {
3211                if let Some(c) = current.take() {
3212                    builders.push(c);
3213                }
3214                let mode = if token.text() == "-@" {
3215                    CitationMode::SuppressAuthor
3216                } else if bracketed {
3217                    CitationMode::NormalCitation
3218                } else {
3219                    CitationMode::AuthorInText
3220                };
3221                current = Some(CitationBuilder::new(
3222                    std::mem::take(&mut pending_prefix),
3223                    mode,
3224                ));
3225            }
3226            SyntaxKind::CITATION_KEY => {
3227                if let Some(c) = &mut current {
3228                    c.id.push_str(token.text());
3229                }
3230            }
3231            SyntaxKind::CITATION_CONTENT => {
3232                if let Some(c) = &mut current {
3233                    c.suffix_raw.push_str(token.text());
3234                } else {
3235                    pending_prefix.push_str(token.text());
3236                }
3237            }
3238            SyntaxKind::CITATION_SEPARATOR => {
3239                if let Some(c) = current.take() {
3240                    builders.push(c);
3241                }
3242            }
3243            _ => {}
3244        }
3245    }
3246    if let Some(c) = current.take() {
3247        builders.push(c);
3248    }
3249
3250    // Absorbed `[locator]` text becomes additional suffix on the LAST
3251    // citation in the group (pandoc only absorbs into AuthorInText cites
3252    // anyway, which always have one citation in the group).
3253    if let Some(extra) = extra_suffix_text
3254        && let Some(last) = builders.last_mut()
3255    {
3256        if !last.suffix_raw.is_empty() && !extra.starts_with(' ') {
3257            last.suffix_raw.push(' ');
3258        }
3259        last.suffix_raw.push_str(extra);
3260    }
3261
3262    let note_offset: u32 = node.text_range().start().into();
3263    let note_num = REFS_CTX
3264        .with(|c| {
3265            c.borrow()
3266                .cite_note_num_by_offset
3267                .get(&note_offset)
3268                .copied()
3269        })
3270        .unwrap_or(1);
3271
3272    let projected: Vec<Citation> = builders
3273        .into_iter()
3274        .map(|b| b.into_citation(note_num))
3275        .collect();
3276
3277    // Build literal text from CITATION node text + any absorbed suffix.
3278    let mut literal = node.text().to_string();
3279    if let Some(extra) = extra_suffix_text {
3280        literal.push(' ');
3281        literal.push('[');
3282        literal.push_str(extra);
3283        literal.push(']');
3284    }
3285    let text_inlines = literal_inlines(&literal);
3286
3287    out.push(Inline::Cite(projected, text_inlines));
3288}
3289
3290/// Internal builder for a single Citation while walking the CITATION node's
3291/// tokens. `prefix_raw` and `suffix_raw` capture the raw `CITATION_CONTENT`
3292/// text segments before / after the key; they are inline-parsed (with smart
3293/// transformations applied via `coalesce_inlines`) once the builder is
3294/// finalized.
3295struct CitationBuilder {
3296    id: String,
3297    prefix_raw: String,
3298    suffix_raw: String,
3299    mode: CitationMode,
3300}
3301
3302impl CitationBuilder {
3303    fn new(prefix_raw: String, mode: CitationMode) -> Self {
3304        Self {
3305            id: String::new(),
3306            prefix_raw,
3307            suffix_raw: String::new(),
3308            mode,
3309        }
3310    }
3311
3312    fn into_citation(self, note_num: i64) -> Citation {
3313        let prefix = parse_cite_affix_inlines(self.prefix_raw.trim_end(), true);
3314        let suffix = parse_cite_affix_inlines(&self.suffix_raw, false);
3315        Citation {
3316            id: self.id,
3317            prefix,
3318            suffix,
3319            mode: self.mode,
3320            note_num,
3321            hash: 0,
3322        }
3323    }
3324}
3325
3326/// Parse a citation prefix or suffix raw-text fragment as inlines, applying
3327/// pandoc's smart transformations (NBSP after abbreviations, en-dash for
3328/// `--`, smart apostrophes/quotes). For prefixes, we trim leading whitespace
3329/// (pandoc's prefix never starts with Space). For suffixes, leading whitespace
3330/// is preserved so `[@key, suffix]` produces `[Str ",", Space, Str "suffix"]`.
3331///
3332/// We wrap the raw text with a benign `Z ` prefix before reparsing, then
3333/// strip the resulting leading `Str "Z"` + `Space`. This is necessary because
3334/// panache's block parser would otherwise misclassify text starting with
3335/// (e.g.) `p. ` as an alphabetical list marker, dropping the `p.` from the
3336/// resulting inline stream entirely.
3337fn parse_cite_affix_inlines(raw: &str, is_prefix: bool) -> Vec<Inline> {
3338    if raw.is_empty() {
3339        return Vec::new();
3340    }
3341    let trimmed = if is_prefix { raw.trim_start() } else { raw };
3342    if trimmed.is_empty() {
3343        return Vec::new();
3344    }
3345    let leading_space = !is_prefix && trimmed.starts_with([' ', '\t']);
3346    let work = trimmed.trim_start_matches([' ', '\t']);
3347    if work.is_empty() {
3348        return if leading_space {
3349            vec![Inline::Space]
3350        } else {
3351            Vec::new()
3352        };
3353    }
3354    let wrapped = format!("Z {work}");
3355    let inlines = parse_cell_text_inlines(&wrapped);
3356    let mut coalesced = coalesce_inlines(inlines);
3357    // Strip the leading `Z` sentinel + Space.
3358    if matches!(coalesced.first(), Some(Inline::Str(s)) if s == "Z") {
3359        coalesced.remove(0);
3360        if matches!(coalesced.first(), Some(Inline::Space)) {
3361            coalesced.remove(0);
3362        }
3363    }
3364    if leading_space {
3365        coalesced.insert(0, Inline::Space);
3366    }
3367    coalesced
3368}
3369
3370/// Tokenize raw input into the literal `[Inline]` payload that pandoc emits
3371/// as the second argument of `Cite`. This is a lossless representation of
3372/// the original bytes (including brackets, semicolons, `*`, `**`, etc.) —
3373/// no markup parsing, no smart-typography. Newlines become `SoftBreak`,
3374/// runs of spaces/tabs become a single `Space`.
3375fn literal_inlines(text: &str) -> Vec<Inline> {
3376    let mut out: Vec<Inline> = Vec::new();
3377    let mut buf = String::new();
3378    for ch in text.chars() {
3379        match ch {
3380            ' ' | '\t' => {
3381                if !buf.is_empty() {
3382                    out.push(Inline::Str(std::mem::take(&mut buf)));
3383                }
3384                if !matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
3385                    out.push(Inline::Space);
3386                }
3387            }
3388            '\n' => {
3389                if !buf.is_empty() {
3390                    out.push(Inline::Str(std::mem::take(&mut buf)));
3391                }
3392                if matches!(out.last(), Some(Inline::Space)) {
3393                    out.pop();
3394                }
3395                out.push(Inline::SoftBreak);
3396            }
3397            _ => buf.push(ch),
3398        }
3399    }
3400    if !buf.is_empty() {
3401        out.push(Inline::Str(buf));
3402    }
3403    out
3404}
3405
3406fn push_token_inline(
3407    t: &rowan::SyntaxToken<crate::syntax::PanacheLanguage>,
3408    out: &mut Vec<Inline>,
3409) {
3410    match t.kind() {
3411        SyntaxKind::TEXT => push_text(t.text(), out),
3412        SyntaxKind::WHITESPACE => out.push(Inline::Space),
3413        SyntaxKind::NEWLINE => out.push(Inline::SoftBreak),
3414        SyntaxKind::HARD_LINE_BREAK => out.push(Inline::LineBreak),
3415        SyntaxKind::ESCAPED_CHAR => {
3416            // \x — keep just the escaped character as a Str
3417            let s: String = t.text().chars().skip(1).collect();
3418            out.push(Inline::Str(s));
3419        }
3420        SyntaxKind::NONBREAKING_SPACE => out.push(Inline::Str("\u{a0}".to_string())),
3421        // Skip structural tokens (markers, brackets, fence bytes) that don't
3422        // contribute to the inline stream.
3423        _ => {}
3424    }
3425}
3426
3427fn push_text(text: &str, out: &mut Vec<Inline>) {
3428    let mut buf = String::new();
3429    for ch in text.chars() {
3430        if ch == ' ' || ch == '\t' {
3431            if !buf.is_empty() {
3432                out.push(Inline::Str(std::mem::take(&mut buf)));
3433            }
3434            out.push(Inline::Space);
3435        } else if ch == '\n' {
3436            if !buf.is_empty() {
3437                out.push(Inline::Str(std::mem::take(&mut buf)));
3438            }
3439            out.push(Inline::SoftBreak);
3440        } else {
3441            buf.push(ch);
3442        }
3443    }
3444    if !buf.is_empty() {
3445        out.push(Inline::Str(buf));
3446    }
3447}
3448
3449fn inline_from_node(node: &SyntaxNode) -> Inline {
3450    match node.kind() {
3451        SyntaxKind::EMPHASIS => {
3452            Inline::Emph(coalesce_inlines_keep_edges(inlines_from_marked(node)))
3453        }
3454        SyntaxKind::STRONG => {
3455            Inline::Strong(coalesce_inlines_keep_edges(inlines_from_marked(node)))
3456        }
3457        SyntaxKind::STRIKEOUT => {
3458            Inline::Strikeout(coalesce_inlines_keep_edges(inlines_from_marked(node)))
3459        }
3460        SyntaxKind::SUPERSCRIPT => {
3461            Inline::Superscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
3462        }
3463        SyntaxKind::SUBSCRIPT => {
3464            Inline::Subscript(coalesce_inlines_keep_edges(inlines_from_marked(node)))
3465        }
3466        SyntaxKind::INLINE_CODE => {
3467            let content: String = node
3468                .children_with_tokens()
3469                .filter_map(|el| el.into_token())
3470                .filter(|t| t.kind() == SyntaxKind::INLINE_CODE_CONTENT)
3471                .map(|t| t.text().to_string())
3472                .collect();
3473            Inline::Code(
3474                extract_attr_from_node(node),
3475                strip_inline_code_padding(&content),
3476            )
3477        }
3478        SyntaxKind::LINK | SyntaxKind::IMAGE_LINK | SyntaxKind::UNRESOLVED_REFERENCE => {
3479            // LINK / IMAGE_LINK / UNRESOLVED_REFERENCE render through
3480            // `push_inline_node` so reference resolution can emit
3481            // multiple inlines (resolved Link, or unresolved Str
3482            // fragments). This single-Inline path is unreachable;
3483            // emit Unsupported as a guard rather than silently
3484            // dropping.
3485            Inline::Unsupported(format!("{:?}", node.kind()))
3486        }
3487        SyntaxKind::AUTO_LINK => autolink_inline(node),
3488        SyntaxKind::INLINE_MATH => math_inline(node, "InlineMath"),
3489        SyntaxKind::DISPLAY_MATH => math_inline(node, "DisplayMath"),
3490        SyntaxKind::LATEX_COMMAND => latex_command_inline(node),
3491        SyntaxKind::BRACKETED_SPAN => bracketed_span_inline(node),
3492        SyntaxKind::INLINE_HTML => Inline::RawInline("html".to_string(), node.text().to_string()),
3493        SyntaxKind::FOOTNOTE_REFERENCE => footnote_reference_inline(node),
3494        SyntaxKind::INLINE_FOOTNOTE => inline_footnote_inline(node),
3495        other => Inline::Unsupported(format!("{other:?}")),
3496    }
3497}
3498
3499/// Inlines from a wrapper (Emph/Strong/...) where the structural markers are
3500/// child *nodes* (e.g. EMPHASIS_MARKER) rather than child tokens. We descend
3501/// through such marker children but skip their bytes.
3502fn inlines_from_marked(parent: &SyntaxNode) -> Vec<Inline> {
3503    let mut out = Vec::new();
3504    let mut iter = parent.children_with_tokens().peekable();
3505    while let Some(el) = iter.next() {
3506        match el {
3507            NodeOrToken::Token(t) => match t.kind() {
3508                SyntaxKind::EMPHASIS_MARKER
3509                | SyntaxKind::STRONG_MARKER
3510                | SyntaxKind::STRIKEOUT_MARKER
3511                | SyntaxKind::SUPERSCRIPT_MARKER
3512                | SyntaxKind::SUBSCRIPT_MARKER
3513                | SyntaxKind::MARK_MARKER => {}
3514                _ => push_token_inline(&t, &mut out),
3515            },
3516            NodeOrToken::Node(n) => match n.kind() {
3517                SyntaxKind::EMPHASIS_MARKER
3518                | SyntaxKind::STRONG_MARKER
3519                | SyntaxKind::STRIKEOUT_MARKER
3520                | SyntaxKind::SUPERSCRIPT_MARKER
3521                | SyntaxKind::SUBSCRIPT_MARKER
3522                | SyntaxKind::MARK_MARKER => {}
3523                _ if n.kind() == SyntaxKind::LATEX_COMMAND => {
3524                    emit_latex_command_with_absorb(&n, &mut iter, &mut out);
3525                }
3526                _ => push_inline_node(&n, &mut out),
3527            },
3528        }
3529    }
3530    out
3531}
3532
3533fn render_link_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
3534    let text_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_TEXT);
3535    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
3536    let has_dest_paren = node
3537        .children_with_tokens()
3538        .any(|el| matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::LINK_DEST_START));
3539
3540    if has_dest_paren {
3541        let text = text_node
3542            .as_ref()
3543            .map(|n| coalesce_inlines(inlines_from(n)))
3544            .unwrap_or_default();
3545        let (url, title) = dest_node
3546            .as_ref()
3547            .map(parse_link_dest)
3548            .unwrap_or((String::new(), String::new()));
3549        out.push(Inline::Link(extract_attr_from_node(node), text, url, title));
3550        return;
3551    }
3552
3553    // Reference-style link: shortcut [label], implicit [label][], or full
3554    // [text][ref]. Distinguish by presence/contents of LINK_REF.
3555    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
3556    let resolved_text_inlines = text_node
3557        .as_ref()
3558        .map(|n| coalesce_inlines(inlines_from(n)))
3559        .unwrap_or_default();
3560    let text_label = text_node
3561        .as_ref()
3562        .map(|n| n.text().to_string())
3563        .unwrap_or_default();
3564
3565    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
3566        Some(rn) => {
3567            let inner = rn.text().to_string();
3568            if inner.is_empty() {
3569                (text_label.clone(), true, String::new())
3570            } else {
3571                (inner.clone(), true, inner)
3572            }
3573        }
3574        None => (text_label.clone(), false, String::new()),
3575    };
3576
3577    if let Some((url, title)) = lookup_ref(&label) {
3578        out.push(Inline::Link(
3579            extract_attr_from_node(node),
3580            resolved_text_inlines,
3581            url,
3582            title,
3583        ));
3584        return;
3585    }
3586
3587    if let Some(id) = lookup_heading_id(&label) {
3588        let url = format!("#{id}");
3589        out.push(Inline::Link(
3590            extract_attr_from_node(node),
3591            resolved_text_inlines,
3592            url,
3593            String::new(),
3594        ));
3595        return;
3596    }
3597
3598    // Unresolved: emit the original markdown bytes as plain text. The reader
3599    // assembles `[<text>]`, optionally followed by `[<ref>]` for a full or
3600    // implicit reference. Using Str inlines here (rather than Link with empty
3601    // dest) matches pandoc's behavior of leaving unresolved references as raw
3602    // text in the output stream. Use keep_edges so leading/trailing whitespace
3603    // inside `[ ... ]` survives — pandoc preserves source whitespace for
3604    // unresolved references (`[ foo ]` → `Str "[", Space, Str "foo", Space,
3605    // Str "]"`), unlike resolved Links which strip edges.
3606    let unresolved_text_inlines = text_node
3607        .as_ref()
3608        .map(|n| coalesce_inlines_keep_edges(inlines_from(n)))
3609        .unwrap_or_default();
3610    out.push(Inline::Str("[".to_string()));
3611    out.extend(unresolved_text_inlines);
3612    let suffix = if has_second_brackets {
3613        format!("][{second_inner}]")
3614    } else {
3615        "]".to_string()
3616    };
3617    out.push(Inline::Str(suffix));
3618}
3619
3620fn render_image_inline(node: &SyntaxNode, out: &mut Vec<Inline>) {
3621    let alt_node = node.children().find(|c| c.kind() == SyntaxKind::IMAGE_ALT);
3622    let dest_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_DEST);
3623    let has_dest_paren = node.children_with_tokens().any(|el| {
3624        matches!(el, NodeOrToken::Token(t) if t.kind() == SyntaxKind::IMAGE_DEST_START
3625            || t.kind() == SyntaxKind::LINK_DEST_START)
3626    });
3627
3628    if has_dest_paren {
3629        let alt = alt_node
3630            .as_ref()
3631            .map(|n| coalesce_inlines(inlines_from(n)))
3632            .unwrap_or_default();
3633        let (url, title) = dest_node
3634            .as_ref()
3635            .map(parse_link_dest)
3636            .unwrap_or((String::new(), String::new()));
3637        out.push(Inline::Image(extract_attr_from_node(node), alt, url, title));
3638        return;
3639    }
3640
3641    let ref_node = node.children().find(|c| c.kind() == SyntaxKind::LINK_REF);
3642    let alt_inlines = alt_node
3643        .as_ref()
3644        .map(|n| coalesce_inlines(inlines_from(n)))
3645        .unwrap_or_default();
3646    let alt_label = alt_node
3647        .as_ref()
3648        .map(|n| n.text().to_string())
3649        .unwrap_or_default();
3650
3651    let (label, has_second_brackets, second_inner) = match ref_node.as_ref() {
3652        Some(rn) => {
3653            let inner = rn.text().to_string();
3654            if inner.is_empty() {
3655                (alt_label.clone(), true, String::new())
3656            } else {
3657                (inner.clone(), true, inner)
3658            }
3659        }
3660        None => (alt_label.clone(), false, String::new()),
3661    };
3662
3663    if let Some((url, title)) = lookup_ref(&label) {
3664        out.push(Inline::Image(
3665            extract_attr_from_node(node),
3666            alt_inlines,
3667            url,
3668            title,
3669        ));
3670        return;
3671    }
3672
3673    if let Some(id) = lookup_heading_id(&label) {
3674        let url = format!("#{id}");
3675        out.push(Inline::Image(
3676            extract_attr_from_node(node),
3677            alt_inlines,
3678            url,
3679            String::new(),
3680        ));
3681        return;
3682    }
3683
3684    out.push(Inline::Str("![".to_string()));
3685    out.extend(alt_inlines);
3686    let suffix = if has_second_brackets {
3687        format!("][{second_inner}]")
3688    } else {
3689        "]".to_string()
3690    };
3691    out.push(Inline::Str(suffix));
3692}
3693
/// Normalizes the content of an inline code span the way pandoc's inline
/// code reader (`Markdown.hs::code`) does: each internal `\n` becomes one
/// space, then leading and trailing whitespace is trimmed. Internal
/// whitespace runs are preserved.
///
/// Pandoc's `trim` removes only space, tab, CR and LF. `str::trim` would
/// additionally strip every other Unicode `White_Space` character (e.g.
/// U+00A0 NO-BREAK SPACE), which pandoc keeps as code content, so the
/// trimmed set is spelled out explicitly.
fn strip_inline_code_padding(s: &str) -> String {
    let collapsed = s.replace('\n', " ");
    collapsed
        .trim_matches(|c: char| matches!(c, ' ' | '\t' | '\r' | '\n'))
        .to_string()
}
3702
3703fn math_inline(node: &SyntaxNode, kind: &'static str) -> Inline {
3704    let mut content = String::new();
3705    for el in node.children_with_tokens() {
3706        if let NodeOrToken::Token(t) = el {
3707            match t.kind() {
3708                SyntaxKind::INLINE_MATH_MARKER | SyntaxKind::DISPLAY_MATH_MARKER => {}
3709                _ => content.push_str(t.text()),
3710            }
3711        }
3712    }
3713    Inline::Math(kind, content)
3714}
3715
3716fn autolink_inline(node: &SyntaxNode) -> Inline {
3717    let mut url = String::new();
3718    for el in node.children_with_tokens() {
3719        if let NodeOrToken::Token(t) = el
3720            && t.kind() == SyntaxKind::TEXT
3721        {
3722            url.push_str(t.text());
3723        }
3724    }
3725    // Pandoc treats `<foo@bar>` as an email autolink (class "email", `mailto:`
3726    // dest) when the body has no scheme but contains an `@`.
3727    let is_email = !url.contains("://") && !url.starts_with("mailto:") && url.contains('@');
3728    if is_email {
3729        let attr = Attr {
3730            id: String::new(),
3731            classes: vec!["email".to_string()],
3732            kvs: Vec::new(),
3733        };
3734        let dest = format!("mailto:{url}");
3735        return Inline::Link(attr, vec![Inline::Str(url)], dest, String::new());
3736    }
3737    // Pandoc only treats `<scheme:body>` as a URI autolink when `scheme` is
3738    // in its known-schemes allowlist (see pandoc/src/Text/Pandoc/URI.hs).
3739    // Otherwise the original `<...>` bytes are emitted as raw HTML.
3740    if !is_known_uri_scheme(&url) {
3741        return Inline::RawInline("html".to_string(), node.text().to_string());
3742    }
3743    let attr = Attr {
3744        id: String::new(),
3745        classes: vec!["uri".to_string()],
3746        kvs: Vec::new(),
3747    };
3748    Inline::Link(attr, vec![Inline::Str(url.clone())], url, String::new())
3749}
3750
3751/// Pandoc's URI scheme allowlist (IANA + a few unofficial ones). Mirrors
3752/// `pandoc/src/Text/Pandoc/URI.hs`. Lowercase comparison.
3753fn is_known_uri_scheme(url: &str) -> bool {
3754    let scheme_end = url.find(':');
3755    let Some(end) = scheme_end else {
3756        return false;
3757    };
3758    let scheme = url[..end].to_ascii_lowercase();
3759    PANDOC_KNOWN_SCHEMES.binary_search(&scheme.as_str()).is_ok()
3760}
3761
/// Pandoc-known URI schemes, sorted for `binary_search`. Mirrors
/// `pandoc/src/Text/Pandoc/URI.hs`'s `schemes` set.
///
/// Invariants to preserve when editing: every entry is lowercase (the
/// lookup in `is_known_uri_scheme` lowercases the candidate first) and the
/// entries are in ascending byte order (so `-` and `.` sort before digits,
/// and digits before letters); `binary_search` gives wrong answers on an
/// unsorted slice. `#[rustfmt::skip]` keeps the compact multi-column layout.
#[rustfmt::skip]
const PANDOC_KNOWN_SCHEMES: &[&str] = &[
    "aaa", "aaas", "about", "acap", "acct", "acr",
    "adiumxtra", "afp", "afs", "aim", "appdata", "apt",
    "attachment", "aw", "barion", "beshare", "bitcoin", "blob",
    "bolo", "browserext", "callto", "cap", "chrome", "chrome-extension",
    "cid", "coap", "coaps", "com-eventbrite-attendee", "content", "crid",
    "cvs", "data", "dav", "dict", "dis", "dlna-playcontainer",
    "dlna-playsingle", "dns", "dntp", "doi", "dtn", "dvb",
    "ed2k", "example", "facetime", "fax", "feed", "feedready",
    "file", "filesystem", "finger", "fish", "ftp", "gemini",
    "geo", "gg", "git", "gizmoproject", "go", "gopher",
    "graph", "gtalk", "h323", "ham", "hcp", "http",
    "https", "hxxp", "hxxps", "hydrazone", "iax", "icap",
    "icon", "im", "imap", "info", "iotdisco", "ipn",
    "ipp", "ipps", "irc", "irc6", "ircs", "iris",
    "iris.beep", "iris.lwz", "iris.xpc", "iris.xpcs", "isbn", "isostore",
    "itms", "jabber", "jar", "javascript", "jms", "keyparc",
    "lastfm", "ldap", "ldaps", "lvlt", "magnet", "mailserver",
    "mailto", "maps", "market", "message", "mid", "mms",
    "modem", "mongodb", "moz", "ms-access", "ms-browser-extension", "ms-drive-to",
    "ms-enrollment", "ms-excel", "ms-gamebarservices", "ms-getoffice", "ms-help", "ms-infopath",
    "ms-media-stream-id", "ms-officeapp", "ms-powerpoint", "ms-project", "ms-publisher", "ms-search-repair",
    "ms-secondary-screen-controller", "ms-secondary-screen-setup", "ms-settings", "ms-settings-airplanemode", "ms-settings-bluetooth", "ms-settings-camera",
    "ms-settings-cellular", "ms-settings-cloudstorage", "ms-settings-connectabledevices", "ms-settings-displays-topology", "ms-settings-emailandaccounts", "ms-settings-language",
    "ms-settings-location", "ms-settings-lock", "ms-settings-nfctransactions", "ms-settings-notifications", "ms-settings-power", "ms-settings-privacy",
    "ms-settings-proximity", "ms-settings-screenrotation", "ms-settings-wifi", "ms-settings-workplace", "ms-spd", "ms-sttoverlay",
    "ms-transit-to", "ms-virtualtouchpad", "ms-visio", "ms-walk-to", "ms-whiteboard", "ms-whiteboard-cmd",
    "ms-word", "msnim", "msrp", "msrps", "mtqp", "mumble",
    "mupdate", "mvn", "news", "nfs", "ni", "nih",
    "nntp", "notes", "ocf", "oid", "onenote", "onenote-cmd",
    "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform",
    "pmid", "pop", "pres", "prospero", "proxy", "psyc",
    "pwid", "qb", "query", "redis", "rediss", "reload",
    "res", "resource", "rmi", "rsync", "rtmfp", "rtmp",
    "rtsp", "rtsps", "rtspu", "secondlife", "service", "session",
    "sftp", "sgn", "shttp", "sieve", "sip", "sips",
    "skype", "smb", "sms", "smtp", "snews", "snmp",
    "soap.beep", "soap.beeps", "soldat", "spotify", "ssh", "steam",
    "stun", "stuns", "submit", "svn", "tag", "teamspeak",
    "tel", "teliaeid", "telnet", "tftp", "things", "thismessage",
    "tip", "tn3270", "tool", "turn", "turns", "tv",
    "udp", "unreal", "urn", "ut2004", "v-event", "vemmi",
    "ventrilo", "videotex", "view-source", "vnc", "wais", "webcal",
    "wpid", "ws", "wss", "wtai", "wyciwyg", "xcon",
    "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri",
    "ymsgr", "z39.50", "z39.50r", "z39.50s",
];
3812
3813fn footnote_reference_inline(node: &SyntaxNode) -> Inline {
3814    let Some(label) = footnote_label(node) else {
3815        return Inline::Unsupported("FOOTNOTE_REFERENCE".to_string());
3816    };
3817    let blocks = REFS_CTX.with(|c| {
3818        c.borrow()
3819            .footnotes
3820            .get(&label)
3821            .map(|bs| bs.iter().map(clone_block).collect::<Vec<_>>())
3822    });
3823    match blocks {
3824        Some(bs) => Inline::Note(bs),
3825        // Unresolved footnote reference: pandoc emits the original bytes as
3826        // text rather than a `Note []`. Keep the raw token text for now.
3827        None => Inline::Str(node.text().to_string()),
3828    }
3829}
3830
3831fn inline_footnote_inline(node: &SyntaxNode) -> Inline {
3832    let inlines = coalesce_inlines(inlines_from(node));
3833    if inlines.is_empty() {
3834        Inline::Note(Vec::new())
3835    } else {
3836        Inline::Note(vec![Block::Para(inlines)])
3837    }
3838}
3839
3840fn parse_link_dest(node: &SyntaxNode) -> (String, String) {
3841    // LINK_DEST holds the raw bytes between `(` and `)`. Split into URL and
3842    // optional quoted title, then percent-escape unsafe characters in the URL
3843    // to match pandoc's `escapeURI`.
3844    let raw = node.text().to_string();
3845    let trimmed = raw.trim();
3846    // `<URL>` form: pandoc strips the angle brackets, even if the URL
3847    // contains otherwise-ambiguous characters like spaces or parens.
3848    if let Some(rest) = trimmed.strip_prefix('<')
3849        && let Some(end) = rest.find('>')
3850    {
3851        let url = &rest[..end];
3852        let after = rest[end + 1..].trim();
3853        let title = parse_dest_title(after);
3854        return (escape_link_dest(url), title);
3855    }
3856    // URL/title boundary: a title starts with `"`, `'`, or `(` after
3857    // whitespace. Without one, the entire string is the URL — internal
3858    // spaces still get percent-escaped.
3859    let bytes = trimmed.as_bytes();
3860    let mut url_end = trimmed.len();
3861    let mut i = 0;
3862    while i < bytes.len() {
3863        if matches!(bytes[i], b' ' | b'\t' | b'\n') {
3864            let mut j = i;
3865            while j < bytes.len() && matches!(bytes[j], b' ' | b'\t' | b'\n') {
3866                j += 1;
3867            }
3868            if j < bytes.len() && matches!(bytes[j], b'"' | b'\'' | b'(') {
3869                url_end = i;
3870                break;
3871            }
3872            i = j;
3873        } else {
3874            i += 1;
3875        }
3876    }
3877    let url_raw = &trimmed[..url_end];
3878    let title = parse_dest_title(trimmed[url_end..].trim());
3879    (escape_link_dest(url_raw), title)
3880}
3881
/// Percent-escapes a link destination in the manner of pandoc's
/// `escapeURI`.
///
/// A character is escaped when it is whitespace according to
/// `char::is_whitespace` or one of the punctuation characters
/// `<>|"{}[]^` and the backtick. Escaping writes `%XX` (uppercase hex) for
/// each UTF-8 byte of the character, so non-ASCII whitespace such as U+00A0
/// becomes `%C2%A0`. Every other character is copied through unchanged.
fn escape_link_dest(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";
    const RESERVED: &str = "<>|\"{}[]^`";

    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_whitespace() || RESERVED.contains(ch) {
            let mut utf8 = [0u8; 4];
            for byte in ch.encode_utf8(&mut utf8).bytes() {
                escaped.push('%');
                escaped.push(char::from(HEX[usize::from(byte >> 4)]));
                escaped.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
3904
/// Extracts a link title from the text that follows the URL.
///
/// A title is recognised only when `s` starts with `"`, `'` or `(`. Its
/// content runs from just after that opener up to the LAST occurrence of the
/// matching closer (`"`, `'` or `)`), so closers inside the title are kept.
/// Returns an empty string when `s` is empty, does not start with an opener,
/// or has no closer after the opener.
fn parse_dest_title(s: &str) -> String {
    let closer = match s.chars().next() {
        Some('"') => '"',
        Some('\'') => '\'',
        Some('(') => ')',
        _ => return String::new(),
    };
    // The opener is a single ASCII byte, so slicing at 1 is on a boundary.
    let body = &s[1..];
    body.rfind(closer)
        .map(|end| body[..end].to_string())
        .unwrap_or_default()
}
3924
3925// ----- coalescing & helpers ----------------------------------------------
3926
/// Coalesces a raw inline stream with edge trimming enabled: delegates to
/// `coalesce_inlines_inner` with `trim_edges = true`, so leading and
/// trailing `Space`/`SoftBreak` inlines are removed from the result.
fn coalesce_inlines(input: Vec<Inline>) -> Vec<Inline> {
    coalesce_inlines_inner(input, true)
}
3930
/// Inside markup atoms (Emph/Strong/Strikeout/Sup/Sub), pandoc preserves
/// leading/trailing whitespace inside the wrapper — e.g. `*foo bar *` projects
/// as `Emph [Str "foo", Space, Str "bar", Space]`. Block-level paragraphs and
/// headers strip edge whitespace, but inline markup wrappers do not.
///
/// Delegates to `coalesce_inlines_inner` with `trim_edges = false`; all other
/// coalescing steps are the same as in `coalesce_inlines`.
fn coalesce_inlines_keep_edges(input: Vec<Inline>) -> Vec<Inline> {
    coalesce_inlines_inner(input, false)
}
3938
3939fn coalesce_inlines_inner(input: Vec<Inline>, trim_edges: bool) -> Vec<Inline> {
3940    let mut out: Vec<Inline> = Vec::with_capacity(input.len());
3941    for inline in input {
3942        if let Inline::Str(s) = inline {
3943            if let Some(Inline::Str(prev)) = out.last_mut() {
3944                prev.push_str(&s);
3945            } else {
3946                out.push(Inline::Str(s));
3947            }
3948        } else if let Inline::Space = inline {
3949            // Collapse runs of Space into a single Space; pandoc never emits
3950            // two consecutive Space tokens.
3951            if matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
3952                continue;
3953            }
3954            out.push(Inline::Space);
3955        } else if let Inline::SoftBreak = inline {
3956            // SoftBreak after Space: drop the trailing Space to match pandoc
3957            // (line-end whitespace is not preserved as Space).
3958            if matches!(out.last(), Some(Inline::Space)) {
3959                out.pop();
3960            }
3961            out.push(Inline::SoftBreak);
3962        } else {
3963            out.push(inline);
3964        }
3965    }
3966    if trim_edges {
3967        // Trim leading/trailing Space/SoftBreak — pandoc does not emit edge
3968        // whitespace inside a paragraph or header.
3969        while matches!(out.first(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
3970            out.remove(0);
3971        }
3972        while matches!(out.last(), Some(Inline::Space) | Some(Inline::SoftBreak)) {
3973            out.pop();
3974        }
3975    }
3976    // Pandoc's `smart` extension is on by default for markdown. Apply the
3977    // simple in-Str substitutions here (apostrophe, dashes, ellipsis), then
3978    // restructure paired straight quotes into `Quoted` nodes.
3979    for inline in out.iter_mut() {
3980        if let Inline::Str(s) = inline {
3981            let mut t = smart_intraword_apostrophe(s);
3982            t = smart_dashes_and_ellipsis(&t);
3983            *s = t;
3984        }
3985    }
3986    let out = smart_quote_pairs(out);
3987    apply_abbreviations(out)
3988}
3989
/// Pandoc's default abbreviation list (from `pandoc/data/abbreviations`).
///
/// Consumed by `matches_abbreviation_suffix`: a `Str` counts as an
/// abbreviation when it ends with one of these entries and the entry is
/// either the whole `Str` or is preceded by a char that is neither
/// alphanumeric nor `.`. When such a `Str` is followed by a `Space`,
/// `apply_abbreviations` replaces the space with a non-breaking space
/// appended to the `Str`.
///
/// Entries are kept in byte-wise sorted order. The lookup in
/// `matches_abbreviation_suffix` is a linear suffix scan, so it does not
/// depend on that ordering.
const PANDOC_ABBREVIATIONS: &[&str] = &[
    "Apr.", "Aug.", "Bros.", "Capt.", "Co.", "Corp.", "Dec.", "Dr.", "Feb.", "Fr.", "Gen.", "Gov.",
    "Hon.", "Inc.", "Jan.", "Jr.", "Jul.", "Jun.", "Ltd.", "M.A.", "M.D.", "Mar.", "Mr.", "Mrs.",
    "Ms.", "No.", "Nov.", "Oct.", "Ph.D.", "Pres.", "Prof.", "Rep.", "Rev.", "Sen.", "Sep.",
    "Sept.", "Sgt.", "Sr.", "St.", "aet.", "aetat.", "al.", "bk.", "c.", "cf.", "ch.", "chap.",
    "chs.", "col.", "cp.", "d.", "e.g.", "ed.", "eds.", "esp.", "f.", "fasc.", "ff.", "fig.",
    "fl.", "fol.", "fols.", "i.e.", "ill.", "incl.", "n.", "n.b.", "nn.", "p.", "pp.", "pt.",
    "q.v.", "s.v.", "s.vv.", "saec.", "sec.", "univ.", "viz.", "vol.", "vs.",
];
4004
4005fn matches_abbreviation_suffix(s: &str) -> bool {
4006    for &abbr in PANDOC_ABBREVIATIONS {
4007        if let Some(prefix) = s.strip_suffix(abbr) {
4008            if prefix.is_empty() {
4009                return true;
4010            }
4011            let last = prefix.chars().next_back().unwrap();
4012            if !last.is_alphanumeric() && last != '.' {
4013                return true;
4014            }
4015        }
4016    }
4017    false
4018}
4019
4020/// Apply pandoc's `+abbreviations` extension as a post-pass over a flat inline
4021/// list. For each `Str` ending in a known abbreviation followed by `Space`,
4022/// drop the `Space`, append `\u{a0}` (NBSP) to the `Str`, and merge the
4023/// following `Str` (if any) into it. Recurses into `Quoted` content because
4024/// `Quoted` is built inside `smart_quote_pairs` after the parent
4025/// `coalesce_inlines_inner` already ran on its source list, so its content
4026/// won't have been abbreviation-processed yet. Other inline wrappers (`Emph`,
4027/// `Strong`, `Link`, `Image`, `Note`, …) are constructed via their own
4028/// `coalesce_inlines_*` call, so their contents are already processed.
4029fn apply_abbreviations(inlines: Vec<Inline>) -> Vec<Inline> {
4030    let inlines: Vec<Inline> = inlines
4031        .into_iter()
4032        .map(|inline| match inline {
4033            Inline::Quoted(kind, content) => Inline::Quoted(kind, apply_abbreviations(content)),
4034            other => other,
4035        })
4036        .collect();
4037    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
4038    let mut iter = inlines.into_iter().peekable();
4039    while let Some(inline) = iter.next() {
4040        if let Inline::Str(ref s) = inline
4041            && matches_abbreviation_suffix(s)
4042            && matches!(iter.peek(), Some(Inline::Space))
4043        {
4044            // Drop the Space.
4045            iter.next();
4046            let Inline::Str(mut new_s) = inline else {
4047                unreachable!()
4048            };
4049            new_s.push('\u{a0}');
4050            // Merge with the following Str if present.
4051            if let Some(Inline::Str(_)) = iter.peek()
4052                && let Some(Inline::Str(next_s)) = iter.next()
4053            {
4054                new_s.push_str(&next_s);
4055            }
4056            out.push(Inline::Str(new_s));
4057        } else {
4058            out.push(inline);
4059        }
4060    }
4061    out
4062}
4063
/// Restructures paired straight quotes (`"…"` / `'…'`) in a flat inline list
/// into `Inline::Quoted("DoubleQuote" | "SingleQuote", content)` nodes.
///
/// The input is consumed but elements are copied out with `clone_inline`,
/// because the scan addresses `inlines` by index. Inlines that are not part
/// of a recognized quote pair are emitted unchanged and in order. The content
/// of a produced `Quoted` is taken verbatim from the input span (minus the two
/// quote chars); it is not scanned again for nested quote pairs here.
fn smart_quote_pairs(inlines: Vec<Inline>) -> Vec<Inline> {
    // Walk left-to-right, when a Str starts with a straight quote and the
    // previous element is a "boundary" (see `is_boundary`), look
    // ahead for a matching close quote (Str ending with same quote char,
    // followed by a boundary). Wrap the inlines in between in a `Quoted` node.
    // Only handle quotes at Str boundaries; embedded or interleaved quotes are
    // not restructured (kept as-is) — pandoc has more nuanced rules but this
    // covers the common natural-text patterns in the corpus.

    // A position can open a quote when nothing precedes it, when it follows
    // Space/SoftBreak/LineBreak, or when the preceding Str ends in a
    // non-alphanumeric char. Any other preceding inline is not a boundary.
    fn is_boundary(prev: Option<&Inline>) -> bool {
        match prev {
            None => true,
            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
            Some(Inline::Str(s)) => s.chars().last().is_some_and(|c| !c.is_alphanumeric()),
            _ => false,
        }
    }
    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
    let n = inlines.len();
    // consumed[k] is set once inlines[k] has been emitted, either on its own
    // or as part of a `Quoted` span.
    let mut consumed = vec![false; n];
    for i in 0..n {
        if consumed[i] {
            continue;
        }
        // Try to detect an open quote at position i.
        let Inline::Str(s) = &inlines[i] else {
            out.push(clone_inline(&inlines[i]));
            consumed[i] = true;
            continue;
        };
        let first = s.chars().next();
        let quote = match first {
            Some('"') => Some('"'),
            Some('\'') => Some('\''),
            _ => None,
        };
        // Open quote condition: previous inline is boundary, AND either
        // (a) the Str has more chars after the quote and the next char is
        //     non-space (open quote attaches to a word in the same Str), or
        // (b) the Str is *only* the quote and the next inline is a markup
        //     atom (Emph/Strong/...), so the quote attaches across atoms.
        // The boundary test looks at what has already been emitted, so a
        // preceding `Quoted` node does not count as a boundary.
        let prev_is_boundary = is_boundary(out.last());
        let str_has_more = s.chars().count() > 1;
        let next_char_is_word = s.chars().nth(1).is_some_and(|c| !c.is_whitespace());
        let next_is_markup_atom = matches!(
            inlines.get(i + 1),
            Some(
                Inline::Emph(_)
                    | Inline::Strong(_)
                    | Inline::Strikeout(_)
                    | Inline::Superscript(_)
                    | Inline::Subscript(_)
                    | Inline::Code(_, _)
            )
        );
        let attaches =
            (str_has_more && next_char_is_word) || (!str_has_more && next_is_markup_atom);
        if let Some(q) = quote
            && prev_is_boundary
            && attaches
        {
            // Find the matching close.
            if let Some(close_idx) = find_matching_close(&inlines, i, q, &consumed) {
                // Build content: inlines from i to close_idx (inclusive),
                // strip the leading quote from inlines[i] and trailing quote
                // from inlines[close_idx].
                let kind = if q == '"' {
                    "DoubleQuote"
                } else {
                    "SingleQuote"
                };
                let mut content: Vec<Inline> = Vec::new();
                for j in i..=close_idx {
                    if consumed[j] {
                        continue;
                    }
                    let inline = &inlines[j];
                    if j == i && j == close_idx {
                        // Open and close in the same Str — strip both ends.
                        if let Inline::Str(s) = inline {
                            let mut chars: Vec<char> = s.chars().collect();
                            if chars.len() >= 2 {
                                chars.remove(0);
                                chars.pop();
                            }
                            let stripped: String = chars.into_iter().collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == i {
                        // Opening Str: drop the leading quote char; a Str that
                        // was only the quote contributes nothing.
                        if let Inline::Str(s) = inline {
                            let stripped: String = s.chars().skip(1).collect();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else if j == close_idx {
                        // Closing Str: drop the trailing quote char.
                        if let Inline::Str(s) = inline {
                            let mut stripped: String = s.chars().collect();
                            stripped.pop();
                            if !stripped.is_empty() {
                                content.push(Inline::Str(stripped));
                            }
                        }
                    } else {
                        content.push(clone_inline(inline));
                    }
                    consumed[j] = true;
                }
                out.push(Inline::Quoted(kind, content));
                continue;
            }
        }
        // Not an opening quote (or no close found): emit the Str unchanged.
        out.push(clone_inline(&inlines[i]));
        consumed[i] = true;
    }
    out
}
4182
4183fn find_matching_close(
4184    inlines: &[Inline],
4185    open_idx: usize,
4186    quote: char,
4187    consumed: &[bool],
4188) -> Option<usize> {
4189    // First check: same Str ends with the matching quote (close in same Str).
4190    if let Inline::Str(s) = &inlines[open_idx]
4191        && s.chars().count() >= 3
4192        && s.ends_with(quote)
4193    {
4194        // Need to confirm the next inline (after this Str) is a boundary.
4195        let next = inlines.get(open_idx + 1);
4196        let after_is_boundary = match next {
4197            None => true,
4198            Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
4199            Some(Inline::Str(s)) => s.chars().next().is_some_and(|c| !c.is_alphanumeric()),
4200            _ => false,
4201        };
4202        if after_is_boundary {
4203            return Some(open_idx);
4204        }
4205    }
4206    // Otherwise, scan forward for a Str ending with the quote and followed by
4207    // a boundary.
4208    let n = inlines.len();
4209    let mut j = open_idx + 1;
4210    while j < n {
4211        if consumed[j] {
4212            return None;
4213        }
4214        match &inlines[j] {
4215            Inline::Str(s) => {
4216                if s.ends_with(quote) {
4217                    let next = inlines.get(j + 1);
4218                    let after_is_boundary = match next {
4219                        None => true,
4220                        Some(Inline::Space | Inline::SoftBreak | Inline::LineBreak) => true,
4221                        Some(Inline::Str(s)) => {
4222                            s.chars().next().is_some_and(|c| !c.is_alphanumeric())
4223                        }
4224                        _ => false,
4225                    };
4226                    if after_is_boundary {
4227                        return Some(j);
4228                    }
4229                }
4230            }
4231            Inline::Space | Inline::SoftBreak | Inline::LineBreak => {}
4232            // Don't span over markup atoms — keep search cheap and predictable.
4233            _ => {}
4234        }
4235        j += 1;
4236        // Cap search range — natural quoted spans are short.
4237        if j - open_idx > 32 {
4238            return None;
4239        }
4240    }
4241    None
4242}
4243
4244fn clone_inline(inline: &Inline) -> Inline {
4245    match inline {
4246        Inline::Str(s) => Inline::Str(s.clone()),
4247        Inline::Space => Inline::Space,
4248        Inline::SoftBreak => Inline::SoftBreak,
4249        Inline::LineBreak => Inline::LineBreak,
4250        Inline::Emph(c) => Inline::Emph(c.iter().map(clone_inline).collect()),
4251        Inline::Strong(c) => Inline::Strong(c.iter().map(clone_inline).collect()),
4252        Inline::Strikeout(c) => Inline::Strikeout(c.iter().map(clone_inline).collect()),
4253        Inline::Superscript(c) => Inline::Superscript(c.iter().map(clone_inline).collect()),
4254        Inline::Subscript(c) => Inline::Subscript(c.iter().map(clone_inline).collect()),
4255        Inline::Code(a, s) => Inline::Code(a.clone(), s.clone()),
4256        Inline::Link(a, t, u, ti) => Inline::Link(
4257            a.clone(),
4258            t.iter().map(clone_inline).collect(),
4259            u.clone(),
4260            ti.clone(),
4261        ),
4262        Inline::Image(a, t, u, ti) => Inline::Image(
4263            a.clone(),
4264            t.iter().map(clone_inline).collect(),
4265            u.clone(),
4266            ti.clone(),
4267        ),
4268        Inline::Math(k, c) => Inline::Math(k, c.clone()),
4269        Inline::Span(a, c) => Inline::Span(a.clone(), c.iter().map(clone_inline).collect()),
4270        Inline::RawInline(f, c) => Inline::RawInline(f.clone(), c.clone()),
4271        Inline::Quoted(k, c) => Inline::Quoted(k, c.iter().map(clone_inline).collect()),
4272        Inline::Note(blocks) => Inline::Note(blocks.iter().map(clone_block).collect()),
4273        Inline::Cite(citations, text) => Inline::Cite(
4274            citations
4275                .iter()
4276                .map(|c| Citation {
4277                    id: c.id.clone(),
4278                    prefix: c.prefix.iter().map(clone_inline).collect(),
4279                    suffix: c.suffix.iter().map(clone_inline).collect(),
4280                    mode: c.mode,
4281                    note_num: c.note_num,
4282                    hash: c.hash,
4283                })
4284                .collect(),
4285            text.iter().map(clone_inline).collect(),
4286        ),
4287        Inline::Unsupported(s) => Inline::Unsupported(s.clone()),
4288    }
4289}
4290
4291fn clone_block(b: &Block) -> Block {
4292    match b {
4293        Block::Para(c) => Block::Para(c.iter().map(clone_inline).collect()),
4294        Block::Plain(c) => Block::Plain(c.iter().map(clone_inline).collect()),
4295        Block::Header(lvl, a, c) => {
4296            Block::Header(*lvl, a.clone(), c.iter().map(clone_inline).collect())
4297        }
4298        Block::BlockQuote(blocks) => Block::BlockQuote(blocks.iter().map(clone_block).collect()),
4299        Block::CodeBlock(a, s) => Block::CodeBlock(a.clone(), s.clone()),
4300        Block::HorizontalRule => Block::HorizontalRule,
4301        Block::BulletList(items) => Block::BulletList(
4302            items
4303                .iter()
4304                .map(|item| item.iter().map(clone_block).collect())
4305                .collect(),
4306        ),
4307        Block::OrderedList(start, style, delim, items) => Block::OrderedList(
4308            *start,
4309            style,
4310            delim,
4311            items
4312                .iter()
4313                .map(|item| item.iter().map(clone_block).collect())
4314                .collect(),
4315        ),
4316        Block::RawBlock(f, c) => Block::RawBlock(f.clone(), c.clone()),
4317        Block::Table(_) => Block::Unsupported("Table".to_string()),
4318        Block::Div(a, blocks) => Block::Div(a.clone(), blocks.iter().map(clone_block).collect()),
4319        Block::LineBlock(lines) => Block::LineBlock(
4320            lines
4321                .iter()
4322                .map(|line| line.iter().map(clone_inline).collect())
4323                .collect(),
4324        ),
4325        Block::DefinitionList(items) => Block::DefinitionList(
4326            items
4327                .iter()
4328                .map(|(term, defs)| {
4329                    (
4330                        term.iter().map(clone_inline).collect(),
4331                        defs.iter()
4332                            .map(|d| d.iter().map(clone_block).collect())
4333                            .collect(),
4334                    )
4335                })
4336                .collect(),
4337        ),
4338        Block::Figure(a, caption, body) => Block::Figure(
4339            a.clone(),
4340            caption.iter().map(clone_block).collect(),
4341            body.iter().map(clone_block).collect(),
4342        ),
4343        Block::Unsupported(s) => Block::Unsupported(s.clone()),
4344    }
4345}
4346
/// Rewrites ASCII punctuation runs into their typographic forms:
/// `---` → em dash (U+2014), `--` → en dash (U+2013), `...` → ellipsis
/// (U+2026). Matching is greedy from the left, so `----` becomes an em dash
/// followed by `-`. All other text is copied through unchanged.
fn smart_dashes_and_ellipsis(s: &str) -> String {
    // Nothing to substitute without a `-` or `.`.
    if !s.contains(['-', '.']) {
        return s.to_string();
    }
    let bytes = s.as_bytes();
    let mut out = String::with_capacity(s.len());
    let mut pos = 0usize;
    while pos < bytes.len() {
        let rest = &bytes[pos..];
        // Longest dash run first so `---` is never read as `--` + `-`.
        let (replacement, width) = if rest.starts_with(b"---") {
            (Some('\u{2014}'), 3)
        } else if rest.starts_with(b"--") {
            (Some('\u{2013}'), 2)
        } else if rest.starts_with(b"...") {
            (Some('\u{2026}'), 3)
        } else {
            // No substitution here: copy one whole UTF-8 char.
            (None, utf8_char_len(bytes[pos]))
        };
        match replacement {
            Some(ch) => out.push(ch),
            None => out.push_str(&s[pos..pos + width]),
        }
        pos += width;
    }
    out
}

/// Length in bytes of the UTF-8 sequence introduced by lead byte `b`.
/// Bytes below `0xc0` (ASCII, and continuation bytes that cannot start a
/// sequence) report 1 so a scan always advances.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0xbf => 1,
        0xc0..=0xdf => 2,
        0xe0..=0xef => 3,
        _ => 4,
    }
}
4392
/// Replaces every straight apostrophe that sits directly between two word
/// chars (as judged by `is_word_char`) with a right single quotation mark
/// (U+2019). Apostrophes at either end of the string, or next to a non-word
/// char, are left as they are.
fn smart_intraword_apostrophe(s: &str) -> String {
    if !s.contains('\'') {
        return s.to_string();
    }
    let mut out = String::with_capacity(s.len());
    // Neighbours are judged on the original text, not on what was emitted.
    let mut before: Option<char> = None;
    let mut rest = s.chars().peekable();
    while let Some(current) = rest.next() {
        let intraword = current == '\''
            && before.is_some_and(is_word_char)
            && rest.peek().copied().is_some_and(is_word_char);
        out.push(if intraword { '\u{2019}' } else { current });
        before = Some(current);
    }
    out
}

/// A "word" char for apostrophe handling: any alphanumeric char.
fn is_word_char(c: char) -> bool {
    c.is_alphanumeric()
}
4418
4419fn inlines_to_plaintext(inlines: &[Inline]) -> String {
4420    let mut s = String::new();
4421    for i in inlines {
4422        match i {
4423            Inline::Str(t) => s.push_str(t),
4424            Inline::Space | Inline::SoftBreak => s.push(' '),
4425            Inline::LineBreak => s.push(' '),
4426            Inline::Emph(children)
4427            | Inline::Strong(children)
4428            | Inline::Strikeout(children)
4429            | Inline::Superscript(children)
4430            | Inline::Subscript(children) => s.push_str(&inlines_to_plaintext(children)),
4431            Inline::Code(_, c) => s.push_str(c),
4432            Inline::Link(_, alt, _, _) | Inline::Image(_, alt, _, _) => {
4433                s.push_str(&inlines_to_plaintext(alt))
4434            }
4435            Inline::Math(_, c) => s.push_str(c),
4436            Inline::Span(_, children) => s.push_str(&inlines_to_plaintext(children)),
4437            Inline::RawInline(_, _) => {}
4438            Inline::Quoted(_, children) => s.push_str(&inlines_to_plaintext(children)),
4439            Inline::Note(_) => {}
4440            Inline::Cite(_, text) => s.push_str(&inlines_to_plaintext(text)),
4441            Inline::Unsupported(_) => {}
4442        }
4443    }
4444    s
4445}
4446
/// Derives a heading identifier slug from plain heading text.
///
/// Mirrors crates/panache-formatter::utils::pandoc_slugify so the parser-side
/// projector doesn't need to depend on the formatter crate.
///
/// Rules, as implemented:
/// - each char is lowercased; only alphanumerics, `_`, `-` and `.` are kept;
/// - a whitespace char becomes a single `-`, unless the slug is still empty
///   or already ends in `-`;
/// - trailing `-` chars are removed.
///
/// NOTE(review): a literal `-` followed by whitespace suppresses the
/// separator (`a - b` → `a--b`), and leading non-letters are kept
/// (`3. Applications` → `3.-applications`). Confirm against pandoc's
/// auto-identifier rules and whatever post-processing callers apply.
fn pandoc_slugify(text: &str) -> String {
    let mut slug = String::new();
    for ch in text.chars() {
        if ch.is_whitespace() {
            // Separator only between kept chars, never doubled after a dash.
            if !(slug.is_empty() || slug.ends_with('-')) {
                slug.push('-');
            }
        } else {
            slug.extend(
                ch.to_lowercase()
                    .filter(|&lc| lc.is_alphanumeric() || matches!(lc, '_' | '-' | '.')),
            );
        }
    }
    let kept = slug.trim_end_matches('-').len();
    slug.truncate(kept);
    slug
}
4472
impl Attr {
    /// Builds an `Attr` that carries only the identifier `id`; `classes` and
    /// `kvs` start out empty.
    fn with_id(id: String) -> Self {
        Self {
            id,
            classes: Vec::new(),
            kvs: Vec::new(),
        }
    }
}
4482
4483// ----- text emission ------------------------------------------------------
4484
4485fn write_block(b: &Block, out: &mut String) {
4486    match b {
4487        Block::Para(inlines) => {
4488            out.push_str("Para [");
4489            write_inline_list(inlines, out);
4490            out.push_str(" ]");
4491        }
4492        Block::Plain(inlines) => {
4493            out.push_str("Plain [");
4494            write_inline_list(inlines, out);
4495            out.push_str(" ]");
4496        }
4497        Block::Header(level, attr, inlines) => {
4498            out.push_str(&format!("Header {level} ("));
4499            write_attr(attr, out);
4500            out.push_str(") [");
4501            write_inline_list(inlines, out);
4502            out.push_str(" ]");
4503        }
4504        Block::BlockQuote(blocks) => {
4505            out.push_str("BlockQuote [");
4506            write_block_list(blocks, out);
4507            out.push_str(" ]");
4508        }
4509        Block::CodeBlock(attr, content) => {
4510            out.push_str("CodeBlock (");
4511            write_attr(attr, out);
4512            out.push_str(") ");
4513            write_haskell_string(content, out);
4514        }
4515        Block::HorizontalRule => out.push_str("HorizontalRule"),
4516        Block::BulletList(items) => {
4517            out.push_str("BulletList [");
4518            for (i, item) in items.iter().enumerate() {
4519                if i > 0 {
4520                    out.push(',');
4521                }
4522                out.push_str(" [");
4523                write_block_list(item, out);
4524                out.push_str(" ]");
4525            }
4526            out.push_str(" ]");
4527        }
4528        Block::OrderedList(start, style, delim, items) => {
4529            out.push_str(&format!("OrderedList ( {start} , {style} , {delim} ) ["));
4530            for (i, item) in items.iter().enumerate() {
4531                if i > 0 {
4532                    out.push(',');
4533                }
4534                out.push_str(" [");
4535                write_block_list(item, out);
4536                out.push_str(" ]");
4537            }
4538            out.push_str(" ]");
4539        }
4540        Block::RawBlock(format, content) => {
4541            out.push_str("RawBlock ( Format ");
4542            write_haskell_string(format, out);
4543            out.push_str(" ) ");
4544            write_haskell_string(content, out);
4545        }
4546        Block::Table(data) => {
4547            write_table(data, out);
4548        }
4549        Block::Div(attr, blocks) => {
4550            out.push_str("Div (");
4551            write_attr(attr, out);
4552            out.push_str(") [");
4553            write_block_list(blocks, out);
4554            out.push_str(" ]");
4555        }
4556        Block::LineBlock(lines) => {
4557            out.push_str("LineBlock [");
4558            for (i, line) in lines.iter().enumerate() {
4559                if i > 0 {
4560                    out.push(',');
4561                }
4562                out.push_str(" [");
4563                write_inline_list(line, out);
4564                out.push_str(" ]");
4565            }
4566            out.push_str(" ]");
4567        }
4568        Block::DefinitionList(items) => {
4569            out.push_str("DefinitionList [");
4570            for (i, (term, defs)) in items.iter().enumerate() {
4571                if i > 0 {
4572                    out.push(',');
4573                }
4574                out.push_str(" ( [");
4575                write_inline_list(term, out);
4576                out.push_str(" ] , [");
4577                for (j, def) in defs.iter().enumerate() {
4578                    if j > 0 {
4579                        out.push(',');
4580                    }
4581                    out.push_str(" [");
4582                    write_block_list(def, out);
4583                    out.push_str(" ]");
4584                }
4585                out.push_str(" ] )");
4586            }
4587            out.push_str(" ]");
4588        }
4589        Block::Figure(attr, caption, body) => {
4590            out.push_str("Figure (");
4591            write_attr(attr, out);
4592            out.push_str(") ( Caption Nothing [");
4593            write_block_list(caption, out);
4594            out.push_str(" ] ) [");
4595            write_block_list(body, out);
4596            out.push_str(" ]");
4597        }
4598        Block::Unsupported(name) => {
4599            out.push_str(&format!("Unsupported {name:?}"));
4600        }
4601    }
4602}
4603
4604fn write_table(data: &TableData, out: &mut String) {
4605    out.push_str("Table (");
4606    write_attr(&data.attr, out);
4607    out.push_str(") ( Caption Nothing [");
4608    if !data.caption.is_empty() {
4609        out.push_str(" Plain [");
4610        write_inline_list(&data.caption, out);
4611        out.push_str(" ]");
4612    }
4613    out.push_str(" ] ) [");
4614    for (i, align) in data.aligns.iter().enumerate() {
4615        if i > 0 {
4616            out.push(',');
4617        }
4618        let width = data.widths.get(i).copied().unwrap_or(None);
4619        match width {
4620            None => out.push_str(&format!(" ( {align} , ColWidthDefault )")),
4621            Some(w) => out.push_str(&format!(" ( {align} , ColWidth {} )", show_double(w))),
4622        }
4623    }
4624    out.push_str(" ] ( TableHead ( \"\" , [ ] , [ ] ) [");
4625    for (i, row) in data.head_rows.iter().enumerate() {
4626        if i > 0 {
4627            out.push(',');
4628        }
4629        out.push(' ');
4630        write_table_row(row, out);
4631    }
4632    out.push_str(" ] ) [ TableBody ( \"\" , [ ] , [ ] ) ( RowHeadColumns 0 ) [ ] [");
4633    for (i, row) in data.body_rows.iter().enumerate() {
4634        if i > 0 {
4635            out.push(',');
4636        }
4637        out.push(' ');
4638        write_table_row(row, out);
4639    }
4640    out.push_str(" ] ] ( TableFoot ( \"\" , [ ] , [ ] ) [");
4641    for (i, row) in data.foot_rows.iter().enumerate() {
4642        if i > 0 {
4643            out.push(',');
4644        }
4645        out.push(' ');
4646        write_table_row(row, out);
4647    }
4648    out.push_str(" ] )");
4649}
4650
4651fn write_table_row(cells: &[GridCell], out: &mut String) {
4652    out.push_str("Row ( \"\" , [ ] , [ ] ) [");
4653    for (i, cell) in cells.iter().enumerate() {
4654        if i > 0 {
4655            out.push(',');
4656        }
4657        out.push_str(&format!(
4658            " Cell ( \"\" , [ ] , [ ] ) AlignDefault ( RowSpan {} ) ( ColSpan {} ) [",
4659            cell.row_span, cell.col_span
4660        ));
4661        if !cell.blocks.is_empty() {
4662            write_block_list(&cell.blocks, out);
4663        }
4664        out.push_str(" ]");
4665    }
4666    out.push_str(" ]");
4667}
4668
4669fn write_block_list(blocks: &[Block], out: &mut String) {
4670    for (i, b) in blocks.iter().enumerate() {
4671        if i > 0 {
4672            out.push(',');
4673        }
4674        out.push(' ');
4675        write_block(b, out);
4676    }
4677}
4678
4679fn write_inline_list(inlines: &[Inline], out: &mut String) {
4680    for (i, inline) in inlines.iter().enumerate() {
4681        if i > 0 {
4682            out.push(',');
4683        }
4684        out.push(' ');
4685        write_inline(inline, out);
4686    }
4687}
4688
4689fn write_inline(inline: &Inline, out: &mut String) {
4690    match inline {
4691        Inline::Str(s) => {
4692            out.push_str("Str ");
4693            write_haskell_string(s, out);
4694        }
4695        Inline::Space => out.push_str("Space"),
4696        Inline::SoftBreak => out.push_str("SoftBreak"),
4697        Inline::LineBreak => out.push_str("LineBreak"),
4698        Inline::Emph(children) => {
4699            out.push_str("Emph [");
4700            write_inline_list(children, out);
4701            out.push_str(" ]");
4702        }
4703        Inline::Strong(children) => {
4704            out.push_str("Strong [");
4705            write_inline_list(children, out);
4706            out.push_str(" ]");
4707        }
4708        Inline::Strikeout(children) => {
4709            out.push_str("Strikeout [");
4710            write_inline_list(children, out);
4711            out.push_str(" ]");
4712        }
4713        Inline::Superscript(children) => {
4714            out.push_str("Superscript [");
4715            write_inline_list(children, out);
4716            out.push_str(" ]");
4717        }
4718        Inline::Subscript(children) => {
4719            out.push_str("Subscript [");
4720            write_inline_list(children, out);
4721            out.push_str(" ]");
4722        }
4723        Inline::Code(attr, content) => {
4724            out.push_str("Code (");
4725            write_attr(attr, out);
4726            out.push_str(") ");
4727            write_haskell_string(content, out);
4728        }
4729        Inline::Link(attr, text, url, title) => {
4730            out.push_str("Link (");
4731            write_attr(attr, out);
4732            out.push_str(") [");
4733            write_inline_list(text, out);
4734            out.push_str(" ] ( ");
4735            write_haskell_string(url, out);
4736            out.push_str(" , ");
4737            write_haskell_string(title, out);
4738            out.push_str(" )");
4739        }
4740        Inline::Image(attr, alt, url, title) => {
4741            out.push_str("Image (");
4742            write_attr(attr, out);
4743            out.push_str(") [");
4744            write_inline_list(alt, out);
4745            out.push_str(" ] ( ");
4746            write_haskell_string(url, out);
4747            out.push_str(" , ");
4748            write_haskell_string(title, out);
4749            out.push_str(" )");
4750        }
4751        Inline::Math(kind, content) => {
4752            out.push_str("Math ");
4753            out.push_str(kind);
4754            out.push(' ');
4755            write_haskell_string(content, out);
4756        }
4757        Inline::Span(attr, children) => {
4758            out.push_str("Span (");
4759            write_attr(attr, out);
4760            out.push_str(") [");
4761            write_inline_list(children, out);
4762            out.push_str(" ]");
4763        }
4764        Inline::RawInline(format, content) => {
4765            out.push_str("RawInline ( Format ");
4766            write_haskell_string(format, out);
4767            out.push_str(" ) ");
4768            write_haskell_string(content, out);
4769        }
4770        Inline::Quoted(kind, children) => {
4771            out.push_str("Quoted ");
4772            out.push_str(kind);
4773            out.push_str(" [");
4774            write_inline_list(children, out);
4775            out.push_str(" ]");
4776        }
4777        Inline::Note(blocks) => {
4778            out.push_str("Note [");
4779            write_block_list(blocks, out);
4780            out.push_str(" ]");
4781        }
4782        Inline::Cite(citations, text) => {
4783            out.push_str("Cite [");
4784            for (i, c) in citations.iter().enumerate() {
4785                if i > 0 {
4786                    out.push(',');
4787                }
4788                out.push_str(" Citation { citationId = ");
4789                write_haskell_string(&c.id, out);
4790                out.push_str(" , citationPrefix = [");
4791                write_inline_list(&c.prefix, out);
4792                out.push_str(" ] , citationSuffix = [");
4793                write_inline_list(&c.suffix, out);
4794                out.push_str(" ] , citationMode = ");
4795                out.push_str(match c.mode {
4796                    CitationMode::AuthorInText => "AuthorInText",
4797                    CitationMode::NormalCitation => "NormalCitation",
4798                    CitationMode::SuppressAuthor => "SuppressAuthor",
4799                });
4800                out.push_str(&format!(
4801                    " , citationNoteNum = {} , citationHash = {} }}",
4802                    c.note_num, c.hash
4803                ));
4804            }
4805            out.push_str(" ] [");
4806            write_inline_list(text, out);
4807            out.push_str(" ]");
4808        }
4809        Inline::Unsupported(name) => {
4810            out.push_str(&format!("Unsupported {name:?}"));
4811        }
4812    }
4813}
4814
4815fn write_attr(attr: &Attr, out: &mut String) {
4816    out.push(' ');
4817    write_haskell_string(&attr.id, out);
4818    out.push_str(" , [");
4819    for (i, c) in attr.classes.iter().enumerate() {
4820        if i > 0 {
4821            out.push(',');
4822        }
4823        out.push(' ');
4824        write_haskell_string(c, out);
4825    }
4826    if !attr.classes.is_empty() {
4827        out.push(' ');
4828    }
4829    out.push_str("] , [");
4830    for (i, (k, v)) in attr.kvs.iter().enumerate() {
4831        if i > 0 {
4832            out.push(',');
4833        }
4834        out.push_str(" ( ");
4835        write_haskell_string(k, out);
4836        out.push_str(" , ");
4837        write_haskell_string(v, out);
4838        out.push_str(" )");
4839    }
4840    if !attr.kvs.is_empty() {
4841        out.push(' ');
4842    }
4843    out.push_str("] ");
4844}
4845
/// Appends `s` to `out` as a double-quoted Haskell string literal, escaped
/// the way Haskell's `show` escapes a `String`.
///
/// Escaping rules:
/// - `"` and `\` are backslash-escaped.
/// - ASCII control characters use Haskell's short escapes (`\a`, `\b`, `\t`,
///   `\n`, `\v`, `\f`, `\r`) or the ASCII mnemonic (`\NUL`, `\ESC`, ...);
///   DEL (0x7f) is written as `\DEL`.
/// - Every other printable ASCII character is copied through unchanged.
/// - Characters above 0x7f are written as decimal escapes (`\160`).
/// - The empty escape `\&` is inserted where the next character would
///   otherwise be read as part of the preceding escape: a digit after a
///   decimal escape (`\160\&3`) and `H` after `\SO` (`\SO\&H`, which would
///   otherwise read as `\SOH`).
fn write_haskell_string(s: &str, out: &mut String) {
    // Escape bodies for code points 0x00..=0x1f, indexed by code point.
    const CONTROL_ESCAPES: [&str; 32] = [
        "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "a", "b", "t", "n", "v", "f", "r", "SO",
        "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB", "CAN", "EM", "SUB", "ESC",
        "FS", "GS", "RS", "US",
    ];

    out.push('"');
    // What the previous character was rendered as; decides whether the
    // current one needs a `\&` guard in front of it.
    let mut prev_was_numeric_escape = false;
    let mut prev_was_shift_out = false;
    for ch in s.chars() {
        let code = u32::from(ch);
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\u{7f}' => out.push_str("\\DEL"),
            _ if code < 0x20 => {
                out.push('\\');
                // `code` is below 32 here, so the index is always in range.
                out.push_str(CONTROL_ESCAPES[code as usize]);
            }
            _ if code < 0x7f => {
                // Keep this character from being absorbed into the escape
                // that precedes it: `\160\&33` versus `\16033`, and
                // `\SO\&H` versus `\SOH`.
                if (prev_was_numeric_escape && ch.is_ascii_digit())
                    || (prev_was_shift_out && ch == 'H')
                {
                    out.push_str("\\&");
                }
                out.push(ch);
            }
            _ => {
                // Non-ASCII → decimal escape.
                out.push('\\');
                out.push_str(&code.to_string());
            }
        }
        prev_was_numeric_escape = code > 0x7f;
        prev_was_shift_out = ch == '\u{0e}';
    }
    out.push('"');
}
panache_parser/pandoc_ast.rs

panache_parser/
pandoc_ast.rs