Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Picture rendering strategy, mirroring docling-core's `ImageRefMode`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` comment. This is docling's default and the only
    /// mode that needs no image data.
    #[default]
    Placeholder,
    /// Inline the picture as `![Image](data:<mime>;base64,…)`, keeping the
    /// Markdown self-contained.
    Embedded,
    /// Link the picture as `![Image](<artifacts>/image_NNNNNN.<ext>)`; the image
    /// bytes are handed back so the caller can write them.
    Referenced,
}
17
18/// Serializer state threaded through the render walk.
19struct Ctx {
20    strict: bool,
21    /// Emit compact `| a | b |` tables instead of the padded GitHub serializer.
22    compact_tables: bool,
23    images: ImageMode,
24    artifacts_dir: String,
25    /// (relative path, bytes) for each referenced image — written by the caller.
26    artifacts: Vec<(String, Vec<u8>)>,
27    pic_index: usize,
28}
29
30/// Render a document to a Markdown string (pictures as placeholders).
31///
32/// `strict` selects the serializer-level behaviours that differ between
33/// docling-legacy output and cleaner Markdown — currently the code-fence
34/// language (legacy drops it, strict keeps it).
35pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
40/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
41/// the caller should write (relative to the Markdown file).
42pub fn to_markdown_images(
43    doc: &DoclingDocument,
44    strict: bool,
45    images: ImageMode,
46    artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48    let mut ctx = Ctx {
49        strict,
50        compact_tables: doc.compact_tables,
51        images,
52        artifacts_dir: artifacts_dir.to_string(),
53        artifacts: Vec::new(),
54        pic_index: 0,
55    };
56    let mut blocks: Vec<String> = Vec::new();
57    render(&doc.nodes, &mut blocks, &mut ctx);
58    let mut body = blocks.join("\n\n");
59    // Strict mode only: turn recovered source hyperlinks into Markdown links.
60    // docling's standard pipeline drops them, so doing this in legacy mode would
61    // diverge from docling — hence strict-only, leaving conformance output intact.
62    if strict && !doc.links.is_empty() {
63        body = apply_links(&body, &doc.links);
64    }
65    let md = if body.is_empty() {
66        String::new()
67    } else {
68        format!("{body}\n")
69    };
70    (md, ctx.artifacts)
71}
72
/// Rewrite each recovered link's anchor text as Markdown `[anchor](href)`.
///
/// Anchors are stored un-escaped, so each one is HTML-escaped (`&`, `<`, `>`)
/// before being searched for in `body`. Links are consumed in order against a
/// position that only moves forward: a repeated anchor therefore links its
/// successive occurrences instead of the first one every time, and text already
/// emitted as part of a link is never searched again. An anchor that is empty or
/// cannot be found after the current position is skipped.
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    let mut linked = String::with_capacity(body.len());
    // The part of `body` that has not been copied to `linked` yet.
    let mut rest = body;

    for (anchor, href) in links {
        let needle = anchor
            .replace('&', "&amp;")
            .replace('<', "&lt;")
            .replace('>', "&gt;");
        if needle.is_empty() {
            continue;
        }
        let Some(pos) = rest.find(&needle) else {
            continue;
        };
        linked.push_str(&rest[..pos]);
        linked.push('[');
        linked.push_str(&needle);
        linked.push_str("](");
        linked.push_str(href);
        linked.push(')');
        rest = &rest[pos + needle.len()..];
    }

    linked.push_str(rest);
    linked
}
101
/// Streaming variant of [`apply_links`]: rewrite anchors inside one chunk while
/// drawing links from a shared queue.
///
/// Every queued link is tried, in queue order, against the not-yet-consumed part
/// of `chunk`. A link whose (HTML-escaped) anchor is found is rewritten to
/// `[anchor](href)` and removed from the queue; a link whose anchor is not found
/// is put back, in order, so a later chunk can place it. Links with an empty
/// anchor are dropped. As long as links arrive in document order and chunks are
/// contiguous runs of whole blocks, this lands each link in the chunk holding its
/// anchor, like the forward-moving search in [`apply_links`].
fn apply_links_chunk(chunk: &str, queue: &mut Vec<(String, String)>) -> String {
    let mut linked = String::with_capacity(chunk.len());
    // The part of `chunk` that has not been copied to `linked` yet.
    let mut rest = chunk;

    // Drain the queue; unplaced links are pushed straight back in order.
    let pending = std::mem::take(queue);
    for (anchor, href) in pending {
        let needle = anchor
            .replace('&', "&amp;")
            .replace('<', "&lt;")
            .replace('>', "&gt;");
        if needle.is_empty() {
            continue;
        }
        match rest.find(&needle) {
            Some(pos) => {
                linked.push_str(&rest[..pos]);
                linked.push('[');
                linked.push_str(&needle);
                linked.push_str("](");
                linked.push_str(&href);
                linked.push(')');
                rest = &rest[pos + needle.len()..];
            }
            // Not in this chunk; retry when a later chunk is flushed.
            None => queue.push((anchor, href)),
        }
    }

    linked.push_str(rest);
    linked
}
137
138/// Incremental Markdown serializer: feed finalized, in-document-order batches of
139/// [`Node`]s and receive Markdown chunks whose concatenation is **byte-identical**
140/// to [`to_markdown_images`] over the same nodes. This is the streaming
141/// counterpart of the buffered serializer — used to emit a document's Markdown in
142/// chunks (e.g. page by page, as the parallel PDF pipeline finishes pages) instead
143/// of building the whole string up front.
144///
145/// Only [`ImageMode::Placeholder`] and [`ImageMode::Embedded`] are streamable:
146/// [`ImageMode::Referenced`] needs a side-channel for the image bytes, which only
147/// the buffered [`to_markdown_images`] provides.
148///
149/// Each [`push`](Self::push) must contain whole blocks in reading order: a caller
150/// must not split a run of list items across two pushes (the run would render as
151/// two separate lists). Finalized PDF page batches already satisfy this.
152pub struct MarkdownStreamer {
153    strict: bool,
154    images: ImageMode,
155    compact_tables: bool,
156    /// Whether any non-empty chunk has been emitted yet (drives `\n\n` joins and
157    /// the trailing newline).
158    emitted_any: bool,
159    /// Recovered links not yet placed (strict mode), consumed in document order.
160    links: Vec<(String, String)>,
161}
162
163impl MarkdownStreamer {
164    /// Create a streamer. `compact_tables` mirrors [`DoclingDocument::compact_tables`].
165    pub fn new(strict: bool, images: ImageMode, compact_tables: bool) -> Self {
166        debug_assert!(
167            images != ImageMode::Referenced,
168            "referenced image mode is not streamable; use to_markdown_images"
169        );
170        Self {
171            strict,
172            images,
173            compact_tables,
174            emitted_any: false,
175            links: Vec::new(),
176        }
177    }
178
179    /// Render one finalized batch of nodes (plus any links recovered from the same
180    /// span, in document order) into the next Markdown chunk. Returns an empty
181    /// string when the batch produces no output (e.g. empty tables/pictures), in
182    /// which case nothing should be written.
183    pub fn push(&mut self, nodes: &[Node], links: &[(String, String)]) -> String {
184        self.links.extend(links.iter().cloned());
185        let mut ctx = Ctx {
186            strict: self.strict,
187            compact_tables: self.compact_tables,
188            images: self.images,
189            // Referenced mode is rejected at construction, so the artifact sink is
190            // never touched.
191            artifacts_dir: String::new(),
192            artifacts: Vec::new(),
193            pic_index: 0,
194        };
195        let mut blocks: Vec<String> = Vec::new();
196        render(nodes, &mut blocks, &mut ctx);
197        if blocks.is_empty() {
198            return String::new();
199        }
200        let mut body = blocks.join("\n\n");
201        if self.strict && !self.links.is_empty() {
202            body = apply_links_chunk(&body, &mut self.links);
203        }
204        let chunk = if self.emitted_any {
205            format!("\n\n{body}")
206        } else {
207            body
208        };
209        self.emitted_any = true;
210        chunk
211    }
212
213    /// Emit the trailing newline that finishes the document (empty if no content
214    /// was produced). Call exactly once, after the final [`push`](Self::push).
215    pub fn finish(self) -> String {
216        if self.emitted_any {
217            "\n".to_string()
218        } else {
219            String::new()
220        }
221    }
222}
223
/// Clean up inline text for strict mode; return it unchanged otherwise.
///
/// Strict mode favours readable Markdown over byte-for-byte docling fidelity: it
/// undoes the legacy `\_` underscore escaping and removes stray spaces around
/// punctuation (`[ 37 , 36 ]` → `[37, 36]`, `( x )` → `(x)`, `*a* ,` → `*a*,`).
/// Legacy output keeps docling's spacing as-is. The rewrites are applied one
/// after another in the order listed, each over the result of the previous one.
fn strict_text(text: &str, strict: bool) -> String {
    /// Ordered `(from, to)` substitutions applied in strict mode.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let original = text.to_owned();
    if !strict {
        return original;
    }
    REWRITES
        .iter()
        .fold(original, |acc, &(from, to)| acc.replace(from, to))
}
244
245fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
246    let mut i = 0;
247    while i < nodes.len() {
248        match &nodes[i] {
249            Node::ListItem { .. } => {
250                let start = i;
251                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
252                    i += 1;
253                }
254                render_list_run(&nodes[start..i], blocks, ctx.strict);
255            }
256            other => {
257                render_one(other, blocks, ctx);
258                i += 1;
259            }
260        }
261    }
262}
263
264/// Render a contiguous run of list items.
265///
266/// Ordered items use their explicit `number`. A new sibling list (marked by
267/// `first_in_list`) at the same depth is separated by a blank line, matching
268/// docling-core's serializer.
269fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
270    let mut lines: Vec<String> = Vec::new();
271    // Per level, the previous item's (ordered, number) so we can detect a new
272    // sibling list.
273    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
274
275    for item in items {
276        let Node::ListItem {
277            ordered,
278            number,
279            first_in_list,
280            text,
281            level,
282        } = item
283        else {
284            continue;
285        };
286        let level = *level as usize;
287
288        // Returning to a shallower level ends the deeper sibling lists.
289        prev.truncate(level + 1);
290        while prev.len() <= level {
291            prev.push(None);
292        }
293
294        // A new sibling list at the same depth gets a blank line: the kind flips
295        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
296        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
297        if let Some((prev_ordered, prev_number)) = prev[level] {
298            let new_list = *first_in_list
299                || prev_ordered != *ordered
300                || (*ordered && *number != prev_number + 1);
301            if new_list {
302                lines.push(String::new());
303            }
304        }
305
306        let indent = "    ".repeat(level);
307        let marker = if *ordered {
308            format!("{number}.")
309        } else {
310            "-".to_string()
311        };
312        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
313        prev[level] = Some((*ordered, *number));
314    }
315
316    blocks.push(lines.join("\n"));
317}
318
319fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
320    match node {
321        Node::Heading { level, text } => {
322            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
323            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
324        }
325        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
326        Node::Code { language, text } => {
327            // Legacy docling never emits a language on the fence; strict keeps it.
328            let lang = match language {
329                Some(l) if ctx.strict => l.as_str(),
330                _ => "",
331            };
332            blocks.push(format!("```{lang}\n{text}\n```"));
333        }
334        Node::Table(table) => {
335            let rendered = render_table(table, ctx.compact_tables);
336            if !rendered.is_empty() {
337                blocks.push(rendered);
338            }
339        }
340        Node::Picture { caption, image } => {
341            if let Some(cap) = caption {
342                if !cap.is_empty() {
343                    blocks.push(cap.clone());
344                }
345            }
346            blocks.push(picture_marker(image.as_ref(), ctx));
347        }
348        Node::Group { children, .. } => render(children, blocks, ctx),
349        Node::FieldRegion { items } => {
350            // docling renders the region container (which carries no text of its
351            // own) as a `<!-- missing-text -->` marker, then each field item the
352            // same way, followed by that item's marker/key/value as separate
353            // paragraphs.
354            blocks.push(MISSING_TEXT.to_string());
355            for item in items {
356                blocks.push(MISSING_TEXT.to_string());
357                for part in [&item.marker, &item.key, &item.value].into_iter().flatten() {
358                    blocks.push(strict_text(part, ctx.strict));
359                }
360            }
361        }
362        // Handled by the run-merging branch in `render`.
363        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
364    }
365}
366
/// Marker docling emits for a structural node (a field region or a field item)
/// that carries no text of its own.
const MISSING_TEXT: &str = "<!-- missing-text -->";
370
371/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
372/// records the bytes in `ctx.artifacts` for the caller to write.
373fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
374    match (ctx.images, image) {
375        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
376        (ImageMode::Referenced, Some(img)) => {
377            let path = format!(
378                "{}/image_{:06}.{}",
379                ctx.artifacts_dir,
380                ctx.pic_index,
381                ext_for(&img.mimetype)
382            );
383            ctx.pic_index += 1;
384            ctx.artifacts.push((path.clone(), img.data.clone()));
385            format!("![Image]({path})")
386        }
387        // Placeholder, or any mode with no extracted image.
388        _ => "<!-- image -->".to_string(),
389    }
390}
391
/// Map an image MIME type to a file extension; anything not listed maps to
/// `png`.
fn ext_for(mimetype: &str) -> &str {
    /// Known `(mime type, extension)` pairs.
    const KNOWN: [(&str, &str); 5] = [
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tif"),
    ];

    KNOWN
        .iter()
        .find(|&&(mime, _)| mime == mimetype)
        .map_or("png", |&(_, ext)| ext)
}
402
// NOTE: the description below documents `render_table` (defined further down),
// not the numeric-cell helpers that immediately follow. It is kept as a plain
// comment so rustdoc does not attach it to `is_number_cell`.
//
// Render a table. `compact` selects between two serializers:
//
// - **padded** (default) — docling-core's `tabulate(tablefmt="github")`: columns
//   are padded to a fixed width (header width + a minimum padding of 2, or the
//   widest data cell); numeric columns (every data cell parses as a number) are
//   right-aligned, others left-aligned; separators are plain dashes of
//   `width + 2`. Matches current published docling (DOCX/HTML conformance).
// - **compact** — `| a | b |` cells with single-dash `| - | - |` separators, no
//   width padding. Matches the committed PDF groundtruth corpus, which predates
//   the padded serializer.
//
// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break the
// table. Row 0 is the header.

416/// Whether a table cell counts as a number for column alignment, matching
417/// `tabulate`'s detection: an ordinary float/int (`f64`-parseable, covering
418/// `1e2`/`inf`/`+1.5`) **or** a thousands-separated number like `7,015`.
419fn is_number_cell(t: &str) -> bool {
420    t.parse::<f64>().is_ok() || is_thousands_number(t)
421}
422
/// Recognize a number written with comma thousands-separators, following
/// `tabulate`'s `_float_with_thousands_separators` regex
/// (`^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$`).
///
/// The integer part is an optional sign, 1–3 digits, then any number of `,ddd`
/// groups. The fraction is optional; when there is no integer part it must
/// contain at least one digit, and a sign is not allowed. The empty string is
/// rejected.
fn is_thousands_number(t: &str) -> bool {
    let bytes = t.as_bytes();
    // Number of consecutive ASCII digits starting at `from`.
    let digits_from = |from: usize| {
        bytes[from..]
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count()
    };

    let after_sign = usize::from(
        bytes
            .first()
            .is_some_and(|&sign| sign == b'+' || sign == b'-'),
    );
    // Leading digit chunk: at most three digits are consumed here.
    let lead = digits_from(after_sign).min(3);
    let has_int = lead > 0;

    // A sign only counts with an integer part, so rewind to the start without one.
    let mut pos = if has_int { after_sign + lead } else { 0 };
    if has_int {
        // Subsequent `,ddd` groups (a comma followed by exactly three digits).
        while let Some(group) = bytes.get(pos..pos + 4) {
            let well_formed = group[0] == b',' && group[1..].iter().all(u8::is_ascii_digit);
            if !well_formed {
                break;
            }
            pos += 4;
        }
    }

    // Optional fraction.
    match bytes.get(pos).copied() {
        Some(b'.') => {
            let frac = digits_from(pos + 1);
            if !has_int && frac == 0 {
                return false; // `.` with no digits and no integer part
            }
            pos += 1 + frac;
        }
        _ if !has_int => return false, // neither integer nor fractional part
        _ => {}
    }

    // Anything left over (extra digits, stray characters) disqualifies the cell.
    pos == bytes.len()
}
470
471fn render_table(table: &Table, compact: bool) -> String {
472    if table.rows.is_empty() {
473        return String::new();
474    }
475    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
476    if num_cols == 0 {
477        return String::new();
478    }
479
480    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
481    // strips data cells of surrounding whitespace but leaves the header row as-is.
482    let grid: Vec<Vec<String>> = table
483        .rows
484        .iter()
485        .enumerate()
486        .map(|(r, row)| {
487            (0..num_cols)
488                .map(|c| {
489                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
490                    if r == 0 {
491                        cell
492                    } else {
493                        cell.trim().to_string()
494                    }
495                })
496                .collect()
497        })
498        .collect();
499
500    if compact {
501        // Compact: cells joined by " | ", no padding, single-dash separators.
502        let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
503        let mut lines = Vec::with_capacity(grid.len() + 1);
504        lines.push(render_row(0));
505        let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
506        lines.push(format!("| {} |", sep.join(" | ")));
507        for r in 1..grid.len() {
508            lines.push(render_row(r));
509        }
510        return lines.join("\n");
511    }
512
513    // Display width (Unicode scalar count — good enough for now).
514    let dw = |s: &str| s.chars().count();
515    let data_rows = 1..grid.len();
516
517    // A column is right-aligned when at least one data cell is numeric and every
518    // non-empty data cell is numeric — matching `tabulate`'s column typing, where
519    // empty cells are "missing" (ignored) and a number may carry thousands
520    // separators (`7,015`), which a plain `f64` parse rejects.
521    let right: Vec<bool> = (0..num_cols)
522        .map(|c| {
523            let mut any = false;
524            for r in data_rows.clone() {
525                let t = grid[r][c].trim();
526                if t.is_empty() {
527                    continue;
528                }
529                if !is_number_cell(t) {
530                    return false;
531                }
532                any = true;
533            }
534            any
535        })
536        .collect();
537
538    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
539    let width: Vec<usize> = (0..num_cols)
540        .map(|c| {
541            let mut w = dw(&grid[0][c]) + 2;
542            for r in data_rows.clone() {
543                w = w.max(dw(&grid[r][c]));
544            }
545            w
546        })
547        .collect();
548
549    let fmt_cell = |s: &str, c: usize| -> String {
550        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
551        let body = if right[c] {
552            format!("{pad}{s}")
553        } else {
554            format!("{s}{pad}")
555        };
556        format!(" {body} ")
557    };
558    let render_row = |r: usize| -> String {
559        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
560        format!("|{}|", cells.join("|"))
561    };
562
563    let mut lines = Vec::with_capacity(grid.len() + 1);
564    lines.push(render_row(0));
565    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
566    lines.push(format!("|{}|", sep.join("|")));
567    for r in data_rows {
568        lines.push(render_row(r));
569    }
570    lines.join("\n")
571}
572
/// Make a table cell safe to place inside a Markdown table row: each newline
/// turns into a space and each pipe into the `&#124;` HTML entity (as
/// docling-core does).
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
578
// Unit tests for the buffered serializer and its streaming counterpart.
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading, a paragraph and a two-item bullet list render as three blocks
    /// separated by blank lines, with a single trailing newline.
    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "first".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "second".into(),
            level: 0,
        });
        let md = doc.export_to_markdown();
        assert_eq!(md, "# Title\n\nHello world.\n\n- first\n- second\n");
    }

    /// Recovered links are only turned into Markdown links in strict mode.
    #[test]
    fn strict_renders_recovered_links_legacy_does_not() {
        let mut doc = DoclingDocument::new("cv");
        doc.add_paragraph("Find me on LinkedIn or GitHub.");
        doc.links = vec![
            ("LinkedIn".into(), "https://www.linkedin.com/in/x/".into()),
            ("GitHub".into(), "https://github.com/x/".into()),
        ];
        // Legacy/docling mode: links are left untouched (conformance preserved).
        assert_eq!(doc.export_to_markdown(), "Find me on LinkedIn or GitHub.\n");
        // Strict mode: anchors become Markdown links.
        assert_eq!(
            doc.export_to_markdown_with(true),
            "Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
        );
    }

    /// Anchors are matched in their HTML-escaped form, and repeated anchors are
    /// consumed in document order.
    #[test]
    fn strict_links_match_escaped_anchor_and_consume_in_order() {
        let mut doc = DoclingDocument::new("d");
        // The PDF assembler HTML-escapes prose, so by serialization time the body
        // already carries `&amp;`; the anchor is stored un-escaped. The matcher must
        // escape the anchor to find it. Two identical anchors link in document order.
        doc.add_paragraph("AI &amp; ML here, and issues here, then issues there.");
        doc.links = vec![
            ("AI & ML".into(), "https://a/".into()),
            ("issues".into(), "https://first/".into()),
            ("issues".into(), "https://second/".into()),
        ];
        assert_eq!(
            doc.export_to_markdown_with(true),
            "[AI &amp; ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
        );
    }

    /// With `compact_tables` set, tables use the unpadded single-dash form.
    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        // The compact form is opt-in (the PDF backend sets it); default output uses
        // the padded GitHub serializer (covered by the regression fixtures).
        doc.compact_tables = true;
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    /// Without `compact_tables`, tables use the padded GitHub-style form.
    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        // Numeric data columns are right-aligned; columns padded to header+2.
        assert_eq!(md, "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n");
    }

    /// The `\_` escaping survives in legacy mode and is undone in strict mode, for
    /// headings, paragraphs and list items alike.
    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "i\\_j".into(),
            level: 0,
        });
        // Legacy reproduces docling's `\_` escaping byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        // Strict prefers literal underscores (Rust-only readability mode).
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Drive a document's nodes through [`MarkdownStreamer`] in the given page
    /// splits and assert the concatenated chunks equal the buffered serializer.
    ///
    /// `splits` holds the node indices at which one batch ends and the next one
    /// begins; the nodes after the last split form the final batch.
    fn assert_stream_matches(
        doc: &DoclingDocument,
        strict: bool,
        images: ImageMode,
        splits: &[usize],
    ) {
        let want = to_markdown_images(doc, strict, images, "artifacts").0;
        let mut streamer = MarkdownStreamer::new(strict, images, doc.compact_tables);
        let mut got = String::new();
        let mut start = 0;
        for &end in splits {
            // Links only matter in strict mode; feed them all with the batch that
            // starts at node 0 (document order is preserved by the queue).
            let links = if start == 0 {
                doc.links.as_slice()
            } else {
                &[]
            };
            got.push_str(&streamer.push(&doc.nodes[start..end], links));
            start = end;
        }
        // Final batch: everything after the last split. With no splits at all this
        // is the whole document, so it is also the batch that carries the links.
        got.push_str(&streamer.push(
            &doc.nodes[start..],
            if start == 0 {
                doc.links.as_slice()
            } else {
                &[]
            },
        ));
        got.push_str(&streamer.finish());
        assert_eq!(
            got, want,
            "streamed output diverged (splits={splits:?}, strict={strict})"
        );
    }

    /// Streaming a mixed document in several different batchings reproduces the
    /// buffered output, in both strict settings and both streamable image modes.
    #[test]
    fn streaming_is_byte_identical_to_buffered() {
        let mut doc = DoclingDocument::new("d");
        doc.add_heading(1, "Title");
        doc.add_paragraph("First paragraph.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "a".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "b".into(),
            level: 0,
        });
        doc.push(Node::Code {
            language: Some("rust".into()),
            text: "let x = 1;".into(),
        });
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        doc.push(Node::Picture {
            caption: Some("Fig 1".into()),
            image: None,
        });
        doc.add_paragraph("Last paragraph.");

        // A run of list items must never straddle a split, so try splits that fall
        // on safe block boundaries (the streaming PDF assembler guarantees this).
        for &strict in &[false, true] {
            for &images in &[ImageMode::Placeholder, ImageMode::Embedded] {
                for splits in [&[][..], &[1][..], &[2][..], &[4][..], &[1, 4, 6][..]] {
                    assert_stream_matches(&doc, strict, images, splits);
                }
            }
        }
    }

    /// A link whose anchor lies in a later batch is carried over in the queue and
    /// placed when that batch is pushed.
    #[test]
    fn streaming_applies_recovered_links_in_strict_mode() {
        let mut doc = DoclingDocument::new("d");
        doc.add_paragraph("See LinkedIn for details.");
        doc.add_paragraph("And GitHub too.");
        doc.links = vec![
            ("LinkedIn".into(), "https://lnkd/".into()),
            ("GitHub".into(), "https://gh/".into()),
        ];
        // The second anchor lives in the second block, so it must be carried across
        // the page boundary and placed when that block streams out.
        assert_stream_matches(&doc, true, ImageMode::Placeholder, &[1]);
    }

    /// Stray spaces around punctuation are removed in strict mode only.
    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        // Legacy keeps docling's spacing byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        // Strict tightens punctuation for readable Markdown.
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}