Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Strategy for emitting pictures in the Markdown output (the counterpart of
/// docling-core's `ImageRefMode`).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` comment. This is docling's default, and it is
    /// also what is emitted whenever a picture carries no image data.
    #[default]
    Placeholder,
    /// Inline the picture as a data URI: `![Image](data:<mime>;base64,…)`, so the
    /// Markdown is self-contained.
    Embedded,
    /// Link to an external file, `![Image](<artifacts>/image_NNNNNN.<ext>)`, and
    /// hand the image bytes back to the caller to write.
    Referenced,
}
17
18/// Serializer state threaded through the render walk.
19struct Ctx {
20    strict: bool,
21    /// Emit compact `| a | b |` tables instead of the padded GitHub serializer.
22    compact_tables: bool,
23    images: ImageMode,
24    artifacts_dir: String,
25    /// (relative path, bytes) for each referenced image — written by the caller.
26    artifacts: Vec<(String, Vec<u8>)>,
27    pic_index: usize,
28}
29
30/// Render a document to a Markdown string (pictures as placeholders).
31///
32/// `strict` selects the serializer-level behaviours that differ between
33/// docling-legacy output and cleaner Markdown — currently the code-fence
34/// language (legacy drops it, strict keeps it).
35pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
40/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
41/// the caller should write (relative to the Markdown file).
42pub fn to_markdown_images(
43    doc: &DoclingDocument,
44    strict: bool,
45    images: ImageMode,
46    artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48    let mut ctx = Ctx {
49        strict,
50        compact_tables: doc.compact_tables,
51        images,
52        artifacts_dir: artifacts_dir.to_string(),
53        artifacts: Vec::new(),
54        pic_index: 0,
55    };
56    let mut blocks: Vec<String> = Vec::new();
57    render(&doc.nodes, &mut blocks, &mut ctx);
58    let mut body = blocks.join("\n\n");
59    // Strict mode only: turn recovered source hyperlinks into Markdown links.
60    // docling's standard pipeline drops them, so doing this in legacy mode would
61    // diverge from docling — hence strict-only, leaving conformance output intact.
62    if strict && !doc.links.is_empty() {
63        body = apply_links(&body, &doc.links);
64    }
65    let md = if body.is_empty() {
66        String::new()
67    } else {
68        format!("{body}\n")
69    };
70    (md, ctx.artifacts)
71}
72
/// Rewrite each recovered link's anchor text in `body` as `[anchor](href)`.
///
/// Anchors are stored un-escaped, whereas the body carries the HTML-escaped form
/// (`&amp;`, `&lt;`, `&gt;`), so every anchor is escaped the same way before it
/// is searched for. Links are matched in the order given, each search starting
/// right after the previous replacement: a repeated anchor therefore links its
/// successive occurrences instead of hitting the first one again, and text
/// inside an already-emitted link is never rewritten. An empty anchor, or one
/// that cannot be found after the current position, is skipped.
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    let mut linked = String::with_capacity(body.len());
    // The part of `body` that has not been copied to `linked` yet.
    let mut rest = body;

    for (anchor, href) in links {
        let needle = anchor
            .replace('&', "&amp;")
            .replace('<', "&lt;")
            .replace('>', "&gt;");
        if needle.is_empty() {
            continue;
        }
        let Some(pos) = rest.find(&needle) else {
            continue;
        };
        let (before, from_match) = rest.split_at(pos);
        linked.push_str(before);
        linked.push('[');
        linked.push_str(&needle);
        linked.push_str("](");
        linked.push_str(href);
        linked.push(')');
        rest = &from_match[needle.len()..];
    }

    linked.push_str(rest);
    linked
}
101
/// Streaming variant of [`apply_links`]: rewrite anchors inside one `chunk`,
/// drawing links from a shared `queue`.
///
/// Queued links are tried in order, each search starting right after the
/// previous replacement in this chunk. A link whose anchor is found is rewritten
/// to `[anchor](href)` and consumed; one whose anchor is not found is put back on
/// the queue (order preserved) so a later chunk can place it. Links with an empty
/// anchor are dropped. On return, `queue` holds exactly the links still waiting
/// for their anchor.
fn apply_links_chunk(chunk: &str, queue: &mut Vec<(String, String)>) -> String {
    // Drain the queue; anything that does not match is pushed straight back.
    let pending = std::mem::take(queue);
    let mut linked = String::with_capacity(chunk.len());
    // The part of `chunk` that has not been copied to `linked` yet.
    let mut rest = chunk;

    for (anchor, href) in pending {
        let needle = anchor
            .replace('&', "&amp;")
            .replace('<', "&lt;")
            .replace('>', "&gt;");
        if needle.is_empty() {
            continue;
        }
        match rest.find(&needle) {
            Some(pos) => {
                let (before, from_match) = rest.split_at(pos);
                linked.push_str(before);
                linked.push('[');
                linked.push_str(&needle);
                linked.push_str("](");
                linked.push_str(&href);
                linked.push(')');
                rest = &from_match[needle.len()..];
            }
            // Not in this chunk; retry when a later chunk is flushed.
            None => queue.push((anchor, href)),
        }
    }

    linked.push_str(rest);
    linked
}
137
138/// Incremental Markdown serializer: feed finalized, in-document-order batches of
139/// [`Node`]s and receive Markdown chunks whose concatenation is **byte-identical**
140/// to [`to_markdown_images`] over the same nodes. This is the streaming
141/// counterpart of the buffered serializer — used to emit a document's Markdown in
142/// chunks (e.g. page by page, as the parallel PDF pipeline finishes pages) instead
143/// of building the whole string up front.
144///
145/// Only [`ImageMode::Placeholder`] and [`ImageMode::Embedded`] are streamable:
146/// [`ImageMode::Referenced`] needs a side-channel for the image bytes, which only
147/// the buffered [`to_markdown_images`] provides.
148///
149/// Each [`push`](Self::push) must contain whole blocks in reading order: a caller
150/// must not split a run of list items across two pushes (the run would render as
151/// two separate lists). Finalized PDF page batches already satisfy this.
152pub struct MarkdownStreamer {
153    strict: bool,
154    images: ImageMode,
155    compact_tables: bool,
156    /// Whether any non-empty chunk has been emitted yet (drives `\n\n` joins and
157    /// the trailing newline).
158    emitted_any: bool,
159    /// Recovered links not yet placed (strict mode), consumed in document order.
160    links: Vec<(String, String)>,
161}
162
163impl MarkdownStreamer {
164    /// Create a streamer. `compact_tables` mirrors [`DoclingDocument::compact_tables`].
165    pub fn new(strict: bool, images: ImageMode, compact_tables: bool) -> Self {
166        debug_assert!(
167            images != ImageMode::Referenced,
168            "referenced image mode is not streamable; use to_markdown_images"
169        );
170        Self {
171            strict,
172            images,
173            compact_tables,
174            emitted_any: false,
175            links: Vec::new(),
176        }
177    }
178
179    /// Render one finalized batch of nodes (plus any links recovered from the same
180    /// span, in document order) into the next Markdown chunk. Returns an empty
181    /// string when the batch produces no output (e.g. empty tables/pictures), in
182    /// which case nothing should be written.
183    pub fn push(&mut self, nodes: &[Node], links: &[(String, String)]) -> String {
184        self.links.extend(links.iter().cloned());
185        let mut ctx = Ctx {
186            strict: self.strict,
187            compact_tables: self.compact_tables,
188            images: self.images,
189            // Referenced mode is rejected at construction, so the artifact sink is
190            // never touched.
191            artifacts_dir: String::new(),
192            artifacts: Vec::new(),
193            pic_index: 0,
194        };
195        let mut blocks: Vec<String> = Vec::new();
196        render(nodes, &mut blocks, &mut ctx);
197        if blocks.is_empty() {
198            return String::new();
199        }
200        let mut body = blocks.join("\n\n");
201        if self.strict && !self.links.is_empty() {
202            body = apply_links_chunk(&body, &mut self.links);
203        }
204        let chunk = if self.emitted_any {
205            format!("\n\n{body}")
206        } else {
207            body
208        };
209        self.emitted_any = true;
210        chunk
211    }
212
213    /// Emit the trailing newline that finishes the document (empty if no content
214    /// was produced). Call exactly once, after the final [`push`](Self::push).
215    pub fn finish(self) -> String {
216        if self.emitted_any {
217            "\n".to_string()
218        } else {
219            String::new()
220        }
221    }
222}
223
/// Tidy inline text for strict mode; legacy mode returns `text` unchanged.
///
/// Strict mode favours readability over byte-for-byte docling fidelity. It
/// undoes the legacy `\_` underscore escaping and removes stray spaces around
/// punctuation and brackets (`[ 37 , 36 ]` → `[37, 36]`, `( x )` → `(x)`,
/// `*a* ,` → `*a*,`). The rewrites run one after another, in the order listed.
fn strict_text(text: &str, strict: bool) -> String {
    // (pattern, replacement) pairs, applied sequentially.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let mut cleaned = text.to_string();
    if strict {
        for (from, to) in REWRITES {
            cleaned = cleaned.replace(from, to);
        }
    }
    cleaned
}
244
245fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
246    let mut i = 0;
247    while i < nodes.len() {
248        match &nodes[i] {
249            Node::ListItem { .. } => {
250                let start = i;
251                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
252                    i += 1;
253                }
254                render_list_run(&nodes[start..i], blocks, ctx.strict);
255            }
256            other => {
257                render_one(other, blocks, ctx);
258                i += 1;
259            }
260        }
261    }
262}
263
264/// Render a contiguous run of list items.
265///
266/// Ordered items use their explicit `number`. A new sibling list (marked by
267/// `first_in_list`) at the same depth is separated by a blank line, matching
268/// docling-core's serializer.
269fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
270    let mut lines: Vec<String> = Vec::new();
271    // Per level, the previous item's (ordered, number) so we can detect a new
272    // sibling list.
273    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
274
275    for item in items {
276        let Node::ListItem {
277            ordered,
278            number,
279            first_in_list,
280            text,
281            level,
282        } = item
283        else {
284            continue;
285        };
286        let level = *level as usize;
287
288        // Returning to a shallower level ends the deeper sibling lists.
289        prev.truncate(level + 1);
290        while prev.len() <= level {
291            prev.push(None);
292        }
293
294        // A new sibling list at the same depth gets a blank line: the kind flips
295        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
296        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
297        if let Some((prev_ordered, prev_number)) = prev[level] {
298            let new_list = *first_in_list
299                || prev_ordered != *ordered
300                || (*ordered && *number != prev_number + 1);
301            if new_list {
302                lines.push(String::new());
303            }
304        }
305
306        let indent = "    ".repeat(level);
307        let marker = if *ordered {
308            format!("{number}.")
309        } else {
310            "-".to_string()
311        };
312        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
313        prev[level] = Some((*ordered, *number));
314    }
315
316    blocks.push(lines.join("\n"));
317}
318
319fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
320    match node {
321        Node::Heading { level, text } => {
322            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
323            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
324        }
325        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
326        Node::Code { language, text } => {
327            // Legacy docling never emits a language on the fence; strict keeps it.
328            let lang = match language {
329                Some(l) if ctx.strict => l.as_str(),
330                _ => "",
331            };
332            blocks.push(format!("```{lang}\n{text}\n```"));
333        }
334        Node::Table(table) => {
335            let rendered = render_table(table, ctx.compact_tables);
336            if !rendered.is_empty() {
337                blocks.push(rendered);
338            }
339        }
340        Node::Picture { caption, image } => {
341            if let Some(cap) = caption {
342                if !cap.is_empty() {
343                    blocks.push(cap.clone());
344                }
345            }
346            blocks.push(picture_marker(image.as_ref(), ctx));
347        }
348        Node::Group { children, .. } => render(children, blocks, ctx),
349        // Handled by the run-merging branch in `render`.
350        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
351    }
352}
353
354/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
355/// records the bytes in `ctx.artifacts` for the caller to write.
356fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
357    match (ctx.images, image) {
358        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
359        (ImageMode::Referenced, Some(img)) => {
360            let path = format!(
361                "{}/image_{:06}.{}",
362                ctx.artifacts_dir,
363                ctx.pic_index,
364                ext_for(&img.mimetype)
365            );
366            ctx.pic_index += 1;
367            ctx.artifacts.push((path.clone(), img.data.clone()));
368            format!("![Image]({path})")
369        }
370        // Placeholder, or any mode with no extracted image.
371        _ => "<!-- image -->".to_string(),
372    }
373}
374
/// Map an image MIME type to the file extension used for referenced images.
///
/// Recognizes JPEG, GIF, WebP, BMP and TIFF; every other MIME type maps to
/// `png`. The result is a string literal, so it is `'static` and does not borrow
/// from `mimetype`.
fn ext_for(mimetype: &str) -> &'static str {
    match mimetype {
        "image/jpeg" => "jpg",
        "image/gif" => "gif",
        "image/webp" => "webp",
        "image/bmp" => "bmp",
        "image/tiff" => "tif",
        _ => "png",
    }
}
385
// Table rendering overview (this describes `render_table`, defined further
// below). Its `compact` flag selects between two serializers:
//
// - **padded** (default) — docling-core's `tabulate(tablefmt="github")`: columns
//   are padded to a fixed width (header width + a minimum padding of 2, or the
//   widest data cell); numeric columns (every non-empty data cell parses as a
//   number) are right-aligned, others left-aligned; separators are plain dashes
//   of `width + 2`. Matches current published docling (DOCX/HTML conformance).
// - **compact** — `| a | b |` cells with single-dash `| - | - |` separators, no
//   width padding. Matches the committed PDF groundtruth corpus, which predates
//   the padded serializer.
//
// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break the
// table. Row 0 is the header.

/// Whether a table cell counts as a number for column alignment, matching
/// `tabulate`'s detection: an ordinary float/int (`f64`-parseable, covering
/// `1e2`/`inf`/`+1.5`) **or** a thousands-separated number like `7,015`.
///
/// Non-finite values are only numeric when spelled `inf`, `-inf` or `nan`
/// (case-insensitively), as in `tabulate`'s `_isnumber`. Rust's `f64` parser also
/// accepts `infinity`, signed variants such as `+inf`/`-nan`, and overflowing
/// literals such as `1e999`; `tabulate` treats all of those as text.
fn is_number_cell(t: &str) -> bool {
    match t.parse::<f64>() {
        Ok(value) if value.is_finite() => true,
        Ok(_) => matches!(t.to_ascii_lowercase().as_str(), "inf" | "-inf" | "nan"),
        Err(_) => is_thousands_number(t),
    }
}

/// A number with comma thousands-separators, per `tabulate`'s
/// `_float_with_thousands_separators` regex
/// (`^(([+-]?[0-9]{1,3})(?:,([0-9]{3}))*)?(?(1)\.[0-9]*|\.[0-9]+)?$`): the
/// integer part is an optional sign, 1–3 digits, then any number of `,ddd`
/// groups; the fraction is optional (and, without an integer part, must have at
/// least one digit). Unlike the bare regex, the empty string is rejected.
fn is_thousands_number(t: &str) -> bool {
    let b = t.as_bytes();
    let mut i = 0;
    if matches!(b.first(), Some(b'+' | b'-')) {
        i = 1;
    }
    // First digit chunk: 1–3 digits.
    let d0 = i;
    while i < b.len() && b[i].is_ascii_digit() && i - d0 < 3 {
        i += 1;
    }
    let has_int = i > d0;
    if has_int {
        // Subsequent `,ddd` groups (exactly three digits each).
        while b.get(i) == Some(&b',')
            && b.get(i + 1).is_some_and(u8::is_ascii_digit)
            && b.get(i + 2).is_some_and(u8::is_ascii_digit)
            && b.get(i + 3).is_some_and(u8::is_ascii_digit)
        {
            i += 4;
        }
    } else {
        // A sign only counts with an integer part: rewind so a leading sign makes
        // the fraction check below fail.
        i = 0;
    }
    // Optional fraction.
    if b.get(i) == Some(&b'.') {
        i += 1;
        let f0 = i;
        while i < b.len() && b[i].is_ascii_digit() {
            i += 1;
        }
        if !has_int && i == f0 {
            return false; // `.` with no digits and no integer part
        }
    } else if !has_int {
        return false; // neither integer nor fractional part
    }
    // Anything left over (extra digits, stray commas, letters) disqualifies it.
    i == b.len()
}
453
454fn render_table(table: &Table, compact: bool) -> String {
455    if table.rows.is_empty() {
456        return String::new();
457    }
458    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
459    if num_cols == 0 {
460        return String::new();
461    }
462
463    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
464    // strips data cells of surrounding whitespace but leaves the header row as-is.
465    let grid: Vec<Vec<String>> = table
466        .rows
467        .iter()
468        .enumerate()
469        .map(|(r, row)| {
470            (0..num_cols)
471                .map(|c| {
472                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
473                    if r == 0 {
474                        cell
475                    } else {
476                        cell.trim().to_string()
477                    }
478                })
479                .collect()
480        })
481        .collect();
482
483    if compact {
484        // Compact: cells joined by " | ", no padding, single-dash separators.
485        let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
486        let mut lines = Vec::with_capacity(grid.len() + 1);
487        lines.push(render_row(0));
488        let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
489        lines.push(format!("| {} |", sep.join(" | ")));
490        for r in 1..grid.len() {
491            lines.push(render_row(r));
492        }
493        return lines.join("\n");
494    }
495
496    // Display width (Unicode scalar count — good enough for now).
497    let dw = |s: &str| s.chars().count();
498    let data_rows = 1..grid.len();
499
500    // A column is right-aligned when at least one data cell is numeric and every
501    // non-empty data cell is numeric — matching `tabulate`'s column typing, where
502    // empty cells are "missing" (ignored) and a number may carry thousands
503    // separators (`7,015`), which a plain `f64` parse rejects.
504    let right: Vec<bool> = (0..num_cols)
505        .map(|c| {
506            let mut any = false;
507            for r in data_rows.clone() {
508                let t = grid[r][c].trim();
509                if t.is_empty() {
510                    continue;
511                }
512                if !is_number_cell(t) {
513                    return false;
514                }
515                any = true;
516            }
517            any
518        })
519        .collect();
520
521    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
522    let width: Vec<usize> = (0..num_cols)
523        .map(|c| {
524            let mut w = dw(&grid[0][c]) + 2;
525            for r in data_rows.clone() {
526                w = w.max(dw(&grid[r][c]));
527            }
528            w
529        })
530        .collect();
531
532    let fmt_cell = |s: &str, c: usize| -> String {
533        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
534        let body = if right[c] {
535            format!("{pad}{s}")
536        } else {
537            format!("{s}{pad}")
538        };
539        format!(" {body} ")
540    };
541    let render_row = |r: usize| -> String {
542        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
543        format!("|{}|", cells.join("|"))
544    };
545
546    let mut lines = Vec::with_capacity(grid.len() + 1);
547    lines.push(render_row(0));
548    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
549    lines.push(format!("|{}|", sep.join("|")));
550    for r in data_rows {
551        lines.push(render_row(r));
552    }
553    lines.join("\n")
554}
555
/// Make a cell's text safe to place inside a Markdown table row: every newline
/// becomes a space and every pipe becomes the `&#124;` HTML entity (the same
/// escaping docling-core applies).
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
561
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unordered, top-level list item.
    fn bullet(number: u64, first_in_list: bool, text: &str) -> Node {
        Node::ListItem {
            ordered: false,
            number,
            first_in_list,
            text: text.into(),
            level: 0,
        }
    }

    /// A 2×2 table: header `a | b`, one data row `1 | 2`.
    fn two_by_two() -> Node {
        Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        })
    }

    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(bullet(1, true, "first"));
        doc.push(bullet(2, false, "second"));
        assert_eq!(
            doc.export_to_markdown(),
            "# Title\n\nHello world.\n\n- first\n- second\n"
        );
    }

    #[test]
    fn strict_renders_recovered_links_legacy_does_not() {
        let mut doc = DoclingDocument::new("cv");
        doc.add_paragraph("Find me on LinkedIn or GitHub.");
        doc.links = vec![
            ("LinkedIn".into(), "https://www.linkedin.com/in/x/".into()),
            ("GitHub".into(), "https://github.com/x/".into()),
        ];
        // Legacy output must not contain links, to stay conformant with docling.
        assert_eq!(doc.export_to_markdown(), "Find me on LinkedIn or GitHub.\n");
        // Strict output wraps each anchor in a Markdown link.
        assert_eq!(
            doc.export_to_markdown_with(true),
            "Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
        );
    }

    #[test]
    fn strict_links_match_escaped_anchor_and_consume_in_order() {
        let mut doc = DoclingDocument::new("d");
        // The body already carries the HTML-escaped `&amp;` while the anchor is
        // stored un-escaped, so the matcher has to escape the anchor to find it.
        // The two identical anchors must link in document order.
        doc.add_paragraph("AI &amp; ML here, and issues here, then issues there.");
        doc.links = vec![
            ("AI & ML".into(), "https://a/".into()),
            ("issues".into(), "https://first/".into()),
            ("issues".into(), "https://second/".into()),
        ];
        assert_eq!(
            doc.export_to_markdown_with(true),
            "[AI &amp; ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
        );
    }

    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        // Compact tables are opt-in; the default is the padded GitHub serializer.
        doc.compact_tables = true;
        doc.push(two_by_two());
        assert_eq!(
            doc.export_to_markdown(),
            "| a | b |\n| - | - |\n| 1 | 2 |\n"
        );
    }

    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(two_by_two());
        // Numeric columns are right-aligned and padded to header width + 2.
        assert_eq!(
            doc.export_to_markdown(),
            "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n"
        );
    }

    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(bullet(1, true, "i\\_j"));
        // Legacy keeps the `\_` escaping byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        // Strict emits literal underscores.
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Stream `doc`'s nodes through a [`MarkdownStreamer`], cutting batches at the
    /// node indices in `splits`, and assert that the concatenated chunks equal the
    /// buffered serializer's output.
    fn assert_stream_matches(
        doc: &DoclingDocument,
        strict: bool,
        images: ImageMode,
        splits: &[usize],
    ) {
        let expected = to_markdown_images(doc, strict, images, "artifacts").0;
        let mut streamer = MarkdownStreamer::new(strict, images, doc.compact_tables);
        let mut streamed = String::new();
        let mut start = 0;
        // Every split point ends a batch; the final batch runs to the last node.
        let batch_ends = splits
            .iter()
            .copied()
            .chain(std::iter::once(doc.nodes.len()));
        for end in batch_ends {
            // All links go in with the batch that starts at node 0; the streamer's
            // queue keeps them in document order for later batches.
            let links = if start == 0 {
                doc.links.as_slice()
            } else {
                &[]
            };
            streamed.push_str(&streamer.push(&doc.nodes[start..end], links));
            start = end;
        }
        streamed.push_str(&streamer.finish());
        assert_eq!(
            streamed, expected,
            "streamed output diverged (splits={splits:?}, strict={strict})"
        );
    }

    #[test]
    fn streaming_is_byte_identical_to_buffered() {
        let mut doc = DoclingDocument::new("d");
        doc.add_heading(1, "Title");
        doc.add_paragraph("First paragraph.");
        doc.push(bullet(1, true, "a"));
        doc.push(bullet(2, false, "b"));
        doc.push(Node::Code {
            language: Some("rust".into()),
            text: "let x = 1;".into(),
        });
        doc.push(two_by_two());
        doc.push(Node::Picture {
            caption: Some("Fig 1".into()),
            image: None,
        });
        doc.add_paragraph("Last paragraph.");

        // Splits only fall on block boundaries: a run of list items must never
        // straddle two pushes.
        let split_sets: [&[usize]; 5] = [&[], &[1], &[2], &[4], &[1, 4, 6]];
        for strict in [false, true] {
            for images in [ImageMode::Placeholder, ImageMode::Embedded] {
                for splits in split_sets {
                    assert_stream_matches(&doc, strict, images, splits);
                }
            }
        }
    }

    #[test]
    fn streaming_applies_recovered_links_in_strict_mode() {
        let mut doc = DoclingDocument::new("d");
        doc.add_paragraph("See LinkedIn for details.");
        doc.add_paragraph("And GitHub too.");
        doc.links = vec![
            ("LinkedIn".into(), "https://lnkd/".into()),
            ("GitHub".into(), "https://gh/".into()),
        ];
        // The second anchor sits in the second batch, so its link has to be carried
        // over the split and placed when that batch streams out.
        assert_stream_matches(&doc, true, ImageMode::Placeholder, &[1]);
    }

    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        // Legacy leaves the spacing untouched.
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        // Strict removes the stray spaces around punctuation and brackets.
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}