Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Picture rendering strategy (the counterpart of docling-core's `ImageRefMode`).
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` comment. This is docling's default and the only
    /// mode that works without image data.
    #[default]
    Placeholder,
    /// Inline the picture as `![Image](data:<mime>;base64,…)` so the Markdown is
    /// self-contained.
    Embedded,
    /// Link to `![Image](<artifacts>/image_NNNNNN.<ext>)` and hand the bytes back
    /// to the caller, who is responsible for writing them.
    Referenced,
}
17
18/// Serializer state threaded through the render walk.
19struct Ctx {
20    strict: bool,
21    /// Emit compact `| a | b |` tables instead of the padded GitHub serializer.
22    compact_tables: bool,
23    images: ImageMode,
24    artifacts_dir: String,
25    /// (relative path, bytes) for each referenced image — written by the caller.
26    artifacts: Vec<(String, Vec<u8>)>,
27    pic_index: usize,
28}
29
30/// Render a document to a Markdown string (pictures as placeholders).
31///
32/// `strict` selects the serializer-level behaviours that differ between
33/// docling-legacy output and cleaner Markdown — currently the code-fence
34/// language (legacy drops it, strict keeps it).
35pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
40/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
41/// the caller should write (relative to the Markdown file).
42pub fn to_markdown_images(
43    doc: &DoclingDocument,
44    strict: bool,
45    images: ImageMode,
46    artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48    let mut ctx = Ctx {
49        strict,
50        compact_tables: doc.compact_tables,
51        images,
52        artifacts_dir: artifacts_dir.to_string(),
53        artifacts: Vec::new(),
54        pic_index: 0,
55    };
56    let mut blocks: Vec<String> = Vec::new();
57    render(&doc.nodes, &mut blocks, &mut ctx);
58    let mut body = blocks.join("\n\n");
59    // Strict mode only: turn recovered source hyperlinks into Markdown links.
60    // docling's standard pipeline drops them, so doing this in legacy mode would
61    // diverge from docling — hence strict-only, leaving conformance output intact.
62    if strict && !doc.links.is_empty() {
63        body = apply_links(&body, &doc.links);
64    }
65    let md = if body.is_empty() {
66        String::new()
67    } else {
68        format!("{body}\n")
69    };
70    (md, ctx.artifacts)
71}
72
/// Rewrite the anchor text of each recovered link as Markdown `[anchor](href)`.
///
/// Anchors are stored un-escaped, while the body carries prose in HTML-escaped
/// form (`&`/`<`/`>`), so each anchor is escaped the same way before searching.
/// Links are matched in order against the text that follows the previous match:
/// a repeated anchor therefore links successive occurrences instead of the first
/// one every time, and text inside an already-emitted link is never searched
/// again. An anchor that cannot be found in the remaining text is skipped.
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    /// Escape `&`, `<` and `>` the way the body was serialized.
    fn html_escape(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for ch in raw.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    let mut linked = String::with_capacity(body.len());
    // The still-unsearched tail of `body`; everything before it is final.
    let mut rest = body;
    for (anchor, href) in links {
        let needle = html_escape(anchor);
        if needle.is_empty() {
            continue;
        }
        let Some(pos) = rest.find(&needle) else {
            continue;
        };
        linked.push_str(&rest[..pos]);
        linked.push('[');
        linked.push_str(&needle);
        linked.push_str("](");
        linked.push_str(href);
        linked.push(')');
        rest = &rest[pos + needle.len()..];
    }
    linked.push_str(rest);
    linked
}
101
/// Chunk-at-a-time variant of [`apply_links`], driven by a shared link queue.
///
/// Every queued link is tried, in queue order, against the part of `chunk` that
/// follows the previous match. A link whose anchor is found is rewritten as
/// `[anchor](href)` and removed from the queue; one whose anchor is not found
/// stays queued (in its original relative order) for a later chunk; one with an
/// empty anchor is discarded. When links arrive in document order and chunks are
/// contiguous runs of whole blocks, this mirrors the single moving cursor of
/// [`apply_links`]. A link whose anchor never shows up simply remains in the
/// queue and is never applied.
fn apply_links_chunk(chunk: &str, queue: &mut Vec<(String, String)>) -> String {
    /// Escape `&`, `<` and `>` the way the chunk was serialized.
    fn html_escape(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for ch in raw.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    let mut rewritten = String::with_capacity(chunk.len());
    // The still-unsearched tail of `chunk`.
    let mut rest = chunk;
    // `retain` visits the links in order; returning `true` keeps a link queued.
    queue.retain(|(anchor_raw, href)| {
        let needle = html_escape(anchor_raw);
        if needle.is_empty() {
            return false;
        }
        match rest.find(&needle) {
            Some(pos) => {
                rewritten.push_str(&rest[..pos]);
                rewritten.push('[');
                rewritten.push_str(&needle);
                rewritten.push_str("](");
                rewritten.push_str(href);
                rewritten.push(')');
                rest = &rest[pos + needle.len()..];
                false
            }
            // Not in this chunk; try again when a later chunk is flushed.
            None => true,
        }
    });
    rewritten.push_str(rest);
    rewritten
}
137
138/// Incremental Markdown serializer: feed finalized, in-document-order batches of
139/// [`Node`]s and receive Markdown chunks whose concatenation is **byte-identical**
140/// to [`to_markdown_images`] over the same nodes. This is the streaming
141/// counterpart of the buffered serializer — used to emit a document's Markdown in
142/// chunks (e.g. page by page, as the parallel PDF pipeline finishes pages) instead
143/// of building the whole string up front.
144///
145/// Only [`ImageMode::Placeholder`] and [`ImageMode::Embedded`] are streamable:
146/// [`ImageMode::Referenced`] needs a side-channel for the image bytes, which only
147/// the buffered [`to_markdown_images`] provides.
148///
149/// Each [`push`](Self::push) must contain whole blocks in reading order: a caller
150/// must not split a run of list items across two pushes (the run would render as
151/// two separate lists). Finalized PDF page batches already satisfy this.
152pub struct MarkdownStreamer {
153    strict: bool,
154    images: ImageMode,
155    compact_tables: bool,
156    /// Whether any non-empty chunk has been emitted yet (drives `\n\n` joins and
157    /// the trailing newline).
158    emitted_any: bool,
159    /// Recovered links not yet placed (strict mode), consumed in document order.
160    links: Vec<(String, String)>,
161}
162
163impl MarkdownStreamer {
164    /// Create a streamer. `compact_tables` mirrors [`DoclingDocument::compact_tables`].
165    pub fn new(strict: bool, images: ImageMode, compact_tables: bool) -> Self {
166        debug_assert!(
167            images != ImageMode::Referenced,
168            "referenced image mode is not streamable; use to_markdown_images"
169        );
170        Self {
171            strict,
172            images,
173            compact_tables,
174            emitted_any: false,
175            links: Vec::new(),
176        }
177    }
178
179    /// Render one finalized batch of nodes (plus any links recovered from the same
180    /// span, in document order) into the next Markdown chunk. Returns an empty
181    /// string when the batch produces no output (e.g. empty tables/pictures), in
182    /// which case nothing should be written.
183    pub fn push(&mut self, nodes: &[Node], links: &[(String, String)]) -> String {
184        self.links.extend(links.iter().cloned());
185        let mut ctx = Ctx {
186            strict: self.strict,
187            compact_tables: self.compact_tables,
188            images: self.images,
189            // Referenced mode is rejected at construction, so the artifact sink is
190            // never touched.
191            artifacts_dir: String::new(),
192            artifacts: Vec::new(),
193            pic_index: 0,
194        };
195        let mut blocks: Vec<String> = Vec::new();
196        render(nodes, &mut blocks, &mut ctx);
197        if blocks.is_empty() {
198            return String::new();
199        }
200        let mut body = blocks.join("\n\n");
201        if self.strict && !self.links.is_empty() {
202            body = apply_links_chunk(&body, &mut self.links);
203        }
204        let chunk = if self.emitted_any {
205            format!("\n\n{body}")
206        } else {
207            body
208        };
209        self.emitted_any = true;
210        chunk
211    }
212
213    /// Emit the trailing newline that finishes the document (empty if no content
214    /// was produced). Call exactly once, after the final [`push`](Self::push).
215    pub fn finish(self) -> String {
216        if self.emitted_any {
217            "\n".to_string()
218        } else {
219            String::new()
220        }
221    }
222}
223
/// Readability rewrite applied to inline text when `strict` is set; with `strict`
/// unset the text is returned unchanged.
///
/// The rewrite undoes the `\_` underscore escaping and removes stray spaces next
/// to punctuation and brackets (`[ 37 , 36 ]` → `[37, 36]`, `( x )` → `(x)`,
/// `*a* ,` → `*a*,`). The substitutions are plain substring replacements applied
/// one after another, in the order listed below.
fn strict_text(text: &str, strict: bool) -> String {
    /// `(from, to)` pairs, applied sequentially.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let mut cleaned = text.to_owned();
    if strict {
        for &(from, to) in REWRITES.iter() {
            cleaned = cleaned.replace(from, to);
        }
    }
    cleaned
}
244
245fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
246    let mut i = 0;
247    while i < nodes.len() {
248        match &nodes[i] {
249            Node::ListItem { .. } => {
250                let start = i;
251                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
252                    i += 1;
253                }
254                render_list_run(&nodes[start..i], blocks, ctx.strict);
255            }
256            other => {
257                render_one(other, blocks, ctx);
258                i += 1;
259            }
260        }
261    }
262}
263
264/// Render a contiguous run of list items.
265///
266/// Ordered items use their explicit `number`. A new sibling list (marked by
267/// `first_in_list`) at the same depth is separated by a blank line, matching
268/// docling-core's serializer.
269fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
270    let mut lines: Vec<String> = Vec::new();
271    // Per level, the previous item's (ordered, number) so we can detect a new
272    // sibling list.
273    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
274
275    for item in items {
276        let Node::ListItem {
277            ordered,
278            number,
279            first_in_list,
280            text,
281            level,
282        } = item
283        else {
284            continue;
285        };
286        let level = *level as usize;
287
288        // Returning to a shallower level ends the deeper sibling lists.
289        prev.truncate(level + 1);
290        while prev.len() <= level {
291            prev.push(None);
292        }
293
294        // A new sibling list at the same depth gets a blank line: the kind flips
295        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
296        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
297        if let Some((prev_ordered, prev_number)) = prev[level] {
298            let new_list = *first_in_list
299                || prev_ordered != *ordered
300                || (*ordered && *number != prev_number + 1);
301            if new_list {
302                lines.push(String::new());
303            }
304        }
305
306        let indent = "    ".repeat(level);
307        let marker = if *ordered {
308            format!("{number}.")
309        } else {
310            "-".to_string()
311        };
312        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
313        prev[level] = Some((*ordered, *number));
314    }
315
316    blocks.push(lines.join("\n"));
317}
318
319fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
320    match node {
321        Node::Heading { level, text } => {
322            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
323            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
324        }
325        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
326        Node::Code { language, text } => {
327            // Legacy docling never emits a language on the fence; strict keeps it.
328            let lang = match language {
329                Some(l) if ctx.strict => l.as_str(),
330                _ => "",
331            };
332            blocks.push(format!("```{lang}\n{text}\n```"));
333        }
334        Node::Table(table) => {
335            let rendered = render_table(table, ctx.compact_tables);
336            if !rendered.is_empty() {
337                blocks.push(rendered);
338            }
339        }
340        Node::Picture { caption, image } => {
341            if let Some(cap) = caption {
342                if !cap.is_empty() {
343                    blocks.push(cap.clone());
344                }
345            }
346            blocks.push(picture_marker(image.as_ref(), ctx));
347        }
348        Node::Group { children, .. } => render(children, blocks, ctx),
349        // Handled by the run-merging branch in `render`.
350        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
351    }
352}
353
354/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
355/// records the bytes in `ctx.artifacts` for the caller to write.
356fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
357    match (ctx.images, image) {
358        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
359        (ImageMode::Referenced, Some(img)) => {
360            let path = format!(
361                "{}/image_{:06}.{}",
362                ctx.artifacts_dir,
363                ctx.pic_index,
364                ext_for(&img.mimetype)
365            );
366            ctx.pic_index += 1;
367            ctx.artifacts.push((path.clone(), img.data.clone()));
368            format!("![Image]({path})")
369        }
370        // Placeholder, or any mode with no extracted image.
371        _ => "<!-- image -->".to_string(),
372    }
373}
374
/// Map an image MIME type to a file extension (without the dot). Any MIME type
/// not listed in the table falls back to `png`.
fn ext_for(mimetype: &str) -> &str {
    /// `(MIME type, extension)` pairs for the explicitly recognized formats.
    const KNOWN: [(&str, &str); 5] = [
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tif"),
    ];

    KNOWN
        .iter()
        .find(|(mime, _)| *mime == mimetype)
        .map_or("png", |(_, ext)| *ext)
}
385
386/// Render a table. `compact` selects between two serializers:
387///
388/// - **padded** (default) — docling-core's `tabulate(tablefmt="github")`: columns
389///   are padded to a fixed width (header width + a minimum padding of 2, or the
390///   widest data cell); numeric columns (every data cell parses as a number) are
391///   right-aligned, others left-aligned; separators are plain dashes of
392///   `width + 2`. Matches current published docling (DOCX/HTML conformance).
393/// - **compact** — `| a | b |` cells with single-dash `| - | - |` separators, no
394///   width padding. Matches the committed PDF groundtruth corpus, which predates
395///   the padded serializer.
396///
397/// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break the
398/// table. Row 0 is the header.
399fn render_table(table: &Table, compact: bool) -> String {
400    if table.rows.is_empty() {
401        return String::new();
402    }
403    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
404    if num_cols == 0 {
405        return String::new();
406    }
407
408    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
409    // strips data cells of surrounding whitespace but leaves the header row as-is.
410    let grid: Vec<Vec<String>> = table
411        .rows
412        .iter()
413        .enumerate()
414        .map(|(r, row)| {
415            (0..num_cols)
416                .map(|c| {
417                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
418                    if r == 0 {
419                        cell
420                    } else {
421                        cell.trim().to_string()
422                    }
423                })
424                .collect()
425        })
426        .collect();
427
428    if compact {
429        // Compact: cells joined by " | ", no padding, single-dash separators.
430        let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
431        let mut lines = Vec::with_capacity(grid.len() + 1);
432        lines.push(render_row(0));
433        let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
434        lines.push(format!("| {} |", sep.join(" | ")));
435        for r in 1..grid.len() {
436            lines.push(render_row(r));
437        }
438        return lines.join("\n");
439    }
440
441    // Display width (Unicode scalar count — good enough for now).
442    let dw = |s: &str| s.chars().count();
443    let data_rows = 1..grid.len();
444
445    // A column is right-aligned when it has data and every data cell is numeric.
446    let right: Vec<bool> = (0..num_cols)
447        .map(|c| {
448            !data_rows.is_empty()
449                && data_rows.clone().all(|r| {
450                    let t = grid[r][c].trim();
451                    !t.is_empty() && t.parse::<f64>().is_ok()
452                })
453        })
454        .collect();
455
456    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
457    let width: Vec<usize> = (0..num_cols)
458        .map(|c| {
459            let mut w = dw(&grid[0][c]) + 2;
460            for r in data_rows.clone() {
461                w = w.max(dw(&grid[r][c]));
462            }
463            w
464        })
465        .collect();
466
467    let fmt_cell = |s: &str, c: usize| -> String {
468        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
469        let body = if right[c] {
470            format!("{pad}{s}")
471        } else {
472            format!("{s}{pad}")
473        };
474        format!(" {body} ")
475    };
476    let render_row = |r: usize| -> String {
477        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
478        format!("|{}|", cells.join("|"))
479    };
480
481    let mut lines = Vec::with_capacity(grid.len() + 1);
482    lines.push(render_row(0));
483    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
484    lines.push(format!("|{}|", sep.join("|")));
485    for r in data_rows {
486        lines.push(render_row(r));
487    }
488    lines.join("\n")
489}
490
/// Make a cell's text safe to place inside a Markdown table row: each newline is
/// turned into a space and each pipe into the `&#124;` HTML entity. All other
/// characters are copied through unchanged.
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
496
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unordered, top-level list item.
    fn bullet(number: u64, first_in_list: bool, text: &str) -> Node {
        Node::ListItem {
            ordered: false,
            number,
            first_in_list,
            text: text.into(),
            level: 0,
        }
    }

    /// Build the 2×2 table (`a b` header over a `1 2` row) shared by several tests.
    fn two_by_two_table() -> Node {
        Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        })
    }

    /// Build an `(anchor, href)` link entry.
    fn link(anchor: &str, href: &str) -> (String, String) {
        (anchor.to_string(), href.to_string())
    }

    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(bullet(1, true, "first"));
        doc.push(bullet(2, false, "second"));
        assert_eq!(
            doc.export_to_markdown(),
            "# Title\n\nHello world.\n\n- first\n- second\n"
        );
    }

    #[test]
    fn strict_renders_recovered_links_legacy_does_not() {
        let mut doc = DoclingDocument::new("cv");
        doc.add_paragraph("Find me on LinkedIn or GitHub.");
        doc.links = vec![
            link("LinkedIn", "https://www.linkedin.com/in/x/"),
            link("GitHub", "https://github.com/x/"),
        ];
        // Legacy mode leaves the text alone, preserving conformance output.
        let legacy = doc.export_to_markdown();
        assert_eq!(legacy, "Find me on LinkedIn or GitHub.\n");
        // Strict mode turns each anchor into a Markdown link.
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(
            strict,
            "Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
        );
    }

    #[test]
    fn strict_links_match_escaped_anchor_and_consume_in_order() {
        let mut doc = DoclingDocument::new("d");
        // The body already carries `&amp;` while the anchor is stored un-escaped,
        // so the matcher has to escape the anchor before searching. The two
        // identical anchors must link successive occurrences, in order.
        doc.add_paragraph("AI &amp; ML here, and issues here, then issues there.");
        doc.links = vec![
            link("AI & ML", "https://a/"),
            link("issues", "https://first/"),
            link("issues", "https://second/"),
        ];
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(
            strict,
            "[AI &amp; ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
        );
    }

    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        // Compact tables are opt-in; without the flag the padded serializer runs.
        doc.compact_tables = true;
        doc.push(two_by_two_table());
        assert_eq!(doc.export_to_markdown(), "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(two_by_two_table());
        // Numeric data columns are right-aligned; columns are padded to header+2.
        assert_eq!(
            doc.export_to_markdown(),
            "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n"
        );
    }

    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(bullet(1, true, "i\\_j"));
        // Legacy keeps the `\_` escaping byte-for-byte.
        let legacy = doc.export_to_markdown();
        assert_eq!(legacy, "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        // Strict writes literal underscores.
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(strict, "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Feed `doc`'s nodes through a [`MarkdownStreamer`], cutting batches at the
    /// node indices in `splits`, and assert that the concatenated chunks equal
    /// the buffered serializer's output.
    fn assert_stream_matches(
        doc: &DoclingDocument,
        strict: bool,
        images: ImageMode,
        splits: &[usize],
    ) {
        let want = to_markdown_images(doc, strict, images, "artifacts").0;
        let mut streamer = MarkdownStreamer::new(strict, images, doc.compact_tables);
        let mut got = String::new();
        let mut start = 0;
        // Each split ends a batch; the trailing `None` stands for "to the end".
        let batch_ends = splits.iter().copied().map(Some).chain(std::iter::once(None));
        for end in batch_ends {
            let batch = match end {
                Some(end) => &doc.nodes[start..end],
                None => &doc.nodes[start..],
            };
            // All links are handed over with the batch that starts at node 0;
            // the streamer's queue carries them forward in document order.
            let links: &[(String, String)] = if start == 0 { &doc.links } else { &[] };
            got.push_str(&streamer.push(batch, links));
            if let Some(end) = end {
                start = end;
            }
        }
        got.push_str(&streamer.finish());
        assert_eq!(
            got, want,
            "streamed output diverged (splits={splits:?}, strict={strict})"
        );
    }

    #[test]
    fn streaming_is_byte_identical_to_buffered() {
        let mut doc = DoclingDocument::new("d");
        doc.add_heading(1, "Title");
        doc.add_paragraph("First paragraph.");
        doc.push(bullet(1, true, "a"));
        doc.push(bullet(2, false, "b"));
        doc.push(Node::Code {
            language: Some("rust".into()),
            text: "let x = 1;".into(),
        });
        doc.push(two_by_two_table());
        doc.push(Node::Picture {
            caption: Some("Fig 1".into()),
            image: None,
        });
        doc.add_paragraph("Last paragraph.");

        // Every split falls on a block boundary: a run of list items must never
        // straddle two batches.
        let split_sets: [&[usize]; 5] = [&[], &[1], &[2], &[4], &[1, 4, 6]];
        for strict in [false, true] {
            for images in [ImageMode::Placeholder, ImageMode::Embedded] {
                for splits in split_sets {
                    assert_stream_matches(&doc, strict, images, splits);
                }
            }
        }
    }

    #[test]
    fn streaming_applies_recovered_links_in_strict_mode() {
        let mut doc = DoclingDocument::new("d");
        doc.add_paragraph("See LinkedIn for details.");
        doc.add_paragraph("And GitHub too.");
        doc.links = vec![
            link("LinkedIn", "https://lnkd/"),
            link("GitHub", "https://gh/"),
        ];
        // The second anchor sits in the second batch, so its link has to be
        // carried over the batch boundary and placed when that batch streams out.
        assert_stream_matches(&doc, true, ImageMode::Placeholder, &[1]);
    }

    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        // Legacy keeps the spacing byte-for-byte.
        let legacy = doc.export_to_markdown();
        assert_eq!(legacy, "see [ 37 , 36 ] and ( x ) .\n");
        // Strict tightens the punctuation.
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(strict, "see [37, 36] and (x).\n");
    }
}