Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Strategy for serializing picture nodes (the counterpart of docling-core's
/// `ImageRefMode`).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ImageMode {
    /// Emit the literal `<!-- image -->` comment. This is docling's default and
    /// needs no image data at all.
    #[default]
    Placeholder,
    /// Emit `![Image](data:<mime>;base64,…)`, inlining the picture so the
    /// Markdown is self-contained.
    Embedded,
    /// Emit `![Image](<artifacts>/image_NNNNNN.<ext>)` and hand the image bytes
    /// back to the caller, who is responsible for writing them to disk.
    Referenced,
}
17
/// Serializer state threaded through the render walk.
///
/// One value is built per export in `to_markdown_images` and passed by mutable
/// reference to every render helper.
struct Ctx {
    /// Strict-mode flag: forwarded to `strict_text` for inline text and used to
    /// decide whether a code fence keeps its language tag.
    strict: bool,
    /// Emit compact `| a | b |` tables instead of the padded GitHub serializer.
    compact_tables: bool,
    /// How picture nodes are rendered (placeholder, embedded, or referenced).
    images: ImageMode,
    /// Directory prefix used when building the path of a referenced image.
    artifacts_dir: String,
    /// (relative path, bytes) for each referenced image — written by the caller.
    artifacts: Vec<(String, Vec<u8>)>,
    /// Running counter that numbers referenced image files; bumped once per
    /// referenced picture.
    pic_index: usize,
}
29
30/// Render a document to a Markdown string (pictures as placeholders).
31///
32/// `strict` selects the serializer-level behaviours that differ between
33/// docling-legacy output and cleaner Markdown — currently the code-fence
34/// language (legacy drops it, strict keeps it).
35pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
40/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
41/// the caller should write (relative to the Markdown file).
42pub fn to_markdown_images(
43    doc: &DoclingDocument,
44    strict: bool,
45    images: ImageMode,
46    artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48    let mut ctx = Ctx {
49        strict,
50        compact_tables: doc.compact_tables,
51        images,
52        artifacts_dir: artifacts_dir.to_string(),
53        artifacts: Vec::new(),
54        pic_index: 0,
55    };
56    let mut blocks: Vec<String> = Vec::new();
57    render(&doc.nodes, &mut blocks, &mut ctx);
58    let body = blocks.join("\n\n");
59    let md = if body.is_empty() {
60        String::new()
61    } else {
62        format!("{body}\n")
63    };
64    (md, ctx.artifacts)
65}
66
/// Rewrite inline text for readability when `strict` is set; otherwise return
/// it unchanged so legacy output keeps docling's exact bytes.
///
/// Strict mode applies a fixed sequence of literal substitutions, in order:
/// the legacy `\_` escape becomes a plain underscore, and a space sitting
/// directly before `,` `.` `;` `)` `]` or directly after `(` `[` is removed
/// (`[ 37 , 36 ]` → `[37, 36]`, `( x )` → `(x)`, `*a* ,` → `*a*,`).
///
/// Callers use this for headings, paragraphs and list items only; code blocks
/// and table cells are never passed through it.
fn strict_text(text: &str, strict: bool) -> String {
    // (needle, replacement) pairs; later pairs see the output of earlier ones.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let mut out = text.to_string();
    if strict {
        for &(needle, replacement) in &REWRITES {
            out = out.replace(needle, replacement);
        }
    }
    out
}
87
88fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
89    let mut i = 0;
90    while i < nodes.len() {
91        match &nodes[i] {
92            Node::ListItem { .. } => {
93                let start = i;
94                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
95                    i += 1;
96                }
97                render_list_run(&nodes[start..i], blocks, ctx.strict);
98            }
99            other => {
100                render_one(other, blocks, ctx);
101                i += 1;
102            }
103        }
104    }
105}
106
107/// Render a contiguous run of list items.
108///
109/// Ordered items use their explicit `number`. A new sibling list (marked by
110/// `first_in_list`) at the same depth is separated by a blank line, matching
111/// docling-core's serializer.
112fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
113    let mut lines: Vec<String> = Vec::new();
114    // Per level, the previous item's (ordered, number) so we can detect a new
115    // sibling list.
116    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
117
118    for item in items {
119        let Node::ListItem {
120            ordered,
121            number,
122            first_in_list,
123            text,
124            level,
125        } = item
126        else {
127            continue;
128        };
129        let level = *level as usize;
130
131        // Returning to a shallower level ends the deeper sibling lists.
132        prev.truncate(level + 1);
133        while prev.len() <= level {
134            prev.push(None);
135        }
136
137        // A new sibling list at the same depth gets a blank line: the kind flips
138        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
139        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
140        if let Some((prev_ordered, prev_number)) = prev[level] {
141            let new_list = *first_in_list
142                || prev_ordered != *ordered
143                || (*ordered && *number != prev_number + 1);
144            if new_list {
145                lines.push(String::new());
146            }
147        }
148
149        let indent = "    ".repeat(level);
150        let marker = if *ordered {
151            format!("{number}.")
152        } else {
153            "-".to_string()
154        };
155        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
156        prev[level] = Some((*ordered, *number));
157    }
158
159    blocks.push(lines.join("\n"));
160}
161
162fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
163    match node {
164        Node::Heading { level, text } => {
165            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
166            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
167        }
168        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
169        Node::Code { language, text } => {
170            // Legacy docling never emits a language on the fence; strict keeps it.
171            let lang = match language {
172                Some(l) if ctx.strict => l.as_str(),
173                _ => "",
174            };
175            blocks.push(format!("```{lang}\n{text}\n```"));
176        }
177        Node::Table(table) => {
178            let rendered = render_table(table, ctx.compact_tables);
179            if !rendered.is_empty() {
180                blocks.push(rendered);
181            }
182        }
183        Node::Picture { caption, image } => {
184            if let Some(cap) = caption {
185                if !cap.is_empty() {
186                    blocks.push(cap.clone());
187                }
188            }
189            blocks.push(picture_marker(image.as_ref(), ctx));
190        }
191        Node::Group { children, .. } => render(children, blocks, ctx),
192        // Handled by the run-merging branch in `render`.
193        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
194    }
195}
196
197/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
198/// records the bytes in `ctx.artifacts` for the caller to write.
199fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
200    match (ctx.images, image) {
201        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
202        (ImageMode::Referenced, Some(img)) => {
203            let path = format!(
204                "{}/image_{:06}.{}",
205                ctx.artifacts_dir,
206                ctx.pic_index,
207                ext_for(&img.mimetype)
208            );
209            ctx.pic_index += 1;
210            ctx.artifacts.push((path.clone(), img.data.clone()));
211            format!("![Image]({path})")
212        }
213        // Placeholder, or any mode with no extracted image.
214        _ => "<!-- image -->".to_string(),
215    }
216}
217
/// Map an image MIME type to the file extension used for referenced images.
///
/// Recognises JPEG, GIF, WebP, BMP and TIFF; any other value (including
/// `image/png`) maps to `"png"`. The match is exact and case-sensitive.
///
/// The result is always a string literal, so it is returned as `&'static str`
/// rather than being tied to the lifetime of `mimetype`.
fn ext_for(mimetype: &str) -> &'static str {
    match mimetype {
        "image/jpeg" => "jpg",
        "image/gif" => "gif",
        "image/webp" => "webp",
        "image/bmp" => "bmp",
        "image/tiff" => "tif",
        _ => "png",
    }
}
228
229/// Render a table. `compact` selects between two serializers:
230///
231/// - **padded** (default) — docling-core's `tabulate(tablefmt="github")`: columns
232///   are padded to a fixed width (header width + a minimum padding of 2, or the
233///   widest data cell); numeric columns (every data cell parses as a number) are
234///   right-aligned, others left-aligned; separators are plain dashes of
235///   `width + 2`. Matches current published docling (DOCX/HTML conformance).
236/// - **compact** — `| a | b |` cells with single-dash `| - | - |` separators, no
237///   width padding. Matches the committed PDF groundtruth corpus, which predates
238///   the padded serializer.
239///
240/// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break the
241/// table. Row 0 is the header.
242fn render_table(table: &Table, compact: bool) -> String {
243    if table.rows.is_empty() {
244        return String::new();
245    }
246    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
247    if num_cols == 0 {
248        return String::new();
249    }
250
251    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
252    // strips data cells of surrounding whitespace but leaves the header row as-is.
253    let grid: Vec<Vec<String>> = table
254        .rows
255        .iter()
256        .enumerate()
257        .map(|(r, row)| {
258            (0..num_cols)
259                .map(|c| {
260                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
261                    if r == 0 {
262                        cell
263                    } else {
264                        cell.trim().to_string()
265                    }
266                })
267                .collect()
268        })
269        .collect();
270
271    if compact {
272        // Compact: cells joined by " | ", no padding, single-dash separators.
273        let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
274        let mut lines = Vec::with_capacity(grid.len() + 1);
275        lines.push(render_row(0));
276        let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
277        lines.push(format!("| {} |", sep.join(" | ")));
278        for r in 1..grid.len() {
279            lines.push(render_row(r));
280        }
281        return lines.join("\n");
282    }
283
284    // Display width (Unicode scalar count — good enough for now).
285    let dw = |s: &str| s.chars().count();
286    let data_rows = 1..grid.len();
287
288    // A column is right-aligned when it has data and every data cell is numeric.
289    let right: Vec<bool> = (0..num_cols)
290        .map(|c| {
291            !data_rows.is_empty()
292                && data_rows.clone().all(|r| {
293                    let t = grid[r][c].trim();
294                    !t.is_empty() && t.parse::<f64>().is_ok()
295                })
296        })
297        .collect();
298
299    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
300    let width: Vec<usize> = (0..num_cols)
301        .map(|c| {
302            let mut w = dw(&grid[0][c]) + 2;
303            for r in data_rows.clone() {
304                w = w.max(dw(&grid[r][c]));
305            }
306            w
307        })
308        .collect();
309
310    let fmt_cell = |s: &str, c: usize| -> String {
311        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
312        let body = if right[c] {
313            format!("{pad}{s}")
314        } else {
315            format!("{s}{pad}")
316        };
317        format!(" {body} ")
318    };
319    let render_row = |r: usize| -> String {
320        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
321        format!("|{}|", cells.join("|"))
322    };
323
324    let mut lines = Vec::with_capacity(grid.len() + 1);
325    lines.push(render_row(0));
326    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
327    lines.push(format!("|{}|", sep.join("|")));
328    for r in data_rows {
329        lines.push(render_row(r));
330    }
331    lines.join("\n")
332}
333
/// Make a cell's text safe to place inside a Markdown table row.
///
/// Every newline is turned into a space and every pipe into the `&#124;` HTML
/// entity (the same escaping docling-core applies); all other characters are
/// copied through unchanged.
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
339
/// Unit tests for the Markdown serializer, driven through the document's
/// `export_to_markdown` / `export_to_markdown_with` entry points.
#[cfg(test)]
mod tests {
    use super::*;

    /// A heading, a paragraph and a two-item bullet list come out as three
    /// blocks separated by blank lines, with the list items on adjacent lines.
    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "first".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "second".into(),
            level: 0,
        });
        let md = doc.export_to_markdown();
        assert_eq!(md, "# Title\n\nHello world.\n\n- first\n- second\n");
    }

    /// With `compact_tables` set, cells are unpadded and the separator row uses
    /// a single dash per column.
    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        // The compact form is opt-in (the PDF backend sets it); default output uses
        // the padded GitHub serializer (covered by the regression fixtures).
        doc.compact_tables = true;
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    /// Without `compact_tables`, the same table is rendered by the padded
    /// serializer.
    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        // Numeric data columns are right-aligned; columns padded to header+2.
        assert_eq!(md, "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n");
    }

    /// The `\_` escape survives in legacy output and is undone in strict output,
    /// for headings, paragraphs and list items alike.
    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "i\\_j".into(),
            level: 0,
        });
        // Legacy reproduces docling's `\_` escaping byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        // Strict prefers literal underscores (Rust-only readability mode).
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Stray spaces around brackets, parentheses and punctuation are kept in
    /// legacy output and removed in strict output.
    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        // Legacy keeps docling's spacing byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        // Strict tightens punctuation for readable Markdown.
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}