Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Selects how picture nodes are written into the Markdown output (the
/// counterpart of docling-core's `ImageRefMode`).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` comment. This is the default, and it is also
    /// what the serializer falls back to when a picture carries no image data.
    #[default]
    Placeholder,
    /// Emit `![Image](data:<mime>;base64,…)`, so the Markdown is self-contained.
    Embedded,
    /// Emit `![Image](<artifacts>/image_NNNNNN.<ext>)` and hand the image bytes
    /// back to the caller, who is responsible for writing the files.
    Referenced,
}
17
18/// Serializer state threaded through the render walk.
19struct Ctx {
20    strict: bool,
21    images: ImageMode,
22    artifacts_dir: String,
23    /// (relative path, bytes) for each referenced image — written by the caller.
24    artifacts: Vec<(String, Vec<u8>)>,
25    pic_index: usize,
26}
27
28/// Render a document to a Markdown string (pictures as placeholders).
29///
30/// `strict` selects the serializer-level behaviours that differ between
31/// docling-legacy output and cleaner Markdown — currently the code-fence
32/// language (legacy drops it, strict keeps it).
33pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
34    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
35}
36
37/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
38/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
39/// the caller should write (relative to the Markdown file).
40pub fn to_markdown_images(
41    doc: &DoclingDocument,
42    strict: bool,
43    images: ImageMode,
44    artifacts_dir: &str,
45) -> (String, Vec<(String, Vec<u8>)>) {
46    let mut ctx = Ctx {
47        strict,
48        images,
49        artifacts_dir: artifacts_dir.to_string(),
50        artifacts: Vec::new(),
51        pic_index: 0,
52    };
53    let mut blocks: Vec<String> = Vec::new();
54    render(&doc.nodes, &mut blocks, &mut ctx);
55    let body = blocks.join("\n\n");
56    let md = if body.is_empty() {
57        String::new()
58    } else {
59        format!("{body}\n")
60    };
61    (md, ctx.artifacts)
62}
63
/// Prepare inline text for output under the given mode.
///
/// The backends bake docling's `\_` underscore escaping into inline text. Legacy
/// mode (`strict == false`) returns the text untouched so the output stays
/// byte-for-byte compatible; strict mode turns every `\_` back into a literal
/// `_` for readability. Callers apply this to inline text only — code blocks and
/// table cells never go through it.
fn strict_text(text: &str, strict: bool) -> String {
    if !strict {
        return text.to_owned();
    }
    text.replace("\\_", "_")
}
75
76fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
77    let mut i = 0;
78    while i < nodes.len() {
79        match &nodes[i] {
80            Node::ListItem { .. } => {
81                let start = i;
82                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
83                    i += 1;
84                }
85                render_list_run(&nodes[start..i], blocks, ctx.strict);
86            }
87            other => {
88                render_one(other, blocks, ctx);
89                i += 1;
90            }
91        }
92    }
93}
94
95/// Render a contiguous run of list items.
96///
97/// Ordered items use their explicit `number`. A new sibling list (marked by
98/// `first_in_list`) at the same depth is separated by a blank line, matching
99/// docling-core's serializer.
100fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
101    let mut lines: Vec<String> = Vec::new();
102    // Per level, the previous item's (ordered, number) so we can detect a new
103    // sibling list.
104    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
105
106    for item in items {
107        let Node::ListItem {
108            ordered,
109            number,
110            first_in_list,
111            text,
112            level,
113        } = item
114        else {
115            continue;
116        };
117        let level = *level as usize;
118
119        // Returning to a shallower level ends the deeper sibling lists.
120        prev.truncate(level + 1);
121        while prev.len() <= level {
122            prev.push(None);
123        }
124
125        // A new sibling list at the same depth gets a blank line: the kind flips
126        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
127        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
128        if let Some((prev_ordered, prev_number)) = prev[level] {
129            let new_list = *first_in_list
130                || prev_ordered != *ordered
131                || (*ordered && *number != prev_number + 1);
132            if new_list {
133                lines.push(String::new());
134            }
135        }
136
137        let indent = "    ".repeat(level);
138        let marker = if *ordered {
139            format!("{number}.")
140        } else {
141            "-".to_string()
142        };
143        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
144        prev[level] = Some((*ordered, *number));
145    }
146
147    blocks.push(lines.join("\n"));
148}
149
150fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
151    match node {
152        Node::Heading { level, text } => {
153            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
154            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
155        }
156        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
157        Node::Code { language, text } => {
158            // Legacy docling never emits a language on the fence; strict keeps it.
159            let lang = match language {
160                Some(l) if ctx.strict => l.as_str(),
161                _ => "",
162            };
163            blocks.push(format!("```{lang}\n{text}\n```"));
164        }
165        Node::Table(table) => {
166            let rendered = render_table(table);
167            if !rendered.is_empty() {
168                blocks.push(rendered);
169            }
170        }
171        Node::Picture { caption, image } => {
172            if let Some(cap) = caption {
173                if !cap.is_empty() {
174                    blocks.push(cap.clone());
175                }
176            }
177            blocks.push(picture_marker(image.as_ref(), ctx));
178        }
179        Node::Group { children, .. } => render(children, blocks, ctx),
180        // Handled by the run-merging branch in `render`.
181        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
182    }
183}
184
185/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
186/// records the bytes in `ctx.artifacts` for the caller to write.
187fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
188    match (ctx.images, image) {
189        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
190        (ImageMode::Referenced, Some(img)) => {
191            let path = format!(
192                "{}/image_{:06}.{}",
193                ctx.artifacts_dir,
194                ctx.pic_index,
195                ext_for(&img.mimetype)
196            );
197            ctx.pic_index += 1;
198            ctx.artifacts.push((path.clone(), img.data.clone()));
199            format!("![Image]({path})")
200        }
201        // Placeholder, or any mode with no extracted image.
202        _ => "<!-- image -->".to_string(),
203    }
204}
205
/// Map an image MIME type to the file extension used for referenced images.
///
/// Recognises JPEG, GIF, WebP, BMP and TIFF; every other MIME type (including
/// `image/png` itself) maps to `"png"`. The comparison is an exact string match.
///
/// The result is always a string literal, so it is returned as `&'static str`
/// and does not keep `mimetype` borrowed.
fn ext_for(mimetype: &str) -> &'static str {
    match mimetype {
        "image/jpeg" => "jpg",
        "image/gif" => "gif",
        "image/webp" => "webp",
        "image/bmp" => "bmp",
        "image/tiff" => "tif",
        _ => "png",
    }
}
216
217/// Render a table the way docling-core does: `tabulate(tablefmt="github")`.
218///
219/// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break
220/// the table. Columns are padded to a fixed width; the header contributes its
221/// width plus a minimum padding of 2; numeric columns (every data cell parses
222/// as a number) are right-aligned, others left-aligned; the separator is plain
223/// dashes of `width + 2` (github tablefmt emits no alignment colons here). Row 0
224/// is the header.
225fn render_table(table: &Table) -> String {
226    if table.rows.is_empty() {
227        return String::new();
228    }
229    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
230    if num_cols == 0 {
231        return String::new();
232    }
233
234    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
235    // strips data cells of surrounding whitespace but leaves the header row as-is.
236    let grid: Vec<Vec<String>> = table
237        .rows
238        .iter()
239        .enumerate()
240        .map(|(r, row)| {
241            (0..num_cols)
242                .map(|c| {
243                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
244                    if r == 0 {
245                        cell
246                    } else {
247                        cell.trim().to_string()
248                    }
249                })
250                .collect()
251        })
252        .collect();
253
254    // Display width (Unicode scalar count — good enough for now).
255    let dw = |s: &str| s.chars().count();
256    let data_rows = 1..grid.len();
257
258    // A column is right-aligned when it has data and every data cell is numeric.
259    let right: Vec<bool> = (0..num_cols)
260        .map(|c| {
261            !data_rows.is_empty()
262                && data_rows.clone().all(|r| {
263                    let t = grid[r][c].trim();
264                    !t.is_empty() && t.parse::<f64>().is_ok()
265                })
266        })
267        .collect();
268
269    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
270    let width: Vec<usize> = (0..num_cols)
271        .map(|c| {
272            let mut w = dw(&grid[0][c]) + 2;
273            for r in data_rows.clone() {
274                w = w.max(dw(&grid[r][c]));
275            }
276            w
277        })
278        .collect();
279
280    let fmt_cell = |s: &str, c: usize| -> String {
281        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
282        let body = if right[c] {
283            format!("{pad}{s}")
284        } else {
285            format!("{s}{pad}")
286        };
287        format!(" {body} ")
288    };
289    let render_row = |r: usize| -> String {
290        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
291        format!("|{}|", cells.join("|"))
292    };
293
294    let mut lines = Vec::with_capacity(grid.len() + 1);
295    lines.push(render_row(0));
296    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
297    lines.push(format!("|{}|", sep.join("|")));
298    for r in data_rows {
299        lines.push(render_row(r));
300    }
301    lines.join("\n")
302}
303
/// Make a cell's text safe to place inside a Markdown table row.
///
/// Each newline is replaced by a single space and each pipe by the `&#124;`
/// HTML entity (the same escaping docling-core applies); every other character
/// is copied through unchanged.
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
309
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an unordered, top-level list item.
    fn bullet(number: u64, first_in_list: bool, text: &str) -> Node {
        Node::ListItem {
            ordered: false,
            number,
            first_in_list,
            text: text.into(),
            level: 0,
        }
    }

    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(bullet(1, true, "first"));
        doc.push(bullet(2, false, "second"));

        let expected = "# Title\n\nHello world.\n\n- first\n- second\n";
        assert_eq!(doc.export_to_markdown(), expected);
    }

    #[test]
    fn renders_github_table() {
        let header = vec!["a".into(), "b".into()];
        let data = vec!["1".into(), "2".into()];
        let mut doc = DoclingDocument::new("t");
        doc.push(Node::Table(Table {
            rows: vec![header, data],
        }));

        // Matches tabulate(tablefmt="github"): padded columns, numeric cells
        // right-aligned, separator of width+2 dashes.
        let expected = "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n";
        assert_eq!(doc.export_to_markdown(), expected);
    }

    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(bullet(1, true, "i\\_j"));

        // Legacy reproduces docling's `\_` escaping byte-for-byte.
        let legacy = doc.export_to_markdown();
        assert_eq!(legacy, "# a\\_b\n\nx\\_y\n\n- i\\_j\n");

        // Strict prefers literal underscores (Rust-only readability mode).
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(strict, "# a_b\n\nx_y\n\n- i_j\n");
    }
}