Skip to main content

fleischwolf_core/
markdown.rs

1//! Markdown serializer for [`DoclingDocument`].
2
3use crate::document::{DoclingDocument, Node, Table};
4
/// Strategy used to serialize pictures; the counterpart of docling-core's
/// `ImageRefMode`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` comment. This is docling's default and the
    /// only mode that works without image data.
    #[default]
    Placeholder,
    /// Inline the picture as `![Image](data:<mime>;base64,…)` so the Markdown
    /// is self-contained.
    Embedded,
    /// Link to `![Image](<artifacts>/image_NNNNNN.<ext>)`; the image bytes are
    /// handed back so the caller can write the files.
    Referenced,
}
17
18/// Serializer state threaded through the render walk.
19struct Ctx {
20    strict: bool,
21    /// Emit compact `| a | b |` tables instead of the padded GitHub serializer.
22    compact_tables: bool,
23    images: ImageMode,
24    artifacts_dir: String,
25    /// (relative path, bytes) for each referenced image — written by the caller.
26    artifacts: Vec<(String, Vec<u8>)>,
27    pic_index: usize,
28}
29
30/// Render a document to a Markdown string (pictures as placeholders).
31///
32/// `strict` selects the serializer-level behaviours that differ between
33/// docling-legacy output and cleaner Markdown — currently the code-fence
34/// language (legacy drops it, strict keeps it).
35pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36    to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39/// Render to Markdown with an explicit picture [`ImageMode`]. Returns the
40/// Markdown and, for [`ImageMode::Referenced`], the `(path, bytes)` of each image
41/// the caller should write (relative to the Markdown file).
42pub fn to_markdown_images(
43    doc: &DoclingDocument,
44    strict: bool,
45    images: ImageMode,
46    artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48    let mut ctx = Ctx {
49        strict,
50        compact_tables: doc.compact_tables,
51        images,
52        artifacts_dir: artifacts_dir.to_string(),
53        artifacts: Vec::new(),
54        pic_index: 0,
55    };
56    let mut blocks: Vec<String> = Vec::new();
57    render(&doc.nodes, &mut blocks, &mut ctx);
58    let mut body = blocks.join("\n\n");
59    // Strict mode only: turn recovered source hyperlinks into Markdown links.
60    // docling's standard pipeline drops them, so doing this in legacy mode would
61    // diverge from docling — hence strict-only, leaving conformance output intact.
62    if strict && !doc.links.is_empty() {
63        body = apply_links(&body, &doc.links);
64    }
65    let md = if body.is_empty() {
66        String::new()
67    } else {
68        format!("{body}\n")
69    };
70    (md, ctx.artifacts)
71}
72
/// Wrap each recovered link's anchor text in Markdown `[anchor](href)`.
///
/// `links` holds `(anchor, href)` pairs in document order. Anchors arrive
/// cleaned (curly quotes/dashes already normalized) but un-escaped, so each one
/// is HTML-escaped (`&`/`<`/`>`) before matching, the way prose nodes were
/// serialized into `body`.
///
/// Links are consumed from a cursor that only moves forward, so a repeated
/// anchor (e.g. two "issues") links its successive occurrences rather than all
/// pointing at the first. An occurrence that sits inside an existing Markdown
/// link target (`](…)`, e.g. a referenced image path) is skipped so the target
/// is never rewritten. An anchor that can't be located is dropped (its text may
/// have been split across a line wrap or table cell).
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    let mut out = String::with_capacity(body.len());
    // Unconsumed tail of `body`; everything before it has been copied to `out`.
    let mut rest = body;

    for (anchor, href) in links {
        if anchor.is_empty() {
            continue;
        }
        let anchor = anchor
            .replace('&', "&amp;")
            .replace('<', "&lt;")
            .replace('>', "&gt;");

        // Walk the candidate occurrences until one lies outside a link target.
        let mut from = 0usize;
        while let Some(rel) = rest[from..].find(&anchor) {
            let at = from + rel;
            let end = at + anchor.len();
            if inside_link_target(&rest[..at]) {
                from = end;
                continue;
            }
            out.push_str(&rest[..at]);
            out.push('[');
            out.push_str(&anchor);
            out.push_str("](");
            out.push_str(href);
            out.push(')');
            rest = &rest[end..];
            break;
        }
    }

    out.push_str(rest);
    out
}

/// Whether the position right after `prefix` falls inside an unclosed Markdown
/// link target, i.e. the last `](` in `prefix` has no `)` after it.
fn inside_link_target(prefix: &str) -> bool {
    prefix
        .rfind("](")
        .map_or(false, |open| !prefix[open + 2..].contains(')'))
}
101
/// Inline-text cleanup applied in `strict` mode only.
///
/// Strict output favours readability over byte-for-byte docling fidelity: the
/// legacy `\_` underscore escaping is undone and stray spaces around
/// punctuation are tightened (`[ 37 , 36 ]` → `[37, 36]`, `( x )` → `(x)`).
/// That covers both the PDF backend's glyph-split spacing and the space the
/// legacy emphasis serialization leaves before punctuation (`*a* ,` → `*a*,`).
/// With `strict` off the text is returned unchanged. Only inline text nodes are
/// routed through here — code blocks and table cells are left alone.
fn strict_text(text: &str, strict: bool) -> String {
    // Applied one after another, in this order.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let untouched = text.to_string();
    if strict {
        REWRITES
            .iter()
            .fold(untouched, |acc, &(from, to)| acc.replace(from, to))
    } else {
        untouched
    }
}
122
123fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
124    let mut i = 0;
125    while i < nodes.len() {
126        match &nodes[i] {
127            Node::ListItem { .. } => {
128                let start = i;
129                while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
130                    i += 1;
131                }
132                render_list_run(&nodes[start..i], blocks, ctx.strict);
133            }
134            other => {
135                render_one(other, blocks, ctx);
136                i += 1;
137            }
138        }
139    }
140}
141
142/// Render a contiguous run of list items.
143///
144/// Ordered items use their explicit `number`. A new sibling list (marked by
145/// `first_in_list`) at the same depth is separated by a blank line, matching
146/// docling-core's serializer.
147fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
148    let mut lines: Vec<String> = Vec::new();
149    // Per level, the previous item's (ordered, number) so we can detect a new
150    // sibling list.
151    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
152
153    for item in items {
154        let Node::ListItem {
155            ordered,
156            number,
157            first_in_list,
158            text,
159            level,
160        } = item
161        else {
162            continue;
163        };
164        let level = *level as usize;
165
166        // Returning to a shallower level ends the deeper sibling lists.
167        prev.truncate(level + 1);
168        while prev.len() <= level {
169            prev.push(None);
170        }
171
172        // A new sibling list at the same depth gets a blank line: the kind flips
173        // (`<ul>`↔`<ol>`), an ordered run breaks (`1, 2` then `42`), or the
174        // backend flagged a fresh list (e.g. Markdown's bullet changing `-`→`*`).
175        if let Some((prev_ordered, prev_number)) = prev[level] {
176            let new_list = *first_in_list
177                || prev_ordered != *ordered
178                || (*ordered && *number != prev_number + 1);
179            if new_list {
180                lines.push(String::new());
181            }
182        }
183
184        let indent = "    ".repeat(level);
185        let marker = if *ordered {
186            format!("{number}.")
187        } else {
188            "-".to_string()
189        };
190        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
191        prev[level] = Some((*ordered, *number));
192    }
193
194    blocks.push(lines.join("\n"));
195}
196
197fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
198    match node {
199        Node::Heading { level, text } => {
200            let hashes = "#".repeat((*level).clamp(1, 6) as usize);
201            blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
202        }
203        Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
204        Node::Code { language, text } => {
205            // Legacy docling never emits a language on the fence; strict keeps it.
206            let lang = match language {
207                Some(l) if ctx.strict => l.as_str(),
208                _ => "",
209            };
210            blocks.push(format!("```{lang}\n{text}\n```"));
211        }
212        Node::Table(table) => {
213            let rendered = render_table(table, ctx.compact_tables);
214            if !rendered.is_empty() {
215                blocks.push(rendered);
216            }
217        }
218        Node::Picture { caption, image } => {
219            if let Some(cap) = caption {
220                if !cap.is_empty() {
221                    blocks.push(cap.clone());
222                }
223            }
224            blocks.push(picture_marker(image.as_ref(), ctx));
225        }
226        Node::Group { children, .. } => render(children, blocks, ctx),
227        // Handled by the run-merging branch in `render`.
228        Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
229    }
230}
231
232/// The Markdown for a picture under the active [`ImageMode`]; Referenced mode also
233/// records the bytes in `ctx.artifacts` for the caller to write.
234fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
235    match (ctx.images, image) {
236        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
237        (ImageMode::Referenced, Some(img)) => {
238            let path = format!(
239                "{}/image_{:06}.{}",
240                ctx.artifacts_dir,
241                ctx.pic_index,
242                ext_for(&img.mimetype)
243            );
244            ctx.pic_index += 1;
245            ctx.artifacts.push((path.clone(), img.data.clone()));
246            format!("![Image]({path})")
247        }
248        // Placeholder, or any mode with no extracted image.
249        _ => "<!-- image -->".to_string(),
250    }
251}
252
/// File extension (without the dot) for an image MIME type.
///
/// Recognizes JPEG, GIF, WebP, BMP and TIFF; anything else, including PNG and
/// unknown types, maps to `"png"`. Matching is ASCII case-insensitive and
/// ignores surrounding whitespace and any `;`-separated parameters, since
/// media type names are case-insensitive. The result is a `'static` literal,
/// so it does not borrow from `mimetype`.
fn ext_for(mimetype: &str) -> &'static str {
    const KNOWN: [(&str, &str); 5] = [
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tif"),
    ];

    // Keep only the `type/subtype` part, e.g. `image/jpeg` from `image/jpeg; q=1`.
    let essence = mimetype.split(';').next().unwrap_or("").trim();
    KNOWN
        .iter()
        .find(|(mime, _)| mime.eq_ignore_ascii_case(essence))
        .map_or("png", |&(_, ext)| ext)
}
263
264/// Render a table. `compact` selects between two serializers:
265///
266/// - **padded** (default) — docling-core's `tabulate(tablefmt="github")`: columns
267///   are padded to a fixed width (header width + a minimum padding of 2, or the
268///   widest data cell); numeric columns (every data cell parses as a number) are
269///   right-aligned, others left-aligned; separators are plain dashes of
270///   `width + 2`. Matches current published docling (DOCX/HTML conformance).
271/// - **compact** — `| a | b |` cells with single-dash `| - | - |` separators, no
272///   width padding. Matches the committed PDF groundtruth corpus, which predates
273///   the padded serializer.
274///
275/// Each cell is first escaped (`\n` → space, `|` → `&#124;`) so it can't break the
276/// table. Row 0 is the header.
277fn render_table(table: &Table, compact: bool) -> String {
278    if table.rows.is_empty() {
279        return String::new();
280    }
281    let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
282    if num_cols == 0 {
283        return String::new();
284    }
285
286    // Escaped, rectangular grid (ragged rows padded with empty cells). `tabulate`
287    // strips data cells of surrounding whitespace but leaves the header row as-is.
288    let grid: Vec<Vec<String>> = table
289        .rows
290        .iter()
291        .enumerate()
292        .map(|(r, row)| {
293            (0..num_cols)
294                .map(|c| {
295                    let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
296                    if r == 0 {
297                        cell
298                    } else {
299                        cell.trim().to_string()
300                    }
301                })
302                .collect()
303        })
304        .collect();
305
306    if compact {
307        // Compact: cells joined by " | ", no padding, single-dash separators.
308        let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
309        let mut lines = Vec::with_capacity(grid.len() + 1);
310        lines.push(render_row(0));
311        let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
312        lines.push(format!("| {} |", sep.join(" | ")));
313        for r in 1..grid.len() {
314            lines.push(render_row(r));
315        }
316        return lines.join("\n");
317    }
318
319    // Display width (Unicode scalar count — good enough for now).
320    let dw = |s: &str| s.chars().count();
321    let data_rows = 1..grid.len();
322
323    // A column is right-aligned when it has data and every data cell is numeric.
324    let right: Vec<bool> = (0..num_cols)
325        .map(|c| {
326            !data_rows.is_empty()
327                && data_rows.clone().all(|r| {
328                    let t = grid[r][c].trim();
329                    !t.is_empty() && t.parse::<f64>().is_ok()
330                })
331        })
332        .collect();
333
334    // Column width = max(header_width + MIN_PADDING(2), max data-cell width).
335    let width: Vec<usize> = (0..num_cols)
336        .map(|c| {
337            let mut w = dw(&grid[0][c]) + 2;
338            for r in data_rows.clone() {
339                w = w.max(dw(&grid[r][c]));
340            }
341            w
342        })
343        .collect();
344
345    let fmt_cell = |s: &str, c: usize| -> String {
346        let pad = " ".repeat(width[c].saturating_sub(dw(s)));
347        let body = if right[c] {
348            format!("{pad}{s}")
349        } else {
350            format!("{s}{pad}")
351        };
352        format!(" {body} ")
353    };
354    let render_row = |r: usize| -> String {
355        let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
356        format!("|{}|", cells.join("|"))
357    };
358
359    let mut lines = Vec::with_capacity(grid.len() + 1);
360    lines.push(render_row(0));
361    let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
362    lines.push(format!("|{}|", sep.join("|")));
363    for r in data_rows {
364        lines.push(render_row(r));
365    }
366    lines.join("\n")
367}
368
/// Make a cell safe to place inside a Markdown table row: each newline turns
/// into a space and each pipe into the `&#124;` HTML entity (matches
/// docling-core).
fn escape_cell(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\n' => escaped.push(' '),
            '|' => escaped.push_str("&#124;"),
            other => escaped.push(other),
        }
    }
    escaped
}
374
#[cfg(test)]
mod tests {
    use super::*;

    /// A top-level unordered list item.
    fn bullet(number: u64, first_in_list: bool, text: &'static str) -> Node {
        Node::ListItem {
            ordered: false,
            number,
            first_in_list,
            text: text.into(),
            level: 0,
        }
    }

    /// A two-column table with header `a`/`b` and a single numeric data row.
    fn two_by_two() -> Node {
        Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        })
    }

    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(bullet(1, true, "first"));
        doc.push(bullet(2, false, "second"));

        assert_eq!(
            doc.export_to_markdown(),
            "# Title\n\nHello world.\n\n- first\n- second\n"
        );
    }

    #[test]
    fn strict_renders_recovered_links_legacy_does_not() {
        let mut doc = DoclingDocument::new("cv");
        doc.add_paragraph("Find me on LinkedIn or GitHub.");
        doc.links = vec![
            ("LinkedIn".into(), "https://www.linkedin.com/in/x/".into()),
            ("GitHub".into(), "https://github.com/x/".into()),
        ];

        // Legacy/docling mode leaves the text alone, preserving conformance.
        let legacy = doc.export_to_markdown();
        assert_eq!(legacy, "Find me on LinkedIn or GitHub.\n");

        // Strict mode turns the anchors into Markdown links.
        let strict = doc.export_to_markdown_with(true);
        assert_eq!(
            strict,
            "Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
        );
    }

    #[test]
    fn strict_links_match_escaped_anchor_and_consume_in_order() {
        let mut doc = DoclingDocument::new("d");
        // By serialization time the body already carries `&amp;` (the PDF
        // assembler HTML-escapes prose) while the anchor is stored un-escaped,
        // so the matcher has to escape the anchor to find it. Two identical
        // anchors must link in document order.
        doc.add_paragraph("AI &amp; ML here, and issues here, then issues there.");
        doc.links = vec![
            ("AI & ML".into(), "https://a/".into()),
            ("issues".into(), "https://first/".into()),
            ("issues".into(), "https://second/".into()),
        ];

        let strict = doc.export_to_markdown_with(true);
        assert_eq!(
            strict,
            "[AI &amp; ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
        );
    }

    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        // Compact tables are opt-in (the PDF backend sets the flag); the default
        // padded GitHub serializer is covered by the regression fixtures.
        doc.compact_tables = true;
        doc.push(two_by_two());

        assert_eq!(doc.export_to_markdown(), "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(two_by_two());

        // Numeric data columns are right-aligned; columns padded to header+2.
        assert_eq!(
            doc.export_to_markdown(),
            "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n"
        );
    }

    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(bullet(1, true, "i\\_j"));

        // Legacy reproduces docling's `\_` escaping byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        // Strict prefers literal underscores (Rust-only readability mode).
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");

        // Legacy keeps docling's spacing byte-for-byte.
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        // Strict tightens punctuation for readable Markdown.
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}