use crate::document::{DoclingDocument, Node, Table};
/// Selects how picture nodes are written into the Markdown output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ImageMode {
/// Emit the `<!-- image -->` comment instead of any image data (the default).
#[default]
Placeholder,
/// Write the picture using the value returned by the image's `data_uri()`.
Embedded,
/// Record the image bytes as an artifact under the artifacts directory and
/// refer to the generated path.
Referenced,
}
/// Mutable state threaded through one rendering pass.
struct Ctx {
/// Enables the strict-mode text clean-up and code-fence language tags.
strict: bool,
/// Passed to `render_table` to choose the compact table layout.
compact_tables: bool,
/// How picture nodes are rendered.
images: ImageMode,
/// Directory prefix used when building paths for referenced images.
artifacts_dir: String,
/// `(path, bytes)` pairs collected for referenced images.
artifacts: Vec<(String, Vec<u8>)>,
/// Index used in the next referenced image's file name; incremented after
/// each referenced image.
pic_index: usize,
}
/// Renders `doc` to Markdown with picture placeholders.
///
/// Delegates to [`to_markdown_images`] using [`ImageMode::Placeholder`] and the
/// `"artifacts"` directory name, then discards the returned artifact list.
pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
    let (markdown, _artifacts) =
        to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts");
    markdown
}
/// Renders `doc` to Markdown and returns it together with collected artifacts.
///
/// * `strict` - enables strict text clean-up and, when the document carries
///   links, the link-insertion pass over the rendered body.
/// * `images` - how picture nodes are rendered.
/// * `artifacts_dir` - directory prefix for referenced-image paths.
///
/// Blocks are separated by a blank line. A non-empty result ends with a single
/// newline; an empty document yields an empty string. The second tuple element
/// holds the `(path, bytes)` pairs gathered while rendering.
pub fn to_markdown_images(
    doc: &DoclingDocument,
    strict: bool,
    images: ImageMode,
    artifacts_dir: &str,
) -> (String, Vec<(String, Vec<u8>)>) {
    let mut ctx = Ctx {
        strict,
        compact_tables: doc.compact_tables,
        images,
        artifacts_dir: artifacts_dir.to_string(),
        artifacts: Vec::new(),
        pic_index: 0,
    };

    let mut rendered: Vec<String> = Vec::new();
    render(&doc.nodes, &mut rendered, &mut ctx);

    let joined = rendered.join("\n\n");
    let mut markdown = if strict && !doc.links.is_empty() {
        apply_links(&joined, &doc.links)
    } else {
        joined
    };

    // Only a non-empty document gets the trailing newline.
    if !markdown.is_empty() {
        markdown.push('\n');
    }

    (markdown, ctx.artifacts)
}
/// Wraps link anchors found in `body` as `[anchor](href)`.
///
/// Links are consumed in order: each anchor is searched for only after the end
/// of the previously inserted link, and only its first occurrence there is
/// replaced. Anchors that are empty or cannot be found from that point on are
/// skipped without moving the search position.
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    let mut text = String::from(body);
    // Byte offset just past the most recently inserted link.
    let mut resume_at = 0;

    for (raw_anchor, target) in links {
        // NOTE(review): as written, each replacement maps a character to
        // itself, so the anchor is left unchanged; confirm whether entity
        // escaping was intended here.
        let needle = raw_anchor
            .replace('&', "&")
            .replace('<', "<")
            .replace('>', ">");
        if needle.is_empty() {
            continue;
        }

        let Some(offset) = text[resume_at..].find(needle.as_str()) else {
            continue;
        };
        let begin = resume_at + offset;
        let link = format!("[{needle}]({target})");
        text.replace_range(begin..begin + needle.len(), &link);
        resume_at = begin + link.len();
    }

    text
}
/// Applies queued links to one streamed `chunk`, carrying unmatched ones over.
///
/// Every queued `(anchor, href)` pair is tried in order against the text after
/// the previously inserted link. A match is rewritten as `[anchor](href)`.
/// Pairs whose anchor is not found are put back into `queue` (in their original
/// order) for a later chunk. Pairs with an empty anchor are dropped.
fn apply_links_chunk(chunk: &str, queue: &mut Vec<(String, String)>) -> String {
    let mut text = String::from(chunk);
    // Byte offset just past the most recently inserted link.
    let mut resume_at = 0;

    // Drain the queue; anything that does not match is pushed back below.
    let pending = std::mem::take(queue);
    for (raw_anchor, target) in pending {
        // NOTE(review): as written, each replacement maps a character to
        // itself, so the anchor is left unchanged; confirm whether entity
        // escaping was intended here.
        let needle = raw_anchor
            .replace('&', "&")
            .replace('<', "<")
            .replace('>', ">");
        if needle.is_empty() {
            continue;
        }

        match text[resume_at..].find(needle.as_str()) {
            Some(offset) => {
                let begin = resume_at + offset;
                let link = format!("[{needle}]({target})");
                text.replace_range(begin..begin + needle.len(), &link);
                resume_at = begin + link.len();
            }
            None => queue.push((raw_anchor, target)),
        }
    }

    text
}
/// Incremental Markdown renderer: nodes are pushed in batches and each batch
/// returns the text to append to the output so far.
pub struct MarkdownStreamer {
/// Strict-mode flag copied into the rendering context of every push.
strict: bool,
/// Image mode copied into the rendering context of every push.
images: ImageMode,
/// Table layout flag copied into the rendering context of every push.
compact_tables: bool,
/// Whether an earlier push has already returned non-empty output.
emitted_any: bool,
/// `(anchor, href)` pairs received so far that have not been matched yet.
links: Vec<(String, String)>,
}
impl MarkdownStreamer {
    /// Creates a streamer with no pending links and nothing emitted yet.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics when `images` is
    /// [`ImageMode::Referenced`].
    pub fn new(strict: bool, images: ImageMode, compact_tables: bool) -> Self {
        debug_assert!(
            !matches!(images, ImageMode::Referenced),
            "referenced image mode is not streamable; use to_markdown_images"
        );
        Self {
            strict,
            images,
            compact_tables,
            emitted_any: false,
            links: Vec::new(),
        }
    }

    /// Renders one batch of `nodes` and returns the text to append.
    ///
    /// `links` are added to the pending queue before rendering. Each call
    /// renders its `nodes` with a fresh context. When the batch produces no
    /// blocks an empty string is returned. Otherwise the blocks are joined with
    /// a blank line, pending links are applied in strict mode, and every chunk
    /// after the first non-empty one is prefixed with a blank-line separator.
    pub fn push(&mut self, nodes: &[Node], links: &[(String, String)]) -> String {
        self.links.extend_from_slice(links);

        let mut ctx = Ctx {
            strict: self.strict,
            compact_tables: self.compact_tables,
            images: self.images,
            artifacts_dir: String::new(),
            artifacts: Vec::new(),
            pic_index: 0,
        };
        let mut rendered: Vec<String> = Vec::new();
        render(nodes, &mut rendered, &mut ctx);
        if rendered.is_empty() {
            return String::new();
        }

        let joined = rendered.join("\n\n");
        let body = if self.strict && !self.links.is_empty() {
            apply_links_chunk(&joined, &mut self.links)
        } else {
            joined
        };

        // Mark output as started and pick the separator based on the old flag.
        let separator = if std::mem::replace(&mut self.emitted_any, true) {
            "\n\n"
        } else {
            ""
        };
        format!("{separator}{body}")
    }

    /// Consumes the streamer and returns the closing text: a single newline
    /// when anything was emitted, otherwise an empty string.
    pub fn finish(self) -> String {
        String::from(if self.emitted_any { "\n" } else { "" })
    }
}
/// Returns `text` unchanged unless `strict` is set, in which case a fixed list
/// of literal substitutions is applied in order: escaped underscores are
/// unescaped and the space before `,`, `.`, `;`, `)`, `]` or after `(`, `[` is
/// removed.
fn strict_text(text: &str, strict: bool) -> String {
    // Applied sequentially; later entries see the result of earlier ones.
    const FIXES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    let mut cleaned = text.to_string();
    if strict {
        for (from, to) in FIXES {
            cleaned = cleaned.replace(from, to);
        }
    }
    cleaned
}
/// Renders `nodes` in order, appending the resulting Markdown blocks to
/// `blocks`.
///
/// Consecutive list items are grouped and handed to `render_list_run` as one
/// run; every other node goes through `render_one`.
fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
    let mut rest = nodes;
    while let Some((head, tail)) = rest.split_first() {
        if matches!(head, Node::ListItem { .. }) {
            // Take the whole stretch of adjacent list items at once.
            let run_len = rest
                .iter()
                .take_while(|n| matches!(n, Node::ListItem { .. }))
                .count();
            let (run, after) = rest.split_at(run_len);
            render_list_run(run, blocks, ctx.strict);
            rest = after;
        } else {
            render_one(head, blocks, ctx);
            rest = tail;
        }
    }
}
/// Renders one run of consecutive list items as a single Markdown block.
///
/// Each item becomes one line: the indent unit repeated `level` times, then
/// `-` for unordered items or `<number>.` for ordered ones, then the item text
/// (passed through `strict_text`). A blank line is inserted before an item when
/// it starts a new list at its level, i.e. when a previous item exists at that
/// level and the item is flagged `first_in_list`, switches between ordered and
/// unordered, or is ordered without continuing the previous number by one.
///
/// Non-list nodes in `items` are ignored. The joined lines are pushed onto
/// `blocks` as one entry.
fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
    let mut lines: Vec<String> = Vec::new();
    // prev[d] holds (ordered, number) of the last item seen at depth d. Entries
    // deeper than the current item are discarded when a shallower item arrives.
    let mut prev: Vec<Option<(bool, u64)>> = Vec::new();

    for item in items {
        let Node::ListItem {
            ordered,
            number,
            first_in_list,
            text,
            level,
        } = item
        else {
            continue;
        };
        let level = *level as usize;

        // Drops deeper levels and fills any missing shallower slots with None.
        prev.resize(level + 1, None);

        if let Some((prev_ordered, prev_number)) = prev[level] {
            // checked_add: a previous number of u64::MAX can have no successor,
            // so it is treated as a numbering break instead of overflowing.
            let continues_numbering = prev_number.checked_add(1) == Some(*number);
            let new_list = *first_in_list
                || prev_ordered != *ordered
                || (*ordered && !continues_numbering);
            if new_list {
                lines.push(String::new());
            }
        }

        let indent = " ".repeat(level);
        let marker = if *ordered {
            format!("{number}.")
        } else {
            "-".to_string()
        };
        lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
        prev[level] = Some((*ordered, *number));
    }

    blocks.push(lines.join("\n"));
}
fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
match node {
Node::Heading { level, text } => {
let hashes = "#".repeat((*level).clamp(1, 6) as usize);
blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
}
Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
Node::Code { language, text } => {
let lang = match language {
Some(l) if ctx.strict => l.as_str(),
_ => "",
};
blocks.push(format!("```{lang}\n{text}\n```"));
}
Node::Table(table) => {
let rendered = render_table(table, ctx.compact_tables);
if !rendered.is_empty() {
blocks.push(rendered);
}
}
Node::Picture { caption, image } => {
if let Some(cap) = caption {
if !cap.is_empty() {
blocks.push(cap.clone());
}
}
blocks.push(picture_marker(image.as_ref(), ctx));
}
Node::Group { children, .. } => render(children, blocks, ctx),
Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
}
}
/// Returns the Markdown emitted for a picture under the current image mode.
///
/// * Embedded mode with image data: an image link whose target is the value
///   returned by `data_uri()`.
/// * Referenced mode with image data: builds
///   `<artifacts_dir>/image_<6-digit index>.<ext>`, records `(path, bytes)` in
///   `ctx.artifacts`, advances `ctx.pic_index`, and returns an image link to
///   that path.
/// * Placeholder mode, or any mode without image data: `<!-- image -->`.
///
/// NOTE(review): the `![Image](...)` link text follows Docling's Markdown
/// export convention; confirm it matches the expected output of this crate.
fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
    match (ctx.images, image) {
        (ImageMode::Embedded, Some(img)) => format!("![Image]({})", img.data_uri()),
        (ImageMode::Referenced, Some(img)) => {
            let path = format!(
                "{}/image_{:06}.{}",
                ctx.artifacts_dir,
                ctx.pic_index,
                ext_for(&img.mimetype)
            );
            ctx.pic_index += 1;
            // Build the marker before the path is moved into the artifact list.
            let marker = format!("![Image]({path})");
            ctx.artifacts.push((path, img.data.clone()));
            marker
        }
        (ImageMode::Placeholder, _) | (_, None) => "<!-- image -->".to_string(),
    }
}
/// Maps an image MIME type to a file extension (without the dot).
///
/// Recognizes JPEG, GIF, WebP, BMP and TIFF by exact match; every other value
/// falls back to `"png"`. The result is a string literal, so it is returned as
/// `&'static str` and does not borrow from `mimetype`.
fn ext_for(mimetype: &str) -> &'static str {
    match mimetype {
        "image/jpeg" => "jpg",
        "image/gif" => "gif",
        "image/webp" => "webp",
        "image/bmp" => "bmp",
        "image/tiff" => "tif",
        _ => "png",
    }
}
/// Returns `true` when `t` parses as an `f64` or, failing that, is accepted by
/// `is_thousands_number`.
fn is_number_cell(t: &str) -> bool {
    match t.parse::<f64>() {
        Ok(_) => true,
        Err(_) => is_thousands_number(t),
    }
}
/// Returns `true` when `t` is a number written with comma-grouped thousands.
///
/// Accepted shape, covering the whole string:
/// an optional `+`/`-`, one to three digits, any number of `,ddd` groups, and
/// an optional `.` followed by zero or more digits. A string with no integer
/// digits is accepted only as `.` followed by at least one digit; in that case
/// scanning restarts at the beginning of the string, so a leading sign is not
/// accepted there.
fn is_thousands_number(t: &str) -> bool {
    let bytes = t.as_bytes();

    let sign_len = usize::from(matches!(bytes.first(), Some(b'+' | b'-')));
    // At most three digits may precede the first comma group.
    let lead = bytes[sign_len..]
        .iter()
        .take(3)
        .take_while(|b| b.is_ascii_digit())
        .count();
    let has_int = lead > 0;

    let mut pos = sign_len + lead;
    if has_int {
        // Consume every ",ddd" group that follows.
        while let Some(&[b',', x, y, z]) = bytes.get(pos..pos + 4) {
            if !(x.is_ascii_digit() && y.is_ascii_digit() && z.is_ascii_digit()) {
                break;
            }
            pos += 4;
        }
    } else {
        // No integer part: rescan from the very start (sign included).
        pos = 0;
    }

    let rest = &bytes[pos..];
    match rest.split_first() {
        Some((&b'.', frac)) => {
            let digits = frac.iter().take_while(|b| b.is_ascii_digit()).count();
            if !has_int && digits == 0 {
                return false;
            }
            // Nothing may follow the fractional digits.
            digits == frac.len()
        }
        _ => has_int && rest.is_empty(),
    }
}
fn render_table(table: &Table, compact: bool) -> String {
if table.rows.is_empty() {
return String::new();
}
let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
if num_cols == 0 {
return String::new();
}
let grid: Vec<Vec<String>> = table
.rows
.iter()
.enumerate()
.map(|(r, row)| {
(0..num_cols)
.map(|c| {
let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
if r == 0 {
cell
} else {
cell.trim().to_string()
}
})
.collect()
})
.collect();
if compact {
let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
let mut lines = Vec::with_capacity(grid.len() + 1);
lines.push(render_row(0));
let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
lines.push(format!("| {} |", sep.join(" | ")));
for r in 1..grid.len() {
lines.push(render_row(r));
}
return lines.join("\n");
}
let dw = |s: &str| s.chars().count();
let data_rows = 1..grid.len();
let right: Vec<bool> = (0..num_cols)
.map(|c| {
let mut any = false;
for r in data_rows.clone() {
let t = grid[r][c].trim();
if t.is_empty() {
continue;
}
if !is_number_cell(t) {
return false;
}
any = true;
}
any
})
.collect();
let width: Vec<usize> = (0..num_cols)
.map(|c| {
let mut w = dw(&grid[0][c]) + 2;
for r in data_rows.clone() {
w = w.max(dw(&grid[r][c]));
}
w
})
.collect();
let fmt_cell = |s: &str, c: usize| -> String {
let pad = " ".repeat(width[c].saturating_sub(dw(s)));
let body = if right[c] {
format!("{pad}{s}")
} else {
format!("{s}{pad}")
};
format!(" {body} ")
};
let render_row = |r: usize| -> String {
let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
format!("|{}|", cells.join("|"))
};
let mut lines = Vec::with_capacity(grid.len() + 1);
lines.push(render_row(0));
let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
lines.push(format!("|{}|", sep.join("|")));
for r in data_rows {
lines.push(render_row(r));
}
lines.join("\n")
}
/// Prepares cell text for a single table row by turning each newline into a
/// space, then applying the pipe replacement.
fn escape_cell(s: &str) -> String {
    let single_line = s.replace('\n', " ");
    // NOTE(review): as written, this replaces `|` with itself and therefore
    // changes nothing; confirm whether an escaped form was intended.
    single_line.replace('|', "|")
}
// Unit tests for the Markdown export: buffered rendering, strict-mode text
// handling, table layouts, and streamed-versus-buffered equivalence.
#[cfg(test)]
mod tests {
use super::*;
// A heading, a paragraph and two unordered list items render as three blocks
// separated by blank lines, with the list items on adjacent lines.
#[test]
fn renders_headings_paragraphs_and_lists() {
let mut doc = DoclingDocument::new("demo");
doc.add_heading(1, "Title");
doc.add_paragraph("Hello world.");
doc.push(Node::ListItem {
ordered: false,
number: 1,
first_in_list: true,
text: "first".into(),
level: 0,
});
doc.push(Node::ListItem {
ordered: false,
number: 2,
first_in_list: false,
text: "second".into(),
level: 0,
});
let md = doc.export_to_markdown();
assert_eq!(md, "# Title\n\nHello world.\n\n- first\n- second\n");
}
// Document links are turned into Markdown links only by the strict export;
// the default export leaves the paragraph text untouched.
#[test]
fn strict_renders_recovered_links_legacy_does_not() {
let mut doc = DoclingDocument::new("cv");
doc.add_paragraph("Find me on LinkedIn or GitHub.");
doc.links = vec![
("LinkedIn".into(), "https://www.linkedin.com/in/x/".into()),
("GitHub".into(), "https://github.com/x/".into()),
];
assert_eq!(doc.export_to_markdown(), "Find me on LinkedIn or GitHub.\n");
assert_eq!(
doc.export_to_markdown_with(true),
"Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
);
}
// Two links with the same anchor are matched to successive occurrences of
// that anchor, in the order the links are listed.
#[test]
fn strict_links_match_escaped_anchor_and_consume_in_order() {
let mut doc = DoclingDocument::new("d");
doc.add_paragraph("AI & ML here, and issues here, then issues there.");
doc.links = vec![
("AI & ML".into(), "https://a/".into()),
("issues".into(), "https://first/".into()),
("issues".into(), "https://second/".into()),
];
assert_eq!(
doc.export_to_markdown_with(true),
"[AI & ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
);
}
// With `compact_tables` set, the separator row uses a single dash per column.
#[test]
fn renders_compact_table() {
let mut doc = DoclingDocument::new("t");
doc.compact_tables = true;
doc.push(Node::Table(Table {
rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
}));
let md = doc.export_to_markdown();
assert_eq!(md, "| a | b |\n| - | - |\n| 1 | 2 |\n");
}
// Without `compact_tables`, the padded layout is used and the separator row
// is a run of dashes per column.
#[test]
fn renders_padded_github_table_by_default() {
let mut doc = DoclingDocument::new("t");
doc.push(Node::Table(Table {
rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
}));
let md = doc.export_to_markdown();
assert_eq!(md, "| a | b |\n|-----|-----|\n| 1 | 2 |\n");
}
// Escaped underscores in headings, paragraphs and list items are unescaped
// only by the strict export.
#[test]
fn strict_unescapes_inline_underscores_legacy_keeps_them() {
let mut doc = DoclingDocument::new("t");
doc.add_heading(1, "a\\_b");
doc.add_paragraph("x\\_y");
doc.push(Node::ListItem {
ordered: false,
number: 1,
first_in_list: true,
text: "i\\_j".into(),
level: 0,
});
assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
}
// Helper: renders `doc` once with `to_markdown_images` and once through a
// `MarkdownStreamer`, pushing the node ranges delimited by `splits` plus the
// remaining tail, and asserts that both outputs are equal.
// The document's links are supplied only with a push that starts at node 0.
fn assert_stream_matches(
doc: &DoclingDocument,
strict: bool,
images: ImageMode,
splits: &[usize],
) {
let want = to_markdown_images(doc, strict, images, "artifacts").0;
let mut streamer = MarkdownStreamer::new(strict, images, doc.compact_tables);
let mut got = String::new();
let mut start = 0;
for &end in splits {
let links = if start == 0 {
doc.links.as_slice()
} else {
&[]
};
got.push_str(&streamer.push(&doc.nodes[start..end], links));
start = end;
}
// Push whatever follows the last split point.
got.push_str(&streamer.push(
&doc.nodes[start..],
if start == 0 {
doc.links.as_slice()
} else {
&[]
},
));
got.push_str(&streamer.finish());
assert_eq!(
got, want,
"streamed output diverged (splits={splits:?}, strict={strict})"
);
}
// Streams a document containing every node kind at several split points, in
// both strict settings and in placeholder and embedded image modes.
// None of the split points used here falls between the two list items.
#[test]
fn streaming_is_byte_identical_to_buffered() {
let mut doc = DoclingDocument::new("d");
doc.add_heading(1, "Title");
doc.add_paragraph("First paragraph.");
doc.push(Node::ListItem {
ordered: false,
number: 1,
first_in_list: true,
text: "a".into(),
level: 0,
});
doc.push(Node::ListItem {
ordered: false,
number: 2,
first_in_list: false,
text: "b".into(),
level: 0,
});
doc.push(Node::Code {
language: Some("rust".into()),
text: "let x = 1;".into(),
});
doc.push(Node::Table(Table {
rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
}));
doc.push(Node::Picture {
caption: Some("Fig 1".into()),
image: None,
});
doc.add_paragraph("Last paragraph.");
for &strict in &[false, true] {
for &images in &[ImageMode::Placeholder, ImageMode::Embedded] {
for splits in [&[][..], &[1][..], &[2][..], &[4][..], &[1, 4, 6][..]] {
assert_stream_matches(&doc, strict, images, splits);
}
}
}
}
// A link whose anchor appears only in the second streamed chunk must still be
// applied there, so the streamed output matches the buffered one.
#[test]
fn streaming_applies_recovered_links_in_strict_mode() {
let mut doc = DoclingDocument::new("d");
doc.add_paragraph("See LinkedIn for details.");
doc.add_paragraph("And GitHub too.");
doc.links = vec![
("LinkedIn".into(), "https://lnkd/".into()),
("GitHub".into(), "https://gh/".into()),
];
assert_stream_matches(&doc, true, ImageMode::Placeholder, &[1]);
}
// Spaces next to brackets, parentheses and punctuation are removed only by
// the strict export.
#[test]
fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
let mut doc = DoclingDocument::new("t");
doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
}
}