1use crate::document::{DoclingDocument, Node, Table};
4
/// Selects how a picture node is represented in the rendered Markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ImageMode {
    /// Emit the `<!-- image -->` placeholder comment (the default mode).
    #[default]
    Placeholder,
    /// Build the picture marker from the image's data URI.
    Embedded,
    /// Queue the image bytes as an artifact under the artifacts directory and
    /// build the picture marker from that artifact path.
    Referenced,
}
17
/// Mutable state threaded through one rendering pass.
struct Ctx {
    /// Forwarded to `strict_text`; also enables the language tag on code fences.
    strict: bool,
    /// Forwarded to `render_table` to select the compact table layout.
    compact_tables: bool,
    /// How picture nodes are rendered.
    images: ImageMode,
    /// Directory prefix used when building paths for referenced images.
    artifacts_dir: String,
    /// `(path, bytes)` pairs collected for referenced images, in emission order.
    artifacts: Vec<(String, Vec<u8>)>,
    /// Counter used to number referenced image files; advanced once per
    /// referenced image that is emitted.
    pic_index: usize,
}
29
30pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36 to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39pub fn to_markdown_images(
43 doc: &DoclingDocument,
44 strict: bool,
45 images: ImageMode,
46 artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48 let mut ctx = Ctx {
49 strict,
50 compact_tables: doc.compact_tables,
51 images,
52 artifacts_dir: artifacts_dir.to_string(),
53 artifacts: Vec::new(),
54 pic_index: 0,
55 };
56 let mut blocks: Vec<String> = Vec::new();
57 render(&doc.nodes, &mut blocks, &mut ctx);
58 let body = blocks.join("\n\n");
59 let md = if body.is_empty() {
60 String::new()
61 } else {
62 format!("{body}\n")
63 };
64 (md, ctx.artifacts)
65}
66
/// Returns `text` cleaned up for strict output, or an unchanged copy when
/// `strict` is false.
///
/// In strict mode, escaped underscores (`\_`) are unescaped and the single
/// space that sits before `,`, `.`, `;`, `)`, `]` or after `(`, `[` is removed.
fn strict_text(text: &str, strict: bool) -> String {
    // Rules run one after another in this exact order; each pass operates on
    // the output of the previous one.
    const REWRITES: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    if strict {
        REWRITES
            .iter()
            .fold(text.to_owned(), |acc, &(from, to)| acc.replace(from, to))
    } else {
        text.to_owned()
    }
}
87
88fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
89 let mut i = 0;
90 while i < nodes.len() {
91 match &nodes[i] {
92 Node::ListItem { .. } => {
93 let start = i;
94 while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
95 i += 1;
96 }
97 render_list_run(&nodes[start..i], blocks, ctx.strict);
98 }
99 other => {
100 render_one(other, blocks, ctx);
101 i += 1;
102 }
103 }
104 }
105}
106
107fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
113 let mut lines: Vec<String> = Vec::new();
114 let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
117
118 for item in items {
119 let Node::ListItem {
120 ordered,
121 number,
122 first_in_list,
123 text,
124 level,
125 } = item
126 else {
127 continue;
128 };
129 let level = *level as usize;
130
131 prev.truncate(level + 1);
133 while prev.len() <= level {
134 prev.push(None);
135 }
136
137 if let Some((prev_ordered, prev_number)) = prev[level] {
141 let new_list = *first_in_list
142 || prev_ordered != *ordered
143 || (*ordered && *number != prev_number + 1);
144 if new_list {
145 lines.push(String::new());
146 }
147 }
148
149 let indent = " ".repeat(level);
150 let marker = if *ordered {
151 format!("{number}.")
152 } else {
153 "-".to_string()
154 };
155 lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
156 prev[level] = Some((*ordered, *number));
157 }
158
159 blocks.push(lines.join("\n"));
160}
161
162fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
163 match node {
164 Node::Heading { level, text } => {
165 let hashes = "#".repeat((*level).clamp(1, 6) as usize);
166 blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
167 }
168 Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
169 Node::Code { language, text } => {
170 let lang = match language {
172 Some(l) if ctx.strict => l.as_str(),
173 _ => "",
174 };
175 blocks.push(format!("```{lang}\n{text}\n```"));
176 }
177 Node::Table(table) => {
178 let rendered = render_table(table, ctx.compact_tables);
179 if !rendered.is_empty() {
180 blocks.push(rendered);
181 }
182 }
183 Node::Picture { caption, image } => {
184 if let Some(cap) = caption {
185 if !cap.is_empty() {
186 blocks.push(cap.clone());
187 }
188 }
189 blocks.push(picture_marker(image.as_ref(), ctx));
190 }
191 Node::Group { children, .. } => render(children, blocks, ctx),
192 Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
194 }
195}
196
197fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
200 match (ctx.images, image) {
201 (ImageMode::Embedded, Some(img)) => format!("", img.data_uri()),
202 (ImageMode::Referenced, Some(img)) => {
203 let path = format!(
204 "{}/image_{:06}.{}",
205 ctx.artifacts_dir,
206 ctx.pic_index,
207 ext_for(&img.mimetype)
208 );
209 ctx.pic_index += 1;
210 ctx.artifacts.push((path.clone(), img.data.clone()));
211 format!("")
212 }
213 _ => "<!-- image -->".to_string(),
215 }
216}
217
/// Maps an image MIME type to the file extension used for referenced images.
///
/// The comparison is exact (case-sensitive, no parameters); any MIME type not
/// in the table falls back to `"png"`.
fn ext_for(mimetype: &str) -> &str {
    const KNOWN: [(&str, &str); 5] = [
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tif"),
    ];

    KNOWN
        .iter()
        .find(|(mime, _)| *mime == mimetype)
        .map_or("png", |&(_, ext)| ext)
}
228
229fn render_table(table: &Table, compact: bool) -> String {
243 if table.rows.is_empty() {
244 return String::new();
245 }
246 let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
247 if num_cols == 0 {
248 return String::new();
249 }
250
251 let grid: Vec<Vec<String>> = table
254 .rows
255 .iter()
256 .enumerate()
257 .map(|(r, row)| {
258 (0..num_cols)
259 .map(|c| {
260 let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
261 if r == 0 {
262 cell
263 } else {
264 cell.trim().to_string()
265 }
266 })
267 .collect()
268 })
269 .collect();
270
271 if compact {
272 let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
274 let mut lines = Vec::with_capacity(grid.len() + 1);
275 lines.push(render_row(0));
276 let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
277 lines.push(format!("| {} |", sep.join(" | ")));
278 for r in 1..grid.len() {
279 lines.push(render_row(r));
280 }
281 return lines.join("\n");
282 }
283
284 let dw = |s: &str| s.chars().count();
286 let data_rows = 1..grid.len();
287
288 let right: Vec<bool> = (0..num_cols)
290 .map(|c| {
291 !data_rows.is_empty()
292 && data_rows.clone().all(|r| {
293 let t = grid[r][c].trim();
294 !t.is_empty() && t.parse::<f64>().is_ok()
295 })
296 })
297 .collect();
298
299 let width: Vec<usize> = (0..num_cols)
301 .map(|c| {
302 let mut w = dw(&grid[0][c]) + 2;
303 for r in data_rows.clone() {
304 w = w.max(dw(&grid[r][c]));
305 }
306 w
307 })
308 .collect();
309
310 let fmt_cell = |s: &str, c: usize| -> String {
311 let pad = " ".repeat(width[c].saturating_sub(dw(s)));
312 let body = if right[c] {
313 format!("{pad}{s}")
314 } else {
315 format!("{s}{pad}")
316 };
317 format!(" {body} ")
318 };
319 let render_row = |r: usize| -> String {
320 let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
321 format!("|{}|", cells.join("|"))
322 };
323
324 let mut lines = Vec::with_capacity(grid.len() + 1);
325 lines.push(render_row(0));
326 let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
327 lines.push(format!("|{}|", sep.join("|")));
328 for r in data_rows {
329 lines.push(render_row(r));
330 }
331 lines.join("\n")
332}
333
/// Makes `s` safe to place inside a single Markdown table cell.
///
/// Newlines become spaces so the cell stays on one table line, and literal
/// pipes are written as the `&#124;` character reference so they are not read
/// as column separators.
fn escape_cell(s: &str) -> String {
    s.replace('\n', " ").replace('|', "&#124;")
}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings, paragraphs and a flat bullet list are separated by blank lines.
    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "first".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "second".into(),
            level: 0,
        });
        let md = doc.export_to_markdown();
        assert_eq!(md, "# Title\n\nHello world.\n\n- first\n- second\n");
    }

    /// Compact tables use single-space padding and one dash per column.
    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        doc.compact_tables = true;
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    /// Padded tables give each column a width of at least header length + 2
    /// and right-align columns whose data cells are all numeric, so both the
    /// header and the data cells here are padded on the left.
    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n");
    }

    /// Escaped underscores survive in the default mode and are unescaped in
    /// strict mode, for headings, paragraphs and list items alike.
    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "i\\_j".into(),
            level: 0,
        });
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Strict mode removes the stray space around brackets and punctuation;
    /// the default mode leaves the text untouched.
    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}