1use crate::document::{DoclingDocument, Node, Table};
4
/// How pictures are represented in the exported Markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ImageMode {
    /// Emit only the `<!-- image -->` comment marker (the default).
    #[default]
    Placeholder,
    /// Build the marker from the picture's `data_uri()`.
    Embedded,
    /// Record the picture bytes as an artifact and build the marker from its path.
    Referenced,
}
17
/// Mutable state threaded through one rendering pass.
struct Ctx {
    /// Enables `strict_text` clean-ups and code-fence language tags.
    strict: bool,
    /// Passed to `render_table` to select the compact table layout.
    compact_tables: bool,
    /// Picture representation consulted by `picture_marker`.
    images: ImageMode,
    /// Directory prefix used when building referenced image paths.
    artifacts_dir: String,
    /// `(path, bytes)` pairs collected for referenced images.
    artifacts: Vec<(String, Vec<u8>)>,
    /// Counter used to number referenced image files.
    pic_index: usize,
}
29
30pub fn to_markdown(doc: &DoclingDocument, strict: bool) -> String {
36 to_markdown_images(doc, strict, ImageMode::Placeholder, "artifacts").0
37}
38
39pub fn to_markdown_images(
43 doc: &DoclingDocument,
44 strict: bool,
45 images: ImageMode,
46 artifacts_dir: &str,
47) -> (String, Vec<(String, Vec<u8>)>) {
48 let mut ctx = Ctx {
49 strict,
50 compact_tables: doc.compact_tables,
51 images,
52 artifacts_dir: artifacts_dir.to_string(),
53 artifacts: Vec::new(),
54 pic_index: 0,
55 };
56 let mut blocks: Vec<String> = Vec::new();
57 render(&doc.nodes, &mut blocks, &mut ctx);
58 let mut body = blocks.join("\n\n");
59 if strict && !doc.links.is_empty() {
63 body = apply_links(&body, &doc.links);
64 }
65 let md = if body.is_empty() {
66 String::new()
67 } else {
68 format!("{body}\n")
69 };
70 (md, ctx.artifacts)
71}
72
/// Wraps the first occurrence of each link anchor in `body` as `[anchor](href)`.
///
/// Links are consumed in order: the search for each anchor starts right after
/// the previously inserted link, so repeated anchors map to successive
/// occurrences. Anchors that are empty or not found are skipped.
fn apply_links(body: &str, links: &[(String, String)]) -> String {
    let mut text = String::from(body);
    // Byte offset in `text` from which the next anchor is searched.
    let mut resume = 0usize;

    for (label, href) in links {
        // NOTE(review): as written these three replacements map each character
        // to itself, so the anchor is matched verbatim. If entity escaping was
        // intended here, confirm against whatever produces `body`.
        let needle = label
            .replace('&', "&")
            .replace('<', "<")
            .replace('>', ">");
        if needle.is_empty() {
            continue;
        }
        let Some(offset) = text[resume..].find(needle.as_str()) else {
            continue;
        };
        let start = resume + offset;
        let link = format!("[{needle}]({href})");
        text.replace_range(start..start + needle.len(), &link);
        resume = start + link.len();
    }
    text
}
101
/// Streaming variant of link application for a single rendered chunk.
///
/// Every queued link whose anchor is found in `chunk` (searching forward from
/// the previously inserted link) is rewritten as `[anchor](href)` and removed
/// from `queue`. Links that are not found stay queued, in order, for a later
/// chunk. Links with an empty anchor are dropped.
fn apply_links_chunk(chunk: &str, queue: &mut Vec<(String, String)>) -> String {
    let mut text = chunk.to_owned();
    // Byte offset in `text` from which the next anchor is searched.
    let mut resume = 0usize;

    // `retain` visits entries in order; returning `true` keeps the link queued.
    queue.retain(|(label, href)| {
        // NOTE(review): as written these three replacements map each character
        // to itself, so the anchor is matched verbatim. If entity escaping was
        // intended here, confirm against whatever produces `chunk`.
        let needle = label
            .replace('&', "&")
            .replace('<', "<")
            .replace('>', ">");
        if needle.is_empty() {
            return false;
        }
        match text[resume..].find(needle.as_str()) {
            Some(offset) => {
                let start = resume + offset;
                let link = format!("[{needle}]({href})");
                text.replace_range(start..start + needle.len(), &link);
                resume = start + link.len();
                false
            }
            None => true,
        }
    });
    text
}
137
138pub struct MarkdownStreamer {
153 strict: bool,
154 images: ImageMode,
155 compact_tables: bool,
156 emitted_any: bool,
159 links: Vec<(String, String)>,
161}
162
163impl MarkdownStreamer {
164 pub fn new(strict: bool, images: ImageMode, compact_tables: bool) -> Self {
166 debug_assert!(
167 images != ImageMode::Referenced,
168 "referenced image mode is not streamable; use to_markdown_images"
169 );
170 Self {
171 strict,
172 images,
173 compact_tables,
174 emitted_any: false,
175 links: Vec::new(),
176 }
177 }
178
179 pub fn push(&mut self, nodes: &[Node], links: &[(String, String)]) -> String {
184 self.links.extend(links.iter().cloned());
185 let mut ctx = Ctx {
186 strict: self.strict,
187 compact_tables: self.compact_tables,
188 images: self.images,
189 artifacts_dir: String::new(),
192 artifacts: Vec::new(),
193 pic_index: 0,
194 };
195 let mut blocks: Vec<String> = Vec::new();
196 render(nodes, &mut blocks, &mut ctx);
197 if blocks.is_empty() {
198 return String::new();
199 }
200 let mut body = blocks.join("\n\n");
201 if self.strict && !self.links.is_empty() {
202 body = apply_links_chunk(&body, &mut self.links);
203 }
204 let chunk = if self.emitted_any {
205 format!("\n\n{body}")
206 } else {
207 body
208 };
209 self.emitted_any = true;
210 chunk
211 }
212
213 pub fn finish(self) -> String {
216 if self.emitted_any {
217 "\n".to_string()
218 } else {
219 String::new()
220 }
221 }
222}
223
/// Normalizes inline text for strict mode.
///
/// When `strict` is false the text is returned unchanged. Otherwise escaped
/// underscores are unescaped and the space before `, . ; ) ]` or after
/// `( [` is removed. Rewrites are applied one after another, in table order.
fn strict_text(text: &str, strict: bool) -> String {
    // (pattern, replacement) pairs.
    const FIXUPS: [(&str, &str); 8] = [
        ("\\_", "_"),
        (" ,", ","),
        (" .", "."),
        (" ;", ";"),
        (" )", ")"),
        ("( ", "("),
        (" ]", "]"),
        ("[ ", "["),
    ];

    if !strict {
        return text.to_owned();
    }
    FIXUPS
        .iter()
        .fold(text.to_owned(), |acc, &(from, to)| acc.replace(from, to))
}
244
245fn render(nodes: &[Node], blocks: &mut Vec<String>, ctx: &mut Ctx) {
246 let mut i = 0;
247 while i < nodes.len() {
248 match &nodes[i] {
249 Node::ListItem { .. } => {
250 let start = i;
251 while matches!(nodes.get(i), Some(Node::ListItem { .. })) {
252 i += 1;
253 }
254 render_list_run(&nodes[start..i], blocks, ctx.strict);
255 }
256 other => {
257 render_one(other, blocks, ctx);
258 i += 1;
259 }
260 }
261 }
262}
263
264fn render_list_run(items: &[Node], blocks: &mut Vec<String>, strict: bool) {
270 let mut lines: Vec<String> = Vec::new();
271 let mut prev: Vec<Option<(bool, u64)>> = Vec::new();
274
275 for item in items {
276 let Node::ListItem {
277 ordered,
278 number,
279 first_in_list,
280 text,
281 level,
282 } = item
283 else {
284 continue;
285 };
286 let level = *level as usize;
287
288 prev.truncate(level + 1);
290 while prev.len() <= level {
291 prev.push(None);
292 }
293
294 if let Some((prev_ordered, prev_number)) = prev[level] {
298 let new_list = *first_in_list
299 || prev_ordered != *ordered
300 || (*ordered && *number != prev_number + 1);
301 if new_list {
302 lines.push(String::new());
303 }
304 }
305
306 let indent = " ".repeat(level);
307 let marker = if *ordered {
308 format!("{number}.")
309 } else {
310 "-".to_string()
311 };
312 lines.push(format!("{indent}{marker} {}", strict_text(text, strict)));
313 prev[level] = Some((*ordered, *number));
314 }
315
316 blocks.push(lines.join("\n"));
317}
318
319fn render_one(node: &Node, blocks: &mut Vec<String>, ctx: &mut Ctx) {
320 match node {
321 Node::Heading { level, text } => {
322 let hashes = "#".repeat((*level).clamp(1, 6) as usize);
323 blocks.push(format!("{hashes} {}", strict_text(text, ctx.strict)));
324 }
325 Node::Paragraph { text } => blocks.push(strict_text(text, ctx.strict)),
326 Node::Code { language, text } => {
327 let lang = match language {
329 Some(l) if ctx.strict => l.as_str(),
330 _ => "",
331 };
332 blocks.push(format!("```{lang}\n{text}\n```"));
333 }
334 Node::Table(table) => {
335 let rendered = render_table(table, ctx.compact_tables);
336 if !rendered.is_empty() {
337 blocks.push(rendered);
338 }
339 }
340 Node::Picture { caption, image } => {
341 if let Some(cap) = caption {
342 if !cap.is_empty() {
343 blocks.push(cap.clone());
344 }
345 }
346 blocks.push(picture_marker(image.as_ref(), ctx));
347 }
348 Node::Group { children, .. } => render(children, blocks, ctx),
349 Node::FieldRegion { items } => {
350 blocks.push(MISSING_TEXT.to_string());
355 for item in items {
356 blocks.push(MISSING_TEXT.to_string());
357 for part in [&item.marker, &item.key, &item.value].into_iter().flatten() {
358 blocks.push(strict_text(part, ctx.strict));
359 }
360 }
361 }
362 Node::ListItem { .. } => unreachable!("list items are rendered in runs"),
364 }
365}
366
/// Placeholder block that `render_one` emits for a field region and before
/// each of its items.
const MISSING_TEXT: &str = "<!-- missing-text -->";
370
371fn picture_marker(image: Option<&crate::PictureImage>, ctx: &mut Ctx) -> String {
374 match (ctx.images, image) {
375 (ImageMode::Embedded, Some(img)) => format!("", img.data_uri()),
376 (ImageMode::Referenced, Some(img)) => {
377 let path = format!(
378 "{}/image_{:06}.{}",
379 ctx.artifacts_dir,
380 ctx.pic_index,
381 ext_for(&img.mimetype)
382 );
383 ctx.pic_index += 1;
384 ctx.artifacts.push((path.clone(), img.data.clone()));
385 format!("")
386 }
387 _ => "<!-- image -->".to_string(),
389 }
390}
391
/// Maps an image MIME type to a file extension (without the dot).
///
/// Matching is exact and case-sensitive; any unlisted type yields `"png"`.
fn ext_for(mimetype: &str) -> &str {
    // (MIME type, extension) pairs for the non-default cases.
    const KNOWN: &[(&str, &str)] = &[
        ("image/jpeg", "jpg"),
        ("image/gif", "gif"),
        ("image/webp", "webp"),
        ("image/bmp", "bmp"),
        ("image/tiff", "tif"),
    ];

    KNOWN
        .iter()
        .find(|(mime, _)| *mime == mimetype)
        .map_or("png", |&(_, ext)| ext)
}
402
403fn is_number_cell(t: &str) -> bool {
420 t.parse::<f64>().is_ok() || is_thousands_number(t)
421}
422
/// Recognizes numbers written with comma thousands separators.
///
/// Accepted shapes (the whole string must match):
/// * optional `+`/`-`, one to three digits, any number of `,ddd` groups, and
///   an optional `.` followed by zero or more digits;
/// * an unsigned bare fraction: `.` followed by at least one digit.
///
/// A signed string without integer digits (for example `-.5`) is rejected.
fn is_thousands_number(t: &str) -> bool {
    let bytes = t.as_bytes();
    let unsigned = match bytes.first() {
        Some(b'+' | b'-') => &bytes[1..],
        _ => bytes,
    };

    // Leading integer group: at most three digits.
    let lead = unsigned
        .iter()
        .take(3)
        .take_while(|b| b.is_ascii_digit())
        .count();

    if lead == 0 {
        // No integer part: matching restarts at the very first byte, so a
        // sign is not skipped and only ".digits" can succeed.
        return match bytes.split_first() {
            Some((b'.', frac)) => !frac.is_empty() && frac.iter().all(u8::is_ascii_digit),
            _ => false,
        };
    }

    // Consume every well-formed ",ddd" group.
    let mut rest = &unsigned[lead..];
    while let [b',', d1, d2, d3, tail @ ..] = rest {
        if !(d1.is_ascii_digit() && d2.is_ascii_digit() && d3.is_ascii_digit()) {
            break;
        }
        rest = tail;
    }

    // What remains must be nothing, or a '.' followed only by digits.
    match rest.split_first() {
        None => true,
        Some((b'.', frac)) => frac.iter().all(u8::is_ascii_digit),
        Some(_) => false,
    }
}
470
471fn render_table(table: &Table, compact: bool) -> String {
472 if table.rows.is_empty() {
473 return String::new();
474 }
475 let num_cols = table.rows.iter().map(Vec::len).max().unwrap_or(0);
476 if num_cols == 0 {
477 return String::new();
478 }
479
480 let grid: Vec<Vec<String>> = table
483 .rows
484 .iter()
485 .enumerate()
486 .map(|(r, row)| {
487 (0..num_cols)
488 .map(|c| {
489 let cell = escape_cell(row.get(c).map(String::as_str).unwrap_or(""));
490 if r == 0 {
491 cell
492 } else {
493 cell.trim().to_string()
494 }
495 })
496 .collect()
497 })
498 .collect();
499
500 if compact {
501 let render_row = |r: usize| -> String { format!("| {} |", grid[r].join(" | ")) };
503 let mut lines = Vec::with_capacity(grid.len() + 1);
504 lines.push(render_row(0));
505 let sep: Vec<&str> = (0..num_cols).map(|_| "-").collect();
506 lines.push(format!("| {} |", sep.join(" | ")));
507 for r in 1..grid.len() {
508 lines.push(render_row(r));
509 }
510 return lines.join("\n");
511 }
512
513 let dw = |s: &str| s.chars().count();
515 let data_rows = 1..grid.len();
516
517 let right: Vec<bool> = (0..num_cols)
522 .map(|c| {
523 let mut any = false;
524 for r in data_rows.clone() {
525 let t = grid[r][c].trim();
526 if t.is_empty() {
527 continue;
528 }
529 if !is_number_cell(t) {
530 return false;
531 }
532 any = true;
533 }
534 any
535 })
536 .collect();
537
538 let width: Vec<usize> = (0..num_cols)
540 .map(|c| {
541 let mut w = dw(&grid[0][c]) + 2;
542 for r in data_rows.clone() {
543 w = w.max(dw(&grid[r][c]));
544 }
545 w
546 })
547 .collect();
548
549 let fmt_cell = |s: &str, c: usize| -> String {
550 let pad = " ".repeat(width[c].saturating_sub(dw(s)));
551 let body = if right[c] {
552 format!("{pad}{s}")
553 } else {
554 format!("{s}{pad}")
555 };
556 format!(" {body} ")
557 };
558 let render_row = |r: usize| -> String {
559 let cells: Vec<String> = (0..num_cols).map(|c| fmt_cell(&grid[r][c], c)).collect();
560 format!("|{}|", cells.join("|"))
561 };
562
563 let mut lines = Vec::with_capacity(grid.len() + 1);
564 lines.push(render_row(0));
565 let sep: Vec<String> = (0..num_cols).map(|c| "-".repeat(width[c] + 2)).collect();
566 lines.push(format!("|{}|", sep.join("|")));
567 for r in data_rows {
568 lines.push(render_row(r));
569 }
570 lines.join("\n")
571}
572
/// Makes `s` safe to place inside a single pipe-table cell.
///
/// Newlines become spaces so the cell stays on one row, and `|` is written as
/// the numeric character reference `&#124;` so it is not read as a column
/// separator.
fn escape_cell(s: &str) -> String {
    s.replace('\n', " ").replace('|', "&#124;")
}
578
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings, paragraphs and a flat bullet list are separated by blank lines.
    #[test]
    fn renders_headings_paragraphs_and_lists() {
        let mut doc = DoclingDocument::new("demo");
        doc.add_heading(1, "Title");
        doc.add_paragraph("Hello world.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "first".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "second".into(),
            level: 0,
        });
        let md = doc.export_to_markdown();
        assert_eq!(md, "# Title\n\nHello world.\n\n- first\n- second\n");
    }

    /// Recovered links are only applied in strict mode.
    #[test]
    fn strict_renders_recovered_links_legacy_does_not() {
        let mut doc = DoclingDocument::new("cv");
        doc.add_paragraph("Find me on LinkedIn or GitHub.");
        doc.links = vec![
            ("LinkedIn".into(), "https://www.linkedin.com/in/x/".into()),
            ("GitHub".into(), "https://github.com/x/".into()),
        ];
        assert_eq!(doc.export_to_markdown(), "Find me on LinkedIn or GitHub.\n");
        assert_eq!(
            doc.export_to_markdown_with(true),
            "Find me on [LinkedIn](https://www.linkedin.com/in/x/) or [GitHub](https://github.com/x/).\n"
        );
    }

    /// Repeated anchors are matched to successive occurrences, in link order.
    #[test]
    fn strict_links_match_escaped_anchor_and_consume_in_order() {
        let mut doc = DoclingDocument::new("d");
        doc.add_paragraph("AI & ML here, and issues here, then issues there.");
        doc.links = vec![
            ("AI & ML".into(), "https://a/".into()),
            ("issues".into(), "https://first/".into()),
            ("issues".into(), "https://second/".into()),
        ];
        assert_eq!(
            doc.export_to_markdown_with(true),
            "[AI & ML](https://a/) here, and [issues](https://first/) here, then [issues](https://second/) there.\n"
        );
    }

    /// Compact tables use single-space separators and a one-dash rule.
    #[test]
    fn renders_compact_table() {
        let mut doc = DoclingDocument::new("t");
        doc.compact_tables = true;
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "| a | b |\n| - | - |\n| 1 | 2 |\n");
    }

    /// Padded tables widen each column to the header length plus two and
    /// right-align columns whose data cells are all numeric.
    #[test]
    fn renders_padded_github_table_by_default() {
        let mut doc = DoclingDocument::new("t");
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        let md = doc.export_to_markdown();
        assert_eq!(md, "|   a |   b |\n|-----|-----|\n|   1 |   2 |\n");
    }

    /// Escaped underscores are unescaped only in strict mode.
    #[test]
    fn strict_unescapes_inline_underscores_legacy_keeps_them() {
        let mut doc = DoclingDocument::new("t");
        doc.add_heading(1, "a\\_b");
        doc.add_paragraph("x\\_y");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "i\\_j".into(),
            level: 0,
        });
        assert_eq!(doc.export_to_markdown(), "# a\\_b\n\nx\\_y\n\n- i\\_j\n");
        assert_eq!(doc.export_to_markdown_with(true), "# a_b\n\nx_y\n\n- i_j\n");
    }

    /// Streams `doc` in the node ranges delimited by `splits` and asserts the
    /// concatenated output equals the buffered export. All document links are
    /// handed over with the first chunk.
    fn assert_stream_matches(
        doc: &DoclingDocument,
        strict: bool,
        images: ImageMode,
        splits: &[usize],
    ) {
        let want = to_markdown_images(doc, strict, images, "artifacts").0;
        let mut streamer = MarkdownStreamer::new(strict, images, doc.compact_tables);
        let mut got = String::new();
        let mut start = 0;
        for &end in splits {
            let links = if start == 0 {
                doc.links.as_slice()
            } else {
                &[]
            };
            got.push_str(&streamer.push(&doc.nodes[start..end], links));
            start = end;
        }
        // Remaining tail; it still carries the links when no split happened.
        let links = if start == 0 {
            doc.links.as_slice()
        } else {
            &[]
        };
        got.push_str(&streamer.push(&doc.nodes[start..], links));
        got.push_str(&streamer.finish());
        assert_eq!(
            got, want,
            "streamed output diverged (splits={splits:?}, strict={strict})"
        );
    }

    /// Streaming at several split points reproduces the buffered output.
    #[test]
    fn streaming_is_byte_identical_to_buffered() {
        let mut doc = DoclingDocument::new("d");
        doc.add_heading(1, "Title");
        doc.add_paragraph("First paragraph.");
        doc.push(Node::ListItem {
            ordered: false,
            number: 1,
            first_in_list: true,
            text: "a".into(),
            level: 0,
        });
        doc.push(Node::ListItem {
            ordered: false,
            number: 2,
            first_in_list: false,
            text: "b".into(),
            level: 0,
        });
        doc.push(Node::Code {
            language: Some("rust".into()),
            text: "let x = 1;".into(),
        });
        doc.push(Node::Table(Table {
            rows: vec![vec!["a".into(), "b".into()], vec!["1".into(), "2".into()]],
        }));
        doc.push(Node::Picture {
            caption: Some("Fig 1".into()),
            image: None,
        });
        doc.add_paragraph("Last paragraph.");

        for &strict in &[false, true] {
            for &images in &[ImageMode::Placeholder, ImageMode::Embedded] {
                // No split point falls inside the list run (nodes 2..4).
                for splits in [&[][..], &[1][..], &[2][..], &[4][..], &[1, 4, 6][..]] {
                    assert_stream_matches(&doc, strict, images, splits);
                }
            }
        }
    }

    /// A link whose anchor appears in a later chunk is carried over.
    #[test]
    fn streaming_applies_recovered_links_in_strict_mode() {
        let mut doc = DoclingDocument::new("d");
        doc.add_paragraph("See LinkedIn for details.");
        doc.add_paragraph("And GitHub too.");
        doc.links = vec![
            ("LinkedIn".into(), "https://lnkd/".into()),
            ("GitHub".into(), "https://gh/".into()),
        ];
        assert_stream_matches(&doc, true, ImageMode::Placeholder, &[1]);
    }

    /// Spaces around brackets and before punctuation are removed in strict mode.
    #[test]
    fn strict_tightens_punctuation_spacing_legacy_keeps_it() {
        let mut doc = DoclingDocument::new("t");
        doc.add_paragraph("see [ 37 , 36 ] and ( x ) .");
        assert_eq!(doc.export_to_markdown(), "see [ 37 , 36 ] and ( x ) .\n");
        assert_eq!(doc.export_to_markdown_with(true), "see [37, 36] and (x).\n");
    }
}