fleischwolf_core/document.rs
1//! The unified document representation.
2
3use crate::markdown::{to_markdown, to_markdown_images};
4use crate::ImageMode;
5
6/// The unified, format-agnostic document produced by every backend.
7///
8/// This is the heart of docling: backends parse their source format into a
9/// `DoclingDocument`, and serializers turn it back into Markdown, HTML, JSON,
10/// etc. Phase 0 uses a flat sequence of [`Node`]s; the production schema will
11/// match docling-core's body-tree-with-references layout.
12#[derive(Debug, Clone, PartialEq)]
13pub struct DoclingDocument {
14 /// Logical document name (usually the input file stem).
15 pub name: String,
16 /// Top-level content, in reading order.
17 pub nodes: Vec<Node>,
18 /// Default Markdown export mode for [`Self::export_to_markdown`]. `false`
19 /// (the default) reproduces docling's legacy output byte-for-byte; `true`
20 /// emits cleaner, more conformant Markdown. Set by `DocumentConverter`.
21 pub strict_markdown: bool,
22}
23
24/// A single piece of document content.
25#[derive(Debug, Clone, PartialEq)]
26pub enum Node {
27 /// A heading. `level` is 1-6.
28 Heading { level: u8, text: String },
29 /// A run of body text.
30 Paragraph { text: String },
31 /// A single list item at the given nesting `level` (0 = top). For ordered
32 /// items, `number` is the display number (honoring the list's `start`); it
33 /// is unused for unordered items. `first_in_list` marks the first item of a
34 /// list so the serializer can blank-line-separate adjacent sibling lists.
35 ListItem {
36 ordered: bool,
37 number: u64,
38 first_in_list: bool,
39 text: String,
40 level: u8,
41 },
42 /// A fenced code block.
43 Code {
44 language: Option<String>,
45 text: String,
46 },
47 /// A table. The first row is treated as the header.
48 Table(Table),
49 /// A picture/figure, with an optional caption and (when a backend extracts
50 /// it) the embedded image itself.
51 Picture {
52 caption: Option<String>,
53 image: Option<PictureImage>,
54 },
55 /// A logical grouping of child nodes (e.g. a list, a section).
56 Group { label: String, children: Vec<Node> },
57}
58
/// Raw encoded bytes of an extracted picture, together with its mimetype and
/// pixel size — the fleischwolf analogue of docling-core's `ImageRef`.
///
/// Every field has total equality, so the type derives `Eq` alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PictureImage {
    /// Mimetype of the image, e.g. `image/png` or `image/jpeg`.
    pub mimetype: String,
    /// Width of the image in pixels.
    pub width: u32,
    /// Height of the image in pixels.
    pub height: u32,
    /// The image file bytes, exactly as embedded (PNG/JPEG/…).
    pub data: Vec<u8>,
}
70
71impl PictureImage {
72 /// A `data:` URI for the image (`data:<mimetype>;base64,<…>`).
73 pub fn data_uri(&self) -> String {
74 format!(
75 "data:{};base64,{}",
76 self.mimetype,
77 crate::base64::encode(&self.data)
78 )
79 }
80}
81
/// A simple row-major table. `rows[0]` is the header row.
///
/// Cells are plain strings, which have total equality, so the type derives
/// `Eq` alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Table {
    /// Rows of the table, each row being its cells from first to last column.
    pub rows: Vec<Vec<String>>,
}
87
88impl DoclingDocument {
89 /// Create an empty document with the given name.
90 pub fn new(name: impl Into<String>) -> Self {
91 Self {
92 name: name.into(),
93 nodes: Vec::new(),
94 strict_markdown: false,
95 }
96 }
97
98 /// Append a node.
99 pub fn push(&mut self, node: Node) {
100 self.nodes.push(node);
101 }
102
103 /// Convenience: append a heading.
104 pub fn add_heading(&mut self, level: u8, text: impl Into<String>) {
105 self.push(Node::Heading {
106 level,
107 text: text.into(),
108 });
109 }
110
111 /// Convenience: append a paragraph.
112 pub fn add_paragraph(&mut self, text: impl Into<String>) {
113 self.push(Node::Paragraph { text: text.into() });
114 }
115
116 /// Serialize the document to Markdown.
117 ///
118 /// The Rust equivalent of docling-core's
119 /// `DoclingDocument.export_to_markdown()`. Uses [`Self::strict_markdown`] to
120 /// pick between docling-legacy output (default) and the cleaner, more
121 /// conformant variant.
122 pub fn export_to_markdown(&self) -> String {
123 to_markdown(self, self.strict_markdown)
124 }
125
126 /// Serialize to Markdown, explicitly choosing the mode regardless of
127 /// [`Self::strict_markdown`]. `strict = true` produces cleaner, more
128 /// conformant Markdown (code-fence languages preserved, no inline-run
129 /// spacing artifacts); `strict = false` reproduces docling's legacy output.
130 pub fn export_to_markdown_with(&self, strict: bool) -> String {
131 to_markdown(self, strict)
132 }
133
134 /// Serialize to docling-core's native JSON wire format (`DoclingDocument`
135 /// schema), pretty-printed — the Rust equivalent of
136 /// `DoclingDocument.export_to_dict()` / `save_as_json()`. The output loads
137 /// back into Python docling-core and round-trips to the same Markdown.
138 pub fn export_to_json(&self) -> String {
139 serde_json::to_string_pretty(&crate::json::to_json(self))
140 .expect("DoclingDocument JSON is always serializable")
141 }
142
143 /// Serialize to Markdown with an explicit picture [`ImageMode`] (mirrors
144 /// docling's `image_mode`). Returns the Markdown and, for
145 /// [`ImageMode::Referenced`], the `(relative-path, bytes)` of each image the
146 /// caller should write next to the Markdown file. `artifacts_dir` is the
147 /// directory name used in referenced links.
148 pub fn export_to_markdown_with_images(
149 &self,
150 image_mode: ImageMode,
151 artifacts_dir: &str,
152 ) -> (String, Vec<(String, Vec<u8>)>) {
153 to_markdown_images(self, self.strict_markdown, image_mode, artifacts_dir)
154 }
155}