Skip to main content

fleischwolf_core/
document.rs

1//! The unified document representation.
2
3use crate::markdown::{to_markdown, to_markdown_images};
4use crate::ImageMode;
5
6/// The unified, format-agnostic document produced by every backend.
7///
8/// This is the heart of docling: backends parse their source format into a
9/// `DoclingDocument`, and serializers turn it back into Markdown, HTML, JSON,
10/// etc. Phase 0 uses a flat sequence of [`Node`]s; the production schema will
11/// match docling-core's body-tree-with-references layout.
12#[derive(Debug, Clone, PartialEq)]
13pub struct DoclingDocument {
14    /// Logical document name (usually the input file stem).
15    pub name: String,
16    /// Top-level content, in reading order.
17    pub nodes: Vec<Node>,
18    /// Default Markdown export mode for [`Self::export_to_markdown`]. `false`
19    /// (the default) reproduces docling's legacy output byte-for-byte; `true`
20    /// emits cleaner, more conformant Markdown. Set by `DocumentConverter`.
21    pub strict_markdown: bool,
22    /// Emit tables in the compact `| a | b |` / `| - | - |` form rather than
23    /// docling-core's width-padded GitHub serializer. The PDF backend sets this
24    /// (its committed groundtruth corpus predates the padded serializer); DOCX/HTML
25    /// leave it `false` to match current published docling.
26    pub compact_tables: bool,
27    /// Hyperlinks recovered from the source, as `(anchor_text, href)` pairs in
28    /// document order. docling's standard pipeline drops PDF link annotations, so
29    /// these are rendered as Markdown `[anchor](href)` **only in strict mode**
30    /// (legacy/docling output is left byte-for-byte unchanged). The PDF backend
31    /// populates this from pdfium link annotations; other backends leave it empty.
32    pub links: Vec<(String, String)>,
33}
34
35/// A single piece of document content.
36#[derive(Debug, Clone, PartialEq)]
37pub enum Node {
38    /// A heading. `level` is 1-6.
39    Heading { level: u8, text: String },
40    /// A run of body text.
41    Paragraph { text: String },
42    /// A single list item at the given nesting `level` (0 = top). For ordered
43    /// items, `number` is the display number (honoring the list's `start`); it
44    /// is unused for unordered items. `first_in_list` marks the first item of a
45    /// list so the serializer can blank-line-separate adjacent sibling lists.
46    ListItem {
47        ordered: bool,
48        number: u64,
49        first_in_list: bool,
50        text: String,
51        level: u8,
52    },
53    /// A fenced code block.
54    Code {
55        language: Option<String>,
56        text: String,
57    },
58    /// A table. The first row is treated as the header.
59    Table(Table),
60    /// A picture/figure, with an optional caption and (when a backend extracts
61    /// it) the embedded image itself.
62    Picture {
63        caption: Option<String>,
64        image: Option<PictureImage>,
65    },
66    /// A logical grouping of child nodes (e.g. a list, a section).
67    Group { label: String, children: Vec<Node> },
68    /// A form key-value region (docling's `field_region`): a set of form fields,
69    /// each pairing an optional marker, key, and value. Backends detect these
70    /// from form structure (e.g. HTML's `keyN` / `keyN_valueM` / `keyN_marker`
71    /// `id`-convention); the serializers render each item's parts as separate
72    /// labelled texts (`marker` / `field_key` / `field_value`).
73    FieldRegion { items: Vec<FieldItem> },
74}
75
/// A single entry of a [`Node::FieldRegion`].
///
/// Each entry is a marker/key/value triple in which any part may be absent. It
/// mirrors docling's `field_item` and its `marker` / `field_key` /
/// `field_value` child texts.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct FieldItem {
    /// The marker text, if present.
    pub marker: Option<String>,
    /// The key text, if present.
    pub key: Option<String>,
    /// The value text, if present.
    pub value: Option<String>,
}
85
/// The raw encoded bytes of an extracted picture, together with its mimetype
/// and pixel size — fleischwolf's analogue of docling-core's `ImageRef`.
#[derive(Clone, Debug, PartialEq)]
pub struct PictureImage {
    /// Mimetype of the image, e.g. `image/png` or `image/jpeg`.
    pub mimetype: String,
    /// Image width in pixels.
    pub width: u32,
    /// Image height in pixels.
    pub height: u32,
    /// The image file bytes exactly as embedded (PNG/JPEG/…).
    pub data: Vec<u8>,
}
97
98impl PictureImage {
99    /// A `data:` URI for the image (`data:<mimetype>;base64,<…>`).
100    pub fn data_uri(&self) -> String {
101        format!(
102            "data:{};base64,{}",
103            self.mimetype,
104            crate::base64::encode(&self.data)
105        )
106    }
107}
108
/// A minimal table stored in row-major order.
#[derive(Clone, Debug, PartialEq)]
pub struct Table {
    /// The table's rows, each a list of cell texts; `rows[0]` is the header row.
    pub rows: Vec<Vec<String>>,
}
114
115impl DoclingDocument {
116    /// Create an empty document with the given name.
117    pub fn new(name: impl Into<String>) -> Self {
118        Self {
119            name: name.into(),
120            nodes: Vec::new(),
121            strict_markdown: false,
122            compact_tables: false,
123            links: Vec::new(),
124        }
125    }
126
127    /// Append a node.
128    pub fn push(&mut self, node: Node) {
129        self.nodes.push(node);
130    }
131
132    /// Convenience: append a heading.
133    pub fn add_heading(&mut self, level: u8, text: impl Into<String>) {
134        self.push(Node::Heading {
135            level,
136            text: text.into(),
137        });
138    }
139
140    /// Convenience: append a paragraph.
141    pub fn add_paragraph(&mut self, text: impl Into<String>) {
142        self.push(Node::Paragraph { text: text.into() });
143    }
144
145    /// Serialize the document to Markdown.
146    ///
147    /// The Rust equivalent of docling-core's
148    /// `DoclingDocument.export_to_markdown()`. Uses [`Self::strict_markdown`] to
149    /// pick between docling-legacy output (default) and the cleaner, more
150    /// conformant variant.
151    pub fn export_to_markdown(&self) -> String {
152        to_markdown(self, self.strict_markdown)
153    }
154
155    /// Serialize to Markdown, explicitly choosing the mode regardless of
156    /// [`Self::strict_markdown`]. `strict = true` produces cleaner, more
157    /// conformant Markdown (code-fence languages preserved, no inline-run
158    /// spacing artifacts); `strict = false` reproduces docling's legacy output.
159    pub fn export_to_markdown_with(&self, strict: bool) -> String {
160        to_markdown(self, strict)
161    }
162
163    /// Serialize to docling-core's native JSON wire format (`DoclingDocument`
164    /// schema), pretty-printed — the Rust equivalent of
165    /// `DoclingDocument.export_to_dict()` / `save_as_json()`. The output loads
166    /// back into Python docling-core and round-trips to the same Markdown.
167    pub fn export_to_json(&self) -> String {
168        serde_json::to_string_pretty(&crate::json::to_json(self))
169            .expect("DoclingDocument JSON is always serializable")
170    }
171
172    /// Serialize to Markdown with an explicit picture [`ImageMode`] (mirrors
173    /// docling's `image_mode`). Returns the Markdown and, for
174    /// [`ImageMode::Referenced`], the `(relative-path, bytes)` of each image the
175    /// caller should write next to the Markdown file. `artifacts_dir` is the
176    /// directory name used in referenced links.
177    pub fn export_to_markdown_with_images(
178        &self,
179        image_mode: ImageMode,
180        artifacts_dir: &str,
181    ) -> (String, Vec<(String, Vec<u8>)>) {
182        to_markdown_images(self, self.strict_markdown, image_mode, artifacts_dir)
183    }
184}