Skip to main content

fleischwolf_core/
document.rs

1//! The unified document representation.
2
3use crate::markdown::{to_markdown, to_markdown_images};
4use crate::ImageMode;
5
6/// The unified, format-agnostic document produced by every backend.
7///
8/// This is the heart of docling: backends parse their source format into a
9/// `DoclingDocument`, and serializers turn it back into Markdown, HTML, JSON,
10/// etc. Phase 0 uses a flat sequence of [`Node`]s; the production schema will
11/// match docling-core's body-tree-with-references layout.
12#[derive(Debug, Clone, PartialEq)]
13pub struct DoclingDocument {
14    /// Logical document name (usually the input file stem).
15    pub name: String,
16    /// Top-level content, in reading order.
17    pub nodes: Vec<Node>,
18    /// Default Markdown export mode for [`Self::export_to_markdown`]. `false`
19    /// (the default) reproduces docling's legacy output byte-for-byte; `true`
20    /// emits cleaner, more conformant Markdown. Set by `DocumentConverter`.
21    pub strict_markdown: bool,
22}
23
24/// A single piece of document content.
25#[derive(Debug, Clone, PartialEq)]
26pub enum Node {
27    /// A heading. `level` is 1-6.
28    Heading { level: u8, text: String },
29    /// A run of body text.
30    Paragraph { text: String },
31    /// A single list item at the given nesting `level` (0 = top). For ordered
32    /// items, `number` is the display number (honoring the list's `start`); it
33    /// is unused for unordered items. `first_in_list` marks the first item of a
34    /// list so the serializer can blank-line-separate adjacent sibling lists.
35    ListItem {
36        ordered: bool,
37        number: u64,
38        first_in_list: bool,
39        text: String,
40        level: u8,
41    },
42    /// A fenced code block.
43    Code {
44        language: Option<String>,
45        text: String,
46    },
47    /// A table. The first row is treated as the header.
48    Table(Table),
49    /// A picture/figure, with an optional caption and (when a backend extracts
50    /// it) the embedded image itself.
51    Picture {
52        caption: Option<String>,
53        image: Option<PictureImage>,
54    },
55    /// A logical grouping of child nodes (e.g. a list, a section).
56    Group { label: String, children: Vec<Node> },
57}
58
/// An extracted picture's raw encoded bytes plus its mimetype and pixel size —
/// the fleischwolf analogue of docling-core's `ImageRef`.
///
/// Every field has total equality, so the type derives `Eq` alongside
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PictureImage {
    /// Mimetype of the encoded image, e.g. `image/png`, `image/jpeg`.
    pub mimetype: String,
    /// Horizontal component of the picture's pixel size.
    pub width: u32,
    /// Vertical component of the picture's pixel size.
    pub height: u32,
    /// The image file bytes, exactly as embedded (PNG/JPEG/…).
    pub data: Vec<u8>,
}
70
71impl PictureImage {
72    /// A `data:` URI for the image (`data:<mimetype>;base64,<…>`).
73    pub fn data_uri(&self) -> String {
74        format!(
75            "data:{};base64,{}",
76            self.mimetype,
77            crate::base64::encode(&self.data)
78        )
79    }
80}
81
/// A simple row-major table. `rows[0]` is the header row.
///
/// Cells are plain strings, so equality is total and the type derives `Eq`
/// alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Table {
    /// The table's rows; each inner `Vec` holds the cell text of one row.
    pub rows: Vec<Vec<String>>,
}
87
88impl DoclingDocument {
89    /// Create an empty document with the given name.
90    pub fn new(name: impl Into<String>) -> Self {
91        Self {
92            name: name.into(),
93            nodes: Vec::new(),
94            strict_markdown: false,
95        }
96    }
97
98    /// Append a node.
99    pub fn push(&mut self, node: Node) {
100        self.nodes.push(node);
101    }
102
103    /// Convenience: append a heading.
104    pub fn add_heading(&mut self, level: u8, text: impl Into<String>) {
105        self.push(Node::Heading {
106            level,
107            text: text.into(),
108        });
109    }
110
111    /// Convenience: append a paragraph.
112    pub fn add_paragraph(&mut self, text: impl Into<String>) {
113        self.push(Node::Paragraph { text: text.into() });
114    }
115
116    /// Serialize the document to Markdown.
117    ///
118    /// The Rust equivalent of docling-core's
119    /// `DoclingDocument.export_to_markdown()`. Uses [`Self::strict_markdown`] to
120    /// pick between docling-legacy output (default) and the cleaner, more
121    /// conformant variant.
122    pub fn export_to_markdown(&self) -> String {
123        to_markdown(self, self.strict_markdown)
124    }
125
126    /// Serialize to Markdown, explicitly choosing the mode regardless of
127    /// [`Self::strict_markdown`]. `strict = true` produces cleaner, more
128    /// conformant Markdown (code-fence languages preserved, no inline-run
129    /// spacing artifacts); `strict = false` reproduces docling's legacy output.
130    pub fn export_to_markdown_with(&self, strict: bool) -> String {
131        to_markdown(self, strict)
132    }
133
134    /// Serialize to docling-core's native JSON wire format (`DoclingDocument`
135    /// schema), pretty-printed — the Rust equivalent of
136    /// `DoclingDocument.export_to_dict()` / `save_as_json()`. The output loads
137    /// back into Python docling-core and round-trips to the same Markdown.
138    pub fn export_to_json(&self) -> String {
139        serde_json::to_string_pretty(&crate::json::to_json(self))
140            .expect("DoclingDocument JSON is always serializable")
141    }
142
143    /// Serialize to Markdown with an explicit picture [`ImageMode`] (mirrors
144    /// docling's `image_mode`). Returns the Markdown and, for
145    /// [`ImageMode::Referenced`], the `(relative-path, bytes)` of each image the
146    /// caller should write next to the Markdown file. `artifacts_dir` is the
147    /// directory name used in referenced links.
148    pub fn export_to_markdown_with_images(
149        &self,
150        image_mode: ImageMode,
151        artifacts_dir: &str,
152    ) -> (String, Vec<(String, Vec<u8>)>) {
153        to_markdown_images(self, self.strict_markdown, image_mode, artifacts_dir)
154    }
155}