// dongler_core/ir.rs

1use serde::{Deserialize, Serialize};
2
/// Schema identifier of the IR format defined in this module. It is the value
/// `default_schema_version` hands to serde when a document omits
/// `schema_version`.
pub const SCHEMA_VERSION: &str = "dongler.ir.v2";
4
5/// How a page was routed by the pipeline triage stage (IR v2). `None` on
6/// documents produced by the legacy fast path, which keeps v1 deserializable.
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
8#[serde(rename_all = "snake_case")]
9pub enum Route {
10    /// Text-layer characters cover the page; no OCR needed.
11    BornDigital,
12    /// No usable text layer; the page is an image and must be OCR'd.
13    Scanned,
14    /// Partial text layer (e.g. scan with embedded OCR); decided per region.
15    Hybrid,
16}
17
18/// Where a block's text came from. Recorded in [`Provenance`] so consumers can
19/// audit (and filter) text by trustworthiness — the deterministic invariant is
20/// that `Vlm` text only appears after passing the escalation validators.
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
22#[serde(rename_all = "snake_case")]
23pub enum TextSource {
24    /// Pulled verbatim from the PDF text layer (deterministic, cannot hallucinate).
25    TextLayer,
26    /// Produced by an OCR model on a rasterized region.
27    Ocr,
28    /// Produced by a vision-language model (validator-gated).
29    Vlm,
30    /// Derived by a heuristic in the legacy engine (no model).
31    Heuristic,
32}
33
34/// Per-block provenance attached by the pipeline (IR v2). Optional so legacy
35/// documents remain valid.
36#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
37pub struct Provenance {
38    pub text_source: TextSource,
39    /// Model identifier, e.g. `"docling-layout-heron@v2"`. `None` for text-layer.
40    #[serde(default, skip_serializing_if = "Option::is_none")]
41    pub detector: Option<String>,
42    #[serde(default, skip_serializing_if = "Option::is_none")]
43    pub confidence: Option<f32>,
44}
45
/// The closed vocabulary of text-block kinds (IR v2).
///
/// This is a *helper* over the serialized `TextBlock::kind` string rather than
/// a hard field-type change, so v1 documents — including the ones that emit the
/// buggy `"heading"` kind — still deserialize. New pipeline code should build
/// kind strings via [`BlockKind::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// Heading carrying its level. The level is normalised to `1..=6` by
    /// [`BlockKind::as_str`], [`BlockKind::parse`] and
    /// [`BlockKind::heading_level`].
    Heading(u8),
    /// Ordinary paragraph; also the fallback for unrecognised kind strings.
    Paragraph,
    /// Single list item (`"list_item"`).
    ListItem,
    /// Code block (`"code"`).
    Code,
    /// Formula (`"formula"`).
    Formula,
    /// Caption (`"caption"`).
    Caption,
    /// Running page header (`"page_header"`).
    PageHeader,
    /// Running page footer (`"page_footer"`).
    PageFooter,
    /// Footnote (`"footnote"`).
    Footnote,
}

impl BlockKind {
    /// Smallest heading level that is ever serialized or reported.
    const MIN_HEADING_LEVEL: u8 = 1;
    /// Largest heading level that is ever serialized or reported.
    const MAX_HEADING_LEVEL: u8 = 6;

    /// Normalises a raw heading level into the supported `1..=6` range.
    fn clamp_level(level: u8) -> u8 {
        level.clamp(Self::MIN_HEADING_LEVEL, Self::MAX_HEADING_LEVEL)
    }

    /// Canonical serialized form used in `TextBlock::kind`.
    ///
    /// Headings become `heading_<level>` with the level clamped to `1..=6`;
    /// every other kind maps to a fixed snake_case string.
    pub fn as_str(&self) -> String {
        match self {
            BlockKind::Heading(level) => format!("heading_{}", Self::clamp_level(*level)),
            BlockKind::Paragraph => "paragraph".to_owned(),
            BlockKind::ListItem => "list_item".to_owned(),
            BlockKind::Code => "code".to_owned(),
            BlockKind::Formula => "formula".to_owned(),
            BlockKind::Caption => "caption".to_owned(),
            BlockKind::PageHeader => "page_header".to_owned(),
            BlockKind::PageFooter => "page_footer".to_owned(),
            BlockKind::Footnote => "footnote".to_owned(),
        }
    }

    /// Tolerant parse of a serialized kind string.
    ///
    /// Unknown or legacy values (including v1's bare `"heading"` and `"list"`)
    /// map to their closest v2 equivalent and the function never fails — this
    /// is what keeps v1 deserialization total.
    ///
    /// `heading_<n>` levels are clamped to `1..=6`. A numeric level too large
    /// for `u8` (e.g. `"heading_300"`) is treated as the deepest heading rather
    /// than silently degrading to a paragraph; a non-numeric suffix still falls
    /// through to the generic table below.
    pub fn parse(kind: &str) -> BlockKind {
        if let Some(rest) = kind.strip_prefix("heading_") {
            match rest.parse::<u8>() {
                Ok(level) => return BlockKind::Heading(Self::clamp_level(level)),
                // Overflow means the suffix *was* a number, just a huge one:
                // saturate so it agrees with how 7..=255 are handled.
                Err(err) if matches!(err.kind(), ::std::num::IntErrorKind::PosOverflow) => {
                    return BlockKind::Heading(Self::MAX_HEADING_LEVEL);
                }
                Err(_) => {}
            }
        }
        match kind {
            "heading" | "title" => BlockKind::Heading(1),
            "list" | "list_item" => BlockKind::ListItem,
            "code" => BlockKind::Code,
            "formula" | "equation" => BlockKind::Formula,
            "caption" => BlockKind::Caption,
            "page_header" | "header" => BlockKind::PageHeader,
            "page_footer" | "footer" => BlockKind::PageFooter,
            "footnote" => BlockKind::Footnote,
            _ => BlockKind::Paragraph,
        }
    }

    /// Heading level if this kind is a heading, otherwise `None`.
    ///
    /// The level is clamped to `1..=6` so that it always matches what
    /// [`BlockKind::as_str`] serializes, even for a hand-constructed
    /// `Heading(0)` or `Heading(9)`.
    pub fn heading_level(&self) -> Option<u8> {
        match self {
            BlockKind::Heading(level) => Some(Self::clamp_level(*level)),
            _ => None,
        }
    }

    /// Whether the renderer should drop this kind from default Markdown output
    /// (page furniture, per the olmOCR convention the PRD adopts).
    pub fn is_page_furniture(&self) -> bool {
        matches!(self, BlockKind::PageHeader | BlockKind::PageFooter)
    }
}
115
116#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
117pub struct Document {
118    #[serde(default = "default_schema_version")]
119    pub schema_version: String,
120    pub metadata: Metadata,
121    pub pages: Vec<Page>,
122    #[serde(default, skip_serializing_if = "Vec::is_empty")]
123    pub assets: Vec<Asset>,
124    #[serde(default, skip_serializing_if = "Vec::is_empty")]
125    pub warnings: Vec<Warning>,
126}
127
128#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
129pub struct Page {
130    pub number: usize,
131    #[serde(default, skip_serializing_if = "Option::is_none")]
132    pub width: Option<f32>,
133    #[serde(default, skip_serializing_if = "Option::is_none")]
134    pub height: Option<f32>,
135    #[serde(default, skip_serializing_if = "Option::is_none")]
136    pub rotation: Option<i32>,
137    /// Pipeline triage classification (IR v2). `None` on legacy fast-path output.
138    #[serde(default, skip_serializing_if = "Option::is_none")]
139    pub route: Option<Route>,
140    #[serde(default, skip_serializing_if = "Option::is_none")]
141    pub bbox: Option<BBox>,
142    pub blocks: Vec<Block>,
143    #[serde(default, skip_serializing_if = "Vec::is_empty")]
144    pub images: Vec<ImageObject>,
145    #[serde(default, skip_serializing_if = "Vec::is_empty")]
146    pub assets: Vec<Asset>,
147    #[serde(default, skip_serializing_if = "Vec::is_empty")]
148    pub warnings: Vec<Warning>,
149}
150
151#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
152#[serde(tag = "type", rename_all = "snake_case")]
153pub enum Block {
154    Text(TextBlock),
155    Table(TableBlock),
156    Figure(FigureBlock),
157}
158
159#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
160pub struct TextBlock {
161    pub text: String,
162    pub kind: String,
163    #[serde(default, skip_serializing_if = "Option::is_none")]
164    pub bbox: Option<BBox>,
165    #[serde(default, skip_serializing_if = "Vec::is_empty")]
166    pub lines: Vec<Line>,
167    #[serde(default, skip_serializing_if = "Vec::is_empty")]
168    pub source_anchors: Vec<SourceAnchor>,
169    #[serde(default, skip_serializing_if = "Option::is_none")]
170    pub confidence: Option<Confidence>,
171    /// Pipeline provenance (IR v2). `None` on legacy fast-path output.
172    #[serde(default, skip_serializing_if = "Option::is_none")]
173    pub provenance: Option<Provenance>,
174}
175
176#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
177pub struct TableBlock {
178    pub headers: Vec<String>,
179    pub rows: Vec<Vec<String>>,
180    pub caption: Option<String>,
181    #[serde(default, skip_serializing_if = "Option::is_none")]
182    pub bbox: Option<BBox>,
183    #[serde(default, skip_serializing_if = "Vec::is_empty")]
184    pub cells: Vec<TableCell>,
185    /// Pre-rendered HTML table preserving row/col spans (IR v2). When present the
186    /// Markdown renderer embeds it verbatim (the PRD's default table form).
187    #[serde(default, skip_serializing_if = "Option::is_none")]
188    pub html: Option<String>,
189    #[serde(default, skip_serializing_if = "Vec::is_empty")]
190    pub source_anchors: Vec<SourceAnchor>,
191    #[serde(default, skip_serializing_if = "Option::is_none")]
192    pub confidence: Option<Confidence>,
193    /// Pipeline provenance (IR v2). `None` on legacy fast-path output.
194    #[serde(default, skip_serializing_if = "Option::is_none")]
195    pub provenance: Option<Provenance>,
196}
197
198#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
199pub struct FigureBlock {
200    pub alt_text: Option<String>,
201    pub caption: Option<String>,
202    #[serde(default, skip_serializing_if = "Option::is_none")]
203    pub bbox: Option<BBox>,
204    #[serde(default, skip_serializing_if = "Option::is_none")]
205    pub image_ref: Option<String>,
206    #[serde(default, skip_serializing_if = "Vec::is_empty")]
207    pub source_anchors: Vec<SourceAnchor>,
208    #[serde(default, skip_serializing_if = "Option::is_none")]
209    pub confidence: Option<Confidence>,
210    /// Pipeline provenance (IR v2). `None` on legacy fast-path output.
211    #[serde(default, skip_serializing_if = "Option::is_none")]
212    pub provenance: Option<Provenance>,
213}
214
215#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
216pub struct Metadata {
217    pub format: String,
218    pub engine: String,
219    pub source: Option<String>,
220    pub title: Option<String>,
221    pub character_count: usize,
222    pub word_count: usize,
223    pub block_count: usize,
224    #[serde(default, skip_serializing_if = "Option::is_none")]
225    pub file_size_bytes: Option<u64>,
226    #[serde(default, skip_serializing_if = "Option::is_none")]
227    pub pdf_version: Option<String>,
228    #[serde(default)]
229    pub encrypted: bool,
230}
231
232#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
233pub struct BatchResult {
234    pub path: String,
235    pub ok: bool,
236    pub document: Option<Document>,
237    pub error: Option<String>,
238}
239
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct ExtractOptions {
242    #[serde(default = "default_include_geometry")]
243    pub include_geometry: bool,
244    #[serde(default = "default_include_assets")]
245    pub include_assets: bool,
246    #[serde(default, skip_serializing_if = "Option::is_none")]
247    pub max_parallelism: Option<usize>,
248    #[serde(default)]
249    pub suppress_headers_footers: bool,
250    #[serde(default, skip_serializing_if = "Option::is_none")]
251    pub password: Option<String>,
252}
253
254impl Default for ExtractOptions {
255    fn default() -> Self {
256        Self {
257            include_geometry: true,
258            include_assets: true,
259            max_parallelism: None,
260            suppress_headers_footers: false,
261            password: None,
262        }
263    }
264}
265
266#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
267pub struct BBox {
268    pub x: f32,
269    pub y: f32,
270    pub width: f32,
271    pub height: f32,
272}
273
274#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
275pub struct Line {
276    pub text: String,
277    #[serde(default, skip_serializing_if = "Option::is_none")]
278    pub bbox: Option<BBox>,
279    #[serde(default, skip_serializing_if = "Vec::is_empty")]
280    pub spans: Vec<Span>,
281}
282
283#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
284pub struct Span {
285    pub text: String,
286    #[serde(default, skip_serializing_if = "Option::is_none")]
287    pub bbox: Option<BBox>,
288    #[serde(default, skip_serializing_if = "Option::is_none")]
289    pub font: Option<String>,
290    #[serde(default, skip_serializing_if = "Option::is_none")]
291    pub size: Option<f32>,
292    #[serde(default, skip_serializing_if = "is_false")]
293    pub bold: bool,
294    #[serde(default, skip_serializing_if = "is_false")]
295    pub italic: bool,
296}
297
298#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
299pub struct TableCell {
300    pub row: usize,
301    pub column: usize,
302    pub text: String,
303    #[serde(default, skip_serializing_if = "Option::is_none")]
304    pub bbox: Option<BBox>,
305    #[serde(default)]
306    pub is_header: bool,
307    /// Number of grid columns this cell spans (1 for an ordinary cell). A value
308    /// greater than 1 marks a horizontally merged cell; the spanned-over column
309    /// positions are omitted from `cells`.
310    #[serde(default = "one", skip_serializing_if = "is_one")]
311    pub col_span: usize,
312    /// Number of grid rows this cell spans (1 for an ordinary cell).
313    #[serde(default = "one", skip_serializing_if = "is_one")]
314    pub row_span: usize,
315}
316
317#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
318pub struct SourceAnchor {
319    pub page_number: usize,
320    #[serde(default, skip_serializing_if = "Vec::is_empty")]
321    pub pdf_object_ids: Vec<String>,
322    #[serde(default, skip_serializing_if = "Option::is_none")]
323    pub bbox: Option<BBox>,
324    pub extraction_method: String,
325}
326
327#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
328pub struct Confidence {
329    pub score: f32,
330    #[serde(default)]
331    pub calibrated: bool,
332}
333
334#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
335pub struct Warning {
336    pub code: String,
337    pub severity: String,
338    pub message: String,
339    #[serde(default, skip_serializing_if = "Option::is_none")]
340    pub source_anchor: Option<SourceAnchor>,
341}
342
343#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
344pub struct Asset {
345    pub id: String,
346    pub kind: String,
347    #[serde(default, skip_serializing_if = "Option::is_none")]
348    pub object_id: Option<String>,
349    #[serde(default, skip_serializing_if = "Option::is_none")]
350    pub bbox: Option<BBox>,
351    #[serde(default, skip_serializing_if = "Option::is_none")]
352    pub width: Option<u32>,
353    #[serde(default, skip_serializing_if = "Option::is_none")]
354    pub height: Option<u32>,
355}
356
357#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
358pub struct ImageObject {
359    pub id: String,
360    #[serde(default, skip_serializing_if = "Option::is_none")]
361    pub object_id: Option<String>,
362    #[serde(default, skip_serializing_if = "Option::is_none")]
363    pub bbox: Option<BBox>,
364    #[serde(default, skip_serializing_if = "Option::is_none")]
365    pub width: Option<u32>,
366    #[serde(default, skip_serializing_if = "Option::is_none")]
367    pub height: Option<u32>,
368}
369
370pub fn default_schema_version() -> String {
371    SCHEMA_VERSION.to_owned()
372}
373
/// Serde default for `ExtractOptions::include_geometry`: enabled.
fn default_include_geometry() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
377
/// Serde default for `ExtractOptions::include_assets`: enabled.
fn default_include_assets() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
381
/// Serde default for the `TableCell` span fields: a span of one.
fn one() -> usize {
    1_usize
}
385
/// Serde `skip_serializing_if` predicate: true exactly when the value is 1.
fn is_one(value: &usize) -> bool {
    matches!(*value, 1)
}
389
/// Serde `skip_serializing_if` predicate: true exactly when the flag is unset.
fn is_false(value: &bool) -> bool {
    let flag = *value;
    !flag
}