1use serde::{Deserialize, Serialize};
2
/// Schema identifier string for this document model.
///
/// Used as the fallback value of `Document::schema_version` when the field is
/// missing from deserialized input (via `default_schema_version`).
pub const SCHEMA_VERSION: &str = "dongler.ir.v2";
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
8#[serde(rename_all = "snake_case")]
9pub enum Route {
10 BornDigital,
12 Scanned,
14 Hybrid,
16}
17
18#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
22#[serde(rename_all = "snake_case")]
23pub enum TextSource {
24 TextLayer,
26 Ocr,
28 Vlm,
30 Heuristic,
32}
33
34#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
37pub struct Provenance {
38 pub text_source: TextSource,
39 #[serde(default, skip_serializing_if = "Option::is_none")]
41 pub detector: Option<String>,
42 #[serde(default, skip_serializing_if = "Option::is_none")]
43 pub confidence: Option<f32>,
44}
45
/// Closed set of semantic roles a text block can have.
///
/// The enum converts to and from a snake_case string form through
/// [`BlockKind::as_str`] and [`BlockKind::parse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// Heading carrying a level. Any `u8` can be stored, but every accessor
    /// on this type reports the level clamped to `1..=6`.
    Heading(u8),
    /// Ordinary body text; also the fallback for unrecognized kind strings.
    Paragraph,
    /// A single list entry.
    ListItem,
    /// Code listing.
    Code,
    /// Mathematical formula / equation.
    Formula,
    /// Caption text.
    Caption,
    /// Running header of a page.
    PageHeader,
    /// Running footer of a page.
    PageFooter,
    /// Footnote text.
    Footnote,
}

impl BlockKind {
    /// Smallest heading level exposed by this type.
    const MIN_HEADING_LEVEL: u8 = 1;
    /// Largest heading level exposed by this type.
    const MAX_HEADING_LEVEL: u8 = 6;

    /// Clamps a raw heading level into the supported `1..=6` range.
    fn clamp_level(level: u8) -> u8 {
        level.clamp(Self::MIN_HEADING_LEVEL, Self::MAX_HEADING_LEVEL)
    }

    /// Returns the canonical snake_case name of this kind.
    ///
    /// Headings are rendered as `heading_N` with `N` clamped to `1..=6`; all
    /// other kinds map to a fixed name such as `paragraph` or `page_header`.
    /// The result is accepted by [`BlockKind::parse`].
    pub fn as_str(&self) -> String {
        let name = match self {
            BlockKind::Heading(level) => {
                return format!("heading_{}", Self::clamp_level(*level));
            }
            BlockKind::Paragraph => "paragraph",
            BlockKind::ListItem => "list_item",
            BlockKind::Code => "code",
            BlockKind::Formula => "formula",
            BlockKind::Caption => "caption",
            BlockKind::PageHeader => "page_header",
            BlockKind::PageFooter => "page_footer",
            BlockKind::Footnote => "footnote",
        };
        name.to_owned()
    }

    /// Parses a kind string into a `BlockKind`; never fails.
    ///
    /// * `heading_N` yields a heading whose level is `N` clamped to `1..=6`.
    ///   A numeric level too large for `u8` is treated as the maximum level
    ///   instead of silently degrading to a paragraph.
    /// * A handful of aliases are accepted (`title`, `list`, `equation`,
    ///   `header`, `footer`, and bare `heading`).
    /// * Anything else, including `paragraph` itself, yields
    ///   [`BlockKind::Paragraph`].
    ///
    /// Matching is exact: the input is neither trimmed nor case-folded.
    pub fn parse(kind: &str) -> BlockKind {
        if let Some(suffix) = kind.strip_prefix("heading_") {
            match suffix.parse::<u8>() {
                Ok(level) => return BlockKind::Heading(Self::clamp_level(level)),
                // Overflow means the suffix was a valid but huge number; it
                // would have been clamped to the maximum had it fit in `u8`.
                Err(err) if matches!(err.kind(), std::num::IntErrorKind::PosOverflow) => {
                    return BlockKind::Heading(Self::MAX_HEADING_LEVEL);
                }
                // Not a number at all: fall through to the alias table.
                Err(_) => {}
            }
        }
        match kind {
            "heading" | "title" => BlockKind::Heading(1),
            "list" | "list_item" => BlockKind::ListItem,
            "code" => BlockKind::Code,
            "formula" | "equation" => BlockKind::Formula,
            "caption" => BlockKind::Caption,
            "page_header" | "header" => BlockKind::PageHeader,
            "page_footer" | "footer" => BlockKind::PageFooter,
            "footnote" => BlockKind::Footnote,
            _ => BlockKind::Paragraph,
        }
    }

    /// Returns the heading level for headings and `None` for every other kind.
    ///
    /// The level is clamped to `1..=6`, so it always agrees with the `N` in
    /// the `heading_N` string produced by [`BlockKind::as_str`], even when the
    /// variant was constructed directly with an out-of-range value.
    pub fn heading_level(&self) -> Option<u8> {
        if let BlockKind::Heading(level) = self {
            Some(Self::clamp_level(*level))
        } else {
            None
        }
    }

    /// Returns `true` for page headers and page footers, `false` otherwise.
    pub fn is_page_furniture(&self) -> bool {
        matches!(self, BlockKind::PageHeader | BlockKind::PageFooter)
    }
}
115
116#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
117pub struct Document {
118 #[serde(default = "default_schema_version")]
119 pub schema_version: String,
120 pub metadata: Metadata,
121 pub pages: Vec<Page>,
122 #[serde(default, skip_serializing_if = "Vec::is_empty")]
123 pub assets: Vec<Asset>,
124 #[serde(default, skip_serializing_if = "Vec::is_empty")]
125 pub warnings: Vec<Warning>,
126}
127
128#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
129pub struct Page {
130 pub number: usize,
131 #[serde(default, skip_serializing_if = "Option::is_none")]
132 pub width: Option<f32>,
133 #[serde(default, skip_serializing_if = "Option::is_none")]
134 pub height: Option<f32>,
135 #[serde(default, skip_serializing_if = "Option::is_none")]
136 pub rotation: Option<i32>,
137 #[serde(default, skip_serializing_if = "Option::is_none")]
139 pub route: Option<Route>,
140 #[serde(default, skip_serializing_if = "Option::is_none")]
141 pub bbox: Option<BBox>,
142 pub blocks: Vec<Block>,
143 #[serde(default, skip_serializing_if = "Vec::is_empty")]
144 pub images: Vec<ImageObject>,
145 #[serde(default, skip_serializing_if = "Vec::is_empty")]
146 pub assets: Vec<Asset>,
147 #[serde(default, skip_serializing_if = "Vec::is_empty")]
148 pub warnings: Vec<Warning>,
149}
150
151#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
152#[serde(tag = "type", rename_all = "snake_case")]
153pub enum Block {
154 Text(TextBlock),
155 Table(TableBlock),
156 Figure(FigureBlock),
157}
158
159#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
160pub struct TextBlock {
161 pub text: String,
162 pub kind: String,
163 #[serde(default, skip_serializing_if = "Option::is_none")]
164 pub bbox: Option<BBox>,
165 #[serde(default, skip_serializing_if = "Vec::is_empty")]
166 pub lines: Vec<Line>,
167 #[serde(default, skip_serializing_if = "Vec::is_empty")]
168 pub source_anchors: Vec<SourceAnchor>,
169 #[serde(default, skip_serializing_if = "Option::is_none")]
170 pub confidence: Option<Confidence>,
171 #[serde(default, skip_serializing_if = "Option::is_none")]
173 pub provenance: Option<Provenance>,
174}
175
176#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
177pub struct TableBlock {
178 pub headers: Vec<String>,
179 pub rows: Vec<Vec<String>>,
180 pub caption: Option<String>,
181 #[serde(default, skip_serializing_if = "Option::is_none")]
182 pub bbox: Option<BBox>,
183 #[serde(default, skip_serializing_if = "Vec::is_empty")]
184 pub cells: Vec<TableCell>,
185 #[serde(default, skip_serializing_if = "Option::is_none")]
188 pub html: Option<String>,
189 #[serde(default, skip_serializing_if = "Vec::is_empty")]
190 pub source_anchors: Vec<SourceAnchor>,
191 #[serde(default, skip_serializing_if = "Option::is_none")]
192 pub confidence: Option<Confidence>,
193 #[serde(default, skip_serializing_if = "Option::is_none")]
195 pub provenance: Option<Provenance>,
196}
197
198#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
199pub struct FigureBlock {
200 pub alt_text: Option<String>,
201 pub caption: Option<String>,
202 #[serde(default, skip_serializing_if = "Option::is_none")]
203 pub bbox: Option<BBox>,
204 #[serde(default, skip_serializing_if = "Option::is_none")]
205 pub image_ref: Option<String>,
206 #[serde(default, skip_serializing_if = "Vec::is_empty")]
207 pub source_anchors: Vec<SourceAnchor>,
208 #[serde(default, skip_serializing_if = "Option::is_none")]
209 pub confidence: Option<Confidence>,
210 #[serde(default, skip_serializing_if = "Option::is_none")]
212 pub provenance: Option<Provenance>,
213}
214
215#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
216pub struct Metadata {
217 pub format: String,
218 pub engine: String,
219 pub source: Option<String>,
220 pub title: Option<String>,
221 pub character_count: usize,
222 pub word_count: usize,
223 pub block_count: usize,
224 #[serde(default, skip_serializing_if = "Option::is_none")]
225 pub file_size_bytes: Option<u64>,
226 #[serde(default, skip_serializing_if = "Option::is_none")]
227 pub pdf_version: Option<String>,
228 #[serde(default)]
229 pub encrypted: bool,
230}
231
232#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
233pub struct BatchResult {
234 pub path: String,
235 pub ok: bool,
236 pub document: Option<Document>,
237 pub error: Option<String>,
238}
239
240#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
241pub struct ExtractOptions {
242 #[serde(default = "default_include_geometry")]
243 pub include_geometry: bool,
244 #[serde(default = "default_include_assets")]
245 pub include_assets: bool,
246 #[serde(default, skip_serializing_if = "Option::is_none")]
247 pub max_parallelism: Option<usize>,
248 #[serde(default)]
249 pub suppress_headers_footers: bool,
250 #[serde(default, skip_serializing_if = "Option::is_none")]
251 pub password: Option<String>,
252}
253
254impl Default for ExtractOptions {
255 fn default() -> Self {
256 Self {
257 include_geometry: true,
258 include_assets: true,
259 max_parallelism: None,
260 suppress_headers_footers: false,
261 password: None,
262 }
263 }
264}
265
266#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
267pub struct BBox {
268 pub x: f32,
269 pub y: f32,
270 pub width: f32,
271 pub height: f32,
272}
273
274#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
275pub struct Line {
276 pub text: String,
277 #[serde(default, skip_serializing_if = "Option::is_none")]
278 pub bbox: Option<BBox>,
279 #[serde(default, skip_serializing_if = "Vec::is_empty")]
280 pub spans: Vec<Span>,
281}
282
283#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
284pub struct Span {
285 pub text: String,
286 #[serde(default, skip_serializing_if = "Option::is_none")]
287 pub bbox: Option<BBox>,
288 #[serde(default, skip_serializing_if = "Option::is_none")]
289 pub font: Option<String>,
290 #[serde(default, skip_serializing_if = "Option::is_none")]
291 pub size: Option<f32>,
292 #[serde(default, skip_serializing_if = "is_false")]
293 pub bold: bool,
294 #[serde(default, skip_serializing_if = "is_false")]
295 pub italic: bool,
296}
297
298#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
299pub struct TableCell {
300 pub row: usize,
301 pub column: usize,
302 pub text: String,
303 #[serde(default, skip_serializing_if = "Option::is_none")]
304 pub bbox: Option<BBox>,
305 #[serde(default)]
306 pub is_header: bool,
307 #[serde(default = "one", skip_serializing_if = "is_one")]
311 pub col_span: usize,
312 #[serde(default = "one", skip_serializing_if = "is_one")]
314 pub row_span: usize,
315}
316
317#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
318pub struct SourceAnchor {
319 pub page_number: usize,
320 #[serde(default, skip_serializing_if = "Vec::is_empty")]
321 pub pdf_object_ids: Vec<String>,
322 #[serde(default, skip_serializing_if = "Option::is_none")]
323 pub bbox: Option<BBox>,
324 pub extraction_method: String,
325}
326
327#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
328pub struct Confidence {
329 pub score: f32,
330 #[serde(default)]
331 pub calibrated: bool,
332}
333
334#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
335pub struct Warning {
336 pub code: String,
337 pub severity: String,
338 pub message: String,
339 #[serde(default, skip_serializing_if = "Option::is_none")]
340 pub source_anchor: Option<SourceAnchor>,
341}
342
343#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
344pub struct Asset {
345 pub id: String,
346 pub kind: String,
347 #[serde(default, skip_serializing_if = "Option::is_none")]
348 pub object_id: Option<String>,
349 #[serde(default, skip_serializing_if = "Option::is_none")]
350 pub bbox: Option<BBox>,
351 #[serde(default, skip_serializing_if = "Option::is_none")]
352 pub width: Option<u32>,
353 #[serde(default, skip_serializing_if = "Option::is_none")]
354 pub height: Option<u32>,
355}
356
357#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
358pub struct ImageObject {
359 pub id: String,
360 #[serde(default, skip_serializing_if = "Option::is_none")]
361 pub object_id: Option<String>,
362 #[serde(default, skip_serializing_if = "Option::is_none")]
363 pub bbox: Option<BBox>,
364 #[serde(default, skip_serializing_if = "Option::is_none")]
365 pub width: Option<u32>,
366 #[serde(default, skip_serializing_if = "Option::is_none")]
367 pub height: Option<u32>,
368}
369
370pub fn default_schema_version() -> String {
371 SCHEMA_VERSION.to_owned()
372}
373
/// Serde default for `ExtractOptions::include_geometry`: always `true`.
fn default_include_geometry() -> bool {
    // Geometry is included unless the caller explicitly turns it off.
    true
}
377
/// Serde default for `ExtractOptions::include_assets`: always `true`.
fn default_include_assets() -> bool {
    // Assets are included unless the caller explicitly turns them off.
    true
}
381
/// Serde default for the `TableCell` span fields: a span of one.
fn one() -> usize {
    1_usize
}
385
/// Returns `true` when `value` is exactly `1`.
///
/// Used as a `skip_serializing_if` predicate, which is why it takes its
/// argument by reference even though `usize` is `Copy`.
fn is_one(value: &usize) -> bool {
    matches!(*value, 1)
}
389
/// Returns `true` when `value` is `false`.
///
/// Used as a `skip_serializing_if` predicate, which is why it takes its
/// argument by reference even though `bool` is `Copy`.
fn is_false(value: &bool) -> bool {
    matches!(*value, false)
}