1use serde::{Deserialize, Serialize};
2use std::collections::BTreeMap;
3
/// Category tag stored in the `kind` field of every [`DocumentBlock`].
///
/// All variants are unit variants; the tag carries no payload of its own.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocumentBlockKind {
    Paragraph,
    Heading,
    Table,
    Section,
    Metadata,
    Slide,
    EmailHeader,
    Code,
    /// The kind `ParsedDocument::from_text` assigns to the single block it creates.
    Raw,
}
16
/// Optional positional information attached to a block or table.
///
/// `Default` yields a location with every option unset and both
/// continuation flags `false`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentBlockLocation {
    /// Source identifier (set by `DocumentBlock::with_source`); its format is
    /// not constrained by this type.
    pub source: Option<String>,
    /// Page associated with the item (set by `DocumentBlock::with_page`).
    /// NOTE(review): whether numbering is zero- or one-based is not established here.
    pub page: Option<usize>,
    /// Ordinal set by `DocumentBlock::with_ordinal`.
    /// NOTE(review): presumably the position in reading order — confirm with producers.
    pub ordinal: Option<usize>,
    /// Flag set by `DocumentBlock::with_continued_from_previous_page`.
    pub continued_from_previous_page: bool,
    /// Flag set by `DocumentBlock::with_continued_to_next_page`.
    pub continued_to_next_page: bool,
}
25
/// Free-form names describing what produced a piece of document metadata.
///
/// Every field is optional and stored as an uninterpreted string.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentProvenance {
    /// Name of the parser, if recorded.
    pub parser: Option<String>,
    /// Name of the extractor, if recorded.
    pub extractor: Option<String>,
    /// Name of the provider, if recorded.
    pub provider: Option<String>,
}
32
/// Optional confidence information attached to [`DocumentMetadata`].
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentConfidence {
    /// Numeric score.
    /// NOTE(review): presumably 0–100, but `u8` permits up to 255 and nothing
    /// here enforces a range — confirm with producers.
    pub score_percent: Option<u8>,
    /// Free-form textual label for the confidence level.
    pub label: Option<String>,
}
38
/// Descriptive metadata that can be attached to a whole document or to a
/// single [`DocumentBlock`].
///
/// `Default` yields all options unset and an empty attribute map.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// MIME type of the source, stored as an uninterpreted string.
    pub source_mime_type: Option<String>,
    /// Detected file type, stored as an uninterpreted string.
    pub detected_file_type: Option<String>,
    /// Language, stored as an uninterpreted string.
    /// NOTE(review): tag format (e.g. BCP 47) is not established here.
    pub language: Option<String>,
    /// Arbitrary string key/value pairs, kept in key order by the `BTreeMap`.
    pub attributes: BTreeMap<String, String>,
    /// Where the metadata came from, if recorded.
    pub provenance: Option<DocumentProvenance>,
    /// Confidence information, if recorded.
    pub confidence: Option<DocumentConfidence>,
}
48
/// One unit of document content together with its optional annotations.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentBlock {
    /// Category of the block.
    pub kind: DocumentBlockKind,
    /// Optional label; `ParsedDocument::to_text` emits it on its own line
    /// before the content when it is not blank.
    pub label: Option<String>,
    /// Text content of the block.
    pub content: String,
    /// Positional information; created on demand by the location builders.
    pub location: Option<DocumentBlockLocation>,
    /// Arbitrary string key/value pairs, kept in key order by the `BTreeMap`.
    pub attributes: BTreeMap<String, String>,
    /// Optional payload stored as a string.
    /// NOTE(review): the encoding (e.g. JSON) is not established here.
    pub structured_payload: Option<String>,
    /// Optional metadata specific to this block.
    pub metadata: Option<DocumentMetadata>,
}
59
60impl DocumentBlock {
61 pub fn new(
62 kind: DocumentBlockKind,
63 label: Option<impl Into<String>>,
64 content: impl Into<String>,
65 ) -> Self {
66 Self {
67 kind,
68 label: label.map(Into::into),
69 content: content.into(),
70 location: None,
71 attributes: BTreeMap::new(),
72 structured_payload: None,
73 metadata: None,
74 }
75 }
76
77 pub fn with_attribute(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
78 self.attributes.insert(key.into(), value.into());
79 self
80 }
81
82 pub fn with_structured_payload(mut self, payload: impl Into<String>) -> Self {
83 self.structured_payload = Some(payload.into());
84 self
85 }
86
87 pub fn with_metadata(mut self, metadata: DocumentMetadata) -> Self {
88 self.metadata = Some(metadata);
89 self
90 }
91
92 pub fn with_source(mut self, source: impl Into<String>) -> Self {
93 self.location
94 .get_or_insert_with(DocumentBlockLocation::default)
95 .source = Some(source.into());
96 self
97 }
98
99 pub fn with_page(mut self, page: usize) -> Self {
100 self.location
101 .get_or_insert_with(DocumentBlockLocation::default)
102 .page = Some(page);
103 self
104 }
105
106 pub fn with_ordinal(mut self, ordinal: usize) -> Self {
107 self.location
108 .get_or_insert_with(DocumentBlockLocation::default)
109 .ordinal = Some(ordinal);
110 self
111 }
112
113 pub fn with_continued_from_previous_page(mut self, continued: bool) -> Self {
114 self.location
115 .get_or_insert_with(DocumentBlockLocation::default)
116 .continued_from_previous_page = continued;
117 self
118 }
119
120 pub fn with_continued_to_next_page(mut self, continued: bool) -> Self {
121 self.location
122 .get_or_insert_with(DocumentBlockLocation::default)
123 .continued_to_next_page = continued;
124 self
125 }
126}
127
/// A table extracted from a document, stored as rows of string cells.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StructuredTable {
    /// Index supplied by the caller of `StructuredTable::new`.
    pub index: usize,
    /// Page associated with the table (set by `with_page`).
    pub page: Option<usize>,
    /// Source identifier (set by `with_source`).
    pub source: Option<String>,
    /// Row count supplied by the caller of `new`; it is not recomputed from `rows`.
    pub row_count: usize,
    /// Column count supplied by the caller of `new`; it is not recomputed from `rows`.
    pub column_count: usize,
    /// `new` fills this with a copy of the first row (empty when there are no rows).
    pub headers: Vec<String>,
    /// All rows passed to `new`, including the first row that `headers` copies.
    pub rows: Vec<Vec<String>>,
    /// Optional positional information (set by `with_location`); it is stored
    /// independently of `page` and `source`.
    pub location: Option<DocumentBlockLocation>,
    /// Optional label (set by `with_label`).
    pub label: Option<String>,
}
151
152impl StructuredTable {
153 pub fn new(
155 index: usize,
156 row_count: usize,
157 column_count: usize,
158 rows: Vec<Vec<String>>,
159 ) -> Self {
160 let headers = rows.first().cloned().unwrap_or_default();
161 Self {
162 index,
163 page: None,
164 source: None,
165 row_count,
166 column_count,
167 headers,
168 rows,
169 location: None,
170 label: None,
171 }
172 }
173
174 pub fn with_page(mut self, page: usize) -> Self {
176 self.page = Some(page);
177 self
178 }
179
180 pub fn with_source(mut self, source: impl Into<String>) -> Self {
182 self.source = Some(source.into());
183 self
184 }
185
186 pub fn with_location(mut self, location: DocumentBlockLocation) -> Self {
188 self.location = Some(location);
189 self
190 }
191
192 pub fn with_label(mut self, label: impl Into<String>) -> Self {
194 self.label = Some(label.into());
195 self
196 }
197}
198
/// Summary information about a single page of a document.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageInfo {
    /// Page number given to `PageInfo::new`.
    /// NOTE(review): whether numbering is zero- or one-based is not established here.
    pub page: usize,
    /// Source identifier (set by `with_source`).
    pub source: Option<String>,
    /// Number of blocks on the page (set by `with_block_count`; `new` starts at 0).
    pub block_count: usize,
    /// Labels associated with the page; `new` starts with an empty list.
    pub labels: Vec<String>,
    /// Optional preview text; `StructuredElement::from_page` uses it as the
    /// element content.
    pub preview: Option<String>,
    /// Continuation flag; when `true`, `StructuredElement::from_page` records
    /// it as an attribute.
    pub continued_from_previous_page: bool,
    /// Continuation flag; when `true`, `StructuredElement::from_page` records
    /// it as an attribute.
    pub continued_to_next_page: bool,
}
217
218impl PageInfo {
219 pub fn new(page: usize) -> Self {
221 Self {
222 page,
223 source: None,
224 block_count: 0,
225 labels: Vec::new(),
226 preview: None,
227 continued_from_previous_page: false,
228 continued_to_next_page: false,
229 }
230 }
231
232 pub fn with_source(mut self, source: impl Into<String>) -> Self {
234 self.source = Some(source.into());
235 self
236 }
237
238 pub fn with_block_count(mut self, count: usize) -> Self {
240 self.block_count = count;
241 self
242 }
243}
244
/// Identifies which kind of item a [`StructuredElement`] was built from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum StructuredElementKind {
    /// Assigned by `StructuredElement::from_block`.
    Block,
    /// Assigned by `StructuredElement::from_table`.
    Table,
    /// Assigned by `StructuredElement::from_page`.
    Page,
}
255
256impl StructuredElementKind {
257 pub fn as_str(&self) -> &'static str {
258 match self {
259 Self::Block => "block",
260 Self::Table => "table",
261 Self::Page => "page",
262 }
263 }
264}
265
/// A uniform, flattened view of a block, table or page.
///
/// Instances are produced by `StructuredElement::from_block`,
/// `from_table` and `from_page`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StructuredElement {
    /// Index supplied by the caller of the `from_*` constructor.
    pub index: usize,
    /// Which kind of item this element was built from.
    pub kind: StructuredElementKind,
    /// Label copied from the originating block or table; `None` for pages.
    pub label: Option<String>,
    /// Text content: the block's content, the table's rows joined with tabs
    /// and newlines, or the page's preview.
    pub content: String,
    /// Page associated with the element, when known.
    pub page: Option<usize>,
    /// Source identifier associated with the element, when known.
    pub source: Option<String>,
    /// Copy of the originating item's location; `None` for pages.
    pub location: Option<DocumentBlockLocation>,
    /// String key/value pairs, kept in key order by the `BTreeMap`.
    pub attributes: BTreeMap<String, String>,
    /// Structured payload copied from the originating block; `None` for
    /// tables and pages.
    pub structured_payload: Option<String>,
}
289
290impl StructuredElement {
291 pub fn from_block(block: &DocumentBlock, index: usize) -> Self {
293 Self {
294 index,
295 kind: StructuredElementKind::Block,
296 label: block.label.clone(),
297 content: block.content.clone(),
298 page: block.location.as_ref().and_then(|l| l.page),
299 source: block.location.as_ref().and_then(|l| l.source.clone()),
300 location: block.location.clone(),
301 attributes: block.attributes.clone(),
302 structured_payload: block.structured_payload.clone(),
303 }
304 }
305
306 pub fn from_table(table: &StructuredTable, index: usize) -> Self {
308 let content = if table.rows.is_empty() {
309 String::new()
310 } else {
311 table
312 .rows
313 .iter()
314 .map(|row| row.join("\t"))
315 .collect::<Vec<_>>()
316 .join("\n")
317 };
318
319 let mut attributes = BTreeMap::new();
320 attributes.insert("row_count".to_string(), table.row_count.to_string());
321 attributes.insert("column_count".to_string(), table.column_count.to_string());
322
323 Self {
324 index,
325 kind: StructuredElementKind::Table,
326 label: table.label.clone(),
327 content,
328 page: table.page,
329 source: table.source.clone(),
330 location: table.location.clone(),
331 attributes,
332 structured_payload: None,
333 }
334 }
335
336 pub fn from_page(page: &PageInfo, index: usize) -> Self {
338 Self {
339 index,
340 kind: StructuredElementKind::Page,
341 label: None,
342 content: page.preview.clone().unwrap_or_default(),
343 page: Some(page.page),
344 source: page.source.clone(),
345 location: None,
346 attributes: {
347 let mut attrs = BTreeMap::new();
348 attrs.insert("block_count".to_string(), page.block_count.to_string());
349 if page.continued_from_previous_page {
350 attrs.insert(
351 "continued_from_previous_page".to_string(),
352 "true".to_string(),
353 );
354 }
355 if page.continued_to_next_page {
356 attrs.insert("continued_to_next_page".to_string(), "true".to_string());
357 }
358 attrs
359 },
360 structured_payload: None,
361 }
362 }
363}
364
/// A parsed document: an optional title, its content blocks, and optional
/// metadata, tables, page summaries and a flattened element list.
///
/// `Default` yields no title, no metadata and empty collections.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedDocument {
    /// Optional title; `to_text` emits it first when it is not blank.
    pub title: Option<String>,
    /// Content blocks, in document order as pushed.
    pub blocks: Vec<DocumentBlock>,
    /// Optional document-level metadata.
    pub metadata: Option<DocumentMetadata>,
    /// Tables associated with the document.
    pub tables: Vec<StructuredTable>,
    /// Per-page summaries.
    pub pages: Vec<PageInfo>,
    /// Flattened view of blocks, tables and pages. It is replaced wholesale
    /// by `build_elements` and is not kept in sync automatically.
    pub elements: Vec<StructuredElement>,
}
377
378impl ParsedDocument {
379 pub fn new() -> Self {
380 Self::default()
381 }
382
383 pub fn from_text(text: impl Into<String>) -> Self {
384 Self {
385 title: None,
386 blocks: vec![DocumentBlock::new(
387 DocumentBlockKind::Raw,
388 None::<String>,
389 text,
390 )],
391 metadata: None,
392 tables: Vec::new(),
393 pages: Vec::new(),
394 elements: Vec::new(),
395 }
396 }
397
398 pub fn with_title(mut self, title: impl Into<String>) -> Self {
399 self.title = Some(title.into());
400 self
401 }
402
403 pub fn push(&mut self, block: DocumentBlock) {
404 self.blocks.push(block);
405 }
406
407 pub fn with_metadata(mut self, metadata: DocumentMetadata) -> Self {
408 self.metadata = Some(metadata);
409 self
410 }
411
412 pub fn block_count(&self) -> usize {
413 self.blocks.len()
414 }
415
416 pub fn build_elements(&mut self) {
419 let mut elements = Vec::new();
420 let mut index = 0;
421
422 for block in &self.blocks {
424 elements.push(StructuredElement::from_block(block, index));
425 index += 1;
426 }
427
428 for table in &self.tables {
430 elements.push(StructuredElement::from_table(table, index));
431 index += 1;
432 }
433
434 for page in &self.pages {
436 elements.push(StructuredElement::from_page(page, index));
437 index += 1;
438 }
439
440 self.elements = elements;
441 }
442
443 pub fn non_empty_block_count(&self) -> usize {
444 self.blocks
445 .iter()
446 .filter(|block| !block.content.trim().is_empty())
447 .count()
448 }
449
450 pub fn char_count(&self) -> usize {
451 self.to_text().chars().count()
452 }
453
454 pub fn is_empty(&self) -> bool {
455 self.blocks.iter().all(|b| b.content.trim().is_empty())
456 }
457
458 pub fn to_text(&self) -> String {
459 let mut parts = Vec::new();
460 if let Some(title) = &self.title {
461 if !title.trim().is_empty() {
462 parts.push(title.trim().to_string());
463 }
464 }
465 for block in &self.blocks {
466 let mut chunk = String::new();
467 if let Some(label) = &block.label {
468 if !label.trim().is_empty() {
469 chunk.push_str(label.trim());
470 chunk.push('\n');
471 }
472 }
473 chunk.push_str(block.content.trim());
474 if !chunk.trim().is_empty() {
475 parts.push(chunk.trim().to_string());
476 }
477 }
478 parts.join("\n\n")
479 }
480}
481
/// A [`ParsedDocument`] paired with optional information about how it was extracted.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedDocument {
    /// The parsed document itself.
    pub document: ParsedDocument,
    /// Extraction details; `None` until set (e.g. via `with_extraction_metadata`).
    pub extraction_metadata: Option<DocumentExtractionMetadata>,
}
487
/// Free-form details about the extraction that produced a document.
///
/// Every field is optional and stored as an uninterpreted string.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentExtractionMetadata {
    /// Name of the parser, if recorded.
    pub parser_name: Option<String>,
    /// Signature of the parser, if recorded.
    /// NOTE(review): what the signature encodes (version, hash, …) is not established here.
    pub parser_signature: Option<String>,
    /// Name of the extractor, if recorded.
    pub extractor: Option<String>,
    /// Detected file type, if recorded.
    pub detected_file_type: Option<String>,
}
495
496impl ExtractedDocument {
497 pub fn new(document: ParsedDocument) -> Self {
498 Self {
499 document,
500 extraction_metadata: None,
501 }
502 }
503
504 pub fn into_parsed_document(self) -> ParsedDocument {
505 self.document
506 }
507
508 pub fn as_parsed_document(&self) -> &ParsedDocument {
509 &self.document
510 }
511
512 pub fn with_extraction_metadata(mut self, metadata: DocumentExtractionMetadata) -> Self {
513 self.extraction_metadata = Some(metadata);
514 self
515 }
516}
517
518impl From<ParsedDocument> for ExtractedDocument {
519 fn from(value: ParsedDocument) -> Self {
520 Self::new(value)
521 }
522}