Skip to main content

a3s_code_core/doc/
model.rs

1use serde::{Deserialize, Serialize};
2use std::collections::BTreeMap;
3
/// Category tag stored in the `kind` field of every `DocumentBlock`.
///
/// The variants are plain unit tags; this file attaches no behavior to them
/// beyond comparison and (de)serialization through the derives below.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DocumentBlockKind {
    Paragraph,
    Heading,
    Table,
    Section,
    Metadata,
    Slide,
    EmailHeader,
    Code,
    /// Kind given to the single block created by `ParsedDocument::from_text`.
    Raw,
}
16
/// Position information attached to a block.
///
/// Every field is either optional or a flag, so the derived `Default` value
/// (all `None` / `false`) describes a block with no known position.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentBlockLocation {
    /// Source reference; set by `DocumentBlock::with_source`.
    pub source: Option<String>,
    /// Page number; set by `DocumentBlock::with_page`.
    /// NOTE(review): `PageInfo::page` is documented as 1-indexed; presumably
    /// the same convention applies here — confirm.
    pub page: Option<usize>,
    /// Ordinal position; set by `DocumentBlock::with_ordinal`.
    /// NOTE(review): what the ordinal is relative to (page or whole document)
    /// is not visible in this file — confirm with the producers.
    pub ordinal: Option<usize>,
    /// Flag set by `DocumentBlock::with_continued_from_previous_page`.
    pub continued_from_previous_page: bool,
    /// Flag set by `DocumentBlock::with_continued_to_next_page`.
    pub continued_to_next_page: bool,
}
25
/// Optional names describing what produced a piece of document metadata.
///
/// All three fields are free-form strings and default to `None`.
/// NOTE(review): the exact distinction between parser, extractor and provider
/// is not established anywhere in this file — confirm with the producers.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentProvenance {
    pub parser: Option<String>,
    pub extractor: Option<String>,
    pub provider: Option<String>,
}
32
/// Optional confidence information carried inside `DocumentMetadata`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentConfidence {
    /// Confidence score as a percentage.
    /// NOTE(review): presumably in the range 0–100; the `u8` type admits values
    /// up to 255 and nothing in this file validates the range.
    pub score_percent: Option<u8>,
    /// Free-form textual label accompanying the score.
    pub label: Option<String>,
}
38
/// Descriptive metadata that can be attached to a whole `ParsedDocument` or to
/// an individual `DocumentBlock` (both expose a `metadata` field of this type).
///
/// The derived `Default` yields no MIME type, file type or language, an empty
/// attribute map, and no provenance or confidence.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentMetadata {
    pub source_mime_type: Option<String>,
    pub detected_file_type: Option<String>,
    pub language: Option<String>,
    /// Free-form key/value pairs; a `BTreeMap` keeps them ordered by key.
    pub attributes: BTreeMap<String, String>,
    pub provenance: Option<DocumentProvenance>,
    pub confidence: Option<DocumentConfidence>,
}
48
/// One unit of parsed document content.
///
/// Construct with `DocumentBlock::new` and refine with the `with_*` builder
/// methods; all fields are public and may also be set directly.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentBlock {
    /// Category of this block.
    pub kind: DocumentBlockKind,
    /// Optional label; `ParsedDocument::to_text` emits it on its own line
    /// before the content when it is not blank.
    pub label: Option<String>,
    /// Text content of the block.
    pub content: String,
    /// Position information; created on demand by the location builders.
    pub location: Option<DocumentBlockLocation>,
    /// Free-form key/value pairs; a `BTreeMap` keeps them ordered by key.
    pub attributes: BTreeMap<String, String>,
    /// Optional structured payload carried as a string.
    /// NOTE(review): the encoding (e.g. JSON) is not visible here — confirm.
    pub structured_payload: Option<String>,
    /// Optional block-level metadata.
    pub metadata: Option<DocumentMetadata>,
}
59
60impl DocumentBlock {
61    pub fn new(
62        kind: DocumentBlockKind,
63        label: Option<impl Into<String>>,
64        content: impl Into<String>,
65    ) -> Self {
66        Self {
67            kind,
68            label: label.map(Into::into),
69            content: content.into(),
70            location: None,
71            attributes: BTreeMap::new(),
72            structured_payload: None,
73            metadata: None,
74        }
75    }
76
77    pub fn with_attribute(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
78        self.attributes.insert(key.into(), value.into());
79        self
80    }
81
82    pub fn with_structured_payload(mut self, payload: impl Into<String>) -> Self {
83        self.structured_payload = Some(payload.into());
84        self
85    }
86
87    pub fn with_metadata(mut self, metadata: DocumentMetadata) -> Self {
88        self.metadata = Some(metadata);
89        self
90    }
91
92    pub fn with_source(mut self, source: impl Into<String>) -> Self {
93        self.location
94            .get_or_insert_with(DocumentBlockLocation::default)
95            .source = Some(source.into());
96        self
97    }
98
99    pub fn with_page(mut self, page: usize) -> Self {
100        self.location
101            .get_or_insert_with(DocumentBlockLocation::default)
102            .page = Some(page);
103        self
104    }
105
106    pub fn with_ordinal(mut self, ordinal: usize) -> Self {
107        self.location
108            .get_or_insert_with(DocumentBlockLocation::default)
109            .ordinal = Some(ordinal);
110        self
111    }
112
113    pub fn with_continued_from_previous_page(mut self, continued: bool) -> Self {
114        self.location
115            .get_or_insert_with(DocumentBlockLocation::default)
116            .continued_from_previous_page = continued;
117        self
118    }
119
120    pub fn with_continued_to_next_page(mut self, continued: bool) -> Self {
121        self.location
122            .get_or_insert_with(DocumentBlockLocation::default)
123            .continued_to_next_page = continued;
124        self
125    }
126}
127
/// Structured table representation for stable machine-readable table output.
/// This provides a consistent format for table data across all document types.
///
/// `row_count` and `column_count` are stored exactly as supplied to
/// `StructuredTable::new`; they are not derived from `rows`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StructuredTable {
    /// Index of the table block in the document blocks array.
    pub index: usize,
    /// Page number where this table appears, if known.
    pub page: Option<usize>,
    /// Source file or section reference.
    pub source: Option<String>,
    /// Number of data rows (excluding header).
    pub row_count: usize,
    /// Number of columns.
    pub column_count: usize,
    /// Header row values, if detected.
    ///
    /// `StructuredTable::new` fills this with a copy of the first entry of
    /// `rows` (or leaves it empty when there are no rows).
    pub headers: Vec<String>,
    /// All data rows (excluding header if detected).
    ///
    /// NOTE(review): `StructuredTable::new` stores the rows it is given
    /// unchanged, so the row copied into `headers` is still present here —
    /// confirm whether callers rely on that or whether it should be removed.
    pub rows: Vec<Vec<String>>,
    /// Location metadata for this table.
    pub location: Option<DocumentBlockLocation>,
    /// Original block label.
    pub label: Option<String>,
}
151
152impl StructuredTable {
153    /// Create a new structured table from block data.
154    pub fn new(
155        index: usize,
156        row_count: usize,
157        column_count: usize,
158        rows: Vec<Vec<String>>,
159    ) -> Self {
160        let headers = rows.first().cloned().unwrap_or_default();
161        Self {
162            index,
163            page: None,
164            source: None,
165            row_count,
166            column_count,
167            headers,
168            rows,
169            location: None,
170            label: None,
171        }
172    }
173
174    /// Set the page number.
175    pub fn with_page(mut self, page: usize) -> Self {
176        self.page = Some(page);
177        self
178    }
179
180    /// Set the source.
181    pub fn with_source(mut self, source: impl Into<String>) -> Self {
182        self.source = Some(source.into());
183        self
184    }
185
186    /// Set the location.
187    pub fn with_location(mut self, location: DocumentBlockLocation) -> Self {
188        self.location = Some(location);
189        self
190    }
191
192    /// Set the label.
193    pub fn with_label(mut self, label: impl Into<String>) -> Self {
194        self.label = Some(label.into());
195        self
196    }
197}
198
/// Page-level information for documents that support page structure.
///
/// `PageInfo::new` sets only `page`; builder methods exist for `source` and
/// `block_count`, and the remaining public fields are assigned directly.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageInfo {
    /// Page number (1-indexed).
    pub page: usize,
    /// Source file or section reference.
    pub source: Option<String>,
    /// Number of blocks on this page.
    pub block_count: usize,
    /// Section headings or labels on this page.
    pub labels: Vec<String>,
    /// First few lines of content for preview.
    /// `StructuredElement::from_page` uses this as the element content
    /// (an empty string when it is `None`).
    pub preview: Option<String>,
    /// Whether content continues from previous page.
    pub continued_from_previous_page: bool,
    /// Whether content continues to next page.
    pub continued_to_next_page: bool,
}
217
218impl PageInfo {
219    /// Create a new page info.
220    pub fn new(page: usize) -> Self {
221        Self {
222            page,
223            source: None,
224            block_count: 0,
225            labels: Vec::new(),
226            preview: None,
227            continued_from_previous_page: false,
228            continued_to_next_page: false,
229        }
230    }
231
232    /// Set the source.
233    pub fn with_source(mut self, source: impl Into<String>) -> Self {
234        self.source = Some(source.into());
235        self
236    }
237
238    /// Set block count.
239    pub fn with_block_count(mut self, count: usize) -> Self {
240        self.block_count = count;
241        self
242    }
243}
244
/// Unified element kinds for structured element output.
///
/// Each variant corresponds to one `StructuredElement` constructor:
/// `from_block`, `from_table` and `from_page` respectively.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum StructuredElementKind {
    /// A document block (paragraph, heading, etc.)
    Block,
    /// A table element
    Table,
    /// A page element
    Page,
}
255
256impl StructuredElementKind {
257    pub fn as_str(&self) -> &'static str {
258        match self {
259            Self::Block => "block",
260            Self::Table => "table",
261            Self::Page => "page",
262        }
263    }
264}
265
/// Unified element representation for stable machine-readable element output.
/// Combines blocks, tables, and pages into a single indexed array for downstream consumption.
///
/// Instances are produced by `from_block`, `from_table` and `from_page`;
/// `ParsedDocument::build_elements` calls those to fill `ParsedDocument::elements`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct StructuredElement {
    /// Element index in the flattened elements array.
    pub index: usize,
    /// Element type for discrimination.
    pub kind: StructuredElementKind,
    /// Element label/name if available (always `None` for page elements).
    pub label: Option<String>,
    /// Element content or summary: block content, tab/newline-joined table
    /// rows, or the page preview.
    pub content: String,
    /// Page number if available.
    pub page: Option<usize>,
    /// Source/location reference.
    pub source: Option<String>,
    /// Location metadata (always `None` for page elements).
    pub location: Option<DocumentBlockLocation>,
    /// Extended attributes for kind-specific data.
    pub attributes: BTreeMap<String, String>,
    /// Structured payload for tables and other structured content.
    /// Only `from_block` copies a payload; `from_table` and `from_page`
    /// leave this as `None`.
    pub structured_payload: Option<String>,
}
289
290impl StructuredElement {
291    /// Create a new structured element from a document block.
292    pub fn from_block(block: &DocumentBlock, index: usize) -> Self {
293        Self {
294            index,
295            kind: StructuredElementKind::Block,
296            label: block.label.clone(),
297            content: block.content.clone(),
298            page: block.location.as_ref().and_then(|l| l.page),
299            source: block.location.as_ref().and_then(|l| l.source.clone()),
300            location: block.location.clone(),
301            attributes: block.attributes.clone(),
302            structured_payload: block.structured_payload.clone(),
303        }
304    }
305
306    /// Create a new structured element from a structured table.
307    pub fn from_table(table: &StructuredTable, index: usize) -> Self {
308        let content = if table.rows.is_empty() {
309            String::new()
310        } else {
311            table
312                .rows
313                .iter()
314                .map(|row| row.join("\t"))
315                .collect::<Vec<_>>()
316                .join("\n")
317        };
318
319        let mut attributes = BTreeMap::new();
320        attributes.insert("row_count".to_string(), table.row_count.to_string());
321        attributes.insert("column_count".to_string(), table.column_count.to_string());
322
323        Self {
324            index,
325            kind: StructuredElementKind::Table,
326            label: table.label.clone(),
327            content,
328            page: table.page,
329            source: table.source.clone(),
330            location: table.location.clone(),
331            attributes,
332            structured_payload: None,
333        }
334    }
335
336    /// Create a new structured element from a page info.
337    pub fn from_page(page: &PageInfo, index: usize) -> Self {
338        Self {
339            index,
340            kind: StructuredElementKind::Page,
341            label: None,
342            content: page.preview.clone().unwrap_or_default(),
343            page: Some(page.page),
344            source: page.source.clone(),
345            location: None,
346            attributes: {
347                let mut attrs = BTreeMap::new();
348                attrs.insert("block_count".to_string(), page.block_count.to_string());
349                if page.continued_from_previous_page {
350                    attrs.insert(
351                        "continued_from_previous_page".to_string(),
352                        "true".to_string(),
353                    );
354                }
355                if page.continued_to_next_page {
356                    attrs.insert("continued_to_next_page".to_string(), "true".to_string());
357                }
358                attrs
359            },
360            structured_payload: None,
361        }
362    }
363}
364
/// A parsed document: an optional title, an ordered list of content blocks,
/// and derived table, page and element views.
///
/// The derived `Default` is an empty document (no title, no metadata, all
/// lists empty).
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedDocument {
    /// Document title; `to_text` emits it first when it is not blank.
    pub title: Option<String>,
    /// Content blocks in document order.
    pub blocks: Vec<DocumentBlock>,
    /// Optional document-level metadata.
    pub metadata: Option<DocumentMetadata>,
    /// Stable table output for machine-readable consumption.
    pub tables: Vec<StructuredTable>,
    /// Page-level information for documents with page structure.
    pub pages: Vec<PageInfo>,
    /// Unified elements array combining blocks, tables, and pages.
    ///
    /// Only `build_elements` fills this; it is not refreshed automatically
    /// when `blocks`, `tables` or `pages` change afterwards (for example,
    /// `push` only appends to `blocks`).
    pub elements: Vec<StructuredElement>,
}
377
378impl ParsedDocument {
379    pub fn new() -> Self {
380        Self::default()
381    }
382
383    pub fn from_text(text: impl Into<String>) -> Self {
384        Self {
385            title: None,
386            blocks: vec![DocumentBlock::new(
387                DocumentBlockKind::Raw,
388                None::<String>,
389                text,
390            )],
391            metadata: None,
392            tables: Vec::new(),
393            pages: Vec::new(),
394            elements: Vec::new(),
395        }
396    }
397
398    pub fn with_title(mut self, title: impl Into<String>) -> Self {
399        self.title = Some(title.into());
400        self
401    }
402
403    pub fn push(&mut self, block: DocumentBlock) {
404        self.blocks.push(block);
405    }
406
407    pub fn with_metadata(mut self, metadata: DocumentMetadata) -> Self {
408        self.metadata = Some(metadata);
409        self
410    }
411
412    pub fn block_count(&self) -> usize {
413        self.blocks.len()
414    }
415
416    /// Build the unified elements array from blocks, tables, and pages.
417    /// Elements are ordered: blocks first, then tables, then pages.
418    pub fn build_elements(&mut self) {
419        let mut elements = Vec::new();
420        let mut index = 0;
421
422        // Add blocks
423        for block in &self.blocks {
424            elements.push(StructuredElement::from_block(block, index));
425            index += 1;
426        }
427
428        // Add tables
429        for table in &self.tables {
430            elements.push(StructuredElement::from_table(table, index));
431            index += 1;
432        }
433
434        // Add pages
435        for page in &self.pages {
436            elements.push(StructuredElement::from_page(page, index));
437            index += 1;
438        }
439
440        self.elements = elements;
441    }
442
443    pub fn non_empty_block_count(&self) -> usize {
444        self.blocks
445            .iter()
446            .filter(|block| !block.content.trim().is_empty())
447            .count()
448    }
449
450    pub fn char_count(&self) -> usize {
451        self.to_text().chars().count()
452    }
453
454    pub fn is_empty(&self) -> bool {
455        self.blocks.iter().all(|b| b.content.trim().is_empty())
456    }
457
458    pub fn to_text(&self) -> String {
459        let mut parts = Vec::new();
460        if let Some(title) = &self.title {
461            if !title.trim().is_empty() {
462                parts.push(title.trim().to_string());
463            }
464        }
465        for block in &self.blocks {
466            let mut chunk = String::new();
467            if let Some(label) = &block.label {
468                if !label.trim().is_empty() {
469                    chunk.push_str(label.trim());
470                    chunk.push('\n');
471                }
472            }
473            chunk.push_str(block.content.trim());
474            if !chunk.trim().is_empty() {
475                parts.push(chunk.trim().to_string());
476            }
477        }
478        parts.join("\n\n")
479    }
480}
481
/// A `ParsedDocument` paired with optional metadata about how it was extracted.
///
/// Can be built with `ExtractedDocument::new` or via `From<ParsedDocument>`;
/// both leave `extraction_metadata` as `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExtractedDocument {
    /// The parsed document itself.
    pub document: ParsedDocument,
    /// Extraction details; set with `with_extraction_metadata`.
    pub extraction_metadata: Option<DocumentExtractionMetadata>,
}
487
/// Optional free-form details about the extraction that produced an
/// `ExtractedDocument`. Every field defaults to `None`.
///
/// NOTE(review): the format of `parser_signature` and the relationship to
/// `DocumentProvenance` / `DocumentMetadata::detected_file_type` are not
/// visible in this file — confirm with the producers.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentExtractionMetadata {
    pub parser_name: Option<String>,
    pub parser_signature: Option<String>,
    pub extractor: Option<String>,
    pub detected_file_type: Option<String>,
}
495
496impl ExtractedDocument {
497    pub fn new(document: ParsedDocument) -> Self {
498        Self {
499            document,
500            extraction_metadata: None,
501        }
502    }
503
504    pub fn into_parsed_document(self) -> ParsedDocument {
505        self.document
506    }
507
508    pub fn as_parsed_document(&self) -> &ParsedDocument {
509        &self.document
510    }
511
512    pub fn with_extraction_metadata(mut self, metadata: DocumentExtractionMetadata) -> Self {
513        self.extraction_metadata = Some(metadata);
514        self
515    }
516}
517
518impl From<ParsedDocument> for ExtractedDocument {
519    fn from(value: ParsedDocument) -> Self {
520        Self::new(value)
521    }
522}