1use serde::{Deserialize, Serialize};
22use serde_json::Value;
23
24use crate::codes::WarningCode;
25use crate::error::{ErrorCode, EthosError};
26use crate::geom::QRect;
27
28#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
30pub struct Document {
31 pub schema_version: String,
33 pub parser: ParserInfo,
35 pub profile: ProfileRef,
37 pub source: SourceInfo,
39 pub config_sha256: String,
41 pub payload_sha256: String,
43 pub fingerprint: String,
45 pub payload: Payload,
47 #[serde(skip_serializing_if = "Option::is_none")]
50 pub diagnostics: Option<Value>,
51}
52
53#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
55pub struct ParserInfo {
56 pub name: String,
58 pub version: String,
60}
61
62#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
64pub struct ProfileRef {
65 pub id: String,
67 pub sha256: String,
69}
70
71#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
73pub struct SourceInfo {
74 pub fingerprint: String,
76 pub bytes: u64,
78}
79
80#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
82pub struct Payload {
83 pub coordinate_system: CoordinateSystem,
85 pub pages: Vec<Page>,
87 pub elements: Vec<Element>,
89 pub spans: Vec<Span>,
91 pub tables: Vec<Table>,
93 pub chunks: Vec<Chunk>,
95 pub regions: Vec<Region>,
97 pub security_warnings: Vec<Warning>,
99 pub parser_warnings: Vec<Warning>,
101}
102
103#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
105pub struct CoordinateSystem {
106 pub origin: String,
108 pub unit: String,
110 pub quantum_per_point: u32,
112}
113
114#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
116pub struct Page {
117 pub id: String,
119 pub index: u32,
121 pub width: i64,
123 pub height: i64,
125 pub rotation: u16,
127}
128
129#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
131#[serde(rename_all = "snake_case")]
132pub enum ElementType {
133 TextBlock,
135 Heading,
137 List,
139 ListItem,
141 Table,
143 Region,
145 Header,
147 Footer,
149 Caption,
151 Other,
153}
154
155#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
157pub struct Element {
158 pub id: String,
160 #[serde(rename = "type")]
162 pub element_type: ElementType,
163 pub page: String,
165 pub bbox: QRect,
167 #[serde(skip_serializing_if = "Option::is_none")]
169 pub text: Option<String>,
170 #[serde(skip_serializing_if = "Option::is_none")]
172 pub heading_level: Option<u8>,
173 #[serde(skip_serializing_if = "Option::is_none")]
175 pub table_ref: Option<String>,
176 #[serde(skip_serializing_if = "Option::is_none")]
178 pub region_ref: Option<String>,
179 #[serde(skip_serializing_if = "Option::is_none")]
181 pub confidence: Option<u16>,
182 #[serde(default, skip_serializing_if = "Vec::is_empty")]
184 pub span_refs: Vec<String>,
185 #[serde(default, skip_serializing_if = "Vec::is_empty")]
187 pub warning_refs: Vec<String>,
188}
189
190#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
192pub struct Span {
193 pub id: String,
195 pub page: String,
197 pub bbox: QRect,
199 #[serde(skip_serializing_if = "Option::is_none")]
201 pub origin_locator: Option<SpanOriginLocator>,
202 pub text: String,
204 #[serde(skip_serializing_if = "Option::is_none")]
206 pub font_id: Option<String>,
207 #[serde(skip_serializing_if = "Option::is_none")]
209 pub font_size_q: Option<i64>,
210 #[serde(skip_serializing_if = "Option::is_none")]
212 pub char_start: Option<u32>,
213 #[serde(skip_serializing_if = "Option::is_none")]
215 pub char_end: Option<u32>,
216 #[serde(default, skip_serializing_if = "Vec::is_empty")]
218 pub warning_refs: Vec<String>,
219}
220
221#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
223pub struct SpanOriginLocator {
224 pub policy: String,
226 pub first_origin: [i64; 2],
228 pub last_origin: [i64; 2],
230}
231
232#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
234pub struct Table {
235 pub id: String,
237 pub page_refs: Vec<String>,
239 pub bbox: QRect,
241 pub n_rows: u32,
243 pub n_cols: u32,
245 pub header_rows: u32,
247 pub header_cols: u32,
249 pub cells: Vec<Cell>,
251 #[serde(skip_serializing_if = "Option::is_none")]
253 pub confidence: Option<u16>,
254 #[serde(default, skip_serializing_if = "Vec::is_empty")]
256 pub warning_refs: Vec<String>,
257 #[serde(skip_serializing_if = "Option::is_none")]
259 pub exports: Option<TableExports>,
260}
261
262#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
264pub struct TableExports {
265 #[serde(skip_serializing_if = "Option::is_none")]
267 pub csv: Option<String>,
268 #[serde(skip_serializing_if = "Option::is_none")]
270 pub markdown: Option<String>,
271}
272
273#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
275pub struct Cell {
276 pub row: u32,
278 pub col: u32,
280 pub row_span: u32,
282 pub col_span: u32,
284 pub bbox: QRect,
286 pub text: String,
288 #[serde(default, skip_serializing_if = "Vec::is_empty")]
290 pub span_refs: Vec<String>,
291 #[serde(default, skip_serializing_if = "Vec::is_empty")]
293 pub element_refs: Vec<String>,
294}
295
296#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
298pub struct Chunk {
299 pub id: String,
301 pub text: String,
303 pub element_refs: Vec<String>,
305 pub page_refs: Vec<String>,
307 pub bboxes: Vec<PageBox>,
309 pub token_estimate: TokenEstimate,
311 #[serde(default, skip_serializing_if = "Vec::is_empty")]
314 pub warning_refs: Vec<String>,
315}
316
317#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
319pub struct PageBox {
320 pub page: String,
322 pub bbox: QRect,
324}
325
326#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
328pub struct TokenEstimate {
329 pub count: u32,
331 pub estimator: String,
333 pub approximate: bool,
335}
336
337#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
339#[serde(rename_all = "snake_case")]
340pub enum RegionKind {
341 Unknown,
343 Image,
345 Figure,
347 Formula,
349 Chart,
351}
352
353#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
355pub struct Region {
356 pub id: String,
358 pub page: String,
360 pub bbox: QRect,
362 pub kind: RegionKind,
364 #[serde(default, skip_serializing_if = "Vec::is_empty")]
366 pub warning_refs: Vec<String>,
367}
368
369#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
371pub struct Warning {
372 pub id: String,
374 pub code: WarningCode,
376 pub message: String,
378 #[serde(skip_serializing_if = "Option::is_none")]
380 pub page: Option<String>,
381 #[serde(skip_serializing_if = "Option::is_none")]
383 pub element_ref: Option<String>,
384 #[serde(skip_serializing_if = "Option::is_none")]
386 pub span_ref: Option<String>,
387 #[serde(skip_serializing_if = "Option::is_none")]
389 pub region_ref: Option<String>,
390}
391
392impl ElementType {
393 pub fn as_str(self) -> &'static str {
395 match self {
396 ElementType::TextBlock => "text_block",
397 ElementType::Heading => "heading",
398 ElementType::List => "list",
399 ElementType::ListItem => "list_item",
400 ElementType::Table => "table",
401 ElementType::Region => "region",
402 ElementType::Header => "header",
403 ElementType::Footer => "footer",
404 ElementType::Caption => "caption",
405 ElementType::Other => "other",
406 }
407 }
408}
409
410fn grounding_element_from_element(e: &Element) -> crate::grounding::GroundingElement {
411 crate::grounding::GroundingElement {
412 id: e.id.clone(),
413 page: e.page.clone(),
414 bbox: e.bbox.to_array(),
415 kind: e.element_type.as_str().to_string(),
416 text: e.text.clone(),
417 }
418}
419
420impl crate::grounding::GroundingSource for Document {
423 fn parser(&self) -> crate::grounding::ParserIdentity {
424 crate::grounding::ParserIdentity {
425 name: self.parser.name.clone(),
426 version: self.parser.version.clone(),
427 adapter: None,
428 adapter_version: None,
429 }
430 }
431
432 fn capabilities(&self) -> crate::grounding::Capabilities {
433 crate::grounding::Capabilities {
434 spans: true,
435 char_offsets: true,
436 tables: true,
437 fingerprint: true,
438 coordinate_origin: crate::grounding::CoordinateOrigin::TopLeft,
439 crop_support: false,
441 }
442 }
443
444 fn fingerprint(&self) -> Option<String> {
445 Some(self.fingerprint.clone())
446 }
447
448 fn pages(&self) -> Vec<crate::grounding::PageGeometry> {
449 self.payload
450 .pages
451 .iter()
452 .map(|p| crate::grounding::PageGeometry {
453 id: p.id.clone(),
454 index: p.index,
455 width: p.width,
456 height: p.height,
457 rotation: p.rotation,
458 })
459 .collect()
460 }
461
462 fn elements(&self) -> Vec<crate::grounding::GroundingElement> {
463 self.payload
464 .elements
465 .iter()
466 .map(grounding_element_from_element)
467 .collect()
468 }
469
470 fn element_by_id(&self, id: &str) -> Option<crate::grounding::GroundingElement> {
471 self.payload
472 .elements
473 .iter()
474 .find(|e| e.id == id)
475 .map(grounding_element_from_element)
476 }
477
478 fn spans(&self) -> Vec<crate::grounding::GroundingSpan> {
479 self.payload
480 .spans
481 .iter()
482 .map(|s| crate::grounding::GroundingSpan {
483 id: s.id.clone(),
484 page: s.page.clone(),
485 bbox: s.bbox.to_array(),
486 text: s.text.clone(),
487 element: None,
488 char_start: s.char_start,
489 char_end: s.char_end,
490 })
491 .collect()
492 }
493
494 fn tables(&self) -> Vec<crate::grounding::GroundingTable> {
495 self.payload
496 .tables
497 .iter()
498 .map(|t| crate::grounding::GroundingTable {
499 id: t.id.clone(),
500 page: t.page_refs.first().cloned().unwrap_or_default(),
501 bbox: t.bbox.to_array(),
502 cells: t
503 .cells
504 .iter()
505 .map(|c| crate::grounding::GroundingCell {
506 row: c.row,
507 col: c.col,
508 row_span: c.row_span,
509 col_span: c.col_span,
510 bbox: c.bbox.to_array(),
511 text: c.text.clone(),
512 })
513 .collect(),
514 })
515 .collect()
516 }
517}
518
519impl Document {
520 pub fn payload_c14n(&self) -> Result<Vec<u8>, EthosError> {
522 let value = serde_json::to_value(&self.payload)
523 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.to_string()))?;
524 crate::c14n::c14n_bytes(&value)
525 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
526 }
527
528 pub fn payload_fingerprint_c14n(&self) -> Result<Vec<u8>, EthosError> {
530 let value = stable_payload_projection(&self.payload)?;
531 crate::c14n::c14n_bytes(&value)
532 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
533 }
534
535 pub fn compute_payload_sha256(&self) -> Result<String, EthosError> {
537 let value = stable_payload_projection(&self.payload)?;
538 crate::c14n::sha256_hex(&value)
539 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
540 }
541
542 pub fn compute_payload_sha256_for_payload(payload: &Payload) -> Result<String, EthosError> {
544 let value = stable_payload_projection(payload)?;
545 crate::c14n::sha256_hex(&value)
546 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
547 }
548
549 pub fn payload_fingerprint_value(&self) -> Result<Value, EthosError> {
551 stable_payload_projection(&self.payload)
552 }
553
554 pub fn compute_raw_payload_sha256(&self) -> Result<String, EthosError> {
556 let value = serde_json::to_value(&self.payload)
557 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.to_string()))?;
558 crate::c14n::sha256_hex(&value)
559 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
560 }
561
562 pub fn compute_fingerprint(&self) -> Result<String, EthosError> {
564 let manifest = crate::fingerprint::FingerprintManifest {
565 config_sha256: self.config_sha256.clone(),
566 payload_sha256: self.compute_payload_sha256()?,
567 profile_id: self.profile.id.clone(),
568 profile_sha256: self.profile.sha256.clone(),
569 schema_version: self.schema_version.clone(),
570 source_fingerprint: self.source.fingerprint.clone(),
571 };
572 manifest
573 .document_fingerprint()
574 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
575 }
576
577 pub fn verify_integrity(&self) -> Result<(), EthosError> {
580 let payload = self.compute_payload_sha256()?;
581 if payload != self.payload_sha256 {
582 return Err(EthosError::new(
583 ErrorCode::InternalError,
584 "payload_sha256 mismatch: document was modified or produced non-canonically",
585 ));
586 }
587 let fp = self.compute_fingerprint()?;
588 if fp != self.fingerprint {
589 return Err(EthosError::new(
590 ErrorCode::InternalError,
591 "fingerprint mismatch: envelope and payload disagree",
592 ));
593 }
594 Ok(())
595 }
596}
597
598fn stable_payload_projection(payload: &Payload) -> Result<Value, EthosError> {
599 let mut value = serde_json::to_value(payload)
600 .map_err(|e| EthosError::new(ErrorCode::InternalError, e.to_string()))?;
601 remove_unstable_geometry(&mut value);
602 Ok(value)
603}
604
605fn remove_unstable_geometry(value: &mut Value) {
606 match value {
607 Value::Object(map) => {
608 map.remove("bbox");
609 map.remove("bboxes");
610 for child in map.values_mut() {
611 remove_unstable_geometry(child);
612 }
613 }
614 Value::Array(items) => {
615 for child in items {
616 remove_unstable_geometry(child);
617 }
618 }
619 _ => {}
620 }
621}
622
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the schema example document, returning both the raw JSON text
    /// and the deserialized model.
    fn example() -> (&'static str, Document) {
        let raw = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/../../schemas/examples/document.example.json"
        ));
        (
            raw,
            serde_json::from_str(raw).expect("example deserializes"),
        )
    }

    /// Deserializing and re-serializing the example must reproduce the same
    /// JSON value, i.e. the model loses no field.
    #[test]
    fn example_round_trips_at_value_level() {
        let (raw, doc) = example();
        let original: Value = serde_json::from_str(raw).unwrap();
        let reserialized = serde_json::to_value(&doc).unwrap();
        assert_eq!(
            original, reserialized,
            "model drops or reorders schema fields"
        );
    }

    /// The hashes stored in the example must match the recomputed ones.
    #[test]
    fn example_hashes_are_self_consistent() {
        let (_, doc) = example();
        doc.verify_integrity()
            .expect("example hashes must be real (regenerated, not fake)");
    }

    /// Canonical bytes are deterministic, and a value-level round trip
    /// yields an equal document.
    #[test]
    fn reserialization_is_stable() {
        let (_, doc) = example();
        let a = doc.payload_c14n().unwrap();
        let b = doc.payload_c14n().unwrap();
        assert_eq!(a, b);
        let v = serde_json::to_value(&doc).unwrap();
        let doc2: Document = serde_json::from_value(v).unwrap();
        assert_eq!(doc, doc2);
    }

    /// Changing any bounding box must leave the payload hash unchanged.
    #[test]
    fn payload_hash_ignores_precise_bbox_geometry() {
        let (_, doc) = example();
        let mut shifted = doc.clone();
        shifted.payload.elements[0].bbox = QRect::new(1, 2, 3, 4).unwrap();
        shifted.payload.spans[0].bbox = QRect::new(5, 6, 7, 8).unwrap();
        shifted.payload.tables[0].bbox = QRect::new(9, 10, 11, 12).unwrap();
        shifted.payload.tables[0].cells[0].bbox = QRect::new(13, 14, 15, 16).unwrap();
        shifted.payload.chunks[0].bboxes[0].bbox = QRect::new(17, 18, 19, 20).unwrap();
        shifted.payload.regions[0].bbox = QRect::new(21, 22, 23, 24).unwrap();

        assert_eq!(
            doc.compute_payload_sha256().unwrap(),
            shifted.compute_payload_sha256().unwrap()
        );
    }

    /// Changing a span's origin locator must change the payload hash.
    #[test]
    fn payload_hash_binds_origin_locator() {
        let (_, doc) = example();
        let mut changed = doc.clone();
        changed.payload.spans[0].origin_locator = Some(SpanOriginLocator {
            policy: "origin-run-locator-v1".to_string(),
            first_origin: [7200, 7200],
            last_origin: [30480, 7200],
        });

        assert_ne!(
            doc.compute_payload_sha256().unwrap(),
            changed.compute_payload_sha256().unwrap()
        );
    }
}