use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::codes::WarningCode;
use crate::error::{ErrorCode, EthosError};
use crate::geom::QRect;
/// Top-level document record: identifying metadata and hashes wrapped around
/// the extracted [`Payload`].
///
/// `verify_integrity` recomputes the payload hash and the fingerprint and
/// compares them with the `payload_sha256` and `fingerprint` stored here.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Document {
/// Schema version string; one of the inputs to the fingerprint manifest.
pub schema_version: String,
/// Parser name and version.
pub parser: ParserInfo,
/// Profile id and hash; both are inputs to the fingerprint manifest.
pub profile: ProfileRef,
/// Source description; its `fingerprint` is an input to the fingerprint manifest.
pub source: SourceInfo,
/// Configuration hash; an input to the fingerprint manifest.
/// Presumably hex-encoded SHA-256 - confirm against the producer.
pub config_sha256: String,
/// Stored hash of the payload; `verify_integrity` compares it with the hash
/// of the payload's stable projection (payload JSON without `bbox`/`bboxes`).
pub payload_sha256: String,
/// Stored document fingerprint; `verify_integrity` compares it with
/// `compute_fingerprint`.
pub fingerprint: String,
/// The extracted content.
pub payload: Payload,
/// Optional free-form JSON. Left out of the serialized output when `None`.
/// It is not read by any of the hashing or fingerprint functions in this file.
#[serde(skip_serializing_if = "Option::is_none")]
pub diagnostics: Option<Value>,
}
/// Parser identity as stored in the document envelope.
///
/// Both fields are copied into the grounding layer's parser identity by
/// `GroundingSource::parser`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParserInfo {
/// Parser name.
pub name: String,
/// Parser version string.
pub version: String,
}
/// Reference to a profile by id and hash.
///
/// `Document::compute_fingerprint` feeds both fields into the fingerprint
/// manifest.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ProfileRef {
/// Profile identifier.
pub id: String,
/// Hash of the profile. Presumably hex-encoded SHA-256 - confirm.
pub sha256: String,
}
/// Description of the source the document was produced from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SourceInfo {
/// Fingerprint of the source; an input to the document fingerprint manifest.
pub fingerprint: String,
/// Presumably the size of the source in bytes - confirm against the producer.
pub bytes: u64,
}
/// The extracted content of a document.
///
/// This is the value that the payload hashing functions serialize. The stable
/// hash first removes every `bbox` and `bboxes` key from the serialized form;
/// the raw hash and `payload_c14n` keep them.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Payload {
/// Coordinate system declaration for the geometry in this payload.
pub coordinate_system: CoordinateSystem,
/// Pages of the document.
pub pages: Vec<Page>,
/// Layout elements.
pub elements: Vec<Element>,
/// Text spans.
pub spans: Vec<Span>,
/// Tables.
pub tables: Vec<Table>,
/// Text chunks.
pub chunks: Vec<Chunk>,
/// Non-text regions (see [`RegionKind`]).
pub regions: Vec<Region>,
/// Warnings kept in the security list.
pub security_warnings: Vec<Warning>,
/// Warnings kept in the parser list.
pub parser_warnings: Vec<Warning>,
}
/// Declares how coordinates in the payload are to be interpreted.
///
/// NOTE(review): `GroundingSource::capabilities` in this file reports a fixed
/// top-left origin and does not read `origin` - confirm the two always agree.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CoordinateSystem {
/// Name of the coordinate origin; the accepted values are not visible in this file.
pub origin: String,
/// Name of the coordinate unit; the accepted values are not visible in this file.
pub unit: String,
/// Presumably the number of quantized coordinate steps per point - confirm.
pub quantum_per_point: u32,
}
/// One page of the document.
///
/// All five fields are copied unchanged into the grounding layer's page
/// geometry by `GroundingSource::pages`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Page {
/// Page identifier; other payload items carry a `page` string that
/// presumably refers to this id - confirm.
pub id: String,
/// Page index. Whether it is zero- or one-based is not visible here.
pub index: u32,
/// Page width. Presumably in quantized coordinate units - confirm.
pub width: i64,
/// Page height. Presumably in quantized coordinate units - confirm.
pub height: i64,
/// Page rotation. Presumably in degrees - confirm.
pub rotation: u16,
}
/// Kind of a layout [`Element`].
///
/// Variants are serialized in `snake_case` (for example `TextBlock` becomes
/// `text_block`). `ElementType::as_str` returns the same spellings and must be
/// kept in step when a variant is added.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ElementType {
TextBlock,
Heading,
List,
ListItem,
Table,
Region,
Header,
Footer,
Caption,
Other,
}
/// A layout element on a page.
///
/// Optional fields are left out of the serialized form when `None`, and the
/// reference lists are left out when empty.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Element {
/// Element identifier; `GroundingSource::element_by_id` looks elements up by it.
pub id: String,
/// Element kind; serialized under the key `type`.
#[serde(rename = "type")]
pub element_type: ElementType,
/// Page this element is on. Presumably a `Page::id` - confirm.
pub page: String,
/// Bounding box. Excluded from the stable payload hash.
pub bbox: QRect,
/// Text of the element, when it has any.
#[serde(skip_serializing_if = "Option::is_none")]
pub text: Option<String>,
/// Presumably set only for headings - confirm against the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub heading_level: Option<u8>,
/// Presumably the id of a `Table` - confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub table_ref: Option<String>,
/// Presumably the id of a `Region` - confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub region_ref: Option<String>,
/// Confidence value; its scale is not visible in this file.
#[serde(skip_serializing_if = "Option::is_none")]
pub confidence: Option<u16>,
/// Span references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub span_refs: Vec<String>,
/// Warning references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warning_refs: Vec<String>,
}
/// A run of text on a page.
///
/// Optional fields are left out of the serialized form when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Span {
/// Span identifier.
pub id: String,
/// Page this span is on. Presumably a `Page::id` - confirm.
pub page: String,
/// Bounding box. Excluded from the stable payload hash.
pub bbox: QRect,
/// Optional origin locator. Unlike `bbox`, it is kept in the stable payload
/// projection and therefore contributes to the payload hash.
#[serde(skip_serializing_if = "Option::is_none")]
pub origin_locator: Option<SpanOriginLocator>,
/// Text of the span.
pub text: String,
/// Font identifier, when known.
#[serde(skip_serializing_if = "Option::is_none")]
pub font_id: Option<String>,
/// Font size. Presumably in quantized units - confirm.
#[serde(skip_serializing_if = "Option::is_none")]
pub font_size_q: Option<i64>,
/// Start character offset; what text it indexes into is not visible here.
#[serde(skip_serializing_if = "Option::is_none")]
pub char_start: Option<u32>,
/// End character offset; whether it is inclusive is not visible here.
#[serde(skip_serializing_if = "Option::is_none")]
pub char_end: Option<u32>,
/// Warning references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warning_refs: Vec<String>,
}
/// Locator attached to a [`Span`] through `Span::origin_locator`.
///
/// It survives the stable payload projection, so changing it changes the
/// payload hash.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SpanOriginLocator {
/// Name of the locator policy (the tests in this file use
/// `origin-run-locator-v1`).
pub policy: String,
/// First origin as a pair of integers. Presumably `[x, y]` - confirm.
pub first_origin: [i64; 2],
/// Last origin as a pair of integers. Presumably `[x, y]` - confirm.
pub last_origin: [i64; 2],
}
/// A table and its cells.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Table {
/// Table identifier.
pub id: String,
/// Pages the table refers to. `GroundingSource::tables` uses only the first
/// entry, and an empty string when the list is empty.
pub page_refs: Vec<String>,
/// Bounding box. Excluded from the stable payload hash.
pub bbox: QRect,
/// Number of rows.
pub n_rows: u32,
/// Number of columns.
pub n_cols: u32,
/// Number of header rows.
pub header_rows: u32,
/// Number of header columns.
pub header_cols: u32,
/// Cells of the table.
pub cells: Vec<Cell>,
/// Confidence value; its scale is not visible in this file.
#[serde(skip_serializing_if = "Option::is_none")]
pub confidence: Option<u16>,
/// Warning references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warning_refs: Vec<String>,
/// Optional textual exports; left out of the serialized form when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub exports: Option<TableExports>,
}
/// Optional textual renderings of a [`Table`].
///
/// Each field is left out of the serialized form when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TableExports {
/// CSV rendering, when present.
#[serde(skip_serializing_if = "Option::is_none")]
pub csv: Option<String>,
/// Markdown rendering, when present.
#[serde(skip_serializing_if = "Option::is_none")]
pub markdown: Option<String>,
}
/// One cell of a [`Table`].
///
/// `row`, `col`, `row_span`, `col_span`, `bbox` and `text` are copied into the
/// grounding layer by `GroundingSource::tables`; the reference lists are not.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Cell {
/// Row position. Whether it is zero-based is not visible here.
pub row: u32,
/// Column position. Whether it is zero-based is not visible here.
pub col: u32,
/// Number of rows the cell spans.
pub row_span: u32,
/// Number of columns the cell spans.
pub col_span: u32,
/// Bounding box. Excluded from the stable payload hash.
pub bbox: QRect,
/// Text of the cell.
pub text: String,
/// Span references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub span_refs: Vec<String>,
/// Element references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub element_refs: Vec<String>,
}
/// A chunk of text together with references to where it came from.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Chunk {
/// Chunk identifier.
pub id: String,
/// Text of the chunk.
pub text: String,
/// Element references. Presumably `Element::id` values - confirm.
pub element_refs: Vec<String>,
/// Page references. Presumably `Page::id` values - confirm.
pub page_refs: Vec<String>,
/// Per-page bounding boxes. The whole `bboxes` key is removed from the
/// stable payload projection, so it does not affect the payload hash.
pub bboxes: Vec<PageBox>,
/// Token count estimate for `text`.
pub token_estimate: TokenEstimate,
/// Warning references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warning_refs: Vec<String>,
}
/// A bounding box paired with the page it belongs to; used by `Chunk::bboxes`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageBox {
/// Page the box is on. Presumably a `Page::id` - confirm.
pub page: String,
/// The bounding box.
pub bbox: QRect,
}
/// Token count estimate attached to a [`Chunk`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TokenEstimate {
/// Estimated number of tokens.
pub count: u32,
/// Name of the estimator; the accepted values are not visible in this file.
pub estimator: String,
/// Flag stored alongside the count; presumably marks it as inexact - confirm.
pub approximate: bool,
}
/// Kind of a [`Region`].
///
/// Variants are serialized in `snake_case` (for example `Image` becomes
/// `image`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RegionKind {
Unknown,
Image,
Figure,
Formula,
Chart,
}
/// A region of a page classified by [`RegionKind`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Region {
/// Region identifier.
pub id: String,
/// Page the region is on. Presumably a `Page::id` - confirm.
pub page: String,
/// Bounding box. Excluded from the stable payload hash.
pub bbox: QRect,
/// What the region contains.
pub kind: RegionKind,
/// Warning references; defaults to empty when absent from the input.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub warning_refs: Vec<String>,
}
/// A warning recorded in the payload, in either `Payload::security_warnings`
/// or `Payload::parser_warnings`.
///
/// The optional locations are left out of the serialized form when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Warning {
/// Warning identifier; the `warning_refs` lists on other payload items
/// presumably hold these ids - confirm.
pub id: String,
/// Machine-readable warning code.
pub code: WarningCode,
/// Human-readable message.
pub message: String,
/// Page the warning relates to, when known.
#[serde(skip_serializing_if = "Option::is_none")]
pub page: Option<String>,
/// Element the warning relates to, when known.
#[serde(skip_serializing_if = "Option::is_none")]
pub element_ref: Option<String>,
/// Span the warning relates to, when known.
#[serde(skip_serializing_if = "Option::is_none")]
pub span_ref: Option<String>,
/// Region the warning relates to, when known.
#[serde(skip_serializing_if = "Option::is_none")]
pub region_ref: Option<String>,
}
impl ElementType {
pub fn as_str(self) -> &'static str {
match self {
ElementType::TextBlock => "text_block",
ElementType::Heading => "heading",
ElementType::List => "list",
ElementType::ListItem => "list_item",
ElementType::Table => "table",
ElementType::Region => "region",
ElementType::Header => "header",
ElementType::Footer => "footer",
ElementType::Caption => "caption",
ElementType::Other => "other",
}
}
}
fn grounding_element_from_element(e: &Element) -> crate::grounding::GroundingElement {
crate::grounding::GroundingElement {
id: e.id.clone(),
page: e.page.clone(),
bbox: e.bbox.to_array(),
kind: e.element_type.as_str().to_string(),
text: e.text.clone(),
}
}
impl crate::grounding::GroundingSource for Document {
fn parser(&self) -> crate::grounding::ParserIdentity {
crate::grounding::ParserIdentity {
name: self.parser.name.clone(),
version: self.parser.version.clone(),
adapter: None,
adapter_version: None,
}
}
fn capabilities(&self) -> crate::grounding::Capabilities {
crate::grounding::Capabilities {
spans: true,
char_offsets: true,
tables: true,
fingerprint: true,
coordinate_origin: crate::grounding::CoordinateOrigin::TopLeft,
crop_support: false,
}
}
fn fingerprint(&self) -> Option<String> {
Some(self.fingerprint.clone())
}
fn pages(&self) -> Vec<crate::grounding::PageGeometry> {
self.payload
.pages
.iter()
.map(|p| crate::grounding::PageGeometry {
id: p.id.clone(),
index: p.index,
width: p.width,
height: p.height,
rotation: p.rotation,
})
.collect()
}
fn elements(&self) -> Vec<crate::grounding::GroundingElement> {
self.payload
.elements
.iter()
.map(grounding_element_from_element)
.collect()
}
fn element_by_id(&self, id: &str) -> Option<crate::grounding::GroundingElement> {
self.payload
.elements
.iter()
.find(|e| e.id == id)
.map(grounding_element_from_element)
}
fn spans(&self) -> Vec<crate::grounding::GroundingSpan> {
self.payload
.spans
.iter()
.map(|s| crate::grounding::GroundingSpan {
id: s.id.clone(),
page: s.page.clone(),
bbox: s.bbox.to_array(),
text: s.text.clone(),
element: None,
char_start: s.char_start,
char_end: s.char_end,
})
.collect()
}
fn tables(&self) -> Vec<crate::grounding::GroundingTable> {
self.payload
.tables
.iter()
.map(|t| crate::grounding::GroundingTable {
id: t.id.clone(),
page: t.page_refs.first().cloned().unwrap_or_default(),
bbox: t.bbox.to_array(),
cells: t
.cells
.iter()
.map(|c| crate::grounding::GroundingCell {
row: c.row,
col: c.col,
row_span: c.row_span,
col_span: c.col_span,
bbox: c.bbox.to_array(),
text: c.text.clone(),
})
.collect(),
})
.collect()
}
}
impl Document {
    /// Returns the canonical bytes of the complete payload, geometry included.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when the payload cannot be converted to JSON
    /// or canonicalization fails.
    pub fn payload_c14n(&self) -> Result<Vec<u8>, EthosError> {
        let value = serde_json::to_value(&self.payload)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.to_string()))?;
        crate::c14n::c14n_bytes(&value)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
    }

    /// Returns the canonical bytes of the stable payload projection, i.e. the
    /// payload JSON with every `bbox` and `bboxes` key removed.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when projection or canonicalization fails.
    pub fn payload_fingerprint_c14n(&self) -> Result<Vec<u8>, EthosError> {
        let value = stable_payload_projection(&self.payload)?;
        crate::c14n::c14n_bytes(&value)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
    }

    /// Hashes this document's stable payload projection.
    ///
    /// Equivalent to calling [`Document::compute_payload_sha256_for_payload`]
    /// with `&self.payload`.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when projection or hashing fails.
    pub fn compute_payload_sha256(&self) -> Result<String, EthosError> {
        Self::compute_payload_sha256_for_payload(&self.payload)
    }

    /// Hashes the stable projection of an arbitrary payload, without needing
    /// a full [`Document`] around it.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when projection or hashing fails.
    pub fn compute_payload_sha256_for_payload(payload: &Payload) -> Result<String, EthosError> {
        let value = stable_payload_projection(payload)?;
        crate::c14n::sha256_hex(&value)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
    }

    /// Returns the stable payload projection as a JSON value (the exact value
    /// that the stable payload hash is computed over).
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when the payload cannot be converted to JSON.
    pub fn payload_fingerprint_value(&self) -> Result<Value, EthosError> {
        stable_payload_projection(&self.payload)
    }

    /// Hashes the complete payload, geometry included.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when the payload cannot be converted to JSON
    /// or hashing fails.
    pub fn compute_raw_payload_sha256(&self) -> Result<String, EthosError> {
        let value = serde_json::to_value(&self.payload)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.to_string()))?;
        crate::c14n::sha256_hex(&value)
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
    }

    /// Computes the document fingerprint from the envelope fields and a
    /// freshly computed stable payload hash.
    ///
    /// The stored `payload_sha256` and `fingerprint` fields are not read.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` when payload hashing or fingerprinting fails.
    pub fn compute_fingerprint(&self) -> Result<String, EthosError> {
        let payload_sha256 = self.compute_payload_sha256()?;
        self.fingerprint_with_payload_sha256(payload_sha256)
    }

    /// Builds the fingerprint manifest around an already computed payload
    /// hash and returns the resulting document fingerprint.
    fn fingerprint_with_payload_sha256(
        &self,
        payload_sha256: String,
    ) -> Result<String, EthosError> {
        let manifest = crate::fingerprint::FingerprintManifest {
            config_sha256: self.config_sha256.clone(),
            payload_sha256,
            profile_id: self.profile.id.clone(),
            profile_sha256: self.profile.sha256.clone(),
            schema_version: self.schema_version.clone(),
            source_fingerprint: self.source.fingerprint.clone(),
        };
        manifest
            .document_fingerprint()
            .map_err(|e| EthosError::new(ErrorCode::InternalError, e.message))
    }

    /// Checks that the stored `payload_sha256` and `fingerprint` match the
    /// values recomputed from the document.
    ///
    /// The payload hash is checked first, so a modified payload is reported as
    /// a payload mismatch rather than a fingerprint mismatch.
    ///
    /// # Errors
    /// `ErrorCode::InternalError` on either mismatch, or when hashing fails.
    pub fn verify_integrity(&self) -> Result<(), EthosError> {
        let actual_payload_sha256 = self.compute_payload_sha256()?;
        if actual_payload_sha256 != self.payload_sha256 {
            return Err(EthosError::new(
                ErrorCode::InternalError,
                "payload_sha256 mismatch: document was modified or produced non-canonically",
            ));
        }

        // Reuse the digest computed above: hashing is deterministic, so going
        // through `compute_fingerprint` would only serialize and hash the
        // whole payload a second time to obtain the same string.
        let actual_fingerprint = self.fingerprint_with_payload_sha256(actual_payload_sha256)?;
        if actual_fingerprint != self.fingerprint {
            return Err(EthosError::new(
                ErrorCode::InternalError,
                "fingerprint mismatch: envelope and payload disagree",
            ));
        }
        Ok(())
    }
}
/// Serializes `payload` to JSON and removes every `bbox` / `bboxes` key from
/// it, at any nesting depth.
///
/// The result is the value the stable payload hash is computed over.
///
/// # Errors
/// `ErrorCode::InternalError` carrying the serializer's message when the
/// payload cannot be converted to JSON.
fn stable_payload_projection(payload: &Payload) -> Result<Value, EthosError> {
    match serde_json::to_value(payload) {
        Ok(mut projected) => {
            remove_unstable_geometry(&mut projected);
            Ok(projected)
        }
        Err(err) => Err(EthosError::new(ErrorCode::InternalError, err.to_string())),
    }
}
/// Removes every `bbox` and `bboxes` key from `value`, in place, at every
/// nesting depth.
///
/// Objects lose the two keys and have their remaining members visited; arrays
/// have each item visited; all other JSON values are left untouched.
fn remove_unstable_geometry(value: &mut Value) {
    // Explicit worklist instead of recursion; visiting order does not matter
    // because each removal is independent of the others.
    let mut pending: Vec<&mut Value> = vec![value];
    while let Some(node) = pending.pop() {
        match node {
            Value::Object(fields) => {
                fields.remove("bbox");
                fields.remove("bboxes");
                pending.extend(fields.values_mut());
            }
            Value::Array(entries) => pending.extend(entries.iter_mut()),
            _ => {}
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the checked-in example document and returns its raw JSON text
    /// together with the parsed model.
    fn example() -> (&'static str, Document) {
        let raw = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/../../schemas/examples/document.example.json"
        ));
        let parsed: Document = serde_json::from_str(raw).expect("example deserializes");
        (raw, parsed)
    }

    /// Parsing the example and serializing it again must give the same JSON
    /// value as the file itself.
    #[test]
    fn example_round_trips_at_value_level() {
        let (raw, doc) = example();
        let from_file: Value = serde_json::from_str(raw).unwrap();
        let from_model = serde_json::to_value(&doc).unwrap();
        assert_eq!(
            from_file, from_model,
            "model drops or reorders schema fields"
        );
    }

    /// The hashes stored in the example must match what the code computes.
    #[test]
    fn example_hashes_are_self_consistent() {
        let (_, doc) = example();
        let outcome = doc.verify_integrity();
        outcome.expect("example hashes must be real (regenerated, not fake)");
    }

    /// Canonical bytes are repeatable and the model survives a value-level
    /// round trip unchanged.
    #[test]
    fn reserialization_is_stable() {
        let (_, doc) = example();
        let first = doc.payload_c14n().unwrap();
        let second = doc.payload_c14n().unwrap();
        assert_eq!(first, second);

        let as_value = serde_json::to_value(&doc).unwrap();
        let round_tripped: Document = serde_json::from_value(as_value).unwrap();
        assert_eq!(doc, round_tripped);
    }

    /// Changing any bounding box must leave the stable payload hash unchanged.
    #[test]
    fn payload_hash_ignores_precise_bbox_geometry() {
        let (_, doc) = example();
        let mut moved = doc.clone();
        {
            let payload = &mut moved.payload;
            payload.elements[0].bbox = QRect::new(1, 2, 3, 4).unwrap();
            payload.spans[0].bbox = QRect::new(5, 6, 7, 8).unwrap();
            payload.tables[0].bbox = QRect::new(9, 10, 11, 12).unwrap();
            payload.tables[0].cells[0].bbox = QRect::new(13, 14, 15, 16).unwrap();
            payload.chunks[0].bboxes[0].bbox = QRect::new(17, 18, 19, 20).unwrap();
            payload.regions[0].bbox = QRect::new(21, 22, 23, 24).unwrap();
        }

        let before = doc.compute_payload_sha256().unwrap();
        let after = moved.compute_payload_sha256().unwrap();
        assert_eq!(before, after);
    }

    /// Changing a span's origin locator must change the stable payload hash.
    #[test]
    fn payload_hash_binds_origin_locator() {
        let (_, doc) = example();
        let mut relocated = doc.clone();
        let locator = SpanOriginLocator {
            policy: "origin-run-locator-v1".to_string(),
            first_origin: [7200, 7200],
            last_origin: [30480, 7200],
        };
        relocated.payload.spans[0].origin_locator = Some(locator);

        let before = doc.compute_payload_sha256().unwrap();
        let after = relocated.compute_payload_sha256().unwrap();
        assert_ne!(before, after);
    }
}