oxidize_pdf/semantic/
export.rs1use super::Entity;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7#[cfg(any(feature = "semantic", test))]
8use super::EntityType;
9
10#[cfg(any(feature = "semantic", test))]
11use serde_json::{json, Value};
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct EntityMap {
16 pub document_metadata: HashMap<String, String>,
18 pub pages: HashMap<usize, Vec<Entity>>,
20 pub schemas: Vec<String>,
22}
23
24impl Default for EntityMap {
25 fn default() -> Self {
26 Self::new()
27 }
28}
29
30impl EntityMap {
31 pub fn new() -> Self {
32 Self {
33 document_metadata: HashMap::new(),
34 pages: HashMap::new(),
35 schemas: Vec::new(),
36 }
37 }
38
39 pub fn add_entity(&mut self, entity: Entity) {
41 self.pages.entry(entity.page).or_default().push(entity);
42 }
43
44 #[cfg(any(feature = "semantic", test))]
46 #[allow(unexpected_cfgs)]
47 pub fn to_json(&self) -> Result<String, serde_json::Error> {
48 serde_json::to_string_pretty(self)
49 }
50
51 #[cfg(any(feature = "semantic", test))]
53 #[allow(unexpected_cfgs)]
54 pub fn to_json_compact(&self) -> Result<String, serde_json::Error> {
55 serde_json::to_string(self)
56 }
57
58 pub fn entities_by_type(&self, entity_type: super::EntityType) -> Vec<&Entity> {
60 self.pages
61 .values()
62 .flat_map(|entities| entities.iter())
63 .filter(|e| e.entity_type == entity_type)
64 .collect()
65 }
66
67 pub fn entities_on_page(&self, page: usize) -> Option<&Vec<Entity>> {
69 self.pages.get(&page)
70 }
71
72 #[cfg(any(feature = "semantic", test))]
74 #[allow(unexpected_cfgs)]
75 pub fn to_json_ld(&self) -> Result<String, serde_json::Error> {
76 let mut json_ld = json!({
77 "@context": "https://schema.org",
78 "@type": "DigitalDocument",
79 "additionalType": "AI-Ready PDF",
80 "hasPart": []
81 });
82
83 let mut parts = Vec::new();
84
85 for (page_num, entities) in &self.pages {
86 for entity in entities {
87 let entity_json = entity_to_schema_org(entity, *page_num);
88 parts.push(entity_json);
89 }
90 }
91
92 json_ld["hasPart"] = Value::Array(parts);
93
94 if !self.schemas.is_empty() {
96 json_ld["conformsTo"] = json!(self.schemas);
97 }
98
99 if !self.document_metadata.is_empty() {
101 for (key, value) in &self.document_metadata {
102 json_ld[key] = json!(value);
103 }
104 }
105
106 serde_json::to_string_pretty(&json_ld)
107 }
108}
109
/// Maps an [`EntityType`] to the string used as the `@type` of its JSON-LD
/// node.
///
/// Every `Custom(_)` variant maps to the generic `"Thing"`; all other variants
/// have a fixed label.
///
/// NOTE(review): several labels (e.g. `identifier`, `telephone`, `email`)
/// look like schema.org property names rather than type names — confirm this
/// is intended before relying on the output for strict validation.
#[cfg(any(feature = "semantic", test))]
fn entity_type_to_schema_org(entity_type: &EntityType) -> &'static str {
    match entity_type {
        // Monetary values that share one label across invoices and contracts.
        EntityType::PaymentAmount | EntityType::ContractValue => "price",

        // Invoice / financial documents.
        EntityType::Invoice => "Invoice",
        EntityType::InvoiceNumber => "identifier",
        EntityType::CustomerName => "customer",
        EntityType::TotalAmount => "totalPrice",
        EntityType::TaxAmount => "taxAmount",
        EntityType::DueDate => "paymentDueDate",
        EntityType::LineItem => "LineItem",

        // Contracts / legal documents.
        EntityType::Contract => "DigitalDocument",
        EntityType::ContractParty => "Party",
        EntityType::ContractTerm => "OfferCatalog",
        EntityType::EffectiveDate => "datePublished",
        EntityType::Signature => "signatureValue",

        // People, organizations and contact details.
        EntityType::PersonName => "Person",
        EntityType::OrganizationName => "Organization",
        EntityType::Address => "PostalAddress",
        EntityType::PhoneNumber => "telephone",
        EntityType::Email => "email",
        EntityType::Website => "url",

        // Document structure.
        EntityType::Heading => "Heading",
        EntityType::Paragraph => "Paragraph",
        EntityType::Table => "Table",
        EntityType::List => "ItemList",
        EntityType::Image => "ImageObject",
        EntityType::Text => "Text",
        EntityType::Header => "WPHeader",
        EntityType::Footer => "WPFooter",
        EntityType::PageNumber => "pageStart",

        // Generic values.
        EntityType::Date => "Date",
        EntityType::Amount => "MonetaryAmount",
        EntityType::Quantity => "quantityValue",
        EntityType::Percentage => "ratingValue",

        // Anything user-defined falls back to the most generic label.
        EntityType::Custom(_) => "Thing",
    }
}
161
/// Builds the JSON-LD node for a single entity.
///
/// The node always carries `@type` (from `entity_type_to_schema_org`), a
/// `spatialCoverage` object whose `box` is the four `bounds` components joined
/// by commas, and `pageStart` set to `page_num + 1`.
///
/// Optional keys are then written in this order, so a later one replaces an
/// earlier one with the same name: `@id` (only when the id is non-empty),
/// every entry of `metadata.properties`, `confidence`, and `conformsTo`.
#[cfg(any(feature = "semantic", test))]
fn entity_to_schema_org(entity: &Entity, page_num: usize) -> Value {
    let bounds = &entity.bounds;
    let bounding_box = format!("{},{},{},{}", bounds.0, bounds.1, bounds.2, bounds.3);

    let mut node = json!({
        "@type": entity_type_to_schema_org(&entity.entity_type),
        "spatialCoverage": {
            "@type": "Place",
            "geo": {
                "@type": "GeoCoordinates",
                "box": bounding_box
            }
        },
        "pageStart": page_num + 1
    });

    // An empty id carries no information, so it is left out of the node.
    if !entity.id.is_empty() {
        node["@id"] = json!(entity.id);
    }

    // Free-form properties become plain keys on the node.
    for (name, property) in &entity.metadata.properties {
        node[name] = json!(property);
    }

    if let Some(score) = entity.metadata.confidence {
        node["confidence"] = json!(score);
    }

    if let Some(schema_ref) = &entity.metadata.schema {
        node["conformsTo"] = json!(schema_ref);
    }

    node
}
202
/// Output formats an entity map can be exported to.
///
/// `Default` is derived; the default format is [`ExportFormat::Json`].
#[derive(Debug, Clone, Copy, Default)]
pub enum ExportFormat {
    /// Plain JSON. This is the default format.
    #[default]
    Json,
    /// JSON-LD (JSON for Linked Data).
    JsonLd,
    /// XML.
    Xml,
}