1use super::Entity;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7#[cfg(any(feature = "semantic", test))]
8use super::EntityType;
9
10#[cfg(any(feature = "semantic", test))]
11use serde_json::{json, Value};
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct EntityMap {
16 pub document_metadata: HashMap<String, String>,
18 pub pages: HashMap<usize, Vec<Entity>>,
20 pub schemas: Vec<String>,
22}
23
24impl Default for EntityMap {
25 fn default() -> Self {
26 Self::new()
27 }
28}
29
30impl EntityMap {
31 pub fn new() -> Self {
32 Self {
33 document_metadata: HashMap::new(),
34 pages: HashMap::new(),
35 schemas: Vec::new(),
36 }
37 }
38
39 pub fn add_entity(&mut self, entity: Entity) {
41 self.pages.entry(entity.page).or_default().push(entity);
42 }
43
44 #[cfg(any(feature = "semantic", test))]
46 #[allow(unexpected_cfgs)]
47 pub fn to_json(&self) -> Result<String, serde_json::Error> {
48 serde_json::to_string_pretty(self)
49 }
50
51 #[cfg(any(feature = "semantic", test))]
53 #[allow(unexpected_cfgs)]
54 pub fn to_json_compact(&self) -> Result<String, serde_json::Error> {
55 serde_json::to_string(self)
56 }
57
58 pub fn entities_by_type(&self, entity_type: super::EntityType) -> Vec<&Entity> {
60 self.pages
61 .values()
62 .flat_map(|entities| entities.iter())
63 .filter(|e| e.entity_type == entity_type)
64 .collect()
65 }
66
67 pub fn entities_on_page(&self, page: usize) -> Option<&Vec<Entity>> {
69 self.pages.get(&page)
70 }
71
72 #[cfg(any(feature = "semantic", test))]
74 #[allow(unexpected_cfgs)]
75 pub fn to_json_ld(&self) -> Result<String, serde_json::Error> {
76 let mut json_ld = json!({
77 "@context": "https://schema.org",
78 "@type": "DigitalDocument",
79 "additionalType": "AI-Ready PDF",
80 "hasPart": []
81 });
82
83 let mut parts = Vec::new();
84
85 for (page_num, entities) in &self.pages {
86 for entity in entities {
87 let entity_json = entity_to_schema_org(entity, *page_num);
88 parts.push(entity_json);
89 }
90 }
91
92 json_ld["hasPart"] = Value::Array(parts);
93
94 if !self.schemas.is_empty() {
96 json_ld["conformsTo"] = json!(self.schemas);
97 }
98
99 if !self.document_metadata.is_empty() {
101 for (key, value) in &self.document_metadata {
102 json_ld[key] = json!(value);
103 }
104 }
105
106 serde_json::to_string_pretty(&json_ld)
107 }
108}
109
/// Maps an [`EntityType`] to the string used as the `@type` of its JSON-LD node.
///
/// The match is exhaustive over the variants listed here; every `Custom` value
/// maps to `"Thing"` regardless of its payload.
#[cfg(any(feature = "semantic", test))]
fn entity_type_to_schema_org(entity_type: &EntityType) -> &'static str {
    let label = match *entity_type {
        // Financial entities.
        EntityType::Invoice => "Invoice",
        EntityType::InvoiceNumber => "identifier",
        EntityType::CustomerName => "customer",
        EntityType::TotalAmount => "totalPrice",
        EntityType::TaxAmount => "taxAmount",
        EntityType::DueDate => "paymentDueDate",
        EntityType::LineItem => "LineItem",
        // Both payment amounts and contract values are reported as a price.
        EntityType::PaymentAmount | EntityType::ContractValue => "price",

        // People, organizations and contact details.
        EntityType::PersonName => "Person",
        EntityType::OrganizationName => "Organization",
        EntityType::Address => "PostalAddress",
        EntityType::PhoneNumber => "telephone",
        EntityType::Email => "email",
        EntityType::Website => "url",

        // Contract entities.
        EntityType::Contract => "DigitalDocument",
        EntityType::ContractParty => "Party",
        EntityType::ContractTerm => "OfferCatalog",
        EntityType::EffectiveDate => "datePublished",
        EntityType::Signature => "signatureValue",

        // Document structure.
        EntityType::Heading => "Heading",
        EntityType::Paragraph => "Paragraph",
        EntityType::Table => "Table",
        EntityType::List => "ItemList",
        EntityType::Image => "ImageObject",
        EntityType::Text => "Text",
        EntityType::Header => "WPHeader",
        EntityType::Footer => "WPFooter",
        EntityType::PageNumber => "pageStart",

        // Generic values.
        EntityType::Date => "Date",
        EntityType::Amount => "MonetaryAmount",
        EntityType::Quantity => "quantityValue",
        EntityType::Percentage => "ratingValue",

        // Anything user-defined falls back to the most general type.
        EntityType::Custom(_) => "Thing",
    };

    label
}
161
/// Builds the JSON-LD node for a single entity.
///
/// The node carries the mapped `@type`, the entity bounds as a comma-separated
/// `box` string under `spatialCoverage.geo`, and `pageStart` set to `page_num + 1`.
/// Optional pieces are then written in a fixed order — `@id` (when the id is
/// non-empty), each metadata property, `confidence`, `conformsTo` — so a later
/// write replaces an earlier value stored under the same key.
#[cfg(any(feature = "semantic", test))]
fn entity_to_schema_org(entity: &Entity, page_num: usize) -> Value {
    let bounds = &entity.bounds;
    let bounding_box = format!("{},{},{},{}", bounds.0, bounds.1, bounds.2, bounds.3);
    let metadata = &entity.metadata;

    let mut node = json!({
        "@type": entity_type_to_schema_org(&entity.entity_type),
        "spatialCoverage": {
            "@type": "Place",
            "geo": {
                "@type": "GeoCoordinates",
                "box": bounding_box
            }
        },
        "pageStart": page_num + 1
    });

    // An empty id is treated as "no id": the key is left out entirely.
    if !entity.id.is_empty() {
        node["@id"] = json!(entity.id);
    }

    // Properties are flattened directly onto the node rather than nested.
    for (key, value) in &metadata.properties {
        node[key] = json!(value);
    }

    if let Some(confidence) = metadata.confidence {
        node["confidence"] = json!(confidence);
    }

    if let Some(schema) = &metadata.schema {
        node["conformsTo"] = json!(schema);
    }

    node
}
202
/// Serialization formats an entity map can be exported to.
///
/// The default is [`ExportFormat::Json`].
// NOTE(review): nothing in this file consumes this enum; which exporter each
// variant selects is decided elsewhere — confirm at the call sites.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ExportFormat {
    /// Plain JSON (the default).
    #[default]
    Json,
    /// JSON-LD.
    JsonLd,
    /// XML.
    Xml,
}
219
#[cfg(test)]
mod tests {
    use super::*;
    use crate::semantic::{EntityMetadata, EntityType};

    /// Builds an entity with fixed bounds and empty metadata.
    fn create_test_entity(id: &str, page: usize, entity_type: EntityType) -> Entity {
        Entity {
            id: id.to_string(),
            entity_type,
            bounds: (0.0, 0.0, 100.0, 50.0),
            page,
            metadata: EntityMetadata::new(),
        }
    }

    #[test]
    fn test_entity_map_new() {
        let map = EntityMap::new();

        assert!(map.document_metadata.is_empty());
        assert!(map.pages.is_empty());
        assert!(map.schemas.is_empty());
    }

    #[test]
    fn test_entity_map_default() {
        let map = EntityMap::default();

        assert!(map.document_metadata.is_empty());
        assert!(map.pages.is_empty());
        assert!(map.schemas.is_empty());
    }

    #[test]
    fn test_add_entity() {
        let mut map = EntityMap::new();
        let entity = create_test_entity("e1", 1, EntityType::Text);

        map.add_entity(entity);

        assert!(map.pages.contains_key(&1));
        assert_eq!(map.pages.get(&1).unwrap().len(), 1);
    }

    #[test]
    fn test_add_multiple_entities_same_page() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Image));
        map.add_entity(create_test_entity("e3", 1, EntityType::Table));

        assert_eq!(map.pages.get(&1).unwrap().len(), 3);
    }

    #[test]
    fn test_add_entities_different_pages() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 2, EntityType::Image));
        map.add_entity(create_test_entity("e3", 3, EntityType::Table));

        assert_eq!(map.pages.len(), 3);
        assert!(map.pages.contains_key(&1));
        assert!(map.pages.contains_key(&2));
        assert!(map.pages.contains_key(&3));
    }

    #[test]
    fn test_entities_on_page() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Image));

        let page_entities = map.entities_on_page(1);
        assert!(page_entities.is_some());
        assert_eq!(page_entities.unwrap().len(), 2);

        let missing_page = map.entities_on_page(99);
        assert!(missing_page.is_none());
    }

    #[test]
    fn test_entities_by_type() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Text));
        map.add_entity(create_test_entity("e3", 2, EntityType::Image));
        map.add_entity(create_test_entity("e4", 2, EntityType::Text));

        let text_entities = map.entities_by_type(EntityType::Text);
        assert_eq!(text_entities.len(), 3);

        let image_entities = map.entities_by_type(EntityType::Image);
        assert_eq!(image_entities.len(), 1);

        let table_entities = map.entities_by_type(EntityType::Table);
        assert_eq!(table_entities.len(), 0);
    }

    #[test]
    fn test_export_format_default() {
        let format = ExportFormat::default();
        assert_eq!(format, ExportFormat::Json);
    }

    #[test]
    fn test_export_format_variants() {
        assert_eq!(ExportFormat::Json, ExportFormat::Json);
        assert_eq!(ExportFormat::JsonLd, ExportFormat::JsonLd);
        assert_eq!(ExportFormat::Xml, ExportFormat::Xml);
        assert_ne!(ExportFormat::Json, ExportFormat::JsonLd);
    }

    #[test]
    fn test_document_metadata() {
        let mut map = EntityMap::new();
        map.document_metadata
            .insert("title".to_string(), "Test Document".to_string());
        map.document_metadata
            .insert("author".to_string(), "Test Author".to_string());

        assert_eq!(
            map.document_metadata.get("title"),
            Some(&"Test Document".to_string())
        );
        assert_eq!(
            map.document_metadata.get("author"),
            Some(&"Test Author".to_string())
        );
    }

    #[test]
    fn test_schemas() {
        let mut map = EntityMap::new();
        map.schemas.push("https://schema.org".to_string());
        map.schemas.push("https://example.com/schema".to_string());

        assert_eq!(map.schemas.len(), 2);
        assert!(map.schemas.contains(&"https://schema.org".to_string()));
    }

    #[test]
    fn test_to_json() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));

        let json_str = map
            .to_json()
            .expect("pretty JSON serialization should succeed");

        assert!(json_str.contains("pages"));
        assert!(json_str.contains("e1"));
    }

    #[test]
    fn test_to_json_compact() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));

        let json_str = map
            .to_json_compact()
            .expect("compact JSON serialization should succeed");

        // Compact output must not contain the newline-plus-indent of pretty printing.
        assert!(!json_str.contains("\n "));
    }

    #[test]
    fn test_to_json_ld() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.schemas.push("https://schema.org".to_string());

        let json_str = map
            .to_json_ld()
            .expect("JSON-LD serialization should succeed");

        assert!(json_str.contains("@context"));
        assert!(json_str.contains("schema.org"));
        assert!(json_str.contains("DigitalDocument"));
        assert!(json_str.contains("hasPart"));
    }

    #[test]
    fn test_entity_type_to_schema_org_financial() {
        assert_eq!(entity_type_to_schema_org(&EntityType::Invoice), "Invoice");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::InvoiceNumber),
            "identifier"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::TotalAmount),
            "totalPrice"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::TaxAmount),
            "taxAmount"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::DueDate),
            "paymentDueDate"
        );
    }

    #[test]
    fn test_entity_type_to_schema_org_identity() {
        assert_eq!(entity_type_to_schema_org(&EntityType::PersonName), "Person");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::OrganizationName),
            "Organization"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Address),
            "PostalAddress"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::PhoneNumber),
            "telephone"
        );
        assert_eq!(entity_type_to_schema_org(&EntityType::Email), "email");
    }

    #[test]
    fn test_entity_type_to_schema_org_structure() {
        assert_eq!(entity_type_to_schema_org(&EntityType::Heading), "Heading");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Paragraph),
            "Paragraph"
        );
        assert_eq!(entity_type_to_schema_org(&EntityType::Table), "Table");
        assert_eq!(entity_type_to_schema_org(&EntityType::List), "ItemList");
        assert_eq!(entity_type_to_schema_org(&EntityType::Image), "ImageObject");
        assert_eq!(entity_type_to_schema_org(&EntityType::Text), "Text");
    }

    #[test]
    fn test_entity_type_to_schema_org_custom() {
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Custom("MyType".to_string())),
            "Thing"
        );
    }

    #[test]
    fn test_entity_to_schema_org_basic() {
        let entity = create_test_entity("test_id", 1, EntityType::Text);
        let node = entity_to_schema_org(&entity, 1);

        assert_eq!(node["@type"], "Text");
        assert_eq!(node["@id"], "test_id");
        // `pageStart` is the `page_num` argument plus one.
        assert_eq!(node["pageStart"], 2);
    }

    #[test]
    fn test_entity_to_schema_org_with_metadata() {
        let mut entity = create_test_entity("test_id", 0, EntityType::Invoice);
        entity.metadata = entity
            .metadata
            .with_property("total", "1000.00")
            .with_confidence(0.95)
            .with_schema("https://schema.org/Invoice");

        let node = entity_to_schema_org(&entity, 0);

        assert_eq!(node["@type"], "Invoice");
        assert_eq!(node["total"], "1000.00");
        let confidence = node["confidence"]
            .as_f64()
            .expect("confidence should be emitted as a JSON number");
        // Compared with a tolerance because the value is a float.
        assert!((confidence - 0.95).abs() < 0.001);
        assert_eq!(node["conformsTo"], "https://schema.org/Invoice");
    }
}