Skip to main content

oxidize_pdf/semantic/
export.rs

1//! Export functionality for semantic entities
2
3use super::Entity;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7#[cfg(any(feature = "semantic", test))]
8use super::EntityType;
9
10#[cfg(any(feature = "semantic", test))]
11use serde_json::{json, Value};
12
13/// Map of entities organized by page
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct EntityMap {
16    /// Document-level metadata
17    pub document_metadata: HashMap<String, String>,
18    /// Entities organized by page number
19    pub pages: HashMap<usize, Vec<Entity>>,
20    /// Schema definitions used
21    pub schemas: Vec<String>,
22}
23
24impl Default for EntityMap {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl EntityMap {
31    pub fn new() -> Self {
32        Self {
33            document_metadata: HashMap::new(),
34            pages: HashMap::new(),
35            schemas: Vec::new(),
36        }
37    }
38
39    /// Add an entity to the map
40    pub fn add_entity(&mut self, entity: Entity) {
41        self.pages.entry(entity.page).or_default().push(entity);
42    }
43
44    /// Export to JSON string (requires serde_json feature)
45    #[cfg(any(feature = "semantic", test))]
46    #[allow(unexpected_cfgs)]
47    pub fn to_json(&self) -> Result<String, serde_json::Error> {
48        serde_json::to_string_pretty(self)
49    }
50
51    /// Export to JSON with custom options (requires serde_json feature)
52    #[cfg(any(feature = "semantic", test))]
53    #[allow(unexpected_cfgs)]
54    pub fn to_json_compact(&self) -> Result<String, serde_json::Error> {
55        serde_json::to_string(self)
56    }
57
58    /// Get all entities of a specific type
59    pub fn entities_by_type(&self, entity_type: super::EntityType) -> Vec<&Entity> {
60        self.pages
61            .values()
62            .flat_map(|entities| entities.iter())
63            .filter(|e| e.entity_type == entity_type)
64            .collect()
65    }
66
67    /// Get all entities on a specific page
68    pub fn entities_on_page(&self, page: usize) -> Option<&Vec<Entity>> {
69        self.pages.get(&page)
70    }
71
72    /// Export to JSON-LD format with Schema.org context
73    #[cfg(any(feature = "semantic", test))]
74    #[allow(unexpected_cfgs)]
75    pub fn to_json_ld(&self) -> Result<String, serde_json::Error> {
76        let mut json_ld = json!({
77            "@context": "https://schema.org",
78            "@type": "DigitalDocument",
79            "additionalType": "AI-Ready PDF",
80            "hasPart": []
81        });
82
83        let mut parts = Vec::new();
84
85        for (page_num, entities) in &self.pages {
86            for entity in entities {
87                let entity_json = entity_to_schema_org(entity, *page_num);
88                parts.push(entity_json);
89            }
90        }
91
92        json_ld["hasPart"] = Value::Array(parts);
93
94        // Add schemas if any
95        if !self.schemas.is_empty() {
96            json_ld["conformsTo"] = json!(self.schemas);
97        }
98
99        // Add document metadata
100        if !self.document_metadata.is_empty() {
101            for (key, value) in &self.document_metadata {
102                json_ld[key] = json!(value);
103            }
104        }
105
106        serde_json::to_string_pretty(&json_ld)
107    }
108}
109
/// Returns the Schema.org term emitted as `@type` for the given entity type.
///
/// The match is deliberately exhaustive (no wildcard arm) so that adding a
/// new `EntityType` variant forces this table to be updated.
#[cfg(any(feature = "semantic", test))]
fn entity_type_to_schema_org(entity_type: &EntityType) -> &'static str {
    match entity_type {
        // Terms shared by more than one entity type
        EntityType::PaymentAmount | EntityType::ContractValue => "price",

        // Financial documents
        EntityType::Invoice => "Invoice",
        EntityType::InvoiceNumber => "identifier",
        EntityType::CustomerName => "customer",
        EntityType::TotalAmount => "totalPrice",
        EntityType::TaxAmount => "taxAmount",
        EntityType::DueDate => "paymentDueDate",
        EntityType::LineItem => "LineItem",

        // Identity and contact details
        EntityType::PersonName => "Person",
        EntityType::OrganizationName => "Organization",
        EntityType::Address => "PostalAddress",
        EntityType::PhoneNumber => "telephone",
        EntityType::Email => "email",
        EntityType::Website => "url",

        // Legal documents
        EntityType::Contract => "DigitalDocument",
        EntityType::ContractParty => "Party",
        EntityType::ContractTerm => "OfferCatalog",
        EntityType::EffectiveDate => "datePublished",
        EntityType::Signature => "signatureValue",

        // Document structure
        EntityType::Heading => "Heading",
        EntityType::Paragraph => "Paragraph",
        EntityType::Table => "Table",
        EntityType::List => "ItemList",
        EntityType::Image => "ImageObject",
        EntityType::Text => "Text",
        EntityType::Header => "WPHeader",
        EntityType::Footer => "WPFooter",
        EntityType::PageNumber => "pageStart",

        // Dates and numbers
        EntityType::Date => "Date",
        EntityType::Amount => "MonetaryAmount",
        EntityType::Quantity => "quantityValue",
        EntityType::Percentage => "ratingValue",

        // Anything user-defined falls back to the most generic term
        EntityType::Custom(_) => "Thing",
    }
}
161
/// Builds the JSON-LD node for a single entity.
///
/// The node always carries `@type`, a `spatialCoverage` whose `box` is the
/// four `bounds` components joined by commas, and `pageStart` set to
/// `page_num + 1`. `@id` is added only for a non-empty id. Metadata
/// properties are copied in as top-level keys, after which `confidence` and
/// `conformsTo` are written when present — later writes replace earlier keys
/// of the same name.
#[cfg(any(feature = "semantic", test))]
fn entity_to_schema_org(entity: &Entity, page_num: usize) -> Value {
    let type_name = entity_type_to_schema_org(&entity.entity_type);

    let bounds = &entity.bounds;
    let bounding_box = format!("{},{},{},{}", bounds.0, bounds.1, bounds.2, bounds.3);

    let mut node = json!({
        "@type": type_name,
        "spatialCoverage": {
            "@type": "Place",
            "geo": {
                "@type": "GeoCoordinates",
                "box": bounding_box
            }
        },
        "pageStart": page_num + 1
    });

    // Entities without an id are emitted as anonymous nodes.
    if !entity.id.is_empty() {
        node["@id"] = json!(entity.id);
    }

    // Free-form metadata properties become top-level keys of the node.
    for (name, value) in &entity.metadata.properties {
        node[name] = json!(value);
    }

    // Optional extraction confidence.
    if let Some(score) = entity.metadata.confidence {
        node["confidence"] = json!(score);
    }

    // Optional schema reference.
    if let Some(schema_ref) = &entity.metadata.schema {
        node["conformsTo"] = json!(schema_ref);
    }

    node
}
202
/// Export format options.
///
/// The default format is [`ExportFormat::Json`]; `Default` is derived through
/// the `#[default]` marker rather than implemented by hand.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum ExportFormat {
    /// JSON format (default)
    #[default]
    Json,
    /// JSON-LD with schema.org context
    JsonLd,
    /// XML format
    Xml,
}
219
// Unit tests for the entity map and its JSON / JSON-LD exports.
//
// The whole module is compiled only under `cfg(test)`, so the individual
// tests need no additional feature gating.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::semantic::{EntityMetadata, EntityType};

    /// Builds an entity with fixed bounds and empty metadata.
    fn create_test_entity(id: &str, page: usize, entity_type: EntityType) -> Entity {
        Entity {
            id: id.to_string(),
            entity_type,
            bounds: (0.0, 0.0, 100.0, 50.0),
            page,
            metadata: EntityMetadata::new(),
        }
    }

    #[test]
    fn test_entity_map_new() {
        let map = EntityMap::new();

        assert!(map.document_metadata.is_empty());
        assert!(map.pages.is_empty());
        assert!(map.schemas.is_empty());
    }

    #[test]
    fn test_entity_map_default() {
        let map = EntityMap::default();

        assert!(map.document_metadata.is_empty());
        assert!(map.pages.is_empty());
        assert!(map.schemas.is_empty());
    }

    #[test]
    fn test_add_entity() {
        let mut map = EntityMap::new();
        let entity = create_test_entity("e1", 1, EntityType::Text);

        map.add_entity(entity);

        assert!(map.pages.contains_key(&1));
        assert_eq!(map.pages.get(&1).unwrap().len(), 1);
    }

    #[test]
    fn test_add_multiple_entities_same_page() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Image));
        map.add_entity(create_test_entity("e3", 1, EntityType::Table));

        assert_eq!(map.pages.get(&1).unwrap().len(), 3);
    }

    #[test]
    fn test_add_entities_different_pages() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 2, EntityType::Image));
        map.add_entity(create_test_entity("e3", 3, EntityType::Table));

        assert_eq!(map.pages.len(), 3);
        assert!(map.pages.contains_key(&1));
        assert!(map.pages.contains_key(&2));
        assert!(map.pages.contains_key(&3));
    }

    #[test]
    fn test_entities_on_page() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Image));

        let page_entities = map.entities_on_page(1);
        assert!(page_entities.is_some());
        assert_eq!(page_entities.unwrap().len(), 2);

        let missing_page = map.entities_on_page(99);
        assert!(missing_page.is_none());
    }

    #[test]
    fn test_entities_by_type() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.add_entity(create_test_entity("e2", 1, EntityType::Text));
        map.add_entity(create_test_entity("e3", 2, EntityType::Image));
        map.add_entity(create_test_entity("e4", 2, EntityType::Text));

        let text_entities = map.entities_by_type(EntityType::Text);
        assert_eq!(text_entities.len(), 3);

        let image_entities = map.entities_by_type(EntityType::Image);
        assert_eq!(image_entities.len(), 1);

        let table_entities = map.entities_by_type(EntityType::Table);
        assert_eq!(table_entities.len(), 0);
    }

    #[test]
    fn test_export_format_default() {
        let format = ExportFormat::default();
        assert_eq!(format, ExportFormat::Json);
    }

    #[test]
    fn test_export_format_variants() {
        assert_eq!(ExportFormat::Json, ExportFormat::Json);
        assert_eq!(ExportFormat::JsonLd, ExportFormat::JsonLd);
        assert_eq!(ExportFormat::Xml, ExportFormat::Xml);
        assert_ne!(ExportFormat::Json, ExportFormat::JsonLd);
    }

    #[test]
    fn test_document_metadata() {
        let mut map = EntityMap::new();
        map.document_metadata
            .insert("title".to_string(), "Test Document".to_string());
        map.document_metadata
            .insert("author".to_string(), "Test Author".to_string());

        assert_eq!(
            map.document_metadata.get("title"),
            Some(&"Test Document".to_string())
        );
        assert_eq!(
            map.document_metadata.get("author"),
            Some(&"Test Author".to_string())
        );
    }

    #[test]
    fn test_schemas() {
        let mut map = EntityMap::new();
        map.schemas.push("https://schema.org".to_string());
        map.schemas.push("https://example.com/schema".to_string());

        assert_eq!(map.schemas.len(), 2);
        assert!(map.schemas.contains(&"https://schema.org".to_string()));
    }

    #[test]
    fn test_to_json() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));

        let json = map.to_json();
        assert!(json.is_ok());

        let json_str = json.unwrap();
        assert!(json_str.contains("pages"));
        assert!(json_str.contains("e1"));
    }

    #[test]
    fn test_to_json_compact() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));

        let json = map.to_json_compact();
        assert!(json.is_ok());

        let json_str = json.unwrap();
        // Compact JSON is emitted on a single line: no newlines at all.
        assert!(!json_str.contains('\n'));
    }

    #[test]
    fn test_to_json_ld() {
        let mut map = EntityMap::new();
        map.add_entity(create_test_entity("e1", 1, EntityType::Text));
        map.schemas.push("https://schema.org".to_string());

        let json_ld = map.to_json_ld();
        assert!(json_ld.is_ok());

        let json_str = json_ld.unwrap();
        assert!(json_str.contains("@context"));
        assert!(json_str.contains("schema.org"));
        assert!(json_str.contains("DigitalDocument"));
        assert!(json_str.contains("hasPart"));

        // Check the structure as well, not just the presence of substrings.
        let parsed: Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(parsed["@type"], "DigitalDocument");
        assert_eq!(parsed["hasPart"].as_array().map(Vec::len), Some(1));
        assert_eq!(parsed["hasPart"][0]["@id"], "e1");
        assert_eq!(parsed["conformsTo"][0], "https://schema.org");
    }

    #[test]
    fn test_entity_type_to_schema_org_financial() {
        assert_eq!(entity_type_to_schema_org(&EntityType::Invoice), "Invoice");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::InvoiceNumber),
            "identifier"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::TotalAmount),
            "totalPrice"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::TaxAmount),
            "taxAmount"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::DueDate),
            "paymentDueDate"
        );
    }

    #[test]
    fn test_entity_type_to_schema_org_identity() {
        assert_eq!(entity_type_to_schema_org(&EntityType::PersonName), "Person");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::OrganizationName),
            "Organization"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Address),
            "PostalAddress"
        );
        assert_eq!(
            entity_type_to_schema_org(&EntityType::PhoneNumber),
            "telephone"
        );
        assert_eq!(entity_type_to_schema_org(&EntityType::Email), "email");
    }

    #[test]
    fn test_entity_type_to_schema_org_structure() {
        assert_eq!(entity_type_to_schema_org(&EntityType::Heading), "Heading");
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Paragraph),
            "Paragraph"
        );
        assert_eq!(entity_type_to_schema_org(&EntityType::Table), "Table");
        assert_eq!(entity_type_to_schema_org(&EntityType::List), "ItemList");
        assert_eq!(entity_type_to_schema_org(&EntityType::Image), "ImageObject");
        assert_eq!(entity_type_to_schema_org(&EntityType::Text), "Text");
    }

    #[test]
    fn test_entity_type_to_schema_org_custom() {
        assert_eq!(
            entity_type_to_schema_org(&EntityType::Custom("MyType".to_string())),
            "Thing"
        );
    }

    #[test]
    fn test_entity_to_schema_org_basic() {
        let entity = create_test_entity("test_id", 1, EntityType::Text);
        let json = entity_to_schema_org(&entity, 1);

        assert_eq!(json["@type"], "Text");
        assert_eq!(json["@id"], "test_id");
        assert_eq!(json["pageStart"], 2); // page_num + 1
    }

    #[test]
    fn test_entity_to_schema_org_with_metadata() {
        let mut entity = create_test_entity("test_id", 0, EntityType::Invoice);
        entity.metadata = entity.metadata.with_property("total", "1000.00");
        entity.metadata = entity.metadata.with_confidence(0.95);
        entity.metadata = entity.metadata.with_schema("https://schema.org/Invoice");

        let json = entity_to_schema_org(&entity, 0);

        assert_eq!(json["@type"], "Invoice");
        assert_eq!(json["total"], "1000.00");
        // Use approximate comparison for f32 -> f64 conversion
        let confidence = json["confidence"].as_f64().unwrap();
        assert!((confidence - 0.95).abs() < 0.001);
        assert_eq!(json["conformsTo"], "https://schema.org/Invoice");
    }
}