oxidize_pdf/semantic/
export.rs

1//! Export functionality for semantic entities
2
3use super::Entity;
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7#[cfg(any(feature = "semantic", test))]
8use super::EntityType;
9
10#[cfg(any(feature = "semantic", test))]
11use serde_json::{json, Value};
12
13/// Map of entities organized by page
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct EntityMap {
16    /// Document-level metadata
17    pub document_metadata: HashMap<String, String>,
18    /// Entities organized by page number
19    pub pages: HashMap<usize, Vec<Entity>>,
20    /// Schema definitions used
21    pub schemas: Vec<String>,
22}
23
24impl Default for EntityMap {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl EntityMap {
31    pub fn new() -> Self {
32        Self {
33            document_metadata: HashMap::new(),
34            pages: HashMap::new(),
35            schemas: Vec::new(),
36        }
37    }
38
39    /// Add an entity to the map
40    pub fn add_entity(&mut self, entity: Entity) {
41        self.pages.entry(entity.page).or_default().push(entity);
42    }
43
44    /// Export to JSON string (requires serde_json feature)
45    #[cfg(any(feature = "semantic", test))]
46    #[allow(unexpected_cfgs)]
47    pub fn to_json(&self) -> Result<String, serde_json::Error> {
48        serde_json::to_string_pretty(self)
49    }
50
51    /// Export to JSON with custom options (requires serde_json feature)
52    #[cfg(any(feature = "semantic", test))]
53    #[allow(unexpected_cfgs)]
54    pub fn to_json_compact(&self) -> Result<String, serde_json::Error> {
55        serde_json::to_string(self)
56    }
57
58    /// Get all entities of a specific type
59    pub fn entities_by_type(&self, entity_type: super::EntityType) -> Vec<&Entity> {
60        self.pages
61            .values()
62            .flat_map(|entities| entities.iter())
63            .filter(|e| e.entity_type == entity_type)
64            .collect()
65    }
66
67    /// Get all entities on a specific page
68    pub fn entities_on_page(&self, page: usize) -> Option<&Vec<Entity>> {
69        self.pages.get(&page)
70    }
71
72    /// Export to JSON-LD format with Schema.org context
73    #[cfg(any(feature = "semantic", test))]
74    #[allow(unexpected_cfgs)]
75    pub fn to_json_ld(&self) -> Result<String, serde_json::Error> {
76        let mut json_ld = json!({
77            "@context": "https://schema.org",
78            "@type": "DigitalDocument",
79            "additionalType": "AI-Ready PDF",
80            "hasPart": []
81        });
82
83        let mut parts = Vec::new();
84
85        for (page_num, entities) in &self.pages {
86            for entity in entities {
87                let entity_json = entity_to_schema_org(entity, *page_num);
88                parts.push(entity_json);
89            }
90        }
91
92        json_ld["hasPart"] = Value::Array(parts);
93
94        // Add schemas if any
95        if !self.schemas.is_empty() {
96            json_ld["conformsTo"] = json!(self.schemas);
97        }
98
99        // Add document metadata
100        if !self.document_metadata.is_empty() {
101            for (key, value) in &self.document_metadata {
102                json_ld[key] = json!(value);
103            }
104        }
105
106        serde_json::to_string_pretty(&json_ld)
107    }
108}
109
/// Returns the schema.org term associated with an [`EntityType`].
///
/// Every variant maps to a fixed string; `Custom` always maps to the generic
/// `"Thing"`, whatever its payload.
#[cfg(any(feature = "semantic", test))]
fn entity_type_to_schema_org(entity_type: &EntityType) -> &'static str {
    match entity_type {
        // Variants rendered with capitalised, type-style names.
        EntityType::Invoice => "Invoice",
        EntityType::LineItem => "LineItem",
        EntityType::PersonName => "Person",
        EntityType::OrganizationName => "Organization",
        EntityType::Address => "PostalAddress",
        EntityType::Contract => "DigitalDocument",
        EntityType::ContractParty => "Party",
        EntityType::ContractTerm => "OfferCatalog",
        EntityType::Heading => "Heading",
        EntityType::Paragraph => "Paragraph",
        EntityType::Table => "Table",
        EntityType::List => "ItemList",
        EntityType::Image => "ImageObject",
        EntityType::Text => "Text",
        EntityType::Header => "WPHeader",
        EntityType::Footer => "WPFooter",
        EntityType::Date => "Date",
        EntityType::Amount => "MonetaryAmount",
        EntityType::Custom(_) => "Thing",

        // Variants rendered with lower-camel-case, property-style names.
        EntityType::InvoiceNumber => "identifier",
        EntityType::CustomerName => "customer",
        EntityType::TotalAmount => "totalPrice",
        EntityType::TaxAmount => "taxAmount",
        EntityType::DueDate => "paymentDueDate",
        EntityType::PaymentAmount | EntityType::ContractValue => "price",
        EntityType::PhoneNumber => "telephone",
        EntityType::Email => "email",
        EntityType::Website => "url",
        EntityType::EffectiveDate => "datePublished",
        EntityType::Signature => "signatureValue",
        EntityType::PageNumber => "pageStart",
        EntityType::Quantity => "quantityValue",
        EntityType::Percentage => "ratingValue",
    }
}
161
/// Builds the JSON-LD node for a single entity.
///
/// The node always carries `@type`, a `spatialCoverage` object whose `box` is
/// the four `bounds` components joined by commas, and `pageStart` set to
/// `page_num + 1`. `@id` is added only when the entity id is non-empty.
///
/// Keys are written in a fixed sequence: the base keys and `@id`, then every
/// metadata property, then `confidence` and `conformsTo`. A later write to an
/// existing key replaces its value, so a metadata property can replace a base
/// key, while `confidence`/`conformsTo` replace a same-named property.
#[cfg(any(feature = "semantic", test))]
fn entity_to_schema_org(entity: &Entity, page_num: usize) -> Value {
    let bounds = &entity.bounds;
    let bounding_box = format!("{},{},{},{}", bounds.0, bounds.1, bounds.2, bounds.3);

    let mut node = json!({
        "@type": entity_type_to_schema_org(&entity.entity_type),
        "spatialCoverage": {
            "@type": "Place",
            "geo": {
                "@type": "GeoCoordinates",
                "box": bounding_box
            }
        },
        "pageStart": page_num + 1
    });

    // Identifier, only when one was assigned.
    if !entity.id.is_empty() {
        node["@id"] = json!(entity.id);
    }

    // Free-form metadata properties become direct members of the node.
    let metadata = &entity.metadata;
    for (name, value) in &metadata.properties {
        node[name] = json!(value);
    }

    // Optional typed metadata is written after the free-form properties.
    if let Some(score) = metadata.confidence {
        node["confidence"] = json!(score);
    }
    if let Some(schema_ref) = &metadata.schema {
        node["conformsTo"] = json!(schema_ref);
    }

    node
}
202
/// Export format options.
///
/// [`ExportFormat::Json`] is the default, supplied by the derived [`Default`]
/// implementation via the `#[default]` marker on that variant.
#[derive(Debug, Clone, Copy, Default)]
pub enum ExportFormat {
    /// JSON format (default)
    #[default]
    Json,
    /// JSON-LD with schema.org context
    JsonLd,
    /// XML format
    Xml,
}