Skip to main content

oxidize_pdf/semantic/
entity.rs

1//! Entity types and metadata for semantic marking
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
/// Bounding box for entity regions
///
/// The rectangle is anchored at its left/bottom corner (`x`, `y`) and spans
/// `width` horizontally and `height` vertically on a single page.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
    /// X coordinate (left edge)
    pub x: f32,
    /// Y coordinate (bottom edge in PDF coordinates)
    pub y: f32,
    /// Width of the region
    pub width: f32,
    /// Height of the region
    pub height: f32,
    /// Page number (1-indexed)
    pub page: u32,
}
20
21impl BoundingBox {
22    /// Create a new bounding box
23    pub fn new(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
24        Self {
25            x,
26            y,
27            width,
28            height,
29            page,
30        }
31    }
32
33    /// Get the right edge coordinate
34    pub fn right(&self) -> f32 {
35        self.x + self.width
36    }
37
38    /// Get the top edge coordinate (in PDF coordinates)
39    pub fn top(&self) -> f32 {
40        self.y + self.height
41    }
42
43    /// Check if this bounding box intersects with another
44    pub fn intersects(&self, other: &BoundingBox) -> bool {
45        self.page == other.page
46            && self.x < other.right()
47            && self.right() > other.x
48            && self.y < other.top()
49            && self.top() > other.y
50    }
51
52    /// Get the area of this bounding box
53    pub fn area(&self) -> f32 {
54        self.width * self.height
55    }
56}
57
/// Relationship between entities
///
/// A relation is stored on the entity it originates from; it names the
/// entity at the other end by ID and says how the two are related.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EntityRelation {
    /// ID of the target entity
    pub target_id: String,
    /// Type of relationship
    pub relation_type: RelationType,
}
66
/// Types of relationships between entities
///
/// Variant names are renamed to camelCase for serialization
/// (for example `IsPartOf` becomes `isPartOf`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelationType {
    /// This entity contains the target entity
    Contains,
    /// This entity is part of the target entity
    IsPartOf,
    /// This entity references the target entity
    References,
    /// This entity follows the target entity (sequential)
    Follows,
    /// This entity precedes the target entity
    Precedes,
    /// Custom relationship type, identified by a caller-chosen name
    Custom(String),
}
84
/// Standard entity types available in all editions
///
/// Built-in variants are renamed to camelCase for serialization
/// (for example `PageNumber` becomes `pageNumber`); `Custom` is untagged and
/// carries a caller-chosen name instead.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum EntityType {
    // Document Structure
    /// Generic text region
    Text,
    /// Image or graphic
    Image,
    /// Table structure
    Table,
    /// Heading/Title
    Heading,
    /// Paragraph of text
    Paragraph,
    /// List (ordered or unordered)
    List,
    /// Page number
    PageNumber,
    /// Header region
    Header,
    /// Footer region
    Footer,

    // Financial Documents
    /// Invoice document
    Invoice,
    /// Invoice number/identifier
    InvoiceNumber,
    /// Customer name or organization
    CustomerName,
    /// Line item in an invoice
    LineItem,
    /// Total amount
    TotalAmount,
    /// Tax amount
    TaxAmount,
    /// Due date
    DueDate,
    /// Payment amount
    PaymentAmount,

    // Identity & Contact
    /// Person name
    PersonName,
    /// Organization/Company name
    OrganizationName,
    /// Address (street, city, etc.)
    Address,
    /// Phone number
    PhoneNumber,
    /// Email address
    Email,
    /// Website URL
    Website,

    // Legal Documents
    /// Contract document
    Contract,
    /// Contract party
    ContractParty,
    /// Contract term or clause
    ContractTerm,
    /// Effective date
    EffectiveDate,
    /// Contract value/amount
    ContractValue,
    /// Signature region
    Signature,

    // Dates and Numbers
    /// Generic date
    Date,
    /// Amount or monetary value
    Amount,
    /// Quantity or count
    Quantity,
    /// Percentage value
    Percentage,

    /// Custom entity type for extensibility (serialized as the inner string)
    ///
    /// NOTE(review): presumably a custom name that equals a built-in
    /// serialized name (e.g. `"text"`) would deserialize as the built-in
    /// variant rather than `Custom` — confirm before relying on round-trips.
    #[serde(untagged)]
    Custom(String),
}
169
/// Metadata associated with an entity
///
/// All fields are public, so values can be set directly as well as through
/// the `with_*` builder methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EntityMetadata {
    /// Key-value pairs of metadata
    pub properties: HashMap<String, String>,
    /// Confidence score (0.0 to 1.0); `None` when no score has been recorded
    pub confidence: Option<f32>,
    /// Schema URL if applicable
    pub schema: Option<String>,
}
180
/// Enhanced semantic entity with relationships
///
/// Combines an identified, typed page region with its text content,
/// metadata and outgoing relations to other entities.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticEntity {
    /// Unique identifier for this entity
    pub id: String,
    /// Type of entity (serialized under the key `type`)
    #[serde(rename = "type")]
    pub entity_type: EntityType,
    /// Geometric bounds of the entity
    pub bounds: BoundingBox,
    /// Text content of the entity (if applicable); empty when there is none
    pub content: String,
    /// Associated metadata
    pub metadata: EntityMetadata,
    /// Relationships to other entities
    pub relationships: Vec<EntityRelation>,
}
198
199impl SemanticEntity {
200    /// Create a new semantic entity
201    pub fn new(id: String, entity_type: EntityType, bounds: BoundingBox) -> Self {
202        Self {
203            id,
204            entity_type,
205            bounds,
206            content: String::new(),
207            metadata: EntityMetadata::new(),
208            relationships: Vec::new(),
209        }
210    }
211
212    /// Set the content text for this entity
213    pub fn with_content(mut self, content: impl Into<String>) -> Self {
214        self.content = content.into();
215        self
216    }
217
218    /// Add metadata to this entity
219    pub fn with_metadata(mut self, metadata: EntityMetadata) -> Self {
220        self.metadata = metadata;
221        self
222    }
223
224    /// Add a relationship to another entity
225    pub fn with_relationship(
226        mut self,
227        target_id: impl Into<String>,
228        relation_type: RelationType,
229    ) -> Self {
230        self.relationships.push(EntityRelation {
231            target_id: target_id.into(),
232            relation_type,
233        });
234        self
235    }
236
237    /// Add multiple relationships
238    pub fn with_relationships(mut self, relationships: Vec<EntityRelation>) -> Self {
239        self.relationships.extend(relationships);
240        self
241    }
242}
243
244impl Default for EntityMetadata {
245    fn default() -> Self {
246        Self::new()
247    }
248}
249
250impl EntityMetadata {
251    pub fn new() -> Self {
252        Self {
253            properties: HashMap::new(),
254            confidence: None,
255            schema: None,
256        }
257    }
258
259    pub fn with_property(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
260        self.properties.insert(key.into(), value.into());
261        self
262    }
263
264    pub fn with_confidence(mut self, confidence: f32) -> Self {
265        self.confidence = Some(confidence.clamp(0.0, 1.0));
266        self
267    }
268
269    pub fn with_schema(mut self, schema: impl Into<String>) -> Self {
270        self.schema = Some(schema.into());
271        self
272    }
273}
274
/// A marked entity in the PDF (backward compatibility)
///
/// Unlike [`SemanticEntity`], the bounds are a plain tuple and the page
/// number is stored separately and counted from zero.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Unique identifier for this entity
    pub id: String,
    /// Type of entity (serialized under the key `type`)
    #[serde(rename = "type")]
    pub entity_type: EntityType,
    /// Bounding box (x, y, width, height)
    pub bounds: (f64, f64, f64, f64),
    /// Page number (0-indexed)
    pub page: usize,
    /// Associated metadata
    pub metadata: EntityMetadata,
}
290
291impl Entity {
292    pub fn new(
293        id: String,
294        entity_type: EntityType,
295        bounds: (f64, f64, f64, f64),
296        page: usize,
297    ) -> Self {
298        Self {
299            id,
300            entity_type,
301            bounds,
302            page,
303            metadata: EntityMetadata::new(),
304        }
305    }
306}
307
#[cfg(test)]
mod tests {
    // Unit tests for the entity data model: type identity, metadata builders,
    // serialization names, bounding-box geometry and the entity constructors.
    use super::*;
    use std::collections::HashSet;

    /// Distinct entity types must never compare or hash as equal, and
    /// `Custom` types must be told apart by their name.
    #[test]
    fn test_entity_type_variants() {
        let types = vec![
            EntityType::Text,
            EntityType::Image,
            EntityType::Table,
            EntityType::Heading,
            EntityType::Paragraph,
            EntityType::List,
            EntityType::PageNumber,
            EntityType::Header,
            EntityType::Footer,
            EntityType::Invoice,
            EntityType::InvoiceNumber,
            EntityType::CustomerName,
            EntityType::Custom("TestType".to_string()),
        ];

        // Each entry equals itself and nothing else in the list.
        for (i, left) in types.iter().enumerate() {
            for (j, right) in types.iter().enumerate() {
                assert_eq!(
                    left == right,
                    i == j,
                    "unexpected equality result for {:?} vs {:?}",
                    left,
                    right
                );
            }
        }

        // Hashing agrees with equality: no two entries collapse into one.
        let unique: HashSet<EntityType> = types.iter().cloned().collect();
        assert_eq!(unique.len(), types.len());

        // Custom types are identified by their payload.
        assert_eq!(
            EntityType::Custom("TestType".to_string()),
            EntityType::Custom("TestType".to_string())
        );
        assert_ne!(
            EntityType::Custom("TestType".to_string()),
            EntityType::Custom("OtherType".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_new() {
        let metadata = EntityMetadata::new();
        assert!(metadata.properties.is_empty());
        assert!(metadata.confidence.is_none());
        assert!(metadata.schema.is_none());
    }

    #[test]
    fn test_entity_metadata_default_matches_new() {
        assert_eq!(EntityMetadata::default(), EntityMetadata::new());
    }

    #[test]
    fn test_entity_metadata_with_property() {
        let metadata = EntityMetadata::new()
            .with_property("author", "John Doe")
            .with_property("title", "Test Document");

        assert_eq!(metadata.properties.len(), 2);
        assert_eq!(
            metadata.properties.get("author"),
            Some(&"John Doe".to_string())
        );
        assert_eq!(
            metadata.properties.get("title"),
            Some(&"Test Document".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_with_confidence() {
        let metadata = EntityMetadata::new().with_confidence(0.95);
        assert_eq!(metadata.confidence, Some(0.95));

        // Test clamping
        let metadata_high = EntityMetadata::new().with_confidence(1.5);
        assert_eq!(metadata_high.confidence, Some(1.0));

        let metadata_low = EntityMetadata::new().with_confidence(-0.5);
        assert_eq!(metadata_low.confidence, Some(0.0));
    }

    #[test]
    fn test_entity_metadata_with_schema() {
        let metadata = EntityMetadata::new().with_schema("https://schema.org/Article");
        assert_eq!(
            metadata.schema,
            Some("https://schema.org/Article".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_builder_chain() {
        let metadata = EntityMetadata::new()
            .with_property("lang", "en")
            .with_property("version", "1.0")
            .with_confidence(0.85)
            .with_schema("https://example.com/schema");

        assert_eq!(metadata.properties.len(), 2);
        assert_eq!(metadata.confidence, Some(0.85));
        assert!(metadata.schema.is_some());
    }

    #[test]
    fn test_entity_new() {
        let entity = Entity::new(
            "entity-1".to_string(),
            EntityType::Paragraph,
            (10.0, 20.0, 100.0, 50.0),
            0,
        );

        assert_eq!(entity.id, "entity-1");
        assert_eq!(entity.entity_type, EntityType::Paragraph);
        assert_eq!(entity.bounds, (10.0, 20.0, 100.0, 50.0));
        assert_eq!(entity.page, 0);
        assert!(entity.metadata.properties.is_empty());
    }

    #[test]
    fn test_entity_with_metadata() {
        let mut entity = Entity::new(
            "heading-1".to_string(),
            EntityType::Heading,
            (0.0, 0.0, 200.0, 30.0),
            1,
        );

        entity.metadata = EntityMetadata::new()
            .with_property("level", "1")
            .with_property("text", "Introduction")
            .with_confidence(0.98);

        assert_eq!(
            entity.metadata.properties.get("level"),
            Some(&"1".to_string())
        );
        assert_eq!(
            entity.metadata.properties.get("text"),
            Some(&"Introduction".to_string())
        );
        assert_eq!(entity.metadata.confidence, Some(0.98));
    }

    #[test]
    fn test_entity_serialization() {
        let entity = Entity::new(
            "test-entity".to_string(),
            EntityType::Image,
            (50.0, 50.0, 150.0, 100.0),
            2,
        );

        // Test that entity can be serialized
        let json = serde_json::to_string(&entity).unwrap();
        assert!(json.contains("\"id\":\"test-entity\""));
        assert!(json.contains("\"type\":\"image\""));

        // Test deserialization
        let deserialized: Entity = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.id, entity.id);
        assert_eq!(deserialized.entity_type, entity.entity_type);
    }

    #[test]
    fn test_entity_type_serialization() {
        // Test that EntityType serializes to camelCase
        let entity_type = EntityType::PageNumber;
        let json = serde_json::to_string(&entity_type).unwrap();
        assert_eq!(json, "\"pageNumber\"");

        // Test deserialization
        let deserialized: EntityType = serde_json::from_str("\"pageNumber\"").unwrap();
        assert_eq!(deserialized, EntityType::PageNumber);
    }

    #[test]
    fn test_multiple_entities() {
        let entities = vec![
            Entity::new(
                "e1".to_string(),
                EntityType::Header,
                (0.0, 0.0, 100.0, 20.0),
                0,
            ),
            Entity::new(
                "e2".to_string(),
                EntityType::Paragraph,
                (0.0, 20.0, 100.0, 80.0),
                0,
            ),
            Entity::new(
                "e3".to_string(),
                EntityType::Footer,
                (0.0, 100.0, 100.0, 20.0),
                0,
            ),
        ];

        assert_eq!(entities.len(), 3);
        assert_eq!(entities[0].entity_type, EntityType::Header);
        assert_eq!(entities[1].entity_type, EntityType::Paragraph);
        assert_eq!(entities[2].entity_type, EntityType::Footer);
    }

    #[test]
    fn test_entity_bounds() {
        let entity = Entity::new(
            "table-1".to_string(),
            EntityType::Table,
            (25.5, 30.75, 200.25, 150.5),
            5,
        );

        let (x, y, width, height) = entity.bounds;
        assert_eq!(x, 25.5);
        assert_eq!(y, 30.75);
        assert_eq!(width, 200.25);
        assert_eq!(height, 150.5);
    }

    #[test]
    fn test_metadata_multiple_properties() {
        let mut metadata = EntityMetadata::new();

        // Add properties one by one
        for i in 0..10 {
            metadata
                .properties
                .insert(format!("key{}", i), format!("value{}", i));
        }

        assert_eq!(metadata.properties.len(), 10);
        assert_eq!(metadata.properties.get("key5"), Some(&"value5".to_string()));
    }

    #[test]
    fn test_entity_list_type() {
        let list_entity = Entity::new(
            "list-1".to_string(),
            EntityType::List,
            (10.0, 10.0, 180.0, 100.0),
            0,
        );

        // Add list-specific metadata
        let mut entity = list_entity;
        entity.metadata = EntityMetadata::new()
            .with_property("list_type", "ordered")
            .with_property("item_count", "5");

        assert_eq!(entity.entity_type, EntityType::List);
        assert_eq!(
            entity.metadata.properties.get("list_type"),
            Some(&"ordered".to_string())
        );
    }

    #[test]
    fn test_confidence_edge_cases() {
        // Test exact boundaries
        let metadata1 = EntityMetadata::new().with_confidence(0.0);
        assert_eq!(metadata1.confidence, Some(0.0));

        let metadata2 = EntityMetadata::new().with_confidence(1.0);
        assert_eq!(metadata2.confidence, Some(1.0));

        // Test normal value
        let metadata3 = EntityMetadata::new().with_confidence(0.5);
        assert_eq!(metadata3.confidence, Some(0.5));
    }

    #[test]
    fn test_financial_entity_types() {
        let invoice = Entity::new(
            "invoice_001".to_string(),
            EntityType::Invoice,
            (0.0, 0.0, 500.0, 600.0),
            0,
        );

        let invoice_number = Entity::new(
            "inv_num_001".to_string(),
            EntityType::InvoiceNumber,
            (100.0, 700.0, 150.0, 20.0),
            0,
        );

        assert_eq!(invoice.entity_type, EntityType::Invoice);
        assert_eq!(invoice_number.entity_type, EntityType::InvoiceNumber);
    }

    #[test]
    fn test_custom_entity_type() {
        let custom_entity = Entity::new(
            "custom_001".to_string(),
            EntityType::Custom("PurchaseOrder".to_string()),
            (0.0, 0.0, 400.0, 500.0),
            0,
        );

        assert_eq!(
            custom_entity.entity_type,
            EntityType::Custom("PurchaseOrder".to_string())
        );

        // The custom variant is untagged, so it must serialize as the bare
        // inner string rather than as a tagged object.
        let json = serde_json::to_string(&custom_entity.entity_type).unwrap();
        assert_eq!(json, "\"PurchaseOrder\"");
    }

    #[test]
    fn test_invoice_entity_with_metadata() {
        let mut invoice = Entity::new(
            "invoice_123".to_string(),
            EntityType::Invoice,
            (50.0, 50.0, 450.0, 700.0),
            0,
        );

        invoice.metadata = EntityMetadata::new()
            .with_property("invoice_number", "INV-2024-001")
            .with_property("total_amount", "1234.56")
            .with_property("currency", "USD")
            .with_confidence(0.98)
            .with_schema("https://schema.org/Invoice");

        assert_eq!(
            invoice.metadata.properties.get("invoice_number"),
            Some(&"INV-2024-001".to_string())
        );
        assert_eq!(
            invoice.metadata.properties.get("total_amount"),
            Some(&"1234.56".to_string())
        );
        assert_eq!(invoice.metadata.confidence, Some(0.98));
    }

    #[test]
    fn test_bounding_box_edges_and_area() {
        let bounds = BoundingBox::new(10.0, 20.0, 100.0, 50.0, 1);

        assert_eq!(bounds.x, 10.0);
        assert_eq!(bounds.y, 20.0);
        assert_eq!(bounds.page, 1);
        assert_eq!(bounds.right(), 110.0);
        assert_eq!(bounds.top(), 70.0);
        assert_eq!(bounds.area(), 5000.0);
    }

    #[test]
    fn test_bounding_box_intersects() {
        let base = BoundingBox::new(10.0, 20.0, 100.0, 50.0, 1);

        // Overlap is symmetric
        let overlapping = BoundingBox::new(50.0, 40.0, 100.0, 50.0, 1);
        assert!(base.intersects(&overlapping));
        assert!(overlapping.intersects(&base));

        // Sharing only an edge does not count as an intersection
        let touching = BoundingBox::new(110.0, 20.0, 10.0, 10.0, 1);
        assert!(!base.intersects(&touching));

        // Fully separate boxes on the same page
        let separate = BoundingBox::new(500.0, 500.0, 10.0, 10.0, 1);
        assert!(!base.intersects(&separate));

        // Identical geometry on another page never intersects
        let other_page = BoundingBox::new(10.0, 20.0, 100.0, 50.0, 2);
        assert!(!base.intersects(&other_page));
    }

    #[test]
    fn test_semantic_entity_builder() {
        let bounds = BoundingBox::new(0.0, 0.0, 200.0, 100.0, 1);

        let fresh = SemanticEntity::new("inv-1".to_string(), EntityType::Invoice, bounds.clone());
        assert_eq!(fresh.id, "inv-1");
        assert_eq!(fresh.entity_type, EntityType::Invoice);
        assert_eq!(fresh.bounds, bounds);
        assert!(fresh.content.is_empty());
        assert_eq!(fresh.metadata, EntityMetadata::new());
        assert!(fresh.relationships.is_empty());

        let entity = fresh
            .with_content("Invoice #1")
            .with_metadata(EntityMetadata::new().with_property("currency", "USD"))
            .with_relationship("line-1", RelationType::Contains)
            .with_relationships(vec![EntityRelation {
                target_id: "line-2".to_string(),
                relation_type: RelationType::Contains,
            }]);

        assert_eq!(entity.content, "Invoice #1");
        assert_eq!(
            entity.metadata.properties.get("currency"),
            Some(&"USD".to_string())
        );
        assert_eq!(entity.relationships.len(), 2);
        assert_eq!(entity.relationships[0].target_id, "line-1");
        assert_eq!(entity.relationships[0].relation_type, RelationType::Contains);
        assert_eq!(entity.relationships[1].target_id, "line-2");
    }
}