1use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
/// Axis-aligned rectangle located on a specific page.
///
/// The rectangle spans `x..x + width` horizontally and `y..y + height`
/// vertically (see [`BoundingBox::right`] and [`BoundingBox::top`]).
///
/// NOTE(review): the unit of measure and the origin/axis direction are not
/// fixed anywhere in this file — confirm against whatever produces the values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
    /// Horizontal coordinate of the box origin.
    pub x: f32,
    /// Vertical coordinate of the box origin.
    pub y: f32,
    /// Horizontal extent; added to `x` to obtain the right edge.
    pub width: f32,
    /// Vertical extent; added to `y` to obtain the top edge.
    pub height: f32,
    /// Page the box belongs to; boxes on different pages never intersect.
    /// NOTE(review): zero- vs one-based numbering is not established here.
    pub page: u32,
}
20
21impl BoundingBox {
22 pub fn new(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
24 Self {
25 x,
26 y,
27 width,
28 height,
29 page,
30 }
31 }
32
33 pub fn right(&self) -> f32 {
35 self.x + self.width
36 }
37
38 pub fn top(&self) -> f32 {
40 self.y + self.height
41 }
42
43 pub fn intersects(&self, other: &BoundingBox) -> bool {
45 self.page == other.page
46 && self.x < other.right()
47 && self.right() > other.x
48 && self.y < other.top()
49 && self.top() > other.y
50 }
51
52 pub fn area(&self) -> f32 {
54 self.width * self.height
55 }
56}
57
/// Directed link from the entity that owns it to another entity.
///
/// Instances are stored in `SemanticEntity::relationships`; the owning entity
/// is implicit, only the target is recorded.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EntityRelation {
    /// Identifier of the entity being pointed at.
    /// NOTE(review): presumably matches another entity's `id` — nothing in
    /// this file validates that the target exists.
    pub target_id: String,
    /// Kind of relationship that holds towards `target_id`.
    pub relation_type: RelationType,
}
66
/// Kind of relationship recorded by an [`EntityRelation`].
///
/// Variant names are serialized in camelCase (`rename_all = "camelCase"`).
/// Unlike `EntityType::Custom`, the `Custom` variant here carries no
/// `#[serde(untagged)]` attribute, so it uses serde's default enum
/// representation rather than a bare string.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelationType {
    /// The owning entity contains the target.
    Contains,
    /// The owning entity is part of the target.
    IsPartOf,
    /// The owning entity references the target.
    References,
    /// The owning entity follows the target.
    /// NOTE(review): whether this means reading order or layout order is not
    /// established in this file.
    Follows,
    /// The owning entity precedes the target.
    Precedes,
    /// Caller-defined relationship identified by the wrapped name.
    Custom(String),
}
84
/// Classification of an entity found in a document.
///
/// Unit variants are serialized as camelCase strings
/// (`rename_all = "camelCase"`), e.g. `PageNumber` <-> `"pageNumber"`.
/// The type is `Hash + Eq`, so it can be used as a map or set key.
///
/// The group comments below are inferred from the variant names only.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum EntityType {
    // Document structure / layout elements.
    Text,
    Image,
    Table,
    Heading,
    Paragraph,
    List,
    PageNumber,
    Header,
    Footer,

    // Invoice-related entities.
    Invoice,
    InvoiceNumber,
    CustomerName,
    LineItem,
    TotalAmount,
    TaxAmount,
    DueDate,
    PaymentAmount,

    // People, organizations and contact details.
    PersonName,
    OrganizationName,
    Address,
    PhoneNumber,
    Email,
    Website,

    // Contract-related entities.
    Contract,
    ContractParty,
    ContractTerm,
    EffectiveDate,
    ContractValue,
    Signature,

    // Generic typed values.
    Date,
    Amount,
    Quantity,
    Percentage,

    /// Caller-defined type identified by the wrapped name.
    ///
    /// Marked `#[serde(untagged)]`: per serde's variant-level `untagged`
    /// attribute the value is written as the bare inner string, and a string
    /// matching none of the named variants deserializes into this variant.
    /// NOTE(review): a custom name equal to a built-in serialized name (e.g.
    /// `"image"`) would presumably come back as the built-in variant after a
    /// round trip — confirm whether callers need to guard against that.
    #[serde(untagged)]
    Custom(String),
}
169
/// Free-form annotations attached to an entity.
///
/// All fields are public; the `with_*` builder methods on this type are a
/// convenience, not the only way to populate it.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EntityMetadata {
    /// Arbitrary string key/value pairs.
    pub properties: HashMap<String, String>,
    /// Optional confidence score. `with_confidence` clamps its argument into
    /// `0.0..=1.0`; values assigned directly to this field are not clamped.
    pub confidence: Option<f32>,
    /// Optional schema identifier; the tests in this file use URLs such as
    /// `https://schema.org/Article`.
    pub schema: Option<String>,
}
180
/// Entity with a location, textual content, metadata and links to other
/// entities.
///
/// Built with `SemanticEntity::new` followed by the chainable `with_*`
/// methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SemanticEntity {
    /// Identifier of this entity.
    /// NOTE(review): uniqueness is not enforced anywhere in this file.
    pub id: String,
    /// Classification of the entity; serialized under the key `type`.
    #[serde(rename = "type")]
    pub entity_type: EntityType,
    /// Location of the entity, including its page.
    pub bounds: BoundingBox,
    /// Text associated with the entity; empty right after construction.
    pub content: String,
    /// Additional annotations (properties, confidence, schema).
    pub metadata: EntityMetadata,
    /// Outgoing relationships, in the order they were added.
    pub relationships: Vec<EntityRelation>,
}
198
199impl SemanticEntity {
200 pub fn new(id: String, entity_type: EntityType, bounds: BoundingBox) -> Self {
202 Self {
203 id,
204 entity_type,
205 bounds,
206 content: String::new(),
207 metadata: EntityMetadata::new(),
208 relationships: Vec::new(),
209 }
210 }
211
212 pub fn with_content(mut self, content: impl Into<String>) -> Self {
214 self.content = content.into();
215 self
216 }
217
218 pub fn with_metadata(mut self, metadata: EntityMetadata) -> Self {
220 self.metadata = metadata;
221 self
222 }
223
224 pub fn with_relationship(
226 mut self,
227 target_id: impl Into<String>,
228 relation_type: RelationType,
229 ) -> Self {
230 self.relationships.push(EntityRelation {
231 target_id: target_id.into(),
232 relation_type,
233 });
234 self
235 }
236
237 pub fn with_relationships(mut self, relationships: Vec<EntityRelation>) -> Self {
239 self.relationships.extend(relationships);
240 self
241 }
242}
243
244impl Default for EntityMetadata {
245 fn default() -> Self {
246 Self::new()
247 }
248}
249
250impl EntityMetadata {
251 pub fn new() -> Self {
252 Self {
253 properties: HashMap::new(),
254 confidence: None,
255 schema: None,
256 }
257 }
258
259 pub fn with_property(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
260 self.properties.insert(key.into(), value.into());
261 self
262 }
263
264 pub fn with_confidence(mut self, confidence: f32) -> Self {
265 self.confidence = Some(confidence.clamp(0.0, 1.0));
266 self
267 }
268
269 pub fn with_schema(mut self, schema: impl Into<String>) -> Self {
270 self.schema = Some(schema.into());
271 self
272 }
273}
274
/// Flat entity record: identifier, type, tuple bounds, page and metadata.
///
/// Compared with `SemanticEntity` it has no content or relationships, keeps
/// the page as a separate field, and does not derive `PartialEq`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Identifier of this entity.
    pub id: String,
    /// Classification of the entity; serialized under the key `type`.
    #[serde(rename = "type")]
    pub entity_type: EntityType,
    /// Bounds as a 4-tuple; the tests in this file destructure it as
    /// `(x, y, width, height)`.
    pub bounds: (f64, f64, f64, f64),
    /// Page the entity is on.
    /// NOTE(review): the tests use `0` for the first value, which suggests
    /// zero-based numbering — confirm.
    pub page: usize,
    /// Additional annotations (properties, confidence, schema).
    pub metadata: EntityMetadata,
}
290
291impl Entity {
292 pub fn new(
293 id: String,
294 entity_type: EntityType,
295 bounds: (f64, f64, f64, f64),
296 page: usize,
297 ) -> Self {
298 Self {
299 id,
300 entity_type,
301 bounds,
302 page,
303 metadata: EntityMetadata::new(),
304 }
305 }
306}
307
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Every variant must be distinguishable from every other one, both via
    /// `Eq` and via `Hash`, and `Custom` must compare by its payload.
    #[test]
    fn test_entity_type_variants() {
        let types = vec![
            EntityType::Text,
            EntityType::Image,
            EntityType::Table,
            EntityType::Heading,
            EntityType::Paragraph,
            EntityType::List,
            EntityType::PageNumber,
            EntityType::Header,
            EntityType::Footer,
            EntityType::Invoice,
            EntityType::InvoiceNumber,
            EntityType::CustomerName,
            EntityType::LineItem,
            EntityType::TotalAmount,
            EntityType::TaxAmount,
            EntityType::DueDate,
            EntityType::PaymentAmount,
            EntityType::PersonName,
            EntityType::OrganizationName,
            EntityType::Address,
            EntityType::PhoneNumber,
            EntityType::Email,
            EntityType::Website,
            EntityType::Contract,
            EntityType::ContractParty,
            EntityType::ContractTerm,
            EntityType::EffectiveDate,
            EntityType::ContractValue,
            EntityType::Signature,
            EntityType::Date,
            EntityType::Amount,
            EntityType::Quantity,
            EntityType::Percentage,
            EntityType::Custom("TestType".to_string()),
        ];

        // Collapsing into a set would lose elements if two variants compared
        // or hashed as equal.
        let unique: HashSet<EntityType> = types.iter().cloned().collect();
        assert_eq!(unique.len(), types.len());

        // Custom variants are identified by their payload.
        assert!(unique.contains(&EntityType::Custom("TestType".to_string())));
        assert!(!unique.contains(&EntityType::Custom("OtherType".to_string())));
        assert_ne!(
            EntityType::Custom("TestType".to_string()),
            EntityType::Custom("OtherType".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_new() {
        let metadata = EntityMetadata::new();
        assert!(metadata.properties.is_empty());
        assert!(metadata.confidence.is_none());
        assert!(metadata.schema.is_none());
    }

    #[test]
    fn test_entity_metadata_with_property() {
        let metadata = EntityMetadata::new()
            .with_property("author", "John Doe")
            .with_property("title", "Test Document");

        assert_eq!(metadata.properties.len(), 2);
        assert_eq!(
            metadata.properties.get("author"),
            Some(&"John Doe".to_string())
        );
        assert_eq!(
            metadata.properties.get("title"),
            Some(&"Test Document".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_with_confidence() {
        let metadata = EntityMetadata::new().with_confidence(0.95);
        assert_eq!(metadata.confidence, Some(0.95));

        // Values above the range are clamped down to 1.0.
        let metadata_high = EntityMetadata::new().with_confidence(1.5);
        assert_eq!(metadata_high.confidence, Some(1.0));

        // Values below the range are clamped up to 0.0.
        let metadata_low = EntityMetadata::new().with_confidence(-0.5);
        assert_eq!(metadata_low.confidence, Some(0.0));
    }

    #[test]
    fn test_entity_metadata_with_schema() {
        let metadata = EntityMetadata::new().with_schema("https://schema.org/Article");
        assert_eq!(
            metadata.schema,
            Some("https://schema.org/Article".to_string())
        );
    }

    #[test]
    fn test_entity_metadata_builder_chain() {
        let metadata = EntityMetadata::new()
            .with_property("lang", "en")
            .with_property("version", "1.0")
            .with_confidence(0.85)
            .with_schema("https://example.com/schema");

        assert_eq!(metadata.properties.len(), 2);
        assert_eq!(metadata.confidence, Some(0.85));
        assert!(metadata.schema.is_some());
    }

    #[test]
    fn test_entity_new() {
        let entity = Entity::new(
            "entity-1".to_string(),
            EntityType::Paragraph,
            (10.0, 20.0, 100.0, 50.0),
            0,
        );

        assert_eq!(entity.id, "entity-1");
        assert_eq!(entity.entity_type, EntityType::Paragraph);
        assert_eq!(entity.bounds, (10.0, 20.0, 100.0, 50.0));
        assert_eq!(entity.page, 0);
        assert!(entity.metadata.properties.is_empty());
    }

    #[test]
    fn test_entity_with_metadata() {
        let mut entity = Entity::new(
            "heading-1".to_string(),
            EntityType::Heading,
            (0.0, 0.0, 200.0, 30.0),
            1,
        );

        entity.metadata = EntityMetadata::new()
            .with_property("level", "1")
            .with_property("text", "Introduction")
            .with_confidence(0.98);

        assert_eq!(
            entity.metadata.properties.get("level"),
            Some(&"1".to_string())
        );
        assert_eq!(
            entity.metadata.properties.get("text"),
            Some(&"Introduction".to_string())
        );
        assert_eq!(entity.metadata.confidence, Some(0.98));
    }

    #[test]
    fn test_entity_serialization() {
        let entity = Entity::new(
            "test-entity".to_string(),
            EntityType::Image,
            (50.0, 50.0, 150.0, 100.0),
            2,
        );

        // The entity type is written under the renamed key `type`.
        let json = serde_json::to_string(&entity).unwrap();
        assert!(json.contains("\"id\":\"test-entity\""));
        assert!(json.contains("\"type\":\"image\""));

        // Every field must survive the round trip, not just the identifier.
        let deserialized: Entity = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.id, entity.id);
        assert_eq!(deserialized.entity_type, entity.entity_type);
        assert_eq!(deserialized.bounds, entity.bounds);
        assert_eq!(deserialized.page, entity.page);
        assert_eq!(deserialized.metadata, entity.metadata);
    }

    #[test]
    fn test_entity_type_serialization() {
        let entity_type = EntityType::PageNumber;
        // Variant names are serialized in camelCase.
        let json = serde_json::to_string(&entity_type).unwrap();
        assert_eq!(json, "\"pageNumber\"");

        let deserialized: EntityType = serde_json::from_str("\"pageNumber\"").unwrap();
        assert_eq!(deserialized, EntityType::PageNumber);
    }

    #[test]
    fn test_multiple_entities() {
        let entities = vec![
            Entity::new(
                "e1".to_string(),
                EntityType::Header,
                (0.0, 0.0, 100.0, 20.0),
                0,
            ),
            Entity::new(
                "e2".to_string(),
                EntityType::Paragraph,
                (0.0, 20.0, 100.0, 80.0),
                0,
            ),
            Entity::new(
                "e3".to_string(),
                EntityType::Footer,
                (0.0, 100.0, 100.0, 20.0),
                0,
            ),
        ];

        assert_eq!(entities.len(), 3);
        assert_eq!(entities[0].entity_type, EntityType::Header);
        assert_eq!(entities[1].entity_type, EntityType::Paragraph);
        assert_eq!(entities[2].entity_type, EntityType::Footer);
    }

    #[test]
    fn test_entity_bounds() {
        let entity = Entity::new(
            "table-1".to_string(),
            EntityType::Table,
            (25.5, 30.75, 200.25, 150.5),
            5,
        );

        let (x, y, width, height) = entity.bounds;
        assert_eq!(x, 25.5);
        assert_eq!(y, 30.75);
        assert_eq!(width, 200.25);
        assert_eq!(height, 150.5);
    }

    #[test]
    fn test_metadata_multiple_properties() {
        let mut metadata = EntityMetadata::new();

        // Populate the public map directly, bypassing the builder.
        for i in 0..10 {
            metadata
                .properties
                .insert(format!("key{}", i), format!("value{}", i));
        }

        assert_eq!(metadata.properties.len(), 10);
        assert_eq!(metadata.properties.get("key5"), Some(&"value5".to_string()));
    }

    #[test]
    fn test_entity_list_type() {
        let mut entity = Entity::new(
            "list-1".to_string(),
            EntityType::List,
            (10.0, 10.0, 180.0, 100.0),
            0,
        );

        entity.metadata = EntityMetadata::new()
            .with_property("list_type", "ordered")
            .with_property("item_count", "5");

        assert_eq!(entity.entity_type, EntityType::List);
        assert_eq!(
            entity.metadata.properties.get("list_type"),
            Some(&"ordered".to_string())
        );
        assert_eq!(
            entity.metadata.properties.get("item_count"),
            Some(&"5".to_string())
        );
    }

    #[test]
    fn test_confidence_edge_cases() {
        // Both ends of the range are kept as-is.
        let metadata1 = EntityMetadata::new().with_confidence(0.0);
        assert_eq!(metadata1.confidence, Some(0.0));

        let metadata2 = EntityMetadata::new().with_confidence(1.0);
        assert_eq!(metadata2.confidence, Some(1.0));

        let metadata3 = EntityMetadata::new().with_confidence(0.5);
        assert_eq!(metadata3.confidence, Some(0.5));
    }

    #[test]
    fn test_financial_entity_types() {
        let invoice = Entity::new(
            "invoice_001".to_string(),
            EntityType::Invoice,
            (0.0, 0.0, 500.0, 600.0),
            0,
        );

        let invoice_number = Entity::new(
            "inv_num_001".to_string(),
            EntityType::InvoiceNumber,
            (100.0, 700.0, 150.0, 20.0),
            0,
        );

        assert_eq!(invoice.entity_type, EntityType::Invoice);
        assert_eq!(invoice_number.entity_type, EntityType::InvoiceNumber);
    }

    #[test]
    fn test_custom_entity_type() {
        let custom_entity = Entity::new(
            "custom_001".to_string(),
            EntityType::Custom("PurchaseOrder".to_string()),
            (0.0, 0.0, 400.0, 500.0),
            0,
        );

        assert_eq!(
            custom_entity.entity_type,
            EntityType::Custom("PurchaseOrder".to_string())
        );

        // The custom name must appear in the serialized form.
        let json = serde_json::to_string(&custom_entity.entity_type).unwrap();
        assert!(json.contains("PurchaseOrder"));
    }

    #[test]
    fn test_invoice_entity_with_metadata() {
        let mut invoice = Entity::new(
            "invoice_123".to_string(),
            EntityType::Invoice,
            (50.0, 50.0, 450.0, 700.0),
            0,
        );

        invoice.metadata = EntityMetadata::new()
            .with_property("invoice_number", "INV-2024-001")
            .with_property("total_amount", "1234.56")
            .with_property("currency", "USD")
            .with_confidence(0.98)
            .with_schema("https://schema.org/Invoice");

        assert_eq!(
            invoice.metadata.properties.get("invoice_number"),
            Some(&"INV-2024-001".to_string())
        );
        assert_eq!(
            invoice.metadata.properties.get("total_amount"),
            Some(&"1234.56".to_string())
        );
        assert_eq!(invoice.metadata.confidence, Some(0.98));
    }

    #[test]
    fn test_entity_metadata_default_matches_new() {
        assert_eq!(EntityMetadata::default(), EntityMetadata::new());
    }

    #[test]
    fn test_bounding_box_edges_and_area() {
        let bounds = BoundingBox::new(10.0, 20.0, 100.0, 50.0, 1);

        assert_eq!(bounds.right(), 110.0);
        assert_eq!(bounds.top(), 70.0);
        assert_eq!(bounds.area(), 5000.0);
    }

    #[test]
    fn test_bounding_box_intersects() {
        let base = BoundingBox::new(0.0, 0.0, 10.0, 10.0, 0);
        let overlapping = BoundingBox::new(5.0, 5.0, 10.0, 10.0, 0);
        let touching = BoundingBox::new(10.0, 0.0, 5.0, 10.0, 0);
        let distant = BoundingBox::new(20.0, 20.0, 5.0, 5.0, 0);
        let other_page = BoundingBox::new(5.0, 5.0, 10.0, 10.0, 1);

        assert!(base.intersects(&overlapping));
        assert!(overlapping.intersects(&base));
        // Comparisons are strict: sharing an edge is not an intersection.
        assert!(!base.intersects(&touching));
        assert!(!base.intersects(&distant));
        // Same geometry on another page never intersects.
        assert!(!base.intersects(&other_page));
    }

    #[test]
    fn test_semantic_entity_new() {
        let bounds = BoundingBox::new(0.0, 0.0, 200.0, 30.0, 0);
        let entity = SemanticEntity::new("heading-1".to_string(), EntityType::Heading, bounds.clone());

        assert_eq!(entity.id, "heading-1");
        assert_eq!(entity.entity_type, EntityType::Heading);
        assert_eq!(entity.bounds, bounds);
        assert!(entity.content.is_empty());
        assert_eq!(entity.metadata, EntityMetadata::new());
        assert!(entity.relationships.is_empty());
    }

    #[test]
    fn test_semantic_entity_builder_chain() {
        let metadata = EntityMetadata::new()
            .with_property("level", "1")
            .with_confidence(0.9);

        let entity = SemanticEntity::new(
            "heading-1".to_string(),
            EntityType::Heading,
            BoundingBox::new(0.0, 0.0, 200.0, 30.0, 0),
        )
        .with_content("Introduction")
        .with_metadata(metadata.clone())
        .with_relationship("para-1", RelationType::Precedes)
        .with_relationships(vec![EntityRelation {
            target_id: "doc-1".to_string(),
            relation_type: RelationType::IsPartOf,
        }]);

        assert_eq!(entity.content, "Introduction");
        assert_eq!(entity.metadata, metadata);

        // Relationships accumulate in insertion order.
        assert_eq!(entity.relationships.len(), 2);
        assert_eq!(entity.relationships[0].target_id, "para-1");
        assert_eq!(entity.relationships[0].relation_type, RelationType::Precedes);
        assert_eq!(entity.relationships[1].target_id, "doc-1");
        assert_eq!(entity.relationships[1].relation_type, RelationType::IsPartOf);
    }
}