oxidize_pdf/text/invoice/types.rs
1//! Data types for invoice extraction
2//!
3//! This module provides the core data structures used by the invoice extraction system.
4//! It includes language definitions, field types, confidence scoring, and metadata.
5//!
6//! # Overview
7//!
8//! The invoice extraction system works in several stages:
9//! 1. Text extraction from PDF pages
10//! 2. Pattern matching against language-specific templates
11//! 3. Type conversion and confidence scoring
12//! 4. Structured data output with metadata
13//!
14//! # Examples
15//!
16//! ```
17//! use oxidize_pdf::text::invoice::{Language, InvoiceField, InvoiceExtractor};
18//!
19//! // Create extractor for Spanish invoices
20//! let extractor = InvoiceExtractor::builder()
21//! .with_language("es")
22//! .confidence_threshold(0.7)
23//! .build();
24//! ```
25
/// Languages for which the invoice extractor has pattern sets
///
/// The selected language determines which conventions are applied while
/// matching text:
/// - Field labels and terminology (e.g., "Factura Nº" vs "Invoice Number")
/// - Date layouts (DD/MM/YYYY, MM/DD/YYYY, DD.MM.YYYY)
/// - Decimal and thousands separators (1.234,56 vs 1,234.56)
///
/// # Conventions by Language
///
/// | Language | Number format | Date format                         |
/// |----------|---------------|-------------------------------------|
/// | Spanish  | 1.234,56      | DD/MM/YYYY                          |
/// | English  | 1,234.56      | DD/MM/YYYY (MM/DD/YYYY also occurs) |
/// | German   | 1.234,56      | DD.MM.YYYY                          |
/// | Italian  | 1.234,56      | DD/MM/YYYY                          |
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Spanish, as used in Spain and Latin America
    ///
    /// Typical labels: "Factura", "CIF", "Base Imponible", "IVA"
    Spanish,

    /// English, as used in the UK, the US and international invoices
    ///
    /// Typical labels: "Invoice", "VAT Number", "Subtotal", "Total"
    English,

    /// German, as used in Germany, Austria and Switzerland
    ///
    /// Typical labels: "Rechnung", "USt-IdNr.", "Nettobetrag", "MwSt."
    German,

    /// Italian, as used in Italy
    ///
    /// Typical labels: "Fattura", "Partita IVA", "Imponibile", "IVA"
    Italian,
}
62
63impl Language {
64 /// Convert language code to Language enum
65 ///
66 /// # Examples
67 ///
68 /// ```
69 /// use oxidize_pdf::text::invoice::Language;
70 ///
71 /// assert_eq!(Language::from_code("es"), Some(Language::Spanish));
72 /// assert_eq!(Language::from_code("en"), Some(Language::English));
73 /// assert_eq!(Language::from_code("invalid"), None);
74 /// ```
75 pub fn from_code(code: &str) -> Option<Self> {
76 match code.to_lowercase().as_str() {
77 "es" | "spa" | "spanish" => Some(Language::Spanish),
78 "en" | "eng" | "english" => Some(Language::English),
79 "de" | "deu" | "german" => Some(Language::German),
80 "it" | "ita" | "italian" => Some(Language::Italian),
81 _ => None,
82 }
83 }
84
85 /// Get the language code (ISO 639-1)
86 pub fn code(&self) -> &'static str {
87 match self {
88 Language::Spanish => "es",
89 Language::English => "en",
90 Language::German => "de",
91 Language::Italian => "it",
92 }
93 }
94}
95
/// Rectangular region of a page, expressed in PDF coordinate space
///
/// PDF coordinates have their origin (0,0) at the bottom-left of the page and
/// the Y axis grows upward. The box is anchored at its bottom-left corner
/// (`x`, `y`) and extends `width` to the right and `height` upward. It marks
/// the area where a piece of extracted text was found.
///
/// # Coordinate System
///
/// ```text
/// (x, y + height)          (x + width, y + height)
///        ┌─────────────────────┐
///        │                     │
///        │    Text content     │
///        │                     │
///        └─────────────────────┘
/// (x, y)                   (x + width, y)
/// ```
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::BoundingBox;
///
/// let bbox = BoundingBox::new(50.0, 100.0, 200.0, 20.0);
/// assert!(bbox.contains(150.0, 110.0)); // inside
/// assert!(!bbox.contains(300.0, 110.0)); // to the right of the box
/// assert_eq!(bbox.area(), 4000.0);
/// ```
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct BoundingBox {
    /// Left edge: X coordinate in PDF points, measured from the page origin
    pub x: f64,

    /// Bottom edge: Y coordinate in PDF points, measured from the page origin
    pub y: f64,

    /// Horizontal extent of the box, in PDF points
    pub width: f64,

    /// Vertical extent of the box, in PDF points
    pub height: f64,
}
137
138impl BoundingBox {
139 /// Create a new bounding box
140 pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
141 Self {
142 x,
143 y,
144 width,
145 height,
146 }
147 }
148
149 /// Check if this bounding box contains a point
150 pub fn contains(&self, px: f64, py: f64) -> bool {
151 px >= self.x && px <= self.x + self.width && py >= self.y && py <= self.y + self.height
152 }
153
154 /// Calculate the area of the bounding box
155 pub fn area(&self) -> f64 {
156 self.width * self.height
157 }
158}
159
/// A single piece of information extracted from an invoice, with a typed value
///
/// Every variant stands for one kind of invoice information. Values are found
/// through language-specific patterns and then converted to a suitable Rust
/// type: `String` for textual data, `f64` for amounts and quantities.
///
/// # Type Conversion
///
/// - **Text** (invoice numbers, dates, names): kept exactly as extracted
/// - **Amounts**: parsed with the decimal convention of the language
///   - European format: `1.234,56` → `1234.56`
///   - US/UK format: `1,234.56` → `1234.56`
/// - **Quantities**: parsed as floating-point numbers
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceField;
///
/// let invoice_number = InvoiceField::InvoiceNumber("INV-2025-001".to_string());
/// let total = InvoiceField::TotalAmount(1234.56);
///
/// assert_eq!(invoice_number.name(), "Invoice Number");
/// assert_eq!(total.name(), "Total Amount");
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum InvoiceField {
    /// Identifier of the invoice (e.g., "INV-2025-001", "Factura Nº: 2025-001")
    ///
    /// Usually printed near the top of the document. The format depends on the
    /// country and the issuing company, but is generally alphanumeric.
    InvoiceNumber(String),

    /// Date the invoice was issued, exactly as written in the document
    ///
    /// The layout depends on the language:
    /// - Spanish/Italian: DD/MM/YYYY
    /// - German: DD.MM.YYYY
    /// - English: DD/MM/YYYY or MM/DD/YYYY
    ///
    /// Note: kept as a string; it is not parsed into a date type (MVP)
    InvoiceDate(String),

    /// Date by which payment is due
    ///
    /// Subject to the same format considerations as `InvoiceDate`.
    DueDate(String),

    /// Total amount including all taxes, in currency units
    ///
    /// Labelled e.g. "Total", "Grand Total", "Gesamtbetrag", "Totale"
    TotalAmount(f64),

    /// Tax amount (VAT / IVA / MwSt), in currency units
    ///
    /// The total tax charged. The invoice may break it down into several
    /// rates (e.g., 21% VAT, 10% reduced rate).
    TaxAmount(f64),

    /// Net amount before tax, in currency units
    ///
    /// Labelled e.g. "Subtotal", "Net Amount", "Base Imponible",
    /// "Nettobetrag", "Imponibile"
    NetAmount(f64),

    /// VAT / tax identification number
    ///
    /// The format depends on the country:
    /// - Spain: CIF (A12345678)
    /// - UK: VAT Number (GB123456789)
    /// - Germany: USt-IdNr. (DE123456789)
    /// - Italy: Partita IVA (IT12345678901)
    VatNumber(String),

    /// Name of the supplier/vendor (the company issuing the invoice)
    SupplierName(String),

    /// Name of the customer/client (the company receiving the invoice)
    CustomerName(String),

    /// Currency code (ISO 4217)
    ///
    /// For example "EUR", "GBP", "USD", "CHF"
    Currency(String),

    /// Article/product number of a line item
    ///
    /// A SKU, part number, or product code.
    ArticleNumber(String),

    /// Description/name of a line item
    ///
    /// Free text describing the product or service.
    LineItemDescription(String),

    /// Quantity of a line item (units ordered/delivered)
    LineItemQuantity(f64),

    /// Unit price of a line item (price per unit, before tax)
    LineItemUnitPrice(f64),
}
260
261impl InvoiceField {
262 /// Get a human-readable name for this field type
263 pub fn name(&self) -> &'static str {
264 match self {
265 InvoiceField::InvoiceNumber(_) => "Invoice Number",
266 InvoiceField::InvoiceDate(_) => "Invoice Date",
267 InvoiceField::DueDate(_) => "Due Date",
268 InvoiceField::TotalAmount(_) => "Total Amount",
269 InvoiceField::TaxAmount(_) => "Tax Amount",
270 InvoiceField::NetAmount(_) => "Net Amount",
271 InvoiceField::VatNumber(_) => "VAT Number",
272 InvoiceField::SupplierName(_) => "Supplier Name",
273 InvoiceField::CustomerName(_) => "Customer Name",
274 InvoiceField::Currency(_) => "Currency",
275 InvoiceField::ArticleNumber(_) => "Article Number",
276 InvoiceField::LineItemDescription(_) => "Line Item Description",
277 InvoiceField::LineItemQuantity(_) => "Line Item Quantity",
278 InvoiceField::LineItemUnitPrice(_) => "Line Item Unit Price",
279 }
280 }
281}
282
283/// An extracted field with metadata
284#[derive(Debug, Clone, PartialEq)]
285pub struct ExtractedField {
286 /// Type and value of the field
287 pub field_type: InvoiceField,
288
289 /// Confidence score (0.0 to 1.0)
290 pub confidence: f64,
291
292 /// Position in the document
293 pub position: BoundingBox,
294
295 /// Raw text as it appeared in the PDF
296 pub raw_text: String,
297}
298
299impl ExtractedField {
300 /// Create a new extracted field
301 pub fn new(
302 field_type: InvoiceField,
303 confidence: f64,
304 position: BoundingBox,
305 raw_text: String,
306 ) -> Self {
307 Self {
308 field_type,
309 confidence,
310 position,
311 raw_text,
312 }
313 }
314}
315
316/// Metadata about the invoice extraction
317#[derive(Debug, Clone, PartialEq)]
318pub struct InvoiceMetadata {
319 /// Page number where the invoice was found (1-indexed)
320 pub page_number: u32,
321
322 /// Overall extraction confidence (average of all fields)
323 pub extraction_confidence: f64,
324
325 /// Detected language (if applicable)
326 pub detected_language: Option<Language>,
327}
328
329impl InvoiceMetadata {
330 /// Create new metadata
331 pub fn new(page_number: u32, extraction_confidence: f64) -> Self {
332 Self {
333 page_number,
334 extraction_confidence,
335 detected_language: None,
336 }
337 }
338
339 /// Set the detected language
340 pub fn with_language(mut self, lang: Language) -> Self {
341 self.detected_language = Some(lang);
342 self
343 }
344}
345
346/// Extracted invoice data
347#[derive(Debug, Clone, PartialEq)]
348pub struct InvoiceData {
349 /// All extracted fields
350 pub fields: Vec<ExtractedField>,
351
352 /// Metadata about the extraction
353 pub metadata: InvoiceMetadata,
354}
355
356impl InvoiceData {
357 /// Create new invoice data
358 pub fn new(fields: Vec<ExtractedField>, metadata: InvoiceMetadata) -> Self {
359 Self { fields, metadata }
360 }
361
362 /// Get all fields of a specific type
363 pub fn get_fields(&self, field_name: &str) -> Vec<&ExtractedField> {
364 self.fields
365 .iter()
366 .filter(|f| f.field_type.name() == field_name)
367 .collect()
368 }
369
370 /// Get the first field of a specific type
371 pub fn get_field(&self, field_name: &str) -> Option<&ExtractedField> {
372 self.fields
373 .iter()
374 .find(|f| f.field_type.name() == field_name)
375 }
376
377 /// Get the count of extracted fields
378 pub fn field_count(&self) -> usize {
379 self.fields.len()
380 }
381
382 /// Filter fields by minimum confidence
383 pub fn filter_by_confidence(mut self, min_confidence: f64) -> Self {
384 self.fields.retain(|f| f.confidence >= min_confidence);
385 // Recalculate overall confidence
386 if !self.fields.is_empty() {
387 let sum: f64 = self.fields.iter().map(|f| f.confidence).sum();
388 self.metadata.extraction_confidence = sum / self.fields.len() as f64;
389 }
390 self
391 }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used when comparing averaged confidence scores.
    const EPS: f64 = 1e-12;

    /// Build a field located at a fixed 10x10 box; the position is irrelevant
    /// to the lookups and filters exercised below.
    fn field(field_type: InvoiceField, confidence: f64, raw_text: &str) -> ExtractedField {
        ExtractedField::new(
            field_type,
            confidence,
            BoundingBox::new(0.0, 0.0, 10.0, 10.0),
            raw_text.to_string(),
        )
    }

    /// An invoice-number field ("INV-001") with the given confidence.
    fn invoice_number(confidence: f64) -> ExtractedField {
        field(
            InvoiceField::InvoiceNumber("INV-001".to_string()),
            confidence,
            "INV-001",
        )
    }

    /// A total-amount field with the given value and confidence.
    fn total_amount(amount: f64, confidence: f64) -> ExtractedField {
        field(InvoiceField::TotalAmount(amount), confidence, "100.00")
    }

    #[test]
    fn test_language_from_code() {
        assert_eq!(Language::from_code("es"), Some(Language::Spanish));
        assert_eq!(Language::from_code("ES"), Some(Language::Spanish));
        assert_eq!(Language::from_code("spanish"), Some(Language::Spanish));

        assert_eq!(Language::from_code("en"), Some(Language::English));
        assert_eq!(Language::from_code("de"), Some(Language::German));
        assert_eq!(Language::from_code("it"), Some(Language::Italian));

        // Three-letter codes and English names, in mixed case
        assert_eq!(Language::from_code("spa"), Some(Language::Spanish));
        assert_eq!(Language::from_code("Eng"), Some(Language::English));
        assert_eq!(Language::from_code("deu"), Some(Language::German));
        assert_eq!(Language::from_code("ITA"), Some(Language::Italian));
        assert_eq!(Language::from_code("German"), Some(Language::German));

        assert_eq!(Language::from_code("fr"), None);
        assert_eq!(Language::from_code("invalid"), None);
        assert_eq!(Language::from_code(""), None);
    }

    #[test]
    fn test_language_code() {
        assert_eq!(Language::Spanish.code(), "es");
        assert_eq!(Language::English.code(), "en");
        assert_eq!(Language::German.code(), "de");
        assert_eq!(Language::Italian.code(), "it");
    }

    #[test]
    fn test_language_code_round_trip() {
        let all = [
            Language::Spanish,
            Language::English,
            Language::German,
            Language::Italian,
        ];
        for lang in all.iter() {
            assert_eq!(Language::from_code(lang.code()), Some(*lang));
        }
    }

    #[test]
    fn test_bounding_box_contains() {
        let bbox = BoundingBox::new(10.0, 20.0, 50.0, 30.0);

        assert!(bbox.contains(10.0, 20.0)); // bottom-left corner
        assert!(bbox.contains(60.0, 50.0)); // top-right corner
        assert!(bbox.contains(35.0, 35.0)); // center

        assert!(!bbox.contains(5.0, 20.0)); // left of box
        assert!(!bbox.contains(65.0, 35.0)); // right of box
        assert!(!bbox.contains(35.0, 15.0)); // below box
        assert!(!bbox.contains(35.0, 55.0)); // above box
    }

    #[test]
    fn test_bounding_box_area() {
        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 5.0);
        assert_eq!(bbox.area(), 50.0);

        // A degenerate box has no area, wherever it is placed
        assert_eq!(BoundingBox::new(3.0, 4.0, 0.0, 5.0).area(), 0.0);
    }

    #[test]
    fn test_invoice_field_name() {
        let field = InvoiceField::InvoiceNumber("INV-001".to_string());
        assert_eq!(field.name(), "Invoice Number");

        let field = InvoiceField::TotalAmount(1234.56);
        assert_eq!(field.name(), "Total Amount");

        assert_eq!(InvoiceField::VatNumber("X".to_string()).name(), "VAT Number");
        assert_eq!(
            InvoiceField::LineItemUnitPrice(9.99).name(),
            "Line Item Unit Price"
        );
    }

    #[test]
    fn test_invoice_metadata_with_language() {
        let metadata = InvoiceMetadata::new(2, 0.5);
        assert_eq!(metadata.page_number, 2);
        assert_eq!(metadata.detected_language, None);

        let metadata = metadata.with_language(Language::German);
        assert_eq!(metadata.detected_language, Some(Language::German));
        assert_eq!(metadata.page_number, 2);
    }

    #[test]
    fn test_invoice_data_get_field() {
        let metadata = InvoiceMetadata::new(1, 0.85);
        let data = InvoiceData::new(vec![invoice_number(0.9), total_amount(100.0, 0.8)], metadata);

        assert_eq!(data.field_count(), 2);
        assert!(data.get_field("Invoice Number").is_some());
        assert!(data.get_field("Total Amount").is_some());
        assert!(data.get_field("Nonexistent").is_none());
    }

    #[test]
    fn test_invoice_data_get_fields() {
        let metadata = InvoiceMetadata::new(1, 0.8);
        let data = InvoiceData::new(
            vec![
                total_amount(100.0, 0.8),
                invoice_number(0.9),
                total_amount(200.0, 0.7),
            ],
            metadata,
        );

        let totals = data.get_fields("Total Amount");
        assert_eq!(totals.len(), 2);
        // Document order is preserved, and `get_field` yields the first match
        assert_eq!(totals[0].field_type, InvoiceField::TotalAmount(100.0));
        assert_eq!(totals[1].field_type, InvoiceField::TotalAmount(200.0));
        assert_eq!(
            data.get_field("Total Amount").map(|f| &f.field_type),
            Some(&InvoiceField::TotalAmount(100.0))
        );

        assert!(data.get_fields("Nonexistent").is_empty());
    }

    #[test]
    fn test_invoice_data_filter_by_confidence() {
        let metadata = InvoiceMetadata::new(1, 0.7);
        let data = InvoiceData::new(vec![invoice_number(0.9), total_amount(100.0, 0.5)], metadata);

        let filtered = data.filter_by_confidence(0.7);
        assert_eq!(filtered.field_count(), 1);
        assert!(filtered.get_field("Invoice Number").is_some());
        assert!(filtered.get_field("Total Amount").is_none());

        // Overall confidence is recomputed from the surviving field only
        assert!((filtered.metadata.extraction_confidence - 0.9).abs() < EPS);
        assert_eq!(filtered.metadata.page_number, 1);
    }

    #[test]
    fn test_invoice_data_filter_keeps_threshold_value() {
        let metadata = InvoiceMetadata::new(1, 0.0);
        let data = InvoiceData::new(vec![invoice_number(0.9), total_amount(100.0, 0.5)], metadata);

        // The comparison is inclusive: a field exactly at the threshold stays
        let filtered = data.filter_by_confidence(0.5);
        assert_eq!(filtered.field_count(), 2);
        assert!((filtered.metadata.extraction_confidence - 0.7).abs() < EPS);
    }
}