// oxidize_pdf/text/invoice/types.rs

//! Data types for invoice extraction
//!
//! This module provides the core data structures used by the invoice extraction system.
//! It includes language definitions, field types, confidence scoring, and metadata.
//!
//! # Overview
//!
//! The invoice extraction system works in several stages:
//! 1. Text extraction from PDF pages
//! 2. Pattern matching against language-specific templates
//! 3. Type conversion and confidence scoring
//! 4. Structured data output with metadata
//!
//! # Examples
//!
//! ```
//! use oxidize_pdf::text::invoice::{Language, InvoiceField, InvoiceExtractor};
//!
//! // Create extractor for Spanish invoices
//! let extractor = InvoiceExtractor::builder()
//!     .with_language("es")
//!     .confidence_threshold(0.7)
//!     .build();
//! ```

/// Supported languages for invoice extraction
///
/// Each language has specific patterns for:
/// - Invoice number formats (e.g., "Factura Nº" vs "Invoice Number")
/// - Date formats (DD/MM/YYYY vs MM/DD/YYYY vs DD.MM.YYYY)
/// - Number formats (1.234,56 vs 1,234.56)
/// - Field labels and terminology
///
/// # Language-Specific Behaviors
///
/// - **Spanish**: Uses European number format (1.234,56), DD/MM/YYYY dates
/// - **English**: Uses US/UK number format (1,234.56), DD/MM/YYYY or MM/DD/YYYY dates
/// - **German**: Uses European number format (1.234,56), DD.MM.YYYY dates
/// - **Italian**: Uses European number format (1.234,56), DD/MM/YYYY dates
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Spanish (Spain, Latin America)
    ///
    /// Patterns include: "Factura", "CIF", "Base Imponible", "IVA"
    Spanish,

    /// English (UK, US, International)
    ///
    /// Patterns include: "Invoice", "VAT Number", "Subtotal", "Total"
    English,

    /// German (Germany, Austria, Switzerland)
    ///
    /// Patterns include: "Rechnung", "USt-IdNr.", "Nettobetrag", "MwSt."
    German,

    /// Italian (Italy)
    ///
    /// Patterns include: "Fattura", "Partita IVA", "Imponibile", "IVA"
    Italian,
}

impl Language {
    /// Convert a language code or English language name to a [`Language`]
    ///
    /// Accepted spellings per language are the two-letter code (`es`, `en`,
    /// `de`, `it`), the three-letter code (`spa`, `eng`, `deu`, `ita`) and the
    /// English name (`spanish`, `english`, `german`, `italian`). Matching is
    /// ASCII case-insensitive; the input is not trimmed.
    ///
    /// Returns `None` for any other input, including the empty string.
    ///
    /// # Examples
    ///
    /// ```
    /// use oxidize_pdf::text::invoice::Language;
    ///
    /// assert_eq!(Language::from_code("es"), Some(Language::Spanish));
    /// assert_eq!(Language::from_code("en"), Some(Language::English));
    /// assert_eq!(Language::from_code("invalid"), None);
    /// ```
    pub fn from_code(code: &str) -> Option<Self> {
        // Every accepted spelling is plain ASCII, so an ASCII case-insensitive
        // comparison is sufficient and avoids allocating a lowercased copy of
        // the input on each call.
        const ALIASES: &[(&str, Language)] = &[
            ("es", Language::Spanish),
            ("spa", Language::Spanish),
            ("spanish", Language::Spanish),
            ("en", Language::English),
            ("eng", Language::English),
            ("english", Language::English),
            ("de", Language::German),
            ("deu", Language::German),
            ("german", Language::German),
            ("it", Language::Italian),
            ("ita", Language::Italian),
            ("italian", Language::Italian),
        ];

        ALIASES
            .iter()
            .find(|(alias, _)| code.eq_ignore_ascii_case(alias))
            .map(|&(_, language)| language)
    }

    /// Get the language code (ISO 639-1)
    ///
    /// The returned code is always accepted by [`Language::from_code`] and
    /// maps back to the same variant.
    pub fn code(&self) -> &'static str {
        match self {
            Language::Spanish => "es",
            Language::English => "en",
            Language::German => "de",
            Language::Italian => "it",
        }
    }
}

/// Bounding box for text positioning in PDF coordinate space
///
/// PDF coordinates start at bottom-left (0,0) with Y increasing upward.
/// This structure represents a rectangular region where extracted text was found.
///
/// # Coordinate System
///
/// ```text
/// (0, height)         (width, height)
///     ┌─────────────────────┐
///     │                     │
///     │   Text content      │
///     │                     │
///     └─────────────────────┘
/// (0, 0)              (width, 0)
/// ```
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::BoundingBox;
///
/// let bbox = BoundingBox::new(50.0, 100.0, 200.0, 20.0);
/// assert!(bbox.contains(150.0, 110.0));  // Point inside
/// assert!(!bbox.contains(300.0, 110.0)); // Point outside
/// assert_eq!(bbox.area(), 4000.0);
/// ```
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct BoundingBox {
    /// X coordinate of left edge (in PDF points from page origin)
    pub x: f64,

    /// Y coordinate of bottom edge (in PDF points from page origin)
    pub y: f64,

    /// Width of the box (in PDF points)
    pub width: f64,

    /// Height of the box (in PDF points)
    pub height: f64,
}

impl BoundingBox {
    /// Create a new bounding box from its bottom-left corner and its size
    ///
    /// The values are stored as given; no validation is performed.
    pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
        Self {
            x,
            y,
            width,
            height,
        }
    }

    /// Check if this bounding box contains the point `(px, py)`
    ///
    /// All four edges are inclusive, so points lying exactly on the border
    /// (including the corners) are considered inside.
    pub fn contains(&self, px: f64, py: f64) -> bool {
        let within_x = px >= self.x && px <= self.x + self.width;
        let within_y = py >= self.y && py <= self.y + self.height;
        within_x && within_y
    }

    /// Calculate the area of the bounding box (`width * height`)
    pub fn area(&self) -> f64 {
        self.width * self.height
    }
}

/// Extracted invoice field with strongly-typed data
///
/// Each variant represents a different type of information that can be extracted
/// from an invoice. Fields are matched using language-specific patterns and
/// converted to appropriate types (String for text, f64 for amounts).
///
/// # Type Conversion
///
/// - **String fields**: Invoice numbers, dates, names (preserved as-is)
/// - **Amount fields**: Parsed with language-aware decimal handling
///   - European format: `1.234,56` → `1234.56`
///   - US/UK format: `1,234.56` → `1234.56`
/// - **Quantity fields**: Parsed as floating-point numbers
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceField;
///
/// let invoice_number = InvoiceField::InvoiceNumber("INV-2025-001".to_string());
/// let total = InvoiceField::TotalAmount(1234.56);
///
/// assert_eq!(invoice_number.name(), "Invoice Number");
/// assert_eq!(total.name(), "Total Amount");
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum InvoiceField {
    /// Invoice number (e.g., "INV-2025-001", "Factura Nº: 2025-001")
    ///
    /// Typically appears near the top of the invoice. Format varies by country
    /// and company, but usually includes alphanumeric identifiers.
    InvoiceNumber(String),

    /// Invoice date as extracted from document
    ///
    /// Format varies by language:
    /// - Spanish/Italian: DD/MM/YYYY
    /// - German: DD.MM.YYYY
    /// - English: DD/MM/YYYY or MM/DD/YYYY
    ///
    /// Note: Stored as string, not parsed to Date type (MVP)
    InvoiceDate(String),

    /// Due date for payment
    ///
    /// Same format considerations as InvoiceDate.
    DueDate(String),

    /// Total amount including all taxes (in currency units)
    ///
    /// Also known as: "Total", "Grand Total", "Gesamtbetrag", "Totale"
    TotalAmount(f64),

    /// Tax amount (VAT/IVA/MwSt), in currency units
    ///
    /// Represents the total tax charged. May include breakdown of different
    /// tax rates (e.g., 21% VAT, 10% reduced rate).
    TaxAmount(f64),

    /// Net amount before tax (in currency units)
    ///
    /// Also known as: "Subtotal", "Net Amount", "Base Imponible", "Nettobetrag", "Imponibile"
    NetAmount(f64),

    /// VAT/Tax identification number
    ///
    /// Format varies by country:
    /// - Spain: CIF (A12345678)
    /// - UK: VAT Number (GB123456789)
    /// - Germany: USt-IdNr. (DE123456789)
    /// - Italy: Partita IVA (IT12345678901)
    VatNumber(String),

    /// Supplier/Vendor name (company issuing the invoice)
    SupplierName(String),

    /// Customer/Client name (company receiving the invoice)
    CustomerName(String),

    /// Currency code (ISO 4217)
    ///
    /// Examples: "EUR", "GBP", "USD", "CHF"
    Currency(String),

    /// Article/Product number for line items
    ///
    /// SKU, part number, or product code.
    ArticleNumber(String),

    /// Line item description/name
    ///
    /// Textual description of product or service.
    LineItemDescription(String),

    /// Line item quantity (units ordered/delivered)
    LineItemQuantity(f64),

    /// Line item unit price (price per unit, before tax)
    LineItemUnitPrice(f64),
}

impl InvoiceField {
    /// Get a human-readable name for this field type
    ///
    /// The name depends only on the variant, never on the value it carries.
    /// The match is intentionally exhaustive (no `_` arm) so that adding a
    /// variant forces a name to be chosen for it.
    pub fn name(&self) -> &'static str {
        match self {
            Self::InvoiceNumber(_) => "Invoice Number",
            Self::InvoiceDate(_) => "Invoice Date",
            Self::DueDate(_) => "Due Date",
            Self::TotalAmount(_) => "Total Amount",
            Self::TaxAmount(_) => "Tax Amount",
            Self::NetAmount(_) => "Net Amount",
            Self::VatNumber(_) => "VAT Number",
            Self::SupplierName(_) => "Supplier Name",
            Self::CustomerName(_) => "Customer Name",
            Self::Currency(_) => "Currency",
            Self::ArticleNumber(_) => "Article Number",
            Self::LineItemDescription(_) => "Line Item Description",
            Self::LineItemQuantity(_) => "Line Item Quantity",
            Self::LineItemUnitPrice(_) => "Line Item Unit Price",
        }
    }
}

283/// An extracted field with metadata
284#[derive(Debug, Clone, PartialEq)]
285pub struct ExtractedField {
286    /// Type and value of the field
287    pub field_type: InvoiceField,
288
289    /// Confidence score (0.0 to 1.0)
290    pub confidence: f64,
291
292    /// Position in the document
293    pub position: BoundingBox,
294
295    /// Raw text as it appeared in the PDF
296    pub raw_text: String,
297}
298
299impl ExtractedField {
300    /// Create a new extracted field
301    pub fn new(
302        field_type: InvoiceField,
303        confidence: f64,
304        position: BoundingBox,
305        raw_text: String,
306    ) -> Self {
307        Self {
308            field_type,
309            confidence,
310            position,
311            raw_text,
312        }
313    }
314}
315
316/// Metadata about the invoice extraction
317#[derive(Debug, Clone, PartialEq)]
318pub struct InvoiceMetadata {
319    /// Page number where the invoice was found (1-indexed)
320    pub page_number: u32,
321
322    /// Overall extraction confidence (average of all fields)
323    pub extraction_confidence: f64,
324
325    /// Detected language (if applicable)
326    pub detected_language: Option<Language>,
327}
328
329impl InvoiceMetadata {
330    /// Create new metadata
331    pub fn new(page_number: u32, extraction_confidence: f64) -> Self {
332        Self {
333            page_number,
334            extraction_confidence,
335            detected_language: None,
336        }
337    }
338
339    /// Set the detected language
340    pub fn with_language(mut self, lang: Language) -> Self {
341        self.detected_language = Some(lang);
342        self
343    }
344}
345
346/// Extracted invoice data
347#[derive(Debug, Clone, PartialEq)]
348pub struct InvoiceData {
349    /// All extracted fields
350    pub fields: Vec<ExtractedField>,
351
352    /// Metadata about the extraction
353    pub metadata: InvoiceMetadata,
354}
355
356impl InvoiceData {
357    /// Create new invoice data
358    pub fn new(fields: Vec<ExtractedField>, metadata: InvoiceMetadata) -> Self {
359        Self { fields, metadata }
360    }
361
362    /// Get all fields of a specific type
363    pub fn get_fields(&self, field_name: &str) -> Vec<&ExtractedField> {
364        self.fields
365            .iter()
366            .filter(|f| f.field_type.name() == field_name)
367            .collect()
368    }
369
370    /// Get the first field of a specific type
371    pub fn get_field(&self, field_name: &str) -> Option<&ExtractedField> {
372        self.fields
373            .iter()
374            .find(|f| f.field_type.name() == field_name)
375    }
376
377    /// Get the count of extracted fields
378    pub fn field_count(&self) -> usize {
379        self.fields.len()
380    }
381
382    /// Filter fields by minimum confidence
383    pub fn filter_by_confidence(mut self, min_confidence: f64) -> Self {
384        self.fields.retain(|f| f.confidence >= min_confidence);
385        // Recalculate overall confidence
386        if !self.fields.is_empty() {
387            let sum: f64 = self.fields.iter().map(|f| f.confidence).sum();
388            self.metadata.extraction_confidence = sum / self.fields.len() as f64;
389        }
390        self
391    }
392}
393
#[cfg(test)]
mod tests {
    //! Unit tests for the invoice data types defined in this module.

    use super::*;

    /// `from_code` accepts codes and names regardless of ASCII case and
    /// rejects anything that is not a supported language.
    #[test]
    fn test_language_from_code() {
        assert_eq!(Language::from_code("es"), Some(Language::Spanish));
        assert_eq!(Language::from_code("ES"), Some(Language::Spanish));
        assert_eq!(Language::from_code("spanish"), Some(Language::Spanish));

        assert_eq!(Language::from_code("en"), Some(Language::English));
        assert_eq!(Language::from_code("de"), Some(Language::German));
        assert_eq!(Language::from_code("it"), Some(Language::Italian));

        assert_eq!(Language::from_code("fr"), None);
        assert_eq!(Language::from_code("invalid"), None);
    }

    /// `code` returns the two-letter code for every variant.
    #[test]
    fn test_language_code() {
        assert_eq!(Language::Spanish.code(), "es");
        assert_eq!(Language::English.code(), "en");
        assert_eq!(Language::German.code(), "de");
        assert_eq!(Language::Italian.code(), "it");
    }

    /// `contains` is inclusive on the edges and rejects points on every side.
    #[test]
    fn test_bounding_box_contains() {
        let bbox = BoundingBox::new(10.0, 20.0, 50.0, 30.0);

        assert!(bbox.contains(10.0, 20.0)); // bottom-left corner
        assert!(bbox.contains(60.0, 50.0)); // top-right corner
        assert!(bbox.contains(35.0, 35.0)); // center

        assert!(!bbox.contains(5.0, 20.0)); // left of box
        assert!(!bbox.contains(65.0, 35.0)); // right of box
        assert!(!bbox.contains(35.0, 15.0)); // below box
        assert!(!bbox.contains(35.0, 55.0)); // above box
    }

    /// `area` is width times height.
    #[test]
    fn test_bounding_box_area() {
        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 5.0);
        assert_eq!(bbox.area(), 50.0);
    }

    /// `name` reports the human-readable label of the variant.
    #[test]
    fn test_invoice_field_name() {
        let field = InvoiceField::InvoiceNumber("INV-001".to_string());
        assert_eq!(field.name(), "Invoice Number");

        let field = InvoiceField::TotalAmount(1234.56);
        assert_eq!(field.name(), "Total Amount");
    }

    /// Fields can be looked up by their human-readable name.
    #[test]
    fn test_invoice_data_get_field() {
        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
        let field1 = ExtractedField::new(
            InvoiceField::InvoiceNumber("INV-001".to_string()),
            0.9,
            bbox,
            "INV-001".to_string(),
        );
        let field2 = ExtractedField::new(
            InvoiceField::TotalAmount(100.0),
            0.8,
            bbox,
            "100.00".to_string(),
        );

        let metadata = InvoiceMetadata::new(1, 0.85);
        let data = InvoiceData::new(vec![field1, field2], metadata);

        assert_eq!(data.field_count(), 2);
        assert!(data.get_field("Invoice Number").is_some());
        assert!(data.get_field("Total Amount").is_some());
        assert!(data.get_field("Nonexistent").is_none());
    }

    /// Filtering drops low-confidence fields and recomputes the overall
    /// confidence from the fields that remain.
    #[test]
    fn test_invoice_data_filter_by_confidence() {
        let bbox = BoundingBox::new(0.0, 0.0, 10.0, 10.0);
        let field1 = ExtractedField::new(
            InvoiceField::InvoiceNumber("INV-001".to_string()),
            0.9,
            bbox,
            "INV-001".to_string(),
        );
        let field2 = ExtractedField::new(
            InvoiceField::TotalAmount(100.0),
            0.5,
            bbox,
            "100.00".to_string(),
        );

        let metadata = InvoiceMetadata::new(1, 0.7);
        let data = InvoiceData::new(vec![field1, field2], metadata);

        let filtered = data.filter_by_confidence(0.7);
        assert_eq!(filtered.field_count(), 1);
        assert!(filtered.get_field("Invoice Number").is_some());
        assert!(filtered.get_field("Total Amount").is_none());
        // Only the 0.9 field survives, so the recomputed average is 0.9.
        assert!((filtered.metadata.extraction_confidence - 0.9).abs() < 1e-12);
    }
}