// Source path: oxidize_pdf/text/invoice/patterns.rs

//! Pattern matching for invoice fields.
//!
//! This module contains regex patterns and matching logic for extracting
//! structured data from invoice text.
5
6use super::error::{ExtractionError, Result};
7use super::types::Language;
8use regex::Regex;
9
10/// A pattern for matching invoice fields
11#[derive(Debug, Clone)]
12pub struct FieldPattern {
13    /// Type of field this pattern matches
14    pub field_type: InvoiceFieldType,
15
16    /// Compiled regex pattern
17    pub regex: Regex,
18
19    /// Base confidence score (0.0 to 1.0)
20    pub confidence_base: f64,
21
22    /// Language this pattern is specific to (None = all languages)
23    pub language: Option<Language>,
24
25    /// Context hints - words that increase confidence when found nearby
26    pub context_hints: Vec<String>,
27}
28
/// Identifies a kind of invoice field without carrying the field's value.
///
/// Variants cover document-level fields (numbers, dates, amounts, parties,
/// currency) followed by per-line-item fields.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum InvoiceFieldType {
    // Document-level fields
    InvoiceNumber,
    InvoiceDate,
    DueDate,
    TotalAmount,
    TaxAmount,
    NetAmount,
    VatNumber,
    SupplierName,
    CustomerName,
    Currency,
    // Line-item fields
    ArticleNumber,
    LineItemDescription,
    LineItemQuantity,
    LineItemUnitPrice,
}
47
48impl FieldPattern {
49    /// Create a new field pattern
50    pub fn new(
51        field_type: InvoiceFieldType,
52        pattern: &str,
53        confidence_base: f64,
54        language: Option<Language>,
55    ) -> Result<Self> {
56        let regex = Regex::new(pattern)
57            .map_err(|e| ExtractionError::RegexError(format!("{}: {}", pattern, e)))?;
58
59        Ok(Self {
60            field_type,
61            regex,
62            confidence_base,
63            language,
64            context_hints: Vec::new(),
65        })
66    }
67
68    /// Add context hints to this pattern
69    pub fn with_hints(mut self, hints: Vec<String>) -> Self {
70        self.context_hints = hints;
71        self
72    }
73
74    /// Check if this pattern matches the given text
75    pub fn matches(&self, text: &str) -> Option<String> {
76        self.regex
77            .captures(text)
78            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
79    }
80}
81
82/// Library of patterns for invoice field extraction
83pub struct PatternLibrary {
84    patterns: Vec<FieldPattern>,
85}
86
87impl PatternLibrary {
88    /// Create a new empty pattern library
89    pub fn new() -> Self {
90        Self {
91            patterns: Vec::new(),
92        }
93    }
94
95    /// Create a pattern library for a specific language
96    pub fn with_language(lang: Language) -> Self {
97        let mut lib = Self::new();
98        lib.load_patterns_for_language(lang);
99        lib
100    }
101
102    /// Add a pattern to the library
103    pub fn add_pattern(&mut self, pattern: FieldPattern) {
104        self.patterns.push(pattern);
105    }
106
107    /// Create a pattern library with default Spanish patterns
108    ///
109    /// Returns a new `PatternLibrary` pre-loaded with Spanish invoice patterns.
110    /// Users can add custom patterns on top of defaults using `add_pattern()`.
111    ///
112    /// # Examples
113    ///
114    /// ```
115    /// use oxidize_pdf::text::invoice::{PatternLibrary, FieldPattern, InvoiceFieldType, Language};
116    ///
117    /// // Load Spanish defaults
118    /// let mut patterns = PatternLibrary::default_spanish();
119    ///
120    /// // Add custom pattern for specific format
121    /// patterns.add_pattern(
122    ///     FieldPattern::new(
123    ///         InvoiceFieldType::InvoiceNumber,
124    ///         r"Invoice\s+#\s*([0-9]+)",
125    ///         0.85,
126    ///         Some(Language::Spanish)
127    ///     ).unwrap()
128    /// );
129    /// ```
130    pub fn default_spanish() -> Self {
131        Self::with_language(Language::Spanish)
132    }
133
134    /// Create a pattern library with default English patterns
135    ///
136    /// Returns a new `PatternLibrary` pre-loaded with English invoice patterns.
137    ///
138    /// # Examples
139    ///
140    /// ```
141    /// use oxidize_pdf::text::invoice::PatternLibrary;
142    ///
143    /// let patterns = PatternLibrary::default_english();
144    /// ```
145    pub fn default_english() -> Self {
146        Self::with_language(Language::English)
147    }
148
149    /// Create a pattern library with default German patterns
150    ///
151    /// Returns a new `PatternLibrary` pre-loaded with German invoice patterns.
152    ///
153    /// # Examples
154    ///
155    /// ```
156    /// use oxidize_pdf::text::invoice::PatternLibrary;
157    ///
158    /// let patterns = PatternLibrary::default_german();
159    /// ```
160    pub fn default_german() -> Self {
161        Self::with_language(Language::German)
162    }
163
164    /// Create a pattern library with default Italian patterns
165    ///
166    /// Returns a new `PatternLibrary` pre-loaded with Italian invoice patterns.
167    ///
168    /// # Examples
169    ///
170    /// ```
171    /// use oxidize_pdf::text::invoice::PatternLibrary;
172    ///
173    /// let patterns = PatternLibrary::default_italian();
174    /// ```
175    pub fn default_italian() -> Self {
176        Self::with_language(Language::Italian)
177    }
178
179    /// Merge patterns from another library into this one
180    ///
181    /// Adds all patterns from `other` to this library. Useful for combining
182    /// default patterns with custom patterns.
183    ///
184    /// # Examples
185    ///
186    /// ```
187    /// use oxidize_pdf::text::invoice::{PatternLibrary, FieldPattern, InvoiceFieldType, Language};
188    ///
189    /// // Start with Spanish defaults
190    /// let mut patterns = PatternLibrary::default_spanish();
191    ///
192    /// // Create custom library
193    /// let mut custom = PatternLibrary::new();
194    /// custom.add_pattern(
195    ///     FieldPattern::new(
196    ///         InvoiceFieldType::InvoiceNumber,
197    ///         r"Ref:\s*([A-Z0-9\-]+)",
198    ///         0.8,
199    ///         Some(Language::Spanish)
200    ///     ).unwrap()
201    /// );
202    ///
203    /// // Merge custom into defaults
204    /// patterns.merge(custom);
205    /// ```
206    pub fn merge(&mut self, other: PatternLibrary) {
207        self.patterns.extend(other.patterns);
208    }
209
210    /// Match text against all patterns
211    pub fn match_text(&self, text: &str) -> Vec<(InvoiceFieldType, String, f64)> {
212        let mut matches = Vec::new();
213
214        for pattern in &self.patterns {
215            if let Some(matched_value) = pattern.matches(text) {
216                matches.push((pattern.field_type, matched_value, pattern.confidence_base));
217            }
218        }
219
220        matches
221    }
222
223    /// Load patterns for a specific language
224    fn load_patterns_for_language(&mut self, lang: Language) {
225        match lang {
226            Language::Spanish => self.load_spanish_patterns(),
227            Language::English => self.load_english_patterns(),
228            Language::German => self.load_german_patterns(),
229            Language::Italian => self.load_italian_patterns(),
230        }
231    }
232
233    /// Load Spanish invoice patterns
234    fn load_spanish_patterns(&mut self) {
235        // Invoice number patterns
236        // Matches: "Factura N° 2025-001", "Factura Nº: 12345", "Núm. Factura: INV-001"
237        if let Ok(pattern) = FieldPattern::new(
238            InvoiceFieldType::InvoiceNumber,
239            r"(?:Factura|FACTURA|Fac\.?)\s+(?:N[úuº°]?\.?|Número)\s*:?\s*([A-Z0-9][A-Z0-9\-/]*)",
240            0.9,
241            Some(Language::Spanish),
242        ) {
243            self.add_pattern(pattern.with_hints(vec![
244                "factura".to_string(),
245                "número".to_string(),
246                "nº".to_string(),
247            ]));
248        }
249
250        // Invoice date patterns
251        // Matches: "Fecha: 15/03/2025", "Fecha de emisión: 15-03-2025"
252        if let Ok(pattern) = FieldPattern::new(
253            InvoiceFieldType::InvoiceDate,
254            r"(?:Fecha(?:\s+de\s+emisión)?|FECHA):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
255            0.85,
256            Some(Language::Spanish),
257        ) {
258            self.add_pattern(pattern.with_hints(vec!["fecha".to_string(), "emisión".to_string()]));
259        }
260
261        // Due date patterns
262        // Matches: "Vencimiento: 15/04/2025"
263        if let Ok(pattern) = FieldPattern::new(
264            InvoiceFieldType::DueDate,
265            r"(?:Vencimiento|Fecha\s+de\s+vencimiento|VENCIMIENTO):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
266            0.85,
267            Some(Language::Spanish),
268        ) {
269            self.add_pattern(pattern.with_hints(vec!["vencimiento".to_string()]));
270        }
271
272        // Total amount patterns
273        // Matches: "Total: 1.234,56 €", "TOTAL: € 1.234,56", "Importe Total: 1234.56"
274        if let Ok(pattern) = FieldPattern::new(
275            InvoiceFieldType::TotalAmount,
276            r"(?:Total|TOTAL|Importe\s+Total):?\s*€?\s*([0-9]{1,3}(?:[.,][0-9]{3})*[.,][0-9]{2})\s*€?",
277            0.9,
278            Some(Language::Spanish),
279        ) {
280            self.add_pattern(pattern.with_hints(vec!["total".to_string(), "importe".to_string()]));
281        }
282
283        // Tax amount (IVA) patterns
284        // Matches: "IVA (21%): 123,45 €"
285        if let Ok(pattern) = FieldPattern::new(
286            InvoiceFieldType::TaxAmount,
287            r"(?:IVA|I\.V\.A\.|Impuesto).*?:?\s*€?\s*([0-9]{1,3}(?:[.,][0-9]{3})*[.,][0-9]{2})\s*€?",
288            0.85,
289            Some(Language::Spanish),
290        ) {
291            self.add_pattern(pattern.with_hints(vec!["iva".to_string(), "impuesto".to_string()]));
292        }
293
294        // Net amount patterns
295        // Matches: "Base Imponible: 500,00 €", "Base: 500,00"
296        if let Ok(pattern) = FieldPattern::new(
297            InvoiceFieldType::NetAmount,
298            r"(?:Base\s+Imponible|Base):?\s*€?\s*([0-9]{1,3}(?:[.,][0-9]{3})*[.,][0-9]{2})\s*€?",
299            0.85,
300            Some(Language::Spanish),
301        ) {
302            self.add_pattern(pattern.with_hints(vec!["base".to_string(), "imponible".to_string()]));
303        }
304
305        // Additional net amount patterns (more variants)
306        // Matches: "Neto: 500,00", "Subtotal: 500,00", "Suma Neta: 500,00"
307        if let Ok(pattern) = FieldPattern::new(
308            InvoiceFieldType::NetAmount,
309            r"(?:Neto|NETO|Subtotal|SUBTOTAL|Suma\s+Neta):?\s*€?\s*([0-9]{1,3}(?:[.,][0-9]{3})*[.,][0-9]{2})\s*€?",
310            0.80,
311            Some(Language::Spanish),
312        ) {
313            self.add_pattern(pattern.with_hints(vec!["neto".to_string(), "subtotal".to_string()]));
314        }
315
316        // VAT number patterns (Spanish CIF/NIF)
317        // Matches: "CIF: A12345678", "NIF: 12345678Z"
318        if let Ok(pattern) = FieldPattern::new(
319            InvoiceFieldType::VatNumber,
320            r"(?:CIF|NIF|N\.I\.F\.|C\.I\.F\.):?\s*([A-Z]?[0-9]{8}[A-Z0-9])",
321            0.9,
322            Some(Language::Spanish),
323        ) {
324            self.add_pattern(pattern.with_hints(vec!["cif".to_string(), "nif".to_string()]));
325        }
326
327        // Customer name patterns
328        // Matches: "Cliente: Empresa S.L.", "Facturar a: Juan Pérez", "Destinatario: ABC Corp"
329        if let Ok(pattern) = FieldPattern::new(
330            InvoiceFieldType::CustomerName,
331            r"(?:Cliente|CLIENTE|Facturar\s+a|FACTURAR\s+A|Destinatario|DESTINATARIO|A\s+la\s+atención\s+de|Att\.?|Attn\.?):?\s*([A-ZÀ-ÿa-z][A-ZÀ-ÿa-z\s\.,&\-]{2,80})",
332            0.75,
333            Some(Language::Spanish),
334        ) {
335            self.add_pattern(pattern.with_hints(vec![
336                "cliente".to_string(),
337                "facturar".to_string(),
338                "destinatario".to_string(),
339                "atención".to_string(),
340            ]));
341        }
342
343        // Currency pattern
344        // Matches: "€", "EUR", "Moneda: EUR"
345        if let Ok(pattern) = FieldPattern::new(
346            InvoiceFieldType::Currency,
347            r"(?:Moneda:?\s+)?(€|EUR|USD|GBP|CHF)",
348            0.7,
349            Some(Language::Spanish),
350        ) {
351            self.add_pattern(pattern);
352        }
353    }
354
355    /// Load English invoice patterns
356    fn load_english_patterns(&mut self) {
357        // Invoice number patterns
358        // Matches: "Invoice #12345", "Invoice No: INV-001", "Invoice Number: 2025-001"
359        if let Ok(pattern) = FieldPattern::new(
360            InvoiceFieldType::InvoiceNumber,
361            r"(?:Invoice|INVOICE)\s+(?:#|No\.?|Number)\s*:?\s*([A-Z0-9][A-Z0-9\-/]*)",
362            0.9,
363            Some(Language::English),
364        ) {
365            self.add_pattern(pattern.with_hints(vec![
366                "invoice".to_string(),
367                "number".to_string(),
368                "no".to_string(),
369            ]));
370        }
371
372        // Invoice date patterns
373        // Matches: "Date: 10/20/2025", "Invoice Date: 20-10-2025"
374        if let Ok(pattern) = FieldPattern::new(
375            InvoiceFieldType::InvoiceDate,
376            r"(?:(?:Invoice\s+)?Date|DATE):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
377            0.85,
378            Some(Language::English),
379        ) {
380            self.add_pattern(pattern.with_hints(vec!["date".to_string(), "invoice".to_string()]));
381        }
382
383        // Due date patterns
384        // Matches: "Due Date: 11/20/2025", "Payment Due: 20-11-2025"
385        if let Ok(pattern) = FieldPattern::new(
386            InvoiceFieldType::DueDate,
387            r"(?:Due\s+Date|Payment\s+Due|DUE\s+DATE):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
388            0.85,
389            Some(Language::English),
390        ) {
391            self.add_pattern(pattern.with_hints(vec!["due".to_string(), "payment".to_string()]));
392        }
393
394        // Total amount patterns
395        // Matches: "Total: $1,234.56", "Total: £1,234.56", "Amount Due: 1234.56"
396        if let Ok(pattern) = FieldPattern::new(
397            InvoiceFieldType::TotalAmount,
398            r"(?:Total|TOTAL|Amount\s+Due):?\s*[$£]?\s*([0-9]{1,3}(?:,[0-9]{3})*\.[0-9]{2})\s*[$£]?",
399            0.9,
400            Some(Language::English),
401        ) {
402            self.add_pattern(pattern.with_hints(vec![
403                "total".to_string(),
404                "amount".to_string(),
405                "due".to_string(),
406            ]));
407        }
408
409        // Tax amount (VAT) patterns
410        // Matches: "VAT: 123.45", "Tax: 123.45", "VAT (20%): 123.45"
411        if let Ok(pattern) = FieldPattern::new(
412            InvoiceFieldType::TaxAmount,
413            r"(?:VAT|Tax|V\.A\.T\.).*?:?\s*[$£]?\s*([0-9]{1,3}(?:,[0-9]{3})*\.[0-9]{2})\s*[$£]?",
414            0.85,
415            Some(Language::English),
416        ) {
417            self.add_pattern(pattern.with_hints(vec!["vat".to_string(), "tax".to_string()]));
418        }
419
420        // Net amount patterns
421        // Matches: "Subtotal: 500.00", "Net Amount: 500.00"
422        if let Ok(pattern) = FieldPattern::new(
423            InvoiceFieldType::NetAmount,
424            r"(?:Subtotal|Net\s+Amount|SUBTOTAL):?\s*[$£]?\s*([0-9]{1,3}(?:,[0-9]{3})*\.[0-9]{2})\s*[$£]?",
425            0.85,
426            Some(Language::English),
427        ) {
428            self.add_pattern(pattern.with_hints(vec!["subtotal".to_string(), "net".to_string()]));
429        }
430
431        // Additional net amount patterns (more variants)
432        // Matches: "Net: 500.00", "Sub-total: 500.00", "Net Sum: 500.00"
433        if let Ok(pattern) = FieldPattern::new(
434            InvoiceFieldType::NetAmount,
435            r"(?:Net|NET|Sub-total|SUB-TOTAL|Net\s+Sum):?\s*[$£]?\s*([0-9]{1,3}(?:,[0-9]{3})*\.[0-9]{2})\s*[$£]?",
436            0.80,
437            Some(Language::English),
438        ) {
439            self.add_pattern(pattern.with_hints(vec!["net".to_string(), "sub".to_string()]));
440        }
441
442        // Table-format net amount patterns (label and value may be in separate columns/lines)
443        // Matches: "Total excl VAT\n1,463.88" or "Total excluding VAT    1,234.56"
444        if let Ok(pattern) = FieldPattern::new(
445            InvoiceFieldType::NetAmount,
446            r"(?:Total\s+excl(?:uding)?\.?\s+VAT|Total\s+ex\.?\s+VAT|Subtotal\s+ex\.?\s+VAT)\s*\n?\s*[$£]?\s*([0-9]{1,3}(?:,[0-9]{3})*\.[0-9]{2})",
447            0.85,
448            Some(Language::English),
449        ) {
450            self.add_pattern(pattern.with_hints(vec![
451                "total".to_string(),
452                "excl".to_string(),
453                "vat".to_string(),
454            ]));
455        }
456
457        // VAT number patterns
458        // Matches: "VAT No: GB123456789", "VAT Reg: 123456789"
459        if let Ok(pattern) = FieldPattern::new(
460            InvoiceFieldType::VatNumber,
461            r"(?:VAT\s+(?:No\.?|Reg\.?|Registration)|V\.A\.T\.\s+No\.?):?\s*([A-Z]{2}[0-9]{9,12}|[0-9]{9,12})",
462            0.9,
463            Some(Language::English),
464        ) {
465            self.add_pattern(
466                pattern.with_hints(vec!["vat".to_string(), "registration".to_string()]),
467            );
468        }
469
470        // Customer name patterns
471        // Matches: "Bill to: ABC Corp", "Sold to: XYZ Ltd", "Client: Jane Smith"
472        // Requires at least 2 words to avoid false positives like "No" or "VAT No"
473        // Excludes patterns with "VAT", "No.", "Reg" to prevent matching "Customer VAT No."
474        if let Ok(pattern) = FieldPattern::new(
475            InvoiceFieldType::CustomerName,
476            r"(?:Bill\s+to|BILL\s+TO|Sold\s+to|SOLD\s+TO|Client|CLIENT):[\s:]*([A-Za-z]{3,}\s+[A-Za-z][A-Za-z\s\.,&\-]{2,77})",
477            0.75,
478            Some(Language::English),
479        ) {
480            self.add_pattern(pattern.with_hints(vec![
481                "bill".to_string(),
482                "sold".to_string(),
483                "client".to_string(),
484            ]));
485        }
486
487        // Currency pattern
488        // Matches: "$", "£", "USD", "GBP", "Currency: USD"
489        if let Ok(pattern) = FieldPattern::new(
490            InvoiceFieldType::Currency,
491            r"(?:Currency:?\s+)?([$£]|USD|GBP|EUR|CHF)",
492            0.7,
493            Some(Language::English),
494        ) {
495            self.add_pattern(pattern);
496        }
497    }
498
499    /// Load German invoice patterns
500    fn load_german_patterns(&mut self) {
501        // Invoice number patterns
502        // Matches: "Rechnungsnummer: 2025-001", "Rechnung Nr. 12345", "Re.-Nr.: INV-001"
503        if let Ok(pattern) = FieldPattern::new(
504            InvoiceFieldType::InvoiceNumber,
505            r"(?:Rechnungsnummer|Rechnung\s+Nr\.?|Re\.-Nr\.?):?\s*([A-Z0-9][A-Z0-9\-/]*)",
506            0.9,
507            Some(Language::German),
508        ) {
509            self.add_pattern(pattern.with_hints(vec![
510                "rechnung".to_string(),
511                "rechnungsnummer".to_string(),
512                "nummer".to_string(),
513            ]));
514        }
515
516        // Invoice date patterns
517        // Matches: "Datum: 20.10.2025", "Rechnungsdatum: 20-10-2025"
518        if let Ok(pattern) = FieldPattern::new(
519            InvoiceFieldType::InvoiceDate,
520            r"(?:(?:Rechnungs)?datum|DATUM):?\s*(\d{1,2}[.\-]\d{1,2}[.\-]\d{2,4})",
521            0.85,
522            Some(Language::German),
523        ) {
524            self.add_pattern(
525                pattern.with_hints(vec!["datum".to_string(), "rechnungsdatum".to_string()]),
526            );
527        }
528
529        // Due date patterns
530        // Matches: "Fälligkeitsdatum: 20.11.2025", "Zahlbar bis: 20.11.2025"
531        if let Ok(pattern) = FieldPattern::new(
532            InvoiceFieldType::DueDate,
533            r"(?:Fälligkeitsdatum|Zahlbar\s+bis|FÄLLIGKEITSDATUM):?\s*(\d{1,2}[.\-]\d{1,2}[.\-]\d{2,4})",
534            0.85,
535            Some(Language::German),
536        ) {
537            self.add_pattern(
538                pattern.with_hints(vec!["fälligkeitsdatum".to_string(), "zahlbar".to_string()]),
539            );
540        }
541
542        // Total amount patterns
543        // Matches: "Gesamtbetrag: 1.234,56 €", "Betrag: 1.234,56€", "Summe: 1234,56"
544        if let Ok(pattern) = FieldPattern::new(
545            InvoiceFieldType::TotalAmount,
546            r"(?:Gesamtbetrag|Betrag|Summe|GESAMTBETRAG):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
547            0.9,
548            Some(Language::German),
549        ) {
550            self.add_pattern(pattern.with_hints(vec![
551                "gesamtbetrag".to_string(),
552                "betrag".to_string(),
553                "summe".to_string(),
554            ]));
555        }
556
557        // Tax amount (MwSt/USt) patterns
558        // Matches: "MwSt: 123,45 €", "Umsatzsteuer (19%): 123,45", "USt: 123,45"
559        if let Ok(pattern) = FieldPattern::new(
560            InvoiceFieldType::TaxAmount,
561            r"(?:MwSt\.?|Umsatzsteuer|USt\.?).*?:?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
562            0.85,
563            Some(Language::German),
564        ) {
565            self.add_pattern(pattern.with_hints(vec![
566                "mwst".to_string(),
567                "umsatzsteuer".to_string(),
568                "ust".to_string(),
569            ]));
570        }
571
572        // Net amount patterns
573        // Matches: "Nettobetrag: 500,00 €", "Zwischensumme: 500,00"
574        if let Ok(pattern) = FieldPattern::new(
575            InvoiceFieldType::NetAmount,
576            r"(?:Nettobetrag|Zwischensumme|NETTOBETRAG):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
577            0.85,
578            Some(Language::German),
579        ) {
580            self.add_pattern(
581                pattern.with_hints(vec!["nettobetrag".to_string(), "zwischensumme".to_string()]),
582            );
583        }
584
585        // Additional net amount patterns (more variants)
586        // Matches: "Netto: 500,00", "Summe Netto: 500,00", "Teilsumme: 500,00"
587        if let Ok(pattern) = FieldPattern::new(
588            InvoiceFieldType::NetAmount,
589            r"(?:Netto|NETTO|Summe\s+Netto|Teilsumme|TEILSUMME):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
590            0.80,
591            Some(Language::German),
592        ) {
593            self.add_pattern(
594                pattern.with_hints(vec!["netto".to_string(), "teilsumme".to_string()]),
595            );
596        }
597
598        // VAT number patterns (German USt-IdNr or Steuernummer)
599        // Matches: "USt-IdNr: DE123456789", "Steuernummer: 123/456/78901"
600        if let Ok(pattern) = FieldPattern::new(
601            InvoiceFieldType::VatNumber,
602            r"(?:USt-IdNr\.?|Steuernummer):?\s*(DE[0-9]{9}|[0-9]{2,3}/[0-9]{3}/[0-9]{4,5})",
603            0.9,
604            Some(Language::German),
605        ) {
606            self.add_pattern(
607                pattern.with_hints(vec!["ust-idnr".to_string(), "steuernummer".to_string()]),
608            );
609        }
610
611        // Customer name patterns
612        // Matches: "Kunde: Firma GmbH", "Rechnungsempfänger: Max Mustermann", "z.Hd.: Peter Schmidt"
613        if let Ok(pattern) = FieldPattern::new(
614            InvoiceFieldType::CustomerName,
615            r"(?:Kunde|KUNDE|Rechnungsempfänger|RECHNUNGSEMPFÄNGER|An|z\.Hd\.|z\.\s*Hd\.):?\s*([A-Za-zÄÖÜäöüß][A-Za-zÄÖÜäöüß\s\.,&\-]{2,80})",
616            0.75,
617            Some(Language::German),
618        ) {
619            self.add_pattern(pattern.with_hints(vec![
620                "kunde".to_string(),
621                "rechnungsempfänger".to_string(),
622                "z.hd".to_string(),
623            ]));
624        }
625
626        // Currency pattern
627        // Matches: "€", "EUR", "Währung: EUR"
628        if let Ok(pattern) = FieldPattern::new(
629            InvoiceFieldType::Currency,
630            r"(?:Währung:?\s+)?(€|EUR|USD|GBP|CHF)",
631            0.7,
632            Some(Language::German),
633        ) {
634            self.add_pattern(pattern);
635        }
636    }
637
638    /// Load Italian invoice patterns
639    fn load_italian_patterns(&mut self) {
640        // Invoice number patterns
641        // Matches: "Fattura N. 2025-001", "Numero Fattura: 12345", "N. Fatt.: INV-001"
642        if let Ok(pattern) = FieldPattern::new(
643            InvoiceFieldType::InvoiceNumber,
644            r"(?:Fattura\s+N\.?|Numero\s+Fattura|N\.\s+Fatt\.?|FATTURA\s+N\.?):?\s*([A-Z0-9][A-Z0-9\-/]*)",
645            0.9,
646            Some(Language::Italian),
647        ) {
648            self.add_pattern(pattern.with_hints(vec!["fattura".to_string(), "numero".to_string()]));
649        }
650
651        // Invoice date patterns
652        // Matches: "Data: 20/10/2025", "Data Fattura: 20-10-2025"
653        if let Ok(pattern) = FieldPattern::new(
654            InvoiceFieldType::InvoiceDate,
655            r"(?:(?:Data\s+)?Fattura|Data|DATA):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
656            0.85,
657            Some(Language::Italian),
658        ) {
659            self.add_pattern(pattern.with_hints(vec!["data".to_string(), "fattura".to_string()]));
660        }
661
662        // Due date patterns
663        // Matches: "Scadenza: 20/11/2025", "Data Scadenza: 20-11-2025"
664        if let Ok(pattern) = FieldPattern::new(
665            InvoiceFieldType::DueDate,
666            r"(?:(?:Data\s+)?Scadenza|SCADENZA):?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
667            0.85,
668            Some(Language::Italian),
669        ) {
670            self.add_pattern(pattern.with_hints(vec!["scadenza".to_string()]));
671        }
672
673        // Total amount patterns
674        // Matches: "Totale: 1.234,56 €", "Importo Totale: €1.234,56", "Totale Fattura: 1234,56"
675        if let Ok(pattern) = FieldPattern::new(
676            InvoiceFieldType::TotalAmount,
677            r"(?:Totale(?:\s+Fattura)?|Importo\s+Totale|TOTALE):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
678            0.9,
679            Some(Language::Italian),
680        ) {
681            self.add_pattern(pattern.with_hints(vec!["totale".to_string(), "importo".to_string()]));
682        }
683
684        // Tax amount (IVA) patterns
685        // Matches: "IVA: 123,45 €", "IVA (22%): 123,45", "Imposta: 123,45"
686        if let Ok(pattern) = FieldPattern::new(
687            InvoiceFieldType::TaxAmount,
688            r"(?:IVA|I\.V\.A\.|Imposta).*?:?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
689            0.85,
690            Some(Language::Italian),
691        ) {
692            self.add_pattern(pattern.with_hints(vec!["iva".to_string(), "imposta".to_string()]));
693        }
694
695        // Net amount patterns
696        // Matches: "Imponibile: 500,00 €", "Subtotale: 500,00"
697        if let Ok(pattern) = FieldPattern::new(
698            InvoiceFieldType::NetAmount,
699            r"(?:Imponibile|Subtotale|IMPONIBILE):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
700            0.85,
701            Some(Language::Italian),
702        ) {
703            self.add_pattern(
704                pattern.with_hints(vec!["imponibile".to_string(), "subtotale".to_string()]),
705            );
706        }
707
708        // Additional net amount patterns (more variants)
709        // Matches: "Netto: 500,00", "Somma Netta: 500,00", "Importo Netto: 500,00"
710        if let Ok(pattern) = FieldPattern::new(
711            InvoiceFieldType::NetAmount,
712            r"(?:Netto|NETTO|Somma\s+Netta|Importo\s+Netto):?\s*€?\s*([0-9]{1,3}(?:\.[0-9]{3})*,[0-9]{2})\s*€?",
713            0.80,
714            Some(Language::Italian),
715        ) {
716            self.add_pattern(pattern.with_hints(vec!["netto".to_string(), "somma".to_string()]));
717        }
718
719        // VAT number patterns (Italian P.IVA)
720        // Matches: "P.IVA: IT12345678901", "Partita IVA: 12345678901"
721        if let Ok(pattern) = FieldPattern::new(
722            InvoiceFieldType::VatNumber,
723            r"(?:P\.IVA|P\.\s*IVA|Partita\s+IVA):?\s*(IT[0-9]{11}|[0-9]{11})",
724            0.9,
725            Some(Language::Italian),
726        ) {
727            self.add_pattern(pattern.with_hints(vec!["p.iva".to_string(), "partita".to_string()]));
728        }
729
730        // Customer name patterns
731        // Matches: "Cliente: Azienda S.r.l.", "Fatturare a: Mario Rossi", "Spett.le: ABC S.p.A.", "Destinatario: Luigi Verdi"
732        if let Ok(pattern) = FieldPattern::new(
733            InvoiceFieldType::CustomerName,
734            r"(?:Cliente|CLIENTE|Fatturare\s+a|FATTURARE\s+A|Spett\.le|Spettabile|SPETTABILE|Destinatario|DESTINATARIO):?\s*([A-Za-zÀ-ÿ][A-Za-zÀ-ÿ\s\.,&\-]{2,80})",
735            0.75,
736            Some(Language::Italian),
737        ) {
738            self.add_pattern(pattern.with_hints(vec![
739                "cliente".to_string(),
740                "fatturare".to_string(),
741                "spett".to_string(),
742                "destinatario".to_string(),
743            ]));
744        }
745
746        // Currency pattern
747        // Matches: "€", "EUR", "Valuta: EUR"
748        if let Ok(pattern) = FieldPattern::new(
749            InvoiceFieldType::Currency,
750            r"(?:Valuta:?\s+)?(€|EUR|USD|GBP|CHF)",
751            0.7,
752            Some(Language::Italian),
753        ) {
754            self.add_pattern(pattern);
755        }
756    }
757}
758
759impl Default for PatternLibrary {
760    fn default() -> Self {
761        Self::new()
762    }
763}
764
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a language-agnostic invoice-number pattern from `source`.
    fn invoice_number_pattern(source: &str) -> Result<FieldPattern> {
        FieldPattern::new(InvoiceFieldType::InvoiceNumber, source, 0.9, None)
    }

    #[test]
    fn test_pattern_library_new() {
        // A freshly created library holds no patterns.
        let library = PatternLibrary::new();
        assert_eq!(library.patterns.len(), 0);
    }

    #[test]
    fn test_field_pattern_creation() {
        // A well-formed regex compiles.
        assert!(invoice_number_pattern(r"INV-(\d+)").is_ok());
    }

    #[test]
    fn test_field_pattern_invalid_regex() {
        // An unterminated character class is rejected with an error.
        assert!(invoice_number_pattern(r"[invalid(").is_err());
    }

    #[test]
    fn test_pattern_matches() {
        let rule =
            invoice_number_pattern(r"INV-(\d+)").expect("Hardcoded regex pattern should be valid");

        // Each case pairs an input with the expected contents of capture group 1.
        let cases = [
            ("INV-12345", Some("12345".to_string())),
            ("Invoice INV-999", Some("999".to_string())),
            ("No match here", None),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(&rule.matches(input), expected);
        }
    }
}