oxidize_pdf/text/invoice/
extractor.rs

1//! Invoice data extractor
2//!
3//! This module provides the main `InvoiceExtractor` type for extracting structured
4//! data from invoice PDFs using pattern matching and confidence scoring.
5//!
6//! # Architecture
7//!
8//! The extraction process follows a pipeline:
9//!
10//! ```text
11//! TextFragments → Text Reconstruction → Pattern Matching → Type Conversion → InvoiceData
12//! ```
13//!
14//! 1. **Text Reconstruction**: Join text fragments with spatial awareness
15//! 2. **Pattern Matching**: Apply language-specific regex patterns
16//! 3. **Confidence Scoring**: Calculate confidence for each match (0.0-1.0)
17//! 4. **Type Conversion**: Convert strings to typed fields (amounts, dates, etc.)
18//! 5. **Filtering**: Remove low-confidence matches below threshold
19//!
20//! # Usage
21//!
22//! ```ignore
23//! use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
24//! use oxidize_pdf::text::invoice::InvoiceExtractor;
25//! use oxidize_pdf::Document;
26//!
27//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
28//! // Extract text from PDF
29//! let doc = Document::open("invoice.pdf")?;
30//! let page = doc.get_page(1)?;
31//! let text_extractor = TextExtractor::new();
32//! let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
33//!
34//! // Extract invoice data
35//! let extractor = InvoiceExtractor::builder()
36//!     .with_language("es")
37//!     .confidence_threshold(0.7)
38//!     .build();
39//!
40//! let invoice = extractor.extract(&extracted.fragments)?;
41//! println!("Found {} fields", invoice.field_count());
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! # Confidence Scoring
47//!
48//! Each extracted field has a confidence score (0.0 = no confidence, 1.0 = certain):
49//!
50//! - **0.9**: Critical fields (invoice number, total amount)
51//! - **0.8**: Important fields (dates, tax amounts)
52//! - **0.7**: Standard fields (VAT numbers, names)
53//!
54//! Fields below the confidence threshold are automatically filtered out.
55
56use super::error::{ExtractionError, Result};
57use super::patterns::{InvoiceFieldType, PatternLibrary};
58use super::types::{
59    BoundingBox, ExtractedField, InvoiceData, InvoiceField, InvoiceMetadata, Language,
60};
61use super::validators;
62use crate::text::extraction::TextFragment;
63
/// Invoice data extractor with configurable pattern matching
///
/// This is the main entry point for invoice extraction. Use the builder pattern
/// to configure language, confidence thresholds, and other options.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceExtractor;
///
/// // Spanish invoices with high confidence threshold and kerning-aware spacing
/// let extractor = InvoiceExtractor::builder()
///     .with_language("es")
///     .confidence_threshold(0.85)
///     .use_kerning(true)  // Enables font-aware spacing in text reconstruction
///     .build();
/// ```
///
/// # Thread Safety
///
/// `InvoiceExtractor` exposes no mutating methods after construction (every
/// method in this file takes `&self`), so one instance can be reused for many
/// pages. Consider creating one extractor per language and reusing it.
///
/// NOTE(review): sharing a single instance across threads additionally
/// assumes `PatternLibrary` is `Send + Sync` — confirm against its definition.
pub struct InvoiceExtractor {
    /// Patterns applied to the reconstructed text by `extract()`.
    pattern_library: PatternLibrary,
    /// Minimum confidence a match must reach to be kept; `extract()` skips
    /// matches whose computed confidence is below this value.
    confidence_threshold: f64,
    /// Enable kerning-aware text reconstruction
    ///
    /// When enabled, adjusts inter-fragment spacing based on font continuity.
    /// Adjacent fragments that both report the same font name are joined with
    /// a single space; every other pair (different or unknown font) is joined
    /// with a double space. When disabled, all fragments are joined with a
    /// single space.
    ///
    /// **Implementation Note**: This is a simplified version of true kerning.
    /// Full kerning with font metrics requires access to kerning pair tables,
    /// which would require passing `font_cache` or `Document` reference.
    /// The current implementation provides spacing improvements without
    /// breaking API compatibility.
    use_kerning: bool,
    /// Language selected on the builder, if any. It chooses the decimal
    /// format in `parse_amount()` and is reported in the result metadata
    /// (`Language::English` is reported when this is `None`).
    language: Option<Language>,
}
103
104impl InvoiceExtractor {
105    /// Create a new builder for configuring the extractor
106    ///
107    /// This is the recommended way to create an `InvoiceExtractor`.
108    ///
109    /// # Examples
110    ///
111    /// ```
112    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
113    ///
114    /// let extractor = InvoiceExtractor::builder()
115    ///     .with_language("es")
116    ///     .confidence_threshold(0.8)
117    ///     .build();
118    /// ```
119    pub fn builder() -> InvoiceExtractorBuilder {
120        InvoiceExtractorBuilder::new()
121    }
122
123    /// Extract structured invoice data from text fragments
124    ///
125    /// This is the main extraction method. It processes text fragments from a PDF page
126    /// and returns structured invoice data with confidence scores.
127    ///
128    /// # Process
129    ///
130    /// 1. Text fragments are reconstructed into full text
131    /// 2. Language-specific patterns are applied
132    /// 3. Matches are converted to typed fields
133    /// 4. Confidence scores are calculated
134    /// 5. Low-confidence fields are filtered out
135    ///
136    /// # Arguments
137    ///
138    /// * `text_fragments` - Text fragments extracted from PDF page (from `TextExtractor`)
139    ///
140    /// # Returns
141    ///
142    /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if:
143    /// - No text fragments provided
144    /// - PDF page is empty
145    /// - Text extraction failed
146    ///
147    /// # Examples
148    ///
149    /// ```ignore
150    /// use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
151    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
152    /// use oxidize_pdf::Document;
153    ///
154    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
155    /// let doc = Document::open("invoice.pdf")?;
156    /// let page = doc.get_page(1)?;
157    ///
158    /// // Extract text
159    /// let text_extractor = TextExtractor::new();
160    /// let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
161    ///
162    /// // Extract invoice data
163    /// let extractor = InvoiceExtractor::builder()
164    ///     .with_language("es")
165    ///     .build();
166    ///
167    /// let invoice = extractor.extract(&extracted.fragments)?;
168    ///
169    /// // Access extracted fields
170    /// for field in &invoice.fields {
171    ///     println!("{}: {:?} (confidence: {:.2})",
172    ///         field.field_type.name(),
173    ///         field.field_type,
174    ///         field.confidence
175    ///     );
176    /// }
177    /// # Ok(())
178    /// # }
179    /// ```
180    ///
181    /// # Performance
182    ///
183    /// Extraction is CPU-bound and typically completes in <100ms for standard invoices.
184    /// The extractor can be safely reused across multiple pages and threads.
185    pub fn extract(&self, text_fragments: &[TextFragment]) -> Result<InvoiceData> {
186        if text_fragments.is_empty() {
187            return Err(ExtractionError::NoTextFound(1));
188        }
189
190        // Step 1: Reconstruct full text with position tracking
191        let full_text = self.reconstruct_text(text_fragments);
192
193        // Step 2: Apply pattern matching
194        let matches = self.pattern_library.match_text(&full_text);
195
196        // Step 3: Convert matches to ExtractedField with proper types
197        let mut fields = Vec::new();
198        for (field_type, matched_value, base_confidence) in matches {
199            // Calculate confidence score with context
200            let confidence =
201                self.calculate_confidence(&field_type, base_confidence, &matched_value, &full_text);
202
203            // Skip fields below threshold
204            if confidence < self.confidence_threshold {
205                continue;
206            }
207
208            // Find position of this match in fragments
209            let position = self.find_match_position(&matched_value, text_fragments);
210
211            // Convert to proper InvoiceField with typed data
212            if let Some(invoice_field) = self.convert_to_invoice_field(field_type, &matched_value) {
213                fields.push(ExtractedField::new(
214                    invoice_field,
215                    confidence,
216                    position,
217                    matched_value,
218                ));
219            }
220        }
221
222        // Step 4: Calculate overall confidence
223        let overall_confidence = if fields.is_empty() {
224            0.0
225        } else {
226            fields.iter().map(|f| f.confidence).sum::<f64>() / fields.len() as f64
227        };
228
229        // Step 5: Create metadata
230        let metadata = InvoiceMetadata::new(1, overall_confidence)
231            .with_language(self.language.unwrap_or(Language::English));
232
233        Ok(InvoiceData::new(fields, metadata))
234    }
235
236    /// Extract invoice data from plain text (convenience method for testing)
237    ///
238    /// This is a convenience wrapper around `extract()` that creates synthetic
239    /// TextFragment objects from plain text input. Primarily useful for testing
240    /// and simple scenarios where you don't have actual PDF text fragments.
241    ///
242    /// **Note**: This method creates fragments without position information,
243    /// so proximity-based scoring may be less accurate than with real PDF fragments.
244    ///
245    /// # Arguments
246    ///
247    /// * `text` - Plain text string to extract invoice data from
248    ///
249    /// # Returns
250    ///
251    /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if text is empty
252    ///
253    /// # Examples
254    ///
255    /// ```
256    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
257    ///
258    /// let extractor = InvoiceExtractor::builder()
259    ///     .with_language("en")
260    ///     .confidence_threshold(0.7)
261    ///     .build();
262    ///
263    /// let invoice_text = "Invoice Number: INV-001\nTotal: £100.00";
264    /// let result = extractor.extract_from_text(invoice_text)?;
265    ///
266    /// assert!(!result.fields.is_empty());
267    /// # Ok::<(), Box<dyn std::error::Error>>(())
268    /// ```
269    pub fn extract_from_text(&self, text: &str) -> Result<InvoiceData> {
270        if text.is_empty() {
271            return Err(ExtractionError::NoTextFound(1));
272        }
273
274        // Create a single synthetic TextFragment from the text
275        let fragment = TextFragment {
276            text: text.to_string(),
277            x: 0.0,
278            y: 0.0,
279            width: 0.0,
280            height: 12.0,
281            font_size: 12.0,
282            font_name: None,
283            is_bold: false,
284            is_italic: false,
285            color: None,
286            space_decisions: Vec::new(),
287        };
288
289        // Use the standard extract method
290        self.extract(&[fragment])
291    }
292
293    /// Reconstruct text from fragments
294    ///
295    /// When `use_kerning` is enabled, applies tighter spacing between fragments
296    /// that share the same font, simulating kerning-aware text reconstruction.
297    ///
298    /// **Implementation**: While full kerning requires font metrics (kerning pairs),
299    /// this simplified version adjusts inter-fragment spacing based on font continuity.
300    /// Fragments with the same font get minimal spacing (single space), while font
301    /// changes get normal spacing (double space).
302    fn reconstruct_text(&self, fragments: &[TextFragment]) -> String {
303        if fragments.is_empty() {
304            return String::new();
305        }
306
307        if !self.use_kerning {
308            // Default: join all fragments with single space
309            return fragments
310                .iter()
311                .map(|f| f.text.as_str())
312                .collect::<Vec<_>>()
313                .join(" ");
314        }
315
316        // Kerning-aware: use tighter spacing for same-font fragments
317        let mut result = String::with_capacity(
318            fragments.iter().map(|f| f.text.len()).sum::<usize>() + fragments.len(),
319        );
320
321        for (i, fragment) in fragments.iter().enumerate() {
322            result.push_str(&fragment.text);
323
324            // Add spacing between fragments
325            if i < fragments.len() - 1 {
326                let next = &fragments[i + 1];
327
328                // If both fragments have same font, use minimal spacing
329                // Otherwise use normal spacing for font transitions
330                let spacing = match (&fragment.font_name, &next.font_name) {
331                    (Some(f1), Some(f2)) if f1 == f2 => " ", // Same font: tight spacing
332                    _ => "  ", // Different/unknown font: normal spacing
333                };
334
335                result.push_str(spacing);
336            }
337        }
338
339        result
340    }
341
342    /// Parse amount with language-aware decimal handling
343    fn parse_amount(&self, value: &str) -> Option<f64> {
344        // Determine decimal format based on language
345        let uses_european_format = matches!(
346            self.language,
347            Some(Language::Spanish) | Some(Language::German) | Some(Language::Italian)
348        );
349
350        let normalized = if uses_european_format {
351            // European format: 1.234,56 → remove dots (thousands), replace comma with dot (decimal)
352            value.replace('.', "").replace(',', ".")
353        } else {
354            // US/UK format: 1,234.56 → remove commas (thousands), dot is already decimal
355            value.replace(',', "")
356        };
357
358        normalized.parse::<f64>().ok()
359    }
360
361    /// Calculate confidence score for a match using multi-factor scoring
362    ///
363    /// Combines multiple factors to produce a final confidence score:
364    /// 1. **Base Pattern Confidence** (0.7-0.9): From pattern matching quality
365    /// 2. **Value Validation Bonus** (-0.5 to +0.2): Format and content validation
366    /// 3. **Proximity Bonus** (0.0 to +0.15): Distance from field label keywords
367    ///
368    /// # Arguments
369    ///
370    /// * `field_type` - The type of field being scored (affects which validator is applied)
371    /// * `base_confidence` - Initial confidence from pattern match quality
372    /// * `matched_value` - The extracted value (used for validation)
373    /// * `full_text` - Complete text of the invoice (used for proximity calculation)
374    ///
375    /// # Returns
376    ///
377    /// Final confidence score clamped to [0.0, 1.0]
378    ///
379    /// # Examples
380    ///
381    /// ```ignore
382    /// // Invoice date with valid format gets validation bonus
383    /// let confidence = extractor.calculate_confidence(
384    ///     &InvoiceFieldType::InvoiceDate,
385    ///     0.85,  // base from pattern
386    ///     "20/01/2025",
387    ///     full_text
388    /// );
389    /// // Result: 0.85 + 0.20 (valid date) + proximity = ~1.0
390    /// ```
391    fn calculate_confidence(
392        &self,
393        field_type: &InvoiceFieldType,
394        base_confidence: f64,
395        matched_value: &str,
396        full_text: &str,
397    ) -> f64 {
398        // Start with base confidence from pattern matching
399        let mut score = base_confidence;
400
401        // Apply value validation adjustments based on field type
402        let validation_adjustment = match field_type {
403            InvoiceFieldType::InvoiceDate | InvoiceFieldType::DueDate => {
404                validators::validate_date(matched_value)
405            }
406            InvoiceFieldType::TotalAmount
407            | InvoiceFieldType::TaxAmount
408            | InvoiceFieldType::NetAmount
409            | InvoiceFieldType::LineItemUnitPrice => validators::validate_amount(matched_value),
410            InvoiceFieldType::InvoiceNumber => validators::validate_invoice_number(matched_value),
411            InvoiceFieldType::VatNumber => validators::validate_vat_number(matched_value),
412            // No validators yet for these fields
413            InvoiceFieldType::SupplierName
414            | InvoiceFieldType::CustomerName
415            | InvoiceFieldType::Currency
416            | InvoiceFieldType::ArticleNumber
417            | InvoiceFieldType::LineItemDescription
418            | InvoiceFieldType::LineItemQuantity => 0.0,
419        };
420
421        score += validation_adjustment;
422
423        // Apply proximity bonus (closeness to field label in text)
424        let proximity_bonus = self.calculate_proximity_bonus(field_type, matched_value, full_text);
425        score += proximity_bonus;
426
427        // Clamp to valid range [0.0, 1.0]
428        score.clamp(0.0, 1.0)
429    }
430
431    /// Calculate proximity bonus based on distance from field label keywords
432    ///
433    /// Fields that appear close to their expected label keywords receive a bonus.
434    /// This helps distinguish between correct matches and ambiguous values that
435    /// happen to match the pattern but appear in the wrong context.
436    ///
437    /// # Proximity Bonus Scale
438    ///
439    /// - **+0.15**: Keyword within 20 characters of match
440    /// - **+0.10**: Keyword within 50 characters
441    /// - **+0.05**: Keyword within 100 characters
442    /// - **0.00**: Keyword beyond 100 characters or not found
443    ///
444    /// # Arguments
445    ///
446    /// * `field_type` - The type of field (determines which keywords to search for)
447    /// * `matched_value` - The extracted value
448    /// * `full_text` - Complete invoice text
449    ///
450    /// # Returns
451    ///
452    /// Proximity bonus in range [0.0, 0.15]
453    fn calculate_proximity_bonus(
454        &self,
455        field_type: &InvoiceFieldType,
456        matched_value: &str,
457        full_text: &str,
458    ) -> f64 {
459        // Define keywords for each field type (language-agnostic where possible)
460        let keywords: Vec<&str> = match field_type {
461            InvoiceFieldType::InvoiceNumber => {
462                vec![
463                    "Invoice", "Factura", "Rechnung", "Fattura", "Number", "Número", "Nr",
464                ]
465            }
466            InvoiceFieldType::InvoiceDate => {
467                vec!["Date", "Fecha", "Datum", "Data", "Invoice Date"]
468            }
469            InvoiceFieldType::DueDate => {
470                vec!["Due", "Vencimiento", "Fällig", "Scadenza", "Payment"]
471            }
472            InvoiceFieldType::TotalAmount => {
473                vec![
474                    "Total",
475                    "Grand Total",
476                    "Amount Due",
477                    "Gesamtbetrag",
478                    "Totale",
479                ]
480            }
481            InvoiceFieldType::TaxAmount => {
482                vec!["VAT", "IVA", "MwSt", "Tax", "Impuesto"]
483            }
484            InvoiceFieldType::NetAmount => {
485                vec![
486                    "Subtotal",
487                    "Net",
488                    "Neto",
489                    "Nettobetrag",
490                    "Imponibile",
491                    "Base",
492                ]
493            }
494            InvoiceFieldType::VatNumber => {
495                vec!["VAT", "CIF", "NIF", "USt", "Partita IVA", "Tax ID"]
496            }
497            InvoiceFieldType::CustomerName => {
498                vec!["Bill to", "Customer", "Client", "Cliente"]
499            }
500            InvoiceFieldType::SupplierName => {
501                vec!["From", "Supplier", "Vendor", "Proveedor"]
502            }
503            _ => return 0.0, // No proximity bonus for other fields
504        };
505
506        // Find the matched value position in full text
507        let match_pos = match full_text.find(matched_value) {
508            Some(pos) => pos,
509            None => return 0.0, // Value not found in text (shouldn't happen)
510        };
511
512        // Find the closest keyword and calculate distance
513        let mut min_distance = usize::MAX;
514        for keyword in keywords {
515            // Case-insensitive search
516            let text_lower = full_text.to_lowercase();
517            let keyword_lower = keyword.to_lowercase();
518
519            if let Some(keyword_pos) = text_lower.find(&keyword_lower) {
520                let distance = if keyword_pos < match_pos {
521                    match_pos - keyword_pos
522                } else {
523                    keyword_pos - match_pos
524                };
525
526                min_distance = min_distance.min(distance);
527            }
528        }
529
530        // Award bonus based on proximity (distance in characters)
531        match min_distance {
532            0..=20 => 0.15,   // Very close (same line, adjacent)
533            21..=50 => 0.10,  // Close (nearby in layout)
534            51..=100 => 0.05, // Moderately close
535            _ => 0.0,         // Too far or not found
536        }
537    }
538
539    /// Find the bounding box of a matched value in the fragments
540    fn find_match_position(&self, matched_value: &str, fragments: &[TextFragment]) -> BoundingBox {
541        // Simple approach: find first fragment containing the value
542        for fragment in fragments {
543            if fragment.text.contains(matched_value) {
544                return BoundingBox::new(fragment.x, fragment.y, fragment.width, fragment.height);
545            }
546        }
547
548        // Fallback: use first fragment's position
549        if let Some(first) = fragments.first() {
550            BoundingBox::new(first.x, first.y, first.width, first.height)
551        } else {
552            BoundingBox::new(0.0, 0.0, 0.0, 0.0)
553        }
554    }
555
556    /// Convert field type and string value to typed InvoiceField
557    fn convert_to_invoice_field(
558        &self,
559        field_type: InvoiceFieldType,
560        value: &str,
561    ) -> Option<InvoiceField> {
562        match field_type {
563            InvoiceFieldType::InvoiceNumber => Some(InvoiceField::InvoiceNumber(value.to_string())),
564            InvoiceFieldType::InvoiceDate => Some(InvoiceField::InvoiceDate(value.to_string())),
565            InvoiceFieldType::DueDate => Some(InvoiceField::DueDate(value.to_string())),
566            InvoiceFieldType::TotalAmount => {
567                self.parse_amount(value).map(InvoiceField::TotalAmount)
568            }
569            InvoiceFieldType::TaxAmount => self.parse_amount(value).map(InvoiceField::TaxAmount),
570            InvoiceFieldType::NetAmount => self.parse_amount(value).map(InvoiceField::NetAmount),
571            InvoiceFieldType::VatNumber => Some(InvoiceField::VatNumber(value.to_string())),
572            InvoiceFieldType::SupplierName => Some(InvoiceField::SupplierName(value.to_string())),
573            InvoiceFieldType::CustomerName => Some(InvoiceField::CustomerName(value.to_string())),
574            InvoiceFieldType::Currency => Some(InvoiceField::Currency(value.to_string())),
575            InvoiceFieldType::ArticleNumber => Some(InvoiceField::ArticleNumber(value.to_string())),
576            InvoiceFieldType::LineItemDescription => {
577                Some(InvoiceField::LineItemDescription(value.to_string()))
578            }
579            InvoiceFieldType::LineItemQuantity => {
580                self.parse_amount(value).map(InvoiceField::LineItemQuantity)
581            }
582            InvoiceFieldType::LineItemUnitPrice => self
583                .parse_amount(value)
584                .map(InvoiceField::LineItemUnitPrice),
585        }
586    }
587}
588
/// Builder for configuring `InvoiceExtractor`
///
/// Provides a fluent API for configuring extraction behavior. All settings
/// have defaults, so `InvoiceExtractor::builder().build()` works as is.
///
/// # Defaults
///
/// - **Language**: None (`build()` then uses `PatternLibrary::new()`)
/// - **Confidence Threshold**: 0.7 (70%)
/// - **Use Kerning**: true (font-aware inter-fragment spacing during text
///   reconstruction)
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceExtractor;
///
/// // Minimal configuration
/// let extractor = InvoiceExtractor::builder()
///     .with_language("es")
///     .build();
///
/// // Full configuration
/// let extractor = InvoiceExtractor::builder()
///     .with_language("de")
///     .confidence_threshold(0.85)
///     .use_kerning(false)
///     .build();
/// ```
pub struct InvoiceExtractorBuilder {
    /// Language chosen via `with_language()`, if any.
    language: Option<Language>,
    /// Minimum confidence for a field to be kept by the built extractor.
    confidence_threshold: f64,
    /// Whether the built extractor uses font-aware spacing when joining fragments.
    use_kerning: bool,
    /// Pattern library supplied via `with_custom_patterns()`; when present,
    /// `build()` uses it instead of a language-based library.
    custom_patterns: Option<PatternLibrary>,
}
623
624impl InvoiceExtractorBuilder {
625    /// Create a new builder with default settings
626    ///
627    /// Defaults:
628    /// - No language (uses English patterns)
629    /// - Confidence threshold: 0.7
630    /// - Kerning: enabled
631    pub fn new() -> Self {
632        Self {
633            language: None,
634            confidence_threshold: 0.7,
635            use_kerning: true,
636            custom_patterns: None,
637        }
638    }
639
640    /// Set the language for pattern matching
641    ///
642    /// Accepts language codes: "es", "en", "de", "it"
643    ///
644    /// # Examples
645    ///
646    /// ```
647    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
648    ///
649    /// let extractor = InvoiceExtractor::builder()
650    ///     .with_language("es")  // Spanish patterns
651    ///     .build();
652    /// ```
653    pub fn with_language(mut self, lang: &str) -> Self {
654        self.language = Language::from_code(lang);
655        self
656    }
657
658    /// Set the minimum confidence threshold (0.0 to 1.0)
659    ///
660    /// Fields below this threshold are filtered out. Higher values reduce
661    /// false positives but may miss valid fields.
662    ///
663    /// Recommended values:
664    /// - **0.5**: Maximum recall (may include false positives)
665    /// - **0.7**: Balanced (default)
666    /// - **0.9**: Maximum precision (may miss valid fields)
667    ///
668    /// # Examples
669    ///
670    /// ```
671    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
672    ///
673    /// // High precision mode
674    /// let extractor = InvoiceExtractor::builder()
675    ///     .confidence_threshold(0.9)
676    ///     .build();
677    /// ```
678    ///
679    /// # Validation
680    ///
681    /// The threshold is automatically clamped to the valid range [0.0, 1.0].
682    /// Values outside this range are silently adjusted to the nearest valid value.
683    pub fn confidence_threshold(mut self, threshold: f64) -> Self {
684        self.confidence_threshold = threshold.clamp(0.0, 1.0);
685        self
686    }
687
688    /// Enable or disable kerning-aware text positioning (PLANNED for v2.0)
689    ///
690    /// **Current Behavior**: This flag is stored but NOT yet used in extraction logic.
691    ///
692    /// **Planned Feature** (v2.0): When enabled, text reconstruction will use actual
693    /// font kerning pairs to calculate accurate character spacing, improving pattern
694    /// matching for invoices with tight kerning (e.g., "AV", "To").
695    ///
696    /// **Why Not Implemented**: Requires architectural changes to expose font metadata
697    /// in `TextFragment`. See struct documentation for technical details.
698    ///
699    /// # Examples
700    ///
701    /// ```
702    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
703    ///
704    /// // Enable for future use (no effect in v1.x)
705    /// let extractor = InvoiceExtractor::builder()
706    ///     .use_kerning(true)  // ⚠️ Stored but not yet functional
707    ///     .build();
708    /// ```
709    pub fn use_kerning(mut self, enabled: bool) -> Self {
710        self.use_kerning = enabled;
711        self
712    }
713
714    /// Use a custom pattern library instead of language-based defaults
715    ///
716    /// Allows complete control over invoice pattern matching by providing a
717    /// custom `PatternLibrary`. Useful for specialized invoice formats or
718    /// combining default patterns with custom additions.
719    ///
720    /// **Note**: When using custom patterns, the `with_language()` setting is ignored.
721    ///
722    /// # Examples
723    ///
724    /// **Example 1: Use default patterns and add custom ones**
725    /// ```
726    /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
727    ///
728    /// // Start with Spanish defaults
729    /// let mut patterns = PatternLibrary::default_spanish();
730    ///
731    /// // Add custom pattern for your specific invoice format
732    /// patterns.add_pattern(
733    ///     FieldPattern::new(
734    ///         InvoiceFieldType::InvoiceNumber,
735    ///         r"Ref:\s*([A-Z0-9\-]+)",  // Your custom format
736    ///         0.85,
737    ///         Some(Language::Spanish)
738    ///     ).unwrap()
739    /// );
740    ///
741    /// let extractor = InvoiceExtractor::builder()
742    ///     .with_custom_patterns(patterns)
743    ///     .build();
744    /// ```
745    ///
746    /// **Example 2: Build completely custom pattern library**
747    /// ```
748    /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
749    ///
750    /// let mut patterns = PatternLibrary::new();
751    ///
752    /// // Add only the patterns you need
753    /// patterns.add_pattern(
754    ///     FieldPattern::new(
755    ///         InvoiceFieldType::InvoiceNumber,
756    ///         r"Order\s+#([0-9]+)",
757    ///         0.9,
758    ///         None  // Language-agnostic
759    ///     ).unwrap()
760    /// );
761    ///
762    /// let extractor = InvoiceExtractor::builder()
763    ///     .with_custom_patterns(patterns)
764    ///     .confidence_threshold(0.8)
765    ///     .build();
766    /// ```
767    pub fn with_custom_patterns(mut self, patterns: PatternLibrary) -> Self {
768        self.custom_patterns = Some(patterns);
769        self
770    }
771
772    /// Build the InvoiceExtractor
773    pub fn build(self) -> InvoiceExtractor {
774        // Use custom patterns if provided, otherwise create from language
775        let pattern_library = if let Some(custom) = self.custom_patterns {
776            custom
777        } else if let Some(lang) = self.language {
778            PatternLibrary::with_language(lang)
779        } else {
780            PatternLibrary::new()
781        };
782
783        InvoiceExtractor {
784            pattern_library,
785            confidence_threshold: self.confidence_threshold,
786            use_kerning: self.use_kerning,
787            language: self.language,
788        }
789    }
790}
791
792impl Default for InvoiceExtractorBuilder {
793    fn default() -> Self {
794        Self::new()
795    }
796}
797
#[cfg(test)]
mod tests {
    use super::*;

    /// A builder left untouched yields threshold 0.7, kerning on, no language.
    #[test]
    fn test_builder_defaults() {
        let InvoiceExtractor {
            confidence_threshold,
            use_kerning,
            language,
            ..
        } = InvoiceExtractor::builder().build();

        assert_eq!(confidence_threshold, 0.7);
        assert!(use_kerning);
        assert!(language.is_none());
    }

    /// The "es" code resolves to Spanish.
    #[test]
    fn test_builder_with_language() {
        let built = InvoiceExtractor::builder().with_language("es").build();
        assert_eq!(built.language, Some(Language::Spanish));
    }

    /// An in-range threshold is stored unchanged.
    #[test]
    fn test_builder_confidence_threshold() {
        let built = InvoiceExtractor::builder()
            .confidence_threshold(0.9)
            .build();
        assert_eq!(built.confidence_threshold, 0.9);
    }

    /// Kerning can be switched off.
    #[test]
    fn test_builder_use_kerning() {
        let built = InvoiceExtractor::builder().use_kerning(false).build();
        assert!(!built.use_kerning);
    }

    /// The kerning flag round-trips through the builder and defaults to true.
    #[test]
    fn test_use_kerning_stored_for_future_use() {
        let stored = |enabled: bool| {
            InvoiceExtractor::builder()
                .use_kerning(enabled)
                .build()
                .use_kerning
        };

        assert!(stored(true), "use_kerning should be stored as true");
        assert!(!stored(false), "use_kerning should be stored as false");

        // Default value
        assert!(
            InvoiceExtractor::builder().build().use_kerning,
            "use_kerning should default to true"
        );
    }
}
oxidize_pdf/text/invoice/extractor.rs

oxidize_pdf/text/invoice/
extractor.rs