oxidize_pdf/text/invoice/
extractor.rs

1//! Invoice data extractor
2//!
3//! This module provides the main `InvoiceExtractor` type for extracting structured
4//! data from invoice PDFs using pattern matching and confidence scoring.
5//!
6//! # Architecture
7//!
8//! The extraction process follows a pipeline:
9//!
10//! ```text
11//! TextFragments → Text Reconstruction → Pattern Matching → Type Conversion → InvoiceData
12//! ```
13//!
14//! 1. **Text Reconstruction**: Join text fragments with spatial awareness
15//! 2. **Pattern Matching**: Apply language-specific regex patterns
16//! 3. **Confidence Scoring**: Calculate confidence for each match (0.0-1.0)
17//! 4. **Type Conversion**: Convert strings to typed fields (amounts, dates, etc.)
18//! 5. **Filtering**: Remove low-confidence matches below threshold
19//!
20//! # Usage
21//!
22//! ```ignore
23//! use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
24//! use oxidize_pdf::text::invoice::InvoiceExtractor;
25//! use oxidize_pdf::Document;
26//!
27//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
28//! // Extract text from PDF
29//! let doc = Document::open("invoice.pdf")?;
30//! let page = doc.get_page(1)?;
31//! let text_extractor = TextExtractor::new();
32//! let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
33//!
34//! // Extract invoice data
35//! let extractor = InvoiceExtractor::builder()
36//!     .with_language("es")
37//!     .confidence_threshold(0.7)
38//!     .build();
39//!
40//! let invoice = extractor.extract(&extracted.fragments)?;
41//! println!("Found {} fields", invoice.field_count());
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! # Confidence Scoring
47//!
48//! Each extracted field has a confidence score (0.0 = no confidence, 1.0 = certain):
49//!
50//! - **0.9**: Critical fields (invoice number, total amount)
51//! - **0.8**: Important fields (dates, tax amounts)
52//! - **0.7**: Standard fields (VAT numbers, names)
53//!
54//! Fields below the confidence threshold are automatically filtered out.
55
56use super::error::{ExtractionError, Result};
57use super::patterns::{InvoiceFieldType, PatternLibrary};
58use super::types::{
59    BoundingBox, ExtractedField, InvoiceData, InvoiceField, InvoiceMetadata, Language,
60};
61use super::validators;
62use crate::text::extraction::TextFragment;
63
64/// Invoice data extractor with configurable pattern matching
65///
66/// This is the main entry point for invoice extraction. Use the builder pattern
67/// to configure language, confidence thresholds, and other options.
68///
69/// # Examples
70///
71/// ```
72/// use oxidize_pdf::text::invoice::InvoiceExtractor;
73///
74/// // Spanish invoices with high confidence threshold and kerning-aware spacing
75/// let extractor = InvoiceExtractor::builder()
76///     .with_language("es")
77///     .confidence_threshold(0.85)
78///     .use_kerning(true)  // Enables font-aware spacing in text reconstruction
79///     .build();
80/// ```
81///
82/// # Thread Safety
83///
84/// `InvoiceExtractor` is immutable after construction and can be safely shared
85/// across threads. Consider creating one extractor per language and reusing it.
86pub struct InvoiceExtractor {
87    pattern_library: PatternLibrary,
88    confidence_threshold: f64,
89    /// Enable kerning-aware text reconstruction
90    ///
91    /// When enabled, adjusts inter-fragment spacing based on font continuity.
92    /// Fragments with the same font use tighter spacing (single space), while
93    /// font changes use normal spacing (double space).
94    ///
95    /// **Implementation Note**: This is a simplified version of true kerning.
96    /// Full kerning with font metrics requires access to kerning pair tables,
97    /// which would require passing `font_cache` or `Document` reference.
98    /// The current implementation provides spacing improvements without
99    /// breaking API compatibility.
100    use_kerning: bool,
101    language: Option<Language>,
102}
103
104impl InvoiceExtractor {
105    /// Create a new builder for configuring the extractor
106    ///
107    /// This is the recommended way to create an `InvoiceExtractor`.
108    ///
109    /// # Examples
110    ///
111    /// ```
112    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
113    ///
114    /// let extractor = InvoiceExtractor::builder()
115    ///     .with_language("es")
116    ///     .confidence_threshold(0.8)
117    ///     .build();
118    /// ```
119    pub fn builder() -> InvoiceExtractorBuilder {
120        InvoiceExtractorBuilder::new()
121    }
122
123    /// Extract structured invoice data from text fragments
124    ///
125    /// This is the main extraction method. It processes text fragments from a PDF page
126    /// and returns structured invoice data with confidence scores.
127    ///
128    /// # Process
129    ///
130    /// 1. Text fragments are reconstructed into full text
131    /// 2. Language-specific patterns are applied
132    /// 3. Matches are converted to typed fields
133    /// 4. Confidence scores are calculated
134    /// 5. Low-confidence fields are filtered out
135    ///
136    /// # Arguments
137    ///
138    /// * `text_fragments` - Text fragments extracted from PDF page (from `TextExtractor`)
139    ///
140    /// # Returns
141    ///
142    /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if:
143    /// - No text fragments provided
144    /// - PDF page is empty
145    /// - Text extraction failed
146    ///
147    /// # Examples
148    ///
149    /// ```ignore
150    /// use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
151    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
152    /// use oxidize_pdf::Document;
153    ///
154    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
155    /// let doc = Document::open("invoice.pdf")?;
156    /// let page = doc.get_page(1)?;
157    ///
158    /// // Extract text
159    /// let text_extractor = TextExtractor::new();
160    /// let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
161    ///
162    /// // Extract invoice data
163    /// let extractor = InvoiceExtractor::builder()
164    ///     .with_language("es")
165    ///     .build();
166    ///
167    /// let invoice = extractor.extract(&extracted.fragments)?;
168    ///
169    /// // Access extracted fields
170    /// for field in &invoice.fields {
171    ///     println!("{}: {:?} (confidence: {:.2})",
172    ///         field.field_type.name(),
173    ///         field.field_type,
174    ///         field.confidence
175    ///     );
176    /// }
177    /// # Ok(())
178    /// # }
179    /// ```
180    ///
181    /// # Performance
182    ///
183    /// Extraction is CPU-bound and typically completes in <100ms for standard invoices.
184    /// The extractor can be safely reused across multiple pages and threads.
185    pub fn extract(&self, text_fragments: &[TextFragment]) -> Result<InvoiceData> {
186        if text_fragments.is_empty() {
187            return Err(ExtractionError::NoTextFound(1));
188        }
189
190        // Step 1: Reconstruct full text with position tracking
191        let full_text = self.reconstruct_text(text_fragments);
192
193        // Step 2: Apply pattern matching
194        let matches = self.pattern_library.match_text(&full_text);
195
196        // Step 3: Convert matches to ExtractedField with proper types
197        let mut fields = Vec::new();
198        for (field_type, matched_value, base_confidence) in matches {
199            // Calculate confidence score with context
200            let confidence =
201                self.calculate_confidence(&field_type, base_confidence, &matched_value, &full_text);
202
203            // Skip fields below threshold
204            if confidence < self.confidence_threshold {
205                continue;
206            }
207
208            // Find position of this match in fragments
209            let position = self.find_match_position(&matched_value, text_fragments);
210
211            // Convert to proper InvoiceField with typed data
212            if let Some(invoice_field) = self.convert_to_invoice_field(field_type, &matched_value) {
213                fields.push(ExtractedField::new(
214                    invoice_field,
215                    confidence,
216                    position,
217                    matched_value,
218                ));
219            }
220        }
221
222        // Step 4: Calculate overall confidence
223        let overall_confidence = if fields.is_empty() {
224            0.0
225        } else {
226            fields.iter().map(|f| f.confidence).sum::<f64>() / fields.len() as f64
227        };
228
229        // Step 5: Create metadata
230        let metadata = InvoiceMetadata::new(1, overall_confidence)
231            .with_language(self.language.unwrap_or(Language::English));
232
233        Ok(InvoiceData::new(fields, metadata))
234    }
235
236    /// Extract invoice data from plain text (convenience method for testing)
237    ///
238    /// This is a convenience wrapper around `extract()` that creates synthetic
239    /// TextFragment objects from plain text input. Primarily useful for testing
240    /// and simple scenarios where you don't have actual PDF text fragments.
241    ///
242    /// **Note**: This method creates fragments without position information,
243    /// so proximity-based scoring may be less accurate than with real PDF fragments.
244    ///
245    /// # Arguments
246    ///
247    /// * `text` - Plain text string to extract invoice data from
248    ///
249    /// # Returns
250    ///
251    /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if text is empty
252    ///
253    /// # Examples
254    ///
255    /// ```
256    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
257    ///
258    /// let extractor = InvoiceExtractor::builder()
259    ///     .with_language("en")
260    ///     .confidence_threshold(0.7)
261    ///     .build();
262    ///
263    /// let invoice_text = "Invoice Number: INV-001\nTotal: £100.00";
264    /// let result = extractor.extract_from_text(invoice_text)?;
265    ///
266    /// assert!(!result.fields.is_empty());
267    /// # Ok::<(), Box<dyn std::error::Error>>(())
268    /// ```
269    pub fn extract_from_text(&self, text: &str) -> Result<InvoiceData> {
270        if text.is_empty() {
271            return Err(ExtractionError::NoTextFound(1));
272        }
273
274        // Create a single synthetic TextFragment from the text
275        let fragment = TextFragment {
276            text: text.to_string(),
277            x: 0.0,
278            y: 0.0,
279            width: 0.0,
280            height: 12.0,
281            font_size: 12.0,
282            font_name: None,
283            is_bold: false,
284            is_italic: false,
285            color: None,
286        };
287
288        // Use the standard extract method
289        self.extract(&[fragment])
290    }
291
292    /// Reconstruct text from fragments
293    ///
294    /// When `use_kerning` is enabled, applies tighter spacing between fragments
295    /// that share the same font, simulating kerning-aware text reconstruction.
296    ///
297    /// **Implementation**: While full kerning requires font metrics (kerning pairs),
298    /// this simplified version adjusts inter-fragment spacing based on font continuity.
299    /// Fragments with the same font get minimal spacing (single space), while font
300    /// changes get normal spacing (double space).
301    fn reconstruct_text(&self, fragments: &[TextFragment]) -> String {
302        if fragments.is_empty() {
303            return String::new();
304        }
305
306        if !self.use_kerning {
307            // Default: join all fragments with single space
308            return fragments
309                .iter()
310                .map(|f| f.text.as_str())
311                .collect::<Vec<_>>()
312                .join(" ");
313        }
314
315        // Kerning-aware: use tighter spacing for same-font fragments
316        let mut result = String::with_capacity(
317            fragments.iter().map(|f| f.text.len()).sum::<usize>() + fragments.len(),
318        );
319
320        for (i, fragment) in fragments.iter().enumerate() {
321            result.push_str(&fragment.text);
322
323            // Add spacing between fragments
324            if i < fragments.len() - 1 {
325                let next = &fragments[i + 1];
326
327                // If both fragments have same font, use minimal spacing
328                // Otherwise use normal spacing for font transitions
329                let spacing = match (&fragment.font_name, &next.font_name) {
330                    (Some(f1), Some(f2)) if f1 == f2 => " ", // Same font: tight spacing
331                    _ => "  ", // Different/unknown font: normal spacing
332                };
333
334                result.push_str(spacing);
335            }
336        }
337
338        result
339    }
340
341    /// Parse amount with language-aware decimal handling
342    fn parse_amount(&self, value: &str) -> Option<f64> {
343        // Determine decimal format based on language
344        let uses_european_format = matches!(
345            self.language,
346            Some(Language::Spanish) | Some(Language::German) | Some(Language::Italian)
347        );
348
349        let normalized = if uses_european_format {
350            // European format: 1.234,56 → remove dots (thousands), replace comma with dot (decimal)
351            value.replace('.', "").replace(',', ".")
352        } else {
353            // US/UK format: 1,234.56 → remove commas (thousands), dot is already decimal
354            value.replace(',', "")
355        };
356
357        normalized.parse::<f64>().ok()
358    }
359
360    /// Calculate confidence score for a match using multi-factor scoring
361    ///
362    /// Combines multiple factors to produce a final confidence score:
363    /// 1. **Base Pattern Confidence** (0.7-0.9): From pattern matching quality
364    /// 2. **Value Validation Bonus** (-0.5 to +0.2): Format and content validation
365    /// 3. **Proximity Bonus** (0.0 to +0.15): Distance from field label keywords
366    ///
367    /// # Arguments
368    ///
369    /// * `field_type` - The type of field being scored (affects which validator is applied)
370    /// * `base_confidence` - Initial confidence from pattern match quality
371    /// * `matched_value` - The extracted value (used for validation)
372    /// * `full_text` - Complete text of the invoice (used for proximity calculation)
373    ///
374    /// # Returns
375    ///
376    /// Final confidence score clamped to [0.0, 1.0]
377    ///
378    /// # Examples
379    ///
380    /// ```ignore
381    /// // Invoice date with valid format gets validation bonus
382    /// let confidence = extractor.calculate_confidence(
383    ///     &InvoiceFieldType::InvoiceDate,
384    ///     0.85,  // base from pattern
385    ///     "20/01/2025",
386    ///     full_text
387    /// );
388    /// // Result: 0.85 + 0.20 (valid date) + proximity = ~1.0
389    /// ```
390    fn calculate_confidence(
391        &self,
392        field_type: &InvoiceFieldType,
393        base_confidence: f64,
394        matched_value: &str,
395        full_text: &str,
396    ) -> f64 {
397        // Start with base confidence from pattern matching
398        let mut score = base_confidence;
399
400        // Apply value validation adjustments based on field type
401        let validation_adjustment = match field_type {
402            InvoiceFieldType::InvoiceDate | InvoiceFieldType::DueDate => {
403                validators::validate_date(matched_value)
404            }
405            InvoiceFieldType::TotalAmount
406            | InvoiceFieldType::TaxAmount
407            | InvoiceFieldType::NetAmount
408            | InvoiceFieldType::LineItemUnitPrice => validators::validate_amount(matched_value),
409            InvoiceFieldType::InvoiceNumber => validators::validate_invoice_number(matched_value),
410            InvoiceFieldType::VatNumber => validators::validate_vat_number(matched_value),
411            // No validators yet for these fields
412            InvoiceFieldType::SupplierName
413            | InvoiceFieldType::CustomerName
414            | InvoiceFieldType::Currency
415            | InvoiceFieldType::ArticleNumber
416            | InvoiceFieldType::LineItemDescription
417            | InvoiceFieldType::LineItemQuantity => 0.0,
418        };
419
420        score += validation_adjustment;
421
422        // Apply proximity bonus (closeness to field label in text)
423        let proximity_bonus = self.calculate_proximity_bonus(field_type, matched_value, full_text);
424        score += proximity_bonus;
425
426        // Clamp to valid range [0.0, 1.0]
427        score.clamp(0.0, 1.0)
428    }
429
430    /// Calculate proximity bonus based on distance from field label keywords
431    ///
432    /// Fields that appear close to their expected label keywords receive a bonus.
433    /// This helps distinguish between correct matches and ambiguous values that
434    /// happen to match the pattern but appear in the wrong context.
435    ///
436    /// # Proximity Bonus Scale
437    ///
438    /// - **+0.15**: Keyword within 20 characters of match
439    /// - **+0.10**: Keyword within 50 characters
440    /// - **+0.05**: Keyword within 100 characters
441    /// - **0.00**: Keyword beyond 100 characters or not found
442    ///
443    /// # Arguments
444    ///
445    /// * `field_type` - The type of field (determines which keywords to search for)
446    /// * `matched_value` - The extracted value
447    /// * `full_text` - Complete invoice text
448    ///
449    /// # Returns
450    ///
451    /// Proximity bonus in range [0.0, 0.15]
452    fn calculate_proximity_bonus(
453        &self,
454        field_type: &InvoiceFieldType,
455        matched_value: &str,
456        full_text: &str,
457    ) -> f64 {
458        // Define keywords for each field type (language-agnostic where possible)
459        let keywords: Vec<&str> = match field_type {
460            InvoiceFieldType::InvoiceNumber => {
461                vec![
462                    "Invoice", "Factura", "Rechnung", "Fattura", "Number", "Número", "Nr",
463                ]
464            }
465            InvoiceFieldType::InvoiceDate => {
466                vec!["Date", "Fecha", "Datum", "Data", "Invoice Date"]
467            }
468            InvoiceFieldType::DueDate => {
469                vec!["Due", "Vencimiento", "Fällig", "Scadenza", "Payment"]
470            }
471            InvoiceFieldType::TotalAmount => {
472                vec![
473                    "Total",
474                    "Grand Total",
475                    "Amount Due",
476                    "Gesamtbetrag",
477                    "Totale",
478                ]
479            }
480            InvoiceFieldType::TaxAmount => {
481                vec!["VAT", "IVA", "MwSt", "Tax", "Impuesto"]
482            }
483            InvoiceFieldType::NetAmount => {
484                vec![
485                    "Subtotal",
486                    "Net",
487                    "Neto",
488                    "Nettobetrag",
489                    "Imponibile",
490                    "Base",
491                ]
492            }
493            InvoiceFieldType::VatNumber => {
494                vec!["VAT", "CIF", "NIF", "USt", "Partita IVA", "Tax ID"]
495            }
496            InvoiceFieldType::CustomerName => {
497                vec!["Bill to", "Customer", "Client", "Cliente"]
498            }
499            InvoiceFieldType::SupplierName => {
500                vec!["From", "Supplier", "Vendor", "Proveedor"]
501            }
502            _ => return 0.0, // No proximity bonus for other fields
503        };
504
505        // Find the matched value position in full text
506        let match_pos = match full_text.find(matched_value) {
507            Some(pos) => pos,
508            None => return 0.0, // Value not found in text (shouldn't happen)
509        };
510
511        // Find the closest keyword and calculate distance
512        let mut min_distance = usize::MAX;
513        for keyword in keywords {
514            // Case-insensitive search
515            let text_lower = full_text.to_lowercase();
516            let keyword_lower = keyword.to_lowercase();
517
518            if let Some(keyword_pos) = text_lower.find(&keyword_lower) {
519                let distance = if keyword_pos < match_pos {
520                    match_pos - keyword_pos
521                } else {
522                    keyword_pos - match_pos
523                };
524
525                min_distance = min_distance.min(distance);
526            }
527        }
528
529        // Award bonus based on proximity (distance in characters)
530        match min_distance {
531            0..=20 => 0.15,   // Very close (same line, adjacent)
532            21..=50 => 0.10,  // Close (nearby in layout)
533            51..=100 => 0.05, // Moderately close
534            _ => 0.0,         // Too far or not found
535        }
536    }
537
538    /// Find the bounding box of a matched value in the fragments
539    fn find_match_position(&self, matched_value: &str, fragments: &[TextFragment]) -> BoundingBox {
540        // Simple approach: find first fragment containing the value
541        for fragment in fragments {
542            if fragment.text.contains(matched_value) {
543                return BoundingBox::new(fragment.x, fragment.y, fragment.width, fragment.height);
544            }
545        }
546
547        // Fallback: use first fragment's position
548        if let Some(first) = fragments.first() {
549            BoundingBox::new(first.x, first.y, first.width, first.height)
550        } else {
551            BoundingBox::new(0.0, 0.0, 0.0, 0.0)
552        }
553    }
554
555    /// Convert field type and string value to typed InvoiceField
556    fn convert_to_invoice_field(
557        &self,
558        field_type: InvoiceFieldType,
559        value: &str,
560    ) -> Option<InvoiceField> {
561        match field_type {
562            InvoiceFieldType::InvoiceNumber => Some(InvoiceField::InvoiceNumber(value.to_string())),
563            InvoiceFieldType::InvoiceDate => Some(InvoiceField::InvoiceDate(value.to_string())),
564            InvoiceFieldType::DueDate => Some(InvoiceField::DueDate(value.to_string())),
565            InvoiceFieldType::TotalAmount => {
566                self.parse_amount(value).map(InvoiceField::TotalAmount)
567            }
568            InvoiceFieldType::TaxAmount => self.parse_amount(value).map(InvoiceField::TaxAmount),
569            InvoiceFieldType::NetAmount => self.parse_amount(value).map(InvoiceField::NetAmount),
570            InvoiceFieldType::VatNumber => Some(InvoiceField::VatNumber(value.to_string())),
571            InvoiceFieldType::SupplierName => Some(InvoiceField::SupplierName(value.to_string())),
572            InvoiceFieldType::CustomerName => Some(InvoiceField::CustomerName(value.to_string())),
573            InvoiceFieldType::Currency => Some(InvoiceField::Currency(value.to_string())),
574            InvoiceFieldType::ArticleNumber => Some(InvoiceField::ArticleNumber(value.to_string())),
575            InvoiceFieldType::LineItemDescription => {
576                Some(InvoiceField::LineItemDescription(value.to_string()))
577            }
578            InvoiceFieldType::LineItemQuantity => {
579                self.parse_amount(value).map(InvoiceField::LineItemQuantity)
580            }
581            InvoiceFieldType::LineItemUnitPrice => self
582                .parse_amount(value)
583                .map(InvoiceField::LineItemUnitPrice),
584        }
585    }
586}
587
588/// Builder for configuring `InvoiceExtractor`
589///
590/// Provides a fluent API for configuring extraction behavior. All settings
591/// have sensible defaults for immediate use.
592///
593/// # Defaults
594///
595/// - **Language**: None (uses default patterns)
596/// - **Confidence Threshold**: 0.7 (70%)
597/// - **Use Kerning**: true (stored but not yet functional - see `use_kerning()` docs)
598///
599/// # Examples
600///
601/// ```
602/// use oxidize_pdf::text::invoice::InvoiceExtractor;
603///
604/// // Minimal configuration
605/// let extractor = InvoiceExtractor::builder()
606///     .with_language("es")
607///     .build();
608///
609/// // Full configuration
610/// let extractor = InvoiceExtractor::builder()
611///     .with_language("de")
612///     .confidence_threshold(0.85)
613///     .use_kerning(false)
614///     .build();
615/// ```
616pub struct InvoiceExtractorBuilder {
617    language: Option<Language>,
618    confidence_threshold: f64,
619    use_kerning: bool,
620    custom_patterns: Option<PatternLibrary>,
621}
622
623impl InvoiceExtractorBuilder {
624    /// Create a new builder with default settings
625    ///
626    /// Defaults:
627    /// - No language (uses English patterns)
628    /// - Confidence threshold: 0.7
629    /// - Kerning: enabled
630    pub fn new() -> Self {
631        Self {
632            language: None,
633            confidence_threshold: 0.7,
634            use_kerning: true,
635            custom_patterns: None,
636        }
637    }
638
639    /// Set the language for pattern matching
640    ///
641    /// Accepts language codes: "es", "en", "de", "it"
642    ///
643    /// # Examples
644    ///
645    /// ```
646    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
647    ///
648    /// let extractor = InvoiceExtractor::builder()
649    ///     .with_language("es")  // Spanish patterns
650    ///     .build();
651    /// ```
652    pub fn with_language(mut self, lang: &str) -> Self {
653        self.language = Language::from_code(lang);
654        self
655    }
656
657    /// Set the minimum confidence threshold (0.0 to 1.0)
658    ///
659    /// Fields below this threshold are filtered out. Higher values reduce
660    /// false positives but may miss valid fields.
661    ///
662    /// Recommended values:
663    /// - **0.5**: Maximum recall (may include false positives)
664    /// - **0.7**: Balanced (default)
665    /// - **0.9**: Maximum precision (may miss valid fields)
666    ///
667    /// # Examples
668    ///
669    /// ```
670    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
671    ///
672    /// // High precision mode
673    /// let extractor = InvoiceExtractor::builder()
674    ///     .confidence_threshold(0.9)
675    ///     .build();
676    /// ```
677    ///
678    /// # Validation
679    ///
680    /// The threshold is automatically clamped to the valid range [0.0, 1.0].
681    /// Values outside this range are silently adjusted to the nearest valid value.
682    pub fn confidence_threshold(mut self, threshold: f64) -> Self {
683        self.confidence_threshold = threshold.clamp(0.0, 1.0);
684        self
685    }
686
687    /// Enable or disable kerning-aware text positioning (PLANNED for v2.0)
688    ///
689    /// **Current Behavior**: This flag is stored but NOT yet used in extraction logic.
690    ///
691    /// **Planned Feature** (v2.0): When enabled, text reconstruction will use actual
692    /// font kerning pairs to calculate accurate character spacing, improving pattern
693    /// matching for invoices with tight kerning (e.g., "AV", "To").
694    ///
695    /// **Why Not Implemented**: Requires architectural changes to expose font metadata
696    /// in `TextFragment`. See struct documentation for technical details.
697    ///
698    /// # Examples
699    ///
700    /// ```
701    /// use oxidize_pdf::text::invoice::InvoiceExtractor;
702    ///
703    /// // Enable for future use (no effect in v1.x)
704    /// let extractor = InvoiceExtractor::builder()
705    ///     .use_kerning(true)  // ⚠️ Stored but not yet functional
706    ///     .build();
707    /// ```
708    pub fn use_kerning(mut self, enabled: bool) -> Self {
709        self.use_kerning = enabled;
710        self
711    }
712
713    /// Use a custom pattern library instead of language-based defaults
714    ///
715    /// Allows complete control over invoice pattern matching by providing a
716    /// custom `PatternLibrary`. Useful for specialized invoice formats or
717    /// combining default patterns with custom additions.
718    ///
719    /// **Note**: When using custom patterns, the `with_language()` setting is ignored.
720    ///
721    /// # Examples
722    ///
723    /// **Example 1: Use default patterns and add custom ones**
724    /// ```
725    /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
726    ///
727    /// // Start with Spanish defaults
728    /// let mut patterns = PatternLibrary::default_spanish();
729    ///
730    /// // Add custom pattern for your specific invoice format
731    /// patterns.add_pattern(
732    ///     FieldPattern::new(
733    ///         InvoiceFieldType::InvoiceNumber,
734    ///         r"Ref:\s*([A-Z0-9\-]+)",  // Your custom format
735    ///         0.85,
736    ///         Some(Language::Spanish)
737    ///     ).unwrap()
738    /// );
739    ///
740    /// let extractor = InvoiceExtractor::builder()
741    ///     .with_custom_patterns(patterns)
742    ///     .build();
743    /// ```
744    ///
745    /// **Example 2: Build completely custom pattern library**
746    /// ```
747    /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
748    ///
749    /// let mut patterns = PatternLibrary::new();
750    ///
751    /// // Add only the patterns you need
752    /// patterns.add_pattern(
753    ///     FieldPattern::new(
754    ///         InvoiceFieldType::InvoiceNumber,
755    ///         r"Order\s+#([0-9]+)",
756    ///         0.9,
757    ///         None  // Language-agnostic
758    ///     ).unwrap()
759    /// );
760    ///
761    /// let extractor = InvoiceExtractor::builder()
762    ///     .with_custom_patterns(patterns)
763    ///     .confidence_threshold(0.8)
764    ///     .build();
765    /// ```
766    pub fn with_custom_patterns(mut self, patterns: PatternLibrary) -> Self {
767        self.custom_patterns = Some(patterns);
768        self
769    }
770
771    /// Build the InvoiceExtractor
772    pub fn build(self) -> InvoiceExtractor {
773        // Use custom patterns if provided, otherwise create from language
774        let pattern_library = if let Some(custom) = self.custom_patterns {
775            custom
776        } else if let Some(lang) = self.language {
777            PatternLibrary::with_language(lang)
778        } else {
779            PatternLibrary::new()
780        };
781
782        InvoiceExtractor {
783            pattern_library,
784            confidence_threshold: self.confidence_threshold,
785            use_kerning: self.use_kerning,
786            language: self.language,
787        }
788    }
789}
790
791impl Default for InvoiceExtractorBuilder {
792    fn default() -> Self {
793        Self::new()
794    }
795}
796
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a fragment without font information; only the text matters here.
    fn fragment(text: &str) -> TextFragment {
        TextFragment {
            text: text.to_string(),
            x: 0.0,
            y: 0.0,
            width: 0.0,
            height: 12.0,
            font_size: 12.0,
            font_name: None,
            is_bold: false,
            is_italic: false,
            color: None,
        }
    }

    #[test]
    fn test_builder_defaults() {
        let extractor = InvoiceExtractor::builder().build();
        assert_eq!(extractor.confidence_threshold, 0.7);
        assert!(extractor.use_kerning);
        assert!(extractor.language.is_none());
    }

    #[test]
    fn test_builder_with_language() {
        let extractor = InvoiceExtractor::builder().with_language("es").build();
        assert_eq!(extractor.language, Some(Language::Spanish));
    }

    #[test]
    fn test_builder_confidence_threshold() {
        let extractor = InvoiceExtractor::builder()
            .confidence_threshold(0.9)
            .build();
        assert_eq!(extractor.confidence_threshold, 0.9);
    }

    #[test]
    fn test_builder_confidence_threshold_is_clamped() {
        let too_high = InvoiceExtractor::builder()
            .confidence_threshold(1.5)
            .build();
        assert_eq!(too_high.confidence_threshold, 1.0);

        let too_low = InvoiceExtractor::builder()
            .confidence_threshold(-0.5)
            .build();
        assert_eq!(too_low.confidence_threshold, 0.0);
    }

    #[test]
    fn test_builder_use_kerning() {
        let extractor = InvoiceExtractor::builder().use_kerning(false).build();
        assert!(!extractor.use_kerning);
    }

    #[test]
    fn test_use_kerning_flag_is_stored() {
        // The builder must hand the flag through to the extractor unchanged.
        let extractor_enabled = InvoiceExtractor::builder().use_kerning(true).build();
        assert!(
            extractor_enabled.use_kerning,
            "use_kerning should be stored as true"
        );

        let extractor_disabled = InvoiceExtractor::builder().use_kerning(false).build();
        assert!(
            !extractor_disabled.use_kerning,
            "use_kerning should be stored as false"
        );

        // Default value
        let extractor_default = InvoiceExtractor::builder().build();
        assert!(
            extractor_default.use_kerning,
            "use_kerning should default to true"
        );
    }

    #[test]
    fn test_use_kerning_changes_fragment_spacing() {
        // Fragments without a font name count as a font change when kerning is on.
        let fragments = [fragment("Total:"), fragment("100.00")];

        let plain = InvoiceExtractor::builder().use_kerning(false).build();
        assert_eq!(plain.reconstruct_text(&fragments), "Total: 100.00");

        let kerned = InvoiceExtractor::builder().use_kerning(true).build();
        assert_eq!(kerned.reconstruct_text(&fragments), "Total:  100.00");
    }

    #[test]
    fn test_parse_amount_european_format() {
        let extractor = InvoiceExtractor::builder().with_language("es").build();
        assert_eq!(extractor.parse_amount("1.234,56"), Some(1234.56));
    }

    #[test]
    fn test_parse_amount_us_format() {
        let extractor = InvoiceExtractor::builder().with_language("en").build();
        assert_eq!(extractor.parse_amount("1,234.56"), Some(1234.56));
        assert_eq!(extractor.parse_amount("not a number"), None);
    }
}
oxidize_pdf/text/invoice/extractor.rs

oxidize_pdf/text/invoice/
extractor.rs