oxidize_pdf/text/invoice/extractor.rs
1//! Invoice data extractor
2//!
3//! This module provides the main `InvoiceExtractor` type for extracting structured
4//! data from invoice PDFs using pattern matching and confidence scoring.
5//!
6//! # Architecture
7//!
8//! The extraction process follows a pipeline:
9//!
10//! ```text
11//! TextFragments → Text Reconstruction → Pattern Matching → Type Conversion → InvoiceData
12//! ```
13//!
14//! 1. **Text Reconstruction**: Join text fragments with spatial awareness
15//! 2. **Pattern Matching**: Apply language-specific regex patterns
16//! 3. **Confidence Scoring**: Calculate confidence for each match (0.0-1.0)
17//! 4. **Type Conversion**: Convert strings to typed fields (amounts, dates, etc.)
18//! 5. **Filtering**: Remove low-confidence matches below threshold
19//!
20//! # Usage
21//!
22//! ```ignore
23//! use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
24//! use oxidize_pdf::text::invoice::InvoiceExtractor;
25//! use oxidize_pdf::Document;
26//!
27//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
28//! // Extract text from PDF
29//! let doc = Document::open("invoice.pdf")?;
30//! let page = doc.get_page(1)?;
31//! let text_extractor = TextExtractor::new();
32//! let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
33//!
34//! // Extract invoice data
35//! let extractor = InvoiceExtractor::builder()
36//! .with_language("es")
37//! .confidence_threshold(0.7)
38//! .build();
39//!
40//! let invoice = extractor.extract(&extracted.fragments)?;
41//! println!("Found {} fields", invoice.field_count());
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! # Confidence Scoring
47//!
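//! Each extracted field has a confidence score (0.0 = no confidence, 1.0 = certain).
//! Patterns supply a base score, which is then adjusted by value validation and
//! label proximity and clamped to that range. Typical base scores: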
49//!
50//! - **0.9**: Critical fields (invoice number, total amount)
51//! - **0.8**: Important fields (dates, tax amounts)
52//! - **0.7**: Standard fields (VAT numbers, names)
53//!
54//! Fields below the confidence threshold are automatically filtered out.
55
56use super::error::{ExtractionError, Result};
57use super::patterns::{InvoiceFieldType, PatternLibrary};
58use super::types::{
59 BoundingBox, ExtractedField, InvoiceData, InvoiceField, InvoiceMetadata, Language,
60};
61use super::validators;
62use crate::text::extraction::TextFragment;
63
/// Invoice data extractor with configurable pattern matching
///
/// This is the main entry point for invoice extraction. Use the builder pattern
/// (`InvoiceExtractor::builder()`) to configure language, confidence threshold,
/// kerning-aware spacing, or a custom pattern library.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceExtractor;
///
/// // Spanish invoices with high confidence threshold and kerning-aware spacing
/// let extractor = InvoiceExtractor::builder()
///     .with_language("es")
///     .confidence_threshold(0.85)
///     .use_kerning(true) // Enables font-aware spacing in text reconstruction
///     .build();
/// ```
///
/// # Thread Safety
///
/// Every method on `InvoiceExtractor` takes `&self` and none of them mutates
/// the extractor, so one instance can be reused for many pages. Sharing it
/// across threads additionally requires `PatternLibrary` to be `Send + Sync`
/// (NOTE(review): presumably true — confirm against `patterns.rs`).
/// Consider creating one extractor per language and reusing it.
pub struct InvoiceExtractor {
    /// Patterns matched against the reconstructed page text.
    pattern_library: PatternLibrary,
    /// Minimum final confidence a match must reach to be kept by `extract`.
    confidence_threshold: f64,
    /// Enable kerning-aware text reconstruction
    ///
    /// When enabled, adjusts inter-fragment spacing based on font continuity.
    /// Fragments with the same font use tighter spacing (single space), while
    /// font changes use normal spacing (double space).
    ///
    /// **Implementation Note**: This is a simplified version of true kerning.
    /// Full kerning with font metrics requires access to kerning pair tables,
    /// which would require passing `font_cache` or `Document` reference.
    /// The current implementation provides spacing improvements without
    /// breaking API compatibility.
    use_kerning: bool,
    /// Language chosen on the builder, if any. It selects the decimal format
    /// used when parsing amounts and is reported in the result metadata
    /// (English is reported when unset).
    language: Option<Language>,
}
103
104impl InvoiceExtractor {
105 /// Create a new builder for configuring the extractor
106 ///
107 /// This is the recommended way to create an `InvoiceExtractor`.
108 ///
109 /// # Examples
110 ///
111 /// ```
112 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
113 ///
114 /// let extractor = InvoiceExtractor::builder()
115 /// .with_language("es")
116 /// .confidence_threshold(0.8)
117 /// .build();
118 /// ```
119 pub fn builder() -> InvoiceExtractorBuilder {
120 InvoiceExtractorBuilder::new()
121 }
122
123 /// Extract structured invoice data from text fragments
124 ///
125 /// This is the main extraction method. It processes text fragments from a PDF page
126 /// and returns structured invoice data with confidence scores.
127 ///
128 /// # Process
129 ///
130 /// 1. Text fragments are reconstructed into full text
131 /// 2. Language-specific patterns are applied
132 /// 3. Matches are converted to typed fields
133 /// 4. Confidence scores are calculated
134 /// 5. Low-confidence fields are filtered out
135 ///
136 /// # Arguments
137 ///
138 /// * `text_fragments` - Text fragments extracted from PDF page (from `TextExtractor`)
139 ///
140 /// # Returns
141 ///
142 /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if:
143 /// - No text fragments provided
144 /// - PDF page is empty
145 /// - Text extraction failed
146 ///
147 /// # Examples
148 ///
149 /// ```ignore
150 /// use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
151 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
152 /// use oxidize_pdf::Document;
153 ///
154 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
155 /// let doc = Document::open("invoice.pdf")?;
156 /// let page = doc.get_page(1)?;
157 ///
158 /// // Extract text
159 /// let text_extractor = TextExtractor::new();
160 /// let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
161 ///
162 /// // Extract invoice data
163 /// let extractor = InvoiceExtractor::builder()
164 /// .with_language("es")
165 /// .build();
166 ///
167 /// let invoice = extractor.extract(&extracted.fragments)?;
168 ///
169 /// // Access extracted fields
170 /// for field in &invoice.fields {
171 /// println!("{}: {:?} (confidence: {:.2})",
172 /// field.field_type.name(),
173 /// field.field_type,
174 /// field.confidence
175 /// );
176 /// }
177 /// # Ok(())
178 /// # }
179 /// ```
180 ///
181 /// # Performance
182 ///
183 /// Extraction is CPU-bound and typically completes in <100ms for standard invoices.
184 /// The extractor can be safely reused across multiple pages and threads.
185 pub fn extract(&self, text_fragments: &[TextFragment]) -> Result<InvoiceData> {
186 if text_fragments.is_empty() {
187 return Err(ExtractionError::NoTextFound(1));
188 }
189
190 // Step 1: Reconstruct full text with position tracking
191 let full_text = self.reconstruct_text(text_fragments);
192
193 // Step 2: Apply pattern matching
194 let matches = self.pattern_library.match_text(&full_text);
195
196 // Step 3: Convert matches to ExtractedField with proper types
197 let mut fields = Vec::new();
198 for (field_type, matched_value, base_confidence) in matches {
199 // Calculate confidence score with context
200 let confidence =
201 self.calculate_confidence(&field_type, base_confidence, &matched_value, &full_text);
202
203 // Skip fields below threshold
204 if confidence < self.confidence_threshold {
205 continue;
206 }
207
208 // Find position of this match in fragments
209 let position = self.find_match_position(&matched_value, text_fragments);
210
211 // Convert to proper InvoiceField with typed data
212 if let Some(invoice_field) = self.convert_to_invoice_field(field_type, &matched_value) {
213 fields.push(ExtractedField::new(
214 invoice_field,
215 confidence,
216 position,
217 matched_value,
218 ));
219 }
220 }
221
222 // Step 4: Calculate overall confidence
223 let overall_confidence = if fields.is_empty() {
224 0.0
225 } else {
226 fields.iter().map(|f| f.confidence).sum::<f64>() / fields.len() as f64
227 };
228
229 // Step 5: Create metadata
230 let metadata = InvoiceMetadata::new(1, overall_confidence)
231 .with_language(self.language.unwrap_or(Language::English));
232
233 Ok(InvoiceData::new(fields, metadata))
234 }
235
236 /// Extract invoice data from plain text (convenience method for testing)
237 ///
238 /// This is a convenience wrapper around `extract()` that creates synthetic
239 /// TextFragment objects from plain text input. Primarily useful for testing
240 /// and simple scenarios where you don't have actual PDF text fragments.
241 ///
242 /// **Note**: This method creates fragments without position information,
243 /// so proximity-based scoring may be less accurate than with real PDF fragments.
244 ///
245 /// # Arguments
246 ///
247 /// * `text` - Plain text string to extract invoice data from
248 ///
249 /// # Returns
250 ///
251 /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if text is empty
252 ///
253 /// # Examples
254 ///
255 /// ```
256 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
257 ///
258 /// let extractor = InvoiceExtractor::builder()
259 /// .with_language("en")
260 /// .confidence_threshold(0.7)
261 /// .build();
262 ///
263 /// let invoice_text = "Invoice Number: INV-001\nTotal: £100.00";
264 /// let result = extractor.extract_from_text(invoice_text)?;
265 ///
266 /// assert!(!result.fields.is_empty());
267 /// # Ok::<(), Box<dyn std::error::Error>>(())
268 /// ```
269 pub fn extract_from_text(&self, text: &str) -> Result<InvoiceData> {
270 if text.is_empty() {
271 return Err(ExtractionError::NoTextFound(1));
272 }
273
274 // Create a single synthetic TextFragment from the text
275 let fragment = TextFragment {
276 text: text.to_string(),
277 x: 0.0,
278 y: 0.0,
279 width: 0.0,
280 height: 12.0,
281 font_size: 12.0,
282 font_name: None,
283 is_bold: false,
284 is_italic: false,
285 color: None,
286 space_decisions: Vec::new(),
287 };
288
289 // Use the standard extract method
290 self.extract(&[fragment])
291 }
292
293 /// Reconstruct text from fragments
294 ///
295 /// When `use_kerning` is enabled, applies tighter spacing between fragments
296 /// that share the same font, simulating kerning-aware text reconstruction.
297 ///
298 /// **Implementation**: While full kerning requires font metrics (kerning pairs),
299 /// this simplified version adjusts inter-fragment spacing based on font continuity.
300 /// Fragments with the same font get minimal spacing (single space), while font
301 /// changes get normal spacing (double space).
302 fn reconstruct_text(&self, fragments: &[TextFragment]) -> String {
303 if fragments.is_empty() {
304 return String::new();
305 }
306
307 if !self.use_kerning {
308 // Default: join all fragments with single space
309 return fragments
310 .iter()
311 .map(|f| f.text.as_str())
312 .collect::<Vec<_>>()
313 .join(" ");
314 }
315
316 // Kerning-aware: use tighter spacing for same-font fragments
317 let mut result = String::with_capacity(
318 fragments.iter().map(|f| f.text.len()).sum::<usize>() + fragments.len(),
319 );
320
321 for (i, fragment) in fragments.iter().enumerate() {
322 result.push_str(&fragment.text);
323
324 // Add spacing between fragments
325 if i < fragments.len() - 1 {
326 let next = &fragments[i + 1];
327
328 // If both fragments have same font, use minimal spacing
329 // Otherwise use normal spacing for font transitions
330 let spacing = match (&fragment.font_name, &next.font_name) {
331 (Some(f1), Some(f2)) if f1 == f2 => " ", // Same font: tight spacing
332 _ => " ", // Different/unknown font: normal spacing
333 };
334
335 result.push_str(spacing);
336 }
337 }
338
339 result
340 }
341
342 /// Parse amount with language-aware decimal handling
343 fn parse_amount(&self, value: &str) -> Option<f64> {
344 // Determine decimal format based on language
345 let uses_european_format = matches!(
346 self.language,
347 Some(Language::Spanish) | Some(Language::German) | Some(Language::Italian)
348 );
349
350 let normalized = if uses_european_format {
351 // European format: 1.234,56 → remove dots (thousands), replace comma with dot (decimal)
352 value.replace('.', "").replace(',', ".")
353 } else {
354 // US/UK format: 1,234.56 → remove commas (thousands), dot is already decimal
355 value.replace(',', "")
356 };
357
358 normalized.parse::<f64>().ok()
359 }
360
361 /// Calculate confidence score for a match using multi-factor scoring
362 ///
363 /// Combines multiple factors to produce a final confidence score:
364 /// 1. **Base Pattern Confidence** (0.7-0.9): From pattern matching quality
365 /// 2. **Value Validation Bonus** (-0.5 to +0.2): Format and content validation
366 /// 3. **Proximity Bonus** (0.0 to +0.15): Distance from field label keywords
367 ///
368 /// # Arguments
369 ///
370 /// * `field_type` - The type of field being scored (affects which validator is applied)
371 /// * `base_confidence` - Initial confidence from pattern match quality
372 /// * `matched_value` - The extracted value (used for validation)
373 /// * `full_text` - Complete text of the invoice (used for proximity calculation)
374 ///
375 /// # Returns
376 ///
377 /// Final confidence score clamped to [0.0, 1.0]
378 ///
379 /// # Examples
380 ///
381 /// ```ignore
382 /// // Invoice date with valid format gets validation bonus
383 /// let confidence = extractor.calculate_confidence(
384 /// &InvoiceFieldType::InvoiceDate,
385 /// 0.85, // base from pattern
386 /// "20/01/2025",
387 /// full_text
388 /// );
389 /// // Result: 0.85 + 0.20 (valid date) + proximity = ~1.0
390 /// ```
391 fn calculate_confidence(
392 &self,
393 field_type: &InvoiceFieldType,
394 base_confidence: f64,
395 matched_value: &str,
396 full_text: &str,
397 ) -> f64 {
398 // Start with base confidence from pattern matching
399 let mut score = base_confidence;
400
401 // Apply value validation adjustments based on field type
402 let validation_adjustment = match field_type {
403 InvoiceFieldType::InvoiceDate | InvoiceFieldType::DueDate => {
404 validators::validate_date(matched_value)
405 }
406 InvoiceFieldType::TotalAmount
407 | InvoiceFieldType::TaxAmount
408 | InvoiceFieldType::NetAmount
409 | InvoiceFieldType::LineItemUnitPrice => validators::validate_amount(matched_value),
410 InvoiceFieldType::InvoiceNumber => validators::validate_invoice_number(matched_value),
411 InvoiceFieldType::VatNumber => validators::validate_vat_number(matched_value),
412 // No validators yet for these fields
413 InvoiceFieldType::SupplierName
414 | InvoiceFieldType::CustomerName
415 | InvoiceFieldType::Currency
416 | InvoiceFieldType::ArticleNumber
417 | InvoiceFieldType::LineItemDescription
418 | InvoiceFieldType::LineItemQuantity => 0.0,
419 };
420
421 score += validation_adjustment;
422
423 // Apply proximity bonus (closeness to field label in text)
424 let proximity_bonus = self.calculate_proximity_bonus(field_type, matched_value, full_text);
425 score += proximity_bonus;
426
427 // Clamp to valid range [0.0, 1.0]
428 score.clamp(0.0, 1.0)
429 }
430
431 /// Calculate proximity bonus based on distance from field label keywords
432 ///
433 /// Fields that appear close to their expected label keywords receive a bonus.
434 /// This helps distinguish between correct matches and ambiguous values that
435 /// happen to match the pattern but appear in the wrong context.
436 ///
437 /// # Proximity Bonus Scale
438 ///
439 /// - **+0.15**: Keyword within 20 characters of match
440 /// - **+0.10**: Keyword within 50 characters
441 /// - **+0.05**: Keyword within 100 characters
442 /// - **0.00**: Keyword beyond 100 characters or not found
443 ///
444 /// # Arguments
445 ///
446 /// * `field_type` - The type of field (determines which keywords to search for)
447 /// * `matched_value` - The extracted value
448 /// * `full_text` - Complete invoice text
449 ///
450 /// # Returns
451 ///
452 /// Proximity bonus in range [0.0, 0.15]
453 fn calculate_proximity_bonus(
454 &self,
455 field_type: &InvoiceFieldType,
456 matched_value: &str,
457 full_text: &str,
458 ) -> f64 {
459 // Define keywords for each field type (language-agnostic where possible)
460 let keywords: Vec<&str> = match field_type {
461 InvoiceFieldType::InvoiceNumber => {
462 vec![
463 "Invoice", "Factura", "Rechnung", "Fattura", "Number", "Número", "Nr",
464 ]
465 }
466 InvoiceFieldType::InvoiceDate => {
467 vec!["Date", "Fecha", "Datum", "Data", "Invoice Date"]
468 }
469 InvoiceFieldType::DueDate => {
470 vec!["Due", "Vencimiento", "Fällig", "Scadenza", "Payment"]
471 }
472 InvoiceFieldType::TotalAmount => {
473 vec![
474 "Total",
475 "Grand Total",
476 "Amount Due",
477 "Gesamtbetrag",
478 "Totale",
479 ]
480 }
481 InvoiceFieldType::TaxAmount => {
482 vec!["VAT", "IVA", "MwSt", "Tax", "Impuesto"]
483 }
484 InvoiceFieldType::NetAmount => {
485 vec![
486 "Subtotal",
487 "Net",
488 "Neto",
489 "Nettobetrag",
490 "Imponibile",
491 "Base",
492 ]
493 }
494 InvoiceFieldType::VatNumber => {
495 vec!["VAT", "CIF", "NIF", "USt", "Partita IVA", "Tax ID"]
496 }
497 InvoiceFieldType::CustomerName => {
498 vec!["Bill to", "Customer", "Client", "Cliente"]
499 }
500 InvoiceFieldType::SupplierName => {
501 vec!["From", "Supplier", "Vendor", "Proveedor"]
502 }
503 _ => return 0.0, // No proximity bonus for other fields
504 };
505
506 // Find the matched value position in full text
507 let match_pos = match full_text.find(matched_value) {
508 Some(pos) => pos,
509 None => return 0.0, // Value not found in text (shouldn't happen)
510 };
511
512 // Find the closest keyword and calculate distance
513 let mut min_distance = usize::MAX;
514 for keyword in keywords {
515 // Case-insensitive search
516 let text_lower = full_text.to_lowercase();
517 let keyword_lower = keyword.to_lowercase();
518
519 if let Some(keyword_pos) = text_lower.find(&keyword_lower) {
520 let distance = if keyword_pos < match_pos {
521 match_pos - keyword_pos
522 } else {
523 keyword_pos - match_pos
524 };
525
526 min_distance = min_distance.min(distance);
527 }
528 }
529
530 // Award bonus based on proximity (distance in characters)
531 match min_distance {
532 0..=20 => 0.15, // Very close (same line, adjacent)
533 21..=50 => 0.10, // Close (nearby in layout)
534 51..=100 => 0.05, // Moderately close
535 _ => 0.0, // Too far or not found
536 }
537 }
538
539 /// Find the bounding box of a matched value in the fragments
540 fn find_match_position(&self, matched_value: &str, fragments: &[TextFragment]) -> BoundingBox {
541 // Simple approach: find first fragment containing the value
542 for fragment in fragments {
543 if fragment.text.contains(matched_value) {
544 return BoundingBox::new(fragment.x, fragment.y, fragment.width, fragment.height);
545 }
546 }
547
548 // Fallback: use first fragment's position
549 if let Some(first) = fragments.first() {
550 BoundingBox::new(first.x, first.y, first.width, first.height)
551 } else {
552 BoundingBox::new(0.0, 0.0, 0.0, 0.0)
553 }
554 }
555
556 /// Convert field type and string value to typed InvoiceField
557 fn convert_to_invoice_field(
558 &self,
559 field_type: InvoiceFieldType,
560 value: &str,
561 ) -> Option<InvoiceField> {
562 match field_type {
563 InvoiceFieldType::InvoiceNumber => Some(InvoiceField::InvoiceNumber(value.to_string())),
564 InvoiceFieldType::InvoiceDate => Some(InvoiceField::InvoiceDate(value.to_string())),
565 InvoiceFieldType::DueDate => Some(InvoiceField::DueDate(value.to_string())),
566 InvoiceFieldType::TotalAmount => {
567 self.parse_amount(value).map(InvoiceField::TotalAmount)
568 }
569 InvoiceFieldType::TaxAmount => self.parse_amount(value).map(InvoiceField::TaxAmount),
570 InvoiceFieldType::NetAmount => self.parse_amount(value).map(InvoiceField::NetAmount),
571 InvoiceFieldType::VatNumber => Some(InvoiceField::VatNumber(value.to_string())),
572 InvoiceFieldType::SupplierName => Some(InvoiceField::SupplierName(value.to_string())),
573 InvoiceFieldType::CustomerName => Some(InvoiceField::CustomerName(value.to_string())),
574 InvoiceFieldType::Currency => Some(InvoiceField::Currency(value.to_string())),
575 InvoiceFieldType::ArticleNumber => Some(InvoiceField::ArticleNumber(value.to_string())),
576 InvoiceFieldType::LineItemDescription => {
577 Some(InvoiceField::LineItemDescription(value.to_string()))
578 }
579 InvoiceFieldType::LineItemQuantity => {
580 self.parse_amount(value).map(InvoiceField::LineItemQuantity)
581 }
582 InvoiceFieldType::LineItemUnitPrice => self
583 .parse_amount(value)
584 .map(InvoiceField::LineItemUnitPrice),
585 }
586 }
587}
588
/// Builder for configuring `InvoiceExtractor`
///
/// Provides a fluent API for configuring extraction behavior. Each setter
/// consumes the builder and returns it, so calls can be chained and finished
/// with `build()`.
///
/// # Defaults
///
/// - **Language**: unset
/// - **Confidence Threshold**: 0.7 (70%)
/// - **Use Kerning**: true (see `use_kerning()`)
/// - **Custom Patterns**: none
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::invoice::InvoiceExtractor;
///
/// // Minimal configuration
/// let extractor = InvoiceExtractor::builder()
///     .with_language("es")
///     .build();
///
/// // Full configuration
/// let extractor = InvoiceExtractor::builder()
///     .with_language("de")
///     .confidence_threshold(0.85)
///     .use_kerning(false)
///     .build();
/// ```
pub struct InvoiceExtractorBuilder {
    /// Language selected via `with_language()`, if any.
    language: Option<Language>,
    /// Minimum confidence a field needs to be kept.
    confidence_threshold: f64,
    /// Whether text reconstruction picks separators by font continuity.
    use_kerning: bool,
    /// Pattern library supplied via `with_custom_patterns()`; when present it
    /// takes precedence over the language-based library in `build()`.
    custom_patterns: Option<PatternLibrary>,
}
623
624impl InvoiceExtractorBuilder {
625 /// Create a new builder with default settings
626 ///
627 /// Defaults:
628 /// - No language (uses English patterns)
629 /// - Confidence threshold: 0.7
630 /// - Kerning: enabled
631 pub fn new() -> Self {
632 Self {
633 language: None,
634 confidence_threshold: 0.7,
635 use_kerning: true,
636 custom_patterns: None,
637 }
638 }
639
640 /// Set the language for pattern matching
641 ///
642 /// Accepts language codes: "es", "en", "de", "it"
643 ///
644 /// # Examples
645 ///
646 /// ```
647 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
648 ///
649 /// let extractor = InvoiceExtractor::builder()
650 /// .with_language("es") // Spanish patterns
651 /// .build();
652 /// ```
653 pub fn with_language(mut self, lang: &str) -> Self {
654 self.language = Language::from_code(lang);
655 self
656 }
657
658 /// Set the minimum confidence threshold (0.0 to 1.0)
659 ///
660 /// Fields below this threshold are filtered out. Higher values reduce
661 /// false positives but may miss valid fields.
662 ///
663 /// Recommended values:
664 /// - **0.5**: Maximum recall (may include false positives)
665 /// - **0.7**: Balanced (default)
666 /// - **0.9**: Maximum precision (may miss valid fields)
667 ///
668 /// # Examples
669 ///
670 /// ```
671 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
672 ///
673 /// // High precision mode
674 /// let extractor = InvoiceExtractor::builder()
675 /// .confidence_threshold(0.9)
676 /// .build();
677 /// ```
678 ///
679 /// # Validation
680 ///
681 /// The threshold is automatically clamped to the valid range [0.0, 1.0].
682 /// Values outside this range are silently adjusted to the nearest valid value.
683 pub fn confidence_threshold(mut self, threshold: f64) -> Self {
684 self.confidence_threshold = threshold.clamp(0.0, 1.0);
685 self
686 }
687
688 /// Enable or disable kerning-aware text positioning (PLANNED for v2.0)
689 ///
690 /// **Current Behavior**: This flag is stored but NOT yet used in extraction logic.
691 ///
692 /// **Planned Feature** (v2.0): When enabled, text reconstruction will use actual
693 /// font kerning pairs to calculate accurate character spacing, improving pattern
694 /// matching for invoices with tight kerning (e.g., "AV", "To").
695 ///
696 /// **Why Not Implemented**: Requires architectural changes to expose font metadata
697 /// in `TextFragment`. See struct documentation for technical details.
698 ///
699 /// # Examples
700 ///
701 /// ```
702 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
703 ///
704 /// // Enable for future use (no effect in v1.x)
705 /// let extractor = InvoiceExtractor::builder()
706 /// .use_kerning(true) // ⚠️ Stored but not yet functional
707 /// .build();
708 /// ```
709 pub fn use_kerning(mut self, enabled: bool) -> Self {
710 self.use_kerning = enabled;
711 self
712 }
713
714 /// Use a custom pattern library instead of language-based defaults
715 ///
716 /// Allows complete control over invoice pattern matching by providing a
717 /// custom `PatternLibrary`. Useful for specialized invoice formats or
718 /// combining default patterns with custom additions.
719 ///
720 /// **Note**: When using custom patterns, the `with_language()` setting is ignored.
721 ///
722 /// # Examples
723 ///
724 /// **Example 1: Use default patterns and add custom ones**
725 /// ```
726 /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
727 ///
728 /// // Start with Spanish defaults
729 /// let mut patterns = PatternLibrary::default_spanish();
730 ///
731 /// // Add custom pattern for your specific invoice format
732 /// patterns.add_pattern(
733 /// FieldPattern::new(
734 /// InvoiceFieldType::InvoiceNumber,
735 /// r"Ref:\s*([A-Z0-9\-]+)", // Your custom format
736 /// 0.85,
737 /// Some(Language::Spanish)
738 /// ).unwrap()
739 /// );
740 ///
741 /// let extractor = InvoiceExtractor::builder()
742 /// .with_custom_patterns(patterns)
743 /// .build();
744 /// ```
745 ///
746 /// **Example 2: Build completely custom pattern library**
747 /// ```
748 /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
749 ///
750 /// let mut patterns = PatternLibrary::new();
751 ///
752 /// // Add only the patterns you need
753 /// patterns.add_pattern(
754 /// FieldPattern::new(
755 /// InvoiceFieldType::InvoiceNumber,
756 /// r"Order\s+#([0-9]+)",
757 /// 0.9,
758 /// None // Language-agnostic
759 /// ).unwrap()
760 /// );
761 ///
762 /// let extractor = InvoiceExtractor::builder()
763 /// .with_custom_patterns(patterns)
764 /// .confidence_threshold(0.8)
765 /// .build();
766 /// ```
767 pub fn with_custom_patterns(mut self, patterns: PatternLibrary) -> Self {
768 self.custom_patterns = Some(patterns);
769 self
770 }
771
772 /// Build the InvoiceExtractor
773 pub fn build(self) -> InvoiceExtractor {
774 // Use custom patterns if provided, otherwise create from language
775 let pattern_library = if let Some(custom) = self.custom_patterns {
776 custom
777 } else if let Some(lang) = self.language {
778 PatternLibrary::with_language(lang)
779 } else {
780 PatternLibrary::new()
781 };
782
783 InvoiceExtractor {
784 pattern_library,
785 confidence_threshold: self.confidence_threshold,
786 use_kerning: self.use_kerning,
787 language: self.language,
788 }
789 }
790}
791
792impl Default for InvoiceExtractorBuilder {
793 fn default() -> Self {
794 Self::new()
795 }
796}
797
#[cfg(test)]
mod tests {
    use super::*;

    /// A builder with no setters called yields the documented defaults.
    #[test]
    fn test_builder_defaults() {
        let built = InvoiceExtractor::builder().build();

        assert_eq!(built.confidence_threshold, 0.7);
        assert!(built.use_kerning);
        assert!(built.language.is_none());
    }

    /// The "es" code selects Spanish.
    #[test]
    fn test_builder_with_language() {
        let built = InvoiceExtractor::builder().with_language("es").build();

        assert_eq!(built.language, Some(Language::Spanish));
    }

    /// An in-range threshold is stored unchanged.
    #[test]
    fn test_builder_confidence_threshold() {
        let built = InvoiceExtractor::builder()
            .confidence_threshold(0.9)
            .build();

        assert_eq!(built.confidence_threshold, 0.9);
    }

    /// Kerning can be switched off.
    #[test]
    fn test_builder_use_kerning() {
        let built = InvoiceExtractor::builder().use_kerning(false).build();

        assert!(!built.use_kerning);
    }

    /// The kerning flag round-trips through the builder and defaults to true.
    #[test]
    fn test_use_kerning_stored_for_future_use() {
        // Explicit values are stored as given.
        for &enabled in &[true, false] {
            let built = InvoiceExtractor::builder().use_kerning(enabled).build();
            assert_eq!(
                built.use_kerning, enabled,
                "use_kerning should be stored as {}",
                enabled
            );
        }

        // Default value
        let built_default = InvoiceExtractor::builder().build();
        assert!(
            built_default.use_kerning,
            "use_kerning should default to true"
        );
    }
}