oxidize_pdf/text/invoice/extractor.rs
1//! Invoice data extractor
2//!
3//! This module provides the main `InvoiceExtractor` type for extracting structured
4//! data from invoice PDFs using pattern matching and confidence scoring.
5//!
6//! # Architecture
7//!
8//! The extraction process follows a pipeline:
9//!
10//! ```text
11//! TextFragments → Text Reconstruction → Pattern Matching → Type Conversion → InvoiceData
12//! ```
13//!
//! 1. **Text Reconstruction**: Join text fragments, in order, into a single string
//! 2. **Pattern Matching**: Apply language-specific regex patterns
//! 3. **Confidence Scoring**: Calculate confidence for each match (0.0-1.0)
//! 4. **Filtering**: Remove low-confidence matches below threshold
//! 5. **Type Conversion**: Convert the remaining strings to typed fields (amounts, dates, etc.)
19//!
20//! # Usage
21//!
22//! ```ignore
23//! use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
24//! use oxidize_pdf::text::invoice::InvoiceExtractor;
25//! use oxidize_pdf::Document;
26//!
27//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
28//! // Extract text from PDF
29//! let doc = Document::open("invoice.pdf")?;
30//! let page = doc.get_page(1)?;
31//! let text_extractor = TextExtractor::new();
32//! let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
33//!
34//! // Extract invoice data
35//! let extractor = InvoiceExtractor::builder()
36//! .with_language("es")
37//! .confidence_threshold(0.7)
38//! .build();
39//!
40//! let invoice = extractor.extract(&extracted.fragments)?;
41//! println!("Found {} fields", invoice.field_count());
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! # Confidence Scoring
47//!
48//! Each extracted field has a confidence score (0.0 = no confidence, 1.0 = certain):
49//!
50//! - **0.9**: Critical fields (invoice number, total amount)
51//! - **0.8**: Important fields (dates, tax amounts)
52//! - **0.7**: Standard fields (VAT numbers, names)
53//!
54//! Fields below the confidence threshold are automatically filtered out.
55
56use super::error::{ExtractionError, Result};
57use super::patterns::{InvoiceFieldType, PatternLibrary};
58use super::types::{
59 BoundingBox, ExtractedField, InvoiceData, InvoiceField, InvoiceMetadata, Language,
60};
61use super::validators;
62use crate::text::extraction::TextFragment;
63
64/// Invoice data extractor with configurable pattern matching
65///
66/// This is the main entry point for invoice extraction. Use the builder pattern
67/// to configure language, confidence thresholds, and other options.
68///
69/// # Examples
70///
71/// ```
72/// use oxidize_pdf::text::invoice::InvoiceExtractor;
73///
74/// // Spanish invoices with high confidence threshold and kerning-aware spacing
75/// let extractor = InvoiceExtractor::builder()
76/// .with_language("es")
77/// .confidence_threshold(0.85)
78/// .use_kerning(true) // Enables font-aware spacing in text reconstruction
79/// .build();
80/// ```
81///
82/// # Thread Safety
83///
84/// `InvoiceExtractor` is immutable after construction and can be safely shared
85/// across threads. Consider creating one extractor per language and reusing it.
86pub struct InvoiceExtractor {
87 pattern_library: PatternLibrary,
88 confidence_threshold: f64,
89 /// Enable kerning-aware text reconstruction
90 ///
91 /// When enabled, adjusts inter-fragment spacing based on font continuity.
92 /// Fragments with the same font use tighter spacing (single space), while
93 /// font changes use normal spacing (double space).
94 ///
95 /// **Implementation Note**: This is a simplified version of true kerning.
96 /// Full kerning with font metrics requires access to kerning pair tables,
97 /// which would require passing `font_cache` or `Document` reference.
98 /// The current implementation provides spacing improvements without
99 /// breaking API compatibility.
100 use_kerning: bool,
101 language: Option<Language>,
102}
103
104impl InvoiceExtractor {
105 /// Create a new builder for configuring the extractor
106 ///
107 /// This is the recommended way to create an `InvoiceExtractor`.
108 ///
109 /// # Examples
110 ///
111 /// ```
112 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
113 ///
114 /// let extractor = InvoiceExtractor::builder()
115 /// .with_language("es")
116 /// .confidence_threshold(0.8)
117 /// .build();
118 /// ```
119 pub fn builder() -> InvoiceExtractorBuilder {
120 InvoiceExtractorBuilder::new()
121 }
122
123 /// Extract structured invoice data from text fragments
124 ///
125 /// This is the main extraction method. It processes text fragments from a PDF page
126 /// and returns structured invoice data with confidence scores.
127 ///
128 /// # Process
129 ///
130 /// 1. Text fragments are reconstructed into full text
131 /// 2. Language-specific patterns are applied
132 /// 3. Matches are converted to typed fields
133 /// 4. Confidence scores are calculated
134 /// 5. Low-confidence fields are filtered out
135 ///
136 /// # Arguments
137 ///
138 /// * `text_fragments` - Text fragments extracted from PDF page (from `TextExtractor`)
139 ///
140 /// # Returns
141 ///
142 /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if:
143 /// - No text fragments provided
144 /// - PDF page is empty
145 /// - Text extraction failed
146 ///
147 /// # Examples
148 ///
149 /// ```ignore
150 /// use oxidize_pdf::text::extraction::{TextExtractor, ExtractionOptions};
151 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
152 /// use oxidize_pdf::Document;
153 ///
154 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
155 /// let doc = Document::open("invoice.pdf")?;
156 /// let page = doc.get_page(1)?;
157 ///
158 /// // Extract text
159 /// let text_extractor = TextExtractor::new();
160 /// let extracted = text_extractor.extract_text(&doc, page, &ExtractionOptions::default())?;
161 ///
162 /// // Extract invoice data
163 /// let extractor = InvoiceExtractor::builder()
164 /// .with_language("es")
165 /// .build();
166 ///
167 /// let invoice = extractor.extract(&extracted.fragments)?;
168 ///
169 /// // Access extracted fields
170 /// for field in &invoice.fields {
171 /// println!("{}: {:?} (confidence: {:.2})",
172 /// field.field_type.name(),
173 /// field.field_type,
174 /// field.confidence
175 /// );
176 /// }
177 /// # Ok(())
178 /// # }
179 /// ```
180 ///
181 /// # Performance
182 ///
183 /// Extraction is CPU-bound and typically completes in <100ms for standard invoices.
184 /// The extractor can be safely reused across multiple pages and threads.
185 pub fn extract(&self, text_fragments: &[TextFragment]) -> Result<InvoiceData> {
186 if text_fragments.is_empty() {
187 return Err(ExtractionError::NoTextFound(1));
188 }
189
190 // Step 1: Reconstruct full text with position tracking
191 let full_text = self.reconstruct_text(text_fragments);
192
193 // Step 2: Apply pattern matching
194 let matches = self.pattern_library.match_text(&full_text);
195
196 // Step 3: Convert matches to ExtractedField with proper types
197 let mut fields = Vec::new();
198 for (field_type, matched_value, base_confidence) in matches {
199 // Calculate confidence score with context
200 let confidence =
201 self.calculate_confidence(&field_type, base_confidence, &matched_value, &full_text);
202
203 // Skip fields below threshold
204 if confidence < self.confidence_threshold {
205 continue;
206 }
207
208 // Find position of this match in fragments
209 let position = self.find_match_position(&matched_value, text_fragments);
210
211 // Convert to proper InvoiceField with typed data
212 if let Some(invoice_field) = self.convert_to_invoice_field(field_type, &matched_value) {
213 fields.push(ExtractedField::new(
214 invoice_field,
215 confidence,
216 position,
217 matched_value,
218 ));
219 }
220 }
221
222 // Step 4: Calculate overall confidence
223 let overall_confidence = if fields.is_empty() {
224 0.0
225 } else {
226 fields.iter().map(|f| f.confidence).sum::<f64>() / fields.len() as f64
227 };
228
229 // Step 5: Create metadata
230 let metadata = InvoiceMetadata::new(1, overall_confidence)
231 .with_language(self.language.unwrap_or(Language::English));
232
233 Ok(InvoiceData::new(fields, metadata))
234 }
235
236 /// Extract invoice data from plain text (convenience method for testing)
237 ///
238 /// This is a convenience wrapper around `extract()` that creates synthetic
239 /// TextFragment objects from plain text input. Primarily useful for testing
240 /// and simple scenarios where you don't have actual PDF text fragments.
241 ///
242 /// **Note**: This method creates fragments without position information,
243 /// so proximity-based scoring may be less accurate than with real PDF fragments.
244 ///
245 /// # Arguments
246 ///
247 /// * `text` - Plain text string to extract invoice data from
248 ///
249 /// # Returns
250 ///
251 /// Returns `Ok(InvoiceData)` with extracted fields, or `Err` if text is empty
252 ///
253 /// # Examples
254 ///
255 /// ```
256 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
257 ///
258 /// let extractor = InvoiceExtractor::builder()
259 /// .with_language("en")
260 /// .confidence_threshold(0.7)
261 /// .build();
262 ///
263 /// let invoice_text = "Invoice Number: INV-001\nTotal: £100.00";
264 /// let result = extractor.extract_from_text(invoice_text)?;
265 ///
266 /// assert!(!result.fields.is_empty());
267 /// # Ok::<(), Box<dyn std::error::Error>>(())
268 /// ```
269 pub fn extract_from_text(&self, text: &str) -> Result<InvoiceData> {
270 if text.is_empty() {
271 return Err(ExtractionError::NoTextFound(1));
272 }
273
274 // Create a single synthetic TextFragment from the text
275 let fragment = TextFragment {
276 text: text.to_string(),
277 x: 0.0,
278 y: 0.0,
279 width: 0.0,
280 height: 12.0,
281 font_size: 12.0,
282 font_name: None,
283 is_bold: false,
284 is_italic: false,
285 color: None,
286 };
287
288 // Use the standard extract method
289 self.extract(&[fragment])
290 }
291
292 /// Reconstruct text from fragments
293 ///
294 /// When `use_kerning` is enabled, applies tighter spacing between fragments
295 /// that share the same font, simulating kerning-aware text reconstruction.
296 ///
297 /// **Implementation**: While full kerning requires font metrics (kerning pairs),
298 /// this simplified version adjusts inter-fragment spacing based on font continuity.
299 /// Fragments with the same font get minimal spacing (single space), while font
300 /// changes get normal spacing (double space).
301 fn reconstruct_text(&self, fragments: &[TextFragment]) -> String {
302 if fragments.is_empty() {
303 return String::new();
304 }
305
306 if !self.use_kerning {
307 // Default: join all fragments with single space
308 return fragments
309 .iter()
310 .map(|f| f.text.as_str())
311 .collect::<Vec<_>>()
312 .join(" ");
313 }
314
315 // Kerning-aware: use tighter spacing for same-font fragments
316 let mut result = String::with_capacity(
317 fragments.iter().map(|f| f.text.len()).sum::<usize>() + fragments.len(),
318 );
319
320 for (i, fragment) in fragments.iter().enumerate() {
321 result.push_str(&fragment.text);
322
323 // Add spacing between fragments
324 if i < fragments.len() - 1 {
325 let next = &fragments[i + 1];
326
327 // If both fragments have same font, use minimal spacing
328 // Otherwise use normal spacing for font transitions
329 let spacing = match (&fragment.font_name, &next.font_name) {
330 (Some(f1), Some(f2)) if f1 == f2 => " ", // Same font: tight spacing
331 _ => " ", // Different/unknown font: normal spacing
332 };
333
334 result.push_str(spacing);
335 }
336 }
337
338 result
339 }
340
341 /// Parse amount with language-aware decimal handling
342 fn parse_amount(&self, value: &str) -> Option<f64> {
343 // Determine decimal format based on language
344 let uses_european_format = matches!(
345 self.language,
346 Some(Language::Spanish) | Some(Language::German) | Some(Language::Italian)
347 );
348
349 let normalized = if uses_european_format {
350 // European format: 1.234,56 → remove dots (thousands), replace comma with dot (decimal)
351 value.replace('.', "").replace(',', ".")
352 } else {
353 // US/UK format: 1,234.56 → remove commas (thousands), dot is already decimal
354 value.replace(',', "")
355 };
356
357 normalized.parse::<f64>().ok()
358 }
359
360 /// Calculate confidence score for a match using multi-factor scoring
361 ///
362 /// Combines multiple factors to produce a final confidence score:
363 /// 1. **Base Pattern Confidence** (0.7-0.9): From pattern matching quality
364 /// 2. **Value Validation Bonus** (-0.5 to +0.2): Format and content validation
365 /// 3. **Proximity Bonus** (0.0 to +0.15): Distance from field label keywords
366 ///
367 /// # Arguments
368 ///
369 /// * `field_type` - The type of field being scored (affects which validator is applied)
370 /// * `base_confidence` - Initial confidence from pattern match quality
371 /// * `matched_value` - The extracted value (used for validation)
372 /// * `full_text` - Complete text of the invoice (used for proximity calculation)
373 ///
374 /// # Returns
375 ///
376 /// Final confidence score clamped to [0.0, 1.0]
377 ///
378 /// # Examples
379 ///
380 /// ```ignore
381 /// // Invoice date with valid format gets validation bonus
382 /// let confidence = extractor.calculate_confidence(
383 /// &InvoiceFieldType::InvoiceDate,
384 /// 0.85, // base from pattern
385 /// "20/01/2025",
386 /// full_text
387 /// );
388 /// // Result: 0.85 + 0.20 (valid date) + proximity = ~1.0
389 /// ```
390 fn calculate_confidence(
391 &self,
392 field_type: &InvoiceFieldType,
393 base_confidence: f64,
394 matched_value: &str,
395 full_text: &str,
396 ) -> f64 {
397 // Start with base confidence from pattern matching
398 let mut score = base_confidence;
399
400 // Apply value validation adjustments based on field type
401 let validation_adjustment = match field_type {
402 InvoiceFieldType::InvoiceDate | InvoiceFieldType::DueDate => {
403 validators::validate_date(matched_value)
404 }
405 InvoiceFieldType::TotalAmount
406 | InvoiceFieldType::TaxAmount
407 | InvoiceFieldType::NetAmount
408 | InvoiceFieldType::LineItemUnitPrice => validators::validate_amount(matched_value),
409 InvoiceFieldType::InvoiceNumber => validators::validate_invoice_number(matched_value),
410 InvoiceFieldType::VatNumber => validators::validate_vat_number(matched_value),
411 // No validators yet for these fields
412 InvoiceFieldType::SupplierName
413 | InvoiceFieldType::CustomerName
414 | InvoiceFieldType::Currency
415 | InvoiceFieldType::ArticleNumber
416 | InvoiceFieldType::LineItemDescription
417 | InvoiceFieldType::LineItemQuantity => 0.0,
418 };
419
420 score += validation_adjustment;
421
422 // Apply proximity bonus (closeness to field label in text)
423 let proximity_bonus = self.calculate_proximity_bonus(field_type, matched_value, full_text);
424 score += proximity_bonus;
425
426 // Clamp to valid range [0.0, 1.0]
427 score.clamp(0.0, 1.0)
428 }
429
430 /// Calculate proximity bonus based on distance from field label keywords
431 ///
432 /// Fields that appear close to their expected label keywords receive a bonus.
433 /// This helps distinguish between correct matches and ambiguous values that
434 /// happen to match the pattern but appear in the wrong context.
435 ///
436 /// # Proximity Bonus Scale
437 ///
438 /// - **+0.15**: Keyword within 20 characters of match
439 /// - **+0.10**: Keyword within 50 characters
440 /// - **+0.05**: Keyword within 100 characters
441 /// - **0.00**: Keyword beyond 100 characters or not found
442 ///
443 /// # Arguments
444 ///
445 /// * `field_type` - The type of field (determines which keywords to search for)
446 /// * `matched_value` - The extracted value
447 /// * `full_text` - Complete invoice text
448 ///
449 /// # Returns
450 ///
451 /// Proximity bonus in range [0.0, 0.15]
452 fn calculate_proximity_bonus(
453 &self,
454 field_type: &InvoiceFieldType,
455 matched_value: &str,
456 full_text: &str,
457 ) -> f64 {
458 // Define keywords for each field type (language-agnostic where possible)
459 let keywords: Vec<&str> = match field_type {
460 InvoiceFieldType::InvoiceNumber => {
461 vec![
462 "Invoice", "Factura", "Rechnung", "Fattura", "Number", "Número", "Nr",
463 ]
464 }
465 InvoiceFieldType::InvoiceDate => {
466 vec!["Date", "Fecha", "Datum", "Data", "Invoice Date"]
467 }
468 InvoiceFieldType::DueDate => {
469 vec!["Due", "Vencimiento", "Fällig", "Scadenza", "Payment"]
470 }
471 InvoiceFieldType::TotalAmount => {
472 vec![
473 "Total",
474 "Grand Total",
475 "Amount Due",
476 "Gesamtbetrag",
477 "Totale",
478 ]
479 }
480 InvoiceFieldType::TaxAmount => {
481 vec!["VAT", "IVA", "MwSt", "Tax", "Impuesto"]
482 }
483 InvoiceFieldType::NetAmount => {
484 vec![
485 "Subtotal",
486 "Net",
487 "Neto",
488 "Nettobetrag",
489 "Imponibile",
490 "Base",
491 ]
492 }
493 InvoiceFieldType::VatNumber => {
494 vec!["VAT", "CIF", "NIF", "USt", "Partita IVA", "Tax ID"]
495 }
496 InvoiceFieldType::CustomerName => {
497 vec!["Bill to", "Customer", "Client", "Cliente"]
498 }
499 InvoiceFieldType::SupplierName => {
500 vec!["From", "Supplier", "Vendor", "Proveedor"]
501 }
502 _ => return 0.0, // No proximity bonus for other fields
503 };
504
505 // Find the matched value position in full text
506 let match_pos = match full_text.find(matched_value) {
507 Some(pos) => pos,
508 None => return 0.0, // Value not found in text (shouldn't happen)
509 };
510
511 // Find the closest keyword and calculate distance
512 let mut min_distance = usize::MAX;
513 for keyword in keywords {
514 // Case-insensitive search
515 let text_lower = full_text.to_lowercase();
516 let keyword_lower = keyword.to_lowercase();
517
518 if let Some(keyword_pos) = text_lower.find(&keyword_lower) {
519 let distance = if keyword_pos < match_pos {
520 match_pos - keyword_pos
521 } else {
522 keyword_pos - match_pos
523 };
524
525 min_distance = min_distance.min(distance);
526 }
527 }
528
529 // Award bonus based on proximity (distance in characters)
530 match min_distance {
531 0..=20 => 0.15, // Very close (same line, adjacent)
532 21..=50 => 0.10, // Close (nearby in layout)
533 51..=100 => 0.05, // Moderately close
534 _ => 0.0, // Too far or not found
535 }
536 }
537
538 /// Find the bounding box of a matched value in the fragments
539 fn find_match_position(&self, matched_value: &str, fragments: &[TextFragment]) -> BoundingBox {
540 // Simple approach: find first fragment containing the value
541 for fragment in fragments {
542 if fragment.text.contains(matched_value) {
543 return BoundingBox::new(fragment.x, fragment.y, fragment.width, fragment.height);
544 }
545 }
546
547 // Fallback: use first fragment's position
548 if let Some(first) = fragments.first() {
549 BoundingBox::new(first.x, first.y, first.width, first.height)
550 } else {
551 BoundingBox::new(0.0, 0.0, 0.0, 0.0)
552 }
553 }
554
555 /// Convert field type and string value to typed InvoiceField
556 fn convert_to_invoice_field(
557 &self,
558 field_type: InvoiceFieldType,
559 value: &str,
560 ) -> Option<InvoiceField> {
561 match field_type {
562 InvoiceFieldType::InvoiceNumber => Some(InvoiceField::InvoiceNumber(value.to_string())),
563 InvoiceFieldType::InvoiceDate => Some(InvoiceField::InvoiceDate(value.to_string())),
564 InvoiceFieldType::DueDate => Some(InvoiceField::DueDate(value.to_string())),
565 InvoiceFieldType::TotalAmount => {
566 self.parse_amount(value).map(InvoiceField::TotalAmount)
567 }
568 InvoiceFieldType::TaxAmount => self.parse_amount(value).map(InvoiceField::TaxAmount),
569 InvoiceFieldType::NetAmount => self.parse_amount(value).map(InvoiceField::NetAmount),
570 InvoiceFieldType::VatNumber => Some(InvoiceField::VatNumber(value.to_string())),
571 InvoiceFieldType::SupplierName => Some(InvoiceField::SupplierName(value.to_string())),
572 InvoiceFieldType::CustomerName => Some(InvoiceField::CustomerName(value.to_string())),
573 InvoiceFieldType::Currency => Some(InvoiceField::Currency(value.to_string())),
574 InvoiceFieldType::ArticleNumber => Some(InvoiceField::ArticleNumber(value.to_string())),
575 InvoiceFieldType::LineItemDescription => {
576 Some(InvoiceField::LineItemDescription(value.to_string()))
577 }
578 InvoiceFieldType::LineItemQuantity => {
579 self.parse_amount(value).map(InvoiceField::LineItemQuantity)
580 }
581 InvoiceFieldType::LineItemUnitPrice => self
582 .parse_amount(value)
583 .map(InvoiceField::LineItemUnitPrice),
584 }
585 }
586}
587
588/// Builder for configuring `InvoiceExtractor`
589///
590/// Provides a fluent API for configuring extraction behavior. All settings
591/// have sensible defaults for immediate use.
592///
593/// # Defaults
594///
595/// - **Language**: None (uses default patterns)
596/// - **Confidence Threshold**: 0.7 (70%)
597/// - **Use Kerning**: true (stored but not yet functional - see `use_kerning()` docs)
598///
599/// # Examples
600///
601/// ```
602/// use oxidize_pdf::text::invoice::InvoiceExtractor;
603///
604/// // Minimal configuration
605/// let extractor = InvoiceExtractor::builder()
606/// .with_language("es")
607/// .build();
608///
609/// // Full configuration
610/// let extractor = InvoiceExtractor::builder()
611/// .with_language("de")
612/// .confidence_threshold(0.85)
613/// .use_kerning(false)
614/// .build();
615/// ```
616pub struct InvoiceExtractorBuilder {
617 language: Option<Language>,
618 confidence_threshold: f64,
619 use_kerning: bool,
620 custom_patterns: Option<PatternLibrary>,
621}
622
623impl InvoiceExtractorBuilder {
624 /// Create a new builder with default settings
625 ///
626 /// Defaults:
627 /// - No language (uses English patterns)
628 /// - Confidence threshold: 0.7
629 /// - Kerning: enabled
630 pub fn new() -> Self {
631 Self {
632 language: None,
633 confidence_threshold: 0.7,
634 use_kerning: true,
635 custom_patterns: None,
636 }
637 }
638
639 /// Set the language for pattern matching
640 ///
641 /// Accepts language codes: "es", "en", "de", "it"
642 ///
643 /// # Examples
644 ///
645 /// ```
646 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
647 ///
648 /// let extractor = InvoiceExtractor::builder()
649 /// .with_language("es") // Spanish patterns
650 /// .build();
651 /// ```
652 pub fn with_language(mut self, lang: &str) -> Self {
653 self.language = Language::from_code(lang);
654 self
655 }
656
657 /// Set the minimum confidence threshold (0.0 to 1.0)
658 ///
659 /// Fields below this threshold are filtered out. Higher values reduce
660 /// false positives but may miss valid fields.
661 ///
662 /// Recommended values:
663 /// - **0.5**: Maximum recall (may include false positives)
664 /// - **0.7**: Balanced (default)
665 /// - **0.9**: Maximum precision (may miss valid fields)
666 ///
667 /// # Examples
668 ///
669 /// ```
670 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
671 ///
672 /// // High precision mode
673 /// let extractor = InvoiceExtractor::builder()
674 /// .confidence_threshold(0.9)
675 /// .build();
676 /// ```
677 ///
678 /// # Validation
679 ///
680 /// The threshold is automatically clamped to the valid range [0.0, 1.0].
681 /// Values outside this range are silently adjusted to the nearest valid value.
682 pub fn confidence_threshold(mut self, threshold: f64) -> Self {
683 self.confidence_threshold = threshold.clamp(0.0, 1.0);
684 self
685 }
686
687 /// Enable or disable kerning-aware text positioning (PLANNED for v2.0)
688 ///
689 /// **Current Behavior**: This flag is stored but NOT yet used in extraction logic.
690 ///
691 /// **Planned Feature** (v2.0): When enabled, text reconstruction will use actual
692 /// font kerning pairs to calculate accurate character spacing, improving pattern
693 /// matching for invoices with tight kerning (e.g., "AV", "To").
694 ///
695 /// **Why Not Implemented**: Requires architectural changes to expose font metadata
696 /// in `TextFragment`. See struct documentation for technical details.
697 ///
698 /// # Examples
699 ///
700 /// ```
701 /// use oxidize_pdf::text::invoice::InvoiceExtractor;
702 ///
703 /// // Enable for future use (no effect in v1.x)
704 /// let extractor = InvoiceExtractor::builder()
705 /// .use_kerning(true) // ⚠️ Stored but not yet functional
706 /// .build();
707 /// ```
708 pub fn use_kerning(mut self, enabled: bool) -> Self {
709 self.use_kerning = enabled;
710 self
711 }
712
713 /// Use a custom pattern library instead of language-based defaults
714 ///
715 /// Allows complete control over invoice pattern matching by providing a
716 /// custom `PatternLibrary`. Useful for specialized invoice formats or
717 /// combining default patterns with custom additions.
718 ///
719 /// **Note**: When using custom patterns, the `with_language()` setting is ignored.
720 ///
721 /// # Examples
722 ///
723 /// **Example 1: Use default patterns and add custom ones**
724 /// ```
725 /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
726 ///
727 /// // Start with Spanish defaults
728 /// let mut patterns = PatternLibrary::default_spanish();
729 ///
730 /// // Add custom pattern for your specific invoice format
731 /// patterns.add_pattern(
732 /// FieldPattern::new(
733 /// InvoiceFieldType::InvoiceNumber,
734 /// r"Ref:\s*([A-Z0-9\-]+)", // Your custom format
735 /// 0.85,
736 /// Some(Language::Spanish)
737 /// ).unwrap()
738 /// );
739 ///
740 /// let extractor = InvoiceExtractor::builder()
741 /// .with_custom_patterns(patterns)
742 /// .build();
743 /// ```
744 ///
745 /// **Example 2: Build completely custom pattern library**
746 /// ```
747 /// use oxidize_pdf::text::invoice::{InvoiceExtractor, PatternLibrary, FieldPattern, InvoiceFieldType, Language};
748 ///
749 /// let mut patterns = PatternLibrary::new();
750 ///
751 /// // Add only the patterns you need
752 /// patterns.add_pattern(
753 /// FieldPattern::new(
754 /// InvoiceFieldType::InvoiceNumber,
755 /// r"Order\s+#([0-9]+)",
756 /// 0.9,
757 /// None // Language-agnostic
758 /// ).unwrap()
759 /// );
760 ///
761 /// let extractor = InvoiceExtractor::builder()
762 /// .with_custom_patterns(patterns)
763 /// .confidence_threshold(0.8)
764 /// .build();
765 /// ```
766 pub fn with_custom_patterns(mut self, patterns: PatternLibrary) -> Self {
767 self.custom_patterns = Some(patterns);
768 self
769 }
770
771 /// Build the InvoiceExtractor
772 pub fn build(self) -> InvoiceExtractor {
773 // Use custom patterns if provided, otherwise create from language
774 let pattern_library = if let Some(custom) = self.custom_patterns {
775 custom
776 } else if let Some(lang) = self.language {
777 PatternLibrary::with_language(lang)
778 } else {
779 PatternLibrary::new()
780 };
781
782 InvoiceExtractor {
783 pattern_library,
784 confidence_threshold: self.confidence_threshold,
785 use_kerning: self.use_kerning,
786 language: self.language,
787 }
788 }
789}
790
791impl Default for InvoiceExtractorBuilder {
792 fn default() -> Self {
793 Self::new()
794 }
795}
796
#[cfg(test)]
mod tests {
    use super::*;

    /// A builder with no setters applied yields the documented defaults.
    #[test]
    fn test_builder_defaults() {
        let built = InvoiceExtractorBuilder::new().build();

        assert_eq!(built.confidence_threshold, 0.7);
        assert!(built.use_kerning);
        assert!(built.language.is_none());
    }

    /// The "es" language code resolves to Spanish.
    #[test]
    fn test_builder_with_language() {
        let built = InvoiceExtractor::builder().with_language("es").build();

        assert_eq!(built.language, Some(Language::Spanish));
    }

    /// An in-range threshold is stored unchanged.
    #[test]
    fn test_builder_confidence_threshold() {
        let built = InvoiceExtractor::builder()
            .confidence_threshold(0.9)
            .build();

        assert_eq!(built.confidence_threshold, 0.9);
    }

    /// Kerning can be switched off through the builder.
    #[test]
    fn test_builder_use_kerning() {
        let built = InvoiceExtractor::builder().use_kerning(false).build();

        assert!(!built.use_kerning);
    }

    /// The kerning flag round-trips through the builder for both explicit
    /// values and for the default.
    #[test]
    fn test_use_kerning_stored_for_future_use() {
        let explicitly_on = InvoiceExtractor::builder().use_kerning(true).build();
        let explicitly_off = InvoiceExtractor::builder().use_kerning(false).build();
        let untouched = InvoiceExtractor::builder().build();

        assert!(
            explicitly_on.use_kerning,
            "use_kerning should be stored as true"
        );
        assert!(
            !explicitly_off.use_kerning,
            "use_kerning should be stored as false"
        );
        assert!(
            untouched.use_kerning,
            "use_kerning should default to true"
        );
    }
}