Skip to main content

converge_provider/
ocr.rs

// Copyright 2024-2025 Aprio One AB, Sweden
// Author: Kenneth Pernyer, kenneth@aprio.one
// SPDX-License-Identifier: MIT
// See LICENSE file in the project root for full license information.

//! OCR / Document AI providers.
//!
//! This module provides integration with OCR models for document understanding,
//! text extraction, and structured content parsing from PDFs, scans, and images.
//!
//! # Available Providers
//!
//! - [`MistralOcrProvider`] - Mistral OCR 3 (GDPR-compliant, EU)
//! - [`DeepSeekOcrProvider`] - DeepSeek OCR 2 (Visual Causal Flow)
//! - [`LightOnOcrProvider`] - LightOnOCR-2-1B (Efficient, open-source)
//!
//! # Example
//!
//! ```ignore
//! use converge_provider::ocr::{OcrProvider, MistralOcrProvider, OcrRequest};
//!
//! let provider = MistralOcrProvider::from_env()?;
//! let request = OcrRequest::from_pdf_bytes(pdf_bytes);
//! let result = provider.extract(&request)?;
//!
//! println!("Extracted text: {}", result.text);
//! for table in result.tables {
//!     println!("Table: {:?}", table);
//! }
//! ```
32use serde::{Deserialize, Serialize};
33
/// Error type for OCR operations.
///
/// Provider implementations map transport, authentication, and parsing
/// failures into these variants so callers can match on failure class.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// Network/HTTP transport error (request failed to send or read).
    #[error("Network error: {0}")]
    Network(String),

    /// API authentication error (missing/invalid API key; HTTP 401/403).
    #[error("Authentication error: {0}")]
    Auth(String),

    /// Rate limit exceeded (HTTP 429).
    #[error("Rate limit exceeded: {0}")]
    RateLimit(String),

    /// API response parsing error (unexpected JSON shape, invalid base64).
    #[error("Parse error: {0}")]
    Parse(String),

    /// Invalid input (unsupported format, etc.).
    #[error("Invalid input: {0}")]
    InvalidInput(String),

    /// General API error (any other non-success HTTP status).
    #[error("API error: {0}")]
    Api(String),
}
61
/// Input type for OCR processing.
#[derive(Debug, Clone)]
pub enum OcrInput {
    /// PDF document as raw bytes.
    PdfBytes(Vec<u8>),
    /// Image as raw bytes (PNG, JPEG, etc.).
    ImageBytes(Vec<u8>),
    /// URL to a document or image; either fetched by the provider or passed
    /// through to the remote API, depending on the implementation.
    Url(String),
    /// Base64-encoded document or image. Providers prepend the data-URL
    /// prefix themselves, so this should be bare base64 without one.
    Base64(String),
}
74
/// OCR extraction request.
///
/// Construct via [`OcrRequest::from_pdf_bytes`], [`OcrRequest::from_image_bytes`],
/// or [`OcrRequest::from_url`], then adjust options with the `with_*` builders.
#[derive(Debug, Clone)]
pub struct OcrRequest {
    /// Input document or image.
    pub input: OcrInput,
    /// Output format preference (constructors default to markdown).
    pub output_format: OcrOutputFormat,
    /// Language hints (ISO 639-1 codes).
    pub languages: Vec<String>,
    /// Whether to extract tables (constructors default to `true`).
    pub extract_tables: bool,
    /// Whether to extract images/figures (constructors default to `false`).
    pub extract_images: bool,
    /// Page range (for multi-page documents).
    /// NOTE(review): current providers do not forward this to their APIs.
    pub page_range: Option<(usize, usize)>,
}
91
92impl OcrRequest {
93    /// Creates a request from PDF bytes.
94    #[must_use]
95    pub fn from_pdf_bytes(bytes: Vec<u8>) -> Self {
96        Self {
97            input: OcrInput::PdfBytes(bytes),
98            output_format: OcrOutputFormat::Markdown,
99            languages: vec![],
100            extract_tables: true,
101            extract_images: false,
102            page_range: None,
103        }
104    }
105
106    /// Creates a request from image bytes.
107    #[must_use]
108    pub fn from_image_bytes(bytes: Vec<u8>) -> Self {
109        Self {
110            input: OcrInput::ImageBytes(bytes),
111            output_format: OcrOutputFormat::Markdown,
112            languages: vec![],
113            extract_tables: true,
114            extract_images: false,
115            page_range: None,
116        }
117    }
118
119    /// Creates a request from a URL.
120    #[must_use]
121    pub fn from_url(url: impl Into<String>) -> Self {
122        Self {
123            input: OcrInput::Url(url.into()),
124            output_format: OcrOutputFormat::Markdown,
125            languages: vec![],
126            extract_tables: true,
127            extract_images: false,
128            page_range: None,
129        }
130    }
131
132    /// Sets the output format.
133    #[must_use]
134    pub fn with_output_format(mut self, format: OcrOutputFormat) -> Self {
135        self.output_format = format;
136        self
137    }
138
139    /// Adds language hints.
140    #[must_use]
141    pub fn with_languages(mut self, languages: Vec<String>) -> Self {
142        self.languages = languages;
143        self
144    }
145
146    /// Sets whether to extract tables.
147    #[must_use]
148    pub fn with_extract_tables(mut self, extract: bool) -> Self {
149        self.extract_tables = extract;
150        self
151    }
152
153    /// Sets whether to extract images.
154    #[must_use]
155    pub fn with_extract_images(mut self, extract: bool) -> Self {
156        self.extract_images = extract;
157        self
158    }
159
160    /// Sets the page range for multi-page documents.
161    #[must_use]
162    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
163        self.page_range = Some((start, end));
164        self
165    }
166}
167
/// Output format for OCR results.
///
/// NOTE(review): the current provider implementations do not forward this
/// preference to their APIs (they always produce markdown-ish text);
/// confirm whether it should be wired through.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OcrOutputFormat {
    /// Plain text.
    Text,
    /// Markdown with structure preserved (the default).
    #[default]
    Markdown,
    /// HTML with table reconstruction.
    Html,
    /// JSON with structured data.
    Json,
}
181
/// A detected table in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrTable {
    /// Page number (0-indexed).
    pub page: usize,
    /// Table content as HTML or markdown (provider-dependent).
    pub content: String,
    /// Bounding box (x, y, width, height) if the provider reports one.
    pub bbox: Option<(f64, f64, f64, f64)>,
}
192
/// A detected image/figure in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrImage {
    /// Page number (0-indexed).
    pub page: usize,
    /// Image description or alt text, when the provider supplies one.
    pub description: Option<String>,
    /// Bounding box (x, y, width, height).
    pub bbox: Option<(f64, f64, f64, f64)>,
    /// Base64-encoded image data (if extraction was requested and available).
    pub data: Option<String>,
}
205
/// Provenance information for OCR results.
///
/// Captures everything needed for reproducibility and tracing:
/// - Tool version and configuration
/// - Input/output hashes for trace links
/// - Preprocessing parameters applied
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrProvenance {
    /// Provider/tool name (e.g., "tesseract", "mistral-ocr", "deepseek-ocr").
    pub provider: String,
    /// Model or engine version (e.g., "5.3.0", "mistral-ocr-2512").
    pub version: String,
    /// Language pack(s) used (e.g., ["eng", "deu"]).
    pub languages: Vec<String>,
    /// Preprocessing parameters applied before recognition.
    pub preprocessing: OcrPreprocessing,
    /// SHA-256 hash of input bytes (for trace links); `None` if not computed.
    pub input_hash: Option<String>,
    /// SHA-256 hash of output text (for trace links); `None` if not computed.
    pub output_hash: Option<String>,
    /// Additional metadata (tool-specific); empty map when absent in JSON.
    #[serde(default)]
    pub metadata: std::collections::HashMap<String, String>,
}
230
/// Preprocessing parameters applied before OCR.
///
/// Recorded for provenance; all fields are optional/off by default.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrPreprocessing {
    /// DPI used for rendering (for PDFs).
    pub dpi: Option<u32>,
    /// Whether binarization was applied.
    pub binarized: bool,
    /// Whether deskewing was applied.
    pub deskewed: bool,
    /// Whether noise removal was applied.
    pub denoised: bool,
    /// Page segmentation mode (Tesseract-specific).
    pub psm: Option<u32>,
    /// OCR engine mode (Tesseract-specific).
    pub oem: Option<u32>,
}
247
/// Confidence summary for OCR results.
///
/// Aggregated from per-word confidence scores where the engine provides them.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrConfidence {
    /// Overall mean confidence (0.0-1.0).
    pub mean: f64,
    /// Minimum word confidence.
    pub min: f64,
    /// Maximum word confidence.
    pub max: f64,
    /// Standard deviation of confidence scores, when computed.
    pub std_dev: Option<f64>,
    /// Number of words with confidence below `threshold`.
    pub low_confidence_words: usize,
    /// Threshold used for low confidence (default 0.6).
    pub threshold: f64,
}
264
/// A word or text span with position and confidence.
///
/// For Tesseract, this comes from TSV or hOCR output.
/// Useful for validation: you can check where each word came from
/// and flag low-confidence regions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrSpan {
    /// The text content of this span.
    pub text: String,
    /// Confidence score (0.0-1.0).
    pub confidence: f64,
    /// Page number (0-indexed).
    pub page: usize,
    /// Bounding box: (x, y, width, height) in pixels.
    pub bbox: Option<(i32, i32, i32, i32)>,
    /// Block number (page segmentation unit).
    pub block_num: Option<i32>,
    /// Paragraph number within block.
    pub par_num: Option<i32>,
    /// Line number within paragraph.
    pub line_num: Option<i32>,
    /// Word number within line.
    pub word_num: Option<i32>,
}
289
290impl OcrSpan {
291    /// Creates a new span with text and confidence.
292    #[must_use]
293    pub fn new(text: impl Into<String>, confidence: f64) -> Self {
294        Self {
295            text: text.into(),
296            confidence,
297            page: 0,
298            bbox: None,
299            block_num: None,
300            par_num: None,
301            line_num: None,
302            word_num: None,
303        }
304    }
305
306    /// Sets the bounding box.
307    #[must_use]
308    pub fn with_bbox(mut self, x: i32, y: i32, w: i32, h: i32) -> Self {
309        self.bbox = Some((x, y, w, h));
310        self
311    }
312
313    /// Sets the page number.
314    #[must_use]
315    pub fn with_page(mut self, page: usize) -> Self {
316        self.page = page;
317        self
318    }
319
320    /// Checks if this span has low confidence (below threshold).
321    #[must_use]
322    pub fn is_low_confidence(&self, threshold: f64) -> bool {
323        self.confidence < threshold
324    }
325}
326
/// Tesseract-specific output format.
///
/// Controls what kind of output to request from Tesseract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TesseractOutputFormat {
    /// Plain text (default).
    #[default]
    Text,
    /// TSV with word-level confidence and bounding boxes.
    /// Columns: level, page_num, block_num, par_num, line_num, word_num,
    ///          left, top, width, height, conf, text
    Tsv,
    /// hOCR HTML format with bounding boxes.
    /// Useful for downstream table/layout analysis.
    Hocr,
    /// ALTO XML format (common in libraries/archives).
    Alto,
}
345
/// OCR extraction result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrResult {
    /// Extracted text content (concatenated across pages).
    pub text: String,
    /// Number of pages processed.
    pub pages: usize,
    /// Word/text spans with positions and confidence.
    /// Populated when using TSV or hOCR output format (Tesseract);
    /// the hosted-API providers leave this empty.
    /// Useful for validation: check where each word came from.
    #[serde(default)]
    pub spans: Vec<OcrSpan>,
    /// Detected tables.
    pub tables: Vec<OcrTable>,
    /// Detected images/figures.
    pub images: Vec<OcrImage>,
    /// Confidence summary (per-word statistics), when the engine provides one.
    pub confidence: Option<OcrConfidence>,
    /// Processing time in milliseconds, when measured.
    pub processing_time_ms: Option<u64>,
    /// Provenance for reproducibility and tracing.
    pub provenance: OcrProvenance,
}
369
/// Trait for OCR providers.
///
/// Implementations are synchronous (blocking) and must be `Send + Sync`
/// so a provider can be shared across threads.
pub trait OcrProvider: Send + Sync {
    /// Returns the provider name (a stable identifier, e.g. "mistral-ocr").
    fn name(&self) -> &'static str;

    /// Returns the model being used.
    fn model(&self) -> &str;

    /// Extracts text and structure from a document.
    ///
    /// # Errors
    ///
    /// Returns an [`OcrError`] if extraction fails (network, authentication,
    /// rate limiting, parsing, or invalid input).
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError>;
}
385
// =============================================================================
// Mistral OCR Provider
// =============================================================================

/// Mistral OCR 3 provider.
///
/// Mistral OCR 3 is designed for document AI at scale, handling forms, invoices,
/// complex tables, handwriting, and low-quality scans. It outputs structured
/// text/HTML suitable for RAG and agent workflows.
///
/// # Features
/// - 74% win rate over OCR 2 on forms, handwriting, tables
/// - Markdown output with HTML table reconstruction
/// - GDPR-compliant (France)
/// - $2 per 1000 pages ($1 with batch API)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{MistralOcrProvider, OcrRequest};
///
/// let provider = MistralOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct MistralOcrProvider {
    /// Bearer token for the Mistral API.
    api_key: String,
    /// Model identifier (e.g. "mistral-ocr-latest").
    model: String,
    /// API base URL; defaults to "https://api.mistral.ai/v1".
    base_url: String,
    /// Reused blocking HTTP client.
    client: reqwest::blocking::Client,
}
416
417impl MistralOcrProvider {
418    /// Creates a new Mistral OCR provider.
419    #[must_use]
420    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
421        Self {
422            api_key: api_key.into(),
423            model: model.into(),
424            base_url: "https://api.mistral.ai/v1".to_string(),
425            client: reqwest::blocking::Client::new(),
426        }
427    }
428
429    /// Creates a provider using the `MISTRAL_API_KEY` environment variable.
430    ///
431    /// Uses `mistral-ocr-latest` as the default model.
432    ///
433    /// # Errors
434    ///
435    /// Returns error if the environment variable is not set.
436    pub fn from_env() -> Result<Self, OcrError> {
437        let api_key = std::env::var("MISTRAL_API_KEY")
438            .map_err(|_| OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string()))?;
439        Ok(Self::new(api_key, "mistral-ocr-latest"))
440    }
441
442    /// Creates a provider with a specific model.
443    ///
444    /// # Errors
445    ///
446    /// Returns error if the environment variable is not set.
447    pub fn from_env_with_model(model: impl Into<String>) -> Result<Self, OcrError> {
448        let api_key = std::env::var("MISTRAL_API_KEY")
449            .map_err(|_| OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string()))?;
450        Ok(Self::new(api_key, model))
451    }
452
453    /// Uses a custom base URL.
454    #[must_use]
455    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
456        self.base_url = url.into();
457        self
458    }
459}
460
461impl OcrProvider for MistralOcrProvider {
462    fn name(&self) -> &'static str {
463        "mistral-ocr"
464    }
465
466    fn model(&self) -> &str {
467        &self.model
468    }
469
470    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
471        // Build the request body based on input type
472        let document = match &request.input {
473            OcrInput::PdfBytes(bytes) => {
474                serde_json::json!({
475                    "type": "document_url",
476                    "document_url": format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
477                })
478            }
479            OcrInput::ImageBytes(bytes) => {
480                serde_json::json!({
481                    "type": "image_url",
482                    "image_url": format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
483                })
484            }
485            OcrInput::Url(url) => {
486                if url.ends_with(".pdf") {
487                    serde_json::json!({
488                        "type": "document_url",
489                        "document_url": url
490                    })
491                } else {
492                    serde_json::json!({
493                        "type": "image_url",
494                        "image_url": url
495                    })
496                }
497            }
498            OcrInput::Base64(data) => {
499                serde_json::json!({
500                    "type": "document_url",
501                    "document_url": format!("data:application/pdf;base64,{}", data)
502                })
503            }
504        };
505
506        let body = serde_json::json!({
507            "model": self.model,
508            "document": document,
509            "include_image_base64": request.extract_images
510        });
511
512        let response = self
513            .client
514            .post(format!("{}/ocr", self.base_url))
515            .header("Authorization", format!("Bearer {}", self.api_key))
516            .header("Content-Type", "application/json")
517            .json(&body)
518            .send()
519            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
520
521        let status = response.status();
522        if !status.is_success() {
523            let error_text = response.text().unwrap_or_default();
524            return match status.as_u16() {
525                401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
526                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
527                _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
528            };
529        }
530
531        let api_response: MistralOcrResponse = response
532            .json()
533            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;
534
535        // Convert to our result format
536        let mut tables = vec![];
537        let mut images = vec![];
538        let mut text = String::new();
539
540        for (page_idx, page) in api_response.pages.iter().enumerate() {
541            text.push_str(&page.markdown);
542            text.push_str("\n\n");
543
544            // Extract tables from markdown (simplified)
545            // In practice, Mistral returns tables as HTML within markdown
546            if page.markdown.contains("<table") {
547                tables.push(OcrTable {
548                    page: page_idx,
549                    content: page.markdown.clone(),
550                    bbox: None,
551                });
552            }
553
554            // Extract images if present
555            for img in &page.images {
556                images.push(OcrImage {
557                    page: page_idx,
558                    description: None,
559                    bbox: None,
560                    data: img.image_base64.clone(),
561                });
562            }
563        }
564
565        Ok(OcrResult {
566            text: text.trim().to_string(),
567            pages: api_response.pages.len(),
568            spans: vec![], // Mistral OCR doesn't provide word-level spans
569            tables,
570            images,
571            confidence: None,
572            processing_time_ms: None,
573            provenance: OcrProvenance {
574                provider: "mistral-ocr".to_string(),
575                version: self.model.clone(),
576                languages: request.languages.clone(),
577                preprocessing: OcrPreprocessing::default(),
578                input_hash: None, // TODO: compute from input
579                output_hash: None, // TODO: compute from output
580                metadata: std::collections::HashMap::new(),
581            },
582        })
583    }
584}
585
/// Top-level response from the Mistral `/ocr` endpoint.
#[derive(Debug, Deserialize)]
struct MistralOcrResponse {
    /// One entry per processed page.
    pages: Vec<MistralOcrPage>,
}

/// A single page of Mistral OCR output.
#[derive(Debug, Deserialize)]
struct MistralOcrPage {
    /// Page content as markdown (tables may be embedded as HTML).
    markdown: String,
    /// Images detected on the page; empty when absent in the JSON.
    #[serde(default)]
    images: Vec<MistralOcrImage>,
}

/// An image entry within a Mistral OCR page.
#[derive(Debug, Deserialize)]
struct MistralOcrImage {
    /// Base64 payload; populated when `include_image_base64` was requested.
    #[serde(default)]
    image_base64: Option<String>,
}
603
// =============================================================================
// DeepSeek OCR Provider
// =============================================================================

/// DeepSeek OCR 2 provider.
///
/// DeepSeek OCR 2 is a 3B-parameter vision-language model with the DeepEncoder V2
/// architecture featuring Visual Causal Flow for human-like reading order.
///
/// # Features
/// - SOTA on document understanding benchmarks
/// - Human-like visual reading order
/// - Semantic visual reasoning
/// - 16x token compression
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{DeepSeekOcrProvider, OcrRequest};
///
/// let provider = DeepSeekOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_image_bytes(image_bytes))?;
/// ```
pub struct DeepSeekOcrProvider {
    /// Bearer token for the DeepSeek API.
    api_key: String,
    /// Model identifier (e.g. "deepseek-ocr-2").
    model: String,
    /// API base URL; defaults to "https://api.deepseek.com/v1".
    base_url: String,
    /// Reused blocking HTTP client.
    client: reqwest::blocking::Client,
}
633
634impl DeepSeekOcrProvider {
635    /// Creates a new DeepSeek OCR provider.
636    #[must_use]
637    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
638        Self {
639            api_key: api_key.into(),
640            model: model.into(),
641            base_url: "https://api.deepseek.com/v1".to_string(),
642            client: reqwest::blocking::Client::new(),
643        }
644    }
645
646    /// Creates a provider using the `DEEPSEEK_API_KEY` environment variable.
647    ///
648    /// Uses `deepseek-ocr-2` as the default model.
649    ///
650    /// # Errors
651    ///
652    /// Returns error if the environment variable is not set.
653    pub fn from_env() -> Result<Self, OcrError> {
654        let api_key = std::env::var("DEEPSEEK_API_KEY")
655            .map_err(|_| OcrError::Auth("DEEPSEEK_API_KEY environment variable not set".to_string()))?;
656        Ok(Self::new(api_key, "deepseek-ocr-2"))
657    }
658
659    /// Uses a custom base URL.
660    #[must_use]
661    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
662        self.base_url = url.into();
663        self
664    }
665}
666
impl OcrProvider for DeepSeekOcrProvider {
    fn name(&self) -> &'static str {
        "deepseek-ocr"
    }

    fn model(&self) -> &str {
        &self.model
    }

    /// Extracts text by sending the input as a vision chat-completion request
    /// and returning the first choice's message content as the extracted text.
    ///
    /// NOTE(review): `output_format`, `languages`, `extract_tables`,
    /// `extract_images`, and `page_range` are not forwarded to the API; the
    /// prompt always requests markdown. Confirm whether that is intended.
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
        // DeepSeek OCR uses a chat-like API with vision capabilities;
        // the document travels as an image_url content part.
        let image_content = match &request.input {
            OcrInput::ImageBytes(bytes) => {
                format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
            }
            OcrInput::PdfBytes(bytes) => {
                // DeepSeek OCR expects images; for PDF, we'd need to convert pages.
                // For now, treat as base64 document.
                format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
            }
            // URLs are passed through untouched for the API to fetch.
            OcrInput::Url(url) => url.clone(),
            // Raw base64 is assumed to be PNG image data — TODO confirm.
            OcrInput::Base64(data) => format!("data:image/png;base64,{data}"),
        };

        let body = serde_json::json!({
            "model": self.model,
            "messages": [{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_content
                        }
                    },
                    {
                        "type": "text",
                        "text": "Extract all text from this document, preserving structure, tables, and reading order. Output in markdown format."
                    }
                ]
            }],
            "max_tokens": 8192
        });

        let response = self
            .client
            .post(format!("{}/chat/completions", self.base_url))
            .header("Authorization", format!("Bearer {}", self.api_key))
            .header("Content-Type", "application/json")
            .json(&body)
            .send()
            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;

        // Map non-success statuses onto the error taxonomy.
        let status = response.status();
        if !status.is_success() {
            let error_text = response.text().unwrap_or_default();
            return match status.as_u16() {
                401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
                _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
            };
        }

        let api_response: DeepSeekOcrResponse = response
            .json()
            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;

        // Take the first choice's content; empty string when absent.
        let text = api_response
            .choices
            .first()
            .and_then(|c| c.message.content.clone())
            .unwrap_or_default();

        Ok(OcrResult {
            text,
            pages: 1, // DeepSeek processes one image at a time
            spans: vec![], // DeepSeek OCR doesn't provide word-level spans
            tables: vec![],
            images: vec![],
            confidence: None,
            processing_time_ms: None,
            provenance: OcrProvenance {
                provider: "deepseek-ocr".to_string(),
                version: self.model.clone(),
                languages: request.languages.clone(),
                preprocessing: OcrPreprocessing::default(),
                input_hash: None,
                output_hash: None,
                metadata: std::collections::HashMap::new(),
            },
        })
    }
}
760
/// Chat-completions style response from the DeepSeek API.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrResponse {
    /// Completion choices; only the first is used.
    choices: Vec<DeepSeekOcrChoice>,
}

/// A single completion choice.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrChoice {
    message: DeepSeekOcrMessage,
}

/// The assistant message carrying the transcription.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrMessage {
    /// Extracted text; `None` when the model returned no content.
    content: Option<String>,
}
775
// =============================================================================
// LightOn OCR Provider
// =============================================================================

/// LightOnOCR-2-1B provider.
///
/// LightOnOCR-2 is an efficient 1B-parameter vision-language model that achieves
/// SOTA on OlmOCR-Bench while being 9x smaller than competitors.
///
/// # Features
/// - 1B parameters, 9x smaller than competitors
/// - 5.71 pages/s on H100 (~493k pages/day)
/// - <$0.01 per 1000 pages
/// - Apache 2.0 license, open weights
/// - GDPR-compliant (France)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{LightOnOcrProvider, OcrRequest};
///
/// let provider = LightOnOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct LightOnOcrProvider {
    /// Bearer token for the HuggingFace Inference API.
    api_key: String,
    /// Model repo id (e.g. "lightonai/LightOnOCR-2-1B").
    model: String,
    /// API base URL; defaults to "https://api-inference.huggingface.co/models".
    base_url: String,
    /// Reused blocking HTTP client.
    client: reqwest::blocking::Client,
}
806
807impl LightOnOcrProvider {
808    /// Creates a new LightOn OCR provider.
809    #[must_use]
810    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
811        Self {
812            api_key: api_key.into(),
813            model: model.into(),
814            base_url: "https://api-inference.huggingface.co/models".to_string(),
815            client: reqwest::blocking::Client::new(),
816        }
817    }
818
819    /// Creates a provider using the `HUGGINGFACE_API_KEY` environment variable.
820    ///
821    /// Uses `lightonai/LightOnOCR-2-1B` as the default model.
822    ///
823    /// # Errors
824    ///
825    /// Returns error if the environment variable is not set.
826    pub fn from_env() -> Result<Self, OcrError> {
827        let api_key = std::env::var("HUGGINGFACE_API_KEY")
828            .map_err(|_| OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string()))?;
829        Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B"))
830    }
831
832    /// Creates a provider with the bbox variant for figure extraction.
833    ///
834    /// # Errors
835    ///
836    /// Returns error if the environment variable is not set.
837    pub fn from_env_with_bbox() -> Result<Self, OcrError> {
838        let api_key = std::env::var("HUGGINGFACE_API_KEY")
839            .map_err(|_| OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string()))?;
840        Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B-bbox"))
841    }
842
843    /// Uses a custom base URL.
844    #[must_use]
845    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
846        self.base_url = url.into();
847        self
848    }
849}
850
851impl OcrProvider for LightOnOcrProvider {
852    fn name(&self) -> &'static str {
853        "lighton-ocr"
854    }
855
856    fn model(&self) -> &str {
857        &self.model
858    }
859
860    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
861        // LightOnOCR uses HuggingFace Inference API
862        let image_bytes = match &request.input {
863            OcrInput::ImageBytes(bytes) => bytes.clone(),
864            OcrInput::PdfBytes(_) => {
865                return Err(OcrError::InvalidInput(
866                    "LightOnOCR requires image input. Convert PDF pages to images first.".to_string()
867                ));
868            }
869            OcrInput::Url(url) => {
870                // Fetch the image
871                let response = self.client
872                    .get(url)
873                    .send()
874                    .map_err(|e| OcrError::Network(format!("Failed to fetch image: {e}")))?;
875                response.bytes()
876                    .map_err(|e| OcrError::Network(format!("Failed to read image: {e}")))?
877                    .to_vec()
878            }
879            OcrInput::Base64(data) => {
880                base64::Engine::decode(&base64::engine::general_purpose::STANDARD, data)
881                    .map_err(|e| OcrError::Parse(format!("Invalid base64: {e}")))?
882            }
883        };
884
885        let response = self
886            .client
887            .post(format!("{}/{}", self.base_url, self.model))
888            .header("Authorization", format!("Bearer {}", self.api_key))
889            .header("Content-Type", "application/octet-stream")
890            .body(image_bytes)
891            .send()
892            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
893
894        let status = response.status();
895        if !status.is_success() {
896            let error_text = response.text().unwrap_or_default();
897            return match status.as_u16() {
898                401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
899                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
900                503 => Err(OcrError::Api("Model is loading, please retry".to_string())),
901                _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
902            };
903        }
904
905        // LightOnOCR returns the extracted text directly
906        let text = response
907            .text()
908            .map_err(|e| OcrError::Parse(format!("Failed to read response: {e}")))?;
909
910        Ok(OcrResult {
911            text,
912            pages: 1,
913            spans: vec![], // LightOnOCR doesn't provide word-level spans via HF API
914            tables: vec![],
915            images: vec![],
916            confidence: None,
917            processing_time_ms: None,
918            provenance: OcrProvenance {
919                provider: "lighton-ocr".to_string(),
920                version: self.model.clone(),
921                languages: request.languages.clone(),
922                preprocessing: OcrPreprocessing::default(),
923                input_hash: None,
924                output_hash: None,
925                metadata: std::collections::HashMap::new(),
926            },
927        })
928    }
929}
930
931// =============================================================================
932// Tesseract OCR Provider (Feature-gated, Local)
933// =============================================================================
934//
935// Tesseract is the "boring, reliable" OCR workhorse: classic OCR engine,
936// open source, runs fully locally, and easy to wrap in a Converge-style
937// Provider boundary.
938//
939// =============================================================================
940// WHAT TESSERACT IS
941// =============================================================================
942//
// - An OCR engine originally developed at HP, now maintained as an open-source project.
944// - Takes images (PNG/JPG/TIFF etc) and outputs text, optionally with layout.
945// - Can run with different language packs (English, Swedish, etc).
946// - NOT a "big neural multimodal model" - it's a tool-like OCR system with
947//   modern recognition components but still very deterministic.
948//
949// =============================================================================
950// WHERE IT SHINES
951// =============================================================================
952//
953// - Clean scans, printed documents, forms, invoices, manuals, receipts
954// - High-contrast screenshots
955// - Simple page layouts
956// - Deterministic runs: same input + same version + same settings = same output
957//
958// =============================================================================
959// WHERE IT STRUGGLES
960// =============================================================================
961//
962// - Handwriting (varies, usually weak vs modern DL OCR)
963// - Low-quality photos (blur, perspective, glare)
964// - Complex layouts with tables/columns (unless you guide it well with PSM)
965// - Mixed languages without explicit config
966//
967// If your primary use case is handwriting, camera photos with glare, or dense
968// multi-column PDFs with complex tables, consider a DL-based OCR instead.
969//
970// =============================================================================
971// OUTPUT FORMATS
972// =============================================================================
973//
974// Tesseract can produce:
975// - Plain text: Just the extracted text
976// - TSV: Word-level info with confidence and bounding boxes
977// - hOCR: HTML-like format with bounding boxes (useful for validation)
978// - ALTO XML: Common in libraries/archives
979//
980// For Converge, hOCR/TSV is useful because you can validate "evidence":
981// - Bounding boxes (where each word came from)
982// - Per-word confidence
983// - Page segmentation decisions
984//
985// =============================================================================
986// KEY KNOBS
987// =============================================================================
988//
989// 1. Page Segmentation Mode (PSM) - THE BIGGEST PRACTICAL LEVER
990//    Tells Tesseract what kind of page it's looking at:
991//    - 0 = OSD only (orientation and script detection)
992//    - 1 = Automatic page segmentation with OSD
993//    - 3 = Fully automatic page segmentation (default)
994//    - 4 = Single column of variable sizes
995//    - 6 = Uniform block of text
996//    - 7 = Single text line
997//    - 8 = Single word
998//    - 11 = Sparse text
999//    If you set the wrong mode, accuracy tanks.
1000//
1001// 2. OCR Engine Mode (OEM)
1002//    Chooses which internal engine strategy to use:
1003//    - 0 = Legacy engine only
1004//    - 1 = Neural nets LSTM engine only
1005//    - 2 = Legacy + LSTM engines
1006//    - 3 = Default (auto-select best available)
1007//    Defaults are usually fine, but pin for reproducibility.
1008//
1009// 3. Language Packs
1010//    Set -l eng / -l swe etc. DON'T leave language detection implicit.
1011//
1012// 4. Preprocessing
1013//    Tesseract is EXTREMELY sensitive to:
1014//    - Resolution (DPI) - 300 DPI is typical minimum
1015//    - Binarization (thresholding)
1016//    - Denoise
1017//    - Deskew
1018//    - Contrast normalization
1019//
1020//    This is where "Rust purity" can shine: do deterministic preprocessing
1021//    in Rust (image crate) and then pass a cleaned image to Tesseract.
1022//
1023// =============================================================================
1024// CONVERGE-STYLE INTEGRATION PATTERN
1025// =============================================================================
1026//
1027// Treat OCR as a provider that returns a PROPOSAL, never truth.
1028//
1029// Shape:
1030//   DocumentBytes → ProposedTextExtraction → Validators → Facts/StructuredFields
1031//
1032// Provider output (recommended):
1033//   - text: extracted text
1034//   - spans: optional words/lines with bounding boxes (from TSV/hOCR)
1035//   - confidence: summary stats (mean, min, histogram)
1036//   - tool_provenance:
1037//       - engine = "tesseract"
1038//       - tesseract_version
1039//       - lang
1040//       - psm, oem
1041//       - preprocess_pipeline_hash
1042//   - trace_link:
1043//       - input hash (bytes)
1044//       - output hash
1045//       - settings hash
1046//
1047// Validators (examples):
1048//   - min_confidence >= 0.75 else STOP or WARN
1049//   - required_fields_present (invoice number/date/amount)
1050//   - layout sanity (if table expected, require hOCR structure)
1051//   - PII redaction gate before storage
1052//
1053// =============================================================================
1054// PACKAGING AND DEPLOYMENT
1055// =============================================================================
1056//
1057// Tesseract is a native dependency. Manage cleanly:
1058//
1059// Best practice for "one binary experience":
1060//   - Ship your Rust binary
1061//   - Vendor/bundle Tesseract in installer (or provide "cz doctor" check)
1062//   - Pin versions for reproducibility
1063//
1064// On macOS: Most people install via Homebrew, but for deterministic
1065// environments, package with your app or use Nix.
1066//
1067// =============================================================================
1068// ARCHITECTURE (Rust-first compromise)
1069// =============================================================================
1070//
1071// Tesseract integration follows the "Rust-first compromise" pattern:
1072// - Pure Converge architecture (providers, traces, gates, promotion)
1073// - OCR runs locally with no cloud data exposure
1074// - Accepts native dependency (tesseract + leptonica)
1075//
1076// Integration options (in order of preference):
1077// 1. Sidecar binary: invoke `tesseract` CLI via std::process::Command
1078// 2. FFI binding: link against libtesseract (more complex, faster)
1079// 3. System dependency: require tesseract installed (brew, apt, nix)
1080//
1081// The provider returns:
1082// - Extracted text
1083// - Confidence summary (per-word statistics)
1084// - Provenance: tool version, language pack, preprocessing params
// - Trace link: hashes of input bytes and output
1086//
1087// Determinism: Stable for same input image + same Tesseract version.
1088//
1089// When to use:
1090// - Scanned PDFs, clean prints, forms, invoices, receipts
1091// - "Extract text so downstream validators can reason"
1092// - GDPR/data sovereignty requirements (no cloud exposure)
1093//
1094// Future: Can be swapped with Burn/candle-based OCR model without
1095// changing the core contracts (OcrProvider trait).
1096//
1097// =============================================================================
1098
1099/// Configuration for Tesseract OCR provider.
1100///
1101/// # Feature Gate
1102///
1103/// This provider requires the `tesseract` feature:
1104/// ```toml
1105/// [dependencies]
1106/// converge-provider = { version = "0.2", features = ["tesseract"] }
1107/// ```
1108///
1109/// # System Requirements
1110///
1111/// Tesseract must be installed on the system:
1112/// - macOS: `brew install tesseract tesseract-lang`
1113/// - Ubuntu: `apt install tesseract-ocr tesseract-ocr-eng`
1114/// - Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki
1115///
1116/// # Key Knobs
1117///
1118/// **Page Segmentation Mode (PSM)** - The biggest practical lever:
1119/// - 0 = OSD only (orientation and script detection)
1120/// - 1 = Automatic page segmentation with OSD
1121/// - 3 = Fully automatic page segmentation (default)
1122/// - 4 = Single column of variable sizes
1123/// - 6 = Uniform block of text
1124/// - 7 = Single text line
1125/// - 8 = Single word
1126/// - 11 = Sparse text
1127///
1128/// If you set the wrong mode, accuracy tanks.
1129///
1130/// **OCR Engine Mode (OEM)**:
1131/// - 0 = Legacy engine only
1132/// - 1 = Neural nets LSTM engine only
1133/// - 2 = Legacy + LSTM engines
1134/// - 3 = Default (auto-select best available)
1135///
1136/// **Preprocessing**: Tesseract is EXTREMELY sensitive to:
1137/// - Resolution (DPI) - 300 DPI is typical minimum
1138/// - Binarization, denoise, deskew, contrast normalization
1139///
1140/// # Example (Future)
1141///
1142/// ```ignore
1143/// use converge_provider::ocr::{TesseractOcrProvider, TesseractConfig, TesseractOutputFormat, OcrRequest};
1144///
1145/// let config = TesseractConfig::new()
1146///     .with_languages(vec!["eng", "deu"])
1147///     .with_dpi(300)
1148///     .with_psm(3)  // Fully automatic
1149///     .with_output_format(TesseractOutputFormat::Tsv);  // Get bounding boxes
1150///
1151/// let provider = TesseractOcrProvider::with_config(config);
1152/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
1153///
1154/// // Provenance includes tool version, language pack, preprocessing
1155/// println!("Tesseract version: {}", result.provenance.version);
1156/// println!("Confidence: {:.2}%", result.confidence.unwrap().mean * 100.0);
1157///
1158/// // Check spans for evidence validation
1159/// for span in &result.spans {
1160///     if span.is_low_confidence(0.75) {
1161///         println!("Low confidence word: {} ({:.0}%)", span.text, span.confidence * 100.0);
1162///     }
1163/// }
1164/// ```
#[derive(Debug, Clone)]
pub struct TesseractConfig {
    /// Path to tesseract binary (default: "tesseract" in PATH).
    pub binary_path: String,
    /// Path to tessdata directory (language files).
    /// `None` means let Tesseract use its built-in/system default location.
    pub tessdata_path: Option<String>,
    /// Languages to use (e.g., ["eng", "deu"]). Default: ["eng"].
    /// DON'T leave language detection implicit!
    pub languages: Vec<String>,
    /// DPI for PDF rendering (default: 300).
    /// 300 DPI is typical minimum for good results.
    pub dpi: u32,
    /// Page segmentation mode (PSM); Tesseract accepts 0-13.
    /// 0 = OSD only, 1 = auto + OSD, 3 = fully auto (default), 6 = uniform block, etc.
    /// THIS IS THE BIGGEST PRACTICAL LEVER. Wrong mode = bad accuracy.
    pub psm: u32,
    /// OCR engine mode (OEM); Tesseract accepts 0-3.
    /// 0 = Legacy, 1 = Neural LSTM, 2 = Legacy + LSTM, 3 = Default (auto).
    /// Pin for reproducibility.
    pub oem: u32,
    /// Output format (text, TSV, hOCR, ALTO). Default: plain text.
    /// Use TSV or hOCR for word-level confidence and bounding boxes.
    pub output_format: TesseractOutputFormat,
    /// Whether to apply preprocessing (deskew, denoise, binarize). Default: true.
    /// Tesseract is EXTREMELY sensitive to image quality.
    pub preprocess: bool,
    /// Timeout in seconds for OCR operation (default: 60).
    pub timeout_secs: u64,
}
1194
1195impl Default for TesseractConfig {
1196    fn default() -> Self {
1197        Self {
1198            binary_path: "tesseract".to_string(),
1199            tessdata_path: None,
1200            languages: vec!["eng".to_string()],
1201            dpi: 300,
1202            psm: 3, // Fully automatic page segmentation
1203            oem: 3, // Default (auto-select best available)
1204            output_format: TesseractOutputFormat::Text,
1205            preprocess: true,
1206            timeout_secs: 60,
1207        }
1208    }
1209}
1210
1211impl TesseractConfig {
1212    /// Creates a new Tesseract configuration with defaults.
1213    #[must_use]
1214    pub fn new() -> Self {
1215        Self::default()
1216    }
1217
1218    /// Sets the path to the tesseract binary.
1219    #[must_use]
1220    pub fn with_binary_path(mut self, path: impl Into<String>) -> Self {
1221        self.binary_path = path.into();
1222        self
1223    }
1224
1225    /// Sets the tessdata directory path.
1226    #[must_use]
1227    pub fn with_tessdata_path(mut self, path: impl Into<String>) -> Self {
1228        self.tessdata_path = Some(path.into());
1229        self
1230    }
1231
1232    /// Sets the languages to use.
1233    #[must_use]
1234    pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1235        self.languages = languages.into_iter().map(Into::into).collect();
1236        self
1237    }
1238
1239    /// Sets the DPI for PDF rendering.
1240    #[must_use]
1241    pub fn with_dpi(mut self, dpi: u32) -> Self {
1242        self.dpi = dpi;
1243        self
1244    }
1245
1246    /// Sets the page segmentation mode.
1247    #[must_use]
1248    pub fn with_psm(mut self, psm: u32) -> Self {
1249        self.psm = psm;
1250        self
1251    }
1252
1253    /// Sets the OCR engine mode.
1254    #[must_use]
1255    pub fn with_oem(mut self, oem: u32) -> Self {
1256        self.oem = oem;
1257        self
1258    }
1259
1260    /// Sets whether to apply preprocessing.
1261    #[must_use]
1262    pub fn with_preprocess(mut self, preprocess: bool) -> Self {
1263        self.preprocess = preprocess;
1264        self
1265    }
1266
1267    /// Sets the timeout in seconds.
1268    #[must_use]
1269    pub fn with_timeout(mut self, secs: u64) -> Self {
1270        self.timeout_secs = secs;
1271        self
1272    }
1273
1274    /// Sets the output format.
1275    /// Use TSV or hOCR for word-level confidence and bounding boxes.
1276    #[must_use]
1277    pub fn with_output_format(mut self, format: TesseractOutputFormat) -> Self {
1278        self.output_format = format;
1279        self
1280    }
1281}
1282
1283/// Tesseract OCR provider (stub - not yet implemented).
1284///
1285/// This is a placeholder for the local Tesseract OCR integration.
1286/// The actual implementation will be feature-gated behind `tesseract`.
1287///
1288/// # Architecture
1289///
1290/// ```text
1291/// TesseractOcrProvider
1292///     │
1293///     ├── Input (PDF/Image bytes)
1294///     │      │
1295///     │      ▼
1296///     ├── Preprocessing (optional)
1297///     │      ├── PDF → Images (pdftoppm/pdf2image)
1298///     │      ├── Deskew (leptonica)
1299///     │      ├── Denoise (leptonica)
1300///     │      └── Binarize (leptonica)
1301///     │      │
1302///     │      ▼
1303///     ├── Tesseract CLI/FFI
1304///     │      │
1305///     │      ▼
1306///     ├── Output
1307///     │      ├── Text (plain or hOCR/ALTO)
1308///     │      ├── Confidence (per-word)
1309///     │      └── Bounding boxes (optional)
1310///     │      │
1311///     │      ▼
1312///     └── OcrResult with Provenance
1313///            ├── text
1314///            ├── confidence summary
1315///            ├── provenance (version, langs, params)
1316///            └── trace hashes (input/output)
1317/// ```
1318///
1319/// # Future Implementation
1320///
1321/// When the `tesseract` feature is enabled:
1322///
1323/// ```ignore
1324/// #[cfg(feature = "tesseract")]
1325/// impl OcrProvider for TesseractOcrProvider {
1326///     fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
1327///         // 1. Hash input for trace links
1328///         // 2. Preprocess if needed (PDF→image, deskew, etc.)
1329///         // 3. Invoke tesseract CLI or FFI
1330///         // 4. Parse output (text + confidence)
1331///         // 5. Hash output for trace links
1332///         // 6. Return OcrResult with full provenance
1333///     }
1334/// }
1335/// ```
#[derive(Debug)]
pub struct TesseractOcrProvider {
    /// Tesseract settings: binary path, languages, DPI, PSM/OEM,
    /// output format, preprocessing, and timeout.
    config: TesseractConfig,
}
1340
1341impl TesseractOcrProvider {
1342    /// Creates a new Tesseract OCR provider with default configuration.
1343    #[must_use]
1344    pub fn new() -> Self {
1345        Self {
1346            config: TesseractConfig::default(),
1347        }
1348    }
1349
1350    /// Creates a provider with custom configuration.
1351    #[must_use]
1352    pub fn with_config(config: TesseractConfig) -> Self {
1353        Self { config }
1354    }
1355
1356    /// Sets the languages to use.
1357    #[must_use]
1358    pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1359        self.config.languages = languages.into_iter().map(Into::into).collect();
1360        self
1361    }
1362
1363    /// Sets the DPI for PDF rendering.
1364    #[must_use]
1365    pub fn with_dpi(mut self, dpi: u32) -> Self {
1366        self.config.dpi = dpi;
1367        self
1368    }
1369
1370    /// Checks if Tesseract is available on the system.
1371    ///
1372    /// # Errors
1373    ///
1374    /// Returns error if Tesseract is not found or cannot be executed.
1375    pub fn check_availability(&self) -> Result<String, OcrError> {
1376        // This is a stub - actual implementation would run `tesseract --version`
1377        Err(OcrError::Api(
1378            "Tesseract provider not yet implemented. Enable the 'tesseract' feature.".to_string()
1379        ))
1380    }
1381
1382    /// Returns the Tesseract version (stub).
1383    pub fn version(&self) -> Option<String> {
1384        None // Stub - would parse `tesseract --version` output
1385    }
1386}
1387
1388impl Default for TesseractOcrProvider {
1389    fn default() -> Self {
1390        Self::new()
1391    }
1392}
1393
1394// Stub implementation - will be replaced when feature is implemented
1395impl OcrProvider for TesseractOcrProvider {
1396    fn name(&self) -> &'static str {
1397        "tesseract"
1398    }
1399
1400    fn model(&self) -> &str {
1401        "tesseract-stub"
1402    }
1403
1404    fn extract(&self, _request: &OcrRequest) -> Result<OcrResult, OcrError> {
1405        Err(OcrError::Api(
1406            "Tesseract OCR provider not yet implemented. \
1407             This is a placeholder for future local OCR support. \
1408             For now, use MistralOcrProvider, DeepSeekOcrProvider, or LightOnOcrProvider."
1409                .to_string()
1410        ))
1411    }
1412}
1413
1414// =============================================================================
1415// Helper functions for provenance
1416// =============================================================================
1417
1418/// Computes SHA-256 hash of bytes for trace links.
1419#[must_use]
1420pub fn compute_hash(data: &[u8]) -> String {
1421    use sha2::{Sha256, Digest};
1422    let mut hasher = Sha256::new();
1423    hasher.update(data);
1424    format!("{:x}", hasher.finalize())
1425}
1426
1427/// Computes input/output hashes and returns updated provenance.
1428#[must_use]
1429pub fn with_trace_hashes(mut provenance: OcrProvenance, input: &[u8], output: &str) -> OcrProvenance {
1430    provenance.input_hash = Some(compute_hash(input));
1431    provenance.output_hash = Some(compute_hash(output.as_bytes()));
1432    provenance
1433}
1434
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder methods should each round-trip their value into the request.
    #[test]
    fn test_ocr_request_builder() {
        let req = OcrRequest::from_pdf_bytes(vec![1, 2, 3])
            .with_output_format(OcrOutputFormat::Html)
            .with_languages(vec!["en".to_string(), "de".to_string()])
            .with_extract_tables(true)
            .with_extract_images(true)
            .with_page_range(0, 10);

        assert_eq!(req.output_format, OcrOutputFormat::Html);
        assert_eq!(req.languages, ["en", "de"]);
        assert!(req.extract_tables);
        assert!(req.extract_images);
        assert_eq!(req.page_range, Some((0, 10)));
    }

    /// Markdown is the documented default output format.
    #[test]
    fn test_ocr_output_format_default() {
        assert_eq!(OcrOutputFormat::default(), OcrOutputFormat::Markdown);
    }
}