// converge_provider/ocr.rs
1// Copyright 2024-2026 Reflective Labs
2// SPDX-License-Identifier: MIT
3// See LICENSE file in the project root for full license information.
4
5//! OCR / Document AI providers.
6//!
7//! This module provides integration with OCR models for document understanding,
8//! text extraction, and structured content parsing from PDFs, scans, and images.
9//!
10//! # Available Providers
11//!
12//! - [`MistralOcrProvider`] - Mistral OCR 3 (GDPR-compliant, EU)
13//! - [`DeepSeekOcrProvider`] - `DeepSeek` OCR 2 (Visual Causal Flow)
14//! - [`LightOnOcrProvider`] - LightOnOCR-2-1B (Efficient, open-source)
15//!
16//! # Example
17//!
18//! ```ignore
19//! use converge_provider::ocr::{OcrProvider, MistralOcrProvider, OcrRequest};
20//!
21//! let provider = MistralOcrProvider::from_env()?;
22//! let request = OcrRequest::from_pdf_bytes(pdf_bytes);
23//! let result = provider.extract(&request)?;
24//!
25//! println!("Extracted text: {}", result.text);
26//! for table in result.tables {
27//!     println!("Table: {:?}", table);
28//! }
29//! ```
30
31use serde::{Deserialize, Serialize};
32
/// Error type for OCR operations.
///
/// Providers in this module map HTTP 401/403 responses to [`OcrError::Auth`],
/// HTTP 429 to [`OcrError::RateLimit`], and other non-success statuses to
/// [`OcrError::Api`].
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// Network/HTTP error (request could not be sent or the body read).
    #[error("Network error: {0}")]
    Network(String),

    /// API authentication error (missing env var or rejected credentials).
    #[error("Authentication error: {0}")]
    Auth(String),

    /// Rate limit exceeded.
    #[error("Rate limit exceeded: {0}")]
    RateLimit(String),

    /// API response parsing error.
    #[error("Parse error: {0}")]
    Parse(String),

    /// Invalid input (unsupported format, etc.).
    #[error("Invalid input: {0}")]
    InvalidInput(String),

    /// General API error.
    #[error("API error: {0}")]
    Api(String),
}
60
/// Input type for OCR processing.
///
/// Not every provider accepts every variant; e.g. [`LightOnOcrProvider`]
/// rejects `PdfBytes` and requires image input.
#[derive(Debug, Clone)]
pub enum OcrInput {
    /// PDF document as bytes.
    PdfBytes(Vec<u8>),
    /// Image as bytes (PNG, JPEG, etc.).
    ImageBytes(Vec<u8>),
    /// URL to a document or image.
    Url(String),
    /// Base64-encoded document or image (raw base64, without a `data:` URL
    /// prefix — providers add the prefix themselves).
    Base64(String),
}
73
/// OCR extraction request.
///
/// Construct via [`OcrRequest::from_pdf_bytes`], [`OcrRequest::from_image_bytes`],
/// or [`OcrRequest::from_url`], then customize with the `with_*` builders.
#[derive(Debug, Clone)]
pub struct OcrRequest {
    /// Input document or image.
    pub input: OcrInput,
    /// Output format preference.
    pub output_format: OcrOutputFormat,
    /// Language hints (ISO 639-1 codes).
    pub languages: Vec<String>,
    /// Whether to extract tables.
    pub extract_tables: bool,
    /// Whether to extract images/figures.
    pub extract_images: bool,
    /// Page range (for multi-page documents); `None` means all pages.
    /// NOTE(review): inclusive/exclusive semantics of the end bound are not
    /// defined here — confirm against provider implementations.
    pub page_range: Option<(usize, usize)>,
}
90
91impl OcrRequest {
92    /// Creates a request from PDF bytes.
93    #[must_use]
94    pub fn from_pdf_bytes(bytes: Vec<u8>) -> Self {
95        Self {
96            input: OcrInput::PdfBytes(bytes),
97            output_format: OcrOutputFormat::Markdown,
98            languages: vec![],
99            extract_tables: true,
100            extract_images: false,
101            page_range: None,
102        }
103    }
104
105    /// Creates a request from image bytes.
106    #[must_use]
107    pub fn from_image_bytes(bytes: Vec<u8>) -> Self {
108        Self {
109            input: OcrInput::ImageBytes(bytes),
110            output_format: OcrOutputFormat::Markdown,
111            languages: vec![],
112            extract_tables: true,
113            extract_images: false,
114            page_range: None,
115        }
116    }
117
118    /// Creates a request from a URL.
119    #[must_use]
120    pub fn from_url(url: impl Into<String>) -> Self {
121        Self {
122            input: OcrInput::Url(url.into()),
123            output_format: OcrOutputFormat::Markdown,
124            languages: vec![],
125            extract_tables: true,
126            extract_images: false,
127            page_range: None,
128        }
129    }
130
131    /// Sets the output format.
132    #[must_use]
133    pub fn with_output_format(mut self, format: OcrOutputFormat) -> Self {
134        self.output_format = format;
135        self
136    }
137
138    /// Adds language hints.
139    #[must_use]
140    pub fn with_languages(mut self, languages: Vec<String>) -> Self {
141        self.languages = languages;
142        self
143    }
144
145    /// Sets whether to extract tables.
146    #[must_use]
147    pub fn with_extract_tables(mut self, extract: bool) -> Self {
148        self.extract_tables = extract;
149        self
150    }
151
152    /// Sets whether to extract images.
153    #[must_use]
154    pub fn with_extract_images(mut self, extract: bool) -> Self {
155        self.extract_images = extract;
156        self
157    }
158
159    /// Sets the page range for multi-page documents.
160    #[must_use]
161    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
162        self.page_range = Some((start, end));
163        self
164    }
165}
166
/// Output format for OCR results.
///
/// Defaults to [`OcrOutputFormat::Markdown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OcrOutputFormat {
    /// Plain text.
    Text,
    /// Markdown with structure preserved.
    #[default]
    Markdown,
    /// HTML with table reconstruction.
    Html,
    /// JSON with structured data.
    Json,
}
180
/// A detected table in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrTable {
    /// Page number (0-indexed).
    pub page: usize,
    /// Table as HTML or markdown (format depends on the provider).
    pub content: String,
    /// Bounding box (x, y, width, height) if available.
    pub bbox: Option<(f64, f64, f64, f64)>,
}
191
/// A detected image/figure in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrImage {
    /// Page number (0-indexed).
    pub page: usize,
    /// Image description or alt text, if the provider supplies one.
    pub description: Option<String>,
    /// Bounding box (x, y, width, height).
    pub bbox: Option<(f64, f64, f64, f64)>,
    /// Base64-encoded image data (if extraction was requested and supported).
    pub data: Option<String>,
}
204
/// Provenance information for OCR results.
///
/// Captures everything needed for reproducibility and tracing:
/// - Tool version and configuration
/// - Input/output hashes for trace links
/// - Preprocessing parameters applied
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrProvenance {
    /// Provider/tool name (e.g., "tesseract", "mistral-ocr", "deepseek-ocr").
    pub provider: String,
    /// Model or engine version (e.g., "5.3.0", "mistral-ocr-2512").
    pub version: String,
    /// Language pack(s) used (e.g., ["eng", "deu"]).
    pub languages: Vec<String>,
    /// Preprocessing parameters applied.
    pub preprocessing: OcrPreprocessing,
    /// SHA-256 hash of input bytes (for trace links).
    /// NOTE(review): currently never populated by the providers in this file.
    pub input_hash: Option<String>,
    /// SHA-256 hash of output text (for trace links).
    /// NOTE(review): currently never populated by the providers in this file.
    pub output_hash: Option<String>,
    /// Additional metadata (tool-specific).
    #[serde(default)]
    pub metadata: std::collections::HashMap<String, String>,
}
229
/// Preprocessing parameters applied before OCR.
///
/// All fields default to "not applied"/unset via `Default`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrPreprocessing {
    /// DPI used for rendering (for PDFs).
    pub dpi: Option<u32>,
    /// Whether binarization was applied.
    pub binarized: bool,
    /// Whether deskewing was applied.
    pub deskewed: bool,
    /// Whether noise removal was applied.
    pub denoised: bool,
    /// Page segmentation mode (Tesseract-specific).
    pub psm: Option<u32>,
    /// OCR engine mode (Tesseract-specific).
    pub oem: Option<u32>,
}
246
/// Confidence summary for OCR results.
///
/// Aggregated per-word statistics; populated only by providers that report
/// word-level confidence.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrConfidence {
    /// Overall mean confidence (0.0-1.0).
    pub mean: f64,
    /// Minimum word confidence.
    pub min: f64,
    /// Maximum word confidence.
    pub max: f64,
    /// Standard deviation of confidence scores.
    pub std_dev: Option<f64>,
    /// Number of words with confidence below threshold.
    pub low_confidence_words: usize,
    /// Threshold used for low confidence (default 0.6).
    pub threshold: f64,
}
263
/// A word or text span with position and confidence.
///
/// For Tesseract, this comes from TSV or hOCR output.
/// Useful for validation: you can check where each word came from
/// and flag low-confidence regions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrSpan {
    /// The text content of this span.
    pub text: String,
    /// Confidence score (0.0-1.0).
    pub confidence: f64,
    /// Page number (0-indexed).
    pub page: usize,
    /// Bounding box: (x, y, width, height) in pixels.
    pub bbox: Option<(i32, i32, i32, i32)>,
    /// Block number (page segmentation unit).
    pub block_num: Option<i32>,
    /// Paragraph number within block.
    pub par_num: Option<i32>,
    /// Line number within paragraph.
    pub line_num: Option<i32>,
    /// Word number within line.
    pub word_num: Option<i32>,
}
288
289impl OcrSpan {
290    /// Creates a new span with text and confidence.
291    #[must_use]
292    pub fn new(text: impl Into<String>, confidence: f64) -> Self {
293        Self {
294            text: text.into(),
295            confidence,
296            page: 0,
297            bbox: None,
298            block_num: None,
299            par_num: None,
300            line_num: None,
301            word_num: None,
302        }
303    }
304
305    /// Sets the bounding box.
306    #[must_use]
307    pub fn with_bbox(mut self, x: i32, y: i32, w: i32, h: i32) -> Self {
308        self.bbox = Some((x, y, w, h));
309        self
310    }
311
312    /// Sets the page number.
313    #[must_use]
314    pub fn with_page(mut self, page: usize) -> Self {
315        self.page = page;
316        self
317    }
318
319    /// Checks if this span has low confidence (below threshold).
320    #[must_use]
321    pub fn is_low_confidence(&self, threshold: f64) -> bool {
322        self.confidence < threshold
323    }
324}
325
/// Tesseract-specific output format.
///
/// Controls what kind of output to request from Tesseract.
/// Defaults to [`TesseractOutputFormat::Text`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TesseractOutputFormat {
    /// Plain text (default).
    #[default]
    Text,
    /// TSV with word-level confidence and bounding boxes.
    /// Columns: level, `page_num`, `block_num`, `par_num`, `line_num`, `word_num`,
    ///          left, top, width, height, conf, text
    Tsv,
    /// hOCR HTML format with bounding boxes.
    /// Useful for downstream table/layout analysis.
    Hocr,
    /// ALTO XML format (common in libraries/archives).
    Alto,
}
344
/// OCR extraction result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrResult {
    /// Extracted text content.
    pub text: String,
    /// Number of pages processed.
    pub pages: usize,
    /// Word/text spans with positions and confidence.
    /// Populated when using TSV or hOCR output format (Tesseract).
    /// Useful for validation: check where each word came from.
    /// The hosted providers in this module leave this empty.
    #[serde(default)]
    pub spans: Vec<OcrSpan>,
    /// Detected tables.
    pub tables: Vec<OcrTable>,
    /// Detected images/figures.
    pub images: Vec<OcrImage>,
    /// Confidence summary (per-word statistics), when the engine reports it.
    pub confidence: Option<OcrConfidence>,
    /// Processing time in milliseconds.
    pub processing_time_ms: Option<u64>,
    /// Provenance for reproducibility and tracing.
    pub provenance: OcrProvenance,
}
368
/// Trait for OCR providers.
///
/// Implementations are blocking; `Send + Sync` allows sharing a provider
/// across threads.
pub trait OcrProvider: Send + Sync {
    /// Returns the provider name (stable identifier, e.g. "mistral-ocr").
    fn name(&self) -> &'static str;

    /// Returns the model being used.
    fn model(&self) -> &str;

    /// Extracts text and structure from a document.
    ///
    /// # Errors
    ///
    /// Returns error if extraction fails.
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError>;
}
384
385// =============================================================================
386// Mistral OCR Provider
387// =============================================================================
388
/// Mistral OCR 3 provider.
///
/// Mistral OCR 3 is designed for document AI at scale, handling forms, invoices,
/// complex tables, handwriting, and low-quality scans. It outputs structured
/// text/HTML suitable for RAG and agent workflows.
///
/// # Features
/// - 74% win rate over OCR 2 on forms, handwriting, tables
/// - Markdown output with HTML table reconstruction
/// - GDPR-compliant (France)
/// - $2 per 1000 pages ($1 with batch API)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{MistralOcrProvider, OcrRequest};
///
/// let provider = MistralOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct MistralOcrProvider {
    // API key; only revealed via `expose()` when building request headers.
    api_key: crate::secret::SecretString,
    // Model identifier sent in the request body (e.g. "mistral-ocr-latest").
    model: String,
    // API root, default "https://api.mistral.ai/v1"; overridable for testing.
    base_url: String,
    // Reused blocking HTTP client.
    client: reqwest::blocking::Client,
}
415
416impl MistralOcrProvider {
417    /// Creates a new Mistral OCR provider.
418    #[must_use]
419    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
420        Self {
421            api_key: crate::secret::SecretString::new(api_key),
422            model: model.into(),
423            base_url: "https://api.mistral.ai/v1".to_string(),
424            client: reqwest::blocking::Client::new(),
425        }
426    }
427
428    /// Creates a provider using the `MISTRAL_API_KEY` environment variable.
429    ///
430    /// Uses `mistral-ocr-latest` as the default model.
431    ///
432    /// # Errors
433    ///
434    /// Returns error if the environment variable is not set.
435    pub fn from_env() -> Result<Self, OcrError> {
436        let api_key = std::env::var("MISTRAL_API_KEY").map_err(|_| {
437            OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string())
438        })?;
439        Ok(Self::new(api_key, "mistral-ocr-latest"))
440    }
441
442    /// Creates a provider with a specific model.
443    ///
444    /// # Errors
445    ///
446    /// Returns error if the environment variable is not set.
447    pub fn from_env_with_model(model: impl Into<String>) -> Result<Self, OcrError> {
448        let api_key = std::env::var("MISTRAL_API_KEY").map_err(|_| {
449            OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string())
450        })?;
451        Ok(Self::new(api_key, model))
452    }
453
454    /// Uses a custom base URL.
455    #[must_use]
456    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
457        self.base_url = url.into();
458        self
459    }
460}
461
462impl OcrProvider for MistralOcrProvider {
463    fn name(&self) -> &'static str {
464        "mistral-ocr"
465    }
466
467    fn model(&self) -> &str {
468        &self.model
469    }
470
471    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
472        // Build the request body based on input type
473        let document = match &request.input {
474            OcrInput::PdfBytes(bytes) => {
475                serde_json::json!({
476                    "type": "document_url",
477                    "document_url": format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
478                })
479            }
480            OcrInput::ImageBytes(bytes) => {
481                serde_json::json!({
482                    "type": "image_url",
483                    "image_url": format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
484                })
485            }
486            OcrInput::Url(url) => {
487                if std::path::Path::new(url)
488                    .extension()
489                    .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
490                {
491                    serde_json::json!({
492                        "type": "document_url",
493                        "document_url": url
494                    })
495                } else {
496                    serde_json::json!({
497                        "type": "image_url",
498                        "image_url": url
499                    })
500                }
501            }
502            OcrInput::Base64(data) => {
503                serde_json::json!({
504                    "type": "document_url",
505                    "document_url": format!("data:application/pdf;base64,{}", data)
506                })
507            }
508        };
509
510        let body = serde_json::json!({
511            "model": self.model,
512            "document": document,
513            "include_image_base64": request.extract_images
514        });
515
516        let response = self
517            .client
518            .post(format!("{}/ocr", self.base_url))
519            .header("Authorization", format!("Bearer {}", self.api_key.expose()))
520            .header("Content-Type", "application/json")
521            .json(&body)
522            .send()
523            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
524
525        let status = response.status();
526        if !status.is_success() {
527            let error_text = response.text().unwrap_or_default();
528            return match status.as_u16() {
529                401 | 403 => Err(OcrError::Auth(format!(
530                    "Authentication failed: {error_text}"
531                ))),
532                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
533                _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
534            };
535        }
536
537        let api_response: MistralOcrResponse = response
538            .json()
539            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;
540
541        // Convert to our result format
542        let mut tables = vec![];
543        let mut images = vec![];
544        let mut text = String::new();
545
546        for (page_idx, page) in api_response.pages.iter().enumerate() {
547            text.push_str(&page.markdown);
548            text.push_str("\n\n");
549
550            // Extract tables from markdown (simplified)
551            // In practice, Mistral returns tables as HTML within markdown
552            if page.markdown.contains("<table") {
553                tables.push(OcrTable {
554                    page: page_idx,
555                    content: page.markdown.clone(),
556                    bbox: None,
557                });
558            }
559
560            // Extract images if present
561            for img in &page.images {
562                images.push(OcrImage {
563                    page: page_idx,
564                    description: None,
565                    bbox: None,
566                    data: img.image_base64.clone(),
567                });
568            }
569        }
570
571        Ok(OcrResult {
572            text: text.trim().to_string(),
573            pages: api_response.pages.len(),
574            spans: vec![], // Mistral OCR doesn't provide word-level spans
575            tables,
576            images,
577            confidence: None,
578            processing_time_ms: None,
579            provenance: OcrProvenance {
580                provider: "mistral-ocr".to_string(),
581                version: self.model.clone(),
582                languages: request.languages.clone(),
583                preprocessing: OcrPreprocessing::default(),
584                input_hash: None,  // TODO: compute from input
585                output_hash: None, // TODO: compute from output
586                metadata: std::collections::HashMap::new(),
587            },
588        })
589    }
590}
591
// Wire-format mirror of the Mistral /ocr response; field names must match
// the JSON keys exactly (serde derives rely on them).
#[derive(Debug, Deserialize)]
struct MistralOcrResponse {
    pages: Vec<MistralOcrPage>,
}

// One page of OCR output: markdown text plus any embedded images.
#[derive(Debug, Deserialize)]
struct MistralOcrPage {
    markdown: String,
    #[serde(default)]
    images: Vec<MistralOcrImage>,
}

// Embedded image payload; base64 data is present only when requested.
#[derive(Debug, Deserialize)]
struct MistralOcrImage {
    #[serde(default)]
    image_base64: Option<String>,
}
609
610// =============================================================================
611// DeepSeek OCR Provider
612// =============================================================================
613
/// `DeepSeek` OCR 2 provider.
///
/// `DeepSeek` OCR 2 is a 3B-parameter vision-language model with the `DeepEncoder` V2
/// architecture featuring Visual Causal Flow for human-like reading order.
///
/// # Features
/// - SOTA on document understanding benchmarks
/// - Human-like visual reading order
/// - Semantic visual reasoning
/// - 16x token compression
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{DeepSeekOcrProvider, OcrRequest};
///
/// let provider = DeepSeekOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_image_bytes(image_bytes))?;
/// ```
pub struct DeepSeekOcrProvider {
    // API key; only revealed via `expose()` when building request headers.
    api_key: crate::secret::SecretString,
    // Model identifier sent in the chat-completions body.
    model: String,
    // API root, default "https://api.deepseek.com/v1"; overridable for testing.
    base_url: String,
    // Reused blocking HTTP client.
    client: reqwest::blocking::Client,
}
639
640impl DeepSeekOcrProvider {
641    /// Creates a new `DeepSeek` OCR provider.
642    #[must_use]
643    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
644        Self {
645            api_key: crate::secret::SecretString::new(api_key),
646            model: model.into(),
647            base_url: "https://api.deepseek.com/v1".to_string(),
648            client: reqwest::blocking::Client::new(),
649        }
650    }
651
652    /// Creates a provider using the `DEEPSEEK_API_KEY` environment variable.
653    ///
654    /// Uses `deepseek-ocr-2` as the default model.
655    ///
656    /// # Errors
657    ///
658    /// Returns error if the environment variable is not set.
659    pub fn from_env() -> Result<Self, OcrError> {
660        let api_key = std::env::var("DEEPSEEK_API_KEY").map_err(|_| {
661            OcrError::Auth("DEEPSEEK_API_KEY environment variable not set".to_string())
662        })?;
663        Ok(Self::new(api_key, "deepseek-ocr-2"))
664    }
665
666    /// Uses a custom base URL.
667    #[must_use]
668    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
669        self.base_url = url.into();
670        self
671    }
672}
673
impl OcrProvider for DeepSeekOcrProvider {
    fn name(&self) -> &'static str {
        "deepseek-ocr"
    }

    fn model(&self) -> &str {
        &self.model
    }

    /// Extracts text by sending the input as an image to the DeepSeek
    /// chat-completions endpoint with a fixed extraction prompt.
    ///
    /// The returned [`OcrResult`] always reports a single page and carries no
    /// spans, tables, or images — only the model's markdown text.
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
        // DeepSeek OCR uses a chat-like API with vision capabilities; every
        // input variant is rendered as an image URL (data: URL for bytes).
        let image_content = match &request.input {
            OcrInput::ImageBytes(bytes) => {
                format!(
                    "data:image/png;base64,{}",
                    base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
                )
            }
            OcrInput::PdfBytes(bytes) => {
                // DeepSeek OCR expects images; for PDF, we'd need to convert pages.
                // For now, treat as base64 document.
                // NOTE(review): the API may reject a PDF data URL in an
                // image_url slot — confirm before relying on this path.
                format!(
                    "data:application/pdf;base64,{}",
                    base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
                )
            }
            OcrInput::Url(url) => url.clone(),
            // Bare base64 is assumed to be PNG image data here.
            OcrInput::Base64(data) => format!("data:image/png;base64,{data}"),
        };

        // NOTE(review): request.output_format, languages, extract_tables,
        // extract_images, and page_range are not reflected in the prompt.
        let body = serde_json::json!({
            "model": self.model,
            "messages": [{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_content
                        }
                    },
                    {
                        "type": "text",
                        "text": "Extract all text from this document, preserving structure, tables, and reading order. Output in markdown format."
                    }
                ]
            }],
            "max_tokens": 8192
        });

        let response = self
            .client
            .post(format!("{}/chat/completions", self.base_url))
            .header("Authorization", format!("Bearer {}", self.api_key.expose()))
            .header("Content-Type", "application/json")
            .json(&body)
            .send()
            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;

        // Map HTTP failures onto the error taxonomy before parsing the body.
        let status = response.status();
        if !status.is_success() {
            let error_text = response.text().unwrap_or_default();
            return match status.as_u16() {
                401 | 403 => Err(OcrError::Auth(format!(
                    "Authentication failed: {error_text}"
                ))),
                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
                _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
            };
        }

        let api_response: DeepSeekOcrResponse = response
            .json()
            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;

        // Take the first choice's content; an empty string if the model
        // returned no choices or a null content.
        let text = api_response
            .choices
            .first()
            .and_then(|c| c.message.content.clone())
            .unwrap_or_default();

        Ok(OcrResult {
            text,
            pages: 1,      // DeepSeek processes one image at a time
            spans: vec![], // DeepSeek OCR doesn't provide word-level spans
            tables: vec![],
            images: vec![],
            confidence: None,
            processing_time_ms: None,
            provenance: OcrProvenance {
                provider: "deepseek-ocr".to_string(),
                version: self.model.clone(),
                languages: request.languages.clone(),
                preprocessing: OcrPreprocessing::default(),
                input_hash: None,
                output_hash: None,
                metadata: std::collections::HashMap::new(),
            },
        })
    }
}
775
// Wire-format mirror of the DeepSeek chat-completions response; field names
// must match the JSON keys exactly (serde derives rely on them).
#[derive(Debug, Deserialize)]
struct DeepSeekOcrResponse {
    choices: Vec<DeepSeekOcrChoice>,
}

#[derive(Debug, Deserialize)]
struct DeepSeekOcrChoice {
    message: DeepSeekOcrMessage,
}

// Assistant message; content may be null in the JSON, hence Option.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrMessage {
    content: Option<String>,
}
790
791// =============================================================================
792// LightOn OCR Provider
793// =============================================================================
794
/// LightOnOCR-2-1B provider.
///
/// LightOnOCR-2 is an efficient 1B-parameter vision-language model that achieves
/// SOTA on OlmOCR-Bench while being 9x smaller than competitors.
///
/// # Features
/// - 1B parameters, 9x smaller than competitors
/// - 5.71 pages/s on H100 (~493k pages/day)
/// - <$0.01 per 1000 pages
/// - Apache 2.0 license, open weights
/// - GDPR-compliant (France)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{LightOnOcrProvider, OcrRequest};
///
/// let provider = LightOnOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct LightOnOcrProvider {
    // API key; only revealed via `expose()` when building request headers.
    api_key: crate::secret::SecretString,
    // HuggingFace model id (e.g. "lightonai/LightOnOCR-2-1B"), appended to base_url.
    model: String,
    // Inference API root, default "https://api-inference.huggingface.co/models".
    base_url: String,
    // Reused blocking HTTP client (also used to fetch Url inputs).
    client: reqwest::blocking::Client,
}
821
822impl LightOnOcrProvider {
823    /// Creates a new `LightOn` OCR provider.
824    #[must_use]
825    pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
826        Self {
827            api_key: crate::secret::SecretString::new(api_key),
828            model: model.into(),
829            base_url: "https://api-inference.huggingface.co/models".to_string(),
830            client: reqwest::blocking::Client::new(),
831        }
832    }
833
834    /// Creates a provider using the `HUGGINGFACE_API_KEY` environment variable.
835    ///
836    /// Uses `lightonai/LightOnOCR-2-1B` as the default model.
837    ///
838    /// # Errors
839    ///
840    /// Returns error if the environment variable is not set.
841    pub fn from_env() -> Result<Self, OcrError> {
842        let api_key = std::env::var("HUGGINGFACE_API_KEY").map_err(|_| {
843            OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string())
844        })?;
845        Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B"))
846    }
847
848    /// Creates a provider with the bbox variant for figure extraction.
849    ///
850    /// # Errors
851    ///
852    /// Returns error if the environment variable is not set.
853    pub fn from_env_with_bbox() -> Result<Self, OcrError> {
854        let api_key = std::env::var("HUGGINGFACE_API_KEY").map_err(|_| {
855            OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string())
856        })?;
857        Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B-bbox"))
858    }
859
860    /// Uses a custom base URL.
861    #[must_use]
862    pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
863        self.base_url = url.into();
864        self
865    }
866}
867
868impl OcrProvider for LightOnOcrProvider {
869    fn name(&self) -> &'static str {
870        "lighton-ocr"
871    }
872
873    fn model(&self) -> &str {
874        &self.model
875    }
876
877    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
878        // LightOnOCR uses HuggingFace Inference API
879        let image_bytes = match &request.input {
880            OcrInput::ImageBytes(bytes) => bytes.clone(),
881            OcrInput::PdfBytes(_) => {
882                return Err(OcrError::InvalidInput(
883                    "LightOnOCR requires image input. Convert PDF pages to images first."
884                        .to_string(),
885                ));
886            }
887            OcrInput::Url(url) => {
888                // Fetch the image
889                let response = self
890                    .client
891                    .get(url)
892                    .send()
893                    .map_err(|e| OcrError::Network(format!("Failed to fetch image: {e}")))?;
894                response
895                    .bytes()
896                    .map_err(|e| OcrError::Network(format!("Failed to read image: {e}")))?
897                    .to_vec()
898            }
899            OcrInput::Base64(data) => {
900                base64::Engine::decode(&base64::engine::general_purpose::STANDARD, data)
901                    .map_err(|e| OcrError::Parse(format!("Invalid base64: {e}")))?
902            }
903        };
904
905        let response = self
906            .client
907            .post(format!("{}/{}", self.base_url, self.model))
908            .header("Authorization", format!("Bearer {}", self.api_key.expose()))
909            .header("Content-Type", "application/octet-stream")
910            .body(image_bytes)
911            .send()
912            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
913
914        let status = response.status();
915        if !status.is_success() {
916            let error_text = response.text().unwrap_or_default();
917            return match status.as_u16() {
918                401 | 403 => Err(OcrError::Auth(format!(
919                    "Authentication failed: {error_text}"
920                ))),
921                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
922                503 => Err(OcrError::Api("Model is loading, please retry".to_string())),
923                _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
924            };
925        }
926
927        // LightOnOCR returns the extracted text directly
928        let text = response
929            .text()
930            .map_err(|e| OcrError::Parse(format!("Failed to read response: {e}")))?;
931
932        Ok(OcrResult {
933            text,
934            pages: 1,
935            spans: vec![], // LightOnOCR doesn't provide word-level spans via HF API
936            tables: vec![],
937            images: vec![],
938            confidence: None,
939            processing_time_ms: None,
940            provenance: OcrProvenance {
941                provider: "lighton-ocr".to_string(),
942                version: self.model.clone(),
943                languages: request.languages.clone(),
944                preprocessing: OcrPreprocessing::default(),
945                input_hash: None,
946                output_hash: None,
947                metadata: std::collections::HashMap::new(),
948            },
949        })
950    }
951}
952
953// =============================================================================
954// Tesseract OCR Provider (Feature-gated, Local)
955// =============================================================================
956//
957// Tesseract is the "boring, reliable" OCR workhorse: classic OCR engine,
958// open source, runs fully locally, and easy to wrap in a Converge-style
959// Provider boundary.
960//
961// =============================================================================
962// WHAT TESSERACT IS
963// =============================================================================
964//
// - An OCR engine originally developed at HP, now maintained as an open-source project.
966// - Takes images (PNG/JPG/TIFF etc) and outputs text, optionally with layout.
967// - Can run with different language packs (English, Swedish, etc).
968// - NOT a "big neural multimodal model" - it's a tool-like OCR system with
969//   modern recognition components but still very deterministic.
970//
971// =============================================================================
972// WHERE IT SHINES
973// =============================================================================
974//
975// - Clean scans, printed documents, forms, invoices, manuals, receipts
976// - High-contrast screenshots
977// - Simple page layouts
978// - Deterministic runs: same input + same version + same settings = same output
979//
980// =============================================================================
981// WHERE IT STRUGGLES
982// =============================================================================
983//
984// - Handwriting (varies, usually weak vs modern DL OCR)
985// - Low-quality photos (blur, perspective, glare)
986// - Complex layouts with tables/columns (unless you guide it well with PSM)
987// - Mixed languages without explicit config
988//
989// If your primary use case is handwriting, camera photos with glare, or dense
990// multi-column PDFs with complex tables, consider a DL-based OCR instead.
991//
992// =============================================================================
993// OUTPUT FORMATS
994// =============================================================================
995//
996// Tesseract can produce:
997// - Plain text: Just the extracted text
998// - TSV: Word-level info with confidence and bounding boxes
999// - hOCR: HTML-like format with bounding boxes (useful for validation)
1000// - ALTO XML: Common in libraries/archives
1001//
1002// For Converge, hOCR/TSV is useful because you can validate "evidence":
1003// - Bounding boxes (where each word came from)
1004// - Per-word confidence
1005// - Page segmentation decisions
1006//
1007// =============================================================================
1008// KEY KNOBS
1009// =============================================================================
1010//
1011// 1. Page Segmentation Mode (PSM) - THE BIGGEST PRACTICAL LEVER
1012//    Tells Tesseract what kind of page it's looking at:
1013//    - 0 = OSD only (orientation and script detection)
1014//    - 1 = Automatic page segmentation with OSD
1015//    - 3 = Fully automatic page segmentation (default)
1016//    - 4 = Single column of variable sizes
1017//    - 6 = Uniform block of text
1018//    - 7 = Single text line
1019//    - 8 = Single word
1020//    - 11 = Sparse text
1021//    If you set the wrong mode, accuracy tanks.
1022//
1023// 2. OCR Engine Mode (OEM)
1024//    Chooses which internal engine strategy to use:
1025//    - 0 = Legacy engine only
1026//    - 1 = Neural nets LSTM engine only
1027//    - 2 = Legacy + LSTM engines
1028//    - 3 = Default (auto-select best available)
1029//    Defaults are usually fine, but pin for reproducibility.
1030//
1031// 3. Language Packs
1032//    Set -l eng / -l swe etc. DON'T leave language detection implicit.
1033//
1034// 4. Preprocessing
1035//    Tesseract is EXTREMELY sensitive to:
1036//    - Resolution (DPI) - 300 DPI is typical minimum
1037//    - Binarization (thresholding)
1038//    - Denoise
1039//    - Deskew
1040//    - Contrast normalization
1041//
1042//    This is where "Rust purity" can shine: do deterministic preprocessing
1043//    in Rust (image crate) and then pass a cleaned image to Tesseract.
1044//
1045// =============================================================================
1046// CONVERGE-STYLE INTEGRATION PATTERN
1047// =============================================================================
1048//
1049// Treat OCR as a provider that returns a PROPOSAL, never truth.
1050//
1051// Shape:
1052//   DocumentBytes → ProposedTextExtraction → Validators → Facts/StructuredFields
1053//
1054// Provider output (recommended):
1055//   - text: extracted text
1056//   - spans: optional words/lines with bounding boxes (from TSV/hOCR)
1057//   - confidence: summary stats (mean, min, histogram)
1058//   - tool_provenance:
1059//       - engine = "tesseract"
1060//       - tesseract_version
1061//       - lang
1062//       - psm, oem
1063//       - preprocess_pipeline_hash
1064//   - trace_link:
1065//       - input hash (bytes)
1066//       - output hash
1067//       - settings hash
1068//
1069// Validators (examples):
1070//   - min_confidence >= 0.75 else STOP or WARN
1071//   - required_fields_present (invoice number/date/amount)
1072//   - layout sanity (if table expected, require hOCR structure)
1073//   - PII redaction gate before storage
1074//
1075// =============================================================================
1076// PACKAGING AND DEPLOYMENT
1077// =============================================================================
1078//
1079// Tesseract is a native dependency. Manage cleanly:
1080//
1081// Best practice for "one binary experience":
1082//   - Ship your Rust binary
1083//   - Vendor/bundle Tesseract in installer (or provide "cz doctor" check)
1084//   - Pin versions for reproducibility
1085//
1086// On macOS: Most people install via Homebrew, but for deterministic
1087// environments, package with your app or use Nix.
1088//
1089// =============================================================================
1090// ARCHITECTURE (Rust-first compromise)
1091// =============================================================================
1092//
1093// Tesseract integration follows the "Rust-first compromise" pattern:
1094// - Pure Converge architecture (providers, traces, gates, promotion)
1095// - OCR runs locally with no cloud data exposure
1096// - Accepts native dependency (tesseract + leptonica)
1097//
1098// Integration options (in order of preference):
1099// 1. Sidecar binary: invoke `tesseract` CLI via std::process::Command
1100// 2. FFI binding: link against libtesseract (more complex, faster)
1101// 3. System dependency: require tesseract installed (brew, apt, nix)
1102//
1103// The provider returns:
1104// - Extracted text
1105// - Confidence summary (per-word statistics)
1106// - Provenance: tool version, language pack, preprocessing params
1107// - Trace link hashes of input bytes and output
1108//
1109// Determinism: Stable for same input image + same Tesseract version.
1110//
1111// When to use:
1112// - Scanned PDFs, clean prints, forms, invoices, receipts
1113// - "Extract text so downstream validators can reason"
1114// - GDPR/data sovereignty requirements (no cloud exposure)
1115//
1116// Future: Can be swapped with Burn/candle-based OCR model without
1117// changing the core contracts (OcrProvider trait).
1118//
1119// =============================================================================
1120
1121/// Configuration for Tesseract OCR provider.
1122///
1123/// # Feature Gate
1124///
1125/// This provider requires the `tesseract` feature:
1126/// ```toml
1127/// [dependencies]
1128/// converge-provider = { version = "0.2", features = ["tesseract"] }
1129/// ```
1130///
1131/// # System Requirements
1132///
1133/// Tesseract must be installed on the system:
1134/// - macOS: `brew install tesseract tesseract-lang`
1135/// - Ubuntu: `apt install tesseract-ocr tesseract-ocr-eng`
1136/// - Windows: Download from <https://github.com/UB-Mannheim/tesseract/wiki>
1137///
1138/// # Key Knobs
1139///
1140/// **Page Segmentation Mode (PSM)** - The biggest practical lever:
1141/// - 0 = OSD only (orientation and script detection)
1142/// - 1 = Automatic page segmentation with OSD
1143/// - 3 = Fully automatic page segmentation (default)
1144/// - 4 = Single column of variable sizes
1145/// - 6 = Uniform block of text
1146/// - 7 = Single text line
1147/// - 8 = Single word
1148/// - 11 = Sparse text
1149///
1150/// If you set the wrong mode, accuracy tanks.
1151///
1152/// **OCR Engine Mode (OEM)**:
1153/// - 0 = Legacy engine only
1154/// - 1 = Neural nets LSTM engine only
1155/// - 2 = Legacy + LSTM engines
1156/// - 3 = Default (auto-select best available)
1157///
1158/// **Preprocessing**: Tesseract is EXTREMELY sensitive to:
1159/// - Resolution (DPI) - 300 DPI is typical minimum
1160/// - Binarization, denoise, deskew, contrast normalization
1161///
1162/// # Example (Future)
1163///
1164/// ```ignore
1165/// use converge_provider::ocr::{TesseractOcrProvider, TesseractConfig, TesseractOutputFormat, OcrRequest};
1166///
1167/// let config = TesseractConfig::new()
1168///     .with_languages(vec!["eng", "deu"])
1169///     .with_dpi(300)
1170///     .with_psm(3)  // Fully automatic
1171///     .with_output_format(TesseractOutputFormat::Tsv);  // Get bounding boxes
1172///
1173/// let provider = TesseractOcrProvider::with_config(config);
1174/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
1175///
1176/// // Provenance includes tool version, language pack, preprocessing
1177/// println!("Tesseract version: {}", result.provenance.version);
1178/// println!("Confidence: {:.2}%", result.confidence.unwrap().mean * 100.0);
1179///
1180/// // Check spans for evidence validation
1181/// for span in &result.spans {
1182///     if span.is_low_confidence(0.75) {
1183///         println!("Low confidence word: {} ({:.0}%)", span.text, span.confidence * 100.0);
1184///     }
1185/// }
1186/// ```
#[derive(Debug, Clone)]
pub struct TesseractConfig {
    /// Path to tesseract binary (default: "tesseract" in PATH).
    pub binary_path: String,
    /// Path to tessdata directory (language files).
    /// `None` (the default) lets Tesseract use its built-in search path.
    pub tessdata_path: Option<String>,
    /// Languages to use (e.g., ["eng", "deu"]). Default: ["eng"].
    /// DON'T leave language detection implicit!
    pub languages: Vec<String>,
    /// DPI for PDF rendering (default: 300).
    /// 300 DPI is typical minimum for good results.
    pub dpi: u32,
    /// Page segmentation mode (PSM).
    /// 0 = OSD only, 1 = auto + OSD, 3 = fully auto (default), 6 = uniform block, etc.
    /// THIS IS THE BIGGEST PRACTICAL LEVER. Wrong mode = bad accuracy.
    pub psm: u32,
    /// OCR engine mode (OEM).
    /// 0 = Legacy, 1 = Neural LSTM, 2 = Legacy + LSTM, 3 = Default (auto).
    /// Pin for reproducibility.
    pub oem: u32,
    /// Output format (text, TSV, hOCR, ALTO). Default: plain text.
    /// Use TSV or hOCR for word-level confidence and bounding boxes.
    pub output_format: TesseractOutputFormat,
    /// Whether to apply preprocessing (deskew, denoise, binarize). Default: true.
    /// Tesseract is EXTREMELY sensitive to image quality.
    pub preprocess: bool,
    /// Timeout in seconds for OCR operation (default: 60).
    pub timeout_secs: u64,
}
1216
1217impl Default for TesseractConfig {
1218    fn default() -> Self {
1219        Self {
1220            binary_path: "tesseract".to_string(),
1221            tessdata_path: None,
1222            languages: vec!["eng".to_string()],
1223            dpi: 300,
1224            psm: 3, // Fully automatic page segmentation
1225            oem: 3, // Default (auto-select best available)
1226            output_format: TesseractOutputFormat::Text,
1227            preprocess: true,
1228            timeout_secs: 60,
1229        }
1230    }
1231}
1232
1233impl TesseractConfig {
1234    /// Creates a new Tesseract configuration with defaults.
1235    #[must_use]
1236    pub fn new() -> Self {
1237        Self::default()
1238    }
1239
1240    /// Sets the path to the tesseract binary.
1241    #[must_use]
1242    pub fn with_binary_path(mut self, path: impl Into<String>) -> Self {
1243        self.binary_path = path.into();
1244        self
1245    }
1246
1247    /// Sets the tessdata directory path.
1248    #[must_use]
1249    pub fn with_tessdata_path(mut self, path: impl Into<String>) -> Self {
1250        self.tessdata_path = Some(path.into());
1251        self
1252    }
1253
1254    /// Sets the languages to use.
1255    #[must_use]
1256    pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1257        self.languages = languages.into_iter().map(Into::into).collect();
1258        self
1259    }
1260
1261    /// Sets the DPI for PDF rendering.
1262    #[must_use]
1263    pub fn with_dpi(mut self, dpi: u32) -> Self {
1264        self.dpi = dpi;
1265        self
1266    }
1267
1268    /// Sets the page segmentation mode.
1269    #[must_use]
1270    pub fn with_psm(mut self, psm: u32) -> Self {
1271        self.psm = psm;
1272        self
1273    }
1274
1275    /// Sets the OCR engine mode.
1276    #[must_use]
1277    pub fn with_oem(mut self, oem: u32) -> Self {
1278        self.oem = oem;
1279        self
1280    }
1281
1282    /// Sets whether to apply preprocessing.
1283    #[must_use]
1284    pub fn with_preprocess(mut self, preprocess: bool) -> Self {
1285        self.preprocess = preprocess;
1286        self
1287    }
1288
1289    /// Sets the timeout in seconds.
1290    #[must_use]
1291    pub fn with_timeout(mut self, secs: u64) -> Self {
1292        self.timeout_secs = secs;
1293        self
1294    }
1295
1296    /// Sets the output format.
1297    /// Use TSV or hOCR for word-level confidence and bounding boxes.
1298    #[must_use]
1299    pub fn with_output_format(mut self, format: TesseractOutputFormat) -> Self {
1300        self.output_format = format;
1301        self
1302    }
1303}
1304
1305/// Tesseract OCR provider (stub - not yet implemented).
1306///
1307/// This is a placeholder for the local Tesseract OCR integration.
1308/// The actual implementation will be feature-gated behind `tesseract`.
1309///
1310/// # Architecture
1311///
1312/// ```text
1313/// TesseractOcrProvider
1314///     │
1315///     ├── Input (PDF/Image bytes)
1316///     │      │
1317///     │      ▼
1318///     ├── Preprocessing (optional)
1319///     │      ├── PDF → Images (pdftoppm/pdf2image)
1320///     │      ├── Deskew (leptonica)
1321///     │      ├── Denoise (leptonica)
1322///     │      └── Binarize (leptonica)
1323///     │      │
1324///     │      ▼
1325///     ├── Tesseract CLI/FFI
1326///     │      │
1327///     │      ▼
1328///     ├── Output
1329///     │      ├── Text (plain or hOCR/ALTO)
1330///     │      ├── Confidence (per-word)
1331///     │      └── Bounding boxes (optional)
1332///     │      │
1333///     │      ▼
1334///     └── OcrResult with Provenance
1335///            ├── text
1336///            ├── confidence summary
1337///            ├── provenance (version, langs, params)
1338///            └── trace hashes (input/output)
1339/// ```
1340///
1341/// # Future Implementation
1342///
1343/// When the `tesseract` feature is enabled:
1344///
1345/// ```ignore
1346/// #[cfg(feature = "tesseract")]
1347/// impl OcrProvider for TesseractOcrProvider {
1348///     fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
1349///         // 1. Hash input for trace links
1350///         // 2. Preprocess if needed (PDF→image, deskew, etc.)
1351///         // 3. Invoke tesseract CLI or FFI
1352///         // 4. Parse output (text + confidence)
1353///         // 5. Hash output for trace links
1354///         // 6. Return OcrResult with full provenance
1355///     }
1356/// }
1357/// ```
#[derive(Debug)]
pub struct TesseractOcrProvider {
    /// Full Tesseract configuration (binary path, languages, PSM/OEM,
    /// preprocessing, timeout); see [`TesseractConfig`] for each knob.
    config: TesseractConfig,
}
1362
1363impl TesseractOcrProvider {
1364    /// Creates a new Tesseract OCR provider with default configuration.
1365    #[must_use]
1366    pub fn new() -> Self {
1367        Self {
1368            config: TesseractConfig::default(),
1369        }
1370    }
1371
1372    /// Creates a provider with custom configuration.
1373    #[must_use]
1374    pub fn with_config(config: TesseractConfig) -> Self {
1375        Self { config }
1376    }
1377
1378    /// Sets the languages to use.
1379    #[must_use]
1380    pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1381        self.config.languages = languages.into_iter().map(Into::into).collect();
1382        self
1383    }
1384
1385    /// Sets the DPI for PDF rendering.
1386    #[must_use]
1387    pub fn with_dpi(mut self, dpi: u32) -> Self {
1388        self.config.dpi = dpi;
1389        self
1390    }
1391
1392    /// Checks if Tesseract is available on the system.
1393    ///
1394    /// # Errors
1395    ///
1396    /// Returns error if Tesseract is not found or cannot be executed.
1397    pub fn check_availability(&self) -> Result<String, OcrError> {
1398        // This is a stub - actual implementation would run `tesseract --version`
1399        Err(OcrError::Api(
1400            "Tesseract provider not yet implemented. Enable the 'tesseract' feature.".to_string(),
1401        ))
1402    }
1403
1404    /// Returns the Tesseract version (stub).
1405    #[must_use]
1406    pub fn version(&self) -> Option<String> {
1407        None // Stub - would parse `tesseract --version` output
1408    }
1409}
1410
1411impl Default for TesseractOcrProvider {
1412    fn default() -> Self {
1413        Self::new()
1414    }
1415}
1416
1417// Stub implementation - will be replaced when feature is implemented
1418impl OcrProvider for TesseractOcrProvider {
1419    fn name(&self) -> &'static str {
1420        "tesseract"
1421    }
1422
1423    fn model(&self) -> &'static str {
1424        "tesseract-stub"
1425    }
1426
1427    fn extract(&self, _request: &OcrRequest) -> Result<OcrResult, OcrError> {
1428        Err(OcrError::Api(
1429            "Tesseract OCR provider not yet implemented. \
1430             This is a placeholder for future local OCR support. \
1431             For now, use MistralOcrProvider, DeepSeekOcrProvider, or LightOnOcrProvider."
1432                .to_string(),
1433        ))
1434    }
1435}
1436
1437// =============================================================================
1438// Helper functions for provenance
1439// =============================================================================
1440
1441/// Computes SHA-256 hash of bytes for trace links.
1442#[must_use]
1443pub fn compute_hash(data: &[u8]) -> String {
1444    use sha2::{Digest, Sha256};
1445    let mut hasher = Sha256::new();
1446    hasher.update(data);
1447    format!("{:x}", hasher.finalize())
1448}
1449
1450/// Computes input/output hashes and returns updated provenance.
1451#[must_use]
1452pub fn with_trace_hashes(
1453    mut provenance: OcrProvenance,
1454    input: &[u8],
1455    output: &str,
1456) -> OcrProvenance {
1457    provenance.input_hash = Some(compute_hash(input));
1458    provenance.output_hash = Some(compute_hash(output.as_bytes()));
1459    provenance
1460}
1461
#[cfg(test)]
mod tests {
    use super::*;

    /// Every builder call should be reflected in the final request.
    #[test]
    fn test_ocr_request_builder() {
        let languages = vec!["en".to_string(), "de".to_string()];

        let request = OcrRequest::from_pdf_bytes(vec![1, 2, 3])
            .with_output_format(OcrOutputFormat::Html)
            .with_languages(languages)
            .with_extract_tables(true)
            .with_extract_images(true)
            .with_page_range(0, 10);

        assert!(request.extract_tables);
        assert!(request.extract_images);
        assert_eq!(request.output_format, OcrOutputFormat::Html);
        assert_eq!(request.languages, vec!["en", "de"]);
        assert_eq!(request.page_range, Some((0, 10)));
    }

    /// Markdown is the documented default output format.
    #[test]
    fn test_ocr_output_format_default() {
        assert_eq!(OcrOutputFormat::default(), OcrOutputFormat::Markdown);
    }
}