converge_provider/ocr.rs
1// Copyright 2024-2026 Reflective Labs
2// SPDX-License-Identifier: MIT
3// See LICENSE file in the project root for full license information.
4
5//! OCR / Document AI providers.
6//!
7//! This module provides integration with OCR models for document understanding,
8//! text extraction, and structured content parsing from PDFs, scans, and images.
9//!
10//! # Available Providers
11//!
12//! - [`MistralOcrProvider`] - Mistral OCR 3 (GDPR-compliant, EU)
13//! - [`DeepSeekOcrProvider`] - `DeepSeek` OCR 2 (Visual Causal Flow)
14//! - [`LightOnOcrProvider`] - LightOnOCR-2-1B (Efficient, open-source)
15//!
16//! # Example
17//!
18//! ```ignore
19//! use converge_provider::ocr::{OcrProvider, MistralOcrProvider, OcrRequest};
20//!
21//! let provider = MistralOcrProvider::from_env()?;
22//! let request = OcrRequest::from_pdf_bytes(pdf_bytes);
23//! let result = provider.extract(&request)?;
24//!
25//! println!("Extracted text: {}", result.text);
//! for table in result.tables {
//!     println!("Table: {:?}", table);
//! }
29//! ```
30
31use serde::{Deserialize, Serialize};
32
/// Error type for OCR operations.
///
/// Shared by every [`OcrProvider`] implementation so callers can match on
/// a single error enum regardless of which backend produced the failure.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// Network/HTTP error (connection failure, timeout, transport issue).
    #[error("Network error: {0}")]
    Network(String),

    /// API authentication error (missing credentials or HTTP 401/403).
    #[error("Authentication error: {0}")]
    Auth(String),

    /// Rate limit exceeded (HTTP 429 from the provider).
    #[error("Rate limit exceeded: {0}")]
    RateLimit(String),

    /// API response parsing error (malformed JSON, bad base64, etc.).
    #[error("Parse error: {0}")]
    Parse(String),

    /// Invalid input (unsupported format, etc.).
    #[error("Invalid input: {0}")]
    InvalidInput(String),

    /// General API error (any non-success status not covered above).
    #[error("API error: {0}")]
    Api(String),
}
60
/// Input type for OCR processing.
///
/// Providers interpret these differently (e.g. `LightOnOcrProvider`
/// rejects `PdfBytes`); see each provider's `extract` implementation.
#[derive(Debug, Clone)]
pub enum OcrInput {
    /// PDF document as bytes.
    PdfBytes(Vec<u8>),
    /// Image as bytes (PNG, JPEG, etc.).
    ImageBytes(Vec<u8>),
    /// URL to a document or image.
    Url(String),
    /// Base64-encoded document or image, without a `data:` URL prefix
    /// (providers prepend their own prefix when needed).
    Base64(String),
}
73
/// OCR extraction request.
///
/// Construct with one of the `from_*` constructors, then refine with the
/// `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct OcrRequest {
    /// Input document or image.
    pub input: OcrInput,
    /// Output format preference.
    pub output_format: OcrOutputFormat,
    /// Language hints (ISO 639-1 codes).
    pub languages: Vec<String>,
    /// Whether to extract tables.
    pub extract_tables: bool,
    /// Whether to extract images/figures.
    pub extract_images: bool,
    /// Page range (for multi-page documents).
    /// NOTE(review): none of the providers in this module currently read
    /// this field — confirm intended semantics before relying on it.
    pub page_range: Option<(usize, usize)>,
}
90
91impl OcrRequest {
92 /// Creates a request from PDF bytes.
93 #[must_use]
94 pub fn from_pdf_bytes(bytes: Vec<u8>) -> Self {
95 Self {
96 input: OcrInput::PdfBytes(bytes),
97 output_format: OcrOutputFormat::Markdown,
98 languages: vec![],
99 extract_tables: true,
100 extract_images: false,
101 page_range: None,
102 }
103 }
104
105 /// Creates a request from image bytes.
106 #[must_use]
107 pub fn from_image_bytes(bytes: Vec<u8>) -> Self {
108 Self {
109 input: OcrInput::ImageBytes(bytes),
110 output_format: OcrOutputFormat::Markdown,
111 languages: vec![],
112 extract_tables: true,
113 extract_images: false,
114 page_range: None,
115 }
116 }
117
118 /// Creates a request from a URL.
119 #[must_use]
120 pub fn from_url(url: impl Into<String>) -> Self {
121 Self {
122 input: OcrInput::Url(url.into()),
123 output_format: OcrOutputFormat::Markdown,
124 languages: vec![],
125 extract_tables: true,
126 extract_images: false,
127 page_range: None,
128 }
129 }
130
131 /// Sets the output format.
132 #[must_use]
133 pub fn with_output_format(mut self, format: OcrOutputFormat) -> Self {
134 self.output_format = format;
135 self
136 }
137
138 /// Adds language hints.
139 #[must_use]
140 pub fn with_languages(mut self, languages: Vec<String>) -> Self {
141 self.languages = languages;
142 self
143 }
144
145 /// Sets whether to extract tables.
146 #[must_use]
147 pub fn with_extract_tables(mut self, extract: bool) -> Self {
148 self.extract_tables = extract;
149 self
150 }
151
152 /// Sets whether to extract images.
153 #[must_use]
154 pub fn with_extract_images(mut self, extract: bool) -> Self {
155 self.extract_images = extract;
156 self
157 }
158
159 /// Sets the page range for multi-page documents.
160 #[must_use]
161 pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
162 self.page_range = Some((start, end));
163 self
164 }
165}
166
/// Output format for OCR results.
///
/// Defaults to [`OcrOutputFormat::Markdown`].
/// NOTE(review): the providers in this module do not currently forward
/// this preference to their APIs — confirm before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OcrOutputFormat {
    /// Plain text.
    Text,
    /// Markdown with structure preserved.
    #[default]
    Markdown,
    /// HTML with table reconstruction.
    Html,
    /// JSON with structured data.
    Json,
}
180
/// A detected table in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrTable {
    /// Page number (0-indexed).
    pub page: usize,
    /// Table as HTML or markdown.
    /// NOTE(review): the Mistral provider currently stores the whole page's
    /// markdown here when a `<table` tag is seen, not the isolated table.
    pub content: String,
    /// Bounding box (x, y, width, height) if available.
    pub bbox: Option<(f64, f64, f64, f64)>,
}
191
/// A detected image/figure in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrImage {
    /// Page number (0-indexed).
    pub page: usize,
    /// Image description or alt text, when the provider supplies one.
    pub description: Option<String>,
    /// Bounding box (x, y, width, height).
    pub bbox: Option<(f64, f64, f64, f64)>,
    /// Base64-encoded image data (if extracted — i.e. when the request
    /// asked for image extraction and the provider returned payloads).
    pub data: Option<String>,
}
204
/// Provenance information for OCR results.
///
/// Captures everything needed for reproducibility and tracing:
/// - Tool version and configuration
/// - Input/output hashes for trace links
/// - Preprocessing parameters applied
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrProvenance {
    /// Provider/tool name (e.g., "tesseract", "mistral-ocr", "deepseek-ocr").
    pub provider: String,
    /// Model or engine version (e.g., "5.3.0", "mistral-ocr-2512").
    /// The remote providers in this module store the model name here.
    pub version: String,
    /// Language pack(s) used (e.g., ["eng", "deu"]).
    pub languages: Vec<String>,
    /// Preprocessing parameters applied.
    pub preprocessing: OcrPreprocessing,
    /// SHA-256 hash of input bytes (for trace links).
    /// Currently left `None` by all providers here (marked TODO there).
    pub input_hash: Option<String>,
    /// SHA-256 hash of output text (for trace links).
    /// Currently left `None` by all providers here.
    pub output_hash: Option<String>,
    /// Additional metadata (tool-specific).
    #[serde(default)]
    pub metadata: std::collections::HashMap<String, String>,
}
229
/// Preprocessing parameters applied before OCR.
///
/// Recorded for provenance; all fields default to "not applied"/unset.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrPreprocessing {
    /// DPI used for rendering (for PDFs).
    pub dpi: Option<u32>,
    /// Whether binarization was applied.
    pub binarized: bool,
    /// Whether deskewing was applied.
    pub deskewed: bool,
    /// Whether noise removal was applied.
    pub denoised: bool,
    /// Page segmentation mode (Tesseract-specific).
    pub psm: Option<u32>,
    /// OCR engine mode (Tesseract-specific).
    pub oem: Option<u32>,
}
246
/// Confidence summary for OCR results.
///
/// Aggregates per-word confidence statistics; populated only by engines
/// that report word-level scores.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrConfidence {
    /// Overall mean confidence (0.0-1.0).
    pub mean: f64,
    /// Minimum word confidence.
    pub min: f64,
    /// Maximum word confidence.
    pub max: f64,
    /// Standard deviation of confidence scores.
    pub std_dev: Option<f64>,
    /// Number of words with confidence below threshold.
    pub low_confidence_words: usize,
    /// Threshold used for low confidence (default 0.6).
    pub threshold: f64,
}
263
/// A word or text span with position and confidence.
///
/// For Tesseract, this comes from TSV or hOCR output.
/// Useful for validation: you can check where each word came from
/// and flag low-confidence regions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrSpan {
    /// The text content of this span.
    pub text: String,
    /// Confidence score (0.0-1.0).
    pub confidence: f64,
    /// Page number (0-indexed).
    pub page: usize,
    /// Bounding box: (x, y, width, height) in pixels.
    pub bbox: Option<(i32, i32, i32, i32)>,
    /// Block number (page segmentation unit).
    pub block_num: Option<i32>,
    /// Paragraph number within block.
    pub par_num: Option<i32>,
    /// Line number within paragraph.
    pub line_num: Option<i32>,
    /// Word number within line.
    pub word_num: Option<i32>,
}
288
289impl OcrSpan {
290 /// Creates a new span with text and confidence.
291 #[must_use]
292 pub fn new(text: impl Into<String>, confidence: f64) -> Self {
293 Self {
294 text: text.into(),
295 confidence,
296 page: 0,
297 bbox: None,
298 block_num: None,
299 par_num: None,
300 line_num: None,
301 word_num: None,
302 }
303 }
304
305 /// Sets the bounding box.
306 #[must_use]
307 pub fn with_bbox(mut self, x: i32, y: i32, w: i32, h: i32) -> Self {
308 self.bbox = Some((x, y, w, h));
309 self
310 }
311
312 /// Sets the page number.
313 #[must_use]
314 pub fn with_page(mut self, page: usize) -> Self {
315 self.page = page;
316 self
317 }
318
319 /// Checks if this span has low confidence (below threshold).
320 #[must_use]
321 pub fn is_low_confidence(&self, threshold: f64) -> bool {
322 self.confidence < threshold
323 }
324}
325
/// Tesseract-specific output format.
///
/// Controls what kind of output to request from Tesseract.
/// NOTE(review): not consumed by any provider visible in this module —
/// presumably used by a feature-gated Tesseract provider defined elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TesseractOutputFormat {
    /// Plain text (default).
    #[default]
    Text,
    /// TSV with word-level confidence and bounding boxes.
    /// Columns: level, `page_num`, `block_num`, `par_num`, `line_num`, `word_num`,
    /// left, top, width, height, conf, text
    Tsv,
    /// hOCR HTML format with bounding boxes.
    /// Useful for downstream table/layout analysis.
    Hocr,
    /// ALTO XML format (common in libraries/archives).
    Alto,
}
344
/// OCR extraction result.
///
/// Produced by [`OcrProvider::extract`]; treat as a proposal to be
/// validated, not ground truth.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrResult {
    /// Extracted text content.
    pub text: String,
    /// Number of pages processed.
    pub pages: usize,
    /// Word/text spans with positions and confidence.
    /// Populated when using TSV or hOCR output format (Tesseract).
    /// Useful for validation: check where each word came from.
    /// Empty for the remote providers in this module.
    #[serde(default)]
    pub spans: Vec<OcrSpan>,
    /// Detected tables.
    pub tables: Vec<OcrTable>,
    /// Detected images/figures.
    pub images: Vec<OcrImage>,
    /// Confidence summary (per-word statistics), when the engine reports it.
    pub confidence: Option<OcrConfidence>,
    /// Processing time in milliseconds.
    pub processing_time_ms: Option<u64>,
    /// Provenance for reproducibility and tracing.
    pub provenance: OcrProvenance,
}
368
/// Trait for OCR providers.
///
/// Implementations are synchronous (the remote providers here use
/// `reqwest::blocking`) and must be shareable across threads.
pub trait OcrProvider: Send + Sync {
    /// Returns the provider name (short static identifier, e.g. "mistral-ocr").
    fn name(&self) -> &'static str;

    /// Returns the model being used.
    fn model(&self) -> &str;

    /// Extracts text and structure from a document.
    ///
    /// # Errors
    ///
    /// Returns error if extraction fails (network, auth, rate limiting,
    /// parsing, or invalid input).
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError>;
}
384
385// =============================================================================
386// Mistral OCR Provider
387// =============================================================================
388
/// Mistral OCR 3 provider.
///
/// Mistral OCR 3 is designed for document AI at scale, handling forms, invoices,
/// complex tables, handwriting, and low-quality scans. It outputs structured
/// text/HTML suitable for RAG and agent workflows.
///
/// # Features
/// - 74% win rate over OCR 2 on forms, handwriting, tables
/// - Markdown output with HTML table reconstruction
/// - GDPR-compliant (France)
/// - $2 per 1000 pages ($1 with batch API)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{MistralOcrProvider, OcrRequest};
///
/// let provider = MistralOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct MistralOcrProvider {
    // API key wrapped in SecretString; only exposed when building the
    // Authorization header.
    api_key: crate::secret::SecretString,
    // Model identifier sent in the request body (e.g. "mistral-ocr-latest").
    model: String,
    // API base URL; overridable via `with_base_url`.
    base_url: String,
    // Blocking HTTP client reused across requests.
    client: reqwest::blocking::Client,
}
415
416impl MistralOcrProvider {
417 /// Creates a new Mistral OCR provider.
418 #[must_use]
419 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
420 Self {
421 api_key: crate::secret::SecretString::new(api_key),
422 model: model.into(),
423 base_url: "https://api.mistral.ai/v1".to_string(),
424 client: reqwest::blocking::Client::new(),
425 }
426 }
427
428 /// Creates a provider using the `MISTRAL_API_KEY` environment variable.
429 ///
430 /// Uses `mistral-ocr-latest` as the default model.
431 ///
432 /// # Errors
433 ///
434 /// Returns error if the environment variable is not set.
435 pub fn from_env() -> Result<Self, OcrError> {
436 let api_key = std::env::var("MISTRAL_API_KEY").map_err(|_| {
437 OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string())
438 })?;
439 Ok(Self::new(api_key, "mistral-ocr-latest"))
440 }
441
442 /// Creates a provider with a specific model.
443 ///
444 /// # Errors
445 ///
446 /// Returns error if the environment variable is not set.
447 pub fn from_env_with_model(model: impl Into<String>) -> Result<Self, OcrError> {
448 let api_key = std::env::var("MISTRAL_API_KEY").map_err(|_| {
449 OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string())
450 })?;
451 Ok(Self::new(api_key, model))
452 }
453
454 /// Uses a custom base URL.
455 #[must_use]
456 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
457 self.base_url = url.into();
458 self
459 }
460}
461
impl OcrProvider for MistralOcrProvider {
    fn name(&self) -> &'static str {
        "mistral-ocr"
    }

    fn model(&self) -> &str {
        &self.model
    }

    /// Sends the document to Mistral's `/ocr` endpoint and converts the
    /// JSON response into an [`OcrResult`].
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
        // Build the request body based on input type: raw bytes are inlined
        // as data: URLs, remote URLs are passed through as-is.
        let document = match &request.input {
            OcrInput::PdfBytes(bytes) => {
                serde_json::json!({
                    "type": "document_url",
                    "document_url": format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
                })
            }
            OcrInput::ImageBytes(bytes) => {
                serde_json::json!({
                    "type": "image_url",
                    "image_url": format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
                })
            }
            OcrInput::Url(url) => {
                // Route by file extension: ".pdf" → document_url, anything
                // else is treated as an image URL.
                if std::path::Path::new(url)
                    .extension()
                    .is_some_and(|ext| ext.eq_ignore_ascii_case("pdf"))
                {
                    serde_json::json!({
                        "type": "document_url",
                        "document_url": url
                    })
                } else {
                    serde_json::json!({
                        "type": "image_url",
                        "image_url": url
                    })
                }
            }
            OcrInput::Base64(data) => {
                // NOTE(review): bare base64 input is assumed to be a PDF —
                // confirm callers never pass base64-encoded images here.
                serde_json::json!({
                    "type": "document_url",
                    "document_url": format!("data:application/pdf;base64,{}", data)
                })
            }
        };

        let body = serde_json::json!({
            "model": self.model,
            "document": document,
            "include_image_base64": request.extract_images
        });

        let response = self
            .client
            .post(format!("{}/ocr", self.base_url))
            .header("Authorization", format!("Bearer {}", self.api_key.expose()))
            .header("Content-Type", "application/json")
            .json(&body)
            .send()
            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;

        // Map HTTP failure classes onto the typed error variants.
        let status = response.status();
        if !status.is_success() {
            let error_text = response.text().unwrap_or_default();
            return match status.as_u16() {
                401 | 403 => Err(OcrError::Auth(format!(
                    "Authentication failed: {error_text}"
                ))),
                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
                _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
            };
        }

        let api_response: MistralOcrResponse = response
            .json()
            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;

        // Convert to our result format: concatenate per-page markdown and
        // collect tables/images page by page.
        let mut tables = vec![];
        let mut images = vec![];
        let mut text = String::new();

        for (page_idx, page) in api_response.pages.iter().enumerate() {
            text.push_str(&page.markdown);
            text.push_str("\n\n");

            // Extract tables from markdown (simplified heuristic).
            // In practice, Mistral returns tables as HTML within markdown,
            // so the entire page markdown is stored when a <table tag appears.
            if page.markdown.contains("<table") {
                tables.push(OcrTable {
                    page: page_idx,
                    content: page.markdown.clone(),
                    bbox: None,
                });
            }

            // Extract images if present (base64 payloads arrive only when
            // `include_image_base64` was requested above).
            for img in &page.images {
                images.push(OcrImage {
                    page: page_idx,
                    description: None,
                    bbox: None,
                    data: img.image_base64.clone(),
                });
            }
        }

        Ok(OcrResult {
            text: text.trim().to_string(),
            pages: api_response.pages.len(),
            spans: vec![], // Mistral OCR doesn't provide word-level spans
            tables,
            images,
            confidence: None,
            processing_time_ms: None,
            provenance: OcrProvenance {
                provider: "mistral-ocr".to_string(),
                version: self.model.clone(),
                languages: request.languages.clone(),
                preprocessing: OcrPreprocessing::default(),
                input_hash: None, // TODO: compute from input
                output_hash: None, // TODO: compute from output
                metadata: std::collections::HashMap::new(),
            },
        })
    }
}
591
/// Wire format of a successful Mistral `/ocr` response (only the fields
/// this module consumes).
#[derive(Debug, Deserialize)]
struct MistralOcrResponse {
    /// One entry per processed page.
    pages: Vec<MistralOcrPage>,
}

/// A single page in the Mistral OCR response.
#[derive(Debug, Deserialize)]
struct MistralOcrPage {
    /// Page content as markdown (may embed HTML tables).
    markdown: String,
    /// Images detected on the page; absent field deserializes to empty.
    #[serde(default)]
    images: Vec<MistralOcrImage>,
}

/// An image entry within a Mistral OCR page.
#[derive(Debug, Deserialize)]
struct MistralOcrImage {
    /// Base64 payload, present only when `include_image_base64` was set.
    #[serde(default)]
    image_base64: Option<String>,
}
609
610// =============================================================================
611// DeepSeek OCR Provider
612// =============================================================================
613
/// `DeepSeek` OCR 2 provider.
///
/// `DeepSeek` OCR 2 is a 3B-parameter vision-language model with the `DeepEncoder` V2
/// architecture featuring Visual Causal Flow for human-like reading order.
///
/// # Features
/// - SOTA on document understanding benchmarks
/// - Human-like visual reading order
/// - Semantic visual reasoning
/// - 16x token compression
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{DeepSeekOcrProvider, OcrRequest};
///
/// let provider = DeepSeekOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_image_bytes(image_bytes))?;
/// ```
pub struct DeepSeekOcrProvider {
    // API key wrapped in SecretString; only exposed when building the
    // Authorization header.
    api_key: crate::secret::SecretString,
    // Model identifier sent in the chat request (e.g. "deepseek-ocr-2").
    model: String,
    // API base URL; overridable via `with_base_url`.
    base_url: String,
    // Blocking HTTP client reused across requests.
    client: reqwest::blocking::Client,
}
639
640impl DeepSeekOcrProvider {
641 /// Creates a new `DeepSeek` OCR provider.
642 #[must_use]
643 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
644 Self {
645 api_key: crate::secret::SecretString::new(api_key),
646 model: model.into(),
647 base_url: "https://api.deepseek.com/v1".to_string(),
648 client: reqwest::blocking::Client::new(),
649 }
650 }
651
652 /// Creates a provider using the `DEEPSEEK_API_KEY` environment variable.
653 ///
654 /// Uses `deepseek-ocr-2` as the default model.
655 ///
656 /// # Errors
657 ///
658 /// Returns error if the environment variable is not set.
659 pub fn from_env() -> Result<Self, OcrError> {
660 let api_key = std::env::var("DEEPSEEK_API_KEY").map_err(|_| {
661 OcrError::Auth("DEEPSEEK_API_KEY environment variable not set".to_string())
662 })?;
663 Ok(Self::new(api_key, "deepseek-ocr-2"))
664 }
665
666 /// Uses a custom base URL.
667 #[must_use]
668 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
669 self.base_url = url.into();
670 self
671 }
672}
673
impl OcrProvider for DeepSeekOcrProvider {
    fn name(&self) -> &'static str {
        "deepseek-ocr"
    }

    fn model(&self) -> &str {
        &self.model
    }

    /// Runs OCR through `DeepSeek`'s vision chat endpoint and returns the
    /// model's transcription as an [`OcrResult`].
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
        // DeepSeek OCR uses a chat-like API with vision capabilities;
        // every input is rendered down to a single image_url string.
        let image_content = match &request.input {
            OcrInput::ImageBytes(bytes) => {
                format!(
                    "data:image/png;base64,{}",
                    base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
                )
            }
            OcrInput::PdfBytes(bytes) => {
                // DeepSeek OCR expects images; for PDF, we'd need to convert pages.
                // For now, treat as base64 document.
                // NOTE(review): the API may reject application/pdf data URLs —
                // confirm, or convert PDF pages to images upstream.
                format!(
                    "data:application/pdf;base64,{}",
                    base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
                )
            }
            OcrInput::Url(url) => url.clone(),
            // Bare base64 payloads are assumed to be PNG images here.
            OcrInput::Base64(data) => format!("data:image/png;base64,{data}"),
        };

        // Single-turn chat: the image plus a fixed markdown-extraction prompt.
        let body = serde_json::json!({
            "model": self.model,
            "messages": [{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_content
                        }
                    },
                    {
                        "type": "text",
                        "text": "Extract all text from this document, preserving structure, tables, and reading order. Output in markdown format."
                    }
                ]
            }],
            "max_tokens": 8192
        });

        let response = self
            .client
            .post(format!("{}/chat/completions", self.base_url))
            .header("Authorization", format!("Bearer {}", self.api_key.expose()))
            .header("Content-Type", "application/json")
            .json(&body)
            .send()
            .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;

        // Map HTTP failure classes onto the typed error variants.
        let status = response.status();
        if !status.is_success() {
            let error_text = response.text().unwrap_or_default();
            return match status.as_u16() {
                401 | 403 => Err(OcrError::Auth(format!(
                    "Authentication failed: {error_text}"
                ))),
                429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
                _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
            };
        }

        let api_response: DeepSeekOcrResponse = response
            .json()
            .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;

        // Take the first choice's content; falls back to an empty string
        // when the model returned no content.
        let text = api_response
            .choices
            .first()
            .and_then(|c| c.message.content.clone())
            .unwrap_or_default();

        Ok(OcrResult {
            text,
            pages: 1, // DeepSeek processes one image at a time
            spans: vec![], // DeepSeek OCR doesn't provide word-level spans
            tables: vec![],
            images: vec![],
            confidence: None,
            processing_time_ms: None,
            provenance: OcrProvenance {
                provider: "deepseek-ocr".to_string(),
                version: self.model.clone(),
                languages: request.languages.clone(),
                preprocessing: OcrPreprocessing::default(),
                input_hash: None,
                output_hash: None,
                metadata: std::collections::HashMap::new(),
            },
        })
    }
}
775
/// Wire format of a DeepSeek chat-completions response (only the fields
/// this module consumes).
#[derive(Debug, Deserialize)]
struct DeepSeekOcrResponse {
    /// Completion choices; only the first is read.
    choices: Vec<DeepSeekOcrChoice>,
}

/// A single completion choice.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrChoice {
    message: DeepSeekOcrMessage,
}

/// The assistant message carrying the extracted text.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrMessage {
    /// Extracted text; may be absent in the response.
    content: Option<String>,
}
790
791// =============================================================================
792// LightOn OCR Provider
793// =============================================================================
794
/// LightOnOCR-2-1B provider.
///
/// LightOnOCR-2 is an efficient 1B-parameter vision-language model that achieves
/// SOTA on OlmOCR-Bench while being 9x smaller than competitors.
///
/// # Features
/// - 1B parameters, 9x smaller than competitors
/// - 5.71 pages/s on H100 (~493k pages/day)
/// - <$0.01 per 1000 pages
/// - Apache 2.0 license, open weights
/// - GDPR-compliant (France)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{LightOnOcrProvider, OcrRequest};
///
/// let provider = LightOnOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct LightOnOcrProvider {
    // HuggingFace API key wrapped in SecretString; only exposed when
    // building the Authorization header.
    api_key: crate::secret::SecretString,
    // HF model repo id (e.g. "lightonai/LightOnOCR-2-1B").
    model: String,
    // Inference API base URL; overridable via `with_base_url`.
    base_url: String,
    // Blocking HTTP client reused across requests.
    client: reqwest::blocking::Client,
}
821
822impl LightOnOcrProvider {
823 /// Creates a new `LightOn` OCR provider.
824 #[must_use]
825 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
826 Self {
827 api_key: crate::secret::SecretString::new(api_key),
828 model: model.into(),
829 base_url: "https://api-inference.huggingface.co/models".to_string(),
830 client: reqwest::blocking::Client::new(),
831 }
832 }
833
834 /// Creates a provider using the `HUGGINGFACE_API_KEY` environment variable.
835 ///
836 /// Uses `lightonai/LightOnOCR-2-1B` as the default model.
837 ///
838 /// # Errors
839 ///
840 /// Returns error if the environment variable is not set.
841 pub fn from_env() -> Result<Self, OcrError> {
842 let api_key = std::env::var("HUGGINGFACE_API_KEY").map_err(|_| {
843 OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string())
844 })?;
845 Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B"))
846 }
847
848 /// Creates a provider with the bbox variant for figure extraction.
849 ///
850 /// # Errors
851 ///
852 /// Returns error if the environment variable is not set.
853 pub fn from_env_with_bbox() -> Result<Self, OcrError> {
854 let api_key = std::env::var("HUGGINGFACE_API_KEY").map_err(|_| {
855 OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string())
856 })?;
857 Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B-bbox"))
858 }
859
860 /// Uses a custom base URL.
861 #[must_use]
862 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
863 self.base_url = url.into();
864 self
865 }
866}
867
868impl OcrProvider for LightOnOcrProvider {
869 fn name(&self) -> &'static str {
870 "lighton-ocr"
871 }
872
873 fn model(&self) -> &str {
874 &self.model
875 }
876
877 fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
878 // LightOnOCR uses HuggingFace Inference API
879 let image_bytes = match &request.input {
880 OcrInput::ImageBytes(bytes) => bytes.clone(),
881 OcrInput::PdfBytes(_) => {
882 return Err(OcrError::InvalidInput(
883 "LightOnOCR requires image input. Convert PDF pages to images first."
884 .to_string(),
885 ));
886 }
887 OcrInput::Url(url) => {
888 // Fetch the image
889 let response = self
890 .client
891 .get(url)
892 .send()
893 .map_err(|e| OcrError::Network(format!("Failed to fetch image: {e}")))?;
894 response
895 .bytes()
896 .map_err(|e| OcrError::Network(format!("Failed to read image: {e}")))?
897 .to_vec()
898 }
899 OcrInput::Base64(data) => {
900 base64::Engine::decode(&base64::engine::general_purpose::STANDARD, data)
901 .map_err(|e| OcrError::Parse(format!("Invalid base64: {e}")))?
902 }
903 };
904
905 let response = self
906 .client
907 .post(format!("{}/{}", self.base_url, self.model))
908 .header("Authorization", format!("Bearer {}", self.api_key.expose()))
909 .header("Content-Type", "application/octet-stream")
910 .body(image_bytes)
911 .send()
912 .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
913
914 let status = response.status();
915 if !status.is_success() {
916 let error_text = response.text().unwrap_or_default();
917 return match status.as_u16() {
918 401 | 403 => Err(OcrError::Auth(format!(
919 "Authentication failed: {error_text}"
920 ))),
921 429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
922 503 => Err(OcrError::Api("Model is loading, please retry".to_string())),
923 _ => Err(OcrError::Api(format!("API error ({status}): {error_text}"))),
924 };
925 }
926
927 // LightOnOCR returns the extracted text directly
928 let text = response
929 .text()
930 .map_err(|e| OcrError::Parse(format!("Failed to read response: {e}")))?;
931
932 Ok(OcrResult {
933 text,
934 pages: 1,
935 spans: vec![], // LightOnOCR doesn't provide word-level spans via HF API
936 tables: vec![],
937 images: vec![],
938 confidence: None,
939 processing_time_ms: None,
940 provenance: OcrProvenance {
941 provider: "lighton-ocr".to_string(),
942 version: self.model.clone(),
943 languages: request.languages.clone(),
944 preprocessing: OcrPreprocessing::default(),
945 input_hash: None,
946 output_hash: None,
947 metadata: std::collections::HashMap::new(),
948 },
949 })
950 }
951}
952
953// =============================================================================
954// Tesseract OCR Provider (Feature-gated, Local)
955// =============================================================================
956//
957// Tesseract is the "boring, reliable" OCR workhorse: classic OCR engine,
958// open source, runs fully locally, and easy to wrap in a Converge-style
959// Provider boundary.
960//
961// =============================================================================
962// WHAT TESSERACT IS
963// =============================================================================
964//
965// - An OCR engine originally from HP, maintained under open source.
966// - Takes images (PNG/JPG/TIFF etc) and outputs text, optionally with layout.
967// - Can run with different language packs (English, Swedish, etc).
968// - NOT a "big neural multimodal model" - it's a tool-like OCR system with
969// modern recognition components but still very deterministic.
970//
971// =============================================================================
972// WHERE IT SHINES
973// =============================================================================
974//
975// - Clean scans, printed documents, forms, invoices, manuals, receipts
976// - High-contrast screenshots
977// - Simple page layouts
978// - Deterministic runs: same input + same version + same settings = same output
979//
980// =============================================================================
981// WHERE IT STRUGGLES
982// =============================================================================
983//
984// - Handwriting (varies, usually weak vs modern DL OCR)
985// - Low-quality photos (blur, perspective, glare)
986// - Complex layouts with tables/columns (unless you guide it well with PSM)
987// - Mixed languages without explicit config
988//
989// If your primary use case is handwriting, camera photos with glare, or dense
990// multi-column PDFs with complex tables, consider a DL-based OCR instead.
991//
992// =============================================================================
993// OUTPUT FORMATS
994// =============================================================================
995//
996// Tesseract can produce:
997// - Plain text: Just the extracted text
998// - TSV: Word-level info with confidence and bounding boxes
999// - hOCR: HTML-like format with bounding boxes (useful for validation)
1000// - ALTO XML: Common in libraries/archives
1001//
1002// For Converge, hOCR/TSV is useful because you can validate "evidence":
1003// - Bounding boxes (where each word came from)
1004// - Per-word confidence
1005// - Page segmentation decisions
1006//
1007// =============================================================================
1008// KEY KNOBS
1009// =============================================================================
1010//
1011// 1. Page Segmentation Mode (PSM) - THE BIGGEST PRACTICAL LEVER
1012// Tells Tesseract what kind of page it's looking at:
1013// - 0 = OSD only (orientation and script detection)
1014// - 1 = Automatic page segmentation with OSD
1015// - 3 = Fully automatic page segmentation (default)
1016// - 4 = Single column of variable sizes
1017// - 6 = Uniform block of text
1018// - 7 = Single text line
1019// - 8 = Single word
1020// - 11 = Sparse text
1021// If you set the wrong mode, accuracy tanks.
1022//
1023// 2. OCR Engine Mode (OEM)
1024// Chooses which internal engine strategy to use:
1025// - 0 = Legacy engine only
1026// - 1 = Neural nets LSTM engine only
1027// - 2 = Legacy + LSTM engines
1028// - 3 = Default (auto-select best available)
1029// Defaults are usually fine, but pin for reproducibility.
1030//
1031// 3. Language Packs
1032// Set -l eng / -l swe etc. DON'T leave language detection implicit.
1033//
1034// 4. Preprocessing
1035// Tesseract is EXTREMELY sensitive to:
1036// - Resolution (DPI) - 300 DPI is typical minimum
1037// - Binarization (thresholding)
1038// - Denoise
1039// - Deskew
1040// - Contrast normalization
1041//
1042// This is where "Rust purity" can shine: do deterministic preprocessing
1043// in Rust (image crate) and then pass a cleaned image to Tesseract.
1044//
1045// =============================================================================
1046// CONVERGE-STYLE INTEGRATION PATTERN
1047// =============================================================================
1048//
1049// Treat OCR as a provider that returns a PROPOSAL, never truth.
1050//
1051// Shape:
1052// DocumentBytes → ProposedTextExtraction → Validators → Facts/StructuredFields
1053//
1054// Provider output (recommended):
1055// - text: extracted text
1056// - spans: optional words/lines with bounding boxes (from TSV/hOCR)
1057// - confidence: summary stats (mean, min, histogram)
1058// - tool_provenance:
1059// - engine = "tesseract"
1060// - tesseract_version
1061// - lang
1062// - psm, oem
1063// - preprocess_pipeline_hash
1064// - trace_link:
1065// - input hash (bytes)
1066// - output hash
1067// - settings hash
1068//
1069// Validators (examples):
1070// - min_confidence >= 0.75 else STOP or WARN
1071// - required_fields_present (invoice number/date/amount)
1072// - layout sanity (if table expected, require hOCR structure)
1073// - PII redaction gate before storage
1074//
1075// =============================================================================
1076// PACKAGING AND DEPLOYMENT
1077// =============================================================================
1078//
1079// Tesseract is a native dependency. Manage cleanly:
1080//
1081// Best practice for "one binary experience":
1082// - Ship your Rust binary
1083// - Vendor/bundle Tesseract in installer (or provide "cz doctor" check)
1084// - Pin versions for reproducibility
1085//
1086// On macOS: Most people install via Homebrew, but for deterministic
1087// environments, package with your app or use Nix.
1088//
1089// =============================================================================
1090// ARCHITECTURE (Rust-first compromise)
1091// =============================================================================
1092//
1093// Tesseract integration follows the "Rust-first compromise" pattern:
1094// - Pure Converge architecture (providers, traces, gates, promotion)
1095// - OCR runs locally with no cloud data exposure
1096// - Accepts native dependency (tesseract + leptonica)
1097//
1098// Integration options (in order of preference):
1099// 1. Sidecar binary: invoke `tesseract` CLI via std::process::Command
1100// 2. FFI binding: link against libtesseract (more complex, faster)
1101// 3. System dependency: require tesseract installed (brew, apt, nix)
1102//
1103// The provider returns:
1104// - Extracted text
1105// - Confidence summary (per-word statistics)
1106// - Provenance: tool version, language pack, preprocessing params
// - Trace link: hashes of input bytes and output
1108//
1109// Determinism: Stable for same input image + same Tesseract version.
1110//
1111// When to use:
1112// - Scanned PDFs, clean prints, forms, invoices, receipts
1113// - "Extract text so downstream validators can reason"
1114// - GDPR/data sovereignty requirements (no cloud exposure)
1115//
1116// Future: Can be swapped with Burn/candle-based OCR model without
1117// changing the core contracts (OcrProvider trait).
1118//
1119// =============================================================================
1120
1121/// Configuration for Tesseract OCR provider.
1122///
1123/// # Feature Gate
1124///
1125/// This provider requires the `tesseract` feature:
1126/// ```toml
1127/// [dependencies]
1128/// converge-provider = { version = "0.2", features = ["tesseract"] }
1129/// ```
1130///
1131/// # System Requirements
1132///
1133/// Tesseract must be installed on the system:
1134/// - macOS: `brew install tesseract tesseract-lang`
1135/// - Ubuntu: `apt install tesseract-ocr tesseract-ocr-eng`
1136/// - Windows: Download from <https://github.com/UB-Mannheim/tesseract/wiki>
1137///
1138/// # Key Knobs
1139///
1140/// **Page Segmentation Mode (PSM)** - The biggest practical lever:
1141/// - 0 = OSD only (orientation and script detection)
1142/// - 1 = Automatic page segmentation with OSD
1143/// - 3 = Fully automatic page segmentation (default)
1144/// - 4 = Single column of variable sizes
1145/// - 6 = Uniform block of text
1146/// - 7 = Single text line
1147/// - 8 = Single word
1148/// - 11 = Sparse text
1149///
1150/// If you set the wrong mode, accuracy tanks.
1151///
1152/// **OCR Engine Mode (OEM)**:
1153/// - 0 = Legacy engine only
1154/// - 1 = Neural nets LSTM engine only
1155/// - 2 = Legacy + LSTM engines
1156/// - 3 = Default (auto-select best available)
1157///
1158/// **Preprocessing**: Tesseract is EXTREMELY sensitive to:
1159/// - Resolution (DPI) - 300 DPI is typical minimum
1160/// - Binarization, denoise, deskew, contrast normalization
1161///
1162/// # Example (Future)
1163///
1164/// ```ignore
1165/// use converge_provider::ocr::{TesseractOcrProvider, TesseractConfig, TesseractOutputFormat, OcrRequest};
1166///
1167/// let config = TesseractConfig::new()
1168/// .with_languages(vec!["eng", "deu"])
1169/// .with_dpi(300)
1170/// .with_psm(3) // Fully automatic
1171/// .with_output_format(TesseractOutputFormat::Tsv); // Get bounding boxes
1172///
1173/// let provider = TesseractOcrProvider::with_config(config);
1174/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
1175///
1176/// // Provenance includes tool version, language pack, preprocessing
1177/// println!("Tesseract version: {}", result.provenance.version);
1178/// println!("Confidence: {:.2}%", result.confidence.unwrap().mean * 100.0);
1179///
1180/// // Check spans for evidence validation
1181/// for span in &result.spans {
1182/// if span.is_low_confidence(0.75) {
1183/// println!("Low confidence word: {} ({:.0}%)", span.text, span.confidence * 100.0);
1184/// }
1185/// }
1186/// ```
#[derive(Debug, Clone)]
pub struct TesseractConfig {
    /// Path to tesseract binary (default: "tesseract" in PATH).
    pub binary_path: String,
    /// Path to tessdata directory (language files).
    /// Default: `None` (Tesseract resolves its own data directory).
    pub tessdata_path: Option<String>,
    /// Languages to use (e.g., ["eng", "deu"]). Default: `["eng"]`.
    /// DON'T leave language detection implicit!
    pub languages: Vec<String>,
    /// DPI for PDF rendering (default: 300).
    /// 300 DPI is typical minimum for good results.
    pub dpi: u32,
    /// Page segmentation mode (PSM). Default: 3 (fully automatic).
    /// 0 = OSD only, 1 = auto + OSD, 3 = fully auto (default), 6 = uniform block, etc.
    /// THIS IS THE BIGGEST PRACTICAL LEVER. Wrong mode = bad accuracy.
    pub psm: u32,
    /// OCR engine mode (OEM). Default: 3 (auto-select best available).
    /// 0 = Legacy, 1 = Neural LSTM, 2 = Legacy + LSTM, 3 = Default (auto).
    /// Pin for reproducibility.
    pub oem: u32,
    /// Output format (text, TSV, hOCR, ALTO). Default: plain text.
    /// Use TSV or hOCR for word-level confidence and bounding boxes.
    pub output_format: TesseractOutputFormat,
    /// Whether to apply preprocessing (deskew, denoise, binarize). Default: true.
    /// Tesseract is EXTREMELY sensitive to image quality.
    pub preprocess: bool,
    /// Timeout in seconds for OCR operation (default: 60).
    pub timeout_secs: u64,
}
1216
1217impl Default for TesseractConfig {
1218 fn default() -> Self {
1219 Self {
1220 binary_path: "tesseract".to_string(),
1221 tessdata_path: None,
1222 languages: vec!["eng".to_string()],
1223 dpi: 300,
1224 psm: 3, // Fully automatic page segmentation
1225 oem: 3, // Default (auto-select best available)
1226 output_format: TesseractOutputFormat::Text,
1227 preprocess: true,
1228 timeout_secs: 60,
1229 }
1230 }
1231}
1232
1233impl TesseractConfig {
1234 /// Creates a new Tesseract configuration with defaults.
1235 #[must_use]
1236 pub fn new() -> Self {
1237 Self::default()
1238 }
1239
1240 /// Sets the path to the tesseract binary.
1241 #[must_use]
1242 pub fn with_binary_path(mut self, path: impl Into<String>) -> Self {
1243 self.binary_path = path.into();
1244 self
1245 }
1246
1247 /// Sets the tessdata directory path.
1248 #[must_use]
1249 pub fn with_tessdata_path(mut self, path: impl Into<String>) -> Self {
1250 self.tessdata_path = Some(path.into());
1251 self
1252 }
1253
1254 /// Sets the languages to use.
1255 #[must_use]
1256 pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1257 self.languages = languages.into_iter().map(Into::into).collect();
1258 self
1259 }
1260
1261 /// Sets the DPI for PDF rendering.
1262 #[must_use]
1263 pub fn with_dpi(mut self, dpi: u32) -> Self {
1264 self.dpi = dpi;
1265 self
1266 }
1267
1268 /// Sets the page segmentation mode.
1269 #[must_use]
1270 pub fn with_psm(mut self, psm: u32) -> Self {
1271 self.psm = psm;
1272 self
1273 }
1274
1275 /// Sets the OCR engine mode.
1276 #[must_use]
1277 pub fn with_oem(mut self, oem: u32) -> Self {
1278 self.oem = oem;
1279 self
1280 }
1281
1282 /// Sets whether to apply preprocessing.
1283 #[must_use]
1284 pub fn with_preprocess(mut self, preprocess: bool) -> Self {
1285 self.preprocess = preprocess;
1286 self
1287 }
1288
1289 /// Sets the timeout in seconds.
1290 #[must_use]
1291 pub fn with_timeout(mut self, secs: u64) -> Self {
1292 self.timeout_secs = secs;
1293 self
1294 }
1295
1296 /// Sets the output format.
1297 /// Use TSV or hOCR for word-level confidence and bounding boxes.
1298 #[must_use]
1299 pub fn with_output_format(mut self, format: TesseractOutputFormat) -> Self {
1300 self.output_format = format;
1301 self
1302 }
1303}
1304
1305/// Tesseract OCR provider (stub - not yet implemented).
1306///
1307/// This is a placeholder for the local Tesseract OCR integration.
1308/// The actual implementation will be feature-gated behind `tesseract`.
1309///
1310/// # Architecture
1311///
1312/// ```text
1313/// TesseractOcrProvider
1314/// │
1315/// ├── Input (PDF/Image bytes)
1316/// │ │
1317/// │ ▼
1318/// ├── Preprocessing (optional)
1319/// │ ├── PDF → Images (pdftoppm/pdf2image)
1320/// │ ├── Deskew (leptonica)
1321/// │ ├── Denoise (leptonica)
1322/// │ └── Binarize (leptonica)
1323/// │ │
1324/// │ ▼
1325/// ├── Tesseract CLI/FFI
1326/// │ │
1327/// │ ▼
1328/// ├── Output
1329/// │ ├── Text (plain or hOCR/ALTO)
1330/// │ ├── Confidence (per-word)
1331/// │ └── Bounding boxes (optional)
1332/// │ │
1333/// │ ▼
1334/// └── OcrResult with Provenance
1335/// ├── text
1336/// ├── confidence summary
1337/// ├── provenance (version, langs, params)
1338/// └── trace hashes (input/output)
1339/// ```
1340///
1341/// # Future Implementation
1342///
1343/// When the `tesseract` feature is enabled:
1344///
1345/// ```ignore
1346/// #[cfg(feature = "tesseract")]
1347/// impl OcrProvider for TesseractOcrProvider {
1348/// fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
1349/// // 1. Hash input for trace links
1350/// // 2. Preprocess if needed (PDF→image, deskew, etc.)
1351/// // 3. Invoke tesseract CLI or FFI
1352/// // 4. Parse output (text + confidence)
1353/// // 5. Hash output for trace links
1354/// // 6. Return OcrResult with full provenance
1355/// }
1356/// }
1357/// ```
#[derive(Debug)]
pub struct TesseractOcrProvider {
    // Tesseract invocation settings (binary path, languages, PSM/OEM, DPI, etc.).
    config: TesseractConfig,
}
1362
1363impl TesseractOcrProvider {
1364 /// Creates a new Tesseract OCR provider with default configuration.
1365 #[must_use]
1366 pub fn new() -> Self {
1367 Self {
1368 config: TesseractConfig::default(),
1369 }
1370 }
1371
1372 /// Creates a provider with custom configuration.
1373 #[must_use]
1374 pub fn with_config(config: TesseractConfig) -> Self {
1375 Self { config }
1376 }
1377
1378 /// Sets the languages to use.
1379 #[must_use]
1380 pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1381 self.config.languages = languages.into_iter().map(Into::into).collect();
1382 self
1383 }
1384
1385 /// Sets the DPI for PDF rendering.
1386 #[must_use]
1387 pub fn with_dpi(mut self, dpi: u32) -> Self {
1388 self.config.dpi = dpi;
1389 self
1390 }
1391
1392 /// Checks if Tesseract is available on the system.
1393 ///
1394 /// # Errors
1395 ///
1396 /// Returns error if Tesseract is not found or cannot be executed.
1397 pub fn check_availability(&self) -> Result<String, OcrError> {
1398 // This is a stub - actual implementation would run `tesseract --version`
1399 Err(OcrError::Api(
1400 "Tesseract provider not yet implemented. Enable the 'tesseract' feature.".to_string(),
1401 ))
1402 }
1403
1404 /// Returns the Tesseract version (stub).
1405 #[must_use]
1406 pub fn version(&self) -> Option<String> {
1407 None // Stub - would parse `tesseract --version` output
1408 }
1409}
1410
1411impl Default for TesseractOcrProvider {
1412 fn default() -> Self {
1413 Self::new()
1414 }
1415}
1416
1417// Stub implementation - will be replaced when feature is implemented
1418impl OcrProvider for TesseractOcrProvider {
1419 fn name(&self) -> &'static str {
1420 "tesseract"
1421 }
1422
1423 fn model(&self) -> &'static str {
1424 "tesseract-stub"
1425 }
1426
1427 fn extract(&self, _request: &OcrRequest) -> Result<OcrResult, OcrError> {
1428 Err(OcrError::Api(
1429 "Tesseract OCR provider not yet implemented. \
1430 This is a placeholder for future local OCR support. \
1431 For now, use MistralOcrProvider, DeepSeekOcrProvider, or LightOnOcrProvider."
1432 .to_string(),
1433 ))
1434 }
1435}
1436
1437// =============================================================================
1438// Helper functions for provenance
1439// =============================================================================
1440
1441/// Computes SHA-256 hash of bytes for trace links.
1442#[must_use]
1443pub fn compute_hash(data: &[u8]) -> String {
1444 use sha2::{Digest, Sha256};
1445 let mut hasher = Sha256::new();
1446 hasher.update(data);
1447 format!("{:x}", hasher.finalize())
1448}
1449
1450/// Computes input/output hashes and returns updated provenance.
1451#[must_use]
1452pub fn with_trace_hashes(
1453 mut provenance: OcrProvenance,
1454 input: &[u8],
1455 output: &str,
1456) -> OcrProvenance {
1457 provenance.input_hash = Some(compute_hash(input));
1458 provenance.output_hash = Some(compute_hash(output.as_bytes()));
1459 provenance
1460}
1461
#[cfg(test)]
mod tests {
    use super::*;

    // Verifies that the OcrRequest builder chain records every option it sets.
    #[test]
    fn test_ocr_request_builder() {
        let request = OcrRequest::from_pdf_bytes(vec![1, 2, 3])
            .with_output_format(OcrOutputFormat::Html)
            .with_languages(vec!["en".to_string(), "de".to_string()])
            .with_extract_tables(true)
            .with_extract_images(true)
            .with_page_range(0, 10);

        assert_eq!(request.output_format, OcrOutputFormat::Html);
        assert_eq!(request.languages, vec!["en", "de"]);
        assert!(request.extract_tables);
        assert!(request.extract_images);
        assert_eq!(request.page_range, Some((0, 10)));
    }

    // Markdown is the documented default output format for OCR results.
    #[test]
    fn test_ocr_output_format_default() {
        let format = OcrOutputFormat::default();
        assert_eq!(format, OcrOutputFormat::Markdown);
    }
}