converge_provider/ocr.rs
1// Copyright 2024-2025 Aprio One AB, Sweden
2// Author: Kenneth Pernyer, kenneth@aprio.one
3// SPDX-License-Identifier: MIT
4// See LICENSE file in the project root for full license information.
5
6//! OCR / Document AI providers.
7//!
8//! This module provides integration with OCR models for document understanding,
9//! text extraction, and structured content parsing from PDFs, scans, and images.
10//!
11//! # Available Providers
12//!
13//! - [`MistralOcrProvider`] - Mistral OCR 3 (GDPR-compliant, EU)
14//! - [`DeepSeekOcrProvider`] - DeepSeek OCR 2 (Visual Causal Flow)
15//! - [`LightOnOcrProvider`] - LightOnOCR-2-1B (Efficient, open-source)
16//!
17//! # Example
18//!
19//! ```ignore
20//! use converge_provider::ocr::{OcrProvider, MistralOcrProvider, OcrRequest};
21//!
22//! let provider = MistralOcrProvider::from_env()?;
23//! let request = OcrRequest::from_pdf_bytes(pdf_bytes);
24//! let result = provider.extract(&request)?;
25//!
26//! println!("Extracted text: {}", result.text);
//! for table in result.tables {
//!     println!("Table: {:?}", table);
//! }
30//! ```
31
32use serde::{Deserialize, Serialize};
33
/// Error type for OCR operations.
///
/// Variants map transport and API failures onto categories callers can match
/// on (e.g. retry on [`OcrError::RateLimit`], surface [`OcrError::Auth`]).
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
    /// Network/HTTP error (transport-level failure, not an API status code).
    #[error("Network error: {0}")]
    Network(String),

    /// API authentication error (missing/invalid key; HTTP 401/403).
    #[error("Authentication error: {0}")]
    Auth(String),

    /// Rate limit exceeded (HTTP 429).
    #[error("Rate limit exceeded: {0}")]
    RateLimit(String),

    /// API response parsing error (malformed JSON or unreadable body).
    #[error("Parse error: {0}")]
    Parse(String),

    /// Invalid input (unsupported format, etc.).
    #[error("Invalid input: {0}")]
    InvalidInput(String),

    /// General API error (any other non-success response).
    #[error("API error: {0}")]
    Api(String),
}
61
/// Input type for OCR processing.
///
/// Not every provider accepts every variant — e.g. `LightOnOcrProvider`
/// rejects `PdfBytes` in its `extract` implementation.
#[derive(Debug, Clone)]
pub enum OcrInput {
    /// PDF document as bytes.
    PdfBytes(Vec<u8>),
    /// Image as bytes (PNG, JPEG, etc.).
    ImageBytes(Vec<u8>),
    /// URL to a document or image.
    Url(String),
    /// Base64-encoded document or image. Raw base64 without a `data:` prefix;
    /// providers wrap it into a data URL themselves.
    Base64(String),
}
74
/// OCR extraction request.
///
/// Construct via the `from_*` constructors and customize with the `with_*`
/// builder methods.
#[derive(Debug, Clone)]
pub struct OcrRequest {
    /// Input document or image.
    pub input: OcrInput,
    /// Output format preference.
    pub output_format: OcrOutputFormat,
    /// Language hints (ISO 639-1 codes).
    pub languages: Vec<String>,
    /// Whether to extract tables.
    pub extract_tables: bool,
    /// Whether to extract images/figures.
    pub extract_images: bool,
    /// Page range (for multi-page documents).
    /// NOTE(review): inclusive/exclusive semantics are not enforced here and
    /// no provider in this module reads it yet — confirm before relying on it.
    pub page_range: Option<(usize, usize)>,
}
91
92impl OcrRequest {
93 /// Creates a request from PDF bytes.
94 #[must_use]
95 pub fn from_pdf_bytes(bytes: Vec<u8>) -> Self {
96 Self {
97 input: OcrInput::PdfBytes(bytes),
98 output_format: OcrOutputFormat::Markdown,
99 languages: vec![],
100 extract_tables: true,
101 extract_images: false,
102 page_range: None,
103 }
104 }
105
106 /// Creates a request from image bytes.
107 #[must_use]
108 pub fn from_image_bytes(bytes: Vec<u8>) -> Self {
109 Self {
110 input: OcrInput::ImageBytes(bytes),
111 output_format: OcrOutputFormat::Markdown,
112 languages: vec![],
113 extract_tables: true,
114 extract_images: false,
115 page_range: None,
116 }
117 }
118
119 /// Creates a request from a URL.
120 #[must_use]
121 pub fn from_url(url: impl Into<String>) -> Self {
122 Self {
123 input: OcrInput::Url(url.into()),
124 output_format: OcrOutputFormat::Markdown,
125 languages: vec![],
126 extract_tables: true,
127 extract_images: false,
128 page_range: None,
129 }
130 }
131
132 /// Sets the output format.
133 #[must_use]
134 pub fn with_output_format(mut self, format: OcrOutputFormat) -> Self {
135 self.output_format = format;
136 self
137 }
138
139 /// Adds language hints.
140 #[must_use]
141 pub fn with_languages(mut self, languages: Vec<String>) -> Self {
142 self.languages = languages;
143 self
144 }
145
146 /// Sets whether to extract tables.
147 #[must_use]
148 pub fn with_extract_tables(mut self, extract: bool) -> Self {
149 self.extract_tables = extract;
150 self
151 }
152
153 /// Sets whether to extract images.
154 #[must_use]
155 pub fn with_extract_images(mut self, extract: bool) -> Self {
156 self.extract_images = extract;
157 self
158 }
159
160 /// Sets the page range for multi-page documents.
161 #[must_use]
162 pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
163 self.page_range = Some((start, end));
164 self
165 }
166}
167
/// Output format for OCR results.
///
/// `Markdown` is the derived `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum OcrOutputFormat {
    /// Plain text.
    Text,
    /// Markdown with structure preserved.
    #[default]
    Markdown,
    /// HTML with table reconstruction.
    Html,
    /// JSON with structured data.
    Json,
}
181
/// A detected table in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrTable {
    /// Page number (0-indexed).
    pub page: usize,
    /// Table as HTML or markdown (provider-dependent).
    pub content: String,
    /// Bounding box (x, y, width, height) if available.
    pub bbox: Option<(f64, f64, f64, f64)>,
}
192
/// A detected image/figure in the document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrImage {
    /// Page number (0-indexed).
    pub page: usize,
    /// Image description or alt text.
    pub description: Option<String>,
    /// Bounding box (x, y, width, height).
    pub bbox: Option<(f64, f64, f64, f64)>,
    /// Base64-encoded image data (populated only when extraction was
    /// requested and the provider returned it).
    pub data: Option<String>,
}
205
/// Provenance information for OCR results.
///
/// Captures everything needed for reproducibility and tracing:
/// - Tool version and configuration
/// - Input/output hashes for trace links
/// - Preprocessing parameters applied
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrProvenance {
    /// Provider/tool name (e.g., "tesseract", "mistral-ocr", "deepseek-ocr").
    pub provider: String,
    /// Model or engine version (e.g., "5.3.0", "mistral-ocr-2512").
    pub version: String,
    /// Language pack(s) used (e.g., ["eng", "deu"]).
    pub languages: Vec<String>,
    /// Preprocessing parameters applied.
    pub preprocessing: OcrPreprocessing,
    /// SHA-256 hash of input bytes (for trace links).
    /// NOTE(review): providers in this module currently leave this `None`.
    pub input_hash: Option<String>,
    /// SHA-256 hash of output text (for trace links).
    /// NOTE(review): providers in this module currently leave this `None`.
    pub output_hash: Option<String>,
    /// Additional metadata (tool-specific). Missing in serialized input
    /// deserializes to an empty map.
    #[serde(default)]
    pub metadata: std::collections::HashMap<String, String>,
}
230
/// Preprocessing parameters applied before OCR.
///
/// Recorded for reproducibility; fields are `None`/`false` when the step was
/// not applied or is not applicable to the provider.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrPreprocessing {
    /// DPI used for rendering (for PDFs).
    pub dpi: Option<u32>,
    /// Whether binarization was applied.
    pub binarized: bool,
    /// Whether deskewing was applied.
    pub deskewed: bool,
    /// Whether noise removal was applied.
    pub denoised: bool,
    /// Page segmentation mode (Tesseract-specific).
    pub psm: Option<u32>,
    /// OCR engine mode (Tesseract-specific).
    pub oem: Option<u32>,
}
247
/// Confidence summary for OCR results.
///
/// Aggregated per-word statistics; populated only by engines that report
/// word-level confidence.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct OcrConfidence {
    /// Overall mean confidence (0.0-1.0).
    pub mean: f64,
    /// Minimum word confidence.
    pub min: f64,
    /// Maximum word confidence.
    pub max: f64,
    /// Standard deviation of confidence scores.
    pub std_dev: Option<f64>,
    /// Number of words with confidence below threshold.
    pub low_confidence_words: usize,
    /// Threshold used for low confidence (default 0.6).
    pub threshold: f64,
}
264
/// A word or text span with position and confidence.
///
/// For Tesseract, this comes from TSV or hOCR output.
/// Useful for validation: you can check where each word came from
/// and flag low-confidence regions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrSpan {
    /// The text content of this span.
    pub text: String,
    /// Confidence score (0.0-1.0).
    pub confidence: f64,
    /// Page number (0-indexed).
    pub page: usize,
    /// Bounding box: (x, y, width, height) in pixels.
    pub bbox: Option<(i32, i32, i32, i32)>,
    /// Block number (page segmentation unit).
    pub block_num: Option<i32>,
    /// Paragraph number within block.
    pub par_num: Option<i32>,
    /// Line number within paragraph.
    pub line_num: Option<i32>,
    /// Word number within line.
    pub word_num: Option<i32>,
}
289
290impl OcrSpan {
291 /// Creates a new span with text and confidence.
292 #[must_use]
293 pub fn new(text: impl Into<String>, confidence: f64) -> Self {
294 Self {
295 text: text.into(),
296 confidence,
297 page: 0,
298 bbox: None,
299 block_num: None,
300 par_num: None,
301 line_num: None,
302 word_num: None,
303 }
304 }
305
306 /// Sets the bounding box.
307 #[must_use]
308 pub fn with_bbox(mut self, x: i32, y: i32, w: i32, h: i32) -> Self {
309 self.bbox = Some((x, y, w, h));
310 self
311 }
312
313 /// Sets the page number.
314 #[must_use]
315 pub fn with_page(mut self, page: usize) -> Self {
316 self.page = page;
317 self
318 }
319
320 /// Checks if this span has low confidence (below threshold).
321 #[must_use]
322 pub fn is_low_confidence(&self, threshold: f64) -> bool {
323 self.confidence < threshold
324 }
325}
326
/// Tesseract-specific output format.
///
/// Controls what kind of output to request from Tesseract.
/// NOTE(review): no Tesseract provider implementation is visible in this
/// module yet (see the design notes at the bottom of the file); this enum
/// appears to be ahead of the implementation — confirm before use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TesseractOutputFormat {
    /// Plain text (default).
    #[default]
    Text,
    /// TSV with word-level confidence and bounding boxes.
    /// Columns: level, page_num, block_num, par_num, line_num, word_num,
    /// left, top, width, height, conf, text
    Tsv,
    /// hOCR HTML format with bounding boxes.
    /// Useful for downstream table/layout analysis.
    Hocr,
    /// ALTO XML format (common in libraries/archives).
    Alto,
}
345
/// OCR extraction result.
///
/// Optional fields (`spans`, `confidence`, `processing_time_ms`) are filled
/// only when the backing engine reports them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrResult {
    /// Extracted text content.
    pub text: String,
    /// Number of pages processed.
    pub pages: usize,
    /// Word/text spans with positions and confidence.
    /// Populated when using TSV or hOCR output format (Tesseract).
    /// Useful for validation: check where each word came from.
    #[serde(default)]
    pub spans: Vec<OcrSpan>,
    /// Detected tables.
    pub tables: Vec<OcrTable>,
    /// Detected images/figures.
    pub images: Vec<OcrImage>,
    /// Confidence summary (per-word statistics).
    pub confidence: Option<OcrConfidence>,
    /// Processing time in milliseconds.
    pub processing_time_ms: Option<u64>,
    /// Provenance for reproducibility and tracing.
    pub provenance: OcrProvenance,
}
369
/// Trait for OCR providers.
///
/// All implementations in this module perform blocking HTTP requests
/// (`reqwest::blocking`); call from a context where blocking is acceptable.
pub trait OcrProvider: Send + Sync {
    /// Returns the provider name.
    fn name(&self) -> &'static str;

    /// Returns the model being used.
    fn model(&self) -> &str;

    /// Extracts text and structure from a document.
    ///
    /// # Errors
    ///
    /// Returns error if extraction fails.
    fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError>;
}
385
386// =============================================================================
387// Mistral OCR Provider
388// =============================================================================
389
/// Mistral OCR 3 provider.
///
/// Mistral OCR 3 is designed for document AI at scale, handling forms, invoices,
/// complex tables, handwriting, and low-quality scans. It outputs structured
/// text/HTML suitable for RAG and agent workflows.
///
/// # Features
/// - 74% win rate over OCR 2 on forms, handwriting, tables
/// - Markdown output with HTML table reconstruction
/// - GDPR-compliant (France)
/// - $2 per 1000 pages ($1 with batch API)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{MistralOcrProvider, OcrRequest};
///
/// let provider = MistralOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct MistralOcrProvider {
    // Bearer token sent in the Authorization header.
    api_key: String,
    // Model identifier passed in the request body.
    model: String,
    // API root; defaults to the public Mistral endpoint, overridable for tests.
    base_url: String,
    // Blocking HTTP client, reused across requests.
    client: reqwest::blocking::Client,
}
416
417impl MistralOcrProvider {
418 /// Creates a new Mistral OCR provider.
419 #[must_use]
420 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
421 Self {
422 api_key: api_key.into(),
423 model: model.into(),
424 base_url: "https://api.mistral.ai/v1".to_string(),
425 client: reqwest::blocking::Client::new(),
426 }
427 }
428
429 /// Creates a provider using the `MISTRAL_API_KEY` environment variable.
430 ///
431 /// Uses `mistral-ocr-latest` as the default model.
432 ///
433 /// # Errors
434 ///
435 /// Returns error if the environment variable is not set.
436 pub fn from_env() -> Result<Self, OcrError> {
437 let api_key = std::env::var("MISTRAL_API_KEY")
438 .map_err(|_| OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string()))?;
439 Ok(Self::new(api_key, "mistral-ocr-latest"))
440 }
441
442 /// Creates a provider with a specific model.
443 ///
444 /// # Errors
445 ///
446 /// Returns error if the environment variable is not set.
447 pub fn from_env_with_model(model: impl Into<String>) -> Result<Self, OcrError> {
448 let api_key = std::env::var("MISTRAL_API_KEY")
449 .map_err(|_| OcrError::Auth("MISTRAL_API_KEY environment variable not set".to_string()))?;
450 Ok(Self::new(api_key, model))
451 }
452
453 /// Uses a custom base URL.
454 #[must_use]
455 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
456 self.base_url = url.into();
457 self
458 }
459}
460
461impl OcrProvider for MistralOcrProvider {
462 fn name(&self) -> &'static str {
463 "mistral-ocr"
464 }
465
466 fn model(&self) -> &str {
467 &self.model
468 }
469
470 fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
471 // Build the request body based on input type
472 let document = match &request.input {
473 OcrInput::PdfBytes(bytes) => {
474 serde_json::json!({
475 "type": "document_url",
476 "document_url": format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
477 })
478 }
479 OcrInput::ImageBytes(bytes) => {
480 serde_json::json!({
481 "type": "image_url",
482 "image_url": format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
483 })
484 }
485 OcrInput::Url(url) => {
486 if url.ends_with(".pdf") {
487 serde_json::json!({
488 "type": "document_url",
489 "document_url": url
490 })
491 } else {
492 serde_json::json!({
493 "type": "image_url",
494 "image_url": url
495 })
496 }
497 }
498 OcrInput::Base64(data) => {
499 serde_json::json!({
500 "type": "document_url",
501 "document_url": format!("data:application/pdf;base64,{}", data)
502 })
503 }
504 };
505
506 let body = serde_json::json!({
507 "model": self.model,
508 "document": document,
509 "include_image_base64": request.extract_images
510 });
511
512 let response = self
513 .client
514 .post(format!("{}/ocr", self.base_url))
515 .header("Authorization", format!("Bearer {}", self.api_key))
516 .header("Content-Type", "application/json")
517 .json(&body)
518 .send()
519 .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
520
521 let status = response.status();
522 if !status.is_success() {
523 let error_text = response.text().unwrap_or_default();
524 return match status.as_u16() {
525 401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
526 429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
527 _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
528 };
529 }
530
531 let api_response: MistralOcrResponse = response
532 .json()
533 .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;
534
535 // Convert to our result format
536 let mut tables = vec![];
537 let mut images = vec![];
538 let mut text = String::new();
539
540 for (page_idx, page) in api_response.pages.iter().enumerate() {
541 text.push_str(&page.markdown);
542 text.push_str("\n\n");
543
544 // Extract tables from markdown (simplified)
545 // In practice, Mistral returns tables as HTML within markdown
546 if page.markdown.contains("<table") {
547 tables.push(OcrTable {
548 page: page_idx,
549 content: page.markdown.clone(),
550 bbox: None,
551 });
552 }
553
554 // Extract images if present
555 for img in &page.images {
556 images.push(OcrImage {
557 page: page_idx,
558 description: None,
559 bbox: None,
560 data: img.image_base64.clone(),
561 });
562 }
563 }
564
565 Ok(OcrResult {
566 text: text.trim().to_string(),
567 pages: api_response.pages.len(),
568 spans: vec![], // Mistral OCR doesn't provide word-level spans
569 tables,
570 images,
571 confidence: None,
572 processing_time_ms: None,
573 provenance: OcrProvenance {
574 provider: "mistral-ocr".to_string(),
575 version: self.model.clone(),
576 languages: request.languages.clone(),
577 preprocessing: OcrPreprocessing::default(),
578 input_hash: None, // TODO: compute from input
579 output_hash: None, // TODO: compute from output
580 metadata: std::collections::HashMap::new(),
581 },
582 })
583 }
584}
585
/// Wire format of a successful Mistral `/ocr` response (only the fields we
/// consume).
#[derive(Debug, Deserialize)]
struct MistralOcrResponse {
    pages: Vec<MistralOcrPage>,
}

/// One page of OCR output: markdown text plus any embedded images.
#[derive(Debug, Deserialize)]
struct MistralOcrPage {
    markdown: String,
    // Defaults to empty when the API omits the field.
    #[serde(default)]
    images: Vec<MistralOcrImage>,
}

/// An image attachment on a page; the base64 payload is present only when
/// `include_image_base64` was requested.
#[derive(Debug, Deserialize)]
struct MistralOcrImage {
    #[serde(default)]
    image_base64: Option<String>,
}
603
604// =============================================================================
605// DeepSeek OCR Provider
606// =============================================================================
607
/// DeepSeek OCR 2 provider.
///
/// DeepSeek OCR 2 is a 3B-parameter vision-language model with the DeepEncoder V2
/// architecture featuring Visual Causal Flow for human-like reading order.
///
/// # Features
/// - SOTA on document understanding benchmarks
/// - Human-like visual reading order
/// - Semantic visual reasoning
/// - 16x token compression
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{DeepSeekOcrProvider, OcrRequest};
///
/// let provider = DeepSeekOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_image_bytes(image_bytes))?;
/// ```
pub struct DeepSeekOcrProvider {
    // Bearer token sent in the Authorization header.
    api_key: String,
    // Model identifier passed in the request body.
    model: String,
    // API root; defaults to the public DeepSeek endpoint.
    base_url: String,
    // Blocking HTTP client, reused across requests.
    client: reqwest::blocking::Client,
}
633
634impl DeepSeekOcrProvider {
635 /// Creates a new DeepSeek OCR provider.
636 #[must_use]
637 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
638 Self {
639 api_key: api_key.into(),
640 model: model.into(),
641 base_url: "https://api.deepseek.com/v1".to_string(),
642 client: reqwest::blocking::Client::new(),
643 }
644 }
645
646 /// Creates a provider using the `DEEPSEEK_API_KEY` environment variable.
647 ///
648 /// Uses `deepseek-ocr-2` as the default model.
649 ///
650 /// # Errors
651 ///
652 /// Returns error if the environment variable is not set.
653 pub fn from_env() -> Result<Self, OcrError> {
654 let api_key = std::env::var("DEEPSEEK_API_KEY")
655 .map_err(|_| OcrError::Auth("DEEPSEEK_API_KEY environment variable not set".to_string()))?;
656 Ok(Self::new(api_key, "deepseek-ocr-2"))
657 }
658
659 /// Uses a custom base URL.
660 #[must_use]
661 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
662 self.base_url = url.into();
663 self
664 }
665}
666
667impl OcrProvider for DeepSeekOcrProvider {
668 fn name(&self) -> &'static str {
669 "deepseek-ocr"
670 }
671
672 fn model(&self) -> &str {
673 &self.model
674 }
675
676 fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
677 // DeepSeek OCR uses a chat-like API with vision capabilities
678 let image_content = match &request.input {
679 OcrInput::ImageBytes(bytes) => {
680 format!("data:image/png;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
681 }
682 OcrInput::PdfBytes(bytes) => {
683 // DeepSeek OCR expects images; for PDF, we'd need to convert pages
684 // For now, treat as base64 document
685 format!("data:application/pdf;base64,{}", base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes))
686 }
687 OcrInput::Url(url) => url.clone(),
688 OcrInput::Base64(data) => format!("data:image/png;base64,{data}"),
689 };
690
691 let body = serde_json::json!({
692 "model": self.model,
693 "messages": [{
694 "role": "user",
695 "content": [
696 {
697 "type": "image_url",
698 "image_url": {
699 "url": image_content
700 }
701 },
702 {
703 "type": "text",
704 "text": "Extract all text from this document, preserving structure, tables, and reading order. Output in markdown format."
705 }
706 ]
707 }],
708 "max_tokens": 8192
709 });
710
711 let response = self
712 .client
713 .post(format!("{}/chat/completions", self.base_url))
714 .header("Authorization", format!("Bearer {}", self.api_key))
715 .header("Content-Type", "application/json")
716 .json(&body)
717 .send()
718 .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
719
720 let status = response.status();
721 if !status.is_success() {
722 let error_text = response.text().unwrap_or_default();
723 return match status.as_u16() {
724 401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
725 429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
726 _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
727 };
728 }
729
730 let api_response: DeepSeekOcrResponse = response
731 .json()
732 .map_err(|e| OcrError::Parse(format!("Failed to parse response: {e}")))?;
733
734 let text = api_response
735 .choices
736 .first()
737 .and_then(|c| c.message.content.clone())
738 .unwrap_or_default();
739
740 Ok(OcrResult {
741 text,
742 pages: 1, // DeepSeek processes one image at a time
743 spans: vec![], // DeepSeek OCR doesn't provide word-level spans
744 tables: vec![],
745 images: vec![],
746 confidence: None,
747 processing_time_ms: None,
748 provenance: OcrProvenance {
749 provider: "deepseek-ocr".to_string(),
750 version: self.model.clone(),
751 languages: request.languages.clone(),
752 preprocessing: OcrPreprocessing::default(),
753 input_hash: None,
754 output_hash: None,
755 metadata: std::collections::HashMap::new(),
756 },
757 })
758 }
759}
760
/// Wire format of a DeepSeek chat-completions response (only the fields we
/// consume).
#[derive(Debug, Deserialize)]
struct DeepSeekOcrResponse {
    choices: Vec<DeepSeekOcrChoice>,
}

/// One completion choice; we only read the first.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrChoice {
    message: DeepSeekOcrMessage,
}

/// Assistant message; `content` may be absent on empty completions.
#[derive(Debug, Deserialize)]
struct DeepSeekOcrMessage {
    content: Option<String>,
}
775
776// =============================================================================
777// LightOn OCR Provider
778// =============================================================================
779
/// LightOnOCR-2-1B provider.
///
/// LightOnOCR-2 is an efficient 1B-parameter vision-language model that achieves
/// SOTA on OlmOCR-Bench while being 9x smaller than competitors.
///
/// # Features
/// - 1B parameters, 9x smaller than competitors
/// - 5.71 pages/s on H100 (~493k pages/day)
/// - <$0.01 per 1000 pages
/// - Apache 2.0 license, open weights
/// - GDPR-compliant (France)
///
/// # Example
///
/// ```ignore
/// use converge_provider::ocr::{LightOnOcrProvider, OcrRequest};
///
/// let provider = LightOnOcrProvider::from_env()?;
/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
/// ```
pub struct LightOnOcrProvider {
    // Bearer token for the HuggingFace Inference API.
    api_key: String,
    // HF model id, appended to `base_url` as a path segment.
    model: String,
    // API root; defaults to the HuggingFace Inference API.
    base_url: String,
    // Blocking HTTP client, reused across requests.
    client: reqwest::blocking::Client,
}
806
807impl LightOnOcrProvider {
808 /// Creates a new LightOn OCR provider.
809 #[must_use]
810 pub fn new(api_key: impl Into<String>, model: impl Into<String>) -> Self {
811 Self {
812 api_key: api_key.into(),
813 model: model.into(),
814 base_url: "https://api-inference.huggingface.co/models".to_string(),
815 client: reqwest::blocking::Client::new(),
816 }
817 }
818
819 /// Creates a provider using the `HUGGINGFACE_API_KEY` environment variable.
820 ///
821 /// Uses `lightonai/LightOnOCR-2-1B` as the default model.
822 ///
823 /// # Errors
824 ///
825 /// Returns error if the environment variable is not set.
826 pub fn from_env() -> Result<Self, OcrError> {
827 let api_key = std::env::var("HUGGINGFACE_API_KEY")
828 .map_err(|_| OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string()))?;
829 Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B"))
830 }
831
832 /// Creates a provider with the bbox variant for figure extraction.
833 ///
834 /// # Errors
835 ///
836 /// Returns error if the environment variable is not set.
837 pub fn from_env_with_bbox() -> Result<Self, OcrError> {
838 let api_key = std::env::var("HUGGINGFACE_API_KEY")
839 .map_err(|_| OcrError::Auth("HUGGINGFACE_API_KEY environment variable not set".to_string()))?;
840 Ok(Self::new(api_key, "lightonai/LightOnOCR-2-1B-bbox"))
841 }
842
843 /// Uses a custom base URL.
844 #[must_use]
845 pub fn with_base_url(mut self, url: impl Into<String>) -> Self {
846 self.base_url = url.into();
847 self
848 }
849}
850
851impl OcrProvider for LightOnOcrProvider {
852 fn name(&self) -> &'static str {
853 "lighton-ocr"
854 }
855
856 fn model(&self) -> &str {
857 &self.model
858 }
859
860 fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
861 // LightOnOCR uses HuggingFace Inference API
862 let image_bytes = match &request.input {
863 OcrInput::ImageBytes(bytes) => bytes.clone(),
864 OcrInput::PdfBytes(_) => {
865 return Err(OcrError::InvalidInput(
866 "LightOnOCR requires image input. Convert PDF pages to images first.".to_string()
867 ));
868 }
869 OcrInput::Url(url) => {
870 // Fetch the image
871 let response = self.client
872 .get(url)
873 .send()
874 .map_err(|e| OcrError::Network(format!("Failed to fetch image: {e}")))?;
875 response.bytes()
876 .map_err(|e| OcrError::Network(format!("Failed to read image: {e}")))?
877 .to_vec()
878 }
879 OcrInput::Base64(data) => {
880 base64::Engine::decode(&base64::engine::general_purpose::STANDARD, data)
881 .map_err(|e| OcrError::Parse(format!("Invalid base64: {e}")))?
882 }
883 };
884
885 let response = self
886 .client
887 .post(format!("{}/{}", self.base_url, self.model))
888 .header("Authorization", format!("Bearer {}", self.api_key))
889 .header("Content-Type", "application/octet-stream")
890 .body(image_bytes)
891 .send()
892 .map_err(|e| OcrError::Network(format!("Request failed: {e}")))?;
893
894 let status = response.status();
895 if !status.is_success() {
896 let error_text = response.text().unwrap_or_default();
897 return match status.as_u16() {
898 401 | 403 => Err(OcrError::Auth(format!("Authentication failed: {error_text}"))),
899 429 => Err(OcrError::RateLimit("Rate limit exceeded".to_string())),
900 503 => Err(OcrError::Api("Model is loading, please retry".to_string())),
901 _ => Err(OcrError::Api(format!("API error ({}): {}", status, error_text))),
902 };
903 }
904
905 // LightOnOCR returns the extracted text directly
906 let text = response
907 .text()
908 .map_err(|e| OcrError::Parse(format!("Failed to read response: {e}")))?;
909
910 Ok(OcrResult {
911 text,
912 pages: 1,
913 spans: vec![], // LightOnOCR doesn't provide word-level spans via HF API
914 tables: vec![],
915 images: vec![],
916 confidence: None,
917 processing_time_ms: None,
918 provenance: OcrProvenance {
919 provider: "lighton-ocr".to_string(),
920 version: self.model.clone(),
921 languages: request.languages.clone(),
922 preprocessing: OcrPreprocessing::default(),
923 input_hash: None,
924 output_hash: None,
925 metadata: std::collections::HashMap::new(),
926 },
927 })
928 }
929}
930
931// =============================================================================
932// Tesseract OCR Provider (Feature-gated, Local)
933// =============================================================================
934//
935// Tesseract is the "boring, reliable" OCR workhorse: classic OCR engine,
936// open source, runs fully locally, and easy to wrap in a Converge-style
937// Provider boundary.
938//
939// =============================================================================
940// WHAT TESSERACT IS
941// =============================================================================
942//
943// - An OCR engine originally from HP, maintained under open source.
944// - Takes images (PNG/JPG/TIFF etc) and outputs text, optionally with layout.
945// - Can run with different language packs (English, Swedish, etc).
946// - NOT a "big neural multimodal model" - it's a tool-like OCR system with
947// modern recognition components but still very deterministic.
948//
949// =============================================================================
950// WHERE IT SHINES
951// =============================================================================
952//
953// - Clean scans, printed documents, forms, invoices, manuals, receipts
954// - High-contrast screenshots
955// - Simple page layouts
956// - Deterministic runs: same input + same version + same settings = same output
957//
958// =============================================================================
959// WHERE IT STRUGGLES
960// =============================================================================
961//
962// - Handwriting (varies, usually weak vs modern DL OCR)
963// - Low-quality photos (blur, perspective, glare)
964// - Complex layouts with tables/columns (unless you guide it well with PSM)
965// - Mixed languages without explicit config
966//
967// If your primary use case is handwriting, camera photos with glare, or dense
968// multi-column PDFs with complex tables, consider a DL-based OCR instead.
969//
970// =============================================================================
971// OUTPUT FORMATS
972// =============================================================================
973//
974// Tesseract can produce:
975// - Plain text: Just the extracted text
976// - TSV: Word-level info with confidence and bounding boxes
977// - hOCR: HTML-like format with bounding boxes (useful for validation)
978// - ALTO XML: Common in libraries/archives
979//
980// For Converge, hOCR/TSV is useful because you can validate "evidence":
981// - Bounding boxes (where each word came from)
982// - Per-word confidence
983// - Page segmentation decisions
984//
985// =============================================================================
986// KEY KNOBS
987// =============================================================================
988//
989// 1. Page Segmentation Mode (PSM) - THE BIGGEST PRACTICAL LEVER
990// Tells Tesseract what kind of page it's looking at:
991// - 0 = OSD only (orientation and script detection)
992// - 1 = Automatic page segmentation with OSD
993// - 3 = Fully automatic page segmentation (default)
994// - 4 = Single column of variable sizes
995// - 6 = Uniform block of text
996// - 7 = Single text line
997// - 8 = Single word
998// - 11 = Sparse text
999// If you set the wrong mode, accuracy tanks.
1000//
1001// 2. OCR Engine Mode (OEM)
1002// Chooses which internal engine strategy to use:
1003// - 0 = Legacy engine only
1004// - 1 = Neural nets LSTM engine only
1005// - 2 = Legacy + LSTM engines
1006// - 3 = Default (auto-select best available)
1007// Defaults are usually fine, but pin for reproducibility.
1008//
1009// 3. Language Packs
1010// Set -l eng / -l swe etc. DON'T leave language detection implicit.
1011//
1012// 4. Preprocessing
1013// Tesseract is EXTREMELY sensitive to:
1014// - Resolution (DPI) - 300 DPI is typical minimum
1015// - Binarization (thresholding)
1016// - Denoise
1017// - Deskew
1018// - Contrast normalization
1019//
1020// This is where "Rust purity" can shine: do deterministic preprocessing
1021// in Rust (image crate) and then pass a cleaned image to Tesseract.
1022//
1023// =============================================================================
1024// CONVERGE-STYLE INTEGRATION PATTERN
1025// =============================================================================
1026//
1027// Treat OCR as a provider that returns a PROPOSAL, never truth.
1028//
1029// Shape:
1030// DocumentBytes → ProposedTextExtraction → Validators → Facts/StructuredFields
1031//
1032// Provider output (recommended):
1033// - text: extracted text
1034// - spans: optional words/lines with bounding boxes (from TSV/hOCR)
1035// - confidence: summary stats (mean, min, histogram)
1036// - tool_provenance:
1037// - engine = "tesseract"
1038// - tesseract_version
1039// - lang
1040// - psm, oem
1041// - preprocess_pipeline_hash
1042// - trace_link:
1043// - input hash (bytes)
1044// - output hash
1045// - settings hash
1046//
1047// Validators (examples):
1048// - min_confidence >= 0.75 else STOP or WARN
1049// - required_fields_present (invoice number/date/amount)
1050// - layout sanity (if table expected, require hOCR structure)
1051// - PII redaction gate before storage
1052//
1053// =============================================================================
1054// PACKAGING AND DEPLOYMENT
1055// =============================================================================
1056//
1057// Tesseract is a native dependency. Manage cleanly:
1058//
1059// Best practice for "one binary experience":
1060// - Ship your Rust binary
1061// - Vendor/bundle Tesseract in installer (or provide "cz doctor" check)
1062// - Pin versions for reproducibility
1063//
1064// On macOS: Most people install via Homebrew, but for deterministic
1065// environments, package with your app or use Nix.
1066//
1067// =============================================================================
1068// ARCHITECTURE (Rust-first compromise)
1069// =============================================================================
1070//
1071// Tesseract integration follows the "Rust-first compromise" pattern:
1072// - Pure Converge architecture (providers, traces, gates, promotion)
1073// - OCR runs locally with no cloud data exposure
1074// - Accepts native dependency (tesseract + leptonica)
1075//
1076// Integration options (in order of preference):
1077// 1. Sidecar binary: invoke `tesseract` CLI via std::process::Command
1078// 2. FFI binding: link against libtesseract (more complex, faster)
1079// 3. System dependency: require tesseract installed (brew, apt, nix)
1080//
1081// The provider returns:
1082// - Extracted text
1083// - Confidence summary (per-word statistics)
1084// - Provenance: tool version, language pack, preprocessing params
// - Trace link: hashes of input bytes and output
1086//
1087// Determinism: Stable for same input image + same Tesseract version.
1088//
1089// When to use:
1090// - Scanned PDFs, clean prints, forms, invoices, receipts
1091// - "Extract text so downstream validators can reason"
1092// - GDPR/data sovereignty requirements (no cloud exposure)
1093//
1094// Future: Can be swapped with Burn/candle-based OCR model without
1095// changing the core contracts (OcrProvider trait).
1096//
1097// =============================================================================
1098
1099/// Configuration for Tesseract OCR provider.
1100///
1101/// # Feature Gate
1102///
1103/// This provider requires the `tesseract` feature:
1104/// ```toml
1105/// [dependencies]
1106/// converge-provider = { version = "0.2", features = ["tesseract"] }
1107/// ```
1108///
1109/// # System Requirements
1110///
1111/// Tesseract must be installed on the system:
1112/// - macOS: `brew install tesseract tesseract-lang`
1113/// - Ubuntu: `apt install tesseract-ocr tesseract-ocr-eng`
/// - Windows: Download from <https://github.com/UB-Mannheim/tesseract/wiki>
1115///
1116/// # Key Knobs
1117///
1118/// **Page Segmentation Mode (PSM)** - The biggest practical lever:
1119/// - 0 = OSD only (orientation and script detection)
1120/// - 1 = Automatic page segmentation with OSD
1121/// - 3 = Fully automatic page segmentation (default)
1122/// - 4 = Single column of variable sizes
1123/// - 6 = Uniform block of text
1124/// - 7 = Single text line
1125/// - 8 = Single word
1126/// - 11 = Sparse text
1127///
1128/// If you set the wrong mode, accuracy tanks.
1129///
1130/// **OCR Engine Mode (OEM)**:
1131/// - 0 = Legacy engine only
1132/// - 1 = Neural nets LSTM engine only
1133/// - 2 = Legacy + LSTM engines
1134/// - 3 = Default (auto-select best available)
1135///
1136/// **Preprocessing**: Tesseract is EXTREMELY sensitive to:
1137/// - Resolution (DPI) - 300 DPI is typical minimum
1138/// - Binarization, denoise, deskew, contrast normalization
1139///
1140/// # Example (Future)
1141///
1142/// ```ignore
1143/// use converge_provider::ocr::{TesseractOcrProvider, TesseractConfig, TesseractOutputFormat, OcrRequest};
1144///
1145/// let config = TesseractConfig::new()
1146/// .with_languages(vec!["eng", "deu"])
1147/// .with_dpi(300)
1148/// .with_psm(3) // Fully automatic
1149/// .with_output_format(TesseractOutputFormat::Tsv); // Get bounding boxes
1150///
1151/// let provider = TesseractOcrProvider::with_config(config);
1152/// let result = provider.extract(&OcrRequest::from_pdf_bytes(pdf_bytes))?;
1153///
1154/// // Provenance includes tool version, language pack, preprocessing
1155/// println!("Tesseract version: {}", result.provenance.version);
1156/// println!("Confidence: {:.2}%", result.confidence.unwrap().mean * 100.0);
1157///
1158/// // Check spans for evidence validation
1159/// for span in &result.spans {
1160/// if span.is_low_confidence(0.75) {
1161/// println!("Low confidence word: {} ({:.0}%)", span.text, span.confidence * 100.0);
1162/// }
1163/// }
1164/// ```
#[derive(Debug, Clone)]
pub struct TesseractConfig {
    /// Path to tesseract binary (default: "tesseract" in PATH).
    pub binary_path: String,
    /// Path to tessdata directory (language files).
    /// `None` means Tesseract's built-in default location is used.
    pub tessdata_path: Option<String>,
    /// Languages to use (e.g., ["eng", "deu"]); default: ["eng"].
    /// DON'T leave language detection implicit!
    pub languages: Vec<String>,
    /// DPI for PDF rendering (default: 300).
    /// 300 DPI is typical minimum for good results.
    pub dpi: u32,
    /// Page segmentation mode (PSM); default: 3.
    /// 0 = OSD only, 1 = auto + OSD, 3 = fully auto (default), 6 = uniform block, etc.
    /// THIS IS THE BIGGEST PRACTICAL LEVER. Wrong mode = bad accuracy.
    pub psm: u32,
    /// OCR engine mode (OEM); default: 3.
    /// 0 = Legacy, 1 = Neural LSTM, 2 = Legacy + LSTM, 3 = Default (auto).
    /// Pin for reproducibility.
    pub oem: u32,
    /// Output format (text, TSV, hOCR, ALTO); default: plain text.
    /// Use TSV or hOCR for word-level confidence and bounding boxes.
    pub output_format: TesseractOutputFormat,
    /// Whether to apply preprocessing (deskew, denoise, binarize); default: true.
    /// Tesseract is EXTREMELY sensitive to image quality.
    pub preprocess: bool,
    /// Timeout in seconds for OCR operation (default: 60).
    pub timeout_secs: u64,
}
1194
1195impl Default for TesseractConfig {
1196 fn default() -> Self {
1197 Self {
1198 binary_path: "tesseract".to_string(),
1199 tessdata_path: None,
1200 languages: vec!["eng".to_string()],
1201 dpi: 300,
1202 psm: 3, // Fully automatic page segmentation
1203 oem: 3, // Default (auto-select best available)
1204 output_format: TesseractOutputFormat::Text,
1205 preprocess: true,
1206 timeout_secs: 60,
1207 }
1208 }
1209}
1210
1211impl TesseractConfig {
1212 /// Creates a new Tesseract configuration with defaults.
1213 #[must_use]
1214 pub fn new() -> Self {
1215 Self::default()
1216 }
1217
1218 /// Sets the path to the tesseract binary.
1219 #[must_use]
1220 pub fn with_binary_path(mut self, path: impl Into<String>) -> Self {
1221 self.binary_path = path.into();
1222 self
1223 }
1224
1225 /// Sets the tessdata directory path.
1226 #[must_use]
1227 pub fn with_tessdata_path(mut self, path: impl Into<String>) -> Self {
1228 self.tessdata_path = Some(path.into());
1229 self
1230 }
1231
1232 /// Sets the languages to use.
1233 #[must_use]
1234 pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1235 self.languages = languages.into_iter().map(Into::into).collect();
1236 self
1237 }
1238
1239 /// Sets the DPI for PDF rendering.
1240 #[must_use]
1241 pub fn with_dpi(mut self, dpi: u32) -> Self {
1242 self.dpi = dpi;
1243 self
1244 }
1245
1246 /// Sets the page segmentation mode.
1247 #[must_use]
1248 pub fn with_psm(mut self, psm: u32) -> Self {
1249 self.psm = psm;
1250 self
1251 }
1252
1253 /// Sets the OCR engine mode.
1254 #[must_use]
1255 pub fn with_oem(mut self, oem: u32) -> Self {
1256 self.oem = oem;
1257 self
1258 }
1259
1260 /// Sets whether to apply preprocessing.
1261 #[must_use]
1262 pub fn with_preprocess(mut self, preprocess: bool) -> Self {
1263 self.preprocess = preprocess;
1264 self
1265 }
1266
1267 /// Sets the timeout in seconds.
1268 #[must_use]
1269 pub fn with_timeout(mut self, secs: u64) -> Self {
1270 self.timeout_secs = secs;
1271 self
1272 }
1273
1274 /// Sets the output format.
1275 /// Use TSV or hOCR for word-level confidence and bounding boxes.
1276 #[must_use]
1277 pub fn with_output_format(mut self, format: TesseractOutputFormat) -> Self {
1278 self.output_format = format;
1279 self
1280 }
1281}
1282
1283/// Tesseract OCR provider (stub - not yet implemented).
1284///
1285/// This is a placeholder for the local Tesseract OCR integration.
1286/// The actual implementation will be feature-gated behind `tesseract`.
1287///
1288/// # Architecture
1289///
1290/// ```text
1291/// TesseractOcrProvider
1292/// │
1293/// ├── Input (PDF/Image bytes)
1294/// │ │
1295/// │ ▼
1296/// ├── Preprocessing (optional)
1297/// │ ├── PDF → Images (pdftoppm/pdf2image)
1298/// │ ├── Deskew (leptonica)
1299/// │ ├── Denoise (leptonica)
1300/// │ └── Binarize (leptonica)
1301/// │ │
1302/// │ ▼
1303/// ├── Tesseract CLI/FFI
1304/// │ │
1305/// │ ▼
1306/// ├── Output
1307/// │ ├── Text (plain or hOCR/ALTO)
1308/// │ ├── Confidence (per-word)
1309/// │ └── Bounding boxes (optional)
1310/// │ │
1311/// │ ▼
1312/// └── OcrResult with Provenance
1313/// ├── text
1314/// ├── confidence summary
1315/// ├── provenance (version, langs, params)
1316/// └── trace hashes (input/output)
1317/// ```
1318///
1319/// # Future Implementation
1320///
1321/// When the `tesseract` feature is enabled:
1322///
1323/// ```ignore
1324/// #[cfg(feature = "tesseract")]
1325/// impl OcrProvider for TesseractOcrProvider {
1326/// fn extract(&self, request: &OcrRequest) -> Result<OcrResult, OcrError> {
1327/// // 1. Hash input for trace links
1328/// // 2. Preprocess if needed (PDF→image, deskew, etc.)
1329/// // 3. Invoke tesseract CLI or FFI
1330/// // 4. Parse output (text + confidence)
1331/// // 5. Hash output for trace links
1332/// // 6. Return OcrResult with full provenance
1333/// }
1334/// }
1335/// ```
#[derive(Debug)]
pub struct TesseractOcrProvider {
    /// Tesseract invocation settings (binary path, languages, DPI, PSM/OEM,
    /// output format, preprocessing, timeout).
    config: TesseractConfig,
}
1340
1341impl TesseractOcrProvider {
1342 /// Creates a new Tesseract OCR provider with default configuration.
1343 #[must_use]
1344 pub fn new() -> Self {
1345 Self {
1346 config: TesseractConfig::default(),
1347 }
1348 }
1349
1350 /// Creates a provider with custom configuration.
1351 #[must_use]
1352 pub fn with_config(config: TesseractConfig) -> Self {
1353 Self { config }
1354 }
1355
1356 /// Sets the languages to use.
1357 #[must_use]
1358 pub fn with_languages(mut self, languages: Vec<impl Into<String>>) -> Self {
1359 self.config.languages = languages.into_iter().map(Into::into).collect();
1360 self
1361 }
1362
1363 /// Sets the DPI for PDF rendering.
1364 #[must_use]
1365 pub fn with_dpi(mut self, dpi: u32) -> Self {
1366 self.config.dpi = dpi;
1367 self
1368 }
1369
1370 /// Checks if Tesseract is available on the system.
1371 ///
1372 /// # Errors
1373 ///
1374 /// Returns error if Tesseract is not found or cannot be executed.
1375 pub fn check_availability(&self) -> Result<String, OcrError> {
1376 // This is a stub - actual implementation would run `tesseract --version`
1377 Err(OcrError::Api(
1378 "Tesseract provider not yet implemented. Enable the 'tesseract' feature.".to_string()
1379 ))
1380 }
1381
1382 /// Returns the Tesseract version (stub).
1383 pub fn version(&self) -> Option<String> {
1384 None // Stub - would parse `tesseract --version` output
1385 }
1386}
1387
1388impl Default for TesseractOcrProvider {
1389 fn default() -> Self {
1390 Self::new()
1391 }
1392}
1393
1394// Stub implementation - will be replaced when feature is implemented
1395impl OcrProvider for TesseractOcrProvider {
1396 fn name(&self) -> &'static str {
1397 "tesseract"
1398 }
1399
1400 fn model(&self) -> &str {
1401 "tesseract-stub"
1402 }
1403
1404 fn extract(&self, _request: &OcrRequest) -> Result<OcrResult, OcrError> {
1405 Err(OcrError::Api(
1406 "Tesseract OCR provider not yet implemented. \
1407 This is a placeholder for future local OCR support. \
1408 For now, use MistralOcrProvider, DeepSeekOcrProvider, or LightOnOcrProvider."
1409 .to_string()
1410 ))
1411 }
1412}
1413
1414// =============================================================================
1415// Helper functions for provenance
1416// =============================================================================
1417
1418/// Computes SHA-256 hash of bytes for trace links.
1419#[must_use]
1420pub fn compute_hash(data: &[u8]) -> String {
1421 use sha2::{Sha256, Digest};
1422 let mut hasher = Sha256::new();
1423 hasher.update(data);
1424 format!("{:x}", hasher.finalize())
1425}
1426
1427/// Computes input/output hashes and returns updated provenance.
1428#[must_use]
1429pub fn with_trace_hashes(mut provenance: OcrProvenance, input: &[u8], output: &str) -> OcrProvenance {
1430 provenance.input_hash = Some(compute_hash(input));
1431 provenance.output_hash = Some(compute_hash(output.as_bytes()));
1432 provenance
1433}
1434
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ocr_request_builder() {
        // Exercise every builder knob and verify each setting landed.
        let req = OcrRequest::from_pdf_bytes(vec![1, 2, 3])
            .with_output_format(OcrOutputFormat::Html)
            .with_languages(vec!["en".to_string(), "de".to_string()])
            .with_extract_tables(true)
            .with_extract_images(true)
            .with_page_range(0, 10);

        assert_eq!(req.output_format, OcrOutputFormat::Html);
        assert_eq!(req.languages, vec!["en", "de"]);
        assert!(req.extract_tables && req.extract_images);
        assert_eq!(req.page_range, Some((0, 10)));
    }

    #[test]
    fn test_ocr_output_format_default() {
        // Markdown is the documented default output format.
        assert_eq!(OcrOutputFormat::default(), OcrOutputFormat::Markdown);
    }
}