gaze_document/ocr/mod.rs
//! OCR adapter contract surface and concrete backends.
//!
//! The [`OcrAdapter`] trait stays open for future backends (cloud OCR, Apple
//! Vision, etc.). The shipping backend in v0.0.x is the Tesseract subprocess
//! adapter under [`tesseract`].
//!
//! The trait itself stays fail-loud by default via [`PendingOcrAdapter`]: an
//! adopter who builds a trait object without picking a backend gets a typed
//! `NotImplemented` error instead of silent zero-detection output (Axis 1
//! fail-closed).
11
12use crate::DocumentError;
13
14mod normalize;
15
16pub(crate) use normalize::normalize_ocr_artifacts;
17
18#[cfg(feature = "ocr-tesseract")]
19#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
20pub mod tesseract;
21
22#[cfg(feature = "ocr-tesseract")]
23#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
24pub use tesseract::TesseractOcr;
25
/// Outcome of a single OCR pass: the recognized text plus a structured
/// confidence summary.
///
/// Backend-agnostic — concrete adapters (e.g., [`tesseract::TesseractOcr`])
/// produce values of this shape so the rest of the pipeline does not
/// hard-code one OCR backend's surface.
///
/// Equality is field-by-field. Because `mean_confidence` is an `f32`, a
/// result whose confidence is `NaN` never compares equal to anything,
/// including itself.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq)]
pub struct OcrResult {
    /// Extracted plain text, with original line breaks preserved.
    pub text: String,
    /// Mean per-word OCR confidence in `[0.0, 100.0]`. `None` when the page
    /// contained no recognizable words.
    pub mean_confidence: Option<f32>,
    /// Number of words emitted with confidence `>= 0`.
    pub word_count: usize,
    /// OCR language tag used (e.g., `eng`).
    pub lang: String,
}

impl OcrResult {
    /// Build an OCR result from raw fields.
    ///
    /// The arguments are stored verbatim: this constructor does not clamp
    /// `mean_confidence` to the documented range and does not cross-check it
    /// against `word_count`. Producing consistent values is the caller's
    /// responsibility.
    #[must_use]
    pub fn new(
        text: String,
        mean_confidence: Option<f32>,
        word_count: usize,
        lang: String,
    ) -> Self {
        Self {
            text,
            mean_confidence,
            word_count,
            lang,
        }
    }
}
61
62/// Adapter contract for OCR backends.
63///
64/// Implementations live behind feature flags (e.g., `ocr-tesseract`) and
65/// must round-trip page coordinates so the eventual `layout::ReadingOrder`
66/// pass can stitch the result without losing source spans.
67pub trait OcrAdapter {
68 /// Extract textual content from `_bytes` (raw image / page payload).
69 ///
70 /// # Errors
71 /// Implementations return [`DocumentError`] on backend failure.
72 fn extract_text(&self, bytes: &[u8]) -> Result<String, DocumentError>;
73}
74
75/// Reserved fail-loud adapter.
76///
77/// Used as an explicit "no backend wired" sentinel. Every call returns
78/// [`DocumentError::NotImplemented`] so accidental wiring is caught at the
79/// call site (Axis 1 fail-closed).
80#[non_exhaustive]
81pub struct PendingOcrAdapter {
82 _private: (),
83}
84
85impl PendingOcrAdapter {
86 /// Build the fail-loud adapter.
87 ///
88 /// # Errors
89 /// Always returns [`DocumentError::NotImplemented`]. The placeholder
90 /// exists so adopters wiring a trait object without a concrete backend
91 /// fail at construction time rather than receiving silent zero-output
92 /// (Axis 1 fail-closed).
93 pub fn new() -> Result<Self, DocumentError> {
94 Err(DocumentError::NotImplemented(
95 "PendingOcrAdapter::new (wire a concrete OCR backend)",
96 ))
97 }
98}
99
100impl OcrAdapter for PendingOcrAdapter {
101 fn extract_text(&self, _bytes: &[u8]) -> Result<String, DocumentError> {
102 Err(DocumentError::NotImplemented(
103 "PendingOcrAdapter::extract_text (wire a concrete OCR backend)",
104 ))
105 }
106}