1mod normalize;
9
10use crate::DocumentError;
11
12pub(crate) use normalize::normalize_ocr_artifacts;
13
14#[cfg(feature = "ocr-tesseract")]
15#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
16pub mod tesseract;
17
18#[cfg(feature = "ocr-tesseract")]
19#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
20pub use tesseract::TesseractBackend;
21
/// Image container formats recognised by this module.
///
/// The value is `Copy`, so it is passed around by value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ImageFormat {
    /// PNG image.
    Png,
    /// JPEG image.
    Jpeg,
    /// TIFF image.
    Tiff,
}
32
33impl ImageFormat {
34 pub fn extension(self) -> &'static str {
36 match self {
37 Self::Png => "png",
38 Self::Jpeg => "jpg",
39 Self::Tiff => "tiff",
40 }
41 }
42}
43
44pub fn detect_image_format(bytes: &[u8]) -> Result<ImageFormat, DocumentError> {
51 if bytes.starts_with(b"\x89PNG") {
52 return Ok(ImageFormat::Png);
53 }
54 if bytes.starts_with(b"\xFF\xD8\xFF") {
55 return Ok(ImageFormat::Jpeg);
56 }
57 if bytes.starts_with(b"II\x2A\x00") || bytes.starts_with(b"MM\x00\x2A") {
58 return Ok(ImageFormat::Tiff);
59 }
60 Err(DocumentError::UnsupportedInput {
61 path: std::path::PathBuf::new(),
62 reason: "image bytes are not PNG, JPEG, or TIFF",
63 })
64}
65
66#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct ImageInput {
69 pub bytes: Vec<u8>,
71 pub format: ImageFormat,
73 pub dpi: Option<u32>,
75}
76
/// Owned language identifier used in OCR hints.
///
/// A thin newtype over `String`. The inner value is private and read through
/// `as_str`. The default tag in this file is `"eng"`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LanguageTag(String);
80
81impl LanguageTag {
82 pub fn new(tag: impl Into<String>) -> Self {
84 Self(tag.into())
85 }
86
87 pub fn as_str(&self) -> &str {
89 &self.0
90 }
91}
92
93impl Default for LanguageTag {
94 fn default() -> Self {
95 Self::new("eng")
96 }
97}
98
99#[derive(Debug, Clone, PartialEq, Eq)]
101pub struct OcrHints {
102 pub languages: Vec<LanguageTag>,
104}
105
106impl OcrHints {
107 pub fn english() -> Self {
109 Self {
110 languages: vec![LanguageTag::default()],
111 }
112 }
113
114 pub fn primary_language(&self) -> &str {
116 self.languages
117 .first()
118 .map(LanguageTag::as_str)
119 .unwrap_or("eng")
120 }
121}
122
123impl Default for OcrHints {
124 fn default() -> Self {
125 Self::english()
126 }
127}
128
/// Rectangle given by an origin (`x`, `y`) and a size (`w`, `h`).
///
/// NOTE(review): units and origin convention are not established in this
/// file. Presumably pixels measured from the image's top-left corner; confirm
/// with the backend that produces the boxes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct BBox {
    /// Horizontal position of the box origin.
    pub x: u32,
    /// Vertical position of the box origin.
    pub y: u32,
    /// Box width.
    pub w: u32,
    /// Box height.
    pub h: u32,
}
141
142#[derive(Debug, Clone, PartialEq)]
144pub struct OcrSpan {
145 pub text: String,
147 pub bbox: BBox,
149 pub confidence: Option<f32>,
151}
152
153#[derive(Debug, thiserror::Error)]
155pub enum OcrError {
156 #[error("backend init failed: {0}")]
158 InitFailed(String),
159 #[error("recognize failed: {0}")]
161 RecognizeFailed(String),
162 #[error("unsupported image format: {0:?}")]
164 UnsupportedFormat(ImageFormat),
165 #[error("backend internal error: {0}")]
167 Internal(String),
168}
169
170pub trait OcrBackend: Send + Sync {
172 fn name(&self) -> &str;
174
175 fn recognize(&self, image: ImageInput, hints: OcrHints) -> Result<Vec<OcrSpan>, OcrError>;
177}
178
/// Aggregated outcome of recognising one image.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct OcrResult {
    /// Recognised text.
    pub text: String,
    /// Mean confidence, or `None` when no confidence was available.
    ///
    /// Results built by `from_spans` hold this on a 0-100 scale (span
    /// confidences are multiplied by 100 before averaging).
    /// `mean_confidence_unit` gives a value clamped to `0.0..=1.0`.
    pub mean_confidence: Option<f32>,
    /// Word count for the result.
    ///
    /// When built by `from_spans`, this is the number of spans that carried a
    /// confidence value.
    pub word_count: usize,
    /// Language identifier associated with the result.
    pub lang: String,
}
197
198impl OcrResult {
199 pub(crate) fn new(
201 text: String,
202 mean_confidence: Option<f32>,
203 word_count: usize,
204 lang: String,
205 ) -> Self {
206 Self {
207 text,
208 mean_confidence,
209 word_count,
210 lang,
211 }
212 }
213
214 pub fn from_spans(spans: &[OcrSpan], lang: String) -> Self {
217 Self::from_spans_with_column_detection(spans, lang, false).0
218 }
219
220 pub(crate) fn from_spans_with_column_detection(
223 spans: &[OcrSpan],
224 lang: String,
225 column_detection: bool,
226 ) -> (Self, u32) {
227 let ordered = crate::postprocess::order_spans(spans, column_detection);
228 let mut conf_sum = 0.0f64;
229 let mut conf_count = 0usize;
230 for span in spans {
231 if let Some(confidence) = span.confidence {
232 conf_sum += (confidence * 100.0) as f64;
233 conf_count += 1;
234 }
235 }
236 let mean_confidence = if conf_count == 0 {
237 None
238 } else {
239 Some((conf_sum / conf_count as f64) as f32)
240 };
241 (
242 Self {
243 text: ordered.text,
244 mean_confidence,
245 word_count: conf_count,
246 lang,
247 },
248 ordered.column_count,
249 )
250 }
251
252 pub(crate) fn mean_confidence_unit(&self) -> Option<f32> {
254 self.mean_confidence.map(|confidence| {
255 if confidence > 1.0 {
256 (confidence / 100.0).clamp(0.0, 1.0)
257 } else {
258 confidence.clamp(0.0, 1.0)
259 }
260 })
261 }
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 30x10 span at (`x`, `y`) with confidence 0.8.
    fn word(text: &str, x: u32, y: u32) -> OcrSpan {
        OcrSpan {
            text: text.to_string(),
            bbox: BBox { x, y, w: 30, h: 10 },
            confidence: Some(0.8),
        }
    }

    /// A mean stored on the 0-100 scale is reported on the unit scale.
    #[test]
    fn mean_confidence_unit_normalizes_legacy_percent_value() {
        let percent_scaled = OcrResult::new(String::from("body"), Some(91.0), 1, String::from("eng"));
        assert_eq!(percent_scaled.mean_confidence_unit(), Some(0.91));
    }

    /// Two horizontally separated stacks of words are read column by column.
    #[test]
    fn from_spans_reports_detected_columns() {
        // Deliberately interleaved: row 1 (left, right), then row 2 (left, right).
        let spans = vec![
            word("A1", 10, 10),
            word("B1", 280, 10),
            word("A2", 10, 30),
            word("B2", 280, 30),
        ];

        let (result, columns) =
            OcrResult::from_spans_with_column_detection(&spans, String::from("eng"), true);

        assert_eq!(columns, 2);
        assert_eq!(result.text, "A1\nA2\n\nB1\nB2");
        assert_eq!(result.mean_confidence_unit(), Some(0.8));
    }

    /// Each supported signature is mapped to its format.
    #[test]
    fn detect_image_format_accepts_supported_magic_bytes() {
        let cases: [(&[u8], &str, ImageFormat); 4] = [
            (b"\x89PNG\r\n\x1A\nrest", "png magic", ImageFormat::Png),
            (b"\xFF\xD8\xFF\xE0rest", "jpeg magic", ImageFormat::Jpeg),
            (b"II\x2A\x00rest", "little-endian tiff magic", ImageFormat::Tiff),
            (b"MM\x00\x2Arest", "big-endian tiff magic", ImageFormat::Tiff),
        ];

        for &(bytes, label, expected) in cases.iter() {
            assert_eq!(detect_image_format(bytes).expect(label), expected);
        }
    }

    /// Input with no known signature is rejected as unsupported.
    #[test]
    fn detect_image_format_rejects_unknown_bytes() {
        let outcome = detect_image_format(b"not an image");
        let err = outcome.expect_err("unknown format fails");
        assert!(matches!(err, DocumentError::UnsupportedInput { .. }));
    }
}