mod normalize;
use crate::DocumentError;
pub(crate) use normalize::normalize_ocr_artifacts;
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub mod tesseract;
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub use tesseract::TesseractBackend;
/// Image container formats this module knows how to identify.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImageFormat {
    /// PNG image.
    Png,
    /// JPEG image.
    Jpeg,
    /// TIFF image.
    Tiff,
}

impl ImageFormat {
    /// Returns the file extension for this format, without a leading dot.
    ///
    /// JPEG uses the short `jpg` spelling.
    pub fn extension(self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            ImageFormat::Tiff => "tiff",
            ImageFormat::Jpeg => "jpg",
            ImageFormat::Png => "png",
        }
    }
}
/// Identifies the image format from the leading magic bytes of `bytes`.
///
/// Recognized prefixes:
/// - PNG: `89 50 4E 47` (`\x89PNG`)
/// - JPEG: `FF D8 FF`
/// - TIFF: `49 49 2A 00` (`II*\0`) or `4D 4D 00 2A` (`MM\0*`)
///
/// Only the prefix is inspected; the rest of the buffer is not validated.
///
/// # Errors
///
/// Returns [`DocumentError::UnsupportedInput`] with an empty `path` when
/// `bytes` starts with none of the prefixes above (including when it is too
/// short to contain one).
pub fn detect_image_format(bytes: &[u8]) -> Result<ImageFormat, DocumentError> {
    match bytes {
        [0x89, b'P', b'N', b'G', ..] => Ok(ImageFormat::Png),
        [0xFF, 0xD8, 0xFF, ..] => Ok(ImageFormat::Jpeg),
        [b'I', b'I', 0x2A, 0x00, ..] | [b'M', b'M', 0x00, 0x2A, ..] => Ok(ImageFormat::Tiff),
        _ => Err(DocumentError::UnsupportedInput {
            path: std::path::PathBuf::new(),
            reason: "image bytes are not PNG, JPEG, or TIFF",
        }),
    }
}
/// An image passed by value to [`OcrBackend::recognize`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageInput {
/// Image data as a byte buffer.
pub bytes: Vec<u8>,
/// Format tag accompanying `bytes`.
// NOTE(review): nothing in this type enforces that `format` matches the
// actual content of `bytes`; presumably callers set it via
// `detect_image_format` — confirm.
pub format: ImageFormat,
/// Optional DPI value for the image; `None` when not supplied.
pub dpi: Option<u32>,
}
/// Newtype wrapper around a language identifier string.
///
/// The wrapped value is stored as given; no validation or normalization is
/// performed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LanguageTag(String);

impl LanguageTag {
    /// Wraps `tag`, converting it into an owned `String`.
    pub fn new(tag: impl Into<String>) -> Self {
        let owned: String = tag.into();
        LanguageTag(owned)
    }

    /// Borrows the tag as a string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl Default for LanguageTag {
    /// The default tag is `"eng"`.
    fn default() -> Self {
        LanguageTag(String::from("eng"))
    }
}
/// Caller-supplied hints passed to [`OcrBackend::recognize`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OcrHints {
    /// Language tags; the first entry is the one reported by
    /// [`OcrHints::primary_language`].
    pub languages: Vec<LanguageTag>,
}

impl OcrHints {
    /// Builds hints containing exactly one language: `LanguageTag::default()`.
    pub fn english() -> Self {
        let languages = vec![LanguageTag::default()];
        OcrHints { languages }
    }

    /// Returns the first configured language, or `"eng"` when `languages`
    /// is empty.
    pub fn primary_language(&self) -> &str {
        match self.languages.first() {
            Some(tag) => tag.as_str(),
            None => "eng",
        }
    }
}

impl Default for OcrHints {
    /// Same as [`OcrHints::english`].
    fn default() -> Self {
        OcrHints::english()
    }
}
/// Bounding box given as a position (`x`, `y`) and a size (`w`, `h`).
///
/// NOTE(review): units and origin are not stated in this file; presumably
/// pixels measured from the image's top-left corner — confirm against the
/// backends that produce these values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BBox {
/// Horizontal position of the box.
pub x: u32,
/// Vertical position of the box.
pub y: u32,
/// Width of the box.
pub w: u32,
/// Height of the box.
pub h: u32,
}
/// One recognized piece of text together with its location and confidence.
///
/// `Eq` is not derived because `confidence` is a float.
#[derive(Debug, Clone, PartialEq)]
pub struct OcrSpan {
/// Recognized text for this span.
pub text: String,
/// Location of the span.
pub bbox: BBox,
/// Recognition confidence, or `None` when the backend did not provide one.
///
/// `OcrResult::from_spans` multiplies this by 100 before averaging, i.e. it
/// treats the value as a fraction rather than a percentage.
pub confidence: Option<f32>,
}
/// Errors returned by [`OcrBackend::recognize`].
///
/// Each `String` payload is interpolated into the variant's display message.
#[derive(Debug, thiserror::Error)]
pub enum OcrError {
/// Backend initialization failed; displays as `backend init failed: <msg>`.
#[error("backend init failed: {0}")]
InitFailed(String),
/// Recognition failed; displays as `recognize failed: <msg>`.
#[error("recognize failed: {0}")]
RecognizeFailed(String),
/// The given image format is not supported; the format is shown using its
/// `Debug` representation.
#[error("unsupported image format: {0:?}")]
UnsupportedFormat(ImageFormat),
/// Internal backend error; displays as `backend internal error: <msg>`.
#[error("backend internal error: {0}")]
Internal(String),
}
/// Interface implemented by OCR engines.
///
/// Implementors must be `Send + Sync`.
pub trait OcrBackend: Send + Sync {
/// Returns the backend's name.
fn name(&self) -> &str;
/// Runs recognition on `image` using `hints`.
///
/// Both arguments are taken by value. On success returns the recognized
/// spans; on failure returns an [`OcrError`].
fn recognize(&self, image: ImageInput, hints: OcrHints) -> Result<Vec<OcrSpan>, OcrError>;
}
/// Aggregated OCR output: the assembled text plus summary statistics.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal; use [`OcrResult::from_spans`] instead.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct OcrResult {
    /// Recognized text.
    pub text: String,
    /// Mean confidence, or `None` when no confidence was available.
    ///
    /// Values produced by [`OcrResult::from_spans`] are on a percentage scale
    /// (each span confidence is multiplied by 100 before averaging).
    pub mean_confidence: Option<f32>,
    /// Word count.
    ///
    /// When produced by [`OcrResult::from_spans`], this is the number of spans
    /// that carried a confidence value, which can be lower than the number of
    /// spans.
    // NOTE(review): confirm that spans without a confidence are meant to be
    // excluded from this count rather than counted via `spans.len()`.
    pub word_count: usize,
    /// Language string supplied by the caller.
    pub lang: String,
}

impl OcrResult {
    /// Assembles a result directly from its four fields.
    pub(crate) fn new(
        text: String,
        mean_confidence: Option<f32>,
        word_count: usize,
        lang: String,
    ) -> Self {
        OcrResult {
            text,
            mean_confidence,
            word_count,
            lang,
        }
    }

    /// Builds a result from `spans` with column detection disabled.
    ///
    /// Equivalent to calling the column-aware constructor with `false` and
    /// discarding the reported column count.
    pub fn from_spans(spans: &[OcrSpan], lang: String) -> Self {
        let (result, _column_count) = Self::from_spans_with_column_detection(spans, lang, false);
        result
    }

    /// Builds a result from `spans` and also returns the column count reported
    /// by `crate::postprocess::order_spans`.
    ///
    /// The text comes from `order_spans`. The mean confidence is the average
    /// of `confidence * 100` over the spans that have a confidence, and is
    /// `None` when no span has one. `word_count` is set to the number of spans
    /// that contributed to that average.
    pub(crate) fn from_spans_with_column_detection(
        spans: &[OcrSpan],
        lang: String,
        column_detection: bool,
    ) -> (Self, u32) {
        let ordered = crate::postprocess::order_spans(spans, column_detection);

        // Accumulate in f64, in span order, over the spans that report a
        // confidence; the scaling to percent happens in f32 before widening.
        let (percent_total, scored) = spans
            .iter()
            .filter_map(|span| span.confidence)
            .fold((0.0f64, 0usize), |(total, seen), confidence| {
                (total + (confidence * 100.0) as f64, seen + 1)
            });

        let mean_confidence = match scored {
            0 => None,
            n => Some((percent_total / n as f64) as f32),
        };

        let result = OcrResult {
            text: ordered.text,
            mean_confidence,
            word_count: scored,
            lang,
        };
        (result, ordered.column_count)
    }

    /// Returns the mean confidence mapped into `0.0..=1.0`.
    ///
    /// A stored value greater than `1.0` is treated as a percentage and
    /// divided by 100; any other value is used as is. The result is then
    /// clamped to `0.0..=1.0`. Returns `None` when no mean confidence is set.
    // NOTE(review): a genuine percentage at or below 1.0 (e.g. 0.5 %) is
    // indistinguishable from a unit-scale value here and is returned unscaled.
    pub(crate) fn mean_confidence_unit(&self) -> Option<f32> {
        let raw = self.mean_confidence?;
        let unit = if raw > 1.0 { raw / 100.0 } else { raw };
        Some(unit.clamp(0.0, 1.0))
    }
}
// Unit tests for confidence normalization, span aggregation, and magic-byte
// format detection.
#[cfg(test)]
mod tests {
use super::*;
// A stored mean above 1.0 is interpreted as a percentage: 91.0 -> 0.91.
#[test]
fn mean_confidence_unit_normalizes_legacy_percent_value() {
let result = OcrResult::new("body".to_string(), Some(91.0), 1, "eng".to_string());
assert_eq!(result.mean_confidence_unit(), Some(0.91));
}
// Four spans: two at x=10 and two at x=280, on rows y=10 and y=30, each with
// confidence 0.8. With column detection enabled the expected output has two
// columns, the x=10 spans first, then a blank line, then the x=280 spans.
#[test]
fn from_spans_reports_detected_columns() {
let spans = vec![
OcrSpan {
text: "A1".to_string(),
bbox: BBox {
x: 10,
y: 10,
w: 30,
h: 10,
},
confidence: Some(0.8),
},
OcrSpan {
text: "B1".to_string(),
bbox: BBox {
x: 280,
y: 10,
w: 30,
h: 10,
},
confidence: Some(0.8),
},
OcrSpan {
text: "A2".to_string(),
bbox: BBox {
x: 10,
y: 30,
w: 30,
h: 10,
},
confidence: Some(0.8),
},
OcrSpan {
text: "B2".to_string(),
bbox: BBox {
x: 280,
y: 30,
w: 30,
h: 10,
},
confidence: Some(0.8),
},
];
let (result, columns) =
OcrResult::from_spans_with_column_detection(&spans, "eng".to_string(), true);
assert_eq!(columns, 2);
assert_eq!(result.text, "A1\nA2\n\nB1\nB2");
// The span confidences are scaled by 100 when averaged, so the stored mean is
// above 1.0 and the unit accessor divides it back down.
assert_eq!(result.mean_confidence_unit(), Some(0.8));
}
// Each supported signature, followed by arbitrary trailing bytes, is detected.
#[test]
fn detect_image_format_accepts_supported_magic_bytes() {
assert_eq!(
detect_image_format(b"\x89PNG\r\n\x1A\nrest").expect("png magic"),
ImageFormat::Png
);
assert_eq!(
detect_image_format(b"\xFF\xD8\xFF\xE0rest").expect("jpeg magic"),
ImageFormat::Jpeg
);
assert_eq!(
detect_image_format(b"II\x2A\x00rest").expect("little-endian tiff magic"),
ImageFormat::Tiff
);
assert_eq!(
detect_image_format(b"MM\x00\x2Arest").expect("big-endian tiff magic"),
ImageFormat::Tiff
);
}
// Bytes with no recognized signature produce `UnsupportedInput`.
#[test]
fn detect_image_format_rejects_unknown_bytes() {
let err = detect_image_format(b"not an image").expect_err("unknown format fails");
assert!(matches!(err, DocumentError::UnsupportedInput { .. }));
}
}