use std::{cell::RefCell, io::Cursor, str::Utf8Error};
use image::{DynamicImage, GrayImage};
use leptess::{
leptonica::PixError,
tesseract::{TessInitError, TessSetVariableError},
LepTess, Variable,
};
use log::trace;
use rayon::{broadcast, prelude::*};
use thiserror::Error;
/// Borrowed settings used to build and drive the Tesseract engines in [`process`].
pub struct OcrOpt<'a> {
/// Optional tessdata directory, forwarded as the data path when an engine is created.
tessdata_dir: &'a Option<String>,
/// Language string forwarded to the engine constructor.
lang: &'a str,
/// Extra `(variable, value)` pairs applied to each engine after the built-in defaults.
config: &'a Vec<(Variable, String)>,
/// Source resolution handed to the engine for every image.
dpi: i32,
}
impl<'a> OcrOpt<'a> {
#[must_use]
pub const fn new(
tessdata_dir: &'a Option<String>,
lang: &'a str,
config: &'a Vec<(Variable, String)>,
dpi: i32,
) -> Self {
Self {
tessdata_dir,
lang,
config,
dpi,
}
}
}
/// Errors produced while setting up Tesseract or running OCR on an image.
#[derive(Error, Debug)]
pub enum Error {
/// Creating the underlying `LepTess` engine failed.
#[error("could not initialize tesseract")]
Initialize(#[from] TessInitError),
/// The current thread's `TESSERACT` slot already held an engine when a new one was stored.
#[error("thread local var `TESSERACT` is already initialized")]
AlreadyInitialized,
/// Setting a Tesseract variable (built-in default or caller-supplied) failed.
#[error("could not set tesseract variable")]
SetVariable(#[from] TessSetVariableError),
/// Encoding the grayscale image into an in-memory PNM buffer failed.
#[error("could not write image to memory")]
WritePnmImage(#[from] image::ImageError),
/// The engine rejected the encoded image.
#[error("could not set `Tesseract` image")]
SetImage(#[from] PixError),
/// The recognized text could not be returned as UTF-8.
#[error("could not get `Tesseract` text")]
GetText(#[from] Utf8Error),
}
/// `Result` alias whose error type defaults to this module's [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;
// Per-thread Tesseract engine slot. It starts out as `None`; `process` fills it
// on the calling thread and on every rayon worker before OCR starts and empties
// it again afterwards.
thread_local! {
static TESSERACT: RefCell<Option<TesseractWrapper>> = const { RefCell::new(None) };
}
#[profiling::function]
pub fn process<Img>(images: Img, opt: &OcrOpt) -> Result<Vec<Result<String>>>
where
Img: IntoParallelIterator<Item = GrayImage>,
{
std::env::set_var("OMP_THREAD_LIMIT", "1");
let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
if TESSERACT.replace(Some(tesseract)).is_some() {
return Err(Error::AlreadyInitialized);
}
broadcast(|ctx| {
profiling::scope!("Tesseract Init Wrapper");
trace!(
"Init tesseract with lang `{}` on thread {}",
opt.lang,
ctx.index()
);
let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
if TESSERACT.replace(Some(tesseract)).is_some() {
return Err(Error::AlreadyInitialized);
}
Ok::<_, Error>(())
})
.into_iter()
.try_for_each(|init_res| init_res)?;
let subs = images
.into_par_iter()
.map(|image| {
let text = TESSERACT.with(|tesseract| {
profiling::scope!("tesseract_ocr");
let mut tesseract = tesseract.borrow_mut();
let tesseract = tesseract.as_mut().unwrap();
tesseract.set_image(image, opt.dpi)?;
tesseract.get_text()
})?;
Ok(text)
})
.collect::<Vec<Result<String>>>();
broadcast(|ctx| {
profiling::scope!("Tesseract Drop Wrapper");
trace!("Drop TesseractWrapper local var on thread {}", ctx.index());
if let Some(tesseract) = TESSERACT.take() {
drop(tesseract);
}
});
if let Some(tesseract) = TESSERACT.take() {
drop(tesseract);
}
Ok(subs)
}
/// Thin wrapper around a configured `LepTess` engine; one instance lives in each
/// thread's `TESSERACT` slot while `process` is running.
struct TesseractWrapper {
/// The underlying engine that receives images and produces text.
leptess: LepTess,
}
impl TesseractWrapper {
fn new(
datapath: Option<&str>,
language: impl AsRef<str>,
config: &[(Variable, String)],
) -> Result<Self> {
profiling::scope!("TesseractWrapper new");
let mut leptess = LepTess::new(datapath, language.as_ref())?;
leptess.set_variable(leptess::Variable::ClassifyEnableLearning, "0")?;
leptess.set_variable(leptess::Variable::TesseditPagesegMode, "6")?;
leptess.set_variable(leptess::Variable::TesseditCharBlacklist, "|[]")?;
leptess.set_variable(leptess::Variable::TesseditDoInvert, "0")?;
for (key, value) in config {
leptess.set_variable(*key, value)?;
}
Ok(Self { leptess })
}
#[profiling::function]
fn set_image(&mut self, image: GrayImage, dpi: i32) -> Result<()> {
let bytes = {
profiling::scope!("TesseractWrapper Pnm create");
let mut bytes: Cursor<Vec<u8>> = Cursor::new(Vec::new());
DynamicImage::ImageLuma8(image).write_to(&mut bytes, image::ImageFormat::Pnm)?;
bytes
};
self.leptess.set_image_from_mem(bytes.get_ref())?;
self.leptess.set_source_resolution(dpi);
Ok(())
}
#[profiling::function]
fn get_text(&mut self) -> Result<String> {
Ok(self.leptess.get_utf8_text()?)
}
}