1use std::{cell::RefCell, io::Cursor, str::Utf8Error};
2
3use image::{DynamicImage, GrayImage};
4use leptess::{
5 leptonica::PixError,
6 tesseract::{TessInitError, TessSetVariableError},
7 LepTess, Variable,
8};
9use log::trace;
10use rayon::{broadcast, prelude::*};
11use thiserror::Error;
12
/// Borrowed settings that [`process`] uses to create and drive the OCR engines.
pub struct OcrOpt<'a> {
    /// Optional tessdata directory; forwarded as the `datapath` when an engine is created.
    tessdata_dir: &'a Option<String>,
    /// Language the engines are created with.
    lang: &'a str,
    /// Extra Tesseract variables, set on each engine after the built-in ones.
    config: &'a Vec<(Variable, String)>,
    /// Source resolution set on the engine for every image.
    dpi: i32,
}
20
impl<'a> OcrOpt<'a> {
    /// Bundles the borrowed OCR settings into an `OcrOpt`.
    ///
    /// The arguments are stored as given; nothing is validated or copied here.
    #[must_use]
    pub const fn new(
        tessdata_dir: &'a Option<String>,
        lang: &'a str,
        config: &'a Vec<(Variable, String)>,
        dpi: i32,
    ) -> Self {
        Self {
            tessdata_dir,
            lang,
            config,
            dpi,
        }
    }
}
38
/// Errors that can occur while setting up the OCR engines or recognising an image.
#[derive(Error, Debug)]
pub enum Error {
    /// Creating the Tesseract engine failed.
    #[error("could not initialize tesseract")]
    Initialize(#[from] TessInitError),

    /// A thread's `TESSERACT` slot already held an engine when `process` tried to install one.
    #[error("thread local var `TESSERACT` is already initialized")]
    AlreadyInitialized,

    /// Setting a Tesseract variable on the engine failed.
    #[error("could not set tesseract variable")]
    SetVariable(#[from] TessSetVariableError),

    /// Encoding the image to PNM in memory failed.
    #[error("could not write image to memory")]
    WritePnmImage(#[from] image::ImageError),

    /// The in-memory image could not be handed to the engine.
    #[error("could not set `Tesseract` image")]
    SetImage(#[from] PixError),

    /// Fetching the recognised text from the engine produced a UTF-8 error.
    #[error("could not get `Tesseract` text")]
    GetText(#[from] Utf8Error),
}
66
/// `Result` alias whose error type defaults to this module's [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;
68
thread_local! {
    // Per-thread engine slot. It starts out as `None`; `process` stores a
    // `TesseractWrapper` here before recognition and takes it back out afterwards.
    static TESSERACT: RefCell<Option<TesseractWrapper>> = const { RefCell::new(None) };
}
72
73#[profiling::function]
75pub fn process<Img>(images: Img, opt: &OcrOpt) -> Result<Vec<Result<String>>>
76where
77 Img: IntoParallelIterator<Item = GrayImage>,
78{
79 std::env::set_var("OMP_THREAD_LIMIT", "1");
80
81 let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
83 if TESSERACT.replace(Some(tesseract)).is_some() {
84 return Err(Error::AlreadyInitialized);
85 }
86 broadcast(|ctx| {
88 profiling::scope!("Tesseract Init Wrapper");
89 trace!(
90 "Init tesseract with lang `{}` on thread {}",
91 opt.lang,
92 ctx.index()
93 );
94 let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
95 if TESSERACT.replace(Some(tesseract)).is_some() {
96 return Err(Error::AlreadyInitialized);
97 }
98 Ok::<_, Error>(())
99 })
100 .into_iter()
101 .try_for_each(|init_res| init_res)?;
102
103 let subs = images
105 .into_par_iter()
106 .map(|image| {
107 let text = TESSERACT.with(|tesseract| {
108 profiling::scope!("tesseract_ocr");
109 let mut tesseract = tesseract.borrow_mut();
110 let tesseract = tesseract.as_mut().unwrap();
111 tesseract.set_image(image, opt.dpi)?;
112 tesseract.get_text()
113 })?;
114 Ok(text)
115 })
116 .collect::<Vec<Result<String>>>();
117
118 broadcast(|ctx| {
120 profiling::scope!("Tesseract Drop Wrapper");
121 trace!("Drop TesseractWrapper local var on thread {}", ctx.index());
122 if let Some(tesseract) = TESSERACT.take() {
123 drop(tesseract);
124 }
125 });
126 if let Some(tesseract) = TESSERACT.take() {
128 drop(tesseract);
129 }
130
131 Ok(subs)
132}
133
/// Wrapper around a [`LepTess`] engine; instances are kept in the `TESSERACT` thread-local.
struct TesseractWrapper {
    /// The wrapped engine.
    leptess: LepTess,
}
137
138impl TesseractWrapper {
139 fn new(
140 datapath: Option<&str>,
141 language: impl AsRef<str>,
142 config: &[(Variable, String)],
143 ) -> Result<Self> {
144 profiling::scope!("TesseractWrapper new");
145
146 let mut leptess = LepTess::new(datapath, language.as_ref())?;
147 leptess.set_variable(leptess::Variable::ClassifyEnableLearning, "0")?;
151 leptess.set_variable(leptess::Variable::TesseditPagesegMode, "6")?;
154 leptess.set_variable(leptess::Variable::TesseditCharBlacklist, "|[]")?;
156 leptess.set_variable(leptess::Variable::TesseditDoInvert, "0")?;
158 for (key, value) in config {
160 leptess.set_variable(*key, value)?;
161 }
162 Ok(Self { leptess })
163 }
164
165 #[profiling::function]
167 fn set_image(&mut self, image: GrayImage, dpi: i32) -> Result<()> {
168 let bytes = {
169 profiling::scope!("TesseractWrapper Pnm create");
170 let mut bytes: Cursor<Vec<u8>> = Cursor::new(Vec::new());
171 DynamicImage::ImageLuma8(image).write_to(&mut bytes, image::ImageFormat::Pnm)?;
172 bytes
173 };
174 self.leptess.set_image_from_mem(bytes.get_ref())?;
175 self.leptess.set_source_resolution(dpi);
176 Ok(())
177 }
178
179 #[profiling::function]
181 fn get_text(&mut self) -> Result<String> {
182 Ok(self.leptess.get_utf8_text()?)
183 }
184}