subtile_ocr/
ocr.rs

1use std::{cell::RefCell, io::Cursor, str::Utf8Error};
2
3use image::{DynamicImage, GrayImage};
4use leptess::{
5    leptonica::PixError,
6    tesseract::{TessInitError, TessSetVariableError},
7    LepTess, Variable,
8};
9use log::trace;
10use rayon::{broadcast, prelude::*};
11use thiserror::Error;
12
13/// Options for orc with Tesseract
14pub struct OcrOpt<'a> {
15    tessdata_dir: &'a Option<String>,
16    lang: &'a str,
17    config: &'a Vec<(Variable, String)>,
18    dpi: i32,
19}
20
21impl<'a> OcrOpt<'a> {
22    /// Create a new `OcrOpt`
23    #[must_use]
24    pub const fn new(
25        tessdata_dir: &'a Option<String>,
26        lang: &'a str,
27        config: &'a Vec<(Variable, String)>,
28        dpi: i32,
29    ) -> Self {
30        Self {
31            tessdata_dir,
32            lang,
33            config,
34            dpi,
35        }
36    }
37}
38
39/// Error of the Ocr process with Tesseract.
40#[derive(Error, Debug)]
41pub enum Error {
42    /// Indicate than `Tesseract` could not be initialized.
43    #[error("could not initialize tesseract")]
44    Initialize(#[from] TessInitError),
45
46    /// Indicate than `TESSERACT` was already initialized on this thread
47    #[error("thread local var `TESSERACT` is already initialized")]
48    AlreadyInitialized,
49
50    /// Indicate an error during `Tesseract` variable set.
51    #[error("could not set tesseract variable")]
52    SetVariable(#[from] TessSetVariableError),
53
54    /// Indicate than the `pnm` image couldn't be wrote in memory.
55    #[error("could not write image to memory")]
56    WritePnmImage(#[from] image::ImageError),
57
58    /// Indicate a failure during set `Pnm` image to `Tesseract`.
59    #[error("could not set `Tesseract` image")]
60    SetImage(#[from] PixError),
61
62    /// Indicate than `Tesseract` failed to provide a text from the image.
63    #[error("could not get `Tesseract` text")]
64    GetText(#[from] Utf8Error),
65}
66
67pub type Result<T, E = Error> = std::result::Result<T, E>;
68
69thread_local! {
70    static TESSERACT: RefCell<Option<TesseractWrapper>> = const { RefCell::new(None) };
71}
72
73/// Process subtitles images with Tesseract `OCR`.
74#[profiling::function]
75pub fn process<Img>(images: Img, opt: &OcrOpt) -> Result<Vec<Result<String>>>
76where
77    Img: IntoParallelIterator<Item = GrayImage>,
78{
79    std::env::set_var("OMP_THREAD_LIMIT", "1");
80
81    // Init tesseract on the main thread:
82    let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
83    if TESSERACT.replace(Some(tesseract)).is_some() {
84        return Err(Error::AlreadyInitialized);
85    }
86    // and on threadpool:
87    broadcast(|ctx| {
88        profiling::scope!("Tesseract Init Wrapper");
89        trace!(
90            "Init tesseract with lang `{}` on thread {}",
91            opt.lang,
92            ctx.index()
93        );
94        let tesseract = TesseractWrapper::new(opt.tessdata_dir.as_deref(), opt.lang, opt.config)?;
95        if TESSERACT.replace(Some(tesseract)).is_some() {
96            return Err(Error::AlreadyInitialized);
97        }
98        Ok::<_, Error>(())
99    })
100    .into_iter()
101    .try_for_each(|init_res| init_res)?;
102
103    // Process images
104    let subs = images
105        .into_par_iter()
106        .map(|image| {
107            let text = TESSERACT.with(|tesseract| {
108                profiling::scope!("tesseract_ocr");
109                let mut tesseract = tesseract.borrow_mut();
110                let tesseract = tesseract.as_mut().unwrap();
111                tesseract.set_image(image, opt.dpi)?;
112                tesseract.get_text()
113            })?;
114            Ok(text)
115        })
116        .collect::<Vec<Result<String>>>();
117
118    // Clean tesseract from Thread local vars for Threadpool
119    broadcast(|ctx| {
120        profiling::scope!("Tesseract Drop Wrapper");
121        trace!("Drop TesseractWrapper local var on thread {}", ctx.index());
122        if let Some(tesseract) = TESSERACT.take() {
123            drop(tesseract);
124        }
125    });
126    // ... for main thread
127    if let Some(tesseract) = TESSERACT.take() {
128        drop(tesseract);
129    }
130
131    Ok(subs)
132}
133
134struct TesseractWrapper {
135    leptess: LepTess,
136}
137
138impl TesseractWrapper {
139    fn new(
140        datapath: Option<&str>,
141        language: impl AsRef<str>,
142        config: &[(Variable, String)],
143    ) -> Result<Self> {
144        profiling::scope!("TesseractWrapper new");
145
146        let mut leptess = LepTess::new(datapath, language.as_ref())?;
147        // Disable learning by default, though a user could re-enable this
148        // option with `-c`. We turn this off since we are are multithreading,
149        // so this option would result in non-deterministic output.
150        leptess.set_variable(leptess::Variable::ClassifyEnableLearning, "0")?;
151        // 6 is PSM_SINGLE_BLOCK. We have preprocessed the input into individual
152        // lines, and telling Tesseract this fact greatly improves accuracy.
153        leptess.set_variable(leptess::Variable::TesseditPagesegMode, "6")?;
154        // Avoid interpreting the characters I, l as |
155        leptess.set_variable(leptess::Variable::TesseditCharBlacklist, "|[]")?;
156        // Avoid than tesseract tried to invert the image
157        leptess.set_variable(leptess::Variable::TesseditDoInvert, "0")?;
158        // Add user options.
159        for (key, value) in config {
160            leptess.set_variable(*key, value)?;
161        }
162        Ok(Self { leptess })
163    }
164
165    /// Set the tesseract image to the given image's contents.
166    #[profiling::function]
167    fn set_image(&mut self, image: GrayImage, dpi: i32) -> Result<()> {
168        let bytes = {
169            profiling::scope!("TesseractWrapper Pnm create");
170            let mut bytes: Cursor<Vec<u8>> = Cursor::new(Vec::new());
171            DynamicImage::ImageLuma8(image).write_to(&mut bytes, image::ImageFormat::Pnm)?;
172            bytes
173        };
174        self.leptess.set_image_from_mem(bytes.get_ref())?;
175        self.leptess.set_source_resolution(dpi);
176        Ok(())
177    }
178
179    /// Get text.
180    #[profiling::function]
181    fn get_text(&mut self) -> Result<String> {
182        Ok(self.leptess.get_utf8_text()?)
183    }
184}