Skip to main content

apple_vision/recognize_text/
mod.rs

1//! [`TextRecognizer`] — wraps `VNRecognizeTextRequest` for image-file OCR.
2
3use core::ffi::c_char;
4use core::ptr;
5use std::ffi::CString;
6use std::path::Path;
7
8use crate::error::{from_swift, VisionError};
9use crate::ffi;
10
/// Speed/quality trade-off requested from Vision for a recognition pass.
///
/// The default is [`RecognitionLevel::Accurate`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RecognitionLevel {
    /// Real-time-friendly recognition.
    Fast,
    /// Performs layout analysis; significantly slower than [`Self::Fast`].
    #[default]
    Accurate,
}
21
22impl RecognitionLevel {
23    const fn as_raw(self) -> i32 {
24        match self {
25            Self::Fast => 0,
26            Self::Accurate => 1,
27        }
28    }
29}
30
/// Rectangle expressed in normalised image coordinates (`0.0..=1.0`).
///
/// The origin is the bottom-left corner of the image (Vision convention);
/// flip `y` if you need a top-left origin.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct BoundingBox {
    /// Horizontal position of the box.
    pub x: f64,
    /// Vertical position of the box, measured up from the bottom edge.
    pub y: f64,
    /// Horizontal extent of the box.
    pub width: f64,
    /// Vertical extent of the box.
    pub height: f64,
}
40
41/// One recognised text observation.
42#[derive(Debug, Clone, PartialEq)]
43pub struct RecognizedText {
44    pub text: String,
45    /// Confidence in `0.0..=1.0`. `0` means Vision didn't report one.
46    pub confidence: f32,
47    pub bounding_box: BoundingBox,
48}
49
50/// OCR engine.
51///
52/// # Examples
53///
54/// ```rust,no_run
55/// use apple_vision::recognize_text::{TextRecognizer, RecognitionLevel};
56///
57/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
58/// let recognizer = TextRecognizer::new()
59///     .with_recognition_level(RecognitionLevel::Accurate)
60///     .with_language_correction(true);
61/// let observations = recognizer.recognize_in_path("screenshot.png")?;
62/// for obs in &observations {
63///     println!("{:.2} {:?}: {}", obs.confidence, obs.bounding_box, obs.text);
64/// }
65/// # Ok(())
66/// # }
67/// ```
68#[derive(Debug, Clone)]
69pub struct TextRecognizer {
70    recognition_level: RecognitionLevel,
71    uses_language_correction: bool,
72}
73
74impl Default for TextRecognizer {
75    fn default() -> Self {
76        Self::new()
77    }
78}
79
80impl TextRecognizer {
81    /// Construct with the Vision defaults: accurate mode + language
82    /// correction enabled.
83    #[must_use]
84    pub const fn new() -> Self {
85        Self {
86            recognition_level: RecognitionLevel::Accurate,
87            uses_language_correction: true,
88        }
89    }
90
91    #[must_use]
92    pub const fn with_recognition_level(mut self, level: RecognitionLevel) -> Self {
93        self.recognition_level = level;
94        self
95    }
96
97    #[must_use]
98    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
99        self.uses_language_correction = enabled;
100        self
101    }
102
103    /// Recognise text in the image at `path`. Supports any format
104    /// CoreGraphics' `ImageIO` can read (PNG, JPEG, HEIC, TIFF, ...).
105    ///
106    /// Returns observations in Vision's natural ordering (top-down,
107    /// left-to-right after layout analysis).
108    ///
109    /// # Errors
110    ///
111    /// Returns [`VisionError::InvalidArgument`] if `path` contains an
112    /// interior NUL byte, [`VisionError::ImageLoadFailed`] if the image
113    /// can't be read, or [`VisionError::RequestFailed`] if Vision rejects
114    /// the request.
115    pub fn recognize_in_path(
116        &self,
117        path: impl AsRef<Path>,
118    ) -> Result<Vec<RecognizedText>, VisionError> {
119        let path_str = path
120            .as_ref()
121            .to_str()
122            .ok_or_else(|| VisionError::InvalidArgument("non-UTF-8 path".into()))?;
123        let path_c = CString::new(path_str)
124            .map_err(|e| VisionError::InvalidArgument(format!("path NUL byte: {e}")))?;
125
126        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
127        let mut out_count: usize = 0;
128        let mut err_msg: *mut c_char = ptr::null_mut();
129        let status = unsafe {
130            ffi::vn_recognize_text_in_path(
131                path_c.as_ptr(),
132                self.recognition_level.as_raw(),
133                self.uses_language_correction,
134                &mut out_array,
135                &mut out_count,
136                &mut err_msg,
137            )
138        };
139        if status != ffi::status::OK {
140            return Err(unsafe { from_swift(status, err_msg) });
141        }
142
143        // Empty result is success (no text detected).
144        if out_array.is_null() || out_count == 0 {
145            return Ok(Vec::new());
146        }
147
148        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
149        let mut results = Vec::with_capacity(out_count);
150        for i in 0..out_count {
151            let raw = unsafe { &*typed_array.add(i) };
152            let text = if raw.text.is_null() {
153                String::new()
154            } else {
155                unsafe { core::ffi::CStr::from_ptr(raw.text) }
156                    .to_string_lossy()
157                    .into_owned()
158            };
159            results.push(RecognizedText {
160                text,
161                confidence: raw.confidence,
162                bounding_box: BoundingBox {
163                    x: raw.bbox_x,
164                    y: raw.bbox_y,
165                    width: raw.bbox_w,
166                    height: raw.bbox_h,
167                },
168            });
169        }
170
171        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
172        Ok(results)
173    }
174
175    /// Recognise text in a [`CVPixelBuffer`](apple_cf::cv::CVPixelBuffer)
176    /// directly, without a PNG round-trip. This is the zero-copy path for
177    /// live capture pipelines (screencapturekit / videotoolbox decoder /
178    /// `AVCaptureSession`).
179    ///
180    /// # Errors
181    ///
182    /// Returns [`VisionError::RequestFailed`] if Vision rejects the buffer
183    /// (e.g. unsupported pixel format).
184    pub fn recognize_in_pixel_buffer(
185        &self,
186        pixel_buffer: &apple_cf::cv::CVPixelBuffer,
187    ) -> Result<Vec<RecognizedText>, VisionError> {
188        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
189        let mut out_count: usize = 0;
190        let mut err_msg: *mut c_char = ptr::null_mut();
191        let status = unsafe {
192            ffi::vn_recognize_text_in_pixel_buffer(
193                pixel_buffer.as_ptr(),
194                self.recognition_level.as_raw(),
195                self.uses_language_correction,
196                &mut out_array,
197                &mut out_count,
198                &mut err_msg,
199            )
200        };
201        if status != ffi::status::OK {
202            return Err(unsafe { from_swift(status, err_msg) });
203        }
204        if out_array.is_null() || out_count == 0 {
205            return Ok(Vec::new());
206        }
207
208        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
209        let mut results = Vec::with_capacity(out_count);
210        for i in 0..out_count {
211            let raw = unsafe { &*typed_array.add(i) };
212            let text = if raw.text.is_null() {
213                String::new()
214            } else {
215                unsafe { core::ffi::CStr::from_ptr(raw.text) }
216                    .to_string_lossy()
217                    .into_owned()
218            };
219            results.push(RecognizedText {
220                text,
221                confidence: raw.confidence,
222                bounding_box: BoundingBox {
223                    x: raw.bbox_x,
224                    y: raw.bbox_y,
225                    width: raw.bbox_w,
226                    height: raw.bbox_h,
227                },
228            });
229        }
230
231        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
232        Ok(results)
233    }
234}
235
236#[doc(hidden)]
237/// Test helper used by the smoke test — renders `text` to a PNG so OCR can
238/// be exercised without bundling fixture files. Not part of the stable API.
239pub fn _test_helper_render_text_png(
240    text: &str,
241    width: i32,
242    height: i32,
243    path: &Path,
244) -> Result<(), VisionError> {
245    let text_c = CString::new(text).map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
246    let path_c = CString::new(path.to_string_lossy().as_ref())
247        .map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
248    let status = unsafe {
249        ffi::vn_test_helper_render_text_png(text_c.as_ptr(), width, height, path_c.as_ptr())
250    };
251    if status != ffi::status::OK {
252        return Err(VisionError::Unknown {
253            code: status,
254            message: "render helper failed".into(),
255        });
256    }
257    Ok(())
258}