Skip to main content

apple_vision/recognize_text/
mod.rs

1//! [`TextRecognizer`] — wraps `VNRecognizeTextRequest` for image-file OCR.
2
3use core::ffi::c_char;
4use core::ptr;
5use std::ffi::CString;
6use std::path::Path;
7
8use crate::error::{from_swift, VisionError};
9use crate::ffi;
10
/// Recognition strategy passed to Vision. `Fast` is real-time-friendly,
/// `Accurate` does layout analysis and is significantly slower.
///
/// The explicit discriminants are the raw integers produced by
/// [`RecognitionLevel::as_raw`] and handed across the FFI boundary.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RecognitionLevel {
    /// Raw value `0`.
    Fast = 0,
    /// Raw value `1`. This is the [`Default`] level.
    #[default]
    Accurate = 1,
}

impl RecognitionLevel {
    /// Integer encoding of this level, as passed to the bridge functions.
    pub(crate) const fn as_raw(self) -> i32 {
        // Fieldless enum with explicit discriminants: the cast yields exactly
        // the value written next to each variant above.
        self as i32
    }
}
30
/// Rectangle expressed in normalised (0.0..=1.0) image coordinates.
///
/// The coordinate origin is the bottom-left corner of the image (Vision
/// convention), so flip `y` if you want a top-left origin.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct BoundingBox {
    /// Normalised x coordinate.
    pub x: f64,
    /// Normalised y coordinate (bottom-left origin).
    pub y: f64,
    /// Normalised width.
    pub width: f64,
    /// Normalised height.
    pub height: f64,
}
40
41/// One recognised text observation.
42#[derive(Debug, Clone, PartialEq)]
43pub struct RecognizedText {
44    pub text: String,
45    /// Confidence in `0.0..=1.0`. `0` means Vision didn't report one.
46    pub confidence: f32,
47    pub bounding_box: BoundingBox,
48}
49
50impl RecognizedText {
51    /// Clone this value into the dedicated `VNRecognizedText` wrapper.
52    #[must_use]
53    pub fn candidate(&self) -> RecognizedTextCandidate {
54        self.clone().into()
55    }
56}
57
58/// Dedicated `VNRecognizedText` wrapper.
59#[derive(Debug, Clone, PartialEq)]
60pub struct RecognizedTextCandidate {
61    pub text: String,
62    pub confidence: f32,
63    pub bounding_box: BoundingBox,
64}
65
66impl From<RecognizedText> for RecognizedTextCandidate {
67    fn from(value: RecognizedText) -> Self {
68        Self {
69            text: value.text,
70            confidence: value.confidence,
71            bounding_box: value.bounding_box,
72        }
73    }
74}
75
76impl From<RecognizedTextCandidate> for RecognizedText {
77    fn from(value: RecognizedTextCandidate) -> Self {
78        Self {
79            text: value.text,
80            confidence: value.confidence,
81            bounding_box: value.bounding_box,
82        }
83    }
84}
85
86/// OCR engine.
87///
88/// # Examples
89///
90/// ```rust,no_run
91/// use apple_vision::recognize_text::{TextRecognizer, RecognitionLevel};
92///
93/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
94/// let recognizer = TextRecognizer::new()
95///     .with_recognition_level(RecognitionLevel::Accurate)
96///     .with_language_correction(true);
97/// let observations = recognizer.recognize_in_path("screenshot.png")?;
98/// for obs in &observations {
99///     println!("{:.2} {:?}: {}", obs.confidence, obs.bounding_box, obs.text);
100/// }
101/// # Ok(())
102/// # }
103/// ```
104#[derive(Debug, Clone)]
105pub struct TextRecognizer {
106    recognition_level: RecognitionLevel,
107    uses_language_correction: bool,
108}
109
110impl Default for TextRecognizer {
111    fn default() -> Self {
112        Self::new()
113    }
114}
115
116impl TextRecognizer {
117    /// Construct with the Vision defaults: accurate mode + language
118    /// correction enabled.
119    #[must_use]
120    pub const fn new() -> Self {
121        Self {
122            recognition_level: RecognitionLevel::Accurate,
123            uses_language_correction: true,
124        }
125    }
126
127    #[must_use]
128    pub const fn with_recognition_level(mut self, level: RecognitionLevel) -> Self {
129        self.recognition_level = level;
130        self
131    }
132
133    #[must_use]
134    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
135        self.uses_language_correction = enabled;
136        self
137    }
138
139    /// Recognise text in the image at `path`. Supports any format
140    /// CoreGraphics' `ImageIO` can read (PNG, JPEG, HEIC, TIFF, ...).
141    ///
142    /// Returns observations in Vision's natural ordering (top-down,
143    /// left-to-right after layout analysis).
144    ///
145    /// # Errors
146    ///
147    /// Returns [`VisionError::InvalidArgument`] if `path` contains an
148    /// interior NUL byte, [`VisionError::ImageLoadFailed`] if the image
149    /// can't be read, or [`VisionError::RequestFailed`] if Vision rejects
150    /// the request.
151    pub fn recognize_in_path(
152        &self,
153        path: impl AsRef<Path>,
154    ) -> Result<Vec<RecognizedText>, VisionError> {
155        let path_str = path
156            .as_ref()
157            .to_str()
158            .ok_or_else(|| VisionError::InvalidArgument("non-UTF-8 path".into()))?;
159        let path_c = CString::new(path_str)
160            .map_err(|e| VisionError::InvalidArgument(format!("path NUL byte: {e}")))?;
161
162        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
163        let mut out_count: usize = 0;
164        let mut err_msg: *mut c_char = ptr::null_mut();
165        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
166        let status = unsafe {
167            ffi::vn_recognize_text_in_path(
168                path_c.as_ptr(),
169                self.recognition_level.as_raw(),
170                self.uses_language_correction,
171                &mut out_array,
172                &mut out_count,
173                &mut err_msg,
174            )
175        };
176        if status != ffi::status::OK {
177            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
178            return Err(unsafe { from_swift(status, err_msg) });
179        }
180
181        // Empty result is success (no text detected).
182        if out_array.is_null() || out_count == 0 {
183            return Ok(Vec::new());
184        }
185
186        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
187        let mut results = Vec::with_capacity(out_count);
188        for i in 0..out_count {
189            // SAFETY: the pointer is valid for the reported element count; the index is in bounds.
190            let raw = unsafe { &*typed_array.add(i) };
191            let text = if raw.text.is_null() {
192                String::new()
193            } else {
194                // SAFETY: the C string pointer is non-null (checked above) and valid for the duration of this borrow.
195                unsafe { core::ffi::CStr::from_ptr(raw.text) }
196                    .to_string_lossy()
197                    .into_owned()
198            };
199            results.push(RecognizedText {
200                text,
201                confidence: raw.confidence,
202                bounding_box: BoundingBox {
203                    x: raw.bbox_x,
204                    y: raw.bbox_y,
205                    width: raw.bbox_w,
206                    height: raw.bbox_h,
207                },
208            });
209        }
210
211        // SAFETY: the pointer/count pair was allocated by the bridge and is freed exactly once here.
212        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
213        Ok(results)
214    }
215
216    /// Recognise text in a [`CVPixelBuffer`](apple_cf::cv::CVPixelBuffer)
217    /// directly, without a PNG round-trip. This is the zero-copy path for
218    /// live capture pipelines (screencapturekit / videotoolbox decoder /
219    /// `AVCaptureSession`).
220    ///
221    /// # Errors
222    ///
223    /// Returns [`VisionError::RequestFailed`] if Vision rejects the buffer
224    /// (e.g. unsupported pixel format).
225    pub fn recognize_in_pixel_buffer(
226        &self,
227        pixel_buffer: &apple_cf::cv::CVPixelBuffer,
228    ) -> Result<Vec<RecognizedText>, VisionError> {
229        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
230        let mut out_count: usize = 0;
231        let mut err_msg: *mut c_char = ptr::null_mut();
232        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
233        let status = unsafe {
234            ffi::vn_recognize_text_in_pixel_buffer(
235                pixel_buffer.as_ptr(),
236                self.recognition_level.as_raw(),
237                self.uses_language_correction,
238                &mut out_array,
239                &mut out_count,
240                &mut err_msg,
241            )
242        };
243        if status != ffi::status::OK {
244            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
245            return Err(unsafe { from_swift(status, err_msg) });
246        }
247        if out_array.is_null() || out_count == 0 {
248            return Ok(Vec::new());
249        }
250
251        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
252        let mut results = Vec::with_capacity(out_count);
253        for i in 0..out_count {
254            // SAFETY: the pointer is valid for the reported element count; the index is in bounds.
255            let raw = unsafe { &*typed_array.add(i) };
256            let text = if raw.text.is_null() {
257                String::new()
258            } else {
259                // SAFETY: the C string pointer is non-null (checked above) and valid for the duration of this borrow.
260                unsafe { core::ffi::CStr::from_ptr(raw.text) }
261                    .to_string_lossy()
262                    .into_owned()
263            };
264            results.push(RecognizedText {
265                text,
266                confidence: raw.confidence,
267                bounding_box: BoundingBox {
268                    x: raw.bbox_x,
269                    y: raw.bbox_y,
270                    width: raw.bbox_w,
271                    height: raw.bbox_h,
272                },
273            });
274        }
275
276        // SAFETY: the pointer/count pair was allocated by the bridge and is freed exactly once here.
277        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
278        Ok(results)
279    }
280}
281
282#[doc(hidden)]
283/// Test helper used by the smoke test — renders `text` to a PNG so OCR can
284/// be exercised without bundling fixture files. Not part of the stable API.
285pub fn _test_helper_render_text_png(
286    text: &str,
287    width: i32,
288    height: i32,
289    path: &Path,
290) -> Result<(), VisionError> {
291    let text_c = CString::new(text).map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
292    let path_c = CString::new(path.to_string_lossy().as_ref())
293        .map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
294    // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
295    let status = unsafe {
296        ffi::vn_test_helper_render_text_png(text_c.as_ptr(), width, height, path_c.as_ptr())
297    };
298    if status != ffi::status::OK {
299        return Err(VisionError::Unknown {
300            code: status,
301            message: "render helper failed".into(),
302        });
303    }
304    Ok(())
305}