Skip to main content

apple_vision/recognize_text/
mod.rs

1//! [`TextRecognizer`] — wraps `VNRecognizeTextRequest` for image-file OCR.
2
3use core::ffi::c_char;
4use core::ptr;
5use std::ffi::CString;
6use std::path::Path;
7
8use crate::error::{from_swift, VisionError};
9use crate::ffi;
10
/// How much effort Vision should put into recognition.
///
/// `Fast` is real-time-friendly; `Accurate` does layout analysis and is
/// significantly slower.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum RecognitionLevel {
    /// Real-time-friendly recognition.
    Fast,
    /// Recognition with layout analysis; significantly slower than `Fast`.
    Accurate,
}
19
20impl Default for RecognitionLevel {
21    fn default() -> Self {
22        Self::Accurate
23    }
24}
25
26impl RecognitionLevel {
27    const fn as_raw(self) -> i32 {
28        match self {
29            Self::Fast => 0,
30            Self::Accurate => 1,
31        }
32    }
33}
34
/// Rectangle expressed in normalised (0.0..=1.0) image coordinates.
///
/// The origin is at the bottom-left of the image (Vision convention), so
/// flip `y` if you need a top-left origin.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct BoundingBox {
    /// Normalised horizontal position.
    pub x: f64,
    /// Normalised vertical position, measured from the bottom of the image.
    pub y: f64,
    /// Normalised width.
    pub width: f64,
    /// Normalised height.
    pub height: f64,
}
44
45/// One recognised text observation.
46#[derive(Debug, Clone, PartialEq)]
47pub struct RecognizedText {
48    pub text: String,
49    /// Confidence in `0.0..=1.0`. `0` means Vision didn't report one.
50    pub confidence: f32,
51    pub bounding_box: BoundingBox,
52}
53
54/// OCR engine.
55///
56/// # Examples
57///
58/// ```rust,no_run
59/// use apple_vision::recognize_text::{TextRecognizer, RecognitionLevel};
60///
61/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
62/// let recognizer = TextRecognizer::new()
63///     .with_recognition_level(RecognitionLevel::Accurate)
64///     .with_language_correction(true);
65/// let observations = recognizer.recognize_in_path("/tmp/screenshot.png")?;
66/// for obs in &observations {
67///     println!("{:.2} {:?}: {}", obs.confidence, obs.bounding_box, obs.text);
68/// }
69/// # Ok(())
70/// # }
71/// ```
72#[derive(Debug, Clone)]
73pub struct TextRecognizer {
74    recognition_level: RecognitionLevel,
75    uses_language_correction: bool,
76}
77
78impl Default for TextRecognizer {
79    fn default() -> Self {
80        Self::new()
81    }
82}
83
84impl TextRecognizer {
85    /// Construct with the Vision defaults: accurate mode + language
86    /// correction enabled.
87    #[must_use]
88    pub const fn new() -> Self {
89        Self {
90            recognition_level: RecognitionLevel::Accurate,
91            uses_language_correction: true,
92        }
93    }
94
95    #[must_use]
96    pub const fn with_recognition_level(mut self, level: RecognitionLevel) -> Self {
97        self.recognition_level = level;
98        self
99    }
100
101    #[must_use]
102    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
103        self.uses_language_correction = enabled;
104        self
105    }
106
107    /// Recognise text in the image at `path`. Supports any format
108    /// CoreGraphics' ImageIO can read (PNG, JPEG, HEIC, TIFF, ...).
109    ///
110    /// Returns observations in Vision's natural ordering (top-down,
111    /// left-to-right after layout analysis).
112    ///
113    /// # Errors
114    ///
115    /// Returns [`VisionError::InvalidArgument`] if `path` contains an
116    /// interior NUL byte, [`VisionError::ImageLoadFailed`] if the image
117    /// can't be read, or [`VisionError::RequestFailed`] if Vision rejects
118    /// the request.
119    pub fn recognize_in_path(
120        &self,
121        path: impl AsRef<Path>,
122    ) -> Result<Vec<RecognizedText>, VisionError> {
123        let path_str = path
124            .as_ref()
125            .to_str()
126            .ok_or_else(|| VisionError::InvalidArgument("non-UTF-8 path".into()))?;
127        let path_c = CString::new(path_str)
128            .map_err(|e| VisionError::InvalidArgument(format!("path NUL byte: {e}")))?;
129
130        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
131        let mut out_count: usize = 0;
132        let mut err_msg: *mut c_char = ptr::null_mut();
133        let status = unsafe {
134            ffi::vn_recognize_text_in_path(
135                path_c.as_ptr(),
136                self.recognition_level.as_raw(),
137                self.uses_language_correction,
138                &mut out_array,
139                &mut out_count,
140                &mut err_msg,
141            )
142        };
143        if status != ffi::status::OK {
144            return Err(unsafe { from_swift(status, err_msg) });
145        }
146
147        // Empty result is success (no text detected).
148        if out_array.is_null() || out_count == 0 {
149            return Ok(Vec::new());
150        }
151
152        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
153        let mut results = Vec::with_capacity(out_count);
154        for i in 0..out_count {
155            let raw = unsafe { &*typed_array.add(i) };
156            let text = if raw.text.is_null() {
157                String::new()
158            } else {
159                unsafe { core::ffi::CStr::from_ptr(raw.text) }
160                    .to_string_lossy()
161                    .into_owned()
162            };
163            results.push(RecognizedText {
164                text,
165                confidence: raw.confidence,
166                bounding_box: BoundingBox {
167                    x: raw.bbox_x,
168                    y: raw.bbox_y,
169                    width: raw.bbox_w,
170                    height: raw.bbox_h,
171                },
172            });
173        }
174
175        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
176        Ok(results)
177    }
178
179    /// Recognise text in a [`CVPixelBuffer`](apple_cf::cv::CVPixelBuffer)
180    /// directly, without a PNG round-trip. This is the zero-copy path for
181    /// live capture pipelines (screencapturekit / videotoolbox decoder /
182    /// AVCaptureSession).
183    ///
184    /// # Errors
185    ///
186    /// Returns [`VisionError::RequestFailed`] if Vision rejects the buffer
187    /// (e.g. unsupported pixel format).
188    pub fn recognize_in_pixel_buffer(
189        &self,
190        pixel_buffer: &apple_cf::cv::CVPixelBuffer,
191    ) -> Result<Vec<RecognizedText>, VisionError> {
192        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
193        let mut out_count: usize = 0;
194        let mut err_msg: *mut c_char = ptr::null_mut();
195        let status = unsafe {
196            ffi::vn_recognize_text_in_pixel_buffer(
197                pixel_buffer.as_ptr(),
198                self.recognition_level.as_raw(),
199                self.uses_language_correction,
200                &mut out_array,
201                &mut out_count,
202                &mut err_msg,
203            )
204        };
205        if status != ffi::status::OK {
206            return Err(unsafe { from_swift(status, err_msg) });
207        }
208        if out_array.is_null() || out_count == 0 {
209            return Ok(Vec::new());
210        }
211
212        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
213        let mut results = Vec::with_capacity(out_count);
214        for i in 0..out_count {
215            let raw = unsafe { &*typed_array.add(i) };
216            let text = if raw.text.is_null() {
217                String::new()
218            } else {
219                unsafe { core::ffi::CStr::from_ptr(raw.text) }
220                    .to_string_lossy()
221                    .into_owned()
222            };
223            results.push(RecognizedText {
224                text,
225                confidence: raw.confidence,
226                bounding_box: BoundingBox {
227                    x: raw.bbox_x,
228                    y: raw.bbox_y,
229                    width: raw.bbox_w,
230                    height: raw.bbox_h,
231                },
232            });
233        }
234
235        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
236        Ok(results)
237    }
238}
239
240#[doc(hidden)]
241/// Test helper used by the smoke test — renders `text` to a PNG so OCR can
242/// be exercised without bundling fixture files. Not part of the stable API.
243pub fn _test_helper_render_text_png(
244    text: &str,
245    width: i32,
246    height: i32,
247    path: &Path,
248) -> Result<(), VisionError> {
249    let text_c = CString::new(text).map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
250    let path_c = CString::new(path.to_string_lossy().as_ref())
251        .map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
252    let status = unsafe {
253        ffi::vn_test_helper_render_text_png(text_c.as_ptr(), width, height, path_c.as_ptr())
254    };
255    if status != ffi::status::OK {
256        return Err(VisionError::Unknown {
257            code: status,
258            message: "render helper failed".into(),
259        });
260    }
261    Ok(())
262}