Skip to main content

apple_vision/recognize_text/
mod.rs

1//! [`TextRecognizer`] — wraps `VNRecognizeTextRequest` for image-file OCR.
2
3use core::ffi::c_char;
4use core::ptr;
5use std::ffi::CString;
6use std::path::Path;
7
8use crate::error::{from_swift, VisionError};
9use crate::ffi;
10
/// Recognition strategy passed to Vision.
///
/// `Fast` is real-time-friendly; `Accurate` does layout analysis and is
/// significantly slower. [`Default`] yields `Accurate`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RecognitionLevel {
    /// Real-time-friendly recognition.
    Fast,
    /// Recognition with layout analysis; significantly slower than `Fast`.
    #[default]
    Accurate,
}
21
22
23impl RecognitionLevel {
24    const fn as_raw(self) -> i32 {
25        match self {
26            Self::Fast => 0,
27            Self::Accurate => 1,
28        }
29    }
30}
31
/// Rectangle in normalised (`0.0..=1.0`) image coordinates.
///
/// The coordinate origin is at the bottom-left (Vision convention — flip `y`
/// if you want a top-left origin).
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct BoundingBox {
    /// Horizontal coordinate of the box origin (normalised).
    pub x: f64,
    /// Vertical coordinate of the box origin (normalised, measured from the
    /// bottom of the image).
    pub y: f64,
    /// Box width (normalised).
    pub width: f64,
    /// Box height (normalised).
    pub height: f64,
}
41
42/// One recognised text observation.
43#[derive(Debug, Clone, PartialEq)]
44pub struct RecognizedText {
45    pub text: String,
46    /// Confidence in `0.0..=1.0`. `0` means Vision didn't report one.
47    pub confidence: f32,
48    pub bounding_box: BoundingBox,
49}
50
51/// OCR engine.
52///
53/// # Examples
54///
55/// ```rust,no_run
56/// use apple_vision::recognize_text::{TextRecognizer, RecognitionLevel};
57///
58/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
59/// let recognizer = TextRecognizer::new()
60///     .with_recognition_level(RecognitionLevel::Accurate)
61///     .with_language_correction(true);
62/// let observations = recognizer.recognize_in_path("/tmp/screenshot.png")?;
63/// for obs in &observations {
64///     println!("{:.2} {:?}: {}", obs.confidence, obs.bounding_box, obs.text);
65/// }
66/// # Ok(())
67/// # }
68/// ```
69#[derive(Debug, Clone)]
70pub struct TextRecognizer {
71    recognition_level: RecognitionLevel,
72    uses_language_correction: bool,
73}
74
75impl Default for TextRecognizer {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
81impl TextRecognizer {
82    /// Construct with the Vision defaults: accurate mode + language
83    /// correction enabled.
84    #[must_use]
85    pub const fn new() -> Self {
86        Self {
87            recognition_level: RecognitionLevel::Accurate,
88            uses_language_correction: true,
89        }
90    }
91
92    #[must_use]
93    pub const fn with_recognition_level(mut self, level: RecognitionLevel) -> Self {
94        self.recognition_level = level;
95        self
96    }
97
98    #[must_use]
99    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
100        self.uses_language_correction = enabled;
101        self
102    }
103
104    /// Recognise text in the image at `path`. Supports any format
105    /// CoreGraphics' `ImageIO` can read (PNG, JPEG, HEIC, TIFF, ...).
106    ///
107    /// Returns observations in Vision's natural ordering (top-down,
108    /// left-to-right after layout analysis).
109    ///
110    /// # Errors
111    ///
112    /// Returns [`VisionError::InvalidArgument`] if `path` contains an
113    /// interior NUL byte, [`VisionError::ImageLoadFailed`] if the image
114    /// can't be read, or [`VisionError::RequestFailed`] if Vision rejects
115    /// the request.
116    pub fn recognize_in_path(
117        &self,
118        path: impl AsRef<Path>,
119    ) -> Result<Vec<RecognizedText>, VisionError> {
120        let path_str = path
121            .as_ref()
122            .to_str()
123            .ok_or_else(|| VisionError::InvalidArgument("non-UTF-8 path".into()))?;
124        let path_c = CString::new(path_str)
125            .map_err(|e| VisionError::InvalidArgument(format!("path NUL byte: {e}")))?;
126
127        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
128        let mut out_count: usize = 0;
129        let mut err_msg: *mut c_char = ptr::null_mut();
130        let status = unsafe {
131            ffi::vn_recognize_text_in_path(
132                path_c.as_ptr(),
133                self.recognition_level.as_raw(),
134                self.uses_language_correction,
135                &mut out_array,
136                &mut out_count,
137                &mut err_msg,
138            )
139        };
140        if status != ffi::status::OK {
141            return Err(unsafe { from_swift(status, err_msg) });
142        }
143
144        // Empty result is success (no text detected).
145        if out_array.is_null() || out_count == 0 {
146            return Ok(Vec::new());
147        }
148
149        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
150        let mut results = Vec::with_capacity(out_count);
151        for i in 0..out_count {
152            let raw = unsafe { &*typed_array.add(i) };
153            let text = if raw.text.is_null() {
154                String::new()
155            } else {
156                unsafe { core::ffi::CStr::from_ptr(raw.text) }
157                    .to_string_lossy()
158                    .into_owned()
159            };
160            results.push(RecognizedText {
161                text,
162                confidence: raw.confidence,
163                bounding_box: BoundingBox {
164                    x: raw.bbox_x,
165                    y: raw.bbox_y,
166                    width: raw.bbox_w,
167                    height: raw.bbox_h,
168                },
169            });
170        }
171
172        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
173        Ok(results)
174    }
175
176    /// Recognise text in a [`CVPixelBuffer`](apple_cf::cv::CVPixelBuffer)
177    /// directly, without a PNG round-trip. This is the zero-copy path for
178    /// live capture pipelines (screencapturekit / videotoolbox decoder /
179    /// `AVCaptureSession`).
180    ///
181    /// # Errors
182    ///
183    /// Returns [`VisionError::RequestFailed`] if Vision rejects the buffer
184    /// (e.g. unsupported pixel format).
185    pub fn recognize_in_pixel_buffer(
186        &self,
187        pixel_buffer: &apple_cf::cv::CVPixelBuffer,
188    ) -> Result<Vec<RecognizedText>, VisionError> {
189        let mut out_array: *mut core::ffi::c_void = ptr::null_mut();
190        let mut out_count: usize = 0;
191        let mut err_msg: *mut c_char = ptr::null_mut();
192        let status = unsafe {
193            ffi::vn_recognize_text_in_pixel_buffer(
194                pixel_buffer.as_ptr(),
195                self.recognition_level.as_raw(),
196                self.uses_language_correction,
197                &mut out_array,
198                &mut out_count,
199                &mut err_msg,
200            )
201        };
202        if status != ffi::status::OK {
203            return Err(unsafe { from_swift(status, err_msg) });
204        }
205        if out_array.is_null() || out_count == 0 {
206            return Ok(Vec::new());
207        }
208
209        let typed_array = out_array.cast::<ffi::RecognizedTextRaw>();
210        let mut results = Vec::with_capacity(out_count);
211        for i in 0..out_count {
212            let raw = unsafe { &*typed_array.add(i) };
213            let text = if raw.text.is_null() {
214                String::new()
215            } else {
216                unsafe { core::ffi::CStr::from_ptr(raw.text) }
217                    .to_string_lossy()
218                    .into_owned()
219            };
220            results.push(RecognizedText {
221                text,
222                confidence: raw.confidence,
223                bounding_box: BoundingBox {
224                    x: raw.bbox_x,
225                    y: raw.bbox_y,
226                    width: raw.bbox_w,
227                    height: raw.bbox_h,
228                },
229            });
230        }
231
232        unsafe { ffi::vn_recognized_text_free(out_array, out_count) };
233        Ok(results)
234    }
235}
236
237#[doc(hidden)]
238/// Test helper used by the smoke test — renders `text` to a PNG so OCR can
239/// be exercised without bundling fixture files. Not part of the stable API.
240pub fn _test_helper_render_text_png(
241    text: &str,
242    width: i32,
243    height: i32,
244    path: &Path,
245) -> Result<(), VisionError> {
246    let text_c = CString::new(text).map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
247    let path_c = CString::new(path.to_string_lossy().as_ref())
248        .map_err(|e| VisionError::InvalidArgument(e.to_string()))?;
249    let status = unsafe {
250        ffi::vn_test_helper_render_text_png(text_c.as_ptr(), width, height, path_c.as_ptr())
251    };
252    if status != ffi::status::OK {
253        return Err(VisionError::Unknown {
254            code: status,
255            message: "render helper failed".into(),
256        });
257    }
258    Ok(())
259}