Skip to main content

apple_vision/processing/
mod.rs

1//! Explicit request / handler / video-processing wrappers backed by Vision.
2//!
3//! This module exposes the generic Vision base classes that the rest of the
4//! crate used internally until v0.15.1: [`Request`] (`VNRequest`),
5//! [`Observation`] (`VNObservation`), [`ImageRequestHandler`]
6//! (`VNImageRequestHandler`), [`SequenceRequestHandler`]
7//! (`VNSequenceRequestHandler`), and [`VideoProcessor`] (`VNVideoProcessor`).
8//! The initial safe surface focuses on text recognition, which is already part
9//! of the crate's default feature set.
10
11use core::{
12    ffi::{c_char, c_void},
13    ptr,
14};
15use std::{
16    ffi::{CStr, CString},
17    path::{Path, PathBuf},
18};
19
20use crate::{
21    error::{from_swift, VisionError},
22    ffi,
23    recognize_text::{BoundingBox, RecognitionLevel, RecognizedText},
24};
25
// Discriminants handed to `ffi::vn_video_processor_analyze_text_request` as
// its cadence-kind argument; `cadence_to_ffi` pairs each with an `f64` value.
// NOTE(review): presumably these must match the values the Swift bridge
// expects — confirm before renumbering.

/// No cadence override; sent for `VideoCadence::EveryFrame` or when no cadence
/// was configured. The accompanying value is `0.0`.
const VIDEO_CADENCE_DEFAULT: i32 = 0;
/// The accompanying value is a frame rate (`VideoCadence::FrameRate`).
const VIDEO_CADENCE_FRAME_RATE: i32 = 1;
/// The accompanying value is an interval in seconds
/// (`VideoCadence::TimeIntervalSeconds`).
const VIDEO_CADENCE_TIME_INTERVAL: i32 = 2;
29
/// The high-level Vision request kind carried by [`Request`].
///
/// Text recognition is the only kind defined so far. The enum is
/// `#[non_exhaustive]`, so downstream `match` expressions need a wildcard arm
/// and new kinds can be added without a breaking change.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RequestKind {
    /// `VNRecognizeTextRequest`
    RecognizeText,
}
37
/// Shared `VNRequest` configuration.
///
/// A plain value type: it only records settings. The handlers and the video
/// processor in this module read those settings and forward them to the bridge
/// when a request is performed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Request {
    /// Which Vision request this configuration describes.
    kind: RequestKind,
    /// OCR strategy; forwarded to the bridge as a raw `i32`.
    recognition_level: RecognitionLevel,
    /// Whether language correction is requested.
    uses_language_correction: bool,
    /// Mirrors `VNRequest.preferBackgroundProcessing`.
    prefer_background_processing: bool,
    /// Mirrors `VNRequest.usesCPUOnly`.
    uses_cpu_only: bool,
    /// Explicit request revision. When `None`, the bridge is called with a
    /// "has revision" flag of `false` and a placeholder revision of `0`.
    revision: Option<usize>,
}
48
49impl Default for Request {
50    fn default() -> Self {
51        Self::recognize_text()
52    }
53}
54
55impl Request {
56    /// Build a text-recognition request backed by `VNRecognizeTextRequest`.
57    #[must_use]
58    pub const fn recognize_text() -> Self {
59        Self {
60            kind: RequestKind::RecognizeText,
61            recognition_level: RecognitionLevel::Accurate,
62            uses_language_correction: true,
63            prefer_background_processing: false,
64            uses_cpu_only: false,
65            revision: None,
66        }
67    }
68
69    /// Return the underlying request kind.
70    #[must_use]
71    pub const fn kind(&self) -> RequestKind {
72        self.kind
73    }
74
75    /// Select the OCR recognition strategy.
76    #[must_use]
77    pub const fn with_recognition_level(mut self, recognition_level: RecognitionLevel) -> Self {
78        self.recognition_level = recognition_level;
79        self
80    }
81
82    /// Enable or disable language correction.
83    #[must_use]
84    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
85        self.uses_language_correction = enabled;
86        self
87    }
88
89    /// Mirror `VNRequest.preferBackgroundProcessing`.
90    #[must_use]
91    pub const fn with_prefer_background_processing(mut self, enabled: bool) -> Self {
92        self.prefer_background_processing = enabled;
93        self
94    }
95
96    /// Mirror `VNRequest.usesCPUOnly`.
97    #[must_use]
98    pub const fn with_uses_cpu_only(mut self, enabled: bool) -> Self {
99        self.uses_cpu_only = enabled;
100        self
101    }
102
103    /// Override the request revision.
104    #[must_use]
105    pub const fn with_revision(mut self, revision: usize) -> Self {
106        self.revision = Some(revision);
107        self
108    }
109
110    #[must_use]
111    pub const fn recognition_level(&self) -> RecognitionLevel {
112        self.recognition_level
113    }
114
115    #[must_use]
116    pub const fn uses_language_correction(&self) -> bool {
117        self.uses_language_correction
118    }
119
120    #[must_use]
121    pub const fn prefer_background_processing(&self) -> bool {
122        self.prefer_background_processing
123    }
124
125    #[must_use]
126    pub const fn uses_cpu_only(&self) -> bool {
127        self.uses_cpu_only
128    }
129
130    #[must_use]
131    pub const fn revision(&self) -> Option<usize> {
132        self.revision
133    }
134
135    const fn recognition_level_raw(&self) -> i32 {
136        match self.recognition_level {
137            RecognitionLevel::Fast => 0,
138            RecognitionLevel::Accurate => 1,
139        }
140    }
141}
142
/// Shared `VNObservation` metadata surfaced by Vision results.
///
/// Populated from the raw observation records returned by the bridge.
#[derive(Debug, Clone, PartialEq)]
pub struct Observation {
    /// Stable UUID generated by Vision for this observation.
    ///
    /// Empty when the bridge supplied a null string pointer.
    pub uuid: String,
    /// Base confidence score in `0.0..=1.0`.
    pub confidence: f32,
    /// Optional media time range in seconds.
    ///
    /// `None` when the raw record is flagged as having no time range.
    pub time_range: Option<TimeRange>,
}
153
/// Media time range in seconds.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TimeRange {
    /// Start of the range, in seconds.
    pub start_seconds: f64,
    /// Length of the range, in seconds.
    pub duration_seconds: f64,
}
160
/// One recognized-text observation plus shared [`Observation`] metadata.
#[derive(Debug, Clone, PartialEq)]
pub struct RecognizedTextObservation {
    /// Generic observation metadata (UUID, confidence, optional time range).
    pub observation: Observation,
    /// The recognized string; empty when the bridge supplied a null pointer.
    pub text: String,
    /// Location of the text as reported by the bridge.
    // NOTE(review): the coordinate space is not visible from this file —
    // confirm against `BoundingBox`.
    pub bounding_box: BoundingBox,
}
168
169impl RecognizedTextObservation {
170    /// Drop the generic observation metadata and keep the existing text result
171    /// shape used elsewhere in the crate.
172    #[must_use]
173    pub fn into_recognized_text(self) -> RecognizedText {
174        self.into()
175    }
176
177    /// Clone into the existing [`RecognizedText`] shape.
178    #[must_use]
179    pub fn as_recognized_text(&self) -> RecognizedText {
180        self.clone().into()
181    }
182}
183
184impl From<RecognizedTextObservation> for RecognizedText {
185    fn from(value: RecognizedTextObservation) -> Self {
186        Self {
187            text: value.text,
188            confidence: value.observation.confidence,
189            bounding_box: value.bounding_box,
190        }
191    }
192}
193
/// Safe wrapper around `VNImageRequestHandler`.
///
/// Only the image path is stored; the path is handed to the bridge each time
/// a request is performed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageRequestHandler {
    /// Path of the image that requests run against.
    image_path: PathBuf,
}
199
200impl ImageRequestHandler {
201    /// Bind the handler to an image path.
202    #[must_use]
203    pub fn new(image_path: impl AsRef<Path>) -> Self {
204        Self {
205            image_path: image_path.as_ref().to_path_buf(),
206        }
207    }
208
209    /// Perform `request` against the bound image.
210    ///
211    /// # Errors
212    ///
213    /// Returns [`VisionError`] if the path is invalid, the image cannot be
214    /// loaded, or Vision rejects the request.
215    pub fn perform(&self, request: &Request) -> Result<Vec<RecognizedTextObservation>, VisionError> {
216        let image_c = path_to_cstring(&self.image_path, "image path")?;
217        let mut out_array: *mut c_void = ptr::null_mut();
218        let mut out_count: usize = 0;
219        let mut err_msg: *mut c_char = ptr::null_mut();
220        let status = unsafe {
221            ffi::vn_image_request_handler_perform_text_request(
222                image_c.as_ptr(),
223                request.recognition_level_raw(),
224                request.uses_language_correction,
225                request.prefer_background_processing,
226                request.uses_cpu_only,
227                request.revision.unwrap_or_default(),
228                request.revision.is_some(),
229                &mut out_array,
230                &mut out_count,
231                &mut err_msg,
232            )
233        };
234        if status != ffi::status::OK {
235            return Err(unsafe { from_swift(status, err_msg) });
236        }
237        Ok(collect_request_observations(out_array, out_count))
238    }
239}
240
/// Safe wrapper around a retained `VNSequenceRequestHandler`.
///
/// The wrapper owns an opaque handle created by the bridge and releases it
/// when dropped. `Debug` prints the raw handle address, matching the other
/// handler types in this module, which are also `Debug`.
#[derive(Debug)]
pub struct SequenceRequestHandler {
    /// Opaque handle returned by `ffi::vn_sequence_request_handler_create`;
    /// passed back to the bridge on every `perform` call and freed in `Drop`.
    handle: *mut c_void,
}
245
246impl SequenceRequestHandler {
247    /// Create a fresh sequence handler.
248    ///
249    /// # Errors
250    ///
251    /// Returns [`VisionError`] if the Swift bridge fails to allocate the
252    /// backing handler.
253    pub fn new() -> Result<Self, VisionError> {
254        let mut handle: *mut c_void = ptr::null_mut();
255        let mut err_msg: *mut c_char = ptr::null_mut();
256        let status = unsafe { ffi::vn_sequence_request_handler_create(&mut handle, &mut err_msg) };
257        if status != ffi::status::OK {
258            return Err(unsafe { from_swift(status, err_msg) });
259        }
260        if handle.is_null() {
261            return Err(VisionError::Unknown {
262                code: ffi::status::UNKNOWN,
263                message: "sequence request handler bridge returned a null handle".into(),
264            });
265        }
266        Ok(Self { handle })
267    }
268
269    /// Perform `request` on `image_path`, preserving Vision's sequence state
270    /// across calls.
271    ///
272    /// # Errors
273    ///
274    /// Returns [`VisionError`] if the path is invalid, the image cannot be
275    /// loaded, or Vision rejects the request.
276    pub fn perform(
277        &mut self,
278        image_path: impl AsRef<Path>,
279        request: &Request,
280    ) -> Result<Vec<RecognizedTextObservation>, VisionError> {
281        let image_c = path_to_cstring(image_path.as_ref(), "image path")?;
282        let mut out_array: *mut c_void = ptr::null_mut();
283        let mut out_count: usize = 0;
284        let mut err_msg: *mut c_char = ptr::null_mut();
285        let status = unsafe {
286            ffi::vn_sequence_request_handler_perform_text_request(
287                self.handle,
288                image_c.as_ptr(),
289                request.recognition_level_raw(),
290                request.uses_language_correction,
291                request.prefer_background_processing,
292                request.uses_cpu_only,
293                request.revision.unwrap_or_default(),
294                request.revision.is_some(),
295                &mut out_array,
296                &mut out_count,
297                &mut err_msg,
298            )
299        };
300        if status != ffi::status::OK {
301            return Err(unsafe { from_swift(status, err_msg) });
302        }
303        Ok(collect_request_observations(out_array, out_count))
304    }
305}
306
307impl Drop for SequenceRequestHandler {
308    fn drop(&mut self) {
309        if !self.handle.is_null() {
310            unsafe { ffi::vn_sequence_request_handler_free(self.handle) };
311        }
312    }
313}
314
/// Cadence override for [`VideoProcessor`].
///
/// Values are validated when the video is analyzed, not when the variant is
/// constructed; an invalid value surfaces as `VisionError::InvalidArgument`.
#[derive(Debug, Clone, Copy, PartialEq)]
#[non_exhaustive]
pub enum VideoCadence {
    /// Let Vision process every frame.
    EveryFrame,
    /// Sample the video at the given frame rate.
    ///
    /// Must be greater than zero and fit in a `u32`.
    FrameRate(usize),
    /// Sample the video at the given time interval, in seconds.
    ///
    /// Must be finite and greater than zero.
    TimeIntervalSeconds(f64),
}
326
/// `VNVideoProcessor` request options.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct VideoProcessingOptions {
    /// Optional cadence override. `None` is treated the same as
    /// [`VideoCadence::EveryFrame`] when the video is analyzed.
    pub cadence: Option<VideoCadence>,
}
332
333impl Default for VideoProcessingOptions {
334    fn default() -> Self {
335        Self::new()
336    }
337}
338
339impl VideoProcessingOptions {
340    #[must_use]
341    pub const fn new() -> Self {
342        Self { cadence: None }
343    }
344
345    #[must_use]
346    pub const fn with_cadence(mut self, cadence: VideoCadence) -> Self {
347        self.cadence = Some(cadence);
348        self
349    }
350}
351
/// Safe wrapper around `VNVideoProcessor`.
///
/// Only the video path is stored; the path is handed to the bridge each time
/// the video is analyzed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VideoProcessor {
    /// Path of the video file that requests run against.
    video_path: PathBuf,
}
357
358impl VideoProcessor {
359    /// Bind the processor to a video file.
360    #[must_use]
361    pub fn new(video_path: impl AsRef<Path>) -> Self {
362        Self {
363            video_path: video_path.as_ref().to_path_buf(),
364        }
365    }
366
367    /// Analyze the bound video with `request`.
368    ///
369    /// # Errors
370    ///
371    /// Returns [`VisionError`] if the path is invalid, the video cannot be
372    /// opened, the cadence is invalid, or Vision rejects the request.
373    pub fn analyze(
374        &self,
375        request: &Request,
376        options: VideoProcessingOptions,
377    ) -> Result<Vec<RecognizedTextObservation>, VisionError> {
378        let video_c = path_to_cstring(&self.video_path, "video path")?;
379        let (cadence_kind, cadence_value) = cadence_to_ffi(options.cadence)?;
380        let mut out_array: *mut c_void = ptr::null_mut();
381        let mut out_count: usize = 0;
382        let mut err_msg: *mut c_char = ptr::null_mut();
383        let status = unsafe {
384            ffi::vn_video_processor_analyze_text_request(
385                video_c.as_ptr(),
386                request.recognition_level_raw(),
387                request.uses_language_correction,
388                request.prefer_background_processing,
389                request.uses_cpu_only,
390                request.revision.unwrap_or_default(),
391                request.revision.is_some(),
392                cadence_kind,
393                cadence_value,
394                &mut out_array,
395                &mut out_count,
396                &mut err_msg,
397            )
398        };
399        if status != ffi::status::OK {
400            return Err(unsafe { from_swift(status, err_msg) });
401        }
402        Ok(collect_request_observations(out_array, out_count))
403    }
404}
405
406fn cadence_to_ffi(cadence: Option<VideoCadence>) -> Result<(i32, f64), VisionError> {
407    match cadence.unwrap_or(VideoCadence::EveryFrame) {
408        VideoCadence::EveryFrame => Ok((VIDEO_CADENCE_DEFAULT, 0.0)),
409        VideoCadence::FrameRate(frame_rate) => {
410            if frame_rate == 0 {
411                return Err(VisionError::InvalidArgument(
412                    "video cadence frame rate must be greater than zero".into(),
413                ));
414            }
415            let frame_rate = u32::try_from(frame_rate).map_err(|_| {
416                VisionError::InvalidArgument(
417                    "video cadence frame rate exceeds the supported range".into(),
418                )
419            })?;
420            Ok((VIDEO_CADENCE_FRAME_RATE, f64::from(frame_rate)))
421        }
422        VideoCadence::TimeIntervalSeconds(seconds) => {
423            if !seconds.is_finite() || seconds <= 0.0 {
424                return Err(VisionError::InvalidArgument(
425                    "video cadence time interval must be a finite positive number".into(),
426                ));
427            }
428            Ok((VIDEO_CADENCE_TIME_INTERVAL, seconds))
429        }
430    }
431}
432
433fn collect_request_observations(
434    array: *mut c_void,
435    count: usize,
436) -> Vec<RecognizedTextObservation> {
437    if array.is_null() || count == 0 {
438        return Vec::new();
439    }
440
441    let typed = array.cast::<ffi::RequestObservationRaw>();
442    let mut observations = Vec::with_capacity(count);
443    for index in 0..count {
444        let raw = unsafe { &*typed.add(index) };
445        let uuid = c_string_or_empty(raw.uuid);
446        let text = c_string_or_empty(raw.text);
447        let time_range = raw.has_time_range.then_some(TimeRange {
448            start_seconds: raw.time_range_start_seconds,
449            duration_seconds: raw.time_range_duration_seconds,
450        });
451        observations.push(RecognizedTextObservation {
452            observation: Observation {
453                uuid,
454                confidence: raw.confidence,
455                time_range,
456            },
457            text,
458            bounding_box: BoundingBox {
459                x: raw.bbox_x,
460                y: raw.bbox_y,
461                width: raw.bbox_w,
462                height: raw.bbox_h,
463            },
464        });
465    }
466
467    unsafe { ffi::vn_request_observations_free(array, count) };
468    observations
469}
470
/// Copy a C string into an owned `String`, treating a null pointer as the
/// empty string.
///
/// Invalid UTF-8 is replaced lossily rather than rejected. A non-null `ptr`
/// must point to a NUL-terminated string that stays valid for this call.
fn c_string_or_empty(ptr: *mut c_char) -> String {
    if ptr.is_null() {
        return String::new();
    }
    // SAFETY: `ptr` is non-null here; the caller guarantees it points to a
    // valid NUL-terminated string.
    let borrowed = unsafe { CStr::from_ptr(ptr) };
    String::from_utf8_lossy(borrowed.to_bytes()).into_owned()
}
478
479fn path_to_cstring(path: &Path, label: &str) -> Result<CString, VisionError> {
480    let path_str = path
481        .to_str()
482        .ok_or_else(|| VisionError::InvalidArgument(format!("non-UTF-8 {label}")))?;
483    CString::new(path_str)
484        .map_err(|err| VisionError::InvalidArgument(format!("{label} NUL byte: {err}")))
485}
486
487#[doc(hidden)]
488/// Test helper used by the smoke test + example — renders a tiny MOV file with
489/// two text segments so `VNVideoProcessor` can be exercised without fixture
490/// assets checked into git.
491pub fn _test_helper_render_text_video(
492    first_text: &str,
493    second_text: &str,
494    width: i32,
495    height: i32,
496    fps: i32,
497    frames_per_text: i32,
498    path: &Path,
499) -> Result<(), VisionError> {
500    let first_c =
501        CString::new(first_text).map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
502    let second_c =
503        CString::new(second_text).map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
504    let path_c = CString::new(path.to_string_lossy().as_ref())
505        .map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
506    let status = unsafe {
507        ffi::vn_test_helper_render_text_video(
508            first_c.as_ptr(),
509            second_c.as_ptr(),
510            width,
511            height,
512            fps,
513            frames_per_text,
514            path_c.as_ptr(),
515        )
516    };
517    if status != ffi::status::OK {
518        return Err(VisionError::Unknown {
519            code: status,
520            message: "video render helper failed".into(),
521        });
522    }
523    Ok(())
524}