Skip to main content

apple_vision/processing/
mod.rs

1//! Explicit request / handler / video-processing wrappers backed by Vision.
2//!
3//! This module exposes the generic Vision base classes that the rest of the
4//! crate used internally until v0.15.1: [`Request`] (`VNRequest`),
5//! [`Observation`] (`VNObservation`), [`ImageRequestHandler`]
6//! (`VNImageRequestHandler`), [`SequenceRequestHandler`]
7//! (`VNSequenceRequestHandler`), and [`VideoProcessor`] (`VNVideoProcessor`).
8//! The initial safe surface focuses on text recognition, which is already part
9//! of the crate's default feature set.
10
11use core::{
12    ffi::{c_char, c_void},
13    ptr,
14};
15use std::{
16    ffi::{CStr, CString},
17    path::{Path, PathBuf},
18};
19
20use crate::{
21    error::{from_swift, VisionError},
22    ffi,
23    recognize_text::{BoundingBox, RecognitionLevel, RecognizedText, RecognizedTextCandidate},
24    request_base::RequestRevisionProviding,
25    sdk::{ComputeStage, ImageOption},
26};
27
// Cadence selectors passed to the Swift bridge as the "cadence kind" argument;
// see `cadence_to_ffi` for how each `VideoCadence` variant maps to them.

/// Selector sent when no explicit sampling cadence is requested.
const VIDEO_CADENCE_DEFAULT: i32 = 0;
/// Selector sent when sampling at a fixed frame rate.
const VIDEO_CADENCE_FRAME_RATE: i32 = 1;
/// Selector sent when sampling at a fixed time interval.
const VIDEO_CADENCE_TIME_INTERVAL: i32 = 2;
31
/// Identifies which concrete Vision request a [`Request`] configures.
///
/// The enum is `#[non_exhaustive]`, so further request kinds can be added
/// without breaking downstream `match` expressions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum RequestKind {
    /// Text recognition, backed by `VNRecognizeTextRequest`.
    RecognizeText,
}
39
40/// Shared `VNRequest` configuration.
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub struct Request {
43    kind: RequestKind,
44    recognition_level: RecognitionLevel,
45    uses_language_correction: bool,
46    prefer_background_processing: bool,
47    uses_cpu_only: bool,
48    revision: Option<usize>,
49}
50
51impl Default for Request {
52    fn default() -> Self {
53        Self::recognize_text()
54    }
55}
56
57impl RequestRevisionProviding for Request {
58    fn request_revision(&self) -> Option<usize> {
59        self.revision()
60    }
61}
62
63impl Request {
64    /// Build a text-recognition request backed by `VNRecognizeTextRequest`.
65    #[must_use]
66    pub const fn recognize_text() -> Self {
67        Self {
68            kind: RequestKind::RecognizeText,
69            recognition_level: RecognitionLevel::Accurate,
70            uses_language_correction: true,
71            prefer_background_processing: false,
72            uses_cpu_only: false,
73            revision: None,
74        }
75    }
76
77    /// Return the underlying request kind.
78    #[must_use]
79    pub const fn kind(&self) -> RequestKind {
80        self.kind
81    }
82
83    /// Select the OCR recognition strategy.
84    #[must_use]
85    pub const fn with_recognition_level(mut self, recognition_level: RecognitionLevel) -> Self {
86        self.recognition_level = recognition_level;
87        self
88    }
89
90    /// Enable or disable language correction.
91    #[must_use]
92    pub const fn with_language_correction(mut self, enabled: bool) -> Self {
93        self.uses_language_correction = enabled;
94        self
95    }
96
97    /// Mirror `VNRequest.preferBackgroundProcessing`.
98    #[must_use]
99    pub const fn with_prefer_background_processing(mut self, enabled: bool) -> Self {
100        self.prefer_background_processing = enabled;
101        self
102    }
103
104    /// Mirror `VNRequest.usesCPUOnly`.
105    #[must_use]
106    pub const fn with_uses_cpu_only(mut self, enabled: bool) -> Self {
107        self.uses_cpu_only = enabled;
108        self
109    }
110
111    /// Override the request revision.
112    #[must_use]
113    pub const fn with_revision(mut self, revision: usize) -> Self {
114        self.revision = Some(revision);
115        self
116    }
117
118    #[must_use]
119    pub const fn recognition_level(&self) -> RecognitionLevel {
120        self.recognition_level
121    }
122
123    #[must_use]
124    pub const fn uses_language_correction(&self) -> bool {
125        self.uses_language_correction
126    }
127
128    #[must_use]
129    pub const fn prefer_background_processing(&self) -> bool {
130        self.prefer_background_processing
131    }
132
133    #[must_use]
134    pub const fn uses_cpu_only(&self) -> bool {
135        self.uses_cpu_only
136    }
137
138    #[must_use]
139    pub const fn revision(&self) -> Option<usize> {
140        self.revision
141    }
142
143    /// The compute-stage keys exposed by the current Vision SDK.
144    #[must_use]
145    pub const fn supported_compute_stages() -> &'static [ComputeStage] {
146        ComputeStage::ALL
147    }
148
149    const fn recognition_level_raw(&self) -> i32 {
150        match self.recognition_level {
151            RecognitionLevel::Fast => 0,
152            RecognitionLevel::Accurate => 1,
153        }
154    }
155}
156
157/// Shared `VNObservation` metadata surfaced by Vision results.
158#[derive(Debug, Clone, PartialEq)]
159pub struct Observation {
160    /// Stable UUID generated by Vision for this observation.
161    pub uuid: String,
162    /// Base confidence score in `0.0..=1.0`.
163    pub confidence: f32,
164    /// Optional media time range in seconds.
165    pub time_range: Option<TimeRange>,
166}
167
/// A span of media time, expressed in seconds.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TimeRange {
    /// Start of the range, in seconds.
    pub start_seconds: f64,
    /// Length of the range, in seconds.
    pub duration_seconds: f64,
}
174
175/// One recognized-text observation plus shared [`Observation`] metadata.
176#[derive(Debug, Clone, PartialEq)]
177pub struct RecognizedTextObservation {
178    pub observation: Observation,
179    pub text: String,
180    pub bounding_box: BoundingBox,
181}
182
183impl RecognizedTextObservation {
184    /// Drop the generic observation metadata and keep the existing text result
185    /// shape used elsewhere in the crate.
186    #[must_use]
187    pub fn into_recognized_text(self) -> RecognizedText {
188        self.into()
189    }
190
191    /// Clone into the existing [`RecognizedText`] shape.
192    #[must_use]
193    pub fn as_recognized_text(&self) -> RecognizedText {
194        self.clone().into()
195    }
196
197    /// Clone into the dedicated `VNRecognizedText` wrapper.
198    #[must_use]
199    pub fn candidate(&self) -> RecognizedTextCandidate {
200        self.as_recognized_text().into()
201    }
202}
203
204impl From<RecognizedTextObservation> for RecognizedText {
205    fn from(value: RecognizedTextObservation) -> Self {
206        Self {
207            text: value.text,
208            confidence: value.observation.confidence,
209            bounding_box: value.bounding_box,
210        }
211    }
212}
213
/// Safe counterpart of `VNImageRequestHandler`, bound to one image on disk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageRequestHandler {
    // Path of the image every `perform` call runs against.
    image_path: PathBuf,
}
219
220impl ImageRequestHandler {
221    /// Bind the handler to an image path.
222    #[must_use]
223    pub fn new(image_path: impl AsRef<Path>) -> Self {
224        Self {
225            image_path: image_path.as_ref().to_path_buf(),
226        }
227    }
228
229    /// Image-option keys accepted by `VNImageRequestHandler`.
230    #[must_use]
231    pub const fn supported_image_options() -> &'static [ImageOption] {
232        ImageOption::ALL
233    }
234
235    /// Perform `request` against the bound image.
236    ///
237    /// # Errors
238    ///
239    /// Returns [`VisionError`] if the path is invalid, the image cannot be
240    /// loaded, or Vision rejects the request.
241    pub fn perform(
242        &self,
243        request: &Request,
244    ) -> Result<Vec<RecognizedTextObservation>, VisionError> {
245        let image_c = path_to_cstring(&self.image_path, "image path")?;
246        let mut out_array: *mut c_void = ptr::null_mut();
247        let mut out_count: usize = 0;
248        let mut err_msg: *mut c_char = ptr::null_mut();
249        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
250        let status = unsafe {
251            ffi::vn_image_request_handler_perform_text_request(
252                image_c.as_ptr(),
253                request.recognition_level_raw(),
254                request.uses_language_correction,
255                request.prefer_background_processing,
256                request.uses_cpu_only,
257                request.revision.unwrap_or_default(),
258                request.revision.is_some(),
259                &mut out_array,
260                &mut out_count,
261                &mut err_msg,
262            )
263        };
264        if status != ffi::status::OK {
265            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
266            return Err(unsafe { from_swift(status, err_msg) });
267        }
268        Ok(collect_request_observations(out_array, out_count))
269    }
270}
271
/// Safe wrapper around a retained `VNSequenceRequestHandler`.
///
/// The wrapper owns an opaque handle created by the Swift bridge and releases
/// it when dropped. `Debug` is derived so this type can be logged and embedded
/// in `Debug`-deriving structs like the other handlers in this module; the
/// output shows only the handle's address.
#[derive(Debug)]
pub struct SequenceRequestHandler {
    // Opaque bridge-owned handle; checked for null before it is released.
    handle: *mut c_void,
}
276
277impl SequenceRequestHandler {
278    /// Create a fresh sequence handler.
279    ///
280    /// # Errors
281    ///
282    /// Returns [`VisionError`] if the Swift bridge fails to allocate the
283    /// backing handler.
284    pub fn new() -> Result<Self, VisionError> {
285        let mut handle: *mut c_void = ptr::null_mut();
286        let mut err_msg: *mut c_char = ptr::null_mut();
287        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
288        let status = unsafe { ffi::vn_sequence_request_handler_create(&mut handle, &mut err_msg) };
289        if status != ffi::status::OK {
290            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
291            return Err(unsafe { from_swift(status, err_msg) });
292        }
293        if handle.is_null() {
294            return Err(VisionError::Unknown {
295                code: ffi::status::UNKNOWN,
296                message: "sequence request handler bridge returned a null handle".into(),
297            });
298        }
299        Ok(Self { handle })
300    }
301
302    /// Perform `request` on `image_path`, preserving Vision's sequence state
303    /// across calls.
304    ///
305    /// # Errors
306    ///
307    /// Returns [`VisionError`] if the path is invalid, the image cannot be
308    /// loaded, or Vision rejects the request.
309    pub fn perform(
310        &mut self,
311        image_path: impl AsRef<Path>,
312        request: &Request,
313    ) -> Result<Vec<RecognizedTextObservation>, VisionError> {
314        let image_c = path_to_cstring(image_path.as_ref(), "image path")?;
315        let mut out_array: *mut c_void = ptr::null_mut();
316        let mut out_count: usize = 0;
317        let mut err_msg: *mut c_char = ptr::null_mut();
318        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
319        let status = unsafe {
320            ffi::vn_sequence_request_handler_perform_text_request(
321                self.handle,
322                image_c.as_ptr(),
323                request.recognition_level_raw(),
324                request.uses_language_correction,
325                request.prefer_background_processing,
326                request.uses_cpu_only,
327                request.revision.unwrap_or_default(),
328                request.revision.is_some(),
329                &mut out_array,
330                &mut out_count,
331                &mut err_msg,
332            )
333        };
334        if status != ffi::status::OK {
335            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
336            return Err(unsafe { from_swift(status, err_msg) });
337        }
338        Ok(collect_request_observations(out_array, out_count))
339    }
340}
341
342impl Drop for SequenceRequestHandler {
343    fn drop(&mut self) {
344        if !self.handle.is_null() {
345            // SAFETY: `self.handle` is a live bridge-owned handle and this drop path releases it exactly once.
346            unsafe { ffi::vn_sequence_request_handler_free(self.handle) };
347        }
348    }
349}
350
/// How often [`VideoProcessor`] should sample frames from the video.
#[derive(Debug, Clone, Copy, PartialEq)]
#[non_exhaustive]
pub enum VideoCadence {
    /// No sampling override: Vision processes every frame.
    EveryFrame,
    /// Sample at the given frame rate.
    FrameRate(usize),
    /// Sample once per the given time interval, in seconds.
    TimeIntervalSeconds(f64),
}
362
363/// Base `VNVideoProcessorCadence` wrapper.
364#[derive(Debug, Clone, Copy, PartialEq)]
365pub struct VideoProcessorCadence {
366    pub cadence: VideoCadence,
367}
368
369impl VideoProcessorCadence {
370    #[must_use]
371    pub const fn every_frame() -> Self {
372        Self {
373            cadence: VideoCadence::EveryFrame,
374        }
375    }
376
377    #[must_use]
378    pub const fn frame_rate(frames_per_second: usize) -> Self {
379        Self {
380            cadence: VideoCadence::FrameRate(frames_per_second),
381        }
382    }
383
384    #[must_use]
385    pub const fn time_interval_seconds(seconds: f64) -> Self {
386        Self {
387            cadence: VideoCadence::TimeIntervalSeconds(seconds),
388        }
389    }
390}
391
/// Wrapper corresponding to `VNVideoProcessorFrameRateCadence`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VideoProcessorFrameRateCadence {
    /// Sampling rate, in frames per second.
    pub frames_per_second: usize,
}
397
398impl VideoProcessorFrameRateCadence {
399    #[must_use]
400    pub const fn as_video_processor_cadence(self) -> VideoProcessorCadence {
401        VideoProcessorCadence::frame_rate(self.frames_per_second)
402    }
403}
404
/// Wrapper corresponding to `VNVideoProcessorTimeIntervalCadence`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct VideoProcessorTimeIntervalCadence {
    /// Sampling interval, in seconds.
    pub seconds: f64,
}
410
411impl VideoProcessorTimeIntervalCadence {
412    #[must_use]
413    pub const fn as_video_processor_cadence(self) -> VideoProcessorCadence {
414        VideoProcessorCadence::time_interval_seconds(self.seconds)
415    }
416}
417
418/// `VNVideoProcessor` request options.
419#[derive(Debug, Clone, Copy, PartialEq)]
420pub struct VideoProcessingOptions {
421    pub cadence: Option<VideoCadence>,
422}
423
424/// Dedicated `VNVideoProcessorRequestProcessingOptions` wrapper.
425pub type VideoProcessorRequestProcessingOptions = VideoProcessingOptions;
426
427impl From<VideoProcessorCadence> for VideoCadence {
428    fn from(value: VideoProcessorCadence) -> Self {
429        value.cadence
430    }
431}
432
433impl Default for VideoProcessingOptions {
434    fn default() -> Self {
435        Self::new()
436    }
437}
438
439impl VideoProcessingOptions {
440    #[must_use]
441    pub const fn new() -> Self {
442        Self { cadence: None }
443    }
444
445    #[must_use]
446    pub const fn with_cadence(mut self, cadence: VideoCadence) -> Self {
447        self.cadence = Some(cadence);
448        self
449    }
450
451    #[must_use]
452    pub const fn with_video_processor_cadence(mut self, cadence: VideoProcessorCadence) -> Self {
453        self.cadence = Some(cadence.cadence);
454        self
455    }
456}
457
/// Safe counterpart of `VNVideoProcessor`, bound to one video file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VideoProcessor {
    // Path of the video every `analyze` call runs against.
    video_path: PathBuf,
}
463
464impl VideoProcessor {
465    /// Bind the processor to a video file.
466    #[must_use]
467    pub fn new(video_path: impl AsRef<Path>) -> Self {
468        Self {
469            video_path: video_path.as_ref().to_path_buf(),
470        }
471    }
472
473    /// Analyze the bound video with `request`.
474    ///
475    /// # Errors
476    ///
477    /// Returns [`VisionError`] if the path is invalid, the video cannot be
478    /// opened, the cadence is invalid, or Vision rejects the request.
479    pub fn analyze(
480        &self,
481        request: &Request,
482        options: VideoProcessingOptions,
483    ) -> Result<Vec<RecognizedTextObservation>, VisionError> {
484        let video_c = path_to_cstring(&self.video_path, "video path")?;
485        let (cadence_kind, cadence_value) = cadence_to_ffi(options.cadence)?;
486        let mut out_array: *mut c_void = ptr::null_mut();
487        let mut out_count: usize = 0;
488        let mut err_msg: *mut c_char = ptr::null_mut();
489        // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
490        let status = unsafe {
491            ffi::vn_video_processor_analyze_text_request(
492                video_c.as_ptr(),
493                request.recognition_level_raw(),
494                request.uses_language_correction,
495                request.prefer_background_processing,
496                request.uses_cpu_only,
497                request.revision.unwrap_or_default(),
498                request.revision.is_some(),
499                cadence_kind,
500                cadence_value,
501                &mut out_array,
502                &mut out_count,
503                &mut err_msg,
504            )
505        };
506        if status != ffi::status::OK {
507            // SAFETY: the error pointer is either null or a bridge-allocated C string; `from_swift` frees it.
508            return Err(unsafe { from_swift(status, err_msg) });
509        }
510        Ok(collect_request_observations(out_array, out_count))
511    }
512}
513
514fn cadence_to_ffi(cadence: Option<VideoCadence>) -> Result<(i32, f64), VisionError> {
515    match cadence.unwrap_or(VideoCadence::EveryFrame) {
516        VideoCadence::EveryFrame => Ok((VIDEO_CADENCE_DEFAULT, 0.0)),
517        VideoCadence::FrameRate(frame_rate) => {
518            if frame_rate == 0 {
519                return Err(VisionError::InvalidArgument(
520                    "video cadence frame rate must be greater than zero".into(),
521                ));
522            }
523            let frame_rate = u32::try_from(frame_rate).map_err(|_| {
524                VisionError::InvalidArgument(
525                    "video cadence frame rate exceeds the supported range".into(),
526                )
527            })?;
528            Ok((VIDEO_CADENCE_FRAME_RATE, f64::from(frame_rate)))
529        }
530        VideoCadence::TimeIntervalSeconds(seconds) => {
531            if !seconds.is_finite() || seconds <= 0.0 {
532                return Err(VisionError::InvalidArgument(
533                    "video cadence time interval must be a finite positive number".into(),
534                ));
535            }
536            Ok((VIDEO_CADENCE_TIME_INTERVAL, seconds))
537        }
538    }
539}
540
541fn collect_request_observations(
542    array: *mut c_void,
543    count: usize,
544) -> Vec<RecognizedTextObservation> {
545    if array.is_null() || count == 0 {
546        return Vec::new();
547    }
548
549    let typed = array.cast::<ffi::RequestObservationRaw>();
550    let mut observations = Vec::with_capacity(count);
551    for index in 0..count {
552        // SAFETY: the pointer is valid for the reported element count; the index is in bounds.
553        let raw = unsafe { &*typed.add(index) };
554        let uuid = c_string_or_empty(raw.uuid);
555        let text = c_string_or_empty(raw.text);
556        let time_range = raw.has_time_range.then_some(TimeRange {
557            start_seconds: raw.time_range_start_seconds,
558            duration_seconds: raw.time_range_duration_seconds,
559        });
560        observations.push(RecognizedTextObservation {
561            observation: Observation {
562                uuid,
563                confidence: raw.confidence,
564                time_range,
565            },
566            text,
567            bounding_box: BoundingBox {
568                x: raw.bbox_x,
569                y: raw.bbox_y,
570                width: raw.bbox_w,
571                height: raw.bbox_h,
572            },
573        });
574    }
575
576    // SAFETY: the pointer/count pair was allocated by the bridge and is freed exactly once here.
577    unsafe { ffi::vn_request_observations_free(array, count) };
578    observations
579}
580
/// Copy a C string into an owned `String`, treating a null pointer as the
/// empty string.
///
/// Invalid UTF-8 sequences are replaced rather than rejected. The pointer is
/// only borrowed; this function never frees it.
fn c_string_or_empty(ptr: *mut c_char) -> String {
    if ptr.is_null() {
        return String::new();
    }
    // SAFETY: the C string pointer is non-null (checked above) and valid for the duration of this borrow.
    let borrowed = unsafe { CStr::from_ptr(ptr) };
    borrowed.to_string_lossy().into_owned()
}
591
592fn path_to_cstring(path: &Path, label: &str) -> Result<CString, VisionError> {
593    let path_str = path
594        .to_str()
595        .ok_or_else(|| VisionError::InvalidArgument(format!("non-UTF-8 {label}")))?;
596    CString::new(path_str)
597        .map_err(|err| VisionError::InvalidArgument(format!("{label} NUL byte: {err}")))
598}
599
600#[doc(hidden)]
601/// Test helper used by the smoke test + example — renders a tiny MOV file with
602/// two text segments so `VNVideoProcessor` can be exercised without fixture
603/// assets checked into git.
604pub fn _test_helper_render_text_video(
605    first_text: &str,
606    second_text: &str,
607    width: i32,
608    height: i32,
609    fps: i32,
610    frames_per_text: i32,
611    path: &Path,
612) -> Result<(), VisionError> {
613    let first_c =
614        CString::new(first_text).map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
615    let second_c =
616        CString::new(second_text).map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
617    let path_c = CString::new(path.to_string_lossy().as_ref())
618        .map_err(|err| VisionError::InvalidArgument(err.to_string()))?;
619    // SAFETY: all pointer arguments are valid stack locations or bridge-owned handles; strings are valid C strings for the duration of the call.
620    let status = unsafe {
621        ffi::vn_test_helper_render_text_video(
622            first_c.as_ptr(),
623            second_c.as_ptr(),
624            width,
625            height,
626            fps,
627            frames_per_text,
628            path_c.as_ptr(),
629        )
630    };
631    if status != ffi::status::OK {
632        return Err(VisionError::Unknown {
633            code: status,
634            message: "video render helper failed".into(),
635        });
636    }
637    Ok(())
638}