// apple-vision 0.16.7
//
// Safe Rust bindings for Apple's Vision framework — OCR, object detection, face landmarks on macOS
// Documentation
// Explicit VNRequest / handler / video-processor wrappers.

import CoreGraphics
import CoreMedia
import Foundation
import Vision

/// Flat record describing one observation as it is handed across the FFI boundary.
///
/// `packCollectedTextObservations` writes an array of these into a heap buffer and
/// `vn_request_observations_free` releases that buffer together with the two string fields.
/// NOTE(review): the field order is presumably mirrored by a matching struct on the foreign side —
/// confirm before reordering or inserting fields.
@frozen
public struct VNRequestObservationRaw {
    /// Observation UUID as a C-character buffer produced by `ffiString`; released with `free`.
    public var uuid: UnsafeMutablePointer<CChar>?
    /// Recognized text as a C-character buffer produced by `ffiString`; released with `free`.
    public var text: UnsafeMutablePointer<CChar>?
    /// Confidence value copied from the observation.
    public var confidence: Float
    /// Whether the two `time_range_*` fields carry meaningful values.
    public var has_time_range: Bool
    /// Start of the observation's time range in seconds (0 when `collectTextObservation` found no finite range).
    public var time_range_start_seconds: Double
    /// Duration of the observation's time range in seconds (0 when `collectTextObservation` found no finite range).
    public var time_range_duration_seconds: Double
    // Bounding box copied component-wise from the observation's `boundingBox`.
    // NOTE(review): presumably Vision's normalized coordinate space — confirm against the caller.
    public var bbox_x: Double
    public var bbox_y: Double
    public var bbox_w: Double
    public var bbox_h: Double
}

/// Swift-side snapshot of a recognized-text observation, taken before it is packed into
/// `VNRequestObservationRaw` records for the FFI caller.
///
/// The property order defines the memberwise initializer used by `collectTextObservation`.
internal struct CollectedTextObservation {
    /// String form of the observation's UUID.
    let uuid: String
    /// Text of the top-ranked recognition candidate, or the empty string when there was none.
    let text: String
    /// Confidence value copied from the observation.
    let confidence: Float
    /// `true` when both the start and the duration of the time range converted to finite seconds.
    let hasTimeRange: Bool
    /// Time-range start in seconds; 0 when `hasTimeRange` is `false`.
    let timeRangeStartSeconds: Double
    /// Time-range duration in seconds; 0 when `hasTimeRange` is `false`.
    let timeRangeDurationSeconds: Double
    /// Bounding box exactly as reported by the observation.
    let boundingBox: CGRect
}

/// Snapshots a recognized-text observation into a plain Swift value.
///
/// - Parameter observation: The Vision observation to copy from.
/// - Returns: A `CollectedTextObservation` whose text is the top-ranked candidate's string (empty
///   when there is no candidate). The time range is reported only when both its start and its
///   duration convert to finite seconds; otherwise both values are zeroed.
internal func collectTextObservation(_ observation: VNRecognizedTextObservation) -> CollectedTextObservation {
    let range = observation.timeRange
    let startSeconds = CMTimeGetSeconds(range.start)
    let durationSeconds = CMTimeGetSeconds(range.duration)

    // Non-finite seconds (NaN / infinity) mean there is no usable range to report.
    let reportedStart: Double
    let reportedDuration: Double
    let rangeIsUsable = startSeconds.isFinite && durationSeconds.isFinite
    if rangeIsUsable {
        reportedStart = startSeconds
        reportedDuration = durationSeconds
    } else {
        reportedStart = 0
        reportedDuration = 0
    }

    let bestString = observation.topCandidates(1).first?.string ?? ""

    return CollectedTextObservation(
        uuid: observation.uuid.uuidString,
        text: bestString,
        confidence: observation.confidence,
        hasTimeRange: rangeIsUsable,
        timeRangeStartSeconds: reportedStart,
        timeRangeDurationSeconds: reportedDuration,
        boundingBox: observation.boundingBox
    )
}

/// Copies collected observations into a freshly allocated array of `VNRequestObservationRaw`.
///
/// - Parameters:
///   - observations: The observations to export.
///   - outArray: Receives the buffer as a raw pointer, or `nil` when `observations` is empty.
///   - outCount: Receives the number of records written.
///
/// The buffer and the strings inside it are released by `vn_request_observations_free`.
internal func packCollectedTextObservations(
    _ observations: [CollectedTextObservation],
    outArray: UnsafeMutablePointer<UnsafeMutableRawPointer?>,
    outCount: UnsafeMutablePointer<Int>
) {
    if observations.isEmpty {
        // Nothing to export: report an empty result without allocating.
        outArray.pointee = nil
        outCount.pointee = 0
        return
    }

    let total = observations.count
    let storage = UnsafeMutablePointer<VNRequestObservationRaw>.allocate(capacity: total)
    for offset in observations.indices {
        let item = observations[offset]
        let rect = item.boundingBox
        let record = VNRequestObservationRaw(
            uuid: ffiString(item.uuid),
            text: ffiString(item.text),
            confidence: item.confidence,
            has_time_range: item.hasTimeRange,
            time_range_start_seconds: item.timeRangeStartSeconds,
            time_range_duration_seconds: item.timeRangeDurationSeconds,
            bbox_x: Double(rect.origin.x),
            bbox_y: Double(rect.origin.y),
            bbox_w: Double(rect.size.width),
            bbox_h: Double(rect.size.height)
        )
        (storage + offset).initialize(to: record)
    }

    outArray.pointee = UnsafeMutableRawPointer(storage)
    outCount.pointee = total
}

/// Creates a `VNRecognizeTextRequest` configured from the flat FFI arguments.
///
/// - Parameters:
///   - recognitionLevel: `1` selects `.accurate`; every other value selects `.fast`.
///   - usesLanguageCorrection: Forwarded to the request unchanged.
///   - preferBackgroundProcessing: Forwarded to the request unchanged.
///   - usesCPUOnly: Forwarded to the request unchanged.
///   - revision: Algorithm revision; applied only when `hasRevision` is `true`.
///   - hasRevision: Whether `revision` should be applied.
///   - completionHandler: Optional completion handler passed to the request's initializer.
/// - Returns: The configured request.
internal func buildRecognizeTextRequest(
    recognitionLevel: Int32,
    usesLanguageCorrection: Bool,
    preferBackgroundProcessing: Bool,
    usesCPUOnly: Bool,
    revision: Int,
    hasRevision: Bool,
    completionHandler: VNRequestCompletionHandler? = nil
) -> VNRecognizeTextRequest {
    // Use the handler-taking initializer only when a handler was actually supplied.
    let textRequest = completionHandler.map { VNRecognizeTextRequest(completionHandler: $0) }
        ?? VNRecognizeTextRequest()

    switch recognitionLevel {
    case 1:
        textRequest.recognitionLevel = .accurate
    default:
        textRequest.recognitionLevel = .fast
    }
    textRequest.usesLanguageCorrection = usesLanguageCorrection
    textRequest.preferBackgroundProcessing = preferBackgroundProcessing
    textRequest.usesCPUOnly = usesCPUOnly

    // Leave the framework's default revision in place unless one was explicitly requested.
    guard hasRevision else { return textRequest }
    textRequest.revision = revision
    return textRequest
}

/// Applies the shared FFI configuration flags to an image-based Vision request.
///
/// - Parameters:
///   - request: The request to configure in place.
///   - roiX: Region-of-interest origin x; used only when `hasRegionOfInterest` is `true`.
///   - roiY: Region-of-interest origin y; used only when `hasRegionOfInterest` is `true`.
///   - roiW: Region-of-interest width; used only when `hasRegionOfInterest` is `true`.
///   - roiH: Region-of-interest height; used only when `hasRegionOfInterest` is `true`.
///   - hasRegionOfInterest: Whether the region of interest should be set.
///   - preferBackgroundProcessing: Forwarded to the request unchanged.
///   - usesCPUOnly: Forwarded to the request unchanged.
///   - revision: Algorithm revision; applied only when `hasRevision` is `true`.
///   - hasRevision: Whether `revision` should be applied.
internal func applyImageBasedRequestConfig(
    _ request: VNImageBasedRequest,
    roiX: Double,
    roiY: Double,
    roiW: Double,
    roiH: Double,
    hasRegionOfInterest: Bool,
    preferBackgroundProcessing: Bool,
    usesCPUOnly: Bool,
    revision: Int,
    hasRevision: Bool
) {
    if hasRegionOfInterest {
        let origin = CGPoint(x: roiX, y: roiY)
        let extent = CGSize(width: roiW, height: roiH)
        request.regionOfInterest = CGRect(origin: origin, size: extent)
    }

    request.preferBackgroundProcessing = preferBackgroundProcessing
    request.usesCPUOnly = usesCPUOnly

    // Keep the request's default revision unless the caller supplied one.
    guard hasRevision else { return }
    request.revision = revision
}

/// Runs a text-recognition request on the image at `imagePath` using a one-shot
/// `VNImageRequestHandler` and exports the resulting observations.
///
/// - Parameters:
///   - imagePath: C string holding the path of the image to load.
///   - recognitionLevel: `1` selects accurate recognition; any other value selects fast.
///   - usesLanguageCorrection: Forwarded to the request.
///   - preferBackgroundProcessing: Forwarded to the request.
///   - usesCPUOnly: Forwarded to the request.
///   - revision: Algorithm revision; applied only when `hasRevision` is `true`.
///   - hasRevision: Whether `revision` should be applied.
///   - outArray: Receives the observation buffer (`nil` on failure or when nothing was recognized).
///     Release it with `vn_request_observations_free`.
///   - outCount: Receives the number of records in `outArray`.
///   - outErrorMessage: Optional; receives an error string produced by `ffiString` on failure and
///     `nil` on success.
/// - Returns: `VN_OK` on success, `VN_IMAGE_LOAD_FAILED` when the image could not be loaded, or
///   `VN_REQUEST_FAILED` when Vision rejected the request.
@_cdecl("vn_image_request_handler_perform_text_request")
public func vn_image_request_handler_perform_text_request(
    _ imagePath: UnsafePointer<CChar>,
    _ recognitionLevel: Int32,
    _ usesLanguageCorrection: Bool,
    _ preferBackgroundProcessing: Bool,
    _ usesCPUOnly: Bool,
    _ revision: Int,
    _ hasRevision: Bool,
    _ outArray: UnsafeMutablePointer<UnsafeMutableRawPointer?>,
    _ outCount: UnsafeMutablePointer<Int>,
    _ outErrorMessage: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>?
) -> Int32 {
    // Put every out-parameter into a defined state first so that no return path — including
    // success, which previously left `outErrorMessage` untouched — exposes stale caller memory.
    outArray.pointee = nil
    outCount.pointee = 0
    outErrorMessage?.pointee = nil

    let path = String(cString: imagePath)
    guard let cgImage = loadCGImage(path: path) else {
        outErrorMessage?.pointee = ffiString("could not load image at \(path)")
        return VN_IMAGE_LOAD_FAILED
    }

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let request = buildRecognizeTextRequest(
        recognitionLevel: recognitionLevel,
        usesLanguageCorrection: usesLanguageCorrection,
        preferBackgroundProcessing: preferBackgroundProcessing,
        usesCPUOnly: usesCPUOnly,
        revision: revision,
        hasRevision: hasRevision
    )

    do {
        try handler.perform([request])
    } catch {
        outErrorMessage?.pointee = ffiString("VNImageRequestHandler.perform(text) failed: \(error.localizedDescription)")
        return VN_REQUEST_FAILED
    }

    let observations = (request.results ?? []).map(collectTextObservation)
    packCollectedTextObservations(observations, outArray: outArray, outCount: outCount)
    return VN_OK
}

/// Owns a single `VNSequenceRequestHandler` so that successive text requests can be performed
/// through the same handler instance.
final class TextSequenceRequestHandlerSession {
    private let sequenceHandler = VNSequenceRequestHandler()

    /// Loads the image at `path`, runs a text-recognition request on it through the session's
    /// sequence handler, and returns the collected observations.
    ///
    /// - Throws: An `NSError` in the `"apple-vision"` domain with code `VN_IMAGE_LOAD_FAILED` when
    ///   the image cannot be loaded, or whatever error the sequence handler's `perform` throws.
    func perform(
        path: String,
        recognitionLevel: Int32,
        usesLanguageCorrection: Bool,
        preferBackgroundProcessing: Bool,
        usesCPUOnly: Bool,
        revision: Int,
        hasRevision: Bool
    ) throws -> [CollectedTextObservation] {
        guard let image = loadCGImage(path: path) else {
            let details: [String: Any] = [NSLocalizedDescriptionKey: "could not load image at \(path)"]
            throw NSError(domain: "apple-vision", code: Int(VN_IMAGE_LOAD_FAILED), userInfo: details)
        }

        let textRequest = buildRecognizeTextRequest(
            recognitionLevel: recognitionLevel,
            usesLanguageCorrection: usesLanguageCorrection,
            preferBackgroundProcessing: preferBackgroundProcessing,
            usesCPUOnly: usesCPUOnly,
            revision: revision,
            hasRevision: hasRevision
        )
        try sequenceHandler.perform([textRequest], on: image)

        // A request that produced no results yields an empty list rather than nil.
        guard let results = textRequest.results else { return [] }
        return results.map(collectTextObservation)
    }
}

/// Creates a text sequence-request session and returns it to the caller as an opaque handle.
///
/// - Parameters:
///   - outHandle: Receives the retained session pointer. Balance it with
///     `vn_sequence_request_handler_free`.
///   - outErrorMessage: Optional; always set to `nil` because creation cannot fail here.
/// - Returns: Always `VN_OK`.
@_cdecl("vn_sequence_request_handler_create")
public func vn_sequence_request_handler_create(
    _ outHandle: UnsafeMutablePointer<UnsafeMutableRawPointer?>,
    _ outErrorMessage: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>?
) -> Int32 {
    outErrorMessage?.pointee = nil

    let session = TextSequenceRequestHandlerSession()
    // Hand a +1 retain to the caller; the session stays alive until the handle is freed.
    let retained = Unmanaged.passRetained(session)
    outHandle.pointee = retained.toOpaque()
    return VN_OK
}

/// Runs a text-recognition request on the image at `imagePath` through an existing
/// sequence-request session and exports the resulting observations.
///
/// - Parameters:
///   - handle: Opaque session pointer obtained from `vn_sequence_request_handler_create`.
///   - imagePath: C string holding the path of the image to load.
///   - recognitionLevel: `1` selects accurate recognition; any other value selects fast.
///   - usesLanguageCorrection: Forwarded to the request.
///   - preferBackgroundProcessing: Forwarded to the request.
///   - usesCPUOnly: Forwarded to the request.
///   - revision: Algorithm revision; applied only when `hasRevision` is `true`.
///   - hasRevision: Whether `revision` should be applied.
///   - outArray: Receives the observation buffer (`nil` on failure or when nothing was recognized).
///     Release it with `vn_request_observations_free`.
///   - outCount: Receives the number of records in `outArray`.
///   - outErrorMessage: Optional; receives an error string produced by `ffiString` on failure and
///     `nil` on success.
/// - Returns: `VN_OK` on success, `VN_INVALID_ARGUMENT` for a null handle, `VN_IMAGE_LOAD_FAILED`
///   when the image could not be loaded, or `VN_REQUEST_FAILED` for any other error.
@_cdecl("vn_sequence_request_handler_perform_text_request")
public func vn_sequence_request_handler_perform_text_request(
    _ handle: UnsafeMutableRawPointer?,
    _ imagePath: UnsafePointer<CChar>,
    _ recognitionLevel: Int32,
    _ usesLanguageCorrection: Bool,
    _ preferBackgroundProcessing: Bool,
    _ usesCPUOnly: Bool,
    _ revision: Int,
    _ hasRevision: Bool,
    _ outArray: UnsafeMutablePointer<UnsafeMutableRawPointer?>,
    _ outCount: UnsafeMutablePointer<Int>,
    _ outErrorMessage: UnsafeMutablePointer<UnsafeMutablePointer<CChar>?>?
) -> Int32 {
    // Define every out-parameter up front so no return path exposes stale caller memory.
    outArray.pointee = nil
    outCount.pointee = 0
    outErrorMessage?.pointee = nil

    guard let handle else {
        outErrorMessage?.pointee = ffiString("null sequence request handler handle")
        return VN_INVALID_ARGUMENT
    }

    // Borrow the session without touching its retain count; the caller still owns the handle.
    let session = Unmanaged<TextSequenceRequestHandlerSession>.fromOpaque(handle).takeUnretainedValue()
    let path = String(cString: imagePath)
    do {
        let observations = try session.perform(
            path: path,
            recognitionLevel: recognitionLevel,
            usesLanguageCorrection: usesLanguageCorrection,
            preferBackgroundProcessing: preferBackgroundProcessing,
            usesCPUOnly: usesCPUOnly,
            revision: revision,
            hasRevision: hasRevision
        )
        packCollectedTextObservations(observations, outArray: outArray, outCount: outCount)
        return VN_OK
    } catch {
        outErrorMessage?.pointee = ffiString("VNSequenceRequestHandler.perform(text) failed: \(error.localizedDescription)")
        // An NSError is identified by domain *and* code. Matching on the code alone would
        // misreport a framework error that happens to share the numeric value of
        // VN_IMAGE_LOAD_FAILED, so also require the session's own "apple-vision" domain.
        let nsError = error as NSError
        let isImageLoadFailure = nsError.domain == "apple-vision"
            && nsError.code == Int(VN_IMAGE_LOAD_FAILED)
        return isImageLoadFailure ? VN_IMAGE_LOAD_FAILED : VN_REQUEST_FAILED
    }
}

/// Releases the retain held by a handle from `vn_sequence_request_handler_create`.
///
/// - Parameter handle: The opaque session pointer; a `nil` handle is ignored.
@_cdecl("vn_sequence_request_handler_free")
public func vn_sequence_request_handler_free(_ handle: UnsafeMutableRawPointer?) {
    if let handle {
        // Balances the `passRetained` performed when the handle was created.
        let retained = Unmanaged<TextSequenceRequestHandlerSession>.fromOpaque(handle)
        retained.release()
    }
}

/// Releases an observation buffer produced by `packCollectedTextObservations`, including the
/// `uuid` and `text` strings owned by each record.
///
/// - Parameters:
///   - array: The buffer to release; a `nil` pointer is ignored.
///   - count: Number of records in `array`. Values below zero are treated as zero so that a bad
///     count from the foreign caller cannot trap while forming the loop range; the buffer itself
///     is still deallocated.
@_cdecl("vn_request_observations_free")
public func vn_request_observations_free(
    _ array: UnsafeMutableRawPointer?,
    _ count: Int
) {
    guard let array else { return }
    let typed = array.assumingMemoryBound(to: VNRequestObservationRaw.self)
    // `0..<count` is a precondition failure for negative `count`; clamp instead of crashing.
    let recordCount = max(count, 0)
    for index in 0..<recordCount {
        let observation = typed[index]
        // `free` accepts a null pointer, so absent strings need no special casing.
        free(observation.uuid)
        free(observation.text)
    }
    typed.deallocate()
}