car-inference 0.13.0

Local model inference for CAR — Candle backend with Qwen3 models, plus an Apple FoundationModels backend on macOS.
Documentation:
//! Apple FoundationModels backend — on-device inference through the
//! macOS 26+ system LLM. Bridges to a small Swift shim
//! (`swift/CarFoundationModels.swift`) compiled by `build.rs`.
//!
//! Two paths are exposed:
//! - [`generate`]: blocking text generation (single round-trip).
//! - [`stream`]: callback-based incremental streaming.
//!
//! Tool calling is **not** wired in v1 — Foundation Models supports it
//! via the Swift `Tool` protocol but the schema mapping is non-trivial
//! and would block this PR. Callers requesting tools fall through to
//! the next router candidate via [`InferenceError::UnsupportedMode`].
//!
//! # Tool calling — design notes for the next pass
//!
//! Apple's `Tool` protocol takes `Arguments: Generable`, which is a
//! Swift-static, macro-derived protocol. To bridge dynamic JSON-schema
//! tool definitions from Rust, we have two viable paths:
//!
//! 1. **`DynamicGenerationSchema`** (preferred). macOS 26's Foundation
//!    Models exposes a runtime schema builder that maps onto a single
//!    `GeneratedContent` argument type. The Swift shim builds one of
//!    these per Rust-defined tool, registers a `BridgeTool` whose
//!    `call(arguments:)` body decodes the `GeneratedContent` to JSON,
//!    invokes a Rust callback (FFI extern fn pointer + opaque state),
//!    and wraps the JSON response as `ToolOutput.GeneratedContent(...)`.
//! 2. **Single dispatch tool**. One Swift `Tool` named `bridge_call`
//!    whose arguments are `(name: String, args_json: String)`. The
//!    model picks tools by writing the name; the bridge dispatches.
//!    Lossier (the model can't see per-tool schemas) but trivial to
//!    implement and recovers when (1) hits an API edge case.
//!
//! Surface to add:
//! - `extern fn car_fm_generate_with_tools(prompt, instructions,
//!   tools_json, dispatch_cb, dispatch_state, ...)` — JSON tool catalog,
//!   plus a `(name, args_json) -> result_json` callback. Mirror
//!   `call_async::<Promise<String>>` pattern from NAPI; keep
//!   `ErrorStrategy::Fatal`.
//! - Update `pub fn generate(...)` to take an optional `tools` arg and
//!   route to the new shim entry when present; otherwise no behavior
//!   change.
//! - Honor cancellation: the streaming case will need a `dispatch_cb`
//!   that can return an error to abort the model turn.
//!
//! Deferred because: (a) `DynamicGenerationSchema` is macOS 26-only and
//! the existing `aarch64-apple-darwin` build gate means there's no way
//! to compile-verify the Swift on Intel hosts; (b) Apple Intelligence
//! itself needs to be provisioned to runtime-verify. Land this in a
//! change reviewed on an Apple Silicon dev machine.
//!
//! Cfg-gated to `aarch64-apple-darwin`; everything below that line is
//! invisible on Linux/Intel-Mac builds. On those platforms the upstream
//! schema check (`is_foundation_models()`) still works (so registries
//! can describe the model) but dispatch errors out before calling here.

use std::ffi::{c_char, c_int, c_void, CStr, CString};
use std::ptr;
use std::sync::Mutex;
use std::time::{Duration, Instant};

use crate::InferenceError;

/// TTL for the cached result of the [`is_available`] framework probe.
/// Apple Intelligence can be toggled on/off in System Settings and the
/// model may finish provisioning after process start, so caching the
/// answer forever would strand a long-running daemon. Five seconds is
/// short enough to track settings changes yet long enough to absorb a
/// tight router loop without re-crossing the FFI boundary.
const AVAILABILITY_CACHE_TTL: Duration = Duration::from_millis(5_000);

// ---------------------------------------------------------------------
// extern "C" surface emitted by the Swift shim.
//
// Gated by `car_fm_swift_built`, set by `build.rs` only when `swiftc`
// successfully compiled the shim into a static library. On hosts
// without full Xcode (Command Line Tools only) the cfg is absent and
// we provide stubs that report unavailable — keeps the crate buildable
// and lets the runtime path return a clean UnsupportedMode instead of
// failing at link time.
// ---------------------------------------------------------------------

#[cfg(car_fm_swift_built)]
extern "C" {
    /// Availability probe. Non-zero means the framework is present and
    /// the on-device model is provisioned (interpreted by [`is_available`]).
    fn car_fm_is_available() -> c_int;
    /// Releases a string the shim allocated and handed back through one
    /// of the `out_*` parameters below. Paired with every consumption in
    /// [`consume_swift_string`]; must be called exactly once per pointer.
    fn car_fm_free_string(ptr: *mut c_char);
    /// Blocking single-shot generation. Returns 0 on success (with
    /// `*out_text` set to a shim-allocated string) and non-zero on
    /// failure (with `*out_err` possibly set — callers must tolerate a
    /// null error string). Any returned string must be released via
    /// `car_fm_free_string`.
    fn car_fm_generate(
        prompt: *const c_char,
        instructions: *const c_char, // nullable: NULL means "no instructions"
        max_tokens: i32,
        temperature: f64,
        out_text: *mut *mut c_char,
        out_err: *mut *mut c_char,
    ) -> c_int;
    /// Blocking streaming generation. `callback` is invoked once per
    /// delta with `state` passed through opaquely; a non-zero callback
    /// return cancels the stream (see [`stream_trampoline`]). Returns 0
    /// on success, non-zero on failure with `*out_err` possibly set.
    fn car_fm_generate_stream(
        prompt: *const c_char,
        instructions: *const c_char, // nullable: NULL means "no instructions"
        max_tokens: i32,
        temperature: f64,
        callback: extern "C" fn(token: *const c_char, state: *mut c_void) -> c_int,
        state: *mut c_void,
        out_err: *mut *mut c_char,
    ) -> c_int;
}

#[cfg(not(car_fm_swift_built))]
mod swift_stubs {
    //! Fallback implementations used when `build.rs` could not compile
    //! the Swift shim (e.g. Command Line Tools without full Xcode).
    //! They keep the crate linkable; at runtime only
    //! `car_fm_is_available` is ever reached, because it reports
    //! "unavailable" and the public entry points bail out before
    //! touching the other functions.
    use super::{c_char, c_int, c_void};
    /// Always returns 0 (unavailable). The runtime check in
    /// [`super::is_available`] catches this and prevents any of the
    /// other extern paths from being called.
    pub(super) unsafe fn car_fm_is_available() -> c_int {
        0
    }
    /// No allocation crossed the boundary, so nothing to free.
    pub(super) unsafe fn car_fm_free_string(_ptr: *mut c_char) {}
    /// Unreachable: gated upstream by [`super::is_available`].
    /// Reaching it means a caller bypassed the availability check — a
    /// bug, hence `unreachable!` rather than a soft error.
    pub(super) unsafe fn car_fm_generate(
        _prompt: *const c_char,
        _instructions: *const c_char,
        _max_tokens: i32,
        _temperature: f64,
        _out_text: *mut *mut c_char,
        _out_err: *mut *mut c_char,
    ) -> c_int {
        unreachable!("car_fm_generate called without the Swift bridge")
    }
    /// Unreachable: gated upstream by [`super::is_available`].
    pub(super) unsafe fn car_fm_generate_stream(
        _prompt: *const c_char,
        _instructions: *const c_char,
        _max_tokens: i32,
        _temperature: f64,
        _callback: extern "C" fn(token: *const c_char, state: *mut c_void) -> c_int,
        _state: *mut c_void,
        _out_err: *mut *mut c_char,
    ) -> c_int {
        unreachable!("car_fm_generate_stream called without the Swift bridge")
    }
}

#[cfg(not(car_fm_swift_built))]
use swift_stubs::{
    car_fm_free_string, car_fm_generate, car_fm_generate_stream, car_fm_is_available,
};

// ---------------------------------------------------------------------
// Public API.
// ---------------------------------------------------------------------

/// Reports whether the FoundationModels framework is usable **and**
/// the on-device model is provisioned. The FFI probe result is cached
/// for [`AVAILABILITY_CACHE_TTL`] so a hot router loop avoids repeated
/// boundary crossings, while still recovering from "unavailable at
/// startup, available later" — the common case for long-running
/// daemons whose host finishes Apple Intelligence provisioning minutes
/// after launch.
pub fn is_available() -> bool {
    // Cached (probe timestamp, probe result). The Mutex cost is
    // negligible next to the FFI call it saves us from repeating.
    static CACHE: Mutex<Option<(Instant, bool)>> = Mutex::new(None);

    let now = Instant::now();
    // A poisoned lock only means some thread panicked while holding it;
    // the cached pair is still coherent data, so recover and use it.
    let mut slot = CACHE
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    match *slot {
        Some((stamped, cached)) if now.duration_since(stamped) < AVAILABILITY_CACHE_TTL => cached,
        _ => {
            let fresh = unsafe { car_fm_is_available() != 0 };
            *slot = Some((now, fresh));
            fresh
        }
    }
}

/// Blocking single-shot generation. The caller is responsible for
/// running this on a blocking-friendly executor (the Swift bridge uses
/// a `DispatchSemaphore` to wait on the underlying async task).
///
/// # Errors
///
/// - [`InferenceError::UnsupportedMode`] when [`is_available`] reports
///   the framework/model is unusable on this host.
/// - [`InferenceError::InferenceFailed`] when a string contains an
///   interior NUL (cannot cross the C boundary) or the Swift shim
///   returns a non-zero status.
pub fn generate(
    prompt: &str,
    instructions: Option<&str>,
    max_tokens: u32,
    temperature: f32,
) -> Result<String, InferenceError> {
    if !is_available() {
        return Err(unavailable_error());
    }

    let prompt_c = CString::new(prompt)
        .map_err(|e| InferenceError::InferenceFailed(format!("prompt has interior NUL: {e}")))?;
    // Empty instructions are treated the same as absent: pass NULL.
    let instr_c = match instructions {
        Some(s) if !s.is_empty() => Some(CString::new(s).map_err(|e| {
            InferenceError::InferenceFailed(format!("instructions have interior NUL: {e}"))
        })?),
        _ => None,
    };

    let mut out_text: *mut c_char = ptr::null_mut();
    let mut out_err: *mut c_char = ptr::null_mut();

    // SAFETY: both CStrings outlive the call; the out pointers are valid
    // for writes; anything the shim writes into them is released exactly
    // once below via `consume_swift_string`.
    let rc = unsafe {
        car_fm_generate(
            prompt_c.as_ptr(),
            instr_c.as_ref().map_or(ptr::null(), |s| s.as_ptr()),
            max_tokens.min(i32::MAX as u32) as i32,
            temperature as f64,
            &mut out_text as *mut *mut c_char,
            &mut out_err as *mut *mut c_char,
        )
    };

    if rc != 0 {
        // Defensive: if the shim allocated partial text before failing,
        // free it rather than leak it (no-op when null) — TODO confirm
        // against the Swift shim's contract.
        let _ = consume_swift_string(out_text);
        let mut message = consume_swift_string(out_err);
        if message.is_empty() {
            // Never surface an empty error: include the status code so
            // the failure is at least attributable to the bridge.
            message = format!("car_fm_generate failed with status {rc}");
        }
        return Err(InferenceError::InferenceFailed(message));
    }
    // Success should leave out_err null; free defensively if it isn't.
    let _ = consume_swift_string(out_err);
    Ok(consume_swift_string(out_text))
}

/// Streaming consumer handed to [`stream`]. The Swift side already
/// prefix-diffs the cumulative snapshots Foundation Models emits, so
/// each invocation receives only the newly appended text fragment —
/// no further diffing is needed on the consumer.
pub struct StreamCallback<'a> {
    on_delta: Box<dyn FnMut(&str) -> bool + Send + 'a>,
}

impl<'a> StreamCallback<'a> {
    /// Wraps `on_delta`, which is called once per incremental text
    /// fragment. Returning `false` from the closure cancels the stream.
    pub fn new<F>(on_delta: F) -> Self
    where
        F: FnMut(&str) -> bool + Send + 'a,
    {
        StreamCallback {
            on_delta: Box::new(on_delta),
        }
    }
}

/// C-ABI trampoline handed to the Swift shim; `state` is the
/// `*mut StreamCallback` that [`stream`] passes alongside it.
extern "C" fn stream_trampoline(token: *const c_char, state: *mut c_void) -> c_int {
    // Swift owns the thread this runs on, and a Rust panic unwinding
    // across the FFI boundary is undefined behavior. Every path is
    // therefore wrapped in catch_unwind, and a panic is reported as
    // "cancel" so the model turn aborts cleanly instead of taking the
    // process down.
    let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| -> c_int {
        // SAFETY: `stream` points `state` at a StreamCallback that stays
        // alive for the whole blocking FFI call; null is handled here.
        let cb = match unsafe { (state as *mut StreamCallback).as_mut() } {
            Some(cb) => cb,
            None => return 1, // no consumer to forward to: cancel
        };
        let fragment = if token.is_null() {
            ""
        } else {
            // SAFETY: a non-null token from Swift is a NUL-terminated C
            // string valid for the duration of this callback.
            match unsafe { CStr::from_ptr(token) }.to_str() {
                Ok(text) => text,
                Err(_) => return 1, // non-UTF-8 delta: abort the stream
            }
        };
        // 0 = keep streaming, non-zero = cancel the model turn.
        if (cb.on_delta)(fragment) {
            0
        } else {
            1
        }
    }));
    outcome.unwrap_or(1) // translate a panic into "cancel"
}

/// Blocking streaming generation. Each delta is forwarded to the
/// callback as a string slice owned by Swift — copy if it needs to
/// outlive the call.
///
/// # Errors
///
/// - [`InferenceError::UnsupportedMode`] when [`is_available`] reports
///   the framework/model is unusable on this host.
/// - [`InferenceError::InferenceFailed`] when a string contains an
///   interior NUL (cannot cross the C boundary) or the Swift shim
///   returns a non-zero status.
pub fn stream(
    prompt: &str,
    instructions: Option<&str>,
    max_tokens: u32,
    temperature: f32,
    mut callback: StreamCallback<'_>,
) -> Result<(), InferenceError> {
    if !is_available() {
        return Err(unavailable_error());
    }

    let prompt_c = CString::new(prompt)
        .map_err(|e| InferenceError::InferenceFailed(format!("prompt has interior NUL: {e}")))?;
    // Empty instructions are treated the same as absent: pass NULL.
    let instr_c = match instructions {
        Some(s) if !s.is_empty() => Some(CString::new(s).map_err(|e| {
            InferenceError::InferenceFailed(format!("instructions have interior NUL: {e}"))
        })?),
        _ => None,
    };

    let mut out_err: *mut c_char = ptr::null_mut();
    let state: *mut c_void = &mut callback as *mut StreamCallback as *mut c_void;

    // SAFETY: `callback` lives on this stack frame for the entire
    // blocking call, so `state` stays valid for every trampoline hit;
    // the CStrings likewise outlive the call.
    let rc = unsafe {
        car_fm_generate_stream(
            prompt_c.as_ptr(),
            instr_c.as_ref().map_or(ptr::null(), |s| s.as_ptr()),
            max_tokens.min(i32::MAX as u32) as i32,
            temperature as f64,
            stream_trampoline,
            state,
            &mut out_err as *mut *mut c_char,
        )
    };

    if rc != 0 {
        let mut message = consume_swift_string(out_err);
        if message.is_empty() {
            // Never surface an empty error: include the status code so
            // the failure is at least attributable to the bridge.
            message = format!("car_fm_generate_stream failed with status {rc}");
        }
        return Err(InferenceError::InferenceFailed(message));
    }
    // Success should leave out_err null; free defensively if it isn't.
    let _ = consume_swift_string(out_err);
    Ok(())
}

// ---------------------------------------------------------------------
// Helpers.
// ---------------------------------------------------------------------

/// Copies a Swift-allocated C string into an owned `String`, then
/// releases the original via `car_fm_free_string`. A null pointer
/// yields `""`; invalid UTF-8 is replaced lossily rather than erroring.
/// Takes ownership of `raw` — callers must not use it afterwards.
fn consume_swift_string(raw: *mut c_char) -> String {
    if raw.is_null() {
        return String::new();
    }
    // SAFETY: a non-null pointer from the shim is a NUL-terminated
    // string it allocated; we copy its bytes before freeing, so the
    // borrow never outlives the allocation.
    let owned = unsafe { CStr::from_ptr(raw) }.to_string_lossy().into_owned();
    // SAFETY: `raw` was allocated by the Swift shim and is freed
    // exactly once, here.
    unsafe { car_fm_free_string(raw) };
    owned
}

/// Builds the router-visible "fall through to the next candidate" error
/// returned by [`generate`] and [`stream`] when [`is_available`] is
/// false. Uses `UnsupportedMode` (not `InferenceFailed`) so the router
/// treats it as a capability gap rather than a hard failure.
fn unavailable_error() -> InferenceError {
    InferenceError::UnsupportedMode {
        mode: "apple-foundation-models",
        backend: "foundation-models",
        reason: "FoundationModels framework reports unavailable on this host. Requires macOS 26+ \
             on Apple Silicon with Apple Intelligence enabled. Falling through to the next \
             router candidate.",
    }
}