car-inference 0.29.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Locate + provision the external `vllm-mlx` OpenAI-compatible MLX server that
//! CAR supervises to serve `vllm-mlx/*` models.
//!
//! The in-process MLX backend is text-only, so multimodal and unsupported-arch
//! models (`vllm-mlx/gemma-4-*`, `vllm-mlx/qwen3.6-*`) run through a
//! `vllm-mlx serve <model> --port <P>` process that exposes an OpenAI API. CAR
//! supervises that process (start / health-wait / idle-stop) so those models
//! "just work" — and, when the runtime is absent, provisions it so the user
//! doesn't have to. `vllm-mlx` is a PyPI package (`pip install vllm-mlx`); we
//! install it into a dedicated `uv` venv under `~/.car/visual-runtime`, mirroring
//! the managed speech runtime.
//!
//! Resolution prefers an *existing* install (the user may already have it on
//! PATH or via `uv tool install vllm-mlx`); provisioning only happens when none
//! is found, and lands in the CAR-managed venv.

use std::path::{Path, PathBuf};
use std::time::Duration;

use tokio::process::Command;

/// Binary that serves an OpenAI-compatible MLX endpoint (`vllm-mlx serve …`).
/// This is the file name looked up in every search directory and inside the
/// managed venv's `bin/`.
const SERVER_BIN: &str = "vllm-mlx";

/// Minimum `vllm-mlx` version required, as `(major, minor, patch)`; tuples
/// compare lexicographically, which gives the usual version ordering.
///
/// `< 0.3.0` (mlx-vlm `< ~0.6`) hits the upstream
/// `RuntimeError: There is no Stream(gpu, 0)` on every server-side
/// generation — verified fixed at vllm-mlx 0.3.0 / mlx-vlm 0.6.3. A binary older
/// than this is skipped during resolution so we provision a working one instead.
///
/// Keep in sync with the floor spelled out in [`PIP_SPEC`].
const MIN_VERSION: (u64, u64, u64) = (0, 3, 0);

/// The PyPI requirement that provides [`SERVER_BIN`]. This is a lower bound
/// (`>=`), not an exact pin: a provision installs whatever release at or above
/// the floor the resolver picks, but can never land a known-broken build.
///
/// Keep the floor in sync with [`MIN_VERSION`].
const PIP_SPEC: &str = "vllm-mlx>=0.3.0";

/// Errors from locating or provisioning the vLLM-MLX runtime.
///
/// All variants originate on the provisioning path; merely locating an
/// existing binary reports absence as `None` rather than as an error.
#[derive(Debug, thiserror::Error)]
pub enum RuntimeError {
    /// No `uv` executable was found on `PATH`, so the managed venv cannot be
    /// created. The message points the user at both remedies.
    #[error("`uv` is required to provision the vllm-mlx runtime but was not found on PATH; \
             install uv (https://docs.astral.sh/uv/) or `pip install vllm-mlx` yourself")]
    UvMissing,
    /// One provisioning step failed. `step` is a short fixed label for the
    /// stage (`"resolve-home"`, `"mkdir"`, `"venv"`, `"pip-install"`); `detail`
    /// is the underlying I/O error text or the `uv` exit status plus stderr.
    #[error("provisioning step `{step}` failed: {detail}")]
    Provision { step: &'static str, detail: String },
    /// Provisioning reported success but no server binary could be located
    /// afterwards. Carries the managed venv root that was installed into
    /// (empty if the root could not be determined).
    #[error("vllm-mlx still not found after provisioning into {0}")]
    NotFoundAfterInstall(PathBuf),
}

/// A located (and possibly CAR-provisioned) vLLM-MLX runtime.
///
/// Produced by [`ensure_runtime`].
#[derive(Debug, Clone)]
pub struct VllmRuntime {
    /// Path to the `vllm-mlx` server binary. Canonicalized on a best-effort
    /// basis when it comes from [`resolve_existing`]; right after a fresh
    /// provision it is the `$HOME`-rooted path inside the managed venv, used
    /// as-is.
    pub server: PathBuf,
}

/// Location of the CAR-managed visual runtime venv: `$HOME/.car/visual-runtime`.
///
/// By convention the same venv is shared with the other Python visual/audio
/// runtimes. Returns `None` when `HOME` is not set.
fn managed_root() -> Option<PathBuf> {
    let home = std::env::var_os("HOME")?;
    let mut root = PathBuf::from(home);
    root.push(".car");
    root.push("visual-runtime");
    Some(root)
}

/// Where the server binary lives inside the managed venv
/// (`<managed root>/bin/vllm-mlx`), or `None` when HOME is unknown.
///
/// This only builds the path; it does not check that the file exists.
fn managed_server_bin() -> Option<PathBuf> {
    let mut bin = managed_root()?;
    bin.push("bin");
    bin.push(SERVER_BIN);
    Some(bin)
}

/// Directories scanned for an already-installed `vllm-mlx`, in priority order:
/// every `PATH` entry, then `~/.local/bin`, the `uv tool` bin dir for
/// `vllm-mlx`, and finally the CAR-managed venv's `bin/`. Duplicates are
/// dropped, keeping the first occurrence.
///
/// The explicit `CAR_VLLM_MLX_BIN` override is *not* part of this list; it is
/// handled by the caller before these directories are consulted. The
/// HOME-relative entries are omitted when `HOME` is not set. Mirrors
/// `backend::mlx_vlm_cli`'s search convention.
fn search_dirs() -> Vec<PathBuf> {
    let mut candidates: Vec<PathBuf> = Vec::new();

    if let Some(path_var) = std::env::var_os("PATH") {
        candidates.extend(std::env::split_paths(&path_var));
    }

    if let Some(home) = std::env::var_os("HOME") {
        let home = PathBuf::from(home);
        let dot_local = home.join(".local");
        candidates.push(dot_local.join("bin"));
        candidates.push(
            dot_local
                .join("share")
                .join("uv")
                .join("tools")
                .join("vllm-mlx")
                .join("bin"),
        );
        candidates.push(home.join(".car").join("visual-runtime").join("bin"));
    }

    dedupe(candidates)
}

/// Drop repeated paths while preserving the order of first occurrences.
///
/// Equality is `PathBuf`'s own comparison. The scan is quadratic, which is
/// fine for the handful of directories this is used on.
fn dedupe(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    paths.into_iter().fold(Vec::new(), |mut unique, path| {
        let already_seen = unique.iter().any(|seen| seen == &path);
        if !already_seen {
            unique.push(path);
        }
        unique
    })
}

/// Find an already-installed `vllm-mlx` binary; never provisions.
///
/// Resolution order:
/// 1. `CAR_VLLM_MLX_BIN`, if set and pointing at a regular file. An explicit
///    override is the user's responsibility, so it is taken as-is with no
///    version gate. An override that is not a file is ignored.
/// 2. The first directory from [`search_dirs`] containing a `vllm-mlx` file
///    whose version meets [`MIN_VERSION`]. Too-old binaries are skipped so
///    that [`ensure_runtime`] provisions a working build instead.
///
/// The returned path is canonicalized when possible.
pub fn resolve_existing() -> Option<PathBuf> {
    let pinned = std::env::var_os("CAR_VLLM_MLX_BIN")
        .map(PathBuf::from)
        .filter(|path| path.is_file());
    if let Some(path) = pinned {
        return absolutize(&path);
    }

    search_dirs()
        .into_iter()
        .map(|dir| dir.join(SERVER_BIN))
        .find(|candidate| candidate.is_file() && version_ok(candidate))
        .and_then(|found| absolutize(&found))
}

/// Whether the `vllm-mlx` behind `binary` is new enough, i.e. its version is
/// at least [`MIN_VERSION`].
///
/// When the version cannot be determined the answer is `false`: it is safer
/// to provision a known-good build than to risk the `Stream(gpu, 0)` bug.
fn version_ok(binary: &Path) -> bool {
    match binary_version(binary) {
        Some(found) => found >= MIN_VERSION,
        None => false,
    }
}

/// Ask `binary`'s shebang interpreter for the installed `vllm-mlx` version.
///
/// The first line of `binary` must be a `#!` line. Its interpreter is run
/// with `-c` and a one-liner that prints the installed distribution version
/// via `importlib.metadata`; the output is parsed by [`parse_semver`].
///
/// The shebang is interpreted as follows:
/// - if the whole line names an existing file, it is used verbatim as the
///   interpreter (this also covers interpreter paths that contain spaces);
/// - otherwise it is split on whitespace into the interpreter and its leading
///   arguments, so `#!/usr/bin/env python3` runs `env python3 -c …` rather
///   than trying to spawn a program literally named `/usr/bin/env python3`.
///
/// Only a bounded prefix of the file is read, so pointing this at a large or
/// non-text executable is cheap; such files simply yield `None`.
///
/// Returns `None` when the file is unreadable, has no shebang, the
/// interpreter cannot be spawned or exits non-zero, or the output does not
/// parse. Note that this spawns a subprocess and waits for it synchronously.
fn binary_version(binary: &Path) -> Option<(u64, u64, u64)> {
    use std::io::{BufRead, Read};

    // Generous upper bound for a shebang line; avoids slurping a whole binary.
    const MAX_SHEBANG_BYTES: u64 = 4096;

    let script = std::fs::File::open(binary).ok()?;
    let mut first_line = String::new();
    // `read_line` errors on invalid UTF-8, which maps to `None` like any
    // other unreadable launcher.
    std::io::BufReader::new(script.take(MAX_SHEBANG_BYTES))
        .read_line(&mut first_line)
        .ok()?;
    let shebang = first_line.strip_prefix("#!")?.trim();

    let (program, leading_args): (&str, Vec<&str>) = if Path::new(shebang).is_file() {
        (shebang, Vec::new())
    } else {
        let mut words = shebang.split_whitespace();
        (words.next()?, words.collect())
    };

    let out = std::process::Command::new(program)
        .args(leading_args)
        .args([
            "-c",
            "import importlib.metadata as m; print(m.version('vllm-mlx'))",
        ])
        .output()
        .ok()?;
    if !out.status.success() {
        return None;
    }
    parse_semver(String::from_utf8_lossy(&out.stdout).trim())
}

/// Parse a dotted version string into `(major, minor, patch)`.
///
/// The major component must be a plain integer (surrounding whitespace is
/// tolerated); otherwise the result is `None`. Minor and patch are lenient:
/// only their leading ASCII digits count, so pre-release suffixes such as
/// `3rc1` read as `3`, and a missing or non-numeric component reads as `0`.
/// Components after the third are ignored.
fn parse_semver(s: &str) -> Option<(u64, u64, u64)> {
    /// Numeric value of the leading digit run of `component`, or 0.
    fn leading_number(component: Option<&str>) -> u64 {
        let text = component.unwrap_or("0");
        let digits_end = text
            .find(|c: char| !c.is_ascii_digit())
            .unwrap_or(text.len());
        text[..digits_end].parse().unwrap_or(0)
    }

    let mut components = s.split('.');
    let major: u64 = components.next()?.trim().parse().ok()?;
    let minor = leading_number(components.next());
    let patch = leading_number(components.next());
    Some((major, minor, patch))
}

/// Best-effort canonicalization: the canonical form of `p` when the
/// filesystem can resolve it, otherwise `p` unchanged. Always `Some`; the
/// `Option` return keeps call sites uniform.
fn absolutize(p: &Path) -> Option<PathBuf> {
    let resolved = match std::fs::canonicalize(p) {
        Ok(canonical) => canonical,
        Err(_) => p.to_path_buf(),
    };
    Some(resolved)
}

/// Return a usable `vllm-mlx` runtime, installing one if necessary.
///
/// An existing install found by [`resolve_existing`] is returned right away,
/// so repeated calls are cheap once the runtime is present. Otherwise the
/// runtime is provisioned into the CAR-managed venv via `uv`; network and
/// disk are only touched on that install path.
///
/// # Errors
///
/// Propagates any [`RuntimeError`] from provisioning, and returns
/// [`RuntimeError::NotFoundAfterInstall`] if no binary can be located even
/// after a successful install.
pub async fn ensure_runtime() -> Result<VllmRuntime, RuntimeError> {
    let server = match resolve_existing() {
        Some(found) => found,
        None => {
            provision().await?;
            // The managed venv binary was just installed against the version
            // floor, so it is trusted directly; a fresh resolve is only the
            // fallback.
            let managed = managed_server_bin().filter(|bin| bin.is_file());
            match managed.or_else(resolve_existing) {
                Some(installed) => installed,
                None => {
                    return Err(RuntimeError::NotFoundAfterInstall(
                        managed_root().unwrap_or_default(),
                    ));
                }
            }
        }
    };
    Ok(VllmRuntime { server })
}

/// Provision `vllm-mlx` into `~/.car/visual-runtime` via `uv venv` + `uv pip
/// install`. Mirrors `bootstrap_speech_runtime`.
///
/// Steps, each reported under its own label in [`RuntimeError::Provision`]:
/// 1. `resolve-home` — determine the managed root from `HOME`.
/// 2. `mkdir` — create the root directory if needed.
/// 3. `venv` — create the virtual environment, *only if* it does not already
///    contain a Python interpreter.
/// 4. `pip-install` — install [`PIP_SPEC`] into the venv.
///
/// # Errors
///
/// [`RuntimeError::UvMissing`] when `uv` is not on `PATH`, otherwise a
/// [`RuntimeError::Provision`] naming the failed step.
async fn provision() -> Result<(), RuntimeError> {
    if which("uv").is_none() {
        return Err(RuntimeError::UvMissing);
    }
    let root = managed_root().ok_or_else(|| RuntimeError::Provision {
        step: "resolve-home",
        detail: "HOME is not set".into(),
    })?;
    std::fs::create_dir_all(&root).map_err(|e| RuntimeError::Provision {
        step: "mkdir",
        detail: e.to_string(),
    })?;

    let venv_python = root.join("bin").join("python");

    // The venv is shared with other runtimes (e.g. speech/visual). A bare
    // `uv venv` on an existing environment is NOT a no-op: depending on the uv
    // version it either clears the environment or refuses to run. Only create
    // the venv when its interpreter is missing, so packages installed by a
    // prior runtime are left in place.
    if !venv_python.is_file() {
        run_uv(
            "venv",
            &[
                "venv".into(),
                "--python".into(),
                "python3".into(),
                root.display().to_string(),
            ],
        )
        .await?;
    }

    run_uv(
        "pip-install",
        &[
            "pip".into(),
            "install".into(),
            "--python".into(),
            venv_python.display().to_string(),
            PIP_SPEC.into(),
        ],
    )
    .await?;

    Ok(())
}

/// Run `uv` with `args` and wait for it to finish, capturing its output.
///
/// `step` labels the provisioning stage in any resulting error. A spawn/wait
/// failure reports the I/O error text; a non-zero exit reports the exit
/// status together with the trimmed stderr.
async fn run_uv(step: &'static str, args: &[String]) -> Result<(), RuntimeError> {
    let failure = |detail: String| RuntimeError::Provision { step, detail };

    let mut uv = Command::new("uv");
    // Installing a heavy MLX wheel set can take a while on a cold cache; make
    // sure the child is killed if this future is dropped in the meantime.
    uv.args(args).kill_on_drop(true);

    let finished = match uv.output().await {
        Ok(done) => done,
        Err(spawn_err) => return Err(failure(spawn_err.to_string())),
    };

    if !finished.status.success() {
        let stderr = String::from_utf8_lossy(&finished.stderr);
        return Err(failure(format!(
            "uv exited with {}: {}",
            finished.status,
            stderr.trim()
        )));
    }
    Ok(())
}

/// Minimal `which`: the first `PATH` directory containing a regular file
/// called `name`, joined with `name`. Used to gate the provisioning preflight
/// (`uv` present?).
///
/// `PATH` is split with [`std::env::split_paths`], the same way the binary
/// search does it, so the platform's separator is honored and directories
/// whose names are not valid UTF-8 do not make the whole lookup fail.
///
/// Returns `None` when `PATH` is unset or no entry contains `name`. The
/// executable bit is not checked.
pub(crate) fn which(name: &str) -> Option<PathBuf> {
    let path_var = std::env::var_os("PATH")?;
    let hit = std::env::split_paths(&path_var)
        .map(|dir| dir.join(name))
        .find(|candidate| candidate.is_file());
    hit
}

/// One-shot readiness probe: GET `<endpoint>/health` and report whether the
/// response status was 2xx.
///
/// Trailing slashes on `endpoint` are ignored. `timeout` bounds the whole
/// request; keep it short, since callers poll this in a readiness loop after
/// spawning the process. Any client-construction or transport error counts
/// as "not healthy".
pub async fn health_ok(endpoint: &str, timeout: Duration) -> bool {
    let probe_url = [endpoint.trim_end_matches('/'), "/health"].concat();

    match reqwest::Client::builder().timeout(timeout).build() {
        Err(_) => false,
        Ok(http) => match http.get(&probe_url).send().await {
            Ok(response) => response.status().is_success(),
            Err(_) => false,
        },
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The managed root and server binary both live under `.car/visual-runtime`.
    #[test]
    fn managed_paths_are_under_dot_car() {
        if std::env::var_os("HOME").is_none() {
            return; // CI sandbox without HOME: nothing to assert
        }
        let root = managed_root().unwrap();
        assert!(root.ends_with(".car/visual-runtime"), "root: {}", root.display());
        let bin = managed_server_bin().unwrap();
        assert!(bin.ends_with(".car/visual-runtime/bin/vllm-mlx"));
    }

    /// Whenever HOME is known, the managed venv bin is among the candidates.
    #[test]
    fn search_dirs_include_path_and_managed() {
        let dirs = search_dirs();
        if std::env::var_os("HOME").is_some() {
            assert!(
                dirs.iter().any(|d| d.ends_with(".car/visual-runtime/bin")),
                "managed bin dir missing from search set"
            );
        }
    }

    /// Plain, pre-release and truncated versions parse; the floor orders
    /// correctly against neighbouring versions.
    #[test]
    fn parse_semver_handles_plain_and_prerelease() {
        assert_eq!(parse_semver("0.3.0"), Some((0, 3, 0)));
        assert_eq!(parse_semver("1.2.10"), Some((1, 2, 10)));
        assert_eq!(parse_semver("0.6.3rc1"), Some((0, 6, 3)));
        assert_eq!(parse_semver("0.2"), Some((0, 2, 0)));
        assert!(parse_semver("not-a-version").is_none());
        // The version floor must order correctly.
        assert!((0, 2, 9) < MIN_VERSION);
        assert!((0, 3, 0) >= MIN_VERSION);
        assert!((0, 6, 3) >= MIN_VERSION);
    }

    /// A `CAR_VLLM_MLX_BIN` that does not point at a file is ignored and
    /// resolution falls through to the search dirs. (A positive case would
    /// need a real binary on disk.)
    #[test]
    fn non_file_env_override_is_ignored() {
        std::env::set_var("CAR_VLLM_MLX_BIN", "/nonexistent/vllm-mlx-xyz");
        let resolved = resolve_existing();
        // Restore the environment *before* asserting, so a failing assertion
        // cannot leak the override into other tests in this process.
        std::env::remove_var("CAR_VLLM_MLX_BIN");
        assert!(
            resolved
                .as_ref()
                .map(|p| !p.ends_with("vllm-mlx-xyz"))
                .unwrap_or(true),
            "non-file override must not be returned"
        );
    }
}