car-inference 0.6.0

Local model inference for CAR — Candle backend with Qwen3 models
Documentation
//! Local VLM image inference via the `mlx_vlm.generate` Python CLI.
//!
//! Background — issue #115. CAR's two existing local Qwen2.5-VL paths both
//! fail on image inputs today:
//!
//! - **Native MLX (`backend::mlx`)** loads only the `language_model.*`
//!   prefix of a VL checkpoint, so it's a text-only tower. Image content
//!   blocks are rejected up-front with a `UnsupportedMode { mode:
//!   "image-content-block", backend: "native-mlx-text", ... }` error.
//! - **vLLM-MLX HTTP (`backend::remote_backend` via `ModelSource::VllmMlx`)**
//!   forwards an OpenAI-compatible multimodal request, which the upstream
//!   `mlx_vlm.server` honors by erroring with `RuntimeError: There is no
//!   Stream(gpu, 0) in current thread`. Server-side bug, not CAR's.
//!
//! Direct invocation of `mlx_vlm.generate` (the same Python tool) works
//! end-to-end for the same checkpoints. So this module shells out to it
//! the way `external_flux` shells out to `mflux-generate`: subprocess at
//! the feature boundary, env probing, errors that name the install
//! command. Apple Silicon only: mlx-vlm itself runs only on targets where
//! `cfg(target_os = "macos", target_arch = "aarch64")` would hold.

use std::io::Write;
use std::path::PathBuf;
use std::process::Command;

use base64::Engine as _;

use crate::ContentBlock;
use crate::InferenceError;

/// Candidate invocation forms for mlx-vlm, listed in probe order.
///
/// mlx-vlm publishes both `mlx_vlm.generate` (Python module) and a
/// `mlx_vlm` console script after `pip install`. The module-style
/// invocation comes first because it is what the upstream README
/// documents; the bare `mlx_vlm` CLI is the fallback.
///
/// `locate` launches the `"mlx_vlm"` entry as a bare executable and every
/// other entry as `python -m <entry>`.
const MODULE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];

/// Find a working way to invoke mlx-vlm.
///
/// Walks `MODULE_FORMS` in order and runs each candidate once with
/// `--help`, discarding stdout and stderr. The `"mlx_vlm"` entry is
/// launched as a bare executable; every other entry is launched as
/// `python -m <entry>`.
///
/// Returns the first form whose probe spawned and exited successfully, so
/// callers can reuse it without probing again, or `None` when no
/// candidate worked (including when the process could not be spawned).
pub fn locate() -> Option<&'static str> {
    MODULE_FORMS.iter().copied().find(|&candidate| {
        let mut probe = match candidate {
            "mlx_vlm" => Command::new("mlx_vlm"),
            module => {
                let mut python = Command::new("python");
                python.args(["-m", module]);
                python
            }
        };
        let status = probe
            .arg("--help")
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status();
        // A spawn failure counts the same as a non-zero exit: not usable.
        matches!(status, Ok(exit) if exit.success())
    })
}

/// Report whether any mlx-vlm invocation form is usable right now.
///
/// Thin wrapper over [`locate`]; it re-runs the probe on every call.
pub fn is_available() -> bool {
    let found = locate();
    found.is_some()
}

/// Map an image MIME type to the file extension used for its staged copy.
///
/// Be conservative: an unrecognized media type falls back to `png`.
/// mlx-vlm sniffs by content, not extension, so this is harmless.
fn staged_image_extension(media_type: &str) -> &'static str {
    match media_type {
        "image/png" => "png",
        "image/jpeg" | "image/jpg" => "jpg",
        "image/webp" => "webp",
        "image/gif" => "gif",
        _ => "png",
    }
}

/// Run a single VLM image inference through the mlx-vlm CLI.
///
/// Base64-embedded images are decoded and written to files inside a
/// temporary directory (mlx-vlm only accepts paths or URLs, not data
/// URIs); URL blocks are forwarded unchanged. Each staged path or URL
/// becomes one `--image` argument, in the order the blocks appear.
///
/// # Arguments
///
/// * `hf_repo` - value passed to `--model`.
/// * `prompt` - value passed to `--prompt`.
/// * `images` - content blocks; only `ImageBase64` and `ImageUrl` are
///   used, every other block is logged with `tracing::warn!` and skipped.
/// * `temperature` - passed as `--temperature` only when it is finite and
///   non-negative; otherwise the flag is omitted.
/// * `max_tokens` - value passed to `--max-tokens`.
///
/// # Errors
///
/// Returns `InferenceError::InferenceFailed` when:
/// * `images` contains no image block at all (empty, or only non-image
///   blocks),
/// * no mlx-vlm invocation form can be located,
/// * the temp dir cannot be created, or an image fails to decode or write,
/// * the CLI cannot be spawned or exits unsuccessfully,
/// * the CLI succeeds but the parsed reply is empty.
///
/// Returns the model's full text reply on success.
pub fn generate(
    hf_repo: &str,
    prompt: &str,
    images: &[ContentBlock],
    temperature: f64,
    max_tokens: usize,
) -> Result<String, InferenceError> {
    // Reject both an empty slice and a slice holding only non-image
    // blocks. Without the second case every block would be warn-skipped
    // below and the CLI would be spawned with no `--image` argument.
    let has_image = images.iter().any(|block| {
        matches!(
            block,
            ContentBlock::ImageBase64 { .. } | ContentBlock::ImageUrl { .. }
        )
    });
    if !has_image {
        return Err(InferenceError::InferenceFailed(
            "mlx_vlm CLI route invoked without any image content blocks; \
             callers must check has_images before dispatching here"
                .into(),
        ));
    }

    let form = locate().ok_or_else(|| {
        InferenceError::InferenceFailed(
            "mlx-vlm CLI not found on PATH. Install with `uv tool install mlx-vlm` \
             (or `pip install mlx-vlm`) so CAR can route image inputs to a \
             working local Qwen2.5-VL backend. See issue #115."
                .into(),
        )
    })?;

    // mlx-vlm wants paths on disk. Write any base64-embedded images to a
    // temp dir; URL-form blocks pass straight through. `tmp_dir` must stay
    // alive until the subprocess has finished, so it is bound for the
    // whole function and cleaned up when it goes out of scope.
    let tmp_dir = tempfile::tempdir().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm: failed to create tempdir for image staging: {e}"
        ))
    })?;
    let mut image_args: Vec<String> = Vec::with_capacity(images.len());
    for (idx, block) in images.iter().enumerate() {
        match block {
            ContentBlock::ImageBase64 { data, media_type } => {
                let ext = staged_image_extension(media_type.as_str());
                let bytes = base64::engine::general_purpose::STANDARD
                    .decode(data)
                    .map_err(|e| {
                        InferenceError::InferenceFailed(format!(
                            "mlx_vlm: image #{idx} base64 decode failed: {e}"
                        ))
                    })?;
                let path = tmp_dir.path().join(format!("img_{idx}.{ext}"));
                let mut f = std::fs::File::create(&path).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                f.write_all(&bytes).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                image_args.push(path.to_string_lossy().into_owned());
            }
            ContentBlock::ImageUrl { url, .. } => {
                image_args.push(url.clone());
            }
            // Text blocks belong in the prompt, not in --image. Skip
            // them — the caller is responsible for prompt assembly.
            // Video/audio blocks are not understood by mlx_vlm.generate;
            // warn-and-skip keeps the invocation limited to the image
            // blocks that were actually supplied.
            other => {
                tracing::warn!(
                    block = ?other,
                    "mlx_vlm CLI: ignoring non-image content block; only Text + Image* are accepted by mlx_vlm.generate"
                );
            }
        }
    }

    // Rebuild the command the same way `locate` probed it.
    let mut cmd = if form == "mlx_vlm" {
        Command::new("mlx_vlm")
    } else {
        let mut c = Command::new("python");
        c.args(["-m", form]);
        c
    };
    cmd.arg("--model").arg(hf_repo);
    for path in &image_args {
        cmd.arg("--image").arg(path);
    }
    cmd.arg("--prompt").arg(prompt);
    cmd.arg("--max-tokens").arg(max_tokens.to_string());
    // NaN, infinities and negative values are not forwarded; the CLI then
    // uses whatever default it has.
    if temperature.is_finite() && temperature >= 0.0 {
        cmd.arg("--temperature").arg(temperature.to_string());
    }

    tracing::info!(
        repo = hf_repo,
        images = image_args.len(),
        max_tokens,
        "mlx_vlm CLI: invoking"
    );

    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm CLI failed to spawn ({form}): {e}. \
             Reinstall with `uv tool install mlx-vlm`."
        ))
    })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        // `ExitStatus` already renders as e.g. "exit status: 1", so the
        // message must not prepend another "status".
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm exited unsuccessfully ({}): {}",
            output.status,
            stderr.trim()
        )));
    }

    let text = parse_output(&String::from_utf8_lossy(&output.stdout));
    if text.trim().is_empty() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm produced empty output. stderr: {}",
            stderr.trim()
        )));
    }
    Ok(text)
}

/// Strip mlx-vlm's framing from captured stdout.
///
/// A run prints a banner line such as `==========`, then the generated
/// text, then a second banner followed by performance lines
/// (`Prompt: N tokens`, `Peak memory: X.YYY GB`, ...). Only the text
/// between the first and second banner is kept, with trailing whitespace
/// removed. If the second banner is missing, everything after the first
/// banner is kept.
///
/// When no banner appears at all (older mlx-vlm versions), the whole
/// stdout is returned, trimmed. When a banner is present but nothing lies
/// between the banners, the result is an empty string, so the caller can
/// detect an empty generation instead of receiving banners and
/// performance lines as if they were the reply.
fn parse_output(stdout: &str) -> String {
    /// A banner is a non-blank line made only of `=` and whitespace.
    fn is_banner(line: &str) -> bool {
        let trimmed = line.trim_end();
        !trimmed.is_empty() && trimmed.chars().all(|c| c == '=' || c.is_whitespace())
    }

    let mut lines = stdout.lines();

    // Consume everything up to and including the opening banner. If the
    // iterator runs dry first, this output has no framing at all.
    let saw_banner = lines.by_ref().any(is_banner);
    if !saw_banner {
        return stdout.trim().to_string();
    }

    // Collect until the closing banner; everything after it is mlx-vlm's
    // performance summary.
    let body: Vec<&str> = lines.take_while(|line| !is_banner(line)).collect();
    body.join("\n").trim_end().to_string()
}

/// Compute where a previously-pulled MLX VL repo is expected on disk.
///
/// Used by the availability probe so `car models list --capability vision`
/// can distinguish "registered" from "weights actually present".
///
/// The result is `$HOME/.cache/huggingface/hub/models--<repo>`, where
/// `<repo>` is `hf_repo` with every `/` replaced by `--`. If `HOME` is
/// unset the path is built from an empty base and is therefore relative.
pub fn cached_repo_path(hf_repo: &str) -> PathBuf {
    // This mirrors huggingface_hub's standard cache layout. HF_HUB_CACHE is
    // deliberately not consulted, on the premise that `car models pull`
    // writes into this same location via `huggingface_hub.snapshot_download`.
    // NOTE(review): confirm the pull path cannot be relocated by
    // HF_HOME / HF_HUB_CACHE, otherwise this probe would miss the weights.
    let repo_dir = format!("models--{}", hf_repo.replace('/', "--"));
    let mut path = match std::env::var_os("HOME") {
        Some(home) => PathBuf::from(home),
        None => PathBuf::new(),
    };
    path.extend([".cache", "huggingface", "hub", repo_dir.as_str()]);
    path
}

#[cfg(test)]
mod tests {
    use super::parse_output;

    /// Banner line used by the fixtures below.
    const BANNER: &str = "==========";

    /// Loader noise before the first banner and perf lines after the
    /// second one must both be dropped.
    #[test]
    fn parse_output_strips_banners_and_perf_lines() {
        let raw = [
            "Loading model...",
            BANNER,
            "The image is a blank canvas with a grid pattern.",
            BANNER,
            "Prompt: 234 tokens",
            "Generation: 12 tokens, 80.123 tokens/s",
            "Peak memory: 2.345 GB",
        ]
        .join("\n");
        let expected = "The image is a blank canvas with a grid pattern.";
        assert_eq!(parse_output(&raw), expected);
    }

    /// Without any banner the whole stdout is returned, trimmed.
    #[test]
    fn parse_output_handles_missing_banners() {
        let parsed = parse_output("  Some single-line response.  \n");
        assert_eq!(parsed, "Some single-line response.");
    }

    /// Line breaks inside the body survive extraction.
    #[test]
    fn parse_output_handles_multiline_body() {
        let raw = [BANNER, "Line one.", "Line two.", BANNER, "Prompt: 1 tokens"].join("\n");
        assert_eq!(parse_output(&raw), "Line one.\nLine two.");
    }
}