car-inference 0.14.0

Local model inference for CAR — Candle backend with Qwen3 models
//! Local VLM image inference via the `mlx_vlm.generate` Python CLI.
//!
//! Background — issue #115. CAR's two existing local Qwen2.5-VL paths both
//! fail on image inputs today:
//!
//! - **Native MLX (`backend::mlx`)** loads only the `language_model.*`
//!   prefix of a VL checkpoint, so it's a text-only tower. Image content
//!   blocks are rejected up-front with a `UnsupportedMode { mode:
//!   "image-content-block", backend: "native-mlx-text", ... }` error.
//! - **vLLM-MLX HTTP (`backend::remote_backend` via `ModelSource::VllmMlx`)**
//!   forwards an OpenAI-compatible multimodal request, which the upstream
//!   `mlx_vlm.server` honors by erroring with `RuntimeError: There is no
//!   Stream(gpu, 0) in current thread`. Server-side bug, not CAR's.
//!
//! Direct invocation of `mlx_vlm.generate` (the same Python package, run
//! directly as a CLI) works end-to-end for the same checkpoints. So this
//! module shells out to it the way `external_flux` shells out to
//! `mflux-generate`: a subprocess at the feature boundary, environment
//! probing, and errors that name the install command. Apple Silicon only
//! (the mlx-vlm path is gated behind `cfg(all(target_os = "macos",
//! target_arch = "aarch64"))`).

use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::Command;

use base64::Engine as _;

use crate::ContentBlock;
use crate::InferenceError;

const EXECUTABLE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];
const MODULE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];

#[derive(Debug, Clone)]
pub struct CliInvocation {
    program: PathBuf,
    args: Vec<String>,
    label: String,
    python: Option<PathBuf>,
}

#[derive(Debug, Clone)]
pub enum RuntimeStatus {
    Available(CliInvocation),
    MissingCli {
        searched: Vec<PathBuf>,
    },
    MissingDeps {
        invocation: CliInvocation,
        detail: String,
    },
}

impl RuntimeStatus {
    pub fn is_available(&self) -> bool {
        matches!(self, RuntimeStatus::Available(_))
    }

    pub fn user_message(&self) -> String {
        match self {
            RuntimeStatus::Available(invocation) => {
                format!("mlx-vlm CLI available at {}", invocation.label)
            }
            RuntimeStatus::MissingCli { searched } => {
                let searched = searched
                    .iter()
                    .map(|path| path.display().to_string())
                    .collect::<Vec<_>>()
                    .join(", ");
                format!(
                    "mlx-vlm CLI not found. Install with `uv tool install mlx-vlm` \
                     (or `pip install mlx-vlm`). CAR searched PATH plus common \
                     tool locations: {searched}"
                )
            }
            RuntimeStatus::MissingDeps { invocation, detail } => {
                format!(
                    "mlx-vlm CLI found at {}, but its Python environment is missing \
                     runtime dependencies required by Qwen3-VL processors: {detail}. \
                     Install them with `uv pip install --python {} torch torchvision`.",
                    invocation.label,
                    invocation
                        .python
                        .as_ref()
                        .map(|path| path.display().to_string())
                        .unwrap_or_else(|| "<mlx-vlm python>".to_string())
                )
            }
        }
    }
}

impl CliInvocation {
    fn command(&self) -> Command {
        let mut cmd = Command::new(&self.program);
        cmd.args(&self.args);
        cmd
    }
}

pub fn runtime_status() -> RuntimeStatus {
    let mut searched = Vec::new();
    let Some(invocation) = locate_invocation(&mut searched) else {
        return RuntimeStatus::MissingCli { searched };
    };
    if let Some(detail) = missing_processor_deps(&invocation) {
        RuntimeStatus::MissingDeps { invocation, detail }
    } else {
        RuntimeStatus::Available(invocation)
    }
}

/// Probe whether mlx-vlm is reachable. Cheap (`--help` invocations plus a
/// dependency import check). Returns the invocation that worked so callers can
/// reuse it without re-probing.
pub fn locate() -> Option<CliInvocation> {
    match runtime_status() {
        RuntimeStatus::Available(invocation) => Some(invocation),
        _ => None,
    }
}

pub fn is_available() -> bool {
    runtime_status().is_available()
}

/// Run a single VLM image inference. Writes any base64-embedded images
/// to temp files (mlx-vlm only accepts paths or URLs, not data URIs)
/// and invokes the CLI with the resulting `--image` arguments.
///
/// Returns the model's full text reply on success.
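///
/// A hedged call sketch; the repo id and parameter values below are
/// illustrative, not defaults:
///
/// ```ignore
/// // `image_blocks` is assumed to already hold at least one image
/// // ContentBlock; plain text belongs in `prompt`, not in the slice.
/// let reply = generate(
///     "mlx-community/Qwen2.5-VL-3B-Instruct-4bit", // hypothetical repo id
///     "Describe the image in one sentence.",
///     &image_blocks,
///     0.0, // temperature (greedy)
///     256, // max_tokens
/// )?;
/// ```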
pub fn generate(
    hf_repo: &str,
    prompt: &str,
    images: &[ContentBlock],
    temperature: f64,
    max_tokens: usize,
) -> Result<String, InferenceError> {
    if images.is_empty() {
        return Err(InferenceError::InferenceFailed(
            "mlx_vlm CLI route invoked without any image content blocks; \
             callers must check has_images before dispatching here"
                .into(),
        ));
    }

    let invocation = match runtime_status() {
        RuntimeStatus::Available(invocation) => invocation,
        status => return Err(InferenceError::InferenceFailed(status.user_message())),
    };

    // mlx-vlm wants paths on disk. Write any base64-embedded images to a
    // temp dir; URL-form blocks pass straight through. Lifetime of the
    // temp dir matches this function (RAII drop on return).
    let tmp_dir = tempfile::tempdir().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm: failed to create tempdir for image staging: {e}"
        ))
    })?;
    let mut image_args: Vec<String> = Vec::with_capacity(images.len());
    for (idx, block) in images.iter().enumerate() {
        match block {
            ContentBlock::ImageBase64 { data, media_type } => {
                let ext = match media_type.as_str() {
                    "image/png" => "png",
                    "image/jpeg" | "image/jpg" => "jpg",
                    "image/webp" => "webp",
                    "image/gif" => "gif",
                    // Be conservative: for an unrecognized media type,
                    // write the bytes with a `.png` extension anyway.
                    // mlx-vlm sniffs image format by content, not by
                    // extension, so the mismatched suffix is harmless.
                    _ => "png",
                };
                let bytes = base64::engine::general_purpose::STANDARD
                    .decode(data)
                    .map_err(|e| {
                        InferenceError::InferenceFailed(format!(
                            "mlx_vlm: image #{idx} base64 decode failed: {e}"
                        ))
                    })?;
                let path = tmp_dir.path().join(format!("img_{idx}.{ext}"));
                let mut f = std::fs::File::create(&path).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                f.write_all(&bytes).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                image_args.push(path.to_string_lossy().into_owned());
            }
            ContentBlock::ImageUrl { url, .. } => {
                image_args.push(url.clone());
            }
            // Anything else does not map onto `--image`. Text belongs in
            // the prompt (the caller assembles it before dispatching
            // here), and video/audio blocks are not understood by
            // mlx_vlm.generate; the dispatch in lib.rs already rejects
            // those against this backend before we get here. Warning and
            // skipping keeps the CLI invocation coherent if that dispatch
            // ever drifts.
            other => {
                tracing::warn!(
                    block = ?other,
                    "mlx_vlm CLI: ignoring non-image content block; only the prompt text and --image paths/URLs are forwarded to mlx_vlm.generate"
                );
            }
        }
    }

    let mut cmd = invocation.command();
    cmd.arg("--model").arg(hf_repo);
    for path in &image_args {
        cmd.arg("--image").arg(path);
    }
    cmd.arg("--prompt").arg(prompt);
    cmd.arg("--max-tokens").arg(max_tokens.to_string());
    if temperature.is_finite() && temperature >= 0.0 {
        cmd.arg("--temperature").arg(format!("{temperature}"));
    }

    tracing::info!(
        repo = hf_repo,
        images = image_args.len(),
        max_tokens,
        "mlx_vlm CLI: invoking"
    );

    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm CLI failed to spawn ({}): {e}. \
             Reinstall with `uv tool install mlx-vlm`.",
            invocation.label
        ))
    })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        if let Some(detail) = classify_missing_deps(&stderr) {
            return Err(InferenceError::InferenceFailed(format!(
                "mlx_vlm found at {} but missing runtime dependencies: {detail}",
                invocation.label
            )));
        }
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm exited with status {}: {}",
            output.status,
            stderr.trim()
        )));
    }

    let text = parse_output(&String::from_utf8_lossy(&output.stdout));
    if text.trim().is_empty() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm produced empty output. stderr: {}",
            stderr.trim()
        )));
    }
    Ok(text)
}

fn locate_invocation(searched: &mut Vec<PathBuf>) -> Option<CliInvocation> {
    for exe in EXECUTABLE_FORMS {
        if let Some(path) = find_executable(exe, searched) {
            let invocation = CliInvocation {
                program: path.clone(),
                args: Vec::new(),
                label: path.display().to_string(),
                python: python_from_shebang(&path),
            };
            if help_succeeds(&invocation) {
                return Some(invocation);
            }
        }
    }

    for python in python_candidates(searched) {
        for module in MODULE_FORMS {
            let invocation = CliInvocation {
                program: python.clone(),
                args: vec!["-m".to_string(), (*module).to_string()],
                label: format!("{} -m {module}", python.display()),
                python: Some(python.clone()),
            };
            if help_succeeds(&invocation) {
                return Some(invocation);
            }
        }
    }
    None
}

fn help_succeeds(invocation: &CliInvocation) -> bool {
    let mut cmd = invocation.command();
    cmd.arg("--help")
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status()
        .map(|s| s.success())
        .unwrap_or(false)
}

fn find_executable(name: &str, searched: &mut Vec<PathBuf>) -> Option<PathBuf> {
    for dir in executable_search_dirs() {
        let path = dir.join(name);
        searched.push(path.clone());
        if path.exists() && path.is_file() {
            return Some(path);
        }
    }
    None
}

fn executable_search_dirs() -> Vec<PathBuf> {
    let mut dirs: Vec<PathBuf> = std::env::var_os("PATH")
        .map(|paths| std::env::split_paths(&paths).collect())
        .unwrap_or_default();
    if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
        dirs.push(home.join(".local").join("bin"));
        dirs.push(
            home.join(".local")
                .join("share")
                .join("uv")
                .join("tools")
                .join("mlx-vlm")
                .join("bin"),
        );
        dirs.push(home.join(".car").join("visual-runtime").join("bin"));
    }
    dedupe_paths(dirs)
}

fn python_candidates(searched: &mut Vec<PathBuf>) -> Vec<PathBuf> {
    let mut candidates = Vec::new();
    if let Ok(path) = std::env::var("CAR_MLX_VLM_PYTHON") {
        candidates.push(PathBuf::from(path));
    }
    if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
        candidates.push(
            home.join(".local")
                .join("share")
                .join("uv")
                .join("tools")
                .join("mlx-vlm")
                .join("bin")
                .join("python"),
        );
        candidates.push(
            home.join(".car")
                .join("visual-runtime")
                .join("bin")
                .join("python"),
        );
    }
    candidates.extend(
        ["python3", "python"]
            .iter()
            .filter_map(|name| find_executable(name, searched)),
    );
    dedupe_paths(candidates)
        .into_iter()
        .filter(|path| {
            searched.push(path.clone());
            path.exists() || path.components().count() == 1
        })
        .collect()
}

fn dedupe_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
    let mut out = Vec::new();
    for path in paths {
        if !out.contains(&path) {
            out.push(path);
        }
    }
    out
}

fn python_from_shebang(path: &Path) -> Option<PathBuf> {
    let bytes = std::fs::read(path).ok()?;
    let first_line = bytes.split(|byte| *byte == b'\n').next()?;
    let line = std::str::from_utf8(first_line).ok()?.trim();
    let shebang = line.strip_prefix("#!")?.trim();
    if shebang.contains("python") {
        Some(PathBuf::from(shebang.split_whitespace().next()?))
    } else {
        None
    }
}

fn missing_processor_deps(invocation: &CliInvocation) -> Option<String> {
    let python = invocation.python.as_ref()?;
    let output = Command::new(python)
        .args(["-c", "import torch, torchvision"])
        .output()
        .ok()?;
    if output.status.success() {
        return None;
    }
    classify_missing_deps(&String::from_utf8_lossy(&output.stderr))
        .or_else(|| Some(String::from_utf8_lossy(&output.stderr).trim().to_string()))
}

fn classify_missing_deps(stderr: &str) -> Option<String> {
    let lower = stderr.to_ascii_lowercase();
    let mut missing = Vec::new();
    if lower.contains("no module named 'torch'")
        || lower.contains("no module named torch")
        || lower.contains("pytorch library but it was not found")
    {
        missing.push("torch");
    }
    if lower.contains("no module named 'torchvision'")
        || lower.contains("no module named torchvision")
        || lower.contains("torchvision library but it was not found")
    {
        missing.push("torchvision");
    }
    if missing.is_empty() {
        None
    } else {
        Some(missing.join(", "))
    }
}

/// Strip mlx-vlm's framing. Every successful run prints an opening banner
/// of `=` characters (e.g. `==========`), then the generated text, then a
/// closing banner, then performance lines such as `Prompt: N tokens`,
/// `Peak memory: X.YYY GB`, etc. Keep only the content between the two
/// banners.
fn parse_output(stdout: &str) -> String {
    let mut in_body = false;
    let mut body: Vec<&str> = Vec::new();
    for line in stdout.lines() {
        let trimmed = line.trim_end();
        // Treat a line of '=' (with optional whitespace) as a banner.
        if !trimmed.is_empty() && trimmed.chars().all(|c| c == '=' || c.is_whitespace()) {
            if !in_body {
                in_body = true;
            } else {
                // Closing banner: stop accumulating; everything after is
                // mlx-vlm's perf summary.
                break;
            }
            continue;
        }
        if in_body {
            body.push(line);
        }
    }
    if body.is_empty() {
        // Older mlx-vlm versions don't print banners — return raw
        // stdout, trimmed.
        return stdout.trim().to_string();
    }
    body.join("\n").trim_end().to_string()
}

/// Where on disk to expect a previously-pulled MLX VL repo. Used by the
/// availability probe so `car models list --capability vision` can
/// distinguish "registered" from "weights actually present".
pub fn cached_repo_path(hf_repo: &str) -> PathBuf {
    // mlx-vlm uses huggingface_hub's standard cache layout. We don't
    // need to honor HF_HUB_CACHE here — `car models pull` will write
    // into the same cache via `huggingface_hub.snapshot_download`.
    let home = std::env::var_os("HOME")
        .map(PathBuf::from)
        .unwrap_or_default();
    home.join(".cache")
        .join("huggingface")
        .join("hub")
        .join(format!("models--{}", hf_repo.replace('/', "--")))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_output_strips_banners_and_perf_lines() {
        let raw = "Loading model...\n\
                   ==========\n\
                   The image is a blank canvas with a grid pattern.\n\
                   ==========\n\
                   Prompt: 234 tokens\n\
                   Generation: 12 tokens, 80.123 tokens/s\n\
                   Peak memory: 2.345 GB";
        assert_eq!(
            parse_output(raw),
            "The image is a blank canvas with a grid pattern."
        );
    }

    #[test]
    fn parse_output_handles_missing_banners() {
        let raw = "  Some single-line response.  \n";
        assert_eq!(parse_output(raw), "Some single-line response.");
    }

    #[test]
    fn parse_output_handles_multiline_body() {
        let raw = "==========\nLine one.\nLine two.\n==========\nPrompt: 1 tokens";
        assert_eq!(parse_output(raw), "Line one.\nLine two.");
    }

    #[test]
    fn classify_missing_deps_distinguishes_processor_imports() {
        let stderr = "ImportError: Qwen3VLVideoProcessor requires the Torchvision library but it was not found in your environment.\n\
                      ModuleNotFoundError: No module named 'torch'";
        assert_eq!(
            classify_missing_deps(stderr),
            Some("torch, torchvision".to_string())
        );
    }
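
    // Hedged sketch of the shebang-probing contract: the first `#!` line of
    // a console script should yield the interpreter path that
    // `missing_processor_deps` later probes. Uses a temp file so it does
    // not depend on a real mlx-vlm install; the script name is illustrative.
    #[test]
    fn python_from_shebang_reads_interpreter_path() {
        let dir = tempfile::tempdir().expect("tempdir");
        let script = dir.path().join("mlx_vlm.generate");
        std::fs::write(&script, "#!/opt/venv/bin/python3\n# console script body\n")
            .expect("write script");
        assert_eq!(
            python_from_shebang(&script),
            Some(PathBuf::from("/opt/venv/bin/python3"))
        );
    }

    // Unrelated runtime errors should not be misread as missing torch deps.
    #[test]
    fn classify_missing_deps_ignores_unrelated_errors() {
        assert_eq!(classify_missing_deps("ValueError: bad image input"), None);
    }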
}