use std::io::Write;
use std::path::PathBuf;
use std::process::Command;
use base64::Engine as _;
use crate::ContentBlock;
use crate::InferenceError;
/// Invocation forms tried by [`locate`], in the order listed.
///
/// The entry `"mlx_vlm"` is launched as an executable of that name; every
/// other entry is launched as `python -m <entry>`.
const MODULE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];
/// Find the first entry of `MODULE_FORMS` that can actually be launched.
///
/// Each candidate is run with `--help`, with stdout and stderr discarded. A
/// candidate is accepted when the process spawns and exits successfully.
/// The `"mlx_vlm"` entry is run as a standalone executable; any other entry
/// is run as `python -m <entry>`.
///
/// Returns the accepted entry, or `None` when no candidate works.
pub fn locate() -> Option<&'static str> {
    MODULE_FORMS.iter().copied().find(|&candidate| {
        let mut probe = match candidate {
            "mlx_vlm" => Command::new("mlx_vlm"),
            module => {
                let mut python = Command::new("python");
                python.arg("-m").arg(module);
                python
            }
        };
        probe
            .arg("--help")
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null());
        // A spawn failure (e.g. binary missing) counts the same as a
        // non-zero exit: the candidate is simply not usable.
        matches!(probe.status(), Ok(status) if status.success())
    })
}
/// Report whether [`locate`] finds a usable `mlx_vlm` invocation.
///
/// This runs the same probe as [`locate`], so it spawns child processes.
pub fn is_available() -> bool {
    let found = locate();
    found.is_some()
}
/// Run one image-grounded generation through the `mlx_vlm` command-line tool.
///
/// Base64 images are decoded and written into a freshly created temporary
/// directory whose guard is held until this function returns; URL images are
/// passed to the CLI as-is. Content blocks that are neither kind are skipped
/// with a warning.
///
/// # Parameters
/// * `hf_repo` - value passed to `--model`.
/// * `prompt` - value passed to `--prompt`.
/// * `images` - content blocks; at least one must be `ImageBase64` or
///   `ImageUrl`.
/// * `temperature` - passed to `--temperature` only when finite and
///   non-negative; otherwise the flag is omitted.
/// * `max_tokens` - value passed to `--max-tokens`.
///
/// # Errors
/// Returns `InferenceError::InferenceFailed` when no image block is present,
/// the CLI cannot be located or spawned, an image cannot be decoded or
/// staged, the process exits unsuccessfully, or the parsed output is empty.
pub fn generate(
    hf_repo: &str,
    prompt: &str,
    images: &[ContentBlock],
    temperature: f64,
    max_tokens: usize,
) -> Result<String, InferenceError> {
    // Check for real image blocks rather than slice length: a non-empty slice
    // holding only non-image blocks would otherwise reach the CLI with no
    // `--image` argument at all.
    let has_image = images.iter().any(|block| {
        matches!(
            block,
            ContentBlock::ImageBase64 { .. } | ContentBlock::ImageUrl { .. }
        )
    });
    if !has_image {
        return Err(InferenceError::InferenceFailed(
            "mlx_vlm CLI route invoked without any image content blocks; \
             callers must check has_images before dispatching here"
                .into(),
        ));
    }

    let form = locate().ok_or_else(|| {
        InferenceError::InferenceFailed(
            "mlx-vlm CLI not found on PATH. Install with `uv tool install mlx-vlm` \
             (or `pip install mlx-vlm`) so CAR can route image inputs to a \
             working local Qwen2.5-VL backend. See issue #115."
                .into(),
        )
    })?;

    // Keep the guard alive for the whole call so staged files still exist
    // while the child process runs.
    let tmp_dir = tempfile::tempdir().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm: failed to create tempdir for image staging: {e}"
        ))
    })?;

    // Arguments are kept as `OsString` so a temp path that is not valid UTF-8
    // reaches the child process unmodified instead of being lossily rewritten.
    let mut image_args: Vec<std::ffi::OsString> = Vec::with_capacity(images.len());
    for (idx, block) in images.iter().enumerate() {
        match block {
            ContentBlock::ImageBase64 { data, media_type } => {
                // Unknown media types fall back to a `.png` file name.
                let ext = match media_type.as_str() {
                    "image/png" => "png",
                    "image/jpeg" | "image/jpg" => "jpg",
                    "image/webp" => "webp",
                    "image/gif" => "gif",
                    _ => "png",
                };
                let bytes = base64::engine::general_purpose::STANDARD
                    .decode(data)
                    .map_err(|e| {
                        InferenceError::InferenceFailed(format!(
                            "mlx_vlm: image #{idx} base64 decode failed: {e}"
                        ))
                    })?;
                let path = tmp_dir.path().join(format!("img_{idx}.{ext}"));
                let mut f = std::fs::File::create(&path).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                f.write_all(&bytes).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: write staged image to {}: {e}",
                        path.display()
                    ))
                })?;
                image_args.push(path.into_os_string());
            }
            ContentBlock::ImageUrl { url, .. } => {
                image_args.push(std::ffi::OsString::from(url.clone()));
            }
            other => {
                tracing::warn!(
                    block = ?other,
                    "mlx_vlm CLI: ignoring non-image content block; only Text + Image* are accepted by mlx_vlm.generate"
                );
            }
        }
    }

    // Mirror the launch form that `locate` validated.
    let mut cmd = if form == "mlx_vlm" {
        Command::new("mlx_vlm")
    } else {
        let mut c = Command::new("python");
        c.args(["-m", form]);
        c
    };
    cmd.arg("--model").arg(hf_repo);
    for image in &image_args {
        cmd.arg("--image").arg(image);
    }
    cmd.arg("--prompt").arg(prompt);
    cmd.arg("--max-tokens").arg(max_tokens.to_string());
    // NaN, infinities and negative values are not forwarded; the CLI then
    // uses whatever default it has.
    if temperature.is_finite() && temperature >= 0.0 {
        cmd.arg("--temperature").arg(temperature.to_string());
    }

    tracing::info!(
        repo = hf_repo,
        images = image_args.len(),
        max_tokens,
        "mlx_vlm CLI: invoking"
    );

    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "mlx_vlm CLI failed to spawn ({form}): {e}. \
             Reinstall with `uv tool install mlx-vlm`."
        ))
    })?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm exited with status {}: {}",
            output.status,
            stderr.trim()
        )));
    }

    let text = parse_output(&String::from_utf8_lossy(&output.stdout));
    if text.trim().is_empty() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(InferenceError::InferenceFailed(format!(
            "mlx_vlm produced empty output. stderr: {}",
            stderr.trim()
        )));
    }
    Ok(text)
}
/// Extract the generated text from the CLI's stdout.
///
/// A "banner" is a non-empty line made up solely of `=` and whitespace. The
/// lines between the first banner and the next one (or end of input, if no
/// second banner appears) form the body, which is returned with trailing
/// whitespace removed.
///
/// Fallbacks when no body lines were collected:
/// * a complete banner pair enclosing nothing means the reply really was
///   empty, so an empty string is returned instead of echoing the banners
///   and statistics back as if they were the answer;
/// * otherwise (no banner, or only an opening banner) the whole of `stdout`
///   is returned, trimmed.
fn parse_output(stdout: &str) -> String {
    let is_banner = |line: &str| {
        let trimmed = line.trim_end();
        !trimmed.is_empty() && trimmed.chars().all(|c| c == '=' || c.is_whitespace())
    };

    let mut opened = false;
    let mut closed = false;
    let mut body: Vec<&str> = Vec::new();
    for line in stdout.lines() {
        if is_banner(line) {
            if opened {
                // Second banner ends the body; anything after it is ignored.
                closed = true;
                break;
            }
            opened = true;
        } else if opened {
            body.push(line);
        }
    }

    if body.is_empty() {
        if closed {
            return String::new();
        }
        return stdout.trim().to_string();
    }
    body.join("\n").trim_end().to_string()
}
/// Compute the Hugging Face hub cache directory for `hf_repo`.
///
/// The hub directory is resolved in this order, skipping variables that are
/// unset or empty:
/// 1. `$HF_HUB_CACHE`
/// 2. `$HF_HOME/hub`
/// 3. `$XDG_CACHE_HOME/huggingface/hub`
/// 4. `$HOME/.cache/huggingface/hub` (a relative `.cache/...` path when
///    `HOME` is unset)
///
/// The repo folder name is `models--` followed by `hf_repo` with every `/`
/// replaced by `--`. The path is only computed; nothing is checked on disk,
/// and `~` inside the variables is not expanded.
pub fn cached_repo_path(hf_repo: &str) -> PathBuf {
    // Read a variable as a path, treating an empty value like an unset one.
    let env_path = |key: &str| {
        std::env::var_os(key)
            .filter(|value| !value.is_empty())
            .map(PathBuf::from)
    };

    let hub_dir = env_path("HF_HUB_CACHE")
        .or_else(|| env_path("HF_HOME").map(|hf_home| hf_home.join("hub")))
        .or_else(|| {
            env_path("XDG_CACHE_HOME").map(|cache| cache.join("huggingface").join("hub"))
        })
        .unwrap_or_else(|| {
            std::env::var_os("HOME")
                .map(PathBuf::from)
                .unwrap_or_default()
                .join(".cache")
                .join("huggingface")
                .join("hub")
        });

    hub_dir.join(format!("models--{}", hf_repo.replace('/', "--")))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Loader chatter before the first banner and statistics after the
    /// second one are both dropped.
    #[test]
    fn parse_output_strips_banners_and_perf_lines() {
        let expected = "The image is a blank canvas with a grid pattern.";
        let raw = "Loading model...\n\
                   ==========\n\
                   The image is a blank canvas with a grid pattern.\n\
                   ==========\n\
                   Prompt: 234 tokens\n\
                   Generation: 12 tokens, 80.123 tokens/s\n\
                   Peak memory: 2.345 GB";
        let parsed = parse_output(raw);
        assert_eq!(parsed, expected);
    }

    /// Without any banner the whole output is returned, trimmed.
    #[test]
    fn parse_output_handles_missing_banners() {
        let raw = "  Some single-line response.  \n";
        let parsed = parse_output(raw);
        assert_eq!(parsed, "Some single-line response.");
    }

    /// Line breaks inside the delimited body are preserved.
    #[test]
    fn parse_output_handles_multiline_body() {
        let expected = "Line one.\nLine two.";
        let raw = "==========\nLine one.\nLine two.\n==========\nPrompt: 1 tokens";
        let parsed = parse_output(raw);
        assert_eq!(parsed, expected);
    }
}