//! Locate and drive the `mlx-vlm` command-line runtime for image-conditioned
//! generation: find a working `mlx_vlm.generate` entry point (console script
//! or `python -m`), verify its Python dependencies, stage images to disk, and
//! parse the CLI's banner-delimited output.

use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::Command;

use base64::Engine as _;

use crate::ContentBlock;
use crate::InferenceError;
/// Console-script names searched on PATH and in common tool directories.
const EXECUTABLE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];
/// Module paths tried as `python -m <module>` when no script is found.
const MODULE_FORMS: &[&str] = &["mlx_vlm.generate", "mlx_vlm"];
/// A concrete way to run the mlx-vlm CLI: a program plus fixed leading args,
/// with the backing Python interpreter when it could be determined.
#[derive(Debug, Clone)]
pub struct CliInvocation {
    program: PathBuf,
    args: Vec<String>,
    /// Human-readable form (a path, or `<python> -m <module>`).
    label: String,
    python: Option<PathBuf>,
}
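/// Outcome of probing the local machine for a usable mlx-vlm runtime.
///
/// A dispatch sketch for callers (`ignore`-fenced: the error handling shown
/// is illustrative, not part of this module):
///
/// ```ignore
/// match runtime_status() {
///     RuntimeStatus::Available(_) => { /* safe to call generate(...) */ }
///     status => eprintln!("{}", status.user_message()),
/// }
/// ```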
#[derive(Debug, Clone)]
pub enum RuntimeStatus {
    /// A working invocation that answered `--help` and passed the deps probe.
    Available(CliInvocation),
    /// No CLI found; `searched` records every path that was checked.
    MissingCli {
        searched: Vec<PathBuf>,
    },
    /// A CLI exists, but its Python environment fails the dependency probe.
    MissingDeps {
        invocation: CliInvocation,
        detail: String,
    },
}
impl RuntimeStatus {
    /// True only for [`RuntimeStatus::Available`].
    pub fn is_available(&self) -> bool {
matches!(self, RuntimeStatus::Available(_))
}
    /// Human-readable status line, including install/fix hints on failure.
    pub fn user_message(&self) -> String {
match self {
RuntimeStatus::Available(invocation) => {
format!("mlx-vlm CLI available at {}", invocation.label)
}
RuntimeStatus::MissingCli { searched } => {
let searched = searched
.iter()
.map(|path| path.display().to_string())
.collect::<Vec<_>>()
.join(", ");
format!(
"mlx-vlm CLI not found. Install with `uv tool install mlx-vlm` \
(or `pip install mlx-vlm`). CAR searched PATH plus common \
tool locations: {searched}"
)
}
RuntimeStatus::MissingDeps { invocation, detail } => {
format!(
"mlx-vlm CLI found at {}, but its Python environment is missing \
runtime dependencies required by Qwen3-VL processors: {detail}. \
Install them with `uv pip install --python {} torch torchvision`.",
invocation.label,
invocation
.python
.as_ref()
.map(|path| path.display().to_string())
.unwrap_or_else(|| "<mlx-vlm python>".to_string())
)
}
}
}
}
impl CliInvocation {
    /// Fresh `Command` for this invocation with its fixed leading args set.
    fn command(&self) -> Command {
let mut cmd = Command::new(&self.program);
cmd.args(&self.args);
cmd
}
}
/// Probe for the mlx-vlm CLI and classify the result for callers.
pub fn runtime_status() -> RuntimeStatus {
let mut searched = Vec::new();
let Some(invocation) = locate_invocation(&mut searched) else {
return RuntimeStatus::MissingCli { searched };
};
if let Some(detail) = missing_processor_deps(&invocation) {
RuntimeStatus::MissingDeps { invocation, detail }
} else {
RuntimeStatus::Available(invocation)
}
}
/// `Some` only when the runtime is fully usable; failure details are dropped.
pub fn locate() -> Option<CliInvocation> {
match runtime_status() {
RuntimeStatus::Available(invocation) => Some(invocation),
_ => None,
}
}
/// Whether a fully usable mlx-vlm runtime is present right now. Note this
/// spawns `--help` and possibly a Python import probe, so it is not free.
pub fn is_available() -> bool {
runtime_status().is_available()
}
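/// Run one image-conditioned generation through the mlx-vlm CLI.
///
/// Base64 blocks are decoded and staged in a temp directory; URL blocks are
/// forwarded as-is; other block types are skipped with a warning. Banners and
/// perf chatter are stripped from stdout before returning. A usage sketch
/// (`ignore`-fenced: the repo id and `blocks` are illustrative):
///
/// ```ignore
/// // `blocks` holds ContentBlock::ImageBase64 / ImageUrl entries.
/// let text = generate("org/vision-model", "Describe the image.", &blocks, 0.0, 256)?;
/// ```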
pub fn generate(
hf_repo: &str,
prompt: &str,
images: &[ContentBlock],
temperature: f64,
max_tokens: usize,
) -> Result<String, InferenceError> {
if images.is_empty() {
return Err(InferenceError::InferenceFailed(
"mlx_vlm CLI route invoked without any image content blocks; \
callers must check has_images before dispatching here"
.into(),
));
}
let invocation = match runtime_status() {
RuntimeStatus::Available(invocation) => invocation,
status => return Err(InferenceError::InferenceFailed(status.user_message())),
};
let tmp_dir = tempfile::tempdir().map_err(|e| {
InferenceError::InferenceFailed(format!(
"mlx_vlm: failed to create tempdir for image staging: {e}"
))
})?;
let mut image_args: Vec<String> = Vec::with_capacity(images.len());
for (idx, block) in images.iter().enumerate() {
match block {
ContentBlock::ImageBase64 { data, media_type } => {
let ext = match media_type.as_str() {
"image/png" => "png",
"image/jpeg" | "image/jpg" => "jpg",
"image/webp" => "webp",
"image/gif" => "gif",
_ => "png",
};
let bytes = base64::engine::general_purpose::STANDARD
.decode(data)
.map_err(|e| {
InferenceError::InferenceFailed(format!(
"mlx_vlm: image #{idx} base64 decode failed: {e}"
))
})?;
let path = tmp_dir.path().join(format!("img_{idx}.{ext}"));
                let mut f = std::fs::File::create(&path).map_err(|e| {
                    InferenceError::InferenceFailed(format!(
                        "mlx_vlm: create staged image {}: {e}",
                        path.display()
                    ))
                })?;
f.write_all(&bytes).map_err(|e| {
InferenceError::InferenceFailed(format!(
"mlx_vlm: write staged image to {}: {e}",
path.display()
))
})?;
image_args.push(path.to_string_lossy().into_owned());
}
ContentBlock::ImageUrl { url, .. } => {
image_args.push(url.clone());
}
other => {
tracing::warn!(
block = ?other,
"mlx_vlm CLI: ignoring non-image content block; only Text + Image* are accepted by mlx_vlm.generate"
);
}
}
}
let mut cmd = invocation.command();
cmd.arg("--model").arg(hf_repo);
for path in &image_args {
cmd.arg("--image").arg(path);
}
cmd.arg("--prompt").arg(prompt);
cmd.arg("--max-tokens").arg(max_tokens.to_string());
    // Forward only sane values; otherwise fall back to the CLI default.
    if temperature.is_finite() && temperature >= 0.0 {
cmd.arg("--temperature").arg(format!("{temperature}"));
}
tracing::info!(
repo = hf_repo,
images = image_args.len(),
max_tokens,
"mlx_vlm CLI: invoking"
);
let output = cmd.output().map_err(|e| {
InferenceError::InferenceFailed(format!(
"mlx_vlm CLI failed to spawn ({}): {e}. \
Reinstall with `uv tool install mlx-vlm`.",
invocation.label
))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
if let Some(detail) = classify_missing_deps(&stderr) {
return Err(InferenceError::InferenceFailed(format!(
"mlx_vlm found at {} but missing runtime dependencies: {detail}",
invocation.label
)));
}
return Err(InferenceError::InferenceFailed(format!(
"mlx_vlm exited with status {}: {}",
output.status,
stderr.trim()
)));
}
let text = parse_output(&String::from_utf8_lossy(&output.stdout));
if text.trim().is_empty() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(InferenceError::InferenceFailed(format!(
"mlx_vlm produced empty output. stderr: {}",
stderr.trim()
)));
}
Ok(text)
}
/// Try console-script executables first, then `python -m` module forms.
fn locate_invocation(searched: &mut Vec<PathBuf>) -> Option<CliInvocation> {
    for exe in EXECUTABLE_FORMS {
        if let Some(path) = find_executable(exe, searched) {
            let invocation = CliInvocation {
                program: path.clone(),
                args: Vec::new(),
                label: path.display().to_string(),
                python: python_from_shebang(&path),
            };
            if help_succeeds(&invocation) {
                return Some(invocation);
            }
        }
    }
for python in python_candidates(searched) {
for module in MODULE_FORMS {
let invocation = CliInvocation {
program: python.clone(),
args: vec!["-m".to_string(), (*module).to_string()],
label: format!("{} -m {module}", python.display()),
python: Some(python.clone()),
};
if help_succeeds(&invocation) {
return Some(invocation);
}
}
}
None
}
/// Smoke test: the invocation counts as working if `--help` exits zero.
fn help_succeeds(invocation: &CliInvocation) -> bool {
let mut cmd = invocation.command();
cmd.arg("--help")
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::null())
.status()
.map(|s| s.success())
.unwrap_or(false)
}
/// Look for `name` in each search dir, recording every candidate path tried.
fn find_executable(name: &str, searched: &mut Vec<PathBuf>) -> Option<PathBuf> {
for dir in executable_search_dirs() {
let path = dir.join(name);
searched.push(path.clone());
if path.exists() && path.is_file() {
return Some(path);
}
}
None
}
/// PATH entries plus uv tool and CAR-managed locations where the CLI may live.
fn executable_search_dirs() -> Vec<PathBuf> {
let mut dirs: Vec<PathBuf> = std::env::var_os("PATH")
.map(|paths| std::env::split_paths(&paths).collect())
.unwrap_or_default();
if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
dirs.push(home.join(".local").join("bin"));
dirs.push(
home.join(".local")
.join("share")
.join("uv")
.join("tools")
.join("mlx-vlm")
.join("bin"),
);
dirs.push(home.join(".car").join("visual-runtime").join("bin"));
}
dedupe_paths(dirs)
}
/// Candidate Python interpreters, most specific first; the
/// `CAR_MLX_VLM_PYTHON` env var takes priority when set.
fn python_candidates(searched: &mut Vec<PathBuf>) -> Vec<PathBuf> {
let mut candidates = Vec::new();
if let Ok(path) = std::env::var("CAR_MLX_VLM_PYTHON") {
candidates.push(PathBuf::from(path));
}
if let Some(home) = std::env::var_os("HOME").map(PathBuf::from) {
candidates.push(
home.join(".local")
.join("share")
.join("uv")
.join("tools")
.join("mlx-vlm")
.join("bin")
.join("python"),
);
candidates.push(
home.join(".car")
.join("visual-runtime")
.join("bin")
.join("python"),
);
}
candidates.extend(
["python3", "python"]
.iter()
.filter_map(|name| find_executable(name, searched)),
);
dedupe_paths(candidates)
.into_iter()
.filter(|path| {
searched.push(path.clone());
            // Keep paths that exist on disk, plus bare command names (a
            // single component, e.g. `python3`) that the OS resolves via PATH.
            path.exists() || path.components().count() == 1
})
.collect()
}
/// Order-preserving dedupe; first occurrence wins.
fn dedupe_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
let mut out = Vec::new();
for path in paths {
if !out.contains(&path) {
out.push(path);
}
}
out
}
/// Extract the interpreter path from a console script's `#!` line so the
/// backing environment can be probed. An `#!/usr/bin/env python3` shebang
/// yields `python3` (resolved via PATH at spawn time) rather than `env`.
fn python_from_shebang(path: &Path) -> Option<PathBuf> {
    let bytes = std::fs::read(path).ok()?;
    let first_line = bytes.split(|byte| *byte == b'\n').next()?;
    let line = std::str::from_utf8(first_line).ok()?.trim();
    let shebang = line.strip_prefix("#!")?.trim();
    if !shebang.contains("python") {
        return None;
    }
    let mut tokens = shebang.split_whitespace();
    let first = tokens.next()?;
    // `#!/usr/bin/env python3` points at env, not the interpreter itself.
    if first == "env" || first.ends_with("/env") {
        tokens.next().map(PathBuf::from)
    } else {
        Some(PathBuf::from(first))
    }
}
/// Probe the invocation's Python for the `torch`/`torchvision` imports that
/// Qwen3-VL processors need at runtime. `None` means nothing missing (or the
/// interpreter is unknown, in which case we optimistically proceed).
fn missing_processor_deps(invocation: &CliInvocation) -> Option<String> {
let python = invocation.python.as_ref()?;
let output = Command::new(python)
.args(["-c", "import torch, torchvision"])
.output()
.ok()?;
if output.status.success() {
return None;
}
classify_missing_deps(&String::from_utf8_lossy(&output.stderr))
.or_else(|| Some(String::from_utf8_lossy(&output.stderr).trim().to_string()))
}
/// Map known torch/torchvision import failures in stderr to a short list of
/// missing package names; `None` when the failure is something else.
fn classify_missing_deps(stderr: &str) -> Option<String> {
let lower = stderr.to_ascii_lowercase();
let mut missing = Vec::new();
if lower.contains("no module named 'torch'")
|| lower.contains("no module named torch")
|| lower.contains("pytorch library but it was not found")
{
missing.push("torch");
}
if lower.contains("no module named 'torchvision'")
|| lower.contains("no module named torchvision")
|| lower.contains("torchvision library but it was not found")
{
missing.push("torchvision");
}
if missing.is_empty() {
None
} else {
Some(missing.join(", "))
}
}
/// Strip the `==========` banner pair that `mlx_vlm.generate` prints around
/// the response, dropping loading chatter before it and perf stats after.
fn parse_output(stdout: &str) -> String {
let mut in_body = false;
let mut body: Vec<&str> = Vec::new();
for line in stdout.lines() {
let trimmed = line.trim_end();
if !trimmed.is_empty() && trimmed.chars().all(|c| c == '=' || c.is_whitespace()) {
if !in_body {
in_body = true;
} else {
break;
}
continue;
}
if in_body {
body.push(line);
}
}
if body.is_empty() {
return stdout.trim().to_string();
}
body.join("\n").trim_end().to_string()
}
/// Default Hugging Face hub cache directory for a repo. Assumes the stock
/// `~/.cache/huggingface` layout; `HF_HOME`-style overrides are not consulted.
pub fn cached_repo_path(hf_repo: &str) -> PathBuf {
let home = std::env::var_os("HOME")
.map(PathBuf::from)
.unwrap_or_default();
home.join(".cache")
.join("huggingface")
.join("hub")
.join(format!("models--{}", hf_repo.replace('/', "--")))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_output_strips_banners_and_perf_lines() {
let raw = "Loading model...\n\
==========\n\
The image is a blank canvas with a grid pattern.\n\
==========\n\
Prompt: 234 tokens\n\
Generation: 12 tokens, 80.123 tokens/s\n\
Peak memory: 2.345 GB";
assert_eq!(
parse_output(raw),
"The image is a blank canvas with a grid pattern."
);
}
#[test]
fn parse_output_handles_missing_banners() {
let raw = " Some single-line response. \n";
assert_eq!(parse_output(raw), "Some single-line response.");
}
#[test]
fn parse_output_handles_multiline_body() {
let raw = "==========\nLine one.\nLine two.\n==========\nPrompt: 1 tokens";
assert_eq!(parse_output(raw), "Line one.\nLine two.");
}
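    // Guards the truncated-run case: if generation dies before the closing
    // banner, the partial body should still come through.
    #[test]
    fn parse_output_keeps_body_when_closing_banner_missing() {
        let raw = "==========\nPartial answer";
        assert_eq!(parse_output(raw), "Partial answer");
    }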
#[test]
fn classify_missing_deps_distinguishes_processor_imports() {
let stderr = "ImportError: Qwen3VLVideoProcessor requires the Torchvision library but it was not found in your environment.\n\
ModuleNotFoundError: No module named 'torch'";
assert_eq!(
classify_missing_deps(stderr),
Some("torch, torchvision".to_string())
);
}
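    #[test]
    fn classify_missing_deps_ignores_unrelated_errors() {
        assert_eq!(classify_missing_deps("ValueError: bad image shape"), None);
    }
    // Shebang sketches: scripts are staged with `tempfile`, which `generate`
    // already depends on for image staging; paths and names are illustrative.
    #[test]
    fn python_from_shebang_reads_direct_interpreter() {
        let dir = tempfile::tempdir().expect("tempdir");
        let script = dir.path().join("mlx_vlm.generate");
        std::fs::write(&script, "#!/opt/venv/bin/python3\n").expect("write script");
        assert_eq!(
            python_from_shebang(&script),
            Some(PathBuf::from("/opt/venv/bin/python3"))
        );
    }
    #[test]
    fn python_from_shebang_unwraps_env_indirection() {
        let dir = tempfile::tempdir().expect("tempdir");
        let script = dir.path().join("script");
        std::fs::write(&script, "#!/usr/bin/env python3\n").expect("write script");
        assert_eq!(python_from_shebang(&script), Some(PathBuf::from("python3")));
    }
    #[test]
    fn dedupe_paths_keeps_first_occurrence_order() {
        let paths = vec![
            PathBuf::from("/usr/bin"),
            PathBuf::from("/usr/local/bin"),
            PathBuf::from("/usr/bin"),
        ];
        assert_eq!(
            dedupe_paths(paths),
            vec![PathBuf::from("/usr/bin"), PathBuf::from("/usr/local/bin")]
        );
    }
    #[test]
    fn help_succeeds_is_false_for_missing_program() {
        let invocation = CliInvocation {
            program: PathBuf::from("/nonexistent/mlx_vlm.generate"),
            args: Vec::new(),
            label: "missing".to_string(),
            python: None,
        };
        assert!(!help_succeeds(&invocation));
    }
    #[test]
    fn missing_cli_message_lists_searched_paths() {
        let status = RuntimeStatus::MissingCli {
            searched: vec![PathBuf::from("/usr/bin/mlx_vlm.generate")],
        };
        assert!(!status.is_available());
        let message = status.user_message();
        assert!(message.contains("uv tool install mlx-vlm"));
        assert!(message.contains("/usr/bin/mlx_vlm.generate"));
    }
    // `generate` must fail fast on an empty image list before probing the
    // CLI, so this runs without mlx-vlm installed. The repo id is illustrative.
    #[test]
    fn generate_rejects_empty_image_list() {
        let err = generate("org/model", "hi", &[], 0.0, 16).unwrap_err();
        assert!(matches!(err, InferenceError::InferenceFailed(_)));
    }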
}