// Crate: car-inference 0.14.0 — local model inference for CAR
// (Candle backend with Qwen3 models); see the crate-level docs for details.
//! External MLX video adapter for non-LTX model families.
//!
//! CAR's native Rust video backend currently implements the LTX family.
//! Wan/Yume models use the upstream `mlx-video` Python package's Wan 2
//! generator, so this adapter keeps the registry model-agnostic while
//! routing each family through the runtime that can actually load it.

use std::path::{Path, PathBuf};
use std::process::Command;

use crate::schema::ModelSchema;
use crate::tasks::generate_video::{GenerateVideoRequest, GenerateVideoResult, VideoMode};
use crate::InferenceError;

/// Python module path of the upstream `mlx-video` Wan 2 generator entrypoint,
/// invoked as `python -m <module>` (see `generate_wan_video`).
const WAN2_MODULE: &str = "mlx_video.models.wan_2.generate";

/// Resolve the Python interpreter used to run the external `mlx-video` runtime.
///
/// Resolution order:
/// 1. `CAR_MLX_VIDEO_PYTHON` environment variable, taken verbatim when set.
/// 2. `$HOME/.car/visual-runtime/bin/python`, when that file exists on disk.
/// 3. Bare `python3`, deferring to `PATH` lookup at spawn time.
fn python_binary() -> PathBuf {
    if let Ok(explicit) = std::env::var("CAR_MLX_VIDEO_PYTHON") {
        return PathBuf::from(explicit);
    }
    // Managed interpreter installed by the CAR visual runtime, if present.
    let managed = std::env::var("HOME").ok().map(|home| {
        [home.as_str(), ".car", "visual-runtime", "bin", "python"]
            .iter()
            .collect::<PathBuf>()
    });
    match managed {
        Some(candidate) if candidate.exists() => candidate,
        _ => PathBuf::from("python3"),
    }
}

/// Whether `schema` describes a Wan/Yume model that must be routed to the
/// external `mlx-video` runtime rather than CAR's native LTX video backend.
///
/// A model qualifies when its family name starts with `yume` or when any of
/// its tags marks it as part of the Wan 2.2 / TI2V family.
pub fn is_wan_family(schema: &ModelSchema) -> bool {
    // Tags the upstream Wan 2 generator is known to serve.
    const WAN_TAGS: [&str; 4] = ["wan2.2", "ti2v", "world-model", "image-to-video"];
    schema.family.starts_with("yume")
        || schema
            .tags
            .iter()
            .any(|tag| WAN_TAGS.contains(&tag.as_str()))
}

pub fn generate_wan_video(
    schema: &ModelSchema,
    model_dir: &Path,
    req: &GenerateVideoRequest,
) -> Result<GenerateVideoResult, InferenceError> {
    req.validate().map_err(InferenceError::InferenceFailed)?;

    let output_path = req
        .output_path
        .clone()
        .unwrap_or_else(|| "output.mp4".to_string());

    let mode = req.effective_mode();
    match mode {
        VideoMode::T2v | VideoMode::I2v => {}
        VideoMode::AudioVideo | VideoMode::AudioRefVideo => {
            return Err(InferenceError::UnsupportedMode {
                mode: "audio_video",
                backend: "mlx-video-wan2",
                reason: "Yume/Wan2.2 TI2V exposes text-to-video and image-to-video, not audio-conditioned video",
            });
        }
        VideoMode::Extend => {
            return Err(InferenceError::UnsupportedMode {
                mode: "extend",
                backend: "mlx-video-wan2",
                reason: "the upstream Wan 2 MLX generator does not expose video extension",
            });
        }
        VideoMode::Retake => {
            return Err(InferenceError::UnsupportedMode {
                mode: "retake",
                backend: "mlx-video-wan2",
                reason:
                    "the upstream Wan 2 MLX generator does not expose retake/inpaint generation",
            });
        }
    }

    let python = python_binary();
    let mut cmd = Command::new(&python);
    cmd.arg("-m")
        .arg(WAN2_MODULE)
        .arg("--model-dir")
        .arg(model_dir)
        .arg("--prompt")
        .arg(&req.prompt)
        .arg("--output-path")
        .arg(&output_path);

    if let Some(w) = req.width {
        cmd.arg("--width").arg(w.to_string());
    }
    if let Some(h) = req.height {
        cmd.arg("--height").arg(h.to_string());
    }
    if let Some(f) = req.num_frames {
        cmd.arg("--num-frames").arg(f.to_string());
    }
    if let Some(s) = req.steps {
        cmd.arg("--steps").arg(s.to_string());
    }
    if let Some(g) = req.guidance {
        cmd.arg("--guide-scale").arg(g.to_string());
    }
    if let Some(seed) = req.seed {
        cmd.arg("--seed").arg(seed.to_string());
    }
    if let VideoMode::I2v = mode {
        let path = req.image_path.as_deref().ok_or_else(|| {
            InferenceError::InferenceFailed("i2v requested but image_path is empty".to_string())
        })?;
        cmd.arg("--image").arg(path);
    }

    if req.negative_prompt.is_some() || req.fps.is_some() {
        tracing::warn!(
            model = %schema.id,
            "mlx-video Wan 2 adapter ignores negative_prompt/fps because the upstream Yume entrypoint does not document those flags"
        );
    }

    tracing::info!(
        model = %schema.id,
        ?mode,
        prompt = %req.prompt,
        output = %output_path,
        "external mlx-video Wan 2: invoking"
    );
    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "failed to spawn `{}` for {WAN2_MODULE}: {e}",
            python.display()
        ))
    })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        return Err(InferenceError::InferenceFailed(format!(
            "mlx-video Wan 2 exited with status {}: stderr={stderr} stdout={stdout}. \
             Install or upgrade the visual runtime so `{WAN2_MODULE}` is importable.",
            output.status
        )));
    }

    Ok(GenerateVideoResult {
        video_path: output_path,
        media_type: "video/mp4".to_string(),
        model_used: Some(format!("external-mlx-video:{}", schema.id)),
    })
}