use std::process::Command;
use crate::tasks::generate_video::{GenerateVideoRequest, GenerateVideoResult, VideoMode};
use crate::InferenceError;
// Name of the external CLI executable this backend shells out to; it is
// probed via `--help` and must be discoverable on PATH.
const CLI_BINARY: &str = "ltx-2-mlx";
// Model identifier passed via `--model` when the request does not name one.
const DEFAULT_MODEL: &str = "dgrauet/ltx-2.3-mlx-q4";
/// Probes whether the external `ltx-2-mlx` CLI can be invoked.
///
/// Runs `ltx-2-mlx --help` with stdout/stderr discarded and reports `true`
/// only when the process both spawns and exits successfully.
pub fn is_available() -> bool {
    let probe = Command::new(CLI_BINARY)
        .arg("--help")
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status();
    // Any spawn failure (binary missing, not executable, …) counts as
    // "not available" rather than an error.
    matches!(probe, Ok(status) if status.success())
}
/// Generates a video by invoking the external `ltx-2-mlx` CLI.
///
/// Validates `req`, assembles the CLI argument list from the effective mode
/// and the request's optional knobs, runs the binary, and maps any failure
/// into [`InferenceError::InferenceFailed`]. `Extend` and `Retake` are
/// rejected up front because the upstream CLI does not expose them.
///
/// # Errors
///
/// Returns `InferenceError::InferenceFailed` on validation failure, missing
/// conditioning paths, spawn failure, or a non-zero CLI exit status, and
/// `InferenceError::UnsupportedMode` for `Extend`/`Retake`.
pub fn generate_video(req: &GenerateVideoRequest) -> Result<GenerateVideoResult, InferenceError> {
    req.validate().map_err(InferenceError::InferenceFailed)?;

    let output_path = req
        .output_path
        .clone()
        .unwrap_or_else(|| "output.mp4".to_string());
    let mode = req.effective_mode();
    // The `a2v` subcommand is used only when audio should actually condition
    // generation; with `audio_passthrough` the audio is mux-only.
    let audio_conditioning = mode == VideoMode::AudioRefVideo && !req.audio_passthrough;

    let subcommand = if audio_conditioning { "a2v" } else { "generate" };
    let model = req.model.as_deref().unwrap_or(DEFAULT_MODEL);

    let mut cmd = Command::new(CLI_BINARY);
    cmd.arg(subcommand)
        .arg("--prompt")
        .arg(&req.prompt)
        .arg("--output")
        .arg(&output_path)
        .arg("--model")
        .arg(model);

    // Forward optional numeric knobs only when the caller supplied them.
    if let Some(width) = req.width {
        cmd.arg("--width").arg(width.to_string());
    }
    if let Some(height) = req.height {
        cmd.arg("--height").arg(height.to_string());
    }
    if let Some(frames) = req.num_frames {
        cmd.arg("--frames").arg(frames.to_string());
    }
    if let Some(steps) = req.steps {
        if audio_conditioning {
            // `a2v` is a two-stage pipeline: map the single `steps` value to
            // stage 1 and disable stage 2.
            cmd.arg("--stage1-steps").arg(steps.to_string());
            cmd.arg("--stage2-steps").arg("0");
        } else {
            cmd.arg("--steps").arg(steps.to_string());
        }
    }
    if let Some(guidance) = req.guidance {
        cmd.arg("--cfg-scale").arg(guidance.to_string());
    }
    if let Some(seed) = req.seed {
        cmd.arg("--seed").arg(seed.to_string());
    }

    // Mode-specific conditioning inputs.
    match mode {
        // Plain text-to-video (with or without generated audio) takes no
        // extra inputs.
        VideoMode::T2v | VideoMode::AudioVideo => {}
        VideoMode::I2v => {
            let Some(image) = req.image_path.as_deref() else {
                return Err(InferenceError::InferenceFailed(
                    "i2v requested but image_path is empty".to_string(),
                ));
            };
            cmd.arg("--image").arg(image);
        }
        VideoMode::AudioRefVideo => {
            if audio_conditioning {
                let Some(audio) = req.audio_path.as_deref() else {
                    return Err(InferenceError::InferenceFailed(
                        "audio_ref_video requested but audio_path is empty".to_string(),
                    ));
                };
                cmd.arg("--audio").arg(audio);
                // A reference image is optional for a2v.
                if let Some(image) = req.image_path.as_deref() {
                    cmd.arg("--image").arg(image);
                }
            } else {
                tracing::warn!(
                    audio_path = ?req.audio_path,
                    "external ltx-2-mlx: audio_path is mux-only because audio_passthrough=true; \
                    generating text-only video via `generate`"
                );
            }
        }
        VideoMode::Extend => {
            return Err(InferenceError::UnsupportedMode {
                mode: "extend",
                backend: "external-ltx-2-mlx",
                reason: "not exposed by the upstream Python CLI's argparse surface",
            });
        }
        VideoMode::Retake => {
            return Err(InferenceError::UnsupportedMode {
                mode: "retake",
                backend: "external-ltx-2-mlx",
                reason: "not exposed by the upstream Python CLI's argparse surface",
            });
        }
    }

    tracing::info!(?mode, prompt = %req.prompt, output = %output_path, "external ltx-2-mlx: invoking");

    // Run to completion, capturing both output streams for diagnostics.
    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "failed to spawn `{CLI_BINARY}`: {e}. \
            Install the companion Python port with `uv sync` in a clone of \
            https://github.com/dgrauet/ltx-2-mlx and add its venv bin to PATH."
        ))
    })?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        return Err(InferenceError::InferenceFailed(format!(
            "ltx-2-mlx exited with status {}: stderr={stderr} stdout={stdout}",
            output.status
        )));
    }

    Ok(GenerateVideoResult {
        video_path: output_path,
        media_type: "video/mp4".to_string(),
        model_used: Some(format!("external:{model}")),
    })
}