car-inference 0.8.0

Local model inference for CAR — Candle backend with Qwen3 models
Documentation
//! External LTX-2 backend — shells out to the `ltx-2-mlx` Python CLI.
//!
//! TEMPORARY BRIDGE. The native Rust MLX port (`mlx_ltx.rs`) is
//! structurally complete but not yet producing prompt-faithful output.
//! Until parity is reached, this backend delegates `car video` to
//! `dgrauet/ltx-2-mlx` — a known-working pure-MLX port on the same
//! quantized weights — so the CLI is usable today.
//!
//! The subprocess call is explicitly at the feature-surface boundary
//! (not an internal runtime dependency): the Rust pipeline constructs
//! the request, the external tool does the inference, the resulting
//! `.mp4` path is returned. When the native backend reaches parity,
//! swap the dispatch in `InferenceEngine::generate_video` back and
//! this file can be deleted.

use std::process::Command;

use crate::tasks::generate_video::{GenerateVideoRequest, GenerateVideoResult, VideoMode};
use crate::InferenceError;

/// Name of the Python CLI binary to invoke.
///
/// Handed to `Command::new` as a bare program name by both
/// `is_available` and `generate_video`, and interpolated into the
/// spawn-failure error message.
const CLI_BINARY: &str = "ltx-2-mlx";

/// Default model repo to pass via `--model`. Matches what the native
/// backend uses so both paths operate on the same quantized weights.
///
/// Applies only when the request's `model` field is `None`; the chosen
/// value is also echoed back in `model_used` with an `external:` prefix.
const DEFAULT_MODEL: &str = "dgrauet/ltx-2.3-mlx-q4";

/// Reports whether the external CLI can be launched.
///
/// Runs `<CLI_BINARY> --help` with stdout and stderr discarded and
/// returns `true` only when the process both starts and exits
/// successfully. Any spawn error (for example, the binary not being
/// found) is reported as `false`.
pub fn is_available() -> bool {
    use std::process::Stdio;

    let mut probe = Command::new(CLI_BINARY);
    probe
        .arg("--help")
        .stdout(Stdio::null())
        .stderr(Stdio::null());

    match probe.status() {
        Ok(exit) => exit.success(),
        Err(_) => false,
    }
}

/// Generate a video by shelling out to `ltx-2-mlx generate`.
///
/// The request is validated, translated into CLI flags, and the child
/// process is run to completion with its stdout/stderr captured.
/// Optional request fields (`width`, `height`, `num_frames`, `steps`,
/// `guidance`, `seed`) are only forwarded when set, so the CLI's own
/// defaults apply otherwise. When `output_path` is unset the video is
/// written to `output.mp4`.
///
/// Mode handling:
/// - `T2v` / `AudioVideo`: no extra flags.
/// - `I2v`: adds `--image <image_path>`.
/// - `AudioRefVideo`: no extra flags; a warning is logged because the
///   audio is not used for conditioning.
/// - `Extend` / `Retake`: rejected before the process is spawned.
///
/// On success the returned `video_path` is the path that was passed to
/// `--output`; this function does not itself check that the file exists.
///
/// # Errors
///
/// - `InferenceError::InferenceFailed` if request validation fails, if
///   `I2v` is requested without `image_path`, if the CLI cannot be
///   spawned, or if it exits unsuccessfully (the message then carries
///   the exit status plus captured stderr and stdout).
/// - `InferenceError::UnsupportedMode` for `Extend` and `Retake`.
pub fn generate_video(req: &GenerateVideoRequest) -> Result<GenerateVideoResult, InferenceError> {
    req.validate()
        .map_err(InferenceError::InferenceFailed)?;

    let output_path = req
        .output_path
        .clone()
        .unwrap_or_else(|| "output.mp4".to_string());

    // Resolve the model once so the `--model` flag and the reported
    // `model_used` can never drift apart.
    let model = req.model.as_deref().unwrap_or(DEFAULT_MODEL);

    let mode = req.effective_mode();
    let mut cmd = Command::new(CLI_BINARY);
    cmd.arg("generate")
        .arg("--prompt")
        .arg(&req.prompt)
        .arg("--output")
        .arg(&output_path)
        .arg("--model")
        .arg(model);

    // Forward only the knobs the caller actually set.
    if let Some(w) = req.width {
        cmd.arg("--width").arg(w.to_string());
    }
    if let Some(h) = req.height {
        cmd.arg("--height").arg(h.to_string());
    }
    if let Some(f) = req.num_frames {
        cmd.arg("--frames").arg(f.to_string());
    }
    if let Some(s) = req.steps {
        cmd.arg("--steps").arg(s.to_string());
    }
    if let Some(g) = req.guidance {
        cmd.arg("--cfg-scale").arg(g.to_string());
    }
    if let Some(seed) = req.seed {
        cmd.arg("--seed").arg(seed.to_string());
    }
    match mode {
        VideoMode::T2v | VideoMode::AudioVideo => {}
        VideoMode::I2v => {
            let path = req.image_path.as_deref().ok_or_else(|| {
                InferenceError::InferenceFailed(
                    "i2v requested but image_path is empty".to_string(),
                )
            })?;
            cmd.arg("--image").arg(path);
        }
        VideoMode::AudioRefVideo => {
            // Upstream Python CLI doesn't condition on audio either.
            // Validation (#130) rejects `image_path + audio_path` so
            // by the time we see AudioRefVideo here, image_path is
            // None — pure text-to-video at the CLI level. The
            // `audio_path` field is a marker of intent so downstream
            // tooling (musicart) can find it for muxing.
            //
            // Loud warn per #130 — the API name implies conditioning,
            // so surface the actual no-conditioning behavior every
            // call.
            tracing::warn!(
                audio_path = ?req.audio_path,
                "external ltx-2-mlx: audio_path is INFORMATIONAL ONLY — \
                 generated frames are pure text-to-video and not conditioned \
                 on the audio bytes. Caller muxes the song downstream. (#130)"
            );
        }
        VideoMode::Extend => {
            return Err(InferenceError::UnsupportedMode {
                mode: "extend",
                backend: "external-ltx-2-mlx",
                reason: "not exposed by the upstream Python CLI's argparse surface",
            });
        }
        VideoMode::Retake => {
            return Err(InferenceError::UnsupportedMode {
                mode: "retake",
                backend: "external-ltx-2-mlx",
                reason: "not exposed by the upstream Python CLI's argparse surface",
            });
        }
    }

    tracing::info!(?mode, prompt = %req.prompt, output = %output_path, "external ltx-2-mlx: invoking");
    let output = cmd.output().map_err(|e| {
        InferenceError::InferenceFailed(format!(
            "failed to spawn `{CLI_BINARY}`: {e}. \
             Install the companion Python port with `uv sync` in a clone of \
             https://github.com/dgrauet/ltx-2-mlx and add its venv bin to PATH."
        ))
    })?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let stdout = String::from_utf8_lossy(&output.stdout);
        // `ExitStatus`'s Display already reads like "exit status: 1", so
        // it is embedded as-is instead of behind another "status" label.
        // Trailing newlines from the child are trimmed to keep the
        // message on as few lines as possible.
        return Err(InferenceError::InferenceFailed(format!(
            "{CLI_BINARY} failed ({}): stderr={} stdout={}",
            output.status,
            stderr.trim(),
            stdout.trim()
        )));
    }

    Ok(GenerateVideoResult {
        video_path: output_path,
        media_type: "video/mp4".to_string(),
        model_used: Some(format!("external:{model}")),
    })
}