// car-inference 0.13.0

use serde::{Deserialize, Serialize};

/// Serde default for [`GenerateVideoRequest::format`]: the `"mp4"` container.
fn default_video_format() -> String {
    String::from("mp4")
}

/// Video generation mode.
///
/// Inferred from the conditioning inputs when `mode` is unset, in this
/// order of precedence (see [`GenerateVideoRequest::effective_mode`]):
/// * `video_path` + either retake bound → [`VideoMode::Retake`]
/// * `video_path` alone → [`VideoMode::Extend`]
/// * `image_path` (with or without `audio_path`) → [`VideoMode::I2v`]
/// * `audio_path` without `image_path` → [`VideoMode::AudioRefVideo`]
/// * nothing → [`VideoMode::T2v`]
///
/// `AudioVideo` is never inferred; it must be requested explicitly via `mode`.
///
/// Variants are (de)serialized with snake_case names: `t2v`, `i2v`,
/// `audio_video`, `audio_ref_video`, `extend`, `retake`.
///
/// ## Wiring status
///
/// | Mode | Native MLX backend |
/// |---|---|
/// | `T2v` | fully wired |
/// | `I2v` | fully wired (requires `vae_encoder.safetensors`) |
/// | `AudioVideo` | fully wired |
/// | `AudioRefVideo` | text + existing-audio reference conditioning. LTX-family models route through `ltx-2-mlx a2v` until native Rust/MLX conditioning lands. Explicit `mode=audio_ref_video` with `image_path + audio_path` is rejected at validation (#130). |
/// | `Extend` | request surface only — returns `InferenceFailed` until backend wiring lands |
/// | `Retake` | request surface only — returns `InferenceFailed` until backend wiring lands |
///
/// The surface-only modes are included so higher-level tooling
/// doesn't need an ad-hoc LTX escape hatch once the backend does
/// implement them. Consumers should handle the documented error and
/// degrade accordingly in the meantime.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VideoMode {
    /// Text-to-video (default).
    T2v,
    /// Image-to-video: first frame conditioned on `image_path`.
    I2v,
    /// Joint text-to-(video + audio) synthesis. Output mp4 contains
    /// a synchronized audio track generated alongside the video.
    AudioVideo,
    /// Text + existing-audio video (#113, #130): text prompt steered by an
    /// existing audio file's timing, intensity, and rhythm. LTX-family models
    /// currently route this mode through the external `ltx-2-mlx a2v` adapter;
    /// the native Rust/MLX backend can replace that bridge once its
    /// audio-reference conditioning path is complete.
    ///
    /// What is rejected (`validate()` returns an error):
    /// - **text + audio_path + image_path** with this mode requested
    ///   explicitly: the combination still produces artifacts on the
    ///   native backend (#130). Pick one. When `mode` is unset, the same
    ///   inputs are inferred as [`VideoMode::I2v`] instead of this mode.
    AudioRefVideo,
    /// Video extension: continue an existing clip (from `video_path`)
    /// past its final frame (or after `extend_after_frame` if set).
    /// The prompt describes the motion to produce from there.
    Extend,
    /// Video retake: regenerate the half-open frame range
    /// `[retake_start_frame, retake_end_frame)` of the input
    /// `video_path`, leaving the surrounding frames untouched.
    /// The prompt describes the new behavior for the regenerated span.
    Retake,
}

/// Request to generate a video.
///
/// `prompt` is the only field without a serde default, so it must be
/// present in every deserialized payload. The rest of the shape depends
/// on the effective [`VideoMode`] — see [`GenerateVideoRequest::validate`]
/// for the combinations the runtime will accept.
///
/// Existing callers that only set `prompt` / `image_path` / `mode` see
/// identical behavior; `video_path` and the retake/extend range fields
/// are additive.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerateVideoRequest {
    /// Text prompt describing the video to generate.
    /// NOTE(review): `validate` does not check that this is non-empty.
    pub prompt: String,
    /// Model selector. NOTE(review): `None` presumably lets the runtime
    /// pick its default model — confirm against the engine.
    #[serde(default)]
    pub model: Option<String>,
    /// Optional negative prompt.
    #[serde(default)]
    pub negative_prompt: Option<String>,
    /// Requested output width (presumably pixels — TODO confirm).
    #[serde(default)]
    pub width: Option<u32>,
    /// Requested output height (presumably pixels — TODO confirm).
    #[serde(default)]
    pub height: Option<u32>,
    /// Requested number of frames to generate.
    #[serde(default)]
    pub num_frames: Option<u32>,
    /// Requested step count (presumably denoising steps — TODO confirm).
    #[serde(default)]
    pub steps: Option<u32>,
    /// Guidance scale for the video branch.
    #[serde(default)]
    pub guidance: Option<f32>,
    /// Classifier-free guidance scale for the audio branch in `AudioVideo`
    /// mode. When absent, defaults to `min(guidance, 3.0)` so audio doesn't
    /// inherit an over-strong video CFG that would cause timbre artifacts.
    #[serde(default)]
    pub audio_guidance: Option<f32>,
    /// Optional RNG seed.
    #[serde(default)]
    pub seed: Option<u64>,
    /// Requested frames per second.
    #[serde(default)]
    pub fps: Option<u32>,
    /// Where to write the generated video. NOTE(review): behavior when
    /// `None` is decided by the backend — not visible in this file.
    #[serde(default)]
    pub output_path: Option<String>,
    /// Output container format. Defaults to `"mp4"` when omitted from the
    /// payload and in [`Default`].
    #[serde(default = "default_video_format")]
    pub format: String,
    /// Reference image path for image-to-video mode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub image_path: Option<String>,
    /// Reference video path for extension or retake modes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub video_path: Option<String>,
    /// Reference audio path for [`VideoMode::AudioRefVideo`] (#113).
    /// The audio is conditioning input — the generator uses its
    /// timing, rhythm, vocal cadence, intensity, and transitions as
    /// control signal for the visual motion. Distinct from
    /// `AudioVideo`'s synthesised audio: this audio file already
    /// exists and the model conditions on it. Backends that don't
    /// support audio conditioning return `InferenceError::UnsupportedMode`
    /// when this is set.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub audio_path: Option<String>,
    /// Explicit opt-in to the path-records-only-no-conditioning
    /// behavior of `audio_path` (Parslee-ai/car#185). Default
    /// `false` — `audio_path` means real audio-reference conditioning.
    /// Callers that knowingly want the muxing-only workflow (audio path
    /// forwarded to the request for downstream tooling; video frames stay
    /// text-only) set this to `true`. The CLI's `--audio-mux` flag sets it
    /// implicitly.
    #[serde(default)]
    pub audio_passthrough: bool,
    /// For `VideoMode::Extend`: the frame index in `video_path` to
    /// resume generation from (0-indexed, inclusive). When `None`, the
    /// extension continues from the final frame of the input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub extend_after_frame: Option<u32>,
    /// For `VideoMode::Extend`: how many trailing frames of `video_path`
    /// the model should attend to when continuing the clip. Diffusion
    /// video extenders condition on a *window* rather than a single
    /// cut point, so this knob is orthogonal to `extend_after_frame`
    /// (which selects *where* to cut) — this selects *how much
    /// context* the model sees. `None` lets the backend pick its
    /// default (upstream LTX uses the last 8 frames).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub extend_context_frames: Option<u32>,
    /// For `VideoMode::Retake`: start of the frame range to regenerate
    /// (0-indexed, inclusive). Paired with `retake_end_frame` as a
    /// Rust-style half-open range `[start, end)`. This convention is
    /// enforced at the CAR API boundary; the backend adapter is
    /// responsible for converting to upstream LTX's convention when
    /// the native retake path lands.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub retake_start_frame: Option<u32>,
    /// For `VideoMode::Retake`: end of the frame range to regenerate
    /// (0-indexed, exclusive). See
    /// [`GenerateVideoRequest::retake_start_frame`].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub retake_end_frame: Option<u32>,
    /// Explicit mode override. When absent, the mode is inferred from
    /// the conditioning inputs as documented on [`VideoMode`].
    #[serde(default)]
    pub mode: Option<VideoMode>,
}

impl GenerateVideoRequest {
    /// Reports whether the request asks for real audio-reference
    /// conditioning rather than the explicit mux-only path.
    ///
    /// `true` only when all of these hold: `audio_path` is set, the
    /// effective mode is [`VideoMode::AudioRefVideo`], and the caller has
    /// not set `audio_passthrough`.
    pub fn requires_audio_passthrough_opt_in(&self) -> bool {
        if self.audio_passthrough || self.audio_path.is_none() {
            return false;
        }
        matches!(self.effective_mode(), VideoMode::AudioRefVideo)
    }

    /// Resolves the mode this request will run in.
    ///
    /// An explicit `mode` always wins. Otherwise the mode is inferred
    /// from which conditioning inputs are present, in this precedence:
    /// `video_path` (Retake if any retake bound is set, else Extend),
    /// then `image_path` (I2v), then `audio_path` (AudioRefVideo), and
    /// finally T2v when nothing is set.
    pub fn effective_mode(&self) -> VideoMode {
        if let Some(explicit) = self.mode {
            return explicit;
        }

        // Retake outranks Extend: a range on a video means "regenerate
        // this slice", not "extend past it".
        let has_retake_bound =
            self.retake_start_frame.is_some() || self.retake_end_frame.is_some();

        match (
            self.video_path.is_some(),
            self.image_path.is_some(),
            self.audio_path.is_some(),
        ) {
            (true, _, _) if has_retake_bound => VideoMode::Retake,
            (true, _, _) => VideoMode::Extend,
            // `image_path` wins over `audio_path` (#113, #130): an
            // inferred image+audio request runs as plain I2v. Only an
            // explicit mode=audio_ref_video with both inputs reaches the
            // #130 rejection in validate().
            (false, true, _) => VideoMode::I2v,
            (false, false, true) => VideoMode::AudioRefVideo,
            (false, false, false) => VideoMode::T2v,
        }
    }

    /// Reject request shapes where conditioning inputs contradict the
    /// mode or are missing. Call before running generation so callers
    /// get a precise diagnostic instead of a deep-in-the-backend error.
    ///
    /// # Errors
    ///
    /// Returns the message of the first violated rule. Rules are checked
    /// in a fixed order: the mode-independent "orphaned field" checks
    /// first, then the rules for the effective mode.
    pub fn validate(&self) -> Result<(), String> {
        // Yields the message paired with the first condition that is
        // true, or Ok when no rule is violated.
        fn first_violation(rules: &[(bool, &str)]) -> Result<(), String> {
            match rules.iter().find(|(violated, _)| *violated) {
                Some((_, message)) => Err((*message).to_owned()),
                None => Ok(()),
            }
        }

        let has_image = self.image_path.is_some();
        let has_video = self.video_path.is_some();
        let has_audio = self.audio_path.is_some();
        let has_extend_after = self.extend_after_frame.is_some();
        let has_extend_context = self.extend_context_frames.is_some();
        let has_retake_start = self.retake_start_frame.is_some();
        let has_retake_end = self.retake_end_frame.is_some();
        let has_retake_bound = has_retake_start || has_retake_end;

        // Mode-independent pre-checks: frame fields that mode inference
        // would otherwise ignore (e.g. a retake bound with no video_path
        // infers T2v) are reported as orphaned instead.
        first_violation(&[
            (
                has_retake_start != has_retake_end,
                "retake_start_frame and retake_end_frame must be set together",
            ),
            (
                has_extend_after && !has_video,
                "extend_after_frame requires video_path (set mode=extend)",
            ),
            (
                has_extend_context && !has_video,
                "extend_context_frames requires video_path (set mode=extend)",
            ),
            (
                has_retake_bound && !has_video,
                "retake_{start,end}_frame requires video_path (set mode=retake)",
            ),
        ])?;

        match self.effective_mode() {
            VideoMode::T2v => first_violation(&[
                (
                    has_image,
                    "mode=t2v set explicitly but image_path was also provided; \
                     omit mode or set mode=i2v",
                ),
                (
                    has_video,
                    "mode=t2v does not accept video_path; \
                     set mode=extend or mode=retake",
                ),
            ]),
            VideoMode::I2v => first_violation(&[
                (!has_image, "mode=i2v requires image_path"),
                (
                    has_video,
                    "mode=i2v does not accept video_path; use mode=extend or mode=retake",
                ),
            ]),
            VideoMode::AudioVideo => first_violation(&[
                (
                    has_image,
                    "mode=audio_video does not accept image_path; \
                     image conditioning is not wired for joint audio+video synthesis",
                ),
                (has_video, "mode=audio_video does not accept video_path"),
            ]),
            VideoMode::Extend => first_violation(&[
                (!has_video, "mode=extend requires video_path"),
                (
                    has_retake_bound,
                    "mode=extend does not accept retake_start_frame / retake_end_frame; \
                     use mode=retake",
                ),
                (
                    has_image,
                    "mode=extend does not accept image_path; extend is video-conditioned",
                ),
            ]),
            VideoMode::Retake => {
                let both_bounds = has_retake_start && has_retake_end;
                // Half-open range [start, end): an empty or inverted
                // range is an error.
                let ordered = matches!(
                    (self.retake_start_frame, self.retake_end_frame),
                    (Some(start), Some(end)) if start < end
                );
                first_violation(&[
                    (!has_video, "mode=retake requires video_path"),
                    (
                        both_bounds && !ordered,
                        "mode=retake requires retake_start_frame < retake_end_frame",
                    ),
                    (
                        !both_bounds,
                        "mode=retake requires both retake_start_frame and retake_end_frame",
                    ),
                    (
                        has_image,
                        "mode=retake does not accept image_path; retake is video-conditioned",
                    ),
                    (
                        has_extend_after,
                        "mode=retake does not accept extend_after_frame; use mode=extend",
                    ),
                    (
                        has_extend_context,
                        "mode=retake does not accept extend_context_frames; use mode=extend",
                    ),
                ])
            }
            VideoMode::AudioRefVideo => first_violation(&[
                (!has_audio, "mode=audio_ref_video requires audio_path"),
                (has_video, "mode=audio_ref_video does not accept video_path"),
                // #130: image_path + audio_path produced severe
                // woven/canvas artifacts on the native LTX backend in
                // musicart canary testing. Until that is understood,
                // callers pick one anchor: text+audio or text+image.
                (
                    has_image,
                    "mode=audio_ref_video does not currently accept image_path: \
                     the combination produces severe artifacts on the native LTX \
                     backend (#130). Use mode=i2v for image-anchored generation \
                     (audio is then a downstream mux concern), or omit image_path \
                     for text+audio",
                ),
                (
                    has_retake_bound || has_extend_after || has_extend_context,
                    "mode=audio_ref_video does not accept frame-range fields; \
                     retake/extend semantics don't apply",
                ),
            ]),
        }
    }
}

impl Default for GenerateVideoRequest {
    /// Builds an empty-prompt request: every optional input is unset,
    /// `audio_passthrough` is off, and `format` uses the same default as
    /// deserialization (`default_video_format`).
    fn default() -> Self {
        let format = default_video_format();
        Self {
            // Prompting and model selection.
            prompt: String::default(),
            negative_prompt: None,
            model: None,

            // Sampling and output parameters.
            width: None,
            height: None,
            num_frames: None,
            fps: None,
            steps: None,
            guidance: None,
            audio_guidance: None,
            seed: None,
            output_path: None,
            format,

            // Conditioning inputs.
            image_path: None,
            video_path: None,
            audio_path: None,
            audio_passthrough: bool::default(),

            // Extend / retake frame controls.
            extend_after_frame: None,
            extend_context_frames: None,
            retake_start_frame: None,
            retake_end_frame: None,

            // No explicit mode: inferred from the inputs above.
            mode: None,
        }
    }
}

/// Video generation result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GenerateVideoResult {
    /// Path of the produced video (presumably a filesystem path — TODO confirm).
    pub video_path: String,
    /// Media type of the produced file (presumably a MIME type such as
    /// `video/mp4` — TODO confirm against the producer).
    pub media_type: String,
    /// Model that produced the video, when known. Omitted from the
    /// serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_used: Option<String>,
}

// Unit tests for mode inference (`effective_mode`), request validation
// (`validate`), and the audio-passthrough gate predicate.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a request with only `prompt` set; every other field takes
    /// its `Default` value.
    fn req(prompt: &str) -> GenerateVideoRequest {
        GenerateVideoRequest {
            prompt: prompt.into(),
            ..GenerateVideoRequest::default()
        }
    }

    #[test]
    fn legacy_payload_deserializes_with_new_fields_defaulted() {
        // Existing JSON from v1 callers must still parse.
        let json = r#"{"prompt":"a cat","image_path":"/tmp/cat.png","format":"mp4"}"#;
        let r: GenerateVideoRequest = serde_json::from_str(json).unwrap();
        assert_eq!(r.effective_mode(), VideoMode::I2v);
        assert!(r.video_path.is_none());
        assert!(r.extend_after_frame.is_none());
        assert!(r.retake_start_frame.is_none());
        r.validate().unwrap();
    }

    #[test]
    fn effective_mode_infers_extend_from_video_path_alone() {
        // video_path with no retake bounds infers Extend and validates.
        let r = GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            ..req("continue walking")
        };
        assert_eq!(r.effective_mode(), VideoMode::Extend);
        r.validate().unwrap();
    }

    #[test]
    fn effective_mode_infers_retake_from_range_on_video() {
        // video_path plus a well-ordered range infers Retake and validates.
        let r = GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            retake_start_frame: Some(10),
            retake_end_frame: Some(20),
            ..req("replace this span with a zoom")
        };
        assert_eq!(r.effective_mode(), VideoMode::Retake);
        r.validate().unwrap();
    }

    #[test]
    fn explicit_mode_overrides_inference() {
        // Caller explicitly sets T2v but also provides image_path — that's a conflict.
        let r = GenerateVideoRequest {
            image_path: Some("/tmp/a.png".into()),
            mode: Some(VideoMode::T2v),
            ..req("x")
        };
        assert!(r.validate().is_err());
    }

    #[test]
    fn extend_requires_video_path() {
        // Explicit Extend with nothing to extend is rejected.
        let r = GenerateVideoRequest {
            mode: Some(VideoMode::Extend),
            ..req("x")
        };
        assert!(r.validate().is_err());
    }

    #[test]
    fn retake_requires_both_frame_bounds_in_order() {
        let base = GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            mode: Some(VideoMode::Retake),
            ..req("x")
        };
        // Missing end.
        assert!(GenerateVideoRequest {
            retake_start_frame: Some(0),
            ..base.clone()
        }
        .validate()
        .is_err());
        // Start >= end (an empty half-open range is rejected too).
        assert!(GenerateVideoRequest {
            retake_start_frame: Some(10),
            retake_end_frame: Some(10),
            ..base.clone()
        }
        .validate()
        .is_err());
        // Valid.
        GenerateVideoRequest {
            retake_start_frame: Some(10),
            retake_end_frame: Some(20),
            ..base
        }
        .validate()
        .unwrap();
    }

    #[test]
    fn extend_rejects_retake_range_fields() {
        // Explicit Extend must not carry retake bounds.
        let r = GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            mode: Some(VideoMode::Extend),
            retake_start_frame: Some(0),
            retake_end_frame: Some(5),
            ..req("x")
        };
        assert!(r.validate().is_err());
    }

    #[test]
    fn retake_rejects_extend_after_frame() {
        // Mode is inferred as Retake here; extend_after_frame conflicts with it.
        let r = GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            retake_start_frame: Some(0),
            retake_end_frame: Some(5),
            extend_after_frame: Some(10),
            ..req("x")
        };
        assert!(r.validate().is_err());
    }

    #[test]
    fn stray_extend_fields_without_video_path_are_rejected() {
        // Pre-check catches the case where mode inference falls
        // through to T2v and would otherwise silently drop the fields.
        assert!(GenerateVideoRequest {
            extend_after_frame: Some(10),
            ..req("x")
        }
        .validate()
        .is_err());
        assert!(GenerateVideoRequest {
            extend_context_frames: Some(8),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    #[test]
    fn stray_retake_bounds_without_video_path_are_rejected() {
        // Both bounds set, but there is no video to retake.
        assert!(GenerateVideoRequest {
            retake_start_frame: Some(0),
            retake_end_frame: Some(5),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    #[test]
    fn retake_bounds_must_be_set_together() {
        // Only start — end missing.
        assert!(GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            retake_start_frame: Some(0),
            ..req("x")
        }
        .validate()
        .is_err());
        // Only end — start missing.
        assert!(GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            retake_end_frame: Some(5),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    #[test]
    fn extend_accepts_context_frames_but_retake_rejects_it() {
        // Extend + context_frames is fine.
        GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            extend_context_frames: Some(8),
            ..req("x")
        }
        .validate()
        .unwrap();
        // Retake + context_frames is a conflict.
        assert!(GenerateVideoRequest {
            video_path: Some("/tmp/clip.mp4".into()),
            retake_start_frame: Some(0),
            retake_end_frame: Some(5),
            extend_context_frames: Some(8),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    #[test]
    fn audio_video_still_rejects_image_and_video() {
        // AudioVideo accepts neither image nor video conditioning.
        assert!(GenerateVideoRequest {
            image_path: Some("/tmp/a.png".into()),
            mode: Some(VideoMode::AudioVideo),
            ..req("x")
        }
        .validate()
        .is_err());
        assert!(GenerateVideoRequest {
            video_path: Some("/tmp/a.mp4".into()),
            mode: Some(VideoMode::AudioVideo),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    #[test]
    fn audio_ref_video_infers_from_audio_path_alone() {
        // text + audio_path (no image) is the supported AudioRefVideo
        // shape: inference resolves to AudioRefVideo and validate()
        // accepts it.
        let audio_only = GenerateVideoRequest {
            audio_path: Some("/tmp/ref.wav".into()),
            audio_passthrough: false,
            ..req("x")
        };
        assert_eq!(audio_only.effective_mode(), VideoMode::AudioRefVideo);
        audio_only.validate().unwrap();
    }

    #[test]
    fn audio_ref_video_rejects_image_plus_audio_combination() {
        // #130: musicart canary observed severe woven/canvas
        // artifacts on the native backend with `--audio --image`.
        // Until a real audio VAE encoder lands, callers pick one
        // anchor — text+audio (clean) or text+image (clean i2v).
        let image_and_audio = GenerateVideoRequest {
            image_path: Some("/tmp/anchor.png".into()),
            audio_path: Some("/tmp/ref.wav".into()),
            audio_passthrough: false,
            ..req("x")
        };
        // effective_mode falls through to I2V instead of silently
        // collapsing to AudioRefVideo — that way the user gets the
        // i2v path they probably wanted, not a no-op audio path
        // that ignores the image.
        assert_eq!(image_and_audio.effective_mode(), VideoMode::I2v);

        // Explicit AudioRefVideo + both fields is the case where the
        // caller really meant "use both" — that's where we draw the
        // line and reject loudly.
        let explicit = GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            image_path: Some("/tmp/anchor.png".into()),
            audio_path: Some("/tmp/ref.wav".into()),
            audio_passthrough: false,
            ..req("x")
        };
        let err = explicit.validate().unwrap_err();
        assert!(
            err.contains("does not currently accept image_path"),
            "expected #130 error message, got: {err}"
        );
    }

    #[test]
    fn audio_ref_video_requires_audio_and_rejects_video_path() {
        // Explicit AudioRefVideo without audio_path is rejected.
        assert!(GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            ..req("x")
        }
        .validate()
        .is_err());
        // Explicit AudioRefVideo with a video_path is rejected.
        assert!(GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            audio_path: Some("/tmp/ref.wav".into()),
            audio_passthrough: false,
            video_path: Some("/tmp/clip.mp4".into()),
            ..req("x")
        }
        .validate()
        .is_err());
    }

    // Parslee-ai/car#185 — engine-level gate predicate tests.
    // The engine entry point calls `requires_audio_passthrough_opt_in()`
    // before dispatching to any backend; these tests cover the
    // matrix without needing to construct an InferenceEngine.

    #[test]
    fn passthrough_gate_fires_for_audio_ref_video_without_opt_in() {
        // audio_path set, AudioRefVideo mode, no opt-in: the gate fires.
        let r = GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            audio_path: Some("/tmp/song.wav".into()),
            audio_passthrough: false,
            ..req("x")
        };
        assert!(r.requires_audio_passthrough_opt_in());
    }

    #[test]
    fn passthrough_gate_skipped_when_opt_in_set() {
        // Same shape with audio_passthrough=true: the gate stays quiet.
        let r = GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            audio_path: Some("/tmp/song.wav".into()),
            audio_passthrough: true,
            ..req("x")
        };
        assert!(!r.requires_audio_passthrough_opt_in());
    }

    #[test]
    fn passthrough_gate_skipped_when_audio_path_absent() {
        let r = GenerateVideoRequest {
            mode: Some(VideoMode::AudioRefVideo),
            audio_path: None,
            audio_passthrough: false,
            ..req("x")
        };
        // audio_path absent means the AudioRefVideo mode itself is
        // invalid (validate() rejects), but the passthrough gate
        // isn't what should fire — the validate() path handles it.
        assert!(!r.requires_audio_passthrough_opt_in());
    }

    #[test]
    fn passthrough_gate_skipped_for_audio_video_mode() {
        // AudioVideo (joint text-to-video+audio synthesis) is a
        // different code path — no input audio to gate.
        let r = GenerateVideoRequest {
            mode: Some(VideoMode::AudioVideo),
            audio_path: Some("/tmp/song.wav".into()),
            audio_passthrough: false,
            ..req("x")
        };
        assert!(!r.requires_audio_passthrough_opt_in());
    }
}