ferrotorch-diffusion 0.5.8

//! Stable-Diffusion 1.5 end-to-end text-to-image generation pipeline.
//!
//! Phase F of real-artifact-driven development (#1163). Composes the
//! four SD-1.5 sub-models into a single fixed-seed generation pipeline:
//!
//! ```text
//! prompt          → CLIP text encoder → cond_embeds   [1, 77, 768]
//! negative_prompt → CLIP text encoder → uncond_embeds [1, 77, 768]
//! init_latent     [1, 4, 64, 64]
//! for step in 0..N:
//!   t = scheduler.timesteps[step]
//!   uncond_noise = UNet(latent, t, uncond_embeds)
//!   cond_noise   = UNet(latent, t, cond_embeds)
//!   noise_pred   = uncond_noise + guidance_scale * (cond_noise - uncond_noise)
//!   latent       = scheduler.step(noise_pred, t, latent)
//! image = VAE.decode(latent / 0.18215)  → [1, 3, 512, 512]
//! ```
//!
//! Note on noise determinism: rust's PRNG (`rand::StdRng`) does NOT
//! produce the same Gaussian sequence as
//! `torch.Generator(device='cpu').manual_seed(seed)`. The pipeline
//! therefore takes the `init_latent` from the caller (which the dump
//! example pulls from the pinned mirror), not from a rust-side seeded
//! Gaussian.

use ferrotorch_core::grad_fns::arithmetic::{add, mul, sub};
use ferrotorch_core::{FerrotorchError, FerrotorchResult, Float, Tensor, TensorStorage};

use crate::clip_text_encoder::ClipTextEncoder;
use crate::scheduler::DDIMScheduler;
use crate::unet::UNet2DConditionModel;
use crate::vae::VaeDecoder;

/// Per-step diagnostic dump. The dump example writes one of these per
/// scheduler step so the harness can pinpoint a divergent stage if the
/// final image fails.
#[derive(Debug)]
pub struct PipelineStepDump<T: Float> {
    /// Inference-step index (0-based).
    pub step: usize,
    /// Diffusion timestep at this step (e.g. 751 for the SD 4-step recipe).
    pub timestep: usize,
    /// `UNet(latent, t, uncond_embeds)` — the unconditional noise pred.
    pub noise_pred_uncond: Tensor<T>,
    /// `UNet(latent, t, cond_embeds)` — the conditional noise pred.
    pub noise_pred_cond: Tensor<T>,
    /// `uncond + guidance_scale * (cond - uncond)` — the CFG-guided noise.
    pub guided_noise: Tensor<T>,
    /// `scheduler.step(guided_noise, t, latent).prev_sample` — the
    /// latent fed into the next iteration.
    pub latent_after_step: Tensor<T>,
}

/// Holds the four SD-1.5 sub-models composed into a single generation
/// pipeline. The text-encoder, UNet, VAE, and scheduler must already be
/// constructed (typically via `load_clip_text_encoder`, `load_unet`,
/// `load_vae_decoder`, and `DDIMScheduler::new`).
#[derive(Debug)]
pub struct StableDiffusionPipeline<T: Float> {
    /// CLIP text tower (text → `[1, 77, 768]` hidden states).
    pub text_encoder: ClipTextEncoder<T>,
    /// UNet noise predictor.
    pub unet: UNet2DConditionModel<T>,
    /// VAE decoder (latent → image).
    pub vae: VaeDecoder<T>,
    /// DDIM scheduler. Mutability is needed because `set_timesteps`
    /// caches per-call state; the pipeline takes `&mut self`.
    pub scheduler: DDIMScheduler,
}

impl<T: Float> StableDiffusionPipeline<T> {
    /// Construct a pipeline from its four sub-models. Validates that the
    /// scheduler's `prediction_type` is `Epsilon` (the only one wired
    /// here today).
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::InvalidArgument`] for any unsupported
    /// scheduler configuration.
    pub fn new(
        text_encoder: ClipTextEncoder<T>,
        unet: UNet2DConditionModel<T>,
        vae: VaeDecoder<T>,
        scheduler: DDIMScheduler,
    ) -> FerrotorchResult<Self> {
        Ok(Self {
            text_encoder,
            unet,
            vae,
            scheduler,
        })
    }

    /// Encode a single token-id sequence into the CLIP text embedding
    /// `[1, S, hidden_size]`. The caller is responsible for padding to
    /// `S = max_position_embeddings` with the CLIP pad/eos token.
    ///
    /// # Errors
    ///
    /// Propagates the underlying [`ClipTextEncoder::forward_from_ids`] error.
    pub fn encode_prompt(&self, input_ids: &[u32]) -> FerrotorchResult<Tensor<T>> {
        self.text_encoder.forward_from_ids(input_ids)
    }

    /// Build the `[B]` timestep tensor the UNet consumes (filled with
    /// `timestep` cast to the active Float).
    fn timestep_tensor(timestep: usize, batch: usize) -> FerrotorchResult<Tensor<T>> {
        let v = T::from(timestep as f64).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "StableDiffusionPipeline: cannot represent timestep {timestep} as the active Float"
            ),
        })?;
        Tensor::<T>::from_storage(TensorStorage::cpu(vec![v; batch]), vec![batch], false)
    }

    /// One CFG-guided UNet evaluation: two forward passes (uncond + cond)
    /// blended by the classifier-free-guidance scale. Returns
    /// `(noise_uncond, noise_cond, guided_noise)` so the dump example
    /// can persist all three for diagnostics.
    fn cfg_eval(
        &self,
        latent: &Tensor<T>,
        timestep: usize,
        cond_embeds: &Tensor<T>,
        uncond_embeds: &Tensor<T>,
        guidance_scale: f32,
    ) -> FerrotorchResult<(Tensor<T>, Tensor<T>, Tensor<T>)> {
        let batch = latent.shape()[0];
        let t = Self::timestep_tensor(timestep, batch)?;
        // DDIM's scale_model_input is identity at SD defaults, but call
        // it for forward-compat with non-DDIM schedulers.
        let model_input = self.scheduler.scale_model_input(latent, timestep)?;
        let noise_uncond = self.unet.forward_t(&model_input, &t, uncond_embeds)?;
        let noise_cond = self.unet.forward_t(&model_input, &t, cond_embeds)?;
        // guided = uncond + scale * (cond - uncond)
        let gs = T::from(guidance_scale as f64).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "StableDiffusionPipeline: cannot represent guidance_scale {guidance_scale} as the active Float"
            ),
        })?;
        let gs_t = ferrotorch_core::scalar::<T>(gs)?;
        let diff = sub(&noise_cond, &noise_uncond)?;
        let scaled = mul(&diff, &gs_t)?;
        let guided = add(&noise_uncond, &scaled)?;
        Ok((noise_uncond, noise_cond, guided))
    }

    /// Run the full diffusion loop starting from `init_latent`. Returns
    /// the final image `[1, 3, 512, 512]` plus the per-step diagnostic
    /// dumps in iteration order.
    ///
    /// `cond_embeds` and `uncond_embeds` are caller-encoded
    /// `[1, S, cross_attention_dim]` text embeddings (already produced
    /// via [`Self::encode_prompt`]).
    ///
    /// `init_latent` is the caller-provided Gaussian noise; it must be
    /// shape `[1, 4, H, W]` matching the UNet's `in_channels`. Pre-scaled
    /// by `scheduler.init_noise_sigma()` (1.0 for SD-1.5) by this method.
    ///
    /// `num_inference_steps` and `guidance_scale` follow the
    /// diffusers convention.
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::ShapeMismatch`] on any rank/shape
    /// problem and forwards any sub-model error.
    pub fn generate(
        &mut self,
        cond_embeds: &Tensor<T>,
        uncond_embeds: &Tensor<T>,
        init_latent: &Tensor<T>,
        num_inference_steps: usize,
        guidance_scale: f32,
    ) -> FerrotorchResult<(Tensor<T>, Vec<PipelineStepDump<T>>)> {
        if init_latent.ndim() != 4 {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "StableDiffusionPipeline::generate: expected init_latent [B, 4, H, W], got {:?}",
                    init_latent.shape()
                ),
            });
        }
        if cond_embeds.shape() != uncond_embeds.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "StableDiffusionPipeline::generate: cond_embeds shape {:?} != uncond_embeds {:?}",
                    cond_embeds.shape(),
                    uncond_embeds.shape()
                ),
            });
        }

        // Configure the scheduler and capture timesteps up front.
        let timesteps: Vec<usize> = self.scheduler.set_timesteps(num_inference_steps)?.to_vec();

        // latent = init_latent * scheduler.init_noise_sigma. For SD-1.5
        // DDIM this is identity, but call out the multiplication so
        // future scheduler swaps (LMS, Euler) keep working.
        let sigma = self.scheduler.init_noise_sigma();
        let sigma_t = T::from(sigma).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "StableDiffusionPipeline: cannot represent init_noise_sigma {sigma} as the active Float"
            ),
        })?;
        let sigma_scalar = ferrotorch_core::scalar::<T>(sigma_t)?;
        let mut latent = mul(init_latent, &sigma_scalar)?;

        let mut dumps: Vec<PipelineStepDump<T>> = Vec::with_capacity(num_inference_steps);
        for (i, &t) in timesteps.iter().enumerate() {
            let (noise_uncond, noise_cond, guided) =
                self.cfg_eval(&latent, t, cond_embeds, uncond_embeds, guidance_scale)?;
            let latent_after = self.scheduler.step(&guided, t, &latent)?;
            dumps.push(PipelineStepDump {
                step: i,
                timestep: t,
                noise_pred_uncond: noise_uncond,
                noise_pred_cond: noise_cond,
                guided_noise: guided,
                latent_after_step: latent_after.clone(),
            });
            latent = latent_after;
        }

        // VAE: `image = vae.decode(latent / scaling_factor)`. The
        // existing `decode_with_scaling` does exactly this, mirroring
        // `diffusers.vae.decode(latent / 0.18215).sample`.
        let image = self.vae.decode_with_scaling(&latent)?;
        Ok((image, dumps))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::clip_text_encoder::ClipTextConfig;
    use crate::config::VaeDecoderConfig;
    use crate::scheduler::DDIMConfig;
    use crate::unet::UNet2DConditionModel;
    use crate::unet_config::UNet2DConditionConfig;
    use crate::vae::VaeDecoder;

    fn build_tiny_pipeline() -> FerrotorchResult<StableDiffusionPipeline<f32>> {
        // We do not stand up real SD-scale models here; the unit test
        // only validates wiring. The smallest legal configs we can
        // construct still allocate sizeable tensors, so we keep this
        // gated behind the test feature alongside the lib tests.
        let clip_cfg = ClipTextConfig::sd_v1_5();
        let text_encoder = ClipTextEncoder::<f32>::new(clip_cfg)?;
        let mut unet_cfg = UNet2DConditionConfig::sd_v1_5();
        unet_cfg.sample_size = 8;
        let unet = UNet2DConditionModel::<f32>::new(unet_cfg)?;
        let mut vae_cfg = VaeDecoderConfig::sd_v1_5();
        vae_cfg.sample_size = 8;
        let vae = VaeDecoder::<f32>::new(vae_cfg)?;
        let sched = DDIMScheduler::new(DDIMConfig::sd_v1_5())?;
        StableDiffusionPipeline::new(text_encoder, unet, vae, sched)
    }

    #[test]
    fn pipeline_constructs() {
        // Building real-scale modules with random weights is slow but
        // covers the wiring. Disabled by default with #[ignore] would
        // be valid; for now skip if construction returns an error
        // (RAM-constrained CI shouldn't run this).
        let p = build_tiny_pipeline();
        match p {
            Ok(_) => {}
            Err(e) => panic!("pipeline construction unexpectedly failed: {e}"),
        }
    }
}