ferrotorch-diffusion 0.5.7

//! GPU-resident VAE encoder forward path, mirroring [`crate::gpu::vae::GpuVaeDecoder`].
//!
//! Composes the existing `ferrotorch-gpu` element kernels (no new PTX) to
//! run the encoder forward + diagonal-Gaussian sampling end-to-end on
//! CUDA:
//!
//! ```text
//! image (host f32, [1, 3, 512, 512])
//!   -> cpu_to_gpu                                          ┐
//!   -> conv_in        (3 → block_out_channels[0])          │
//!   -> down_blocks[0..N]                                   │  all
//!        (resnet stack ×layers_per_block + optional        │  GPU
//!         downsample by stride-2 conv)                     │  forward
//!   -> mid_block      (resnet → attn → resnet)             │  path
//!   -> conv_norm_out  (GroupNorm)                          │
//!   -> SiLU                                                │
//!   -> conv_out       (top → 2 × latent_channels)          │
//!   -> quant_conv     (1×1 over 2L channels)               ┘
//!   -> diag_gauss_sample (channel split → clamp → exp →
//!                         × eps → + mean → × scaling_factor)
//!   -> gpu_to_cpu
//! latent (host f32, [1, 4, 64, 64])
//! ```
//!
//! Per the rust-gpu-discipline:
//!
//! - Every forward-path op routes through `ferrotorch-gpu`'s CUDA
//!   kernels. No `.cpu()` / `.to_cpu()` / `gpu_to_cpu` calls between
//!   two GPU ops on the forward path. The only host boundary
//!   transfers are at the API edges: a single `cpu_to_gpu` at entry
//!   and a single `gpu_to_cpu` at exit (matching the existing
//!   [`crate::gpu::vae::GpuVaeDecoder`] design).
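//! - This module does not yet expose a fully GPU-resident public entry
//!   point (GPU buffer in, GPU buffer out): `encode` and `encode_mode`
//!   always upload at entry and download at exit. The conformance
//!   test's GPU-residence trip-wire goes through
//!   `encode_with_gpu_params_probe`, which hands the intermediate
//!   params `CudaBuffer<f32>` to a caller-supplied hook before the
//!   sampling tail runs.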
//! - Channel-split for the diagonal-Gaussian parameters is done by
//!   `cudarc::CudaSlice::slice` (a metadata sub-view) followed by
//!   `stream.memcpy_dtod` into freshly-allocated [B, L, H, W] buffers
//!   — a single GPU→GPU copy each, no host bounce.
//! - Gaussian noise is generated by
//!   [`ferrotorch_gpu::rng::gpu_philox_normal`] (PTX Box-Muller), so
//!   the entire sample path stays on the device.
//!
//! B is restricted to 1 for the diagonal-Gaussian channel split, mirroring
//! the existing [`crate::gpu::pipeline::GpuStableDiffusionPipeline`]'s
//! batch-size assumption. Larger batches would need either a strided
//! channel-gather kernel or a per-batch loop; this is a documented
//! follow-on, not a silent fallback (the encoder returns
//! `FerrotorchError::InvalidArgument` for B > 1 rather than degrading
//! to CPU or producing incorrect output).

use ferrotorch_core::{FerrotorchError, FerrotorchResult, Tensor, TensorStorage};
use ferrotorch_gpu::{
    CudaBuffer, GpuDevice, GpuError,
    kernels::{gpu_add, gpu_clamp, gpu_exp, gpu_mul, gpu_scale, gpu_silu},
    rng::gpu_philox_normal,
    transfer::{alloc_zeros_f32, cpu_to_gpu, gpu_to_cpu},
};
use ferrotorch_nn::module::StateDict;

use crate::config::VaeDecoderConfig;
use crate::safetensors_loader::DropReport;
use crate::vae_encoder::VaeEncoder;

use super::vae::{
    GpuConv2d, GpuMidBlock, GpuResnet, attn_forward, conv_forward, gpu_err,
    group_norm_forward, pop_attn, pop_conv, pop_groupnorm, pop_resnet, resnet_forward,
    GpuGroupNorm,
};

/// Lower bound of the diffusers clamp range for
/// `DiagonalGaussianDistribution.logvar`.
/// Must stay in lockstep with the CPU
/// [`crate::vae_encoder::DiagonalGaussianDistribution`] constants.
const LOGVAR_CLAMP_MIN: f32 = -30.0;
/// Upper bound of the logvar clamp range; paired with
/// `LOGVAR_CLAMP_MIN` in the `gpu_clamp` call of the sampling tail.
const LOGVAR_CLAMP_MAX: f32 = 20.0;

/// `Downsample2D`: a single stride-2 `Conv2d(C → C, 3×3, pad=1)`.
///
/// Encoder-side mirror of [`super::vae::GpuUpsample`] — note this
/// uses a stride-2 conv to halve the spatial extent, whereas the
/// upsample uses a stride-1 conv after a nearest-2x upsample.
#[derive(Debug)]
struct GpuDownsample {
    /// The stride-2 convolution that performs the downsampling.
    conv: GpuConv2d,
    /// Channel count `C` of the conv (input and output); checked against
    /// the incoming activation shape in `downsample_forward`.
    channels: usize,
}

/// `DownEncoderBlock2D`: stack of resnets + optional `Downsample2D`.
///
/// Encoder-side mirror of [`super::vae::GpuUpDecoderBlock`]. The
/// encoder uses `layers_per_block` resnets per block (the decoder
/// uses `layers_per_block + 1`), matching the diffusers convention
/// for `AutoencoderKL`.
#[derive(Debug)]
struct GpuDownEncoderBlock {
    /// Resnets applied in order; the first one projects from the previous
    /// block's channel count to this block's channel count.
    resnets: Vec<GpuResnet>,
    /// Stride-2 downsample applied after the resnets; `None` for the
    /// final block.
    downsample: Option<GpuDownsample>,
}

/// VAE-encoder forward path resident on a single CUDA device.
///
/// Constructed from a [`VaeDecoderConfig`] (encoder and decoder share
/// the same config shape — see [`crate::vae_encoder::VaeEncoderConfig`])
/// and a host-side [`StateDict<f32>`] (the standard `encoder.*` /
/// `quant_conv.*` key layout produced by
/// [`crate::load_vae_encoder`]). Every parameter tensor is uploaded
/// once into GPU memory; the host copy is dropped after construction.
///
/// # Example
///
/// ```ignore
/// let device = GpuDevice::new(0)?;
/// let (cpu_enc, _drop) = load_vae_encoder::<f32>(weights, cfg.clone(), false)?;
/// // `from_module` returns the encoder together with a `DropReport`.
/// let (gpu, _dropped) = GpuVaeEncoder::from_module(&cpu_enc, &device)?;
/// let latent = gpu.encode(&image)?; // [1, 4, 64, 64]
/// ```
#[derive(Debug)]
pub struct GpuVaeEncoder {
    /// 3×3 input conv: `config.out_channels` → `block_out_channels[0]`.
    conv_in: GpuConv2d,
    /// One down-block per entry of `config.block_out_channels`.
    down_blocks: Vec<GpuDownEncoderBlock>,
    /// Mid-block at the deepest channel count (resnet → attn → resnet).
    mid_block: GpuMidBlock,
    /// GroupNorm applied before the final SiLU + `conv_out`.
    conv_norm_out: GpuGroupNorm,
    /// 3×3 output conv: deepest channel count → `2 * latent_channels`.
    conv_out: GpuConv2d,
    /// 1×1 conv over the `2 * latent_channels` mean/logvar channels.
    quant_conv: GpuConv2d,
    /// Architecture config; also supplies `latent_channels` and
    /// `scaling_factor` to the sampling tail.
    config: VaeDecoderConfig,
    /// Device handle passed to every kernel and transfer call.
    device: GpuDevice,
}

impl GpuVaeEncoder {
    /// Assemble a GPU-resident encoder from `config` and a host state-dict.
    ///
    /// Layers are popped out of `state` under the `encoder.*` and
    /// `quant_conv` prefixes in forward order: `conv_in`, every
    /// down-block, the mid-block, `conv_norm_out`, `conv_out`, and
    /// finally `quant_conv`. Whatever keys remain in `state` afterwards
    /// are returned, sorted, in the [`DropReport`].
    ///
    /// # Errors
    ///
    /// - Whatever `config.validate()` rejects.
    /// - [`FerrotorchError::InvalidArgument`] when
    ///   `config.block_out_channels` is empty.
    /// - Any error returned by the `pop_*` layer loaders is propagated
    ///   unchanged.
    pub fn new(
        config: VaeDecoderConfig,
        mut state: StateDict<f32>,
        device: GpuDevice,
    ) -> FerrotorchResult<(Self, DropReport)> {
        config.validate()?;

        // Epsilon and group count shared by every normalisation layer below.
        let norm_eps = 1e-6_f32;
        let norm_groups = config.norm_num_groups;
        let latent_c = config.latent_channels;
        let Some(&top_c) = config.block_out_channels.last() else {
            return Err(FerrotorchError::InvalidArgument {
                message: "GpuVaeEncoder: block_out_channels empty".into(),
            });
        };
        let bottom_c = config.block_out_channels[0];
        // Encoder side: exactly `layers_per_block` resnets per down-block.
        let layers = config.layers_per_block;

        // Input conv: 3×3, stride 1, pad 1, image channels → bottom_c.
        let conv_in = pop_conv(
            &mut state,
            "encoder.conv_in",
            config.out_channels,
            bottom_c,
            (3, 3),
            (1, 1),
            (1, 1),
            &device,
        )?;

        // Down-blocks, shallowest first. Every block except the last one
        // finishes with a stride-2 downsample conv.
        let block_count = config.block_out_channels.len();
        let last_block = block_count - 1;
        let mut down_blocks: Vec<GpuDownEncoderBlock> = Vec::with_capacity(block_count);
        let mut in_channels = bottom_c;
        for (block_idx, &out_channels) in config.block_out_channels.iter().enumerate() {
            // Only the first resnet of a block changes the channel count.
            let resnets = (0..layers)
                .map(|layer| {
                    let resnet_in = if layer == 0 { in_channels } else { out_channels };
                    pop_resnet(
                        &mut state,
                        &format!("encoder.down_blocks.{block_idx}.resnets.{layer}"),
                        resnet_in,
                        out_channels,
                        norm_groups,
                        norm_eps,
                        &device,
                    )
                })
                .collect::<FerrotorchResult<Vec<_>>>()?;

            let downsample = if block_idx < last_block {
                let conv = pop_conv(
                    &mut state,
                    &format!("encoder.down_blocks.{block_idx}.downsamplers.0.conv"),
                    out_channels,
                    out_channels,
                    (3, 3),
                    (2, 2),
                    (1, 1),
                    &device,
                )?;
                Some(GpuDownsample {
                    conv,
                    channels: out_channels,
                })
            } else {
                None
            };

            down_blocks.push(GpuDownEncoderBlock { resnets, downsample });
            in_channels = out_channels;
        }

        // Mid-block: resnet → attention → resnet, all at top_c channels.
        let first_mid_resnet = pop_resnet(
            &mut state,
            "encoder.mid_block.resnets.0",
            top_c,
            top_c,
            norm_groups,
            norm_eps,
            &device,
        )?;
        let mid_attention = pop_attn(
            &mut state,
            "encoder.mid_block.attentions.0",
            top_c,
            norm_groups,
            norm_eps,
            &device,
        )?;
        let second_mid_resnet = pop_resnet(
            &mut state,
            "encoder.mid_block.resnets.1",
            top_c,
            top_c,
            norm_groups,
            norm_eps,
            &device,
        )?;
        let mid_block = GpuMidBlock {
            resnets: vec![first_mid_resnet, second_mid_resnet],
            attentions: vec![mid_attention],
        };

        // Output GroupNorm over top_c channels.
        let conv_norm_out = pop_groupnorm(
            &mut state,
            "encoder.conv_norm_out",
            norm_groups,
            top_c,
            norm_eps,
            &device,
        )?;

        // Output conv: 3×3, top_c → 2 * latent_c (mean and logvar concatenated).
        let conv_out = pop_conv(
            &mut state,
            "encoder.conv_out",
            top_c,
            2 * latent_c,
            (3, 3),
            (1, 1),
            (1, 1),
            &device,
        )?;

        // quant_conv: 1×1, no padding, 2 * latent_c → 2 * latent_c.
        let quant_conv = pop_conv(
            &mut state,
            "quant_conv",
            2 * latent_c,
            2 * latent_c,
            (1, 1),
            (1, 1),
            (0, 0),
            &device,
        )?;

        // Anything still in the state-dict was not consumed by a layer;
        // report it (sorted) so the caller can audit it.
        let mut dropped: Vec<String> = state.keys().cloned().collect();
        dropped.sort();

        let encoder = Self {
            conv_in,
            down_blocks,
            mid_block,
            conv_norm_out,
            conv_out,
            quant_conv,
            config,
            device,
        };
        Ok((encoder, DropReport { dropped }))
    }

    /// Build a [`GpuVaeEncoder`] straight from an already-loaded CPU
    /// [`VaeEncoder`].
    ///
    /// Takes the CPU module's `state_dict()`, clones its config and the
    /// device handle, and delegates to [`Self::new`].
    ///
    /// # Errors
    ///
    /// Every error from [`Self::new`] is forwarded.
    pub fn from_module(
        cpu: &VaeEncoder<f32>,
        device: &GpuDevice,
    ) -> FerrotorchResult<(Self, DropReport)> {
        // Trait import only; needed for the `state_dict` method.
        use ferrotorch_nn::module::Module as _;

        let weights: StateDict<f32> = cpu.state_dict();
        let config = cpu.config.clone();
        Self::new(config, weights, device.clone())
    }

    /// Encoder forward pass over a GPU-resident input, stopping at the
    /// output of `quant_conv`.
    ///
    /// Returns the params buffer (mean and logvar concatenated on the
    /// channel axis) together with its shape. Every intermediate value
    /// is a `CudaBuffer<f32>`; nothing is downloaded in here.
    ///
    /// Internal helper — the public entry points ([`Self::encode`], etc.)
    /// add the upload, the sampling tail and the download around it.
    ///
    /// # Errors
    ///
    /// [`FerrotorchError::ShapeMismatch`] when `shape[1]` differs from
    /// `config.out_channels`; otherwise propagates layer/kernel errors.
    fn forward_to_params(
        &self,
        x: &CudaBuffer<f32>,
        shape: [usize; 4],
    ) -> FerrotorchResult<(CudaBuffer<f32>, [usize; 4])> {
        let dev = &self.device;
        let expected_c = self.config.out_channels;
        let got_c = shape[1];
        if got_c != expected_c {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "GpuVaeEncoder: expected input channels={expected_c}, got {got_c}"
                ),
            });
        }

        // Input conv.
        let (mut act, mut dims) = conv_forward(&self.conv_in, x, shape, dev)?;

        // Down-blocks: the resnet stack, then the stride-2 downsample if
        // the block has one.
        for block in &self.down_blocks {
            for resnet in &block.resnets {
                (act, dims) = resnet_forward(resnet, &act, dims, dev)?;
            }
            if let Some(down) = block.downsample.as_ref() {
                (act, dims) = downsample_forward(down, &act, dims, dev)?;
            }
        }

        // Mid-block: resnet 0 → attention → resnet 1.
        let mid = &self.mid_block;
        (act, dims) = resnet_forward(&mid.resnets[0], &act, dims, dev)?;
        (act, dims) = attn_forward(&mid.attentions[0], &act, dims, dev)?;
        (act, dims) = resnet_forward(&mid.resnets[1], &act, dims, dev)?;

        // Output head: GroupNorm → SiLU → conv_out → quant_conv.
        // `act` is reassigned (not shadowed by new names) so each previous
        // buffer is released as soon as its successor exists.
        act = group_norm_forward(&self.conv_norm_out, &act, dims, dev)?;
        act = gpu_silu(&act, dev).map_err(gpu_err)?;
        (act, dims) = conv_forward(&self.conv_out, &act, dims, dev)?;
        conv_forward(&self.quant_conv, &act, dims, dev)
    }

    /// Encode a host image to a latent by sampling the diagonal
    /// Gaussian and multiplying by the configured scaling factor.
    ///
    /// Mirrors `AutoencoderKL.encode(image).latent_dist.sample() *
    /// vae.config.scaling_factor`. The noise comes from the GPU Philox
    /// generator, so the result is not reproducible across calls; use
    /// [`Self::encode_mode`] when a deterministic result (the
    /// distribution mode) is wanted.
    ///
    /// # Errors
    ///
    /// [`FerrotorchError::ShapeMismatch`] if the image is not
    /// `[B, out_channels, H, W]`;
    /// [`FerrotorchError::InvalidArgument`] when `B != 1` (see module
    /// docs). GPU errors are propagated.
    pub fn encode(&self, image: &Tensor<f32>) -> FerrotorchResult<Tensor<f32>> {
        let deterministic = false;
        let (latent_gpu, latent_dims) = self.encode_to_gpu_buf(image, deterministic)?;
        // Single download at the API edge.
        let host = gpu_to_cpu(&latent_gpu, &self.device).map_err(gpu_err)?;
        Tensor::from_storage(TensorStorage::cpu(host), Vec::from(latent_dims), false)
    }

    /// Encode a host image and return the **distribution mode**
    /// (`mean * scaling_factor`) with no noise drawn.
    ///
    /// Intended for callers that need reproducible output. Mirrors
    /// `vae.encode(image).latent_dist.mode() * vae.config.scaling_factor`.
    ///
    /// # Errors
    ///
    /// Same as [`Self::encode`].
    pub fn encode_mode(&self, image: &Tensor<f32>) -> FerrotorchResult<Tensor<f32>> {
        let deterministic = true;
        let (mode_gpu, mode_dims) = self.encode_to_gpu_buf(image, deterministic)?;
        // Single download at the API edge.
        let host = gpu_to_cpu(&mode_gpu, &self.device).map_err(gpu_err)?;
        Tensor::from_storage(TensorStorage::cpu(host), Vec::from(mode_dims), false)
    }

    /// **Trip-wire entry point** for the rust-gpu-discipline GPU-residency
    /// test (forbidden pattern #7).
    ///
    /// Behaves like [`Self::encode`] (sampling path), except that `probe`
    /// is invoked with the `quant_conv` output — the `[B, 2L, H/8, W/8]`
    /// params `CudaBuffer<f32>` and its shape — after the forward pass
    /// and BEFORE the sampling tail. The hook can run any assertion on
    /// that buffer (e.g. that `params.len() == 2 * latent_channels * h *
    /// w`); the encoder then samples, scales and downloads as usual.
    ///
    /// # Errors
    ///
    /// Same as [`Self::encode`]. An `Err` returned by the hook is
    /// propagated and the sampling tail is skipped.
    pub fn encode_with_gpu_params_probe<F>(
        &self,
        image: &Tensor<f32>,
        probe: F,
    ) -> FerrotorchResult<Tensor<f32>>
    where
        F: FnOnce(&CudaBuffer<f32>, [usize; 4]) -> FerrotorchResult<()>,
    {
        let dims = image.shape();
        let expected_c = self.config.out_channels;
        let is_image_shaped = dims.len() == 4 && dims[1] == expected_c;
        if !is_image_shaped {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "GpuVaeEncoder::encode: expected [B, {expected_c}, H, W], got {dims:?}"
                ),
            });
        }

        // Entry boundary: the one host → device upload.
        let host_pixels = image.data()?;
        let device_pixels = cpu_to_gpu(host_pixels, &self.device).map_err(gpu_err)?;
        let nchw = [dims[0], dims[1], dims[2], dims[3]];
        let (params_buf, params_shape) = self.forward_to_params(&device_pixels, nchw)?;

        // Hand the params buffer to the caller at the point where the
        // forward pass ends and the sampling tail begins.
        probe(&params_buf, params_shape)?;

        let deterministic = false;
        let (latent_gpu, latent_dims) = diag_gauss_sample_with_scale_gpu(
            &params_buf,
            params_shape,
            self.config.latent_channels,
            self.config.scaling_factor as f32,
            deterministic,
            &self.device,
        )?;

        // Exit boundary: the one device → host download.
        let host_latent = gpu_to_cpu(&latent_gpu, &self.device).map_err(gpu_err)?;
        Tensor::from_storage(
            TensorStorage::cpu(host_latent),
            Vec::from(latent_dims),
            false,
        )
    }

    /// Common body of [`Self::encode`] and [`Self::encode_mode`].
    ///
    /// Validates the image shape, uploads the pixels, runs the forward
    /// pass, then either samples (`deterministic == false`) or takes the
    /// mode (`deterministic == true`) and applies the scaling factor.
    /// The result stays on the GPU; the public wrappers download it.
    fn encode_to_gpu_buf(
        &self,
        image: &Tensor<f32>,
        deterministic: bool,
    ) -> FerrotorchResult<(CudaBuffer<f32>, [usize; 4])> {
        let dims = image.shape();
        let expected_c = self.config.out_channels;
        let is_image_shaped = dims.len() == 4 && dims[1] == expected_c;
        if !is_image_shaped {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "GpuVaeEncoder::encode: expected [B, {expected_c}, H, W], got {dims:?}"
                ),
            });
        }

        // Entry boundary: the one host → device upload.
        let host_pixels = image.data()?;
        let device_pixels = cpu_to_gpu(host_pixels, &self.device).map_err(gpu_err)?;
        let nchw = [dims[0], dims[1], dims[2], dims[3]];
        let (params_buf, params_shape) = self.forward_to_params(&device_pixels, nchw)?;

        diag_gauss_sample_with_scale_gpu(
            &params_buf,
            params_shape,
            self.config.latent_channels,
            self.config.scaling_factor as f32,
            deterministic,
            &self.device,
        )
    }
}

// ---------------------------------------------------------------------------
// Per-layer helpers
// ---------------------------------------------------------------------------

/// Apply a `GpuDownsample`: check the channel count, then run its
/// stride-2 conv.
///
/// Returns [`FerrotorchError::ShapeMismatch`] when `shape[1]` is not
/// the channel count the downsample was built for.
fn downsample_forward(
    d: &GpuDownsample,
    x: &CudaBuffer<f32>,
    shape: [usize; 4],
    device: &GpuDevice,
) -> FerrotorchResult<(CudaBuffer<f32>, [usize; 4])> {
    let got = shape[1];
    if got == d.channels {
        conv_forward(&d.conv, x, shape, device)
    } else {
        Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "downsample_forward: expected {} channels, got {}",
                d.channels, got
            ),
        })
    }
}

/// Diagonal-Gaussian sample with scaling, ALL on GPU:
///
/// 1. Validate the shape (`2 * latent_channels` channels, `B == 1`) and
///    that the params buffer is long enough for that shape.
/// 2. Copy the mean half of the [1, 2L, H, W] params buffer into its own
///    buffer via cudarc::CudaSlice::slice + stream.memcpy_dtod — GPU→GPU
///    copy, no host bounce.
/// 3. If `deterministic`: return `mean * scaling_factor` (the
///    distribution mode). The logvar half is never touched on this path.
/// 4. Otherwise:
///    a. Copy the logvar half out the same way and clamp it to
///       `[LOGVAR_CLAMP_MIN, LOGVAR_CLAMP_MAX]` via `gpu_clamp`
///       (matches diffusers' DiagonalGaussianDistribution).
///    b. Sample eps from N(0, 1) via `gpu_philox_normal`.
///    c. std = exp(0.5 * logvar) via `gpu_scale` + `gpu_exp`.
///    d. noise = std * eps via `gpu_mul`.
///    e. sample = mean + noise via `gpu_add`.
///    f. Apply scaling_factor via `gpu_scale`.
///
/// Returns the scaled latent buffer and its `[1, L, H, W]` shape.
///
/// # Errors
///
/// - `ShapeMismatch` when the channel count is not `2 * latent_channels`
///   or when `params` holds fewer elements than the shape requires.
/// - `InvalidArgument` when `B != 1` — see module-level docs.
/// - GPU allocation / copy / kernel errors, wrapped by `gpu_err`.
fn diag_gauss_sample_with_scale_gpu(
    params: &CudaBuffer<f32>,
    params_shape: [usize; 4],
    latent_channels: usize,
    scaling_factor: f32,
    deterministic: bool,
    device: &GpuDevice,
) -> FerrotorchResult<(CudaBuffer<f32>, [usize; 4])> {
    let [b, c2, h, w] = params_shape;
    if c2 != 2 * latent_channels {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "diag_gauss_sample_with_scale_gpu: expected 2*{} channels, got {}",
                latent_channels, c2
            ),
        });
    }
    if b != 1 {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "diag_gauss_sample_with_scale_gpu: only B=1 is supported, got B={b}. \
                 The channel-split assumes a contiguous layout; B>1 requires a strided \
                 gather kernel (follow-on)."
            ),
        });
    }
    let latent_numel = latent_channels * h * w;

    // The sub-view calls below panic when the requested range runs past
    // the end of the buffer, so turn a too-short buffer into an error.
    if params.len() < 2 * latent_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "diag_gauss_sample_with_scale_gpu: params buffer holds {} elements, \
                 but shape {:?} requires {}",
                params.len(),
                params_shape,
                2 * latent_numel
            ),
        });
    }

    // GPU→GPU channel split via memcpy_dtod over CudaSlice sub-views.
    // For B=1 the two halves of [1, 2L, H, W] are contiguous in memory.
    let stream = device.stream();
    let src = params.inner();

    let mut mean_buf = alloc_zeros_f32(latent_numel, device).map_err(gpu_err)?;
    {
        let mean_view = src.slice(0..latent_numel);
        stream
            .memcpy_dtod(&mean_view, mean_buf.inner_mut())
            .map_err(|e| gpu_err(GpuError::from(e)))?;
    }

    let out_shape = [b, latent_channels, h, w];

    if deterministic {
        // Mode: return mean * scaling_factor. The logvar half is not
        // needed, so it is neither allocated nor copied on this path.
        let scaled = gpu_scale(&mean_buf, scaling_factor, device).map_err(gpu_err)?;
        return Ok((scaled, out_shape));
    }

    // Sampling path only: pull out the logvar half.
    let mut logvar_buf = alloc_zeros_f32(latent_numel, device).map_err(gpu_err)?;
    {
        let logvar_view = src.slice(latent_numel..(2 * latent_numel));
        stream
            .memcpy_dtod(&logvar_view, logvar_buf.inner_mut())
            .map_err(|e| gpu_err(GpuError::from(e)))?;
    }

    // Clamp logvar to [LOGVAR_CLAMP_MIN, LOGVAR_CLAMP_MAX].
    let logvar_clamped =
        gpu_clamp(&logvar_buf, LOGVAR_CLAMP_MIN, LOGVAR_CLAMP_MAX, device).map_err(gpu_err)?;

    // eps ~ N(0, 1), generated on the device.
    let eps = gpu_philox_normal(latent_numel, device).map_err(gpu_err)?;

    // std = exp(0.5 * logvar).
    let half_logvar = gpu_scale(&logvar_clamped, 0.5_f32, device).map_err(gpu_err)?;
    let std = gpu_exp(&half_logvar, device).map_err(gpu_err)?;

    // noise = std * eps.
    let noise = gpu_mul(&std, &eps, device).map_err(gpu_err)?;

    // sample = mean + noise.
    let sample = gpu_add(&mean_buf, &noise, device).map_err(gpu_err)?;

    // scaled = sample * scaling_factor.
    let scaled = gpu_scale(&sample, scaling_factor, device).map_err(gpu_err)?;

    Ok((scaled, out_shape))
}

#[cfg(all(test, feature = "cuda"))]
mod tests {
    use super::*;
    use crate::vae_encoder::VaeEncoder;
    use ferrotorch_nn::module::Module;

    /// Smallest config that still covers every architectural feature:
    /// four channel stages (hence 3 downsamples), a channel-changing
    /// resnet, and the mid-block attention. Kept small so the tests run
    /// quickly. Matches the decoder-side `tiny_cfg` in `gpu/vae.rs`.
    fn tiny_cfg() -> VaeDecoderConfig {
        let block_out_channels = vec![4, 8, 16, 16];
        VaeDecoderConfig {
            block_out_channels,
            layers_per_block: 1,
            norm_num_groups: 4,
            latent_channels: 4,
            out_channels: 3,
            sample_size: 8,
            scaling_factor: 0.18215,
        }
    }

    /// Deterministic [1, 3, 8, 8] test image with values in [-1, 1].
    ///
    /// Every row is constant, rows ramp from -1 upwards, and each channel
    /// is offset by 0.05 — a non-constant input so post-norm bugs are
    /// not masked.
    fn striped_image_tiny() -> Tensor<f32> {
        let data: Vec<f32> = (0..3)
            .flat_map(|c| (0..8).map(move |y| (c, y)))
            .flat_map(|(c, y)| {
                let base = (y as f32 / 8.0) * 2.0 - 1.0;
                let value = (base + c as f32 * 0.05).clamp(-1.0, 1.0);
                // One row: the same value repeated across the 8 columns.
                std::iter::repeat(value).take(8)
            })
            .collect();
        Tensor::from_storage(TensorStorage::cpu(data), vec![1, 3, 8, 8], false).unwrap()
    }

    /// `encode_mode` on the GPU must agree with the CPU encoder's mean
    /// (first half of its params output) times the scaling factor.
    ///
    /// The test returns early, passing, when no CUDA device 0 is
    /// available.
    #[test]
    fn gpu_encoder_mode_matches_cpu_mean_scaled_tiny() {
        // .mode() is deterministic (mean only, no Philox), so cpu vs gpu
        // outputs must agree to numerical tolerance.
        let Ok(device) = GpuDevice::new(0) else {
            return;
        };
        let cfg = tiny_cfg();
        let cpu = VaeEncoder::<f32>::new(cfg.clone()).unwrap();
        let (gpu, report) = GpuVaeEncoder::from_module(&cpu, &device).unwrap();
        assert!(
            report.dropped.is_empty(),
            "unexpected dropped keys: {:?}",
            report.dropped
        );

        let img = striped_image_tiny();
        let gpu_latent = gpu.encode_mode(&img).unwrap();

        // CPU reference: mean = first half of forward(img) (which is the
        // concatenated mean/logvar params tensor). Multiply by
        // scaling_factor to match encode_mode's tail.
        let cpu_params = cpu.forward(&img).unwrap();
        let cpu_chunks = cpu_params.chunk(2, 1).unwrap();
        let cpu_mean = &cpu_chunks[0];
        assert_eq!(gpu_latent.shape(), cpu_mean.shape());

        let gpu_data = gpu_latent.data().unwrap();
        let cpu_data = cpu_mean.data().unwrap();
        let sf = cfg.scaling_factor as f32;
        let mut max_abs = 0.0_f32;
        for (i, (g, c)) in gpu_data.iter().zip(cpu_data.iter()).enumerate() {
            let expected = c * sf;
            let d = (g - expected).abs();
            // A NaN difference compares false against everything, so it
            // would never raise `max_abs`; reject it explicitly instead
            // of letting a NaN output pass the tolerance check.
            assert!(
                d.is_finite(),
                "encode_mode: non-finite difference at element {i}: gpu = {g}, expected = {expected}"
            );
            if d > max_abs {
                max_abs = d;
            }
        }
        assert!(
            max_abs < 1e-3,
            "encode_mode: gpu vs (cpu_mean * scaling_factor) max_abs = {max_abs}"
        );
    }

    /// Sampling path smoke test: `encode` yields the expected latent
    /// shape and only finite values.
    ///
    /// No value comparison against the CPU is made because the GPU noise
    /// differs from the CPU noise; the value cross-check lives in the
    /// mode test. Returns early, passing, without CUDA device 0.
    #[test]
    fn gpu_encoder_sample_shape_and_finite_tiny() {
        let Ok(device) = GpuDevice::new(0) else {
            return;
        };
        let cfg = tiny_cfg();
        let cpu_encoder = VaeEncoder::<f32>::new(cfg.clone()).unwrap();
        let (gpu_encoder, _) = GpuVaeEncoder::from_module(&cpu_encoder, &device).unwrap();

        let latent = gpu_encoder.encode(&striped_image_tiny()).unwrap();

        // Tiny config: 3 downsamples on an 8×8 input leave a 1×1 latent.
        assert_eq!(latent.shape(), &[1, cfg.latent_channels, 1, 1]);

        let first_bad = latent.data().unwrap().iter().find(|v| !v.is_finite());
        if let Some(v) = first_bad {
            panic!("GPU encode produced non-finite value: {v}");
        }
    }

    /// Discipline trip-wire (forbidden pattern #7 / silent host
    /// readback).
    ///
    /// The probe hook is handed the params value at the boundary
    /// between the forward pass and the sampling tail, typed as
    /// `&CudaBuffer<f32>`. Inside the hook the test checks that the
    /// buffer length matches the reported shape, that the channel count
    /// is `2 * latent_channels`, and — after an explicit readback — that
    /// the mean and logvar halves are not element-wise identical.
    /// Returns early, passing, without CUDA device 0.
    #[test]
    fn gpu_encoder_params_probe_proves_gpu_residency() {
        let Ok(device) = GpuDevice::new(0) else {
            return;
        };
        let cfg = tiny_cfg();
        let cpu_encoder = VaeEncoder::<f32>::new(cfg.clone()).unwrap();
        let (gpu_encoder, _) = GpuVaeEncoder::from_module(&cpu_encoder, &device).unwrap();

        let img = striped_image_tiny();
        let hook_ran = std::cell::Cell::new(false);

        let hook = |params_buf: &CudaBuffer<f32>, shape: [usize; 4]| -> FerrotorchResult<()> {
            hook_ran.set(true);

            // Tiny config produces [1, 2*4=8, 1, 1] at the params boundary.
            let expected: usize = shape.iter().product();
            assert_eq!(
                params_buf.len(),
                expected,
                "params CudaBuffer len {} != expected {expected}",
                params_buf.len()
            );
            assert_eq!(shape[1], 2 * cfg.latent_channels);

            // Download the params and compare the two halves.
            let host = gpu_to_cpu(params_buf, &device).map_err(|e| {
                FerrotorchError::InvalidArgument {
                    message: format!("probe readback failed: {e}"),
                }
            })?;
            let (mean, logvar) = host.split_at(expected / 2);
            let halves_differ = mean
                .iter()
                .zip(logvar)
                .any(|(m, lv)| (m - lv).abs() > 1e-6);
            assert!(
                halves_differ,
                "mean and logvar halves are identical — channel-split bug?"
            );
            Ok(())
        };

        let latent = gpu_encoder
            .encode_with_gpu_params_probe(&img, hook)
            .unwrap();

        assert!(hook_ran.get(), "probe callback was never invoked");
        assert_eq!(latent.shape(), &[1, cfg.latent_channels, 1, 1]);
    }
}