ferrotorch-core 0.6.2

//! Forward-pass implementations for N-D indexing operations.
//!
//! - `gather(input, dim, index)` — gather elements along an axis
//! - `scatter(input, dim, index, src)` — scatter src values into input
//! - `scatter_add(input, dim, index, src)` — scatter with addition
//! - `where_cond(condition, x, y)` — ternary selection
//!
//! `gather` / `scatter` / `scatter_value` / `scatter_add` have CUDA-resident
//! fast paths (f32/f64) that dispatch through `GpuBackend::{op}_dim_{f32,f64}`
//! to the PTX kernels in `ferrotorch-gpu/src/scatter_gather_kernels.rs` (the
//! host `&[usize]` index is uploaded as a resident `i64` buffer; the result
//! stays GPU-resident). bf16/f16 CUDA inputs return `NotImplementedOnCuda`.
//! `where_cond` (host-`&[bool]`) is CPU-only; `where_cond_bt` / `masked_select`
//! have their own GPU-resident paths (#1185 / #1187).
//! Backward (gradient) functions live in `grad_fns::indexing`.
//!
//! ## REQ status (per `.design/ferrotorch-core/ops/indexing.md`)
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 | SHIPPED | `gather` at `ops/indexing.rs:112`; consumer: re-export `ferrotorch_core::gather` at `lib.rs:174` |
//! | REQ-2 | SHIPPED | `scatter` at `ops/indexing.rs:183` + scalar-src overload `scatter_value` at `ops/indexing.rs:306` (closes #1258 mirroring `aten/src/ATen/native/TensorAdvancedIndexing.cpp:2278`); consumer: re-export at `lib.rs:174`; non-test consumer for `scatter_value`: `Tensor::scatter_value_t` at `methods.rs:1166`. |
//! | REQ-3 | SHIPPED | `scatter_add` at `ops/indexing.rs:259`; consumer: `grad_fns::cumulative::cumsum_backward` at `grad_fns/cumulative.rs:503` invokes `ops::indexing::scatter_add` |
//! | REQ-4 | SHIPPED | `where_cond` at `ops/indexing.rs:334`; consumer: re-export at `lib.rs:174`; `where_cond_bt` CPU fallback at `:458` |
//! | REQ-5 | SHIPPED | `where_cond_bt` at `ops/indexing.rs:397`; consumer: `grad_fns::indexing::where_differentiable` at `grad_fns/indexing.rs:1845,1853` |
//! | REQ-6 | SHIPPED | `masked_select` at `ops/indexing.rs:478`; consumer: `tensor::Tensor::masked_select` at `tensor.rs:1146`; `grad_fns::indexing::masked_select_backward` at `grad_fns/indexing.rs:1823,1828` |
//! | REQ-7 | SHIPPED | grad-fn attachment (e.g. `gather` at `ops/indexing.rs:154-164`); consumer: every autograd-tracking caller |
//! | REQ-8 | SHIPPED | `validate_gather_shapes` at `ops/indexing.rs:66`; consumer: `gather`/`scatter`/`scatter_add` |

use std::sync::Arc;

use crate::autograd::no_grad::is_grad_enabled;
use crate::dtype::{DType, Float};
use crate::error::{FerrotorchError, FerrotorchResult};
use crate::gpu_dispatch::GpuBufferHandle;
use crate::shape::normalize_axis;
use crate::storage::TensorStorage;
use crate::tensor::Tensor;

/// Split `shape` at axis `dim` into the `(outer, dim_size, inner)` triple used
/// by the `[outer, dim_size, inner]` GPU kernel layout of the dim-aware
/// gather/scatter family (`scatter_gather_kernels.rs`).
///
/// - `outer` is the product of the extents before `dim` (`1` when `dim == 0`),
/// - `dim_size` is `shape[dim]`,
/// - `inner` is the product of the extents after `dim` (`1` for the last axis).
///
/// Mirrors the per-dim stride decomposition in
/// `aten/src/ATen/native/cuda/ScatterGatherKernel.cu`.
///
/// # Panics
/// Panics if `dim >= shape.len()`.
#[inline]
fn factor(shape: &[usize], dim: usize) -> (usize, usize, usize) {
    let (leading, from_dim) = shape.split_at(dim);
    let dim_size = from_dim[0];
    let trailing = &from_dim[1..];
    (
        leading.iter().product(),
        dim_size,
        trailing.iter().product(),
    )
}

/// Upload a host `&[usize]` index slice to a GPU-resident `i64` buffer on
/// `ordinal` (PyTorch index tensors are `int64`). The dim-aware
/// gather/scatter CUDA kernels read the index with `ld.global.s64`, so the
/// host indices are widened to `i64` before the copy. The core validator
/// (`validate_gather_shapes`) has already rejected out-of-range values, so
/// every uploaded index is in-bounds along `dim`.
fn upload_index_i64(index: &[usize], ordinal: usize) -> FerrotorchResult<GpuBufferHandle> {
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let widened: Vec<i64> = index.iter().map(|&v| v as i64).collect();
    // SAFETY: `widened: Vec<i64>` is fully initialized and borrowed for the
    // duration of this call. `i64` has no padding/niches, so reading its
    // backing store as `&[u8]` of length `widened.len() * 8` (==
    // `widened.len() * size_of::<i64>()`) is sound and exactly covers the
    // allocation; the byte slice does not outlive `widened`.
    let bytes: &[u8] =
        unsafe { std::slice::from_raw_parts(widened.as_ptr().cast::<u8>(), widened.len() * 8) };
    backend.cpu_to_gpu(bytes, DType::I64, ordinal)
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Returns `true` when gradient tracking is globally enabled and at least one
/// of the two operands requires grad.
#[inline]
fn needs_grad<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> bool {
    if !is_grad_enabled() {
        return false;
    }
    a.requires_grad() || b.requires_grad()
}

/// Convert per-axis `coords` into the flat offset of a C-contiguous
/// (row-major, last axis fastest) buffer with extents `shape`.
///
/// Only the first `shape.len()` coordinates are consulted.
///
/// # Panics
/// Panics if `coords` has fewer entries than `shape`.
#[inline]
fn flat_index(coords: &[usize], shape: &[usize]) -> usize {
    let coords = &coords[..shape.len()];
    // Walk from the innermost axis outwards, carrying the running stride.
    let (offset, _stride) = coords.iter().zip(shape).rev().fold(
        (0usize, 1usize),
        |(offset, stride), (&coord, &extent)| (offset + coord * stride, stride * extent),
    );
    offset
}

/// Advance `coords` to the next position in C-order (last axis fastest)
/// within a box of extents `shape`.
///
/// Returns `true` if a valid next coordinate was produced, and `false` when
/// every axis overflowed — in which case `coords` has been reset to all zeros.
///
/// # Panics
/// Panics if `coords` has fewer entries than `shape`.
#[inline]
fn increment_coords(coords: &mut [usize], shape: &[usize]) -> bool {
    let rank = shape.len();
    for (coord, &extent) in coords[..rank].iter_mut().zip(shape.iter()).rev() {
        *coord += 1;
        if *coord < extent {
            return true;
        }
        // This axis overflowed: reset it and carry into the next-outer axis.
        *coord = 0;
    }
    false
}

/// Validate the `index` argument of a gather/scatter-family op against the
/// tensor it indexes into.
///
/// This mirrors PyTorch's gather/scatter shape requirements:
///   - `index.ndim() == input.ndim()`
///   - for every `d != dim`: `index.shape()[d] <= input.shape()[d]`
///   - every index value is `< axis_size` (the extent of `input` along `dim`)
///
/// The per-axis extent check matters for correctness, not just parity: the
/// CPU loops feed index coordinates straight into `flat_index` over the
/// input shape, so an oversized off-`dim` extent would silently alias a
/// different element (or run past the buffer). It is skipped when the index
/// has zero elements, since nothing is read or written in that case.
///
/// The `src`-side requirement of scatter (`index.shape()[d] <= src.shape()[d]`)
/// is NOT checked here; callers check `src` themselves.
///
/// # Errors
/// - `InvalidArgument` on a rank mismatch.
/// - `ShapeMismatch` when an off-`dim` index extent exceeds the input's.
/// - `IndexOutOfBounds` for the first index value `>= axis_size`.
fn validate_gather_shapes(
    input_shape: &[usize],
    dim: usize,
    index_shape: &[usize],
    index_data: &[usize],
    axis_size: usize,
) -> FerrotorchResult<()> {
    if input_shape.len() != index_shape.len() {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "gather/scatter: input ndim ({}) must equal index ndim ({})",
                input_shape.len(),
                index_shape.len()
            ),
        });
    }

    // Off-`dim` extents of a non-empty index must fit inside the input.
    if !index_shape.contains(&0) {
        let oversized = index_shape
            .iter()
            .zip(input_shape)
            .enumerate()
            .find(|&(d, (&idx_extent, &in_extent))| d != dim && idx_extent > in_extent);
        if let Some((d, _)) = oversized {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "gather/scatter: index shape {:?} exceeds input shape {:?} at dimension {} \
                     (only dimension {} may be larger)",
                    index_shape, input_shape, d, dim
                ),
            });
        }
    }

    // Validate index values are in-bounds along `dim`.
    if let Some(&v) = index_data.iter().find(|&&v| v >= axis_size) {
        return Err(FerrotorchError::IndexOutOfBounds {
            index: v,
            axis: dim,
            size: axis_size,
        });
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// gather
// ---------------------------------------------------------------------------

/// Gather values from `input` along `dim` using `index`.
///
/// PyTorch semantics:
/// ```text
/// output[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
/// output[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
/// output[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
/// ```
///
/// The output has the same shape as `index`.
///
/// `index` is passed as a flat `&[usize]` slice with shape `index_shape`.
/// If `input.requires_grad()` and grad is enabled, attaches a
/// `GatherBackward` grad_fn.
///
/// # Errors
/// - `ShapeMismatch` if `index` holds fewer than `prod(index_shape)` values,
///   or (CUDA path) if the index extents differ from the input's on any
///   dimension other than `dim`.
/// - `InvalidArgument` on a rank mismatch, or for a 0-D CUDA input.
/// - `IndexOutOfBounds` if an index value is `>=` the input extent along `dim`.
/// - `NotImplementedOnCuda` for CUDA inputs whose dtype is not f32/f64.
pub fn gather<T: Float>(
    input: &Tensor<T>,
    dim: isize,
    index: &[usize],
    index_shape: &[usize],
) -> FerrotorchResult<Tensor<T>> {
    // Both paths consume `prod(index_shape)` index values (a 0-D index counts
    // as one). Reject a short slice up front instead of panicking on the host
    // or reading past the uploaded buffer on the device.
    let out_numel: usize = index_shape.iter().product();
    if index.len() < out_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "gather: index has {} elements but index_shape {:?} requires {}",
                index.len(),
                index_shape,
                out_numel
            ),
        });
    }

    // CUDA-resident fast path: `input` on a CUDA device, f32/f64 dtype. The
    // host `&[usize]` index is uploaded as a GPU-resident `i64` buffer; the
    // dim-aware PTX kernel runs entirely on-device and the result stays
    // resident (no host round trip). bf16/f16 fall through to
    // `NotImplementedOnCuda` (no dim-aware kernel for those dtypes yet).
    if input.is_cuda() {
        match T::dtype() {
            DType::F32 | DType::F64 => {
                let ndim = input.ndim();
                if ndim == 0 {
                    return Err(FerrotorchError::InvalidArgument {
                        message: "gather: 0-D CUDA input not supported".into(),
                    });
                }
                let dim = normalize_axis(dim, ndim)?;
                // The dim-aware PTX kernel assumes a C-contiguous
                // `[outer, axis, inner]` physical buffer. A transposed/permuted
                // CUDA view has logical shape != physical layout, so materialise
                // to contiguous ON-DEVICE first (strided_copy kernel — no host
                // round trip) so the buffer matches the `factor` decomposition.
                let input = input.contiguous()?;
                let input_shape = input.shape().to_vec();
                let (outer, in_dim, inner) = factor(&input_shape, dim);

                // A 0-D index is treated as shape `[1]` (one value along the
                // only axis of a 1-D input); the output keeps the caller's
                // `index_shape`.
                let effective_index_shape: Vec<usize> = if index_shape.is_empty() {
                    vec![1]
                } else {
                    index_shape.to_vec()
                };
                // Host-side validation before anything reaches the device: the
                // kernel receives raw indices, so rank and bounds errors must
                // be caught here.
                validate_gather_shapes(&input_shape, dim, &effective_index_shape, index, in_dim)?;
                let out_dim = effective_index_shape[dim];
                // The kernel is handed a single `(outer, inner)` pair taken
                // from `input`, so the index must share the input's extents on
                // every axis but `dim`. A zero-length index along `dim` does no
                // work, so its other extents are irrelevant.
                if out_dim != 0 {
                    if let Some(d) = (0..ndim)
                        .find(|&d| d != dim && effective_index_shape[d] != input_shape[d])
                    {
                        return Err(FerrotorchError::ShapeMismatch {
                            message: format!(
                                "gather: CUDA path requires index shape {:?} to match input \
                                 shape {:?} on every dimension except dim {} (mismatch at \
                                 dimension {})",
                                index_shape, input_shape, dim, d
                            ),
                        });
                    }
                }

                let input_handle = input.gpu_handle()?;
                let ordinal = input_handle.device_ordinal();
                let idx_handle = upload_index_i64(index, ordinal)?;
                let backend =
                    crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
                let h = if T::dtype() == DType::F32 {
                    backend.gather_dim_f32(
                        input_handle,
                        &idx_handle,
                        outer,
                        in_dim,
                        out_dim,
                        inner,
                    )?
                } else {
                    backend.gather_dim_f64(
                        input_handle,
                        &idx_handle,
                        outer,
                        in_dim,
                        out_dim,
                        inner,
                    )?
                };
                let output_shape = index_shape.to_vec();
                let storage = TensorStorage::gpu(h);
                if input.requires_grad() && is_grad_enabled() {
                    let grad_fn = Arc::new(crate::grad_fns::indexing::GatherBackward {
                        input: input.clone(),
                        dim,
                        index: index.to_vec(),
                        index_shape: index_shape.to_vec(),
                    });
                    return Tensor::from_operation(storage, output_shape, grad_fn);
                }
                return Tensor::from_storage(storage, output_shape, false);
            }
            _ => return Err(FerrotorchError::NotImplementedOnCuda { op: "gather" }),
        }
    }

    // PyTorch treats 0-D tensors as if they had `ensure_nonempty_dim(self.dim()) == 1`
    // for gather/scatter shape checks (see `ScatterGatherChecks.h:44`
    // `ensure_nonempty_dim`). Mirror that here: a 0-D input acts like a
    // 1-element tensor of shape `[1]` along the only valid axis (dim 0). When
    // index is also 0-D (rank-0 scalar index), promote it to shape `[1]` so
    // the ndim-equality validation succeeds; the output shape preserves the
    // caller's original `index_shape` (still `[]`) so 0-D in → 0-D out.
    let ndim = input.ndim();
    let effective_input_shape: Vec<usize> = if ndim == 0 {
        vec![1]
    } else {
        input.shape().to_vec()
    };
    let effective_ndim = effective_input_shape.len();
    let effective_index_shape: Vec<usize> = if ndim == 0 && index_shape.is_empty() {
        vec![1]
    } else {
        index_shape.to_vec()
    };
    let dim = normalize_axis(dim, effective_ndim)?;

    validate_gather_shapes(
        &effective_input_shape,
        dim,
        &effective_index_shape,
        index,
        effective_input_shape[dim],
    )?;
    let input_shape: &[usize] = &effective_input_shape;

    let input_data = input.data_vec()?;
    let mut output = vec![<T as num_traits::Zero>::zero(); out_numel];

    // `coords` walks the index/output positions in C-order. For each one the
    // source coordinate is identical except along `dim`, where the index
    // value is substituted; swap it in and back out rather than cloning the
    // coordinate vector per element.
    let mut coords = vec![0usize; effective_ndim];
    for (out_flat, slot) in output.iter_mut().enumerate() {
        let along_dim = coords[dim];
        coords[dim] = index[out_flat];
        let src_flat = flat_index(&coords, input_shape);
        coords[dim] = along_dim;
        *slot = input_data[src_flat];

        if out_flat + 1 < out_numel {
            increment_coords(&mut coords, &effective_index_shape);
        }
    }

    let output_shape = index_shape.to_vec();

    if input.requires_grad() && is_grad_enabled() {
        let grad_fn = Arc::new(crate::grad_fns::indexing::GatherBackward {
            input: input.clone(),
            dim,
            index: index.to_vec(),
            index_shape: index_shape.to_vec(),
        });
        Tensor::from_operation(TensorStorage::cpu(output), output_shape, grad_fn)
    } else {
        Tensor::from_storage(TensorStorage::cpu(output), output_shape, false)
    }
}

// ---------------------------------------------------------------------------
// scatter
// ---------------------------------------------------------------------------

/// Scatter `src` values into a clone of `input` along `dim` using `index`.
///
/// PyTorch semantics:
/// ```text
/// output = input.clone()
/// output[index[i][j][k]][j][k] = src[i][j][k]  # if dim == 0
/// ```
///
/// The output has the same shape as `input`.
///
/// `index` is a flat slice with shape `index_shape`; `src` is read in the
/// same flat order, one value per index entry.
/// If either `input` or `src` requires grad, attaches a `ScatterBackward`.
///
/// # Errors
/// - `InvalidArgument` for a 0-D `input` or an index rank mismatch.
/// - `IndexOutOfBounds` if an index value is `>=` the input extent along `dim`.
/// - `ShapeMismatch` if `index` or `src` holds fewer than `prod(index_shape)`
///   values, or (CUDA path) if the index extents differ from the input's on
///   any dimension other than `dim`.
/// - `DeviceMismatch` if `input` and `src` live on different CUDA devices.
/// - `NotImplementedOnCuda` for mixed CPU/CUDA operands or CUDA dtypes other
///   than f32/f64.
pub fn scatter<T: Float>(
    input: &Tensor<T>,
    dim: isize,
    index: &[usize],
    index_shape: &[usize],
    src: &Tensor<T>,
) -> FerrotorchResult<Tensor<T>> {
    let ndim = input.ndim();
    if ndim == 0 {
        return Err(FerrotorchError::InvalidArgument {
            message: "scatter: input must have at least 1 dimension".into(),
        });
    }
    let dim = normalize_axis(dim, ndim)?;
    let input_shape = input.shape();

    // Validate BEFORE choosing a path: the CUDA kernel receives raw indices,
    // so rank, bounds and length errors must be caught on the host for both
    // the CPU and the GPU route.
    validate_gather_shapes(input_shape, dim, index_shape, index, input_shape[dim])?;

    let index_numel: usize = index_shape.iter().product();
    if index.len() < index_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "scatter: index has {} elements but index_shape {:?} requires {}",
                index.len(),
                index_shape,
                index_numel
            ),
        });
    }
    if src.numel() < index_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "scatter: src has {} elements but index has {}",
                src.numel(),
                index_numel
            ),
        });
    }

    // CUDA-resident fast path: `input` + `src` on the same CUDA device,
    // f32/f64. The host index uploads as a resident `i64` buffer; the result
    // (a clone of `input` with the scattered writes) stays GPU-resident.
    // bf16/f16 → `NotImplementedOnCuda`.
    if input.is_cuda() || src.is_cuda() {
        match T::dtype() {
            DType::F32 | DType::F64 if input.is_cuda() && src.is_cuda() => {
                if input.device() != src.device() {
                    return Err(FerrotorchError::DeviceMismatch {
                        expected: input.device(),
                        got: src.device(),
                    });
                }
                // The scatter PTX kernel reads `self`/`src` as C-contiguous
                // `[outer, axis, inner]`. Materialise both to contiguous
                // ON-DEVICE (strided_copy — no host round trip) so a
                // transposed/permuted view's physical buffer matches the
                // logical shape `factor` decomposes.
                let input = input.contiguous()?;
                let src = src.contiguous()?;
                let input_shape: &[usize] = input.shape();
                let (outer, out_dim, inner) = factor(input_shape, dim);
                let idx_dim = index_shape[dim];
                // The kernel is handed a single `(outer, inner)` pair taken
                // from `input`, so the index must share the input's extents on
                // every axis but `dim`. A zero-length index along `dim` does no
                // work, so its other extents are irrelevant.
                if idx_dim != 0 {
                    if let Some(d) =
                        (0..ndim).find(|&d| d != dim && index_shape[d] != input_shape[d])
                    {
                        return Err(FerrotorchError::ShapeMismatch {
                            message: format!(
                                "scatter: CUDA path requires index shape {:?} to match input \
                                 shape {:?} on every dimension except dim {} (mismatch at \
                                 dimension {})",
                                index_shape, input_shape, dim, d
                            ),
                        });
                    }
                }
                let input_handle = input.gpu_handle()?;
                let ordinal = input_handle.device_ordinal();
                let idx_handle = upload_index_i64(index, ordinal)?;
                let backend =
                    crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
                let src_handle = src.gpu_handle()?;
                let h = if T::dtype() == DType::F32 {
                    backend.scatter_dim_f32(
                        input_handle,
                        &idx_handle,
                        src_handle,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                } else {
                    backend.scatter_dim_f64(
                        input_handle,
                        &idx_handle,
                        src_handle,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                };
                let output_shape = input_shape.to_vec();
                let storage = TensorStorage::gpu(h);
                if needs_grad(&input, &src) {
                    let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterBackward {
                        input: input.clone(),
                        src: src.clone(),
                        dim,
                        index: index.to_vec(),
                        index_shape: index_shape.to_vec(),
                    });
                    return Tensor::from_operation(storage, output_shape, grad_fn);
                }
                return Tensor::from_storage(storage, output_shape, false);
            }
            _ => return Err(FerrotorchError::NotImplementedOnCuda { op: "scatter" }),
        }
    }

    let mut output = input.data_vec()?;
    let src_data = src.data_vec()?;

    // `coords` walks the index positions in C-order. The destination
    // coordinate is identical except along `dim`, where the index value is
    // substituted; swap it in and back out rather than cloning per element.
    let mut coords = vec![0usize; ndim];
    let pairs = index[..index_numel].iter().zip(&src_data[..index_numel]);
    for (i, (&idx_val, &src_val)) in pairs.enumerate() {
        let along_dim = coords[dim];
        coords[dim] = idx_val;
        let dst_flat = flat_index(&coords, input_shape);
        coords[dim] = along_dim;
        output[dst_flat] = src_val;

        if i + 1 < index_numel {
            increment_coords(&mut coords, index_shape);
        }
    }

    let output_shape = input_shape.to_vec();

    if needs_grad(input, src) {
        let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterBackward {
            input: input.clone(),
            src: src.clone(),
            dim,
            index: index.to_vec(),
            index_shape: index_shape.to_vec(),
        });
        Tensor::from_operation(TensorStorage::cpu(output), output_shape, grad_fn)
    } else {
        Tensor::from_storage(TensorStorage::cpu(output), output_shape, false)
    }
}

// ---------------------------------------------------------------------------
// scatter_value (scalar-src overload — closes #1258)
// ---------------------------------------------------------------------------

/// Scatter a scalar `value` into a clone of `input` along `dim` at the
/// positions named by `index`. The `scatter.value` overload of PyTorch's
/// `scatter`.
///
/// PyTorch semantics (scalar-src):
/// ```text
/// output = input.clone()
/// output[index[i][j][k]][j][k] = value  # if dim == 0
/// ```
///
/// Mirrors upstream `Tensor& scatter_(int64_t dim, const Tensor& index,
/// const Scalar& value)` at
/// `aten/src/ATen/native/TensorAdvancedIndexing.cpp:2278`. Equivalent to
/// `scatter(input, dim, index, index_shape, full_like(index, value))` but
/// avoids the temporary `src` allocation.
///
/// Autograd note: the scalar `value` is not a differentiable input, so
/// gradients route only to `input` via a `ScatterValueBackward`-shaped path
/// — for now we route through the existing `ScatterBackward` by
/// materialising a `src` of zeros (the value-arm grad of `src` is
/// discarded anyway). When `input` does not require grad, no autograd node
/// is attached.
///
/// # Errors
/// - `InvalidArgument` for a 0-D `input`, an index rank mismatch, or (CUDA
///   path) a `value` that cannot be converted to the kernel's float type.
/// - `IndexOutOfBounds` if an index value is `>=` the input extent along `dim`.
/// - `ShapeMismatch` if `index` holds fewer than `prod(index_shape)` values,
///   or (CUDA path) if the index extents differ from the input's on any
///   dimension other than `dim`.
/// - `NotImplementedOnCuda` for CUDA dtypes other than f32/f64.
pub fn scatter_value<T: Float>(
    input: &Tensor<T>,
    dim: isize,
    index: &[usize],
    index_shape: &[usize],
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    let ndim = input.ndim();
    if ndim == 0 {
        return Err(FerrotorchError::InvalidArgument {
            message: "scatter_value: input must have at least 1 dimension".into(),
        });
    }
    let dim = normalize_axis(dim, ndim)?;
    let input_shape = input.shape();

    // Validate BEFORE choosing a path: the CUDA kernel receives raw indices,
    // so rank, bounds and length errors must be caught on the host for both
    // the CPU and the GPU route.
    validate_gather_shapes(input_shape, dim, index_shape, index, input_shape[dim])?;

    let index_numel: usize = index_shape.iter().product();
    if index.len() < index_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "scatter_value: index has {} elements but index_shape {:?} requires {}",
                index.len(),
                index_shape,
                index_numel
            ),
        });
    }

    // CUDA-resident fast path: `input` on a CUDA device, f32/f64. The host
    // index uploads as a resident `i64` buffer; the broadcast scalar `value`
    // is written at every named position by the on-device kernel and the
    // result stays resident. bf16/f16 → `NotImplementedOnCuda`.
    if input.is_cuda() {
        match T::dtype() {
            DType::F32 | DType::F64 => {
                // Materialise `self` to contiguous ON-DEVICE (strided_copy — no
                // host round trip) so a transposed/permuted view's physical
                // buffer matches the logical shape the kernel `factor` assumes.
                let input = input.contiguous()?;
                let input_shape: &[usize] = input.shape();
                let (outer, out_dim, inner) = factor(input_shape, dim);
                let idx_dim = index_shape[dim];
                // The kernel is handed a single `(outer, inner)` pair taken
                // from `input`, so the index must share the input's extents on
                // every axis but `dim`. A zero-length index along `dim` does no
                // work, so its other extents are irrelevant.
                if idx_dim != 0 {
                    if let Some(d) =
                        (0..ndim).find(|&d| d != dim && index_shape[d] != input_shape[d])
                    {
                        return Err(FerrotorchError::ShapeMismatch {
                            message: format!(
                                "scatter_value: CUDA path requires index shape {:?} to match \
                                 input shape {:?} on every dimension except dim {} (mismatch at \
                                 dimension {})",
                                index_shape, input_shape, dim, d
                            ),
                        });
                    }
                }
                let input_handle = input.gpu_handle()?;
                let ordinal = input_handle.device_ordinal();
                let idx_handle = upload_index_i64(index, ordinal)?;
                let backend =
                    crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
                let h = if T::dtype() == DType::F32 {
                    backend.scatter_value_dim_f32(
                        input_handle,
                        &idx_handle,
                        value
                            .to_f32()
                            .ok_or_else(|| FerrotorchError::InvalidArgument {
                                message: "scatter_value: value not representable as f32".into(),
                            })?,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                } else {
                    backend.scatter_value_dim_f64(
                        input_handle,
                        &idx_handle,
                        value
                            .to_f64()
                            .ok_or_else(|| FerrotorchError::InvalidArgument {
                                message: "scatter_value: value not representable as f64".into(),
                            })?,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                };
                let output_shape = input_shape.to_vec();
                let storage = TensorStorage::gpu(h);
                if is_grad_enabled() && input.requires_grad() {
                    // Placeholder `src` for `ScatterBackward`; it is built on
                    // the host even though `input` is CUDA-resident.
                    // NOTE(review): confirm `ScatterBackward` never reads
                    // `src` data on the device for this value-arm.
                    let zero = <T as num_traits::Zero>::zero();
                    let zeros_src = Tensor::from_storage(
                        TensorStorage::cpu(vec![zero; index_numel]),
                        index_shape.to_vec(),
                        false,
                    )?;
                    let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterBackward {
                        input: input.clone(),
                        src: zeros_src,
                        dim,
                        index: index.to_vec(),
                        index_shape: index_shape.to_vec(),
                    });
                    return Tensor::from_operation(storage, output_shape, grad_fn);
                }
                return Tensor::from_storage(storage, output_shape, false);
            }
            _ => {
                return Err(FerrotorchError::NotImplementedOnCuda {
                    op: "scatter_value",
                });
            }
        }
    }

    let mut output = input.data_vec()?;

    // `coords` walks the index positions in C-order. The destination
    // coordinate is identical except along `dim`, where the index value is
    // substituted; swap it in and back out rather than cloning per element.
    let mut coords = vec![0usize; ndim];
    for (i, &idx_val) in index[..index_numel].iter().enumerate() {
        let along_dim = coords[dim];
        coords[dim] = idx_val;
        let dst_flat = flat_index(&coords, input_shape);
        coords[dim] = along_dim;
        output[dst_flat] = value;

        if i + 1 < index_numel {
            increment_coords(&mut coords, index_shape);
        }
    }

    let output_shape = input_shape.to_vec();

    if is_grad_enabled() && input.requires_grad() {
        // Route through ScatterBackward by passing a zeros `src` — the
        // value-arm has no `src` gradient (scalar is not differentiable),
        // and the `input` gradient is the standard scatter zero-out at the
        // written positions.
        let zero = <T as num_traits::Zero>::zero();
        let zeros_src = Tensor::from_storage(
            TensorStorage::cpu(vec![zero; index_numel]),
            index_shape.to_vec(),
            false,
        )?;
        let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterBackward {
            input: input.clone(),
            src: zeros_src,
            dim,
            index: index.to_vec(),
            index_shape: index_shape.to_vec(),
        });
        Tensor::from_operation(TensorStorage::cpu(output), output_shape, grad_fn)
    } else {
        Tensor::from_storage(TensorStorage::cpu(output), output_shape, false)
    }
}

// ---------------------------------------------------------------------------
// scatter_add
// ---------------------------------------------------------------------------

/// Scatter-add `src` values into a clone of `input` along `dim`.
///
/// Like `scatter`, but uses addition instead of assignment:
/// ```text
/// output = input.clone()
/// output[index[i][j][k]][j][k] += src[i][j][k]  # if dim == 0
/// ```
pub fn scatter_add<T: Float>(
    input: &Tensor<T>,
    dim: isize,
    index: &[usize],
    index_shape: &[usize],
    src: &Tensor<T>,
) -> FerrotorchResult<Tensor<T>> {
    let ndim = input.ndim();
    if ndim == 0 {
        return Err(FerrotorchError::InvalidArgument {
            message: "scatter_add: input must have at least 1 dimension".into(),
        });
    }
    let dim = normalize_axis(dim, ndim)?;
    let input_shape = input.shape();

    // CUDA-resident fast path: `input` + `src` on the same CUDA device,
    // f32/f64. The host index uploads as a resident `i64` buffer; the kernel
    // accumulates with an ATOMIC add so duplicate index values targeting the
    // same output slot sum correctly. The result stays GPU-resident. bf16/f16
    // → `NotImplementedOnCuda`.
    if input.is_cuda() || src.is_cuda() {
        match T::dtype() {
            DType::F32 | DType::F64 if input.is_cuda() && src.is_cuda() => {
                if input.device() != src.device() {
                    return Err(FerrotorchError::DeviceMismatch {
                        expected: input.device(),
                        got: src.device(),
                    });
                }
                // The scatter_add PTX kernel reads `self`/`src` as C-contiguous
                // `[outer, axis, inner]`. Materialise both to contiguous
                // ON-DEVICE (strided_copy — no host round trip) so a
                // transposed/permuted view's physical buffer matches the
                // logical shape `factor` decomposes. (A non-zero `self` exposes
                // this: an all-zeros buffer reads identically either layout.)
                let input = input.contiguous()?;
                let src = src.contiguous()?;
                let input_shape: &[usize] = input.shape();
                let (outer, out_dim, inner) = factor(input_shape, dim);
                let idx_dim = index_shape[dim];
                let input_handle = input.gpu_handle()?;
                let ordinal = input_handle.device_ordinal();
                let idx_handle = upload_index_i64(index, ordinal)?;
                let backend =
                    crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
                let src_handle = src.gpu_handle()?;
                let h = if T::dtype() == DType::F32 {
                    backend.scatter_add_dim_f32(
                        input_handle,
                        &idx_handle,
                        src_handle,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                } else {
                    backend.scatter_add_dim_f64(
                        input_handle,
                        &idx_handle,
                        src_handle,
                        outer,
                        out_dim,
                        idx_dim,
                        inner,
                    )?
                };
                let output_shape = input_shape.to_vec();
                let storage = TensorStorage::gpu(h);
                if needs_grad(&input, &src) {
                    let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterAddBackward {
                        input: input.clone(),
                        src: src.clone(),
                        dim,
                        index: index.to_vec(),
                        index_shape: index_shape.to_vec(),
                    });
                    return Tensor::from_operation(storage, output_shape, grad_fn);
                }
                return Tensor::from_storage(storage, output_shape, false);
            }
            _ => return Err(FerrotorchError::NotImplementedOnCuda { op: "scatter_add" }),
        }
    }

    validate_gather_shapes(input_shape, dim, index_shape, index, input_shape[dim])?;

    let index_numel: usize = index_shape.iter().product();
    if src.numel() < index_numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "scatter_add: src has {} elements but index has {}",
                src.numel(),
                index_numel
            ),
        });
    }

    let mut output = input.data_vec()?;
    let src_data = src.data_vec()?;

    let mut coords = vec![0usize; ndim];
    for i in 0..index_numel {
        let idx_val = index[i];
        let mut dst_coords = coords.clone();
        dst_coords[dim] = idx_val;
        let dst_flat = flat_index(&dst_coords, input_shape);
        output[dst_flat] += src_data[i];

        if i + 1 < index_numel {
            increment_coords(&mut coords, index_shape);
        }
    }

    let output_shape = input_shape.to_vec();

    if needs_grad(input, src) {
        let grad_fn = Arc::new(crate::grad_fns::indexing::ScatterAddBackward {
            input: input.clone(),
            src: src.clone(),
            dim,
            index: index.to_vec(),
            index_shape: index_shape.to_vec(),
        });
        Tensor::from_operation(TensorStorage::cpu(output), output_shape, grad_fn)
    } else {
        Tensor::from_storage(TensorStorage::cpu(output), output_shape, false)
    }
}

// ---------------------------------------------------------------------------
// where_cond
// ---------------------------------------------------------------------------

/// Element-wise ternary select: `output[i] = condition[i] ? x[i] : y[i]`.
///
/// `x` and `y` must have identical shapes (no broadcasting yet) and
/// `condition` is a flat host `&[bool]` with one entry per element. This
/// entry point is CPU-only: a CUDA `x` or `y` yields `NotImplementedOnCuda`.
///
/// If either `x` or `y` requires grad (and grad is enabled), the result
/// carries a `WhereCondBackward`.
///
/// # Errors
/// - `ShapeMismatch` if `x` and `y` differ in shape, or `condition.len()`
///   differs from the element count.
/// - `NotImplementedOnCuda` if either tensor is on a CUDA device.
pub fn where_cond<T: Float>(
    condition: &[bool],
    x: &Tensor<T>,
    y: &Tensor<T>,
) -> FerrotorchResult<Tensor<T>> {
    let shapes_agree = x.shape() == y.shape();
    if !shapes_agree {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "where_cond: x shape {:?} != y shape {:?}",
                x.shape(),
                y.shape()
            ),
        });
    }
    if x.is_cuda() || y.is_cuda() {
        return Err(FerrotorchError::NotImplementedOnCuda { op: "where_cond" });
    }

    let numel = x.numel();
    if condition.len() != numel {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "where_cond: condition length {} != tensor numel {}",
                condition.len(),
                numel
            ),
        });
    }

    let x_data = x.data_vec()?;
    let y_data = y.data_vec()?;

    // Pick from `x` where the mask is set, from `y` otherwise.
    let selected: Vec<T> = x_data
        .iter()
        .zip(y_data.iter())
        .zip(condition.iter())
        .map(|((&from_x, &from_y), &take_x)| if take_x { from_x } else { from_y })
        .collect();

    let output_shape = x.shape().to_vec();

    if !needs_grad(x, y) {
        return Tensor::from_storage(TensorStorage::cpu(selected), output_shape, false);
    }

    // This entry point inherently has a host `&[bool]`; wrap it as a CPU
    // BoolTensor so the backward node can store it.
    let grad_fn = Arc::new(crate::grad_fns::indexing::WhereCondBackward {
        x: x.clone(),
        y: y.clone(),
        condition: crate::bool_tensor::BoolTensor::from_slice(condition, &output_shape)?,
    });
    Tensor::from_operation(TensorStorage::cpu(selected), output_shape, grad_fn)
}

/// [`BoolTensor`]-conditioned ternary select, the counterpart of
/// `torch.where(cond, x, y)`: `output[i] = if cond[i] { x[i] } else { y[i] }`.
///
/// `cond`, `x`, and `y` must all share one shape. Two execution paths exist:
///
/// - **Resident path** — taken when `cond`, `x`, and `y` are all CUDA tensors.
///   They must sit on the same device; the select is dispatched through the
///   GPU backend's `where_cond` and the result stays GPU-resident with no host
///   crossing (crosslink #1185 Phase 3c).
/// - **Host path** — taken otherwise. The host `&[bool]` is obtained from
///   `cond.data()` and the call is delegated to the CPU [`where_cond`], which
///   is autograd-aware.
///
/// # Errors
///
/// - `ShapeMismatch` when `x`/`y` shapes differ or `cond`'s shape differs
///   from `x`'s.
/// - `DeviceMismatch` when all three are CUDA tensors but on different devices.
/// - `DeviceUnavailable` when no GPU backend is registered on the resident path.
/// - Anything propagated from `cond.data()`, [`where_cond`], `.contiguous()`,
///   the backend kernel, or tensor construction.
pub fn where_cond_bt<T: Float>(
    cond: &crate::bool_tensor::BoolTensor,
    x: &Tensor<T>,
    y: &Tensor<T>,
) -> FerrotorchResult<Tensor<T>> {
    if x.shape() != y.shape() {
        let message = format!(
            "where_cond_bt: x shape {:?} != y shape {:?}",
            x.shape(),
            y.shape()
        );
        return Err(FerrotorchError::ShapeMismatch { message });
    }
    if cond.shape() != x.shape() {
        let message = format!(
            "where_cond_bt: cond shape {:?} != x shape {:?}",
            cond.shape(),
            x.shape()
        );
        return Err(FerrotorchError::ShapeMismatch { message });
    }

    // Host (or mixed-residency) path: hand the host condition to the CPU
    // `where_cond`. Per the original note, `cond.data()?` fails when the cond
    // is on GPU while x/y are not, which serves as the device-mismatch signal.
    let all_resident = x.is_cuda() && y.is_cuda() && cond.is_cuda();
    if !all_resident {
        return where_cond(cond.data()?, x, y);
    }

    // Resident path from here on: every operand must be on `x`'s device.
    let y_device = y.device();
    if x.device() != y_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: x.device(),
            got: y_device,
        });
    }
    let cond_device = cond.device();
    if x.device() != cond_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: x.device(),
            got: cond_device,
        });
    }

    let backend =
        crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;

    // #1660 (#1658 class): a row-narrowed view's BASE buffer is longer than
    // `numel`, and the select kernel used to reject it ("where_cond: numel
    // mismatch (cond 6, x 8, y 6)"). Packing both operands to offset-0 buffers
    // with `.contiguous()` (on-device strided_copy; cheap clone when already
    // offset-0) avoids that. The packed operands are also what the autograd
    // capture stores, so the backward sees the same layout.
    let x_packed = x.contiguous()?;
    let y_packed = y.contiguous()?;
    let selected = backend.where_cond(
        cond.gpu_handle()?,
        x_packed.gpu_handle()?,
        y_packed.gpu_handle()?,
    )?;

    let shape = x_packed.shape().to_vec();
    let storage = TensorStorage::gpu(selected);

    if !needs_grad(&x_packed, &y_packed) {
        return Tensor::from_storage(storage, shape, false);
    }

    // Keep the resident cond as-is (no `cond.to(Cpu)`): the backward goes
    // through the resident `where_cond` VJP without a host crossing
    // (crosslink #1187 Phase 3d).
    let grad_fn = Arc::new(crate::grad_fns::indexing::WhereCondBackward {
        x: x_packed.clone(),
        y: y_packed.clone(),
        condition: cond.clone(),
    });
    Tensor::from_operation(storage, shape, grad_fn)
}

/// `masked_select(input, mask)` — collect the elements of `input` whose
/// `mask` entry is true into a 1-D tensor, in flat C-order. Mirrors
/// `torch.masked_select`. `mask` must have the same numel as `input`.
///
/// When both `input` and `mask` are CUDA tensors on the same device, the
/// backend performs a GPU stream compaction (crosslink #1185 Phase 3c): an
/// on-device count of the true mask bytes sizes the output and a compaction
/// kernel writes the kept elements, so the result stays GPU-resident. Only the
/// integer COUNT reaches the host, because it is the data-dependent output
/// SHAPE rather than a data round-trip (PyTorch parity: a CUDA sync sizes
/// `masked_select`'s output).
///
/// `masked_select` IS differentiable (PyTorch parity). If `input` requires
/// grad and grad is enabled, the result carries a `MaskedSelectBackward`
/// grad_fn that scatters the compacted gradient back into a zeros tensor of
/// `input.numel()` at the selected positions. On the GPU path the backward
/// stays resident via the `masked_scatter` kernel (crosslink #1187 Phase 3d).
///
/// # Errors
///
/// - `ShapeMismatch` when `mask.numel() != input.numel()`.
/// - `DeviceMismatch` when both operands are CUDA tensors on different devices.
/// - `DeviceUnavailable` when no GPU backend is registered on the CUDA path.
/// - Anything propagated from reading the operands, `.contiguous()`, the
///   backend kernel, or tensor construction.
pub fn masked_select<T: Float>(
    input: &Tensor<T>,
    mask: &crate::bool_tensor::BoolTensor,
) -> FerrotorchResult<Tensor<T>> {
    let (mask_numel, input_numel) = (mask.numel(), input.numel());
    if mask_numel != input_numel {
        let message = format!(
            "masked_select: mask numel {} != input numel {}",
            mask_numel, input_numel
        );
        return Err(FerrotorchError::ShapeMismatch { message });
    }

    let both_resident = input.is_cuda() && mask.is_cuda();
    if !both_resident {
        // Host (or mixed-residency) path: walk the host values and mask
        // together. Mixed residency is expected to surface as an error from
        // `mask.data()` / `input.data_vec()`.
        // NOTE(review): confirm `data_vec` rejects a CUDA operand rather than
        // copying it to the host.
        let host_values = input.data_vec()?;
        let host_mask = mask.data()?;
        let kept: Vec<T> = host_values
            .iter()
            .zip(host_mask.iter())
            .filter(|&(_, &keep)| keep)
            .map(|(&value, _)| value)
            .collect();
        let kept_len = kept.len();
        let storage = TensorStorage::cpu(kept);

        if !(input.requires_grad() && is_grad_enabled()) {
            return Tensor::from_storage(storage, vec![kept_len], false);
        }
        let grad_fn = Arc::new(crate::grad_fns::indexing::MaskedSelectBackward {
            input: input.clone(),
            mask: mask.clone(),
        });
        return Tensor::from_operation(storage, vec![kept_len], grad_fn);
    }

    // CUDA path: both operands are resident and must share a device.
    let mask_device = mask.device();
    if input.device() != mask_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: input.device(),
            got: mask_device,
        });
    }
    let backend =
        crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;

    // #1660 (#1658 class): a row-narrowed view's BASE buffer is longer than
    // `numel`, which the compaction kernel rejected ("input numel 8 != mask
    // numel 6"). `.contiguous()` packs the logical view into an offset-0
    // buffer on-device (strided_copy; cheap clone when already offset-0). The
    // packed tensor is also the one captured for backward, so the scatter
    // agrees with what the kernel read.
    let packed = input.contiguous()?;
    let (handle, kept_len) = backend.masked_select(packed.gpu_handle()?, mask.gpu_handle()?)?;
    let storage = TensorStorage::gpu(handle);

    if !(packed.requires_grad() && is_grad_enabled()) {
        return Tensor::from_storage(storage, vec![kept_len], false);
    }

    // PyTorch parity: attach the backward that scatters the compacted grad
    // into zeros at the true mask positions. The resident mask is stored
    // directly so the backward stays on the GPU (crosslink #1187 Phase 3d).
    let grad_fn = Arc::new(crate::grad_fns::indexing::MaskedSelectBackward {
        input: packed.clone(),
        mask: mask.clone(),
    });
    Tensor::from_operation(storage, vec![kept_len], grad_fn)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    // CPU unit tests for the host-index entry points in this file: forward
    // results for `gather` / `scatter` / `scatter_add` / `where_cond`, the
    // per-op backward functions (called directly through `grad_fn()`), the
    // `no_grad` behaviour, and one end-to-end pass through `backward`.
    use super::*;
    use crate::autograd::graph::backward;
    use crate::autograd::no_grad;
    use crate::storage::TensorStorage;
    use crate::tensor::GradFn;

    /// Create a leaf tensor from a flat slice and shape.
    ///
    /// The data is copied into CPU storage; `requires_grad` is forwarded to
    /// `Tensor::from_storage`. Panics (via `unwrap`) if construction fails.
    fn leaf(data: &[f32], shape: &[usize], requires_grad: bool) -> Tensor<f32> {
        Tensor::from_storage(
            TensorStorage::cpu(data.to_vec()),
            shape.to_vec(),
            requires_grad,
        )
        .unwrap()
    }

    // -----------------------------------------------------------------------
    // gather forward
    // -----------------------------------------------------------------------

    /// 1-D gather returns `input[index[i]]` for each index entry.
    #[test]
    fn test_gather_1d() {
        // input = [10, 20, 30, 40], gather along dim 0 with index [3, 0, 2]
        let input = leaf(&[10.0, 20.0, 30.0, 40.0], &[4], false);
        let index = &[3, 0, 2];
        let result = gather(&input, 0, index, &[3]).unwrap();
        assert_eq!(result.shape(), &[3]);
        assert_eq!(result.data().unwrap(), &[40.0, 10.0, 30.0]);
    }

    /// 2-D gather along dim 0: the index picks the row, the column is kept.
    #[test]
    fn test_gather_2d_dim0() {
        // input = [[1, 2], [3, 4], [5, 6]]  shape [3, 2]
        // index = [[2, 0], [1, 1]]           shape [2, 2]
        // output[i][j] = input[index[i][j]][j]
        // output = [[5, 2], [3, 4]]
        let input = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[3, 2], false);
        let index = &[2, 0, 1, 1];
        let result = gather(&input, 0, index, &[2, 2]).unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        assert_eq!(result.data().unwrap(), &[5.0, 2.0, 3.0, 4.0]);
    }

    /// 2-D gather along dim 1: the index picks the column, the row is kept.
    #[test]
    fn test_gather_2d_dim1() {
        // input = [[1, 2, 3], [4, 5, 6]]  shape [2, 3]
        // index = [[0, 2], [1, 0]]        shape [2, 2]
        // output[i][j] = input[i][index[i][j]]
        // output = [[1, 3], [5, 4]]
        let input = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], false);
        let index = &[0, 2, 1, 0];
        let result = gather(&input, 1, index, &[2, 2]).unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        assert_eq!(result.data().unwrap(), &[1.0, 3.0, 5.0, 4.0]);
    }

    /// An index value past the end of the gathered axis yields an error.
    #[test]
    fn test_gather_out_of_bounds() {
        // Index 5 into an axis of length 3.
        let input = leaf(&[1.0, 2.0, 3.0], &[3], false);
        let result = gather(&input, 0, &[5], &[1]);
        assert!(result.is_err());
    }

    /// An index whose rank differs from the input's rank yields an error.
    #[test]
    fn test_gather_ndim_mismatch() {
        // input is 2D, index is 1D
        let input = leaf(&[1.0, 2.0, 3.0, 4.0], &[2, 2], false);
        let result = gather(&input, 0, &[0, 1], &[2]);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // scatter forward
    // -----------------------------------------------------------------------

    /// 1-D scatter writes `src[i]` to `output[index[i]]`; untouched slots keep
    /// the input value.
    #[test]
    fn test_scatter_1d() {
        // input = [0, 0, 0, 0, 0], scatter src=[10, 20, 30] at index=[1, 3, 0]
        let input = leaf(&[0.0; 5], &[5], false);
        let src = leaf(&[10.0, 20.0, 30.0], &[3], false);
        let result = scatter(&input, 0, &[1, 3, 0], &[3], &src).unwrap();
        assert_eq!(result.data().unwrap(), &[30.0, 10.0, 0.0, 20.0, 0.0]);
    }

    /// 2-D scatter along dim 0: the index picks the destination row.
    #[test]
    fn test_scatter_2d_dim0() {
        // input = [[0,0],[0,0],[0,0]]  shape [3, 2]
        // src   = [[1,2]]              shape [1, 2]
        // index = [[2,0]]              shape [1, 2]
        // output[index[i][j]][j] = src[i][j]
        // output = [[0,2],[0,0],[1,0]]
        let input = leaf(&[0.0; 6], &[3, 2], false);
        let src = leaf(&[1.0, 2.0], &[1, 2], false);
        let result = scatter(&input, 0, &[2, 0], &[1, 2], &src).unwrap();
        assert_eq!(result.shape(), &[3, 2]);
        assert_eq!(result.data().unwrap(), &[0.0, 2.0, 0.0, 0.0, 1.0, 0.0]);
    }

    /// 2-D scatter along dim 1: the index picks the destination column.
    #[test]
    fn test_scatter_2d_dim1() {
        // input = [[0,0,0],[0,0,0]]  shape [2, 3]
        // src   = [[5],[6]]          shape [2, 1]
        // index = [[2],[0]]          shape [2, 1]
        // output[i][index[i][j]] = src[i][j]
        // output = [[0,0,5],[6,0,0]]
        let input = leaf(&[0.0; 6], &[2, 3], false);
        let src = leaf(&[5.0, 6.0], &[2, 1], false);
        let result = scatter(&input, 1, &[2, 0], &[2, 1], &src).unwrap();
        assert_eq!(result.data().unwrap(), &[0.0, 0.0, 5.0, 6.0, 0.0, 0.0]);
    }

    // -----------------------------------------------------------------------
    // scatter_add forward
    // -----------------------------------------------------------------------

    /// 1-D scatter_add accumulates into the input, including repeated indices.
    #[test]
    fn test_scatter_add_1d() {
        // input = [1, 2, 3], scatter_add src=[10, 20, 30] at index=[0, 2, 0]
        // output = [1+10+30, 2, 3+20] = [41, 2, 23]
        let input = leaf(&[1.0, 2.0, 3.0], &[3], false);
        let src = leaf(&[10.0, 20.0, 30.0], &[3], false);
        let result = scatter_add(&input, 0, &[0, 2, 0], &[3], &src).unwrap();
        assert_eq!(result.data().unwrap(), &[41.0, 2.0, 23.0]);
    }

    /// 2-D scatter_add along dim 0 with an index that has more rows than the
    /// input, so several source rows accumulate into the same destination.
    #[test]
    fn test_scatter_add_2d_dim0() {
        // input = [[0,0],[0,0]]  shape [2, 2]
        // src   = [[1,2],[3,4],[5,6]]  shape [3, 2]
        // index = [[0,1],[1,0],[0,0]]  shape [3, 2]
        //
        // output[index[i][j]][j] += src[i][j]
        // (0,0) += 1, (1,1) += 2
        // (1,0) += 3, (0,1) += 4
        // (0,0) += 5, (0,1) += 6
        // output = [[0+1+5, 0+4+6], [0+3, 0+2]] = [[6, 10], [3, 2]]
        let input = leaf(&[0.0; 4], &[2, 2], false);
        let src = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[3, 2], false);
        let result = scatter_add(&input, 0, &[0, 1, 1, 0, 0, 0], &[3, 2], &src).unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        assert_eq!(result.data().unwrap(), &[6.0, 10.0, 3.0, 2.0]);
    }

    // -----------------------------------------------------------------------
    // where_cond forward
    // -----------------------------------------------------------------------

    /// A mixed mask takes `x` where true and `y` where false.
    #[test]
    fn test_where_cond_basic() {
        let x = leaf(&[1.0, 2.0, 3.0, 4.0], &[4], false);
        let y = leaf(&[10.0, 20.0, 30.0, 40.0], &[4], false);
        let cond = [true, false, true, false];
        let result = where_cond(&cond, &x, &y).unwrap();
        assert_eq!(result.data().unwrap(), &[1.0, 20.0, 3.0, 40.0]);
    }

    /// An all-true mask reproduces `x`.
    #[test]
    fn test_where_cond_all_true() {
        let x = leaf(&[1.0, 2.0], &[2], false);
        let y = leaf(&[10.0, 20.0], &[2], false);
        let result = where_cond(&[true, true], &x, &y).unwrap();
        assert_eq!(result.data().unwrap(), &[1.0, 2.0]);
    }

    /// An all-false mask reproduces `y`.
    #[test]
    fn test_where_cond_all_false() {
        let x = leaf(&[1.0, 2.0], &[2], false);
        let y = leaf(&[10.0, 20.0], &[2], false);
        let result = where_cond(&[false, false], &x, &y).unwrap();
        assert_eq!(result.data().unwrap(), &[10.0, 20.0]);
    }

    /// `x` and `y` with different shapes are rejected.
    #[test]
    fn test_where_cond_shape_mismatch() {
        let x = leaf(&[1.0, 2.0], &[2], false);
        let y = leaf(&[1.0, 2.0, 3.0], &[3], false);
        let result = where_cond(&[true, false], &x, &y);
        assert!(result.is_err());
    }

    /// A mask whose length differs from the tensor numel is rejected.
    #[test]
    fn test_where_cond_cond_length_mismatch() {
        let x = leaf(&[1.0, 2.0], &[2], false);
        let y = leaf(&[10.0, 20.0], &[2], false);
        let result = where_cond(&[true], &x, &y);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // gather backward
    // -----------------------------------------------------------------------

    /// gather's input gradient accumulates upstream grads per source position,
    /// so a position gathered twice receives 2.
    #[test]
    fn test_gather_backward_1d() {
        // input = [10, 20, 30], gather at [2, 0, 0] -> output = [30, 10, 10]
        // grad_output = [1, 1, 1]
        // grad_input: scatter_add of [1,1,1] at [2,0,0] into zeros(3)
        //   = [2, 0, 1]
        let input = leaf(&[10.0, 20.0, 30.0], &[3], true);
        let result = gather(&input, 0, &[2, 0, 0], &[3]).unwrap();

        assert!(result.requires_grad());

        // Call the op's grad_fn directly instead of running the autograd engine.
        let grad_output = leaf(&[1.0, 1.0, 1.0], &[3], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();
        let gi = grads[0].as_ref().unwrap();
        let gd = gi.data().unwrap();
        assert!((gd[0] - 2.0).abs() < 1e-6, "grad[0]={}, expected 2", gd[0]);
        assert!((gd[1] - 0.0).abs() < 1e-6, "grad[1]={}, expected 0", gd[1]);
        assert!((gd[2] - 1.0).abs() < 1e-6, "grad[2]={}, expected 1", gd[2]);
    }

    /// 2-D gather backward along dim 1 yields a gradient with the input's
    /// shape, scattered row by row.
    #[test]
    fn test_gather_backward_2d() {
        // input shape [2, 3], gather dim=1, index shape [2, 2]
        // input = [[1,2,3],[4,5,6]]
        // index = [[0, 2], [1, 0]]
        // output = [[1,3],[5,4]]
        //
        // grad_output = [[1,1],[1,1]]
        // grad_input: scatter_add along dim=1
        //   row 0: idx [0,2] -> [1, 0, 1]
        //   row 1: idx [1,0] -> [1, 1, 0]
        //   grad_input = [[1,0,1],[1,1,0]]
        let input = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3], true);
        let result = gather(&input, 1, &[0, 2, 1, 0], &[2, 2]).unwrap();

        let grad_output = leaf(&[1.0, 1.0, 1.0, 1.0], &[2, 2], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();
        let gi = grads[0].as_ref().unwrap();
        let gd = gi.data().unwrap();
        assert_eq!(gi.shape(), &[2, 3]);
        // row 0: [1, 0, 1]
        assert!((gd[0] - 1.0).abs() < 1e-6);
        assert!((gd[1] - 0.0).abs() < 1e-6);
        assert!((gd[2] - 1.0).abs() < 1e-6);
        // row 1: [1, 1, 0]
        assert!((gd[3] - 1.0).abs() < 1e-6);
        assert!((gd[4] - 1.0).abs() < 1e-6);
        assert!((gd[5] - 0.0).abs() < 1e-6);
    }

    // -----------------------------------------------------------------------
    // scatter backward
    // -----------------------------------------------------------------------

    /// scatter's gradient w.r.t. `input` is the upstream gradient with the
    /// overwritten positions zeroed.
    #[test]
    fn test_scatter_backward_input() {
        // The overwritten positions no longer depend on input, so their
        // gradient is zero.
        // input = [1, 2, 3, 4, 5], scatter src at [1, 3]
        // grad wrt input: ones everywhere except positions 1 and 3
        // -> [1, 0, 1, 0, 1]
        let input = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0], &[5], true);
        let src = leaf(&[10.0, 20.0], &[2], false);
        let result = scatter(&input, 0, &[1, 3], &[2], &src).unwrap();

        let grad_output = leaf(&[1.0; 5], &[5], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();
        let gi = grads[0].as_ref().unwrap();
        let gd = gi.data().unwrap();
        assert_eq!(gd, &[1.0, 0.0, 1.0, 0.0, 1.0]);
    }

    /// scatter's gradient w.r.t. `src` is the upstream gradient gathered at
    /// the index positions; the non-grad `input` slot is `None`.
    #[test]
    fn test_scatter_backward_src() {
        // scatter grad wrt src is gather from grad_output at index positions.
        // input = [0, 0, 0], scatter src at [2, 0]
        // grad_output = [10, 20, 30]
        // grad_src = [grad_output[2], grad_output[0]] = [30, 10]
        let input = leaf(&[0.0; 3], &[3], false);
        let src = leaf(&[1.0, 2.0], &[2], true);
        let result = scatter(&input, 0, &[2, 0], &[2], &src).unwrap();

        let grad_output = leaf(&[10.0, 20.0, 30.0], &[3], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();

        // grads[0] is for input (not requiring grad -> None)
        assert!(grads[0].is_none());
        // grads[1] is for src
        let gs = grads[1].as_ref().unwrap();
        let gd = gs.data().unwrap();
        assert_eq!(gd, &[30.0, 10.0]);
    }

    // -----------------------------------------------------------------------
    // scatter_add backward
    // -----------------------------------------------------------------------

    /// scatter_add's gradient w.r.t. `input` passes the upstream gradient
    /// through unchanged.
    #[test]
    fn test_scatter_add_backward_input() {
        // scatter_add backward for input is just grad_output (identity).
        let input = leaf(&[1.0, 2.0, 3.0], &[3], true);
        let src = leaf(&[10.0, 20.0], &[2], false);
        let result = scatter_add(&input, 0, &[0, 2], &[2], &src).unwrap();

        let grad_output = leaf(&[5.0, 6.0, 7.0], &[3], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();
        let gi = grads[0].as_ref().unwrap();
        assert_eq!(gi.data().unwrap(), &[5.0, 6.0, 7.0]);
    }

    /// scatter_add's gradient w.r.t. `src` is the upstream gradient gathered
    /// at the index positions; the non-grad `input` slot is `None`.
    #[test]
    fn test_scatter_add_backward_src() {
        // scatter_add backward for src is gather from grad_output.
        // index = [2, 0], grad_output = [5, 6, 7]
        // grad_src = [grad_output[2], grad_output[0]] = [7, 5]
        let input = leaf(&[1.0, 2.0, 3.0], &[3], false);
        let src = leaf(&[10.0, 20.0], &[2], true);
        let result = scatter_add(&input, 0, &[2, 0], &[2], &src).unwrap();

        let grad_output = leaf(&[5.0, 6.0, 7.0], &[3], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();

        assert!(grads[0].is_none());
        let gs = grads[1].as_ref().unwrap();
        assert_eq!(gs.data().unwrap(), &[7.0, 5.0]);
    }

    // -----------------------------------------------------------------------
    // where_cond backward
    // -----------------------------------------------------------------------

    /// Only `x` requires grad: it receives the upstream gradient where the
    /// mask is true, and `y`'s slot is `None`.
    #[test]
    fn test_where_cond_backward_x() {
        // where_cond grad for x: grad_output where condition is true, 0 otherwise.
        let x = leaf(&[1.0, 2.0, 3.0, 4.0], &[4], true);
        let y = leaf(&[10.0, 20.0, 30.0, 40.0], &[4], false);
        let cond = [true, false, true, false];
        let result = where_cond(&cond, &x, &y).unwrap();

        let grad_output = leaf(&[1.0, 2.0, 3.0, 4.0], &[4], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();

        let gx = grads[0].as_ref().unwrap();
        assert_eq!(gx.data().unwrap(), &[1.0, 0.0, 3.0, 0.0]);
        assert!(grads[1].is_none());
    }

    /// Only `y` requires grad: it receives the upstream gradient where the
    /// mask is false, and `x`'s slot is `None`.
    #[test]
    fn test_where_cond_backward_y() {
        let x = leaf(&[1.0, 2.0, 3.0, 4.0], &[4], false);
        let y = leaf(&[10.0, 20.0, 30.0, 40.0], &[4], true);
        let cond = [true, false, true, false];
        let result = where_cond(&cond, &x, &y).unwrap();

        let grad_output = leaf(&[1.0, 2.0, 3.0, 4.0], &[4], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();

        assert!(grads[0].is_none());
        let gy = grads[1].as_ref().unwrap();
        assert_eq!(gy.data().unwrap(), &[0.0, 2.0, 0.0, 4.0]);
    }

    /// Both operands require grad: the upstream gradient is split between
    /// them according to the mask.
    #[test]
    fn test_where_cond_backward_both() {
        let x = leaf(&[1.0, 2.0, 3.0], &[3], true);
        let y = leaf(&[10.0, 20.0, 30.0], &[3], true);
        let cond = [false, true, false];
        let result = where_cond(&cond, &x, &y).unwrap();

        let grad_output = leaf(&[5.0, 6.0, 7.0], &[3], false);
        let grad_fn = result.grad_fn().unwrap();
        let grads = grad_fn.backward(&grad_output).unwrap();

        let gx = grads[0].as_ref().unwrap();
        assert_eq!(gx.data().unwrap(), &[0.0, 6.0, 0.0]);
        let gy = grads[1].as_ref().unwrap();
        assert_eq!(gy.data().unwrap(), &[5.0, 0.0, 7.0]);
    }

    // -----------------------------------------------------------------------
    // no_grad context
    // -----------------------------------------------------------------------

    /// Inside `no_grad`, gather on a grad-requiring input produces a result
    /// with no grad requirement and no grad_fn.
    #[test]
    fn test_gather_no_grad() {
        let input = leaf(&[1.0, 2.0, 3.0], &[3], true);
        let result = no_grad(|| gather(&input, 0, &[2, 0], &[2])).unwrap();
        assert!(!result.requires_grad());
        assert!(result.grad_fn().is_none());
    }

    /// Inside `no_grad`, where_cond's result does not require grad even though
    /// both operands do.
    #[test]
    fn test_where_cond_no_grad() {
        let x = leaf(&[1.0, 2.0], &[2], true);
        let y = leaf(&[3.0, 4.0], &[2], true);
        let result = no_grad(|| where_cond(&[true, false], &x, &y)).unwrap();
        assert!(!result.requires_grad());
    }

    // -----------------------------------------------------------------------
    // End-to-end backward through autograd
    // -----------------------------------------------------------------------

    /// Runs the autograd engine through `gather` by wiring a hand-written sum
    /// node on top of it, then checks the gradient stored on the leaf.
    #[test]
    fn test_gather_end_to_end_backward() {
        let input = leaf(&[10.0, 20.0, 30.0, 40.0], &[4], true);
        let gathered = gather(&input, 0, &[1, 3], &[2]).unwrap();

        // Sum to scalar via inline SumBackward.
        let data = gathered.data().unwrap();
        let total: f32 = data.iter().sum();

        /// Test-local grad_fn for a sum reduction: copies the scalar upstream
        /// gradient to every element of `input`.
        #[derive(Debug)]
        struct SumBackward<T: Float> {
            input: Tensor<T>,
        }
        impl<T: Float> GradFn<T> for SumBackward<T> {
            fn backward(
                &self,
                grad_output: &Tensor<T>,
            ) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
                // Read the first (only) upstream value and repeat it
                // `input.numel()` times in the input's shape.
                let go_val = grad_output.data()?[0];
                let grad = vec![go_val; self.input.numel()];
                let t = Tensor::from_storage(
                    TensorStorage::cpu(grad),
                    self.input.shape().to_vec(),
                    false,
                )?;
                Ok(vec![Some(t)])
            }
            fn inputs(&self) -> Vec<&Tensor<T>> {
                vec![&self.input]
            }
            fn name(&self) -> &'static str {
                "SumBackward"
            }
        }

        // Scalar (empty-shape) loss node whose only graph input is `gathered`.
        let loss = Tensor::from_operation(
            TensorStorage::cpu(vec![total]),
            vec![],
            Arc::new(SumBackward {
                input: gathered.clone(),
            }),
        )
        .unwrap();

        backward(&loss).unwrap();

        let grad = input.grad().unwrap().unwrap();
        let gd = grad.data().unwrap();
        // indices [1, 3]: grad = [0, 1, 0, 1]
        assert!((gd[0] - 0.0).abs() < 1e-6);
        assert!((gd[1] - 1.0).abs() < 1e-6);
        assert!((gd[2] - 0.0).abs() < 1e-6);
        assert!((gd[3] - 1.0).abs() < 1e-6);
    }
}