ferrotorch-core 0.6.2

//! Backward functions for elementwise arithmetic operations.
//!
//! Each operation has a backward struct implementing `GradFn<T>` and a public
//! function that performs the forward pass and attaches the grad_fn to the
//! result tensor when gradient tracking is enabled.
//!
//! ## REQ status (per `.design/ferrotorch-core/grad_fns/arithmetic.md`)
//!
//! Full evidence rows (impl + non-test production consumer + parity smoke
//! counts + upstream `file:line` cites) live in the design doc; this
//! synopsis is a one-line summary per REQ.
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 (add / add_scaled / add_out / add_scaled_out) | SHIPPED | `add` at `arithmetic.rs:376`, `add_scaled` at `:746`, `add_out` at `:653`, `add_scaled_out` at `:680`; consumer `Tensor::add_t` in `methods.rs`; parity `[add] 88/88` (grep=1) |
//! | REQ-2 (sub / sub_scaled) | SHIPPED | `sub` at `arithmetic.rs:912` delegates to `sub_scaled` at `:941` → `add_scaled(a,b,-alpha)`; parity `[sub] 88/88` (grep=1) |
//! | REQ-3 (mul) | SHIPPED | `mul` + `MulBackward` in `arithmetic.rs`; parity `[mul] 72/72` (grep=1) |
//! | REQ-4 (div) | SHIPPED | `div` + `DivBackward` in `arithmetic.rs`; parity `[div] 72/72` (grep=1) |
//! | REQ-5 (neg) | SHIPPED | `neg` + `NegBackward`; parity `[neg] 8/8` (grep=1) |
//! | REQ-6 (abs) | SHIPPED | `abs` + `AbsBackward`; parity `[abs] 8/8` (grep=1) |
//! | REQ-7 (sqrt) | SHIPPED | `sqrt` + `SqrtBackward`; parity `[sqrt] 8/8` (grep=1) |
//! | REQ-8 (pow scalar exponent) | SHIPPED | `pow` + `PowBackward` (scalar exp; tensor-exp overload returns `Ok(None)` and is skipped, not failed); parity `[pow] 24/72 passed 48 skipped` (grep=1) |
//! | REQ-9 (rsub) | SHIPPED | `rsub` at `arithmetic.rs:993` delegates to `sub_scaled(b,a,alpha)`; consumer `Tensor::rsub_t` in `methods.rs`; parity `[rsub]` (grep=1) |
//! | REQ-10 (rsqrt) | SHIPPED | `rsqrt` at `arithmetic.rs:1757` + `RsqrtBackward` at `:1666`; consumer `Tensor::rsqrt_t` in `methods.rs`; parity `[rsqrt] 24/24` (grep=1) |
//! | REQ-11 (reciprocal) | SHIPPED | `reciprocal` at `arithmetic.rs:1905` + `ReciprocalBackward` at `:1828`; consumer `Tensor::reciprocal_t` in `methods.rs`; parity `[reciprocal] 24/24` (grep=1) |
//! | REQ-12 (floor_divide) | SHIPPED | `floor_divide` at `arithmetic.rs:2742` + `FloorDivideBackward` at `:2585` (errors on `.backward()` mirroring upstream's `<NotImplemented>` grad_fn); consumer `Tensor::floor_divide_t` in `methods.rs`; parity `[floor_divide] 72/72` (grep=1) |
//! | REQ-13 (remainder) | SHIPPED | `remainder` at `arithmetic.rs:2105` + `RemainderBackward` at `:1991`; consumer `Tensor::remainder_t` in `methods.rs`; parity `[remainder] 72/72` (grep=1) |
//! | REQ-14 (fmod) | SHIPPED | `fmod` at `arithmetic.rs:2412` + `FmodBackward` at `:2294`; consumer `Tensor::fmod_t` in `methods.rs`; parity `[fmod] 72/72` (grep=1) |
//! | REQ-15 (addcmul) | SHIPPED | `addcmul` at `arithmetic.rs:3089` + `AddcmulBackward` at `:2946`; consumer `Tensor::addcmul_t` in `methods.rs`; parity `[addcmul] 96/96` (grep=1) |
//! | REQ-16 (addcdiv) | SHIPPED | `addcdiv` at `arithmetic.rs:3404` + `AddcdivBackward` at `:3242`; consumer `Tensor::addcdiv_t` in `methods.rs`; parity `[addcdiv]` (grep=1) |

use std::any::TypeId;
use std::sync::Arc;

use crate::autograd::no_grad::{is_grad_enabled, no_grad};
use crate::dtype::Float;
use crate::error::{FerrotorchError, FerrotorchResult};
use crate::ops::elementwise::{fast_add, fast_mul, scalar_map, unary_map};
use crate::shape::broadcast_shapes;
use crate::storage::TensorStorage;
use crate::tensor::{GradFn, Tensor};

/// Returns `true` if `T` is `f64`.
#[inline]
fn is_f64<T: Float>() -> bool {
    TypeId::of::<T>() == TypeId::of::<f64>()
}

/// Returns `true` if `T` is `f32`.
#[inline]
fn is_f32<T: Float>() -> bool {
    TypeId::of::<T>() == TypeId::of::<f32>()
}

/// Returns `true` if `T` is `half::bf16` (#23).
#[inline]
fn is_bf16<T: Float>() -> bool {
    TypeId::of::<T>() == TypeId::of::<half::bf16>()
}

/// Returns `true` if `T` is `half::f16` (IEEE float16, crosslink #1185 Phase 1).
#[inline]
fn is_f16<T: Float>() -> bool {
    TypeId::of::<T>() == TypeId::of::<half::f16>()
}

/// Materialize a CUDA tensor into a fresh buffer whose backing storage
/// length exactly matches the logical numel, when needed.
///
/// Why this exists (#812). The GPU elementwise kernels in this module take
/// raw `gpu_handle()` references and length-check against the *underlying*
/// buffer length. A view's `gpu_handle()` returns the WHOLE underlying
/// storage. The view may be non-contiguous (`narrow` on an interior axis,
/// `transpose`, `permute`) — but it may also be a *contiguous-by-strides*
/// view that nonetheless covers only a subset of the storage (`narrow` on
/// the outer axis, the residual of a single-element narrow on any axis,
/// or any `storage_offset != 0` slice). Both cases trip the kernel's
/// `LengthMismatch` guard with messages like `"buffer length mismatch:
/// 32 vs 16"`. PyTorch parity expects all elementwise ops to accept any
/// view transparently.
///
/// The right test is "does the view's storage exactly cover the view's
/// logical extent" — i.e. `is_contiguous() && storage_offset == 0 &&
/// numel == storage_len`. All three must hold to skip materialization.
///
/// Note: `Tensor::contiguous()` early-returns `clone()` when
/// `is_contiguous()` is true, which is insufficient here — a view that is
/// "contiguous-by-strides" may still cover only part of the underlying
/// buffer. So when the view is contiguous-by-strides but the buffer is
/// oversize, we dispatch directly to `strided_copy_{f32,f64}` (mirroring
/// `methods::contiguous_t`'s GPU fast path / CL-496). The materialization
/// stays on device, NEVER detours through host memory.
///
/// Returns `t.clone()` for the fast path. CPU tensors are returned as-is
/// (CPU elementwise paths handle stride views via `data_vec()`
/// independently and don't need this guard).
#[inline]
fn ensure_contig_for_gpu<T: Float>(t: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if !t.is_cuda() {
        return Ok(t.clone());
    }
    let view_matches_buffer =
        t.is_contiguous() && t.storage_offset() == 0 && t.numel() == t.storage_len();
    if view_matches_buffer {
        return Ok(t.clone());
    }

    // Non-contiguous OR (contiguous-by-strides but offset>0 / oversized
    // buffer): route to the on-device `strided_copy_*` kernel directly.
    // `Tensor::contiguous()` would early-return a `clone()` for the
    // contiguous-but-oversized-buffer case, leaving the bug unfixed; so
    // we bypass that path via the strided_copy kernel directly.
    if t.shape().len() <= 8
        && let Some(backend) = crate::gpu_dispatch::gpu_backend()
    {
        let in_handle = t.gpu_handle()?;
        let out_shape = t.shape().to_vec();
        let src_strides = t.strides().to_vec();
        let src_offset = t.storage_offset();
        let out_handle = if is_f32::<T>() {
            backend
                .strided_copy_f32(in_handle, &out_shape, &src_strides, src_offset)
                .ok()
        } else if is_f64::<T>() {
            backend
                .strided_copy_f64(in_handle, &out_shape, &src_strides, src_offset)
                .ok()
        } else {
            None
        };
        if let Some(handle) = out_handle {
            let storage = TensorStorage::gpu(handle);
            return Tensor::from_storage(storage, out_shape, false);
        }
    }

    // Fallback (rank > 8, missing backend, or unsupported dtype): use
    // the public `contiguous` path. For genuinely non-contiguous views
    // this still routes through the on-device strided_copy in
    // `contiguous_t`. For the "contiguous-but-oversized" edge case the
    // host-memory fallback there is acceptable as a last resort.
    t.contiguous()
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Whether at least one of two tensors requires grad (and grad is enabled).
#[inline]
fn needs_grad<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> bool {
    is_grad_enabled() && (a.requires_grad() || b.requires_grad())
}

/// Whether a single tensor requires grad (and grad is enabled).
#[inline]
fn needs_grad_unary<T: Float>(a: &Tensor<T>) -> bool {
    is_grad_enabled() && a.requires_grad()
}

/// Reduce a gradient tensor to a target shape by summing over broadcast
/// dimensions.
///
/// When two tensors with different shapes are combined via broadcasting,
/// the backward pass must sum the gradient over the dimensions that were
/// broadcast to recover the correct gradient shape for each input.
///
/// Algorithm:
/// 1. If shapes already match, return `grad` as-is (clone).
/// 2. Left-pad `target_shape` with 1s to match grad's ndim.
/// 3. For each dimension where target has size 1 but grad has size > 1,
///    sum over that dimension.
/// 4. Reshape to `target_shape`.
///
/// For f32 GPU tensors, reduction is performed entirely on GPU via
/// `sum_axis_f32` — no CPU roundtrip.  Other dtypes fall back to a
/// CPU reduction loop and re-upload.
pub(crate) fn reduce_grad_to_shape<T: Float>(
    grad: &Tensor<T>,
    target_shape: &[usize],
) -> FerrotorchResult<Tensor<T>> {
    let grad_shape = grad.shape();

    // Fast path: shapes already match.
    if grad_shape == target_shape {
        return Ok(grad.clone());
    }

    // Scalar target: sum everything.
    if target_shape.is_empty() {
        // Use the reduction forward op which already handles GPU.
        return crate::grad_fns::reduction::sum(grad);
    }

    // GPU fast path for f32/f64: reduce each broadcast axis on-device.
    if grad.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        // #812 cluster: a non-contiguous grad view's `gpu_handle()` is the
        // whole underlying buffer; cloning it would carry stale entries
        // outside the logical view into the reduction. Materialize first.
        let grad_c = ensure_contig_for_gpu(grad)?;
        let mut handle = backend.clone_buffer(grad_c.gpu_handle()?)?;
        let mut current_shape = grad_c.shape().to_vec();

        let target_ndim = target_shape.len();

        // First reduce leading dimensions that don't exist in target.
        while current_shape.len() > target_ndim {
            handle = if is_f32::<T>() {
                backend.sum_axis_f32(&handle, &current_shape, 0)?
            } else {
                backend.sum_axis_f64(&handle, &current_shape, 0)?
            };
            current_shape.remove(0);
        }

        // Then reduce dimensions where target has size 1 but grad has size > 1.
        for axis in 0..current_shape.len() {
            if axis < target_shape.len() && target_shape[axis] == 1 && current_shape[axis] > 1 {
                handle = if is_f32::<T>() {
                    backend.sum_axis_f32(&handle, &current_shape, axis)?
                } else {
                    backend.sum_axis_f64(&handle, &current_shape, axis)?
                };
                current_shape[axis] = 1;
            }
        }

        return Tensor::from_storage(TensorStorage::gpu(handle), target_shape.to_vec(), false);
    }

    // CPU path — non-f32/f64 GPU tensors have no GPU kernel, error out.
    if grad.is_cuda() {
        return Err(FerrotorchError::NotImplementedOnCuda {
            op: "broadcast_grad",
        });
    }

    let grad_data = grad.data()?;
    let grad_ndim = grad_shape.len();
    let target_ndim = target_shape.len();

    // Rank-mismatch-but-same-numel reshape branch — #814.
    //
    // When `grad.numel() == target.numel()` the operation is a pure
    // reshape, not a broadcast reduction. This arises in higher-order grad
    // chains where, e.g., a `PowBackward` produces a 0-D scalar
    // intermediate that the downstream node must align with a shape-`[1]`
    // leaf. Pre-#814 the `grad_ndim < target_ndim` arm rejected this with
    // `ShapeMismatch` even though the data is one-to-one. PyTorch's
    // `torch.autograd.grad` works on shape-`[1]` leafs unconditionally;
    // this branch closes that parity gap. Cases covered:
    //   * grad []      -> target [1]       (cited fixture)
    //   * grad []      -> target [1, 1]    (general invariant)
    //   * grad [1]     -> target [1, 1]
    //   * grad [1, 1]  -> target [1]       (also matches via padding below,
    //                                       handled here for symmetry)
    // Numel-mismatch (e.g. `[1] -> [2]`) still falls through to the
    // existing rejection guard and broadcast-reduction logic.
    let grad_numel: usize = grad_shape.iter().product();
    let target_numel: usize = target_shape.iter().product();
    if grad_numel == target_numel {
        // Pure reshape: same elements, different rank. The CPU storage is
        // already row-major contiguous over `grad_data`, so we can rebuild
        // a tensor of the target shape directly from the same data.
        return Tensor::from_storage(
            TensorStorage::cpu(grad_data.to_vec()),
            target_shape.to_vec(),
            false,
        );
    }

    // Standard broadcasting requires grad_ndim >= target_ndim. The reverse
    // case (gradient has fewer dims than target) used to trigger an integer
    // underflow at `grad_ndim - target_ndim`. The graph::backward seed
    // construction in #498 fixed the most common cause (root.shape() instead
    // of a scalar []), but keep an explicit check here as a defense in
    // depth — better a clean error than a panic if any other path produces
    // a misshapen gradient. CL-498.
    if grad_ndim < target_ndim {
        return Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "reduce_grad_to_shape: gradient has {grad_ndim} dim(s) but target has {target_ndim} dim(s) ({grad_shape:?} -> {target_shape:?}). \
                 Standard broadcasting backward requires grad_ndim >= target_ndim."
            ),
        });
    }

    // Left-pad target_shape with 1s to match grad_ndim.
    let padded_target: Vec<usize> = if target_ndim < grad_ndim {
        let mut p = vec![1usize; grad_ndim - target_ndim];
        p.extend_from_slice(target_shape);
        p
    } else {
        target_shape.to_vec()
    };

    let out_numel: usize = target_shape.iter().product();
    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // Precompute target strides for flat index calculation.
    let mut target_strides = vec![1usize; target_ndim];
    for td in (0..target_ndim.saturating_sub(1)).rev() {
        target_strides[td] = target_strides[td + 1] * target_shape[td + 1];
    }

    let offset = grad_ndim - target_ndim; // number of leading 1-padded dims

    for (i, &grad_val) in grad_data.iter().enumerate() {
        // Decompose grad flat index into per-axis coordinates.
        let mut coords = [0usize; 16]; // support up to 16 dims
        let mut rem = i;
        for d in (0..grad_ndim).rev() {
            coords[d] = rem % grad_shape[d];
            rem /= grad_shape[d];
        }

        // Compute flat index in target by mapping each grad coord to
        // the corresponding target coord (collapsing broadcast dims).
        let mut flat = 0usize;
        for (td, &target_stride) in target_strides.iter().enumerate() {
            let gd = td + offset;
            let coord = if padded_target[gd] == 1 {
                0
            } else {
                coords[gd]
            };
            flat += coord * target_stride;
        }

        result[flat] += grad_val;
    }

    Tensor::from_storage(TensorStorage::cpu(result), target_shape.to_vec(), false)
}

// ===========================================================================
// add
// ===========================================================================

/// Backward node for `c = a + b`.
///
/// VJP: da = grad, db = grad.
#[derive(Debug)]
struct AddBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for AddBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.a.shape())?)
        } else {
            None
        };
        let db = if self.b.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.b.shape())?)
        } else {
            None
        };
        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "AddBackward"
    }
}

/// Elementwise addition: `c = a + b`.
pub fn add<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope("add", "tensor_op", &[a.shape(), b.shape()], || {
        add_inner(a, b)
    })
}

fn add_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;

        // #812: non-contiguous CUDA inputs (e.g., narrowed views) must be
        // materialized via the on-device `strided_copy_*` kernel before the
        // raw `gpu_handle()` is handed to the elementwise kernel — otherwise
        // the kernel's length guard sees the underlying buffer (potentially
        // larger than the logical view) and panics with `LengthMismatch`.
        let a_c = ensure_contig_for_gpu(a)?;
        let b_c = ensure_contig_for_gpu(b)?;

        let needs_broadcast = a_c.shape() != b_c.shape();
        let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
            if needs_broadcast {
                let out_shape = broadcast_shapes(a_c.shape(), b_c.shape())?;
                // #23: dtype dispatch via `dispatch_floating_dtype!`
                // enumerates every supported dtype (f32, f64, bf16) in one
                // place; no silent f32 fallthrough for bf16.
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "broadcast_add",
                    f32 => backend.broadcast_add_f32(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f64 => backend.broadcast_add_f64(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    bf16 => backend.broadcast_add_bf16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f16 => backend.broadcast_add_f16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                )?;
                (h, out_shape)
            } else {
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "add",
                    f32 => backend.add_f32(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f64 => backend.add_f64(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    bf16 => backend.add_bf16_bf16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f16 => backend.add_f16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                )?;
                (h, a_c.shape().to_vec())
            };
        let storage = TensorStorage::gpu(handle);

        if needs_grad(a, b) {
            Tensor::from_operation(
                storage,
                out_shape,
                Arc::new(AddBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Tensor::from_storage(storage, out_shape, false)
        }
    } else {
        let result = fast_add(a, b)?;

        if needs_grad(a, b) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(AddBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Ok(result)
        }
    }
}

// ---------------------------------------------------------------------------
// add_scaled: PyTorch-parity `torch.add(input, other, *, alpha=1)`
// ---------------------------------------------------------------------------

/// Backward node for `c = a + alpha * b`.
///
/// VJP: da = grad, db = alpha * grad.
#[derive(Debug)]
struct AddScaledBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
    alpha: f64,
}

impl<T: Float> GradFn<T> for AddScaledBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.a.shape())?)
        } else {
            None
        };
        let db = if self.b.requires_grad() {
            // db = alpha * grad. `T::from(self.alpha)` is infallible for the
            // Float types we support (f32/f64/bf16/f16) given a finite input;
            // a NaN/Inf alpha is preserved by the cast, matching PyTorch.
            let alpha_t: T = num_traits::cast::cast(self.alpha).ok_or_else(|| {
                FerrotorchError::InvalidArgument {
                    message: format!(
                        "AddScaledBackward: alpha {} not representable in tensor dtype",
                        self.alpha
                    ),
                }
            })?;
            let scaled = no_grad(|| scale_tensor(grad_output, alpha_t))?;
            Some(reduce_grad_to_shape(&scaled, self.b.shape())?)
        } else {
            None
        };
        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "AddScaledBackward"
    }

    fn scalar_args(&self) -> Vec<f64> {
        // The scale `alpha` is the single scalar saved by AddScaledBackward.
        // The JIT tracer (`trace::map_name_to_op` / `graph_break::map_name_to_op`)
        // reads this to recover the user-facing op: PyTorch's `sub`/`sub_scaled`
        // delegate to `add_scaled(a, b, -alpha)` so the C++ delegation collapses
        // the user op into this single backward node. `alpha == -1.0` is `a - b`
        // (`aten::sub`), `alpha == 1.0` is `a + b` (`aten::add`). Mirrors the
        // `PowBackward::scalar_args` exponent-plumbing pattern (#887) so the
        // tracer can branch on the scale rather than leaking the delegation as
        // an unsupported op (#1633).
        vec![self.alpha]
    }
}

/// Scale a tensor by a scalar of the same dtype, on whichever device it lives.
///
/// Used by `add_scaled` (forward pre-scale of `b` and backward db = alpha*grad).
/// Kept private — public callers should reach for `grad_fns::arithmetic::mul`
/// with a scalar tensor, or the inplace `mul_scalar_`. This helper exists so we
/// can stay generic over `T: Float` while routing to the dtype-specialised GPU
/// `scale_*` kernels.
fn scale_tensor<T: Float>(t: &Tensor<T>, alpha: T) -> FerrotorchResult<Tensor<T>> {
    if t.is_cuda() {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        let tc = ensure_contig_for_gpu(t)?;
        let handle = if is_f32::<T>() {
            // SAFETY: is_f32::<T>() holds, so T == f32 — `cast::<T,f32>` here is
            // a no-op width-wise. Use num_traits cast for an explicit conversion.
            let s: f32 = num_traits::cast::cast(alpha).ok_or(FerrotorchError::DeviceUnavailable)?;
            backend.scale_f32(tc.gpu_handle()?, s)?
        } else if is_f64::<T>() {
            let s: f64 = num_traits::cast::cast(alpha).ok_or(FerrotorchError::DeviceUnavailable)?;
            backend.scale_f64(tc.gpu_handle()?, s)?
        } else if is_bf16::<T>() {
            let s: f32 = num_traits::cast::cast(alpha).ok_or(FerrotorchError::DeviceUnavailable)?;
            // bf16 scale routes through f32 scalar (matches existing bf16 kernels).
            backend.scale_bf16_bf16(tc.gpu_handle()?, s)?
        } else if is_f16::<T>() {
            let s: f32 = num_traits::cast::cast(alpha).ok_or(FerrotorchError::DeviceUnavailable)?;
            backend.scale_f16(tc.gpu_handle()?, s)?
        } else {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "scale_tensor" });
        };
        Tensor::from_storage(TensorStorage::gpu(handle), tc.shape().to_vec(), false)
    } else {
        scalar_map(t, alpha, |x, s| x * s)
    }
}

// ---------------------------------------------------------------------------
// add_out / add_scaled_out: PyTorch-parity `torch.add(input, other, *,
// alpha=1, out=tensor)`.
//
// The `out=` kwarg in PyTorch writes the result into a caller-provided
// pre-allocated tensor (in-place from the caller's perspective on `out`,
// regardless of how the computation itself is staged). The semantics are:
//
//   1. `out.shape()` must equal the broadcast shape of `a` and `b`. PyTorch
//      raises `RuntimeError: shape mismatch` otherwise.
//   2. `out` may not be in the autograd graph (no grad_fn) and may not be a
//      leaf with requires_grad=true — the write is not autograd-tracked.
//      (PyTorch raises here too, with the same message as `add_`.)
//   3. Devices of `a`, `b`, and `out` must match.
//   4. NaN/Inf already present in `out` must be fully overwritten (no leak).
//
// Forward-only: `out=` is incompatible with attaching a grad_fn to `out`
// (autograd ops never accept a pre-allocated `out`), so no backward node
// is attached. This matches torch's behavior — `torch.add(a, b, out=c)`
// returns `c` and does NOT add `c` to the autograd graph for `a`/`b`.
//
// Style note: these are sibling forward functions to `add` / `add_scaled`,
// not methods on `Tensor`, because the natural call shape is
// `add_scaled_out(&mut out, &a, &b, alpha)` rather than
// `out.add_scaled_out_from(&a, &b, alpha)`. The trailing-underscore in-place
// methods (e.g. `add_scaled_`) live on `Tensor` because their `self`
// receiver is the natural target. The first `_out` variant in the workspace
// lands here; a broader convention (a trait, a naming policy) can settle
// when more ops grow `_out` variants.
// ---------------------------------------------------------------------------

/// Validate that `out` is eligible to receive a torch-style `out=` write.
///
/// Mirrors `inplace::check_inplace_allowed` (kept private to that module),
/// but specialised to the `out=` op name so the error message is
/// recognisable at the call site.
fn check_out_allowed<T: Float>(out: &Tensor<T>, op_name: &str) -> FerrotorchResult<()> {
    if out.grad_fn().is_some() {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "{op_name}: `out` tensor is part of the computation graph \
                 (has grad_fn = {:?}); cannot write into it",
                out.grad_fn().map(|gf| gf.name()),
            ),
        });
    }
    if out.requires_grad() && out.is_leaf() {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "{op_name}: `out` is a leaf tensor with requires_grad=true; \
                 the write would not be tracked by autograd"
            ),
        });
    }
    Ok(())
}

/// `torch.add(a, b, *, out=out)` — write `a + b` into `out` in-place.
///
/// Thin `alpha = 1.0` wrapper over [`add_scaled_out`].
///
/// # Errors
///
/// See [`add_scaled_out`].
pub fn add_out<T: Float>(out: &Tensor<T>, a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<()> {
    add_scaled_out(out, a, b, 1.0)
}

/// `torch.add(a, b, *, alpha=alpha, out=out)` — write `a + alpha * b`
/// into `out` in-place.
///
/// `out`'s shape must equal the broadcast result shape of `a` and `b`.
/// `out`'s storage is replaced with the freshly-computed result; any
/// values previously held by `out` (including NaN sentinels) are
/// completely overwritten.
///
/// `out` is a `&Tensor` rather than `&mut Tensor` because `Tensor` carries
/// its mutability through the `Arc<TensorStorage>` (matching `add_scaled_`
/// / `update_storage`); the autograd-safety checks below enforce exclusive
/// access at the semantic level.
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if `a`, `b`, or `out` reside on
///   different devices.
/// - [`FerrotorchError::ShapeMismatch`] if `out.shape()` does not equal
///   `broadcast_shapes(a.shape(), b.shape())`.
/// - [`FerrotorchError::InvalidArgument`] if `out` has a `grad_fn` or is
///   a leaf with `requires_grad = true` (matches PyTorch's `out=` rule).
/// - [`FerrotorchError::InvalidArgument`] if `alpha` is not representable
///   in the tensor's dtype.
pub fn add_scaled_out<T: Float>(
    out: &Tensor<T>,
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<()> {
    check_out_allowed(out, "add_scaled_out")?;

    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }
    if a.device() != out.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: out.device(),
        });
    }

    let broadcast_shape = broadcast_shapes(a.shape(), b.shape())?;

    // Compute the result via the existing scalar/broadcast dispatch. We
    // run inside `no_grad` so the temporary computation does not attach a
    // grad_fn — `out=` is explicitly non-autograd-tracked per torch.
    let result = no_grad(|| add_scaled(a, b, alpha))?;
    let (storage, result_shape) = result.into_storage_and_shape()?;

    // Shape policy: matches torch's CURRENT semantics — when `out.shape()`
    // equals the broadcast shape, the write is purely a storage swap;
    // when it differs, `out` is silently resized to the broadcast shape
    // (PyTorch emits a `UserWarning` for this; it is being deprecated in
    // a future release but is still the observed behavior in 2.x).
    // Mirroring it keeps `torch.add(a, b, out=t)` bit-equivalent across
    // both the matched-shape and resize cases.
    if out.shape() == broadcast_shape.as_slice() {
        // SAFETY: `check_out_allowed` proved `out` has no grad_fn and is
        // not a requires_grad leaf, so no autograd machinery references
        // its storage. `storage` was just produced by a freshly-allocated
        // tensor with no outstanding aliases. Shape equality guarantees
        // `storage.len() == out.numel()`, satisfying update_storage's
        // length contract.
        unsafe { out.update_storage(storage)? };
    } else {
        // Resize `out` to the broadcast shape and swap in the result
        // storage atomically. `result_shape == broadcast_shape` because
        // `add_scaled` returns the broadcast shape on success.
        debug_assert_eq!(result_shape, broadcast_shape);
        // SAFETY: same autograd argument as the matched-shape branch
        // (check_out_allowed already validated). `update_storage_and_shape`
        // verifies its own length invariant (storage.len() == new numel).
        // The caller holds a unique semantic reference to `out` for the
        // duration of this `out=` write — clones of `out` made before
        // this call will observe the new shape after it returns, which
        // matches torch's `Tensor.resize_` behavior exactly.
        unsafe { out.update_storage_and_shape(storage, broadcast_shape)? };
    }
    Ok(())
}

/// Elementwise addition with a scalar multiplier on the second operand:
/// `c = a + alpha * b`. This is the full PyTorch `torch.add(input, other,
/// *, alpha=1)` semantic; the `alpha == 1.0` case is forwarded to [`add`]
/// without an extra scaling pass so callers that don't need alpha pay no
/// allocation cost.
pub fn add_scaled<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Tensor<T>> {
    // Fast path: alpha == 1.0 is the existing `add` semantics verbatim.
    // Exact equality is what we want here — we are skipping a scaling pass
    // only when the multiplier is the identity. Any other alpha (including
    // 0.9999..., -0.0, NaN, ±inf) must take the scaled path so torch parity
    // holds bit-for-bit on the no-scale case while everything else flows
    // through `scale_tensor` + `add_inner`.
    #[allow(clippy::float_cmp)]
    let is_identity = alpha == 1.0;
    if is_identity {
        return add(a, b);
    }
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }
    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope(
        "add_scaled",
        "tensor_op",
        &[a.shape(), b.shape()],
        || {
            // Fused GPU fast path (#1675): when `a` and `b` are CUDA-resident,
            // SAME shape (no broadcast), dtype f32/f64, and `alpha` is finite,
            // compute `out = a + alpha*b` in a SINGLE on-device FMA launch
            // instead of the two-launch `scale_tensor(b, alpha)` +
            // `add_inner(a, b_scaled)` staging (which also allocated a temporary
            // device buffer). This roughly halves the kernel-launch cost of
            // `sub` / `sub_scaled` / `rsub`, which all delegate to `add_scaled`
            // (the alpha=-1 case is `fma(-1, b, a) = a - b`, bit-exact vs the
            // scalar path AND torch). Broadcast / mixed-device / non-f32-f64
            // dtype / non-finite alpha all fall through to the existing
            // scale-then-add path (correctness over completeness). The fused
            // result stays `is_cuda()`, and the `AddScaledBackward` node below
            // is unchanged — only the forward storage source moves.
            if let Some(result) = maybe_add_scaled_fused_gpu(a, b, alpha)? {
                return if needs_grad(a, b) {
                    let (storage, shape) = result.into_storage_and_shape()?;
                    Tensor::from_operation(
                        storage,
                        shape,
                        Arc::new(AddScaledBackward {
                            a: a.clone(),
                            b: b.clone(),
                            alpha,
                        }),
                    )
                } else {
                    Ok(result)
                };
            }

            // Forward: scale `b`, then add. `no_grad` so the temporary `b_scaled`
            // does not introduce its own MulScalarBackward node — we attach a
            // single `AddScaledBackward` that handles the full VJP directly.
            let alpha_t: T =
                num_traits::cast::cast(alpha).ok_or_else(|| FerrotorchError::InvalidArgument {
                    message: format!("add_scaled: alpha {alpha} not representable in tensor dtype"),
                })?;
            let b_scaled = no_grad(|| scale_tensor(b, alpha_t))?;
            let result = no_grad(|| add_inner(a, &b_scaled))?;

            if needs_grad(a, b) {
                let (storage, shape) = result.into_storage_and_shape()?;
                Tensor::from_operation(
                    storage,
                    shape,
                    Arc::new(AddScaledBackward {
                        a: a.clone(),
                        b: b.clone(),
                        alpha,
                    }),
                )
            } else {
                Ok(result)
            }
        },
    )
}

/// Fused single-launch `out = a + alpha * b` on CUDA, when eligible.
///
/// Returns `Ok(Some(result))` (a fresh, `is_cuda()`, grad-free tensor) when
/// `a` and `b` are both CUDA-resident, share the SAME shape (no broadcast),
/// are dtype f32 or f64, and `alpha` is finite. Returns `Ok(None)` (the
/// caller must take the existing `scale_tensor` + `add_inner` path) for any
/// of: CPU tensors, broadcast (different shapes), bf16/f16, or non-finite
/// `alpha`. This deliberately scopes the fast path to the cases where a
/// single `fma.rn.f{32,64}` is provably correct: for the included dtypes the
/// fused FMA does one rounding (>= as accurate as scale-then-add's two), and
/// for `alpha == -1.0` it is exactly `a - b`.
///
/// Non-finite alpha is excluded only to keep this fast path conservative; the
/// fall-through scale-then-add path already matches torch for NaN/inf alpha,
/// so excluding them here never changes observable results.
fn maybe_add_scaled_fused_gpu<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Option<Tensor<T>>> {
    // Same-device check already ran in `add_scaled` before this call; here we
    // additionally require both operands to be CUDA-resident and a supported
    // dtype. CPU, broadcast, bf16/f16, and non-finite alpha fall back.
    if !(a.is_cuda() && b.is_cuda()) {
        return Ok(None);
    }
    if !(is_f32::<T>() || is_f64::<T>()) {
        return Ok(None);
    }
    if !alpha.is_finite() {
        return Ok(None);
    }
    let Some(backend) = crate::gpu_dispatch::gpu_backend() else {
        return Ok(None);
    };

    // Materialize non-contiguous CUDA views (#812 cluster) so the raw
    // `gpu_handle()` matches the logical extent the fused kernel expects.
    let a_c = ensure_contig_for_gpu(a)?;
    let b_c = ensure_contig_for_gpu(b)?;

    // SAME-shape only — broadcast falls back to the scale-then-add /
    // broadcast_add path which already handles unequal shapes correctly.
    if a_c.shape() != b_c.shape() {
        return Ok(None);
    }

    let out_shape = a_c.shape().to_vec();
    let handle = if is_f32::<T>() {
        backend.add_scaled_f32(a_c.gpu_handle()?, b_c.gpu_handle()?, alpha)?
    } else {
        backend.add_scaled_f64(a_c.gpu_handle()?, b_c.gpu_handle()?, alpha)?
    };
    let storage = TensorStorage::gpu(handle);
    let result = Tensor::from_storage(storage, out_shape, false)?;
    Ok(Some(result))
}

// ===========================================================================
// sub
// ===========================================================================

/// Elementwise subtraction: `c = a - b`.
///
/// This is the `alpha=1` thin path over [`sub_scaled`]: PyTorch's
/// `torch.sub(input, other, *, alpha=1)` collapses to `sub_scaled(a, b, 1.0)`
/// which in turn delegates to `add_scaled(a, b, -1.0)` — matching upstream's
/// `TORCH_IMPL_FUNC(sub_out)` at `aten/src/ATen/native/BinaryOps.cpp:434-439`
/// byte-for-byte (`add_stub(device_type(), *this, -alpha)`). Routing `sub`
/// through `sub_scaled` makes every existing caller of `arithmetic::sub`
/// (e.g. `Tensor::sub_t`, `dual_sub`) a transitive non-test production
/// consumer of `sub_scaled` (closes #1215 / R-DEFER-1).
///
/// The forward result, broadcasting behavior, GPU/CPU dispatch, NaN/Inf
/// propagation, and autograd VJP (`da = grad`, `db = -1.0 * grad = -grad`)
/// are all numerically equivalent to the previous standalone `SubBackward`
/// implementation: the grad_fn type just changes from `SubBackward` to
/// `AddScaledBackward { alpha: -1.0 }`.
pub fn sub<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    sub_scaled(a, b, 1.0)
}

/// Elementwise subtraction with a scalar multiplier on the second operand:
/// `c = a - alpha * b`. This is the full PyTorch
/// `torch.sub(input, other, *, alpha=1)` semantic.
///
/// PyTorch's own implementation literally delegates `sub` to `add` with a
/// negated alpha. Per `aten/src/ATen/native/BinaryOps.cpp:434-439`:
///
/// ```cpp
/// TORCH_IMPL_FUNC(sub_out) (
///   const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result
/// ) {
///   add_stub(device_type(), *this, -alpha);
///   TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype());
/// }
/// ```
///
/// We match that contract byte-for-byte by delegating to [`add_scaled`]
/// with `-alpha` (R-DEV-1 numerical-contract match). The alpha-identity
/// shortcut, broadcasting, device dispatch, and `SubBackward`-equivalent
/// VJP (`da = grad, db = -alpha * grad`, naturally produced as
/// `AddScaledBackward` with `-alpha`) all come for free from `add_scaled`.
///
/// # Errors
///
/// See [`add_scaled`].
pub fn sub_scaled<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Tensor<T>> {
    // `-alpha` flips the sign for every finite alpha and for ±inf; for
    // NaN, `-NaN` preserves the NaN payload bit (matching the `-alpha`
    // produced by `c10::Scalar::operator-` in upstream's TORCH_IMPL_FUNC).
    add_scaled(a, b, -alpha)
}

// ===========================================================================
// rsub: PyTorch-parity `torch.rsub(input, other, *, alpha=1)`
// ===========================================================================

/// `torch.rsub(input, other, *, alpha=1)` — reverse subtract:
/// `c = other - alpha * input`.
///
/// PyTorch's upstream implementation is a literal operand-swap delegation
/// to `sub`. Per `aten/src/ATen/native/BinaryOps.cpp:1169-1171`:
///
/// ```cpp
/// Tensor rsub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
///   return at::sub(other, self, alpha); // redispatch!
/// }
/// ```
///
/// We match that contract byte-for-byte (R-DEV-1) by delegating to
/// [`sub_scaled`] with the operands swapped: `rsub(a, b, alpha) ≡
/// sub_scaled(b, a, alpha)`. Since `sub_scaled` itself delegates to
/// `add_scaled(b, a, -alpha)`, the final forward computes `b + (-alpha) *
/// a = b - alpha * a`, matching upstream byte-for-byte.
///
/// The autograd VJP comes for free: `sub_scaled(b, a, alpha)` attaches an
/// [`AddScaledBackward`] node that saves the swapped operands as `a=b`
/// and `b=a` (in the gradfn's own naming). On backward, the gradfn routes
/// `da=grad` to its saved `a` (= the rsub-API `other`/`b`, the leaf
/// tensor) and `db=-alpha*grad` to its saved `b` (= the rsub-API
/// `input`/`a`, the leaf tensor) — which is exactly the chain rule for
/// `c = b - alpha * a`: `dc/db = 1` and `dc/da = -alpha`. The autograd
/// engine accumulates into each leaf tensor's `.grad` by saved-tensor
/// identity, not by argument position, so the operand swap inside
/// `sub_scaled` does NOT scramble the gradient routing.
///
/// `torch.rsub` is declared at `aten/src/ATen/native/native_functions.yaml:
/// 7247 - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1)
/// -> Tensor` and registered for autograd via `torch/overrides.py:1116
/// torch.rsub: lambda input, other, alpha=1: -1`.
///
/// # Errors
///
/// See [`add_scaled`].
pub fn rsub<T: Float>(a: &Tensor<T>, b: &Tensor<T>, alpha: f64) -> FerrotorchResult<Tensor<T>> {
    // R-DEV-1: upstream's `at::sub(other, self, alpha)` is the byte-for-byte
    // contract; our `sub_scaled(b, a, alpha)` delegates further to
    // `add_scaled(b, a, -alpha)`, producing the same result.
    sub_scaled(b, a, alpha)
}

// ===========================================================================
// mul
// ===========================================================================

/// Backward node for `c = a * b`.
///
/// VJP: da = grad * b, db = grad * a.
#[derive(Debug)]
struct MulBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for MulBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // When grad_output requires_grad (i.e., create_graph=true), use
        // differentiable operations so the backward pass itself is recorded
        // in the computation graph for higher-order gradients.
        if grad_output.requires_grad() || grad_output.grad_fn().is_some() {
            // Higher-order: use differentiable ops so the backward pass
            // itself is recorded in the graph.
            let da = if self.a.requires_grad() {
                let raw = mul(grad_output, &self.b)?;
                Some(reduce_grad_to_shape(&raw, self.a.shape())?)
            } else {
                None
            };

            let db = if self.b.requires_grad() {
                let raw = mul(grad_output, &self.a)?;
                Some(reduce_grad_to_shape(&raw, self.b.shape())?)
            } else {
                None
            };

            return Ok(vec![da, db]);
        }

        // Standard (non-higher-order) path: use no_grad + op functions
        // so it works on both CPU and GPU tensors.
        let da = if self.a.requires_grad() {
            let raw = no_grad(|| mul(grad_output, &self.b))?;
            Some(reduce_grad_to_shape(&raw, self.a.shape())?)
        } else {
            None
        };

        let db = if self.b.requires_grad() {
            let raw = no_grad(|| mul(grad_output, &self.a))?;
            Some(reduce_grad_to_shape(&raw, self.b.shape())?)
        } else {
            None
        };

        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "MulBackward"
    }
}

/// Elementwise multiplication: `c = a * b`.
pub fn mul<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope("mul", "tensor_op", &[a.shape(), b.shape()], || {
        mul_inner(a, b)
    })
}

fn mul_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;

        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        let b_c = ensure_contig_for_gpu(b)?;

        let needs_broadcast = a_c.shape() != b_c.shape();
        let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
            if needs_broadcast {
                let out_shape = broadcast_shapes(a_c.shape(), b_c.shape())?;
                // #23: see arithmetic::add_inner for the rationale.
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "broadcast_mul",
                    f32 => backend.broadcast_mul_f32(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f64 => backend.broadcast_mul_f64(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    bf16 => backend.broadcast_mul_bf16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f16 => backend.broadcast_mul_f16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                )?;
                (h, out_shape)
            } else {
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "mul",
                    f32 => backend.mul_f32(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f64 => backend.mul_f64(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    bf16 => backend.mul_bf16_bf16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f16 => backend.mul_f16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                )?;
                (h, a_c.shape().to_vec())
            };
        let storage = TensorStorage::gpu(handle);

        if needs_grad(a, b) {
            Tensor::from_operation(
                storage,
                out_shape,
                Arc::new(MulBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Tensor::from_storage(storage, out_shape, false)
        }
    } else {
        let result = fast_mul(a, b)?;

        if needs_grad(a, b) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(MulBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// div
// ===========================================================================

/// Backward node for `c = a / b`.
///
/// VJP: da = grad / b, db = -grad * a / (b * b).
#[derive(Debug)]
struct DivBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for DivBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // Use op-level functions which handle broadcasting correctly.
        // da = grad / b
        let da = if self.a.requires_grad() {
            let raw = no_grad(|| div(grad_output, &self.b))?;
            Some(reduce_grad_to_shape(&raw, self.a.shape())?)
        } else {
            None
        };
        // db = -grad * a / (b * b)
        let db = if self.b.requires_grad() {
            let raw = no_grad(|| {
                let neg_go = neg(grad_output)?;
                let neg_go_a = mul(&neg_go, &self.a)?;
                let b_sq = mul(&self.b, &self.b)?;
                div(&neg_go_a, &b_sq)
            })?;
            Some(reduce_grad_to_shape(&raw, self.b.shape())?)
        } else {
            None
        };

        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "DivBackward"
    }
}

/// Elementwise division: `c = a / b`.
///
/// Division by zero follows IEEE 754 semantics: `x / 0.0` produces `+inf`
/// or `-inf` depending on the sign of `x`, and `0.0 / 0.0` produces `NaN`.
pub fn div<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope("div", "tensor_op", &[a.shape(), b.shape()], || {
        div_inner(a, b)
    })
}

fn div_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // #23: drop the `(is_f32 || is_f64)` guard — bf16 now has a real GPU
    // kernel via `div_bf16_bf16`. Other dtypes fall back to the CPU branch
    // below via `dispatch_floating_dtype!` returning NotImplementedOnCuda.
    if a.is_cuda() && (is_f32::<T>() || is_f64::<T>() || is_bf16::<T>() || is_f16::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;

        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        let b_c = ensure_contig_for_gpu(b)?;

        let needs_broadcast = a_c.shape() != b_c.shape();
        let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
            if needs_broadcast {
                let out_shape = broadcast_shapes(a_c.shape(), b_c.shape())?;
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "broadcast_div",
                    f32 => backend.broadcast_div_f32(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f64 => backend.broadcast_div_f64(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    bf16 => backend.broadcast_div_bf16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                    f16 => backend.broadcast_div_f16(
                        a_c.gpu_handle()?,
                        b_c.gpu_handle()?,
                        a_c.shape(),
                        b_c.shape(),
                        &out_shape,
                    ),
                )?;
                (h, out_shape)
            } else {
                let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                    T,
                    "div",
                    f32 => backend.div_f32(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f64 => backend.div_f64(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    bf16 => backend.div_bf16_bf16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                    f16 => backend.div_f16(a_c.gpu_handle()?, b_c.gpu_handle()?),
                )?;
                (h, a_c.shape().to_vec())
            };
        let storage = TensorStorage::gpu(handle);

        if needs_grad(a, b) {
            Tensor::from_operation(
                storage,
                out_shape,
                Arc::new(DivBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Tensor::from_storage(storage, out_shape, false)
        }
    } else {
        let result = crate::ops::elementwise::fast_div(a, b)?;

        if needs_grad(a, b) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(DivBackward {
                    a: a.clone(),
                    b: b.clone(),
                }),
            )
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// neg
// ===========================================================================

/// Backward node for `c = -a`.
///
/// VJP: da = -grad.
#[derive(Debug)]
struct NegBackward<T: Float> {
    a: Tensor<T>,
}

impl<T: Float> GradFn<T> for NegBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            Some(no_grad(|| neg(grad_output))?)
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "NegBackward"
    }
}

/// Elementwise negation: `c = -a`.
pub fn neg<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("neg", "tensor_op", &[a.shape()], || neg_inner(a))
}

fn neg_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        // #23: bf16 routes through `neg_bf16_bf16` (sign-bit XOR PTX kernel,
        // no f32 round-trip needed since bf16 is IEEE-shaped).
        let handle: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
            T,
            "neg",
            f32 => backend.neg_f32(a_c.gpu_handle()?),
            f64 => backend.neg_f64(a_c.gpu_handle()?),
            bf16 => backend.neg_bf16_bf16(a_c.gpu_handle()?),
            f16 => backend.neg_f16(a_c.gpu_handle()?),
        )?;
        let storage = TensorStorage::gpu(handle);
        let shape = a_c.shape().to_vec();

        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(NegBackward { a: a.clone() }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        let result = unary_map(a, |x| -x)?;

        if needs_grad_unary(a) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(NegBackward { a: a.clone() }))
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// pow (tensor ^ scalar exponent)
// ===========================================================================

/// Backward node for `c = a ^ exp` where `exp` is a scalar.
///
/// VJP: da = exp * a^(exp-1) * grad.
#[derive(Debug)]
struct PowBackward<T: Float> {
    a: Tensor<T>,
    exp: f64,
}

impl<T: Float> GradFn<T> for PowBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            // When grad_output requires_grad (create_graph=true), use
            // differentiable operations so the backward pass itself is
            // tracked in the computation graph for higher-order gradients.
            if grad_output.requires_grad() || grad_output.grad_fn().is_some() {
                // da = grad_output * exp * a^(exp-1)
                // Using differentiable pow and mul.
                let a_pow = pow(&self.a, self.exp - 1.0)?; // a^(exp-1)
                let exp_t = T::from(self.exp).unwrap();
                let exp_tensor = Tensor::from_storage(
                    TensorStorage::cpu(vec![exp_t; self.a.numel().max(1)]),
                    self.a.shape().to_vec(),
                    false,
                )?;
                let scaled = mul(&exp_tensor, &a_pow)?; // exp * a^(exp-1)
                Some(mul(grad_output, &scaled)?) // grad_output * exp * a^(exp-1)
            } else if grad_output.is_cuda() {
                // GPU path: use op-level functions in no_grad.
                // da = grad_output * exp * a^(exp-1)
                let da = no_grad(|| {
                    let a_pow = pow(&self.a, self.exp - 1.0)?;
                    let exp_t = T::from(self.exp).unwrap();
                    let exp_tensor = Tensor::from_storage(
                        TensorStorage::cpu(vec![exp_t; self.a.numel().max(1)]),
                        self.a.shape().to_vec(),
                        false,
                    )?;
                    let exp_gpu = exp_tensor.to(self.a.device())?;
                    let scaled = mul(&exp_gpu, &a_pow)?;
                    mul(grad_output, &scaled)
                })?;
                Some(da)
            } else {
                // CPU path: direct data access for performance.
                let go_data = grad_output.data()?;
                let a_data = self.a.data()?;
                let exp_t = T::from(self.exp).unwrap();
                let exp_m1 = T::from(self.exp - 1.0).unwrap();
                let grad_a: Vec<T> = go_data
                    .iter()
                    .zip(a_data.iter())
                    .map(|(&g, &a)| g * exp_t * a.powf(exp_m1))
                    .collect();
                Some(Tensor::from_storage(
                    TensorStorage::cpu(grad_a),
                    self.a.shape().to_vec(),
                    false,
                )?)
            }
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "PowBackward"
    }

    fn scalar_args(&self) -> Vec<f64> {
        // The exponent is the single scalar saved by PowBackward; the JIT
        // tracer reads this to reconstruct `IrOpKind::Pow { exponent }` with
        // the correct value instead of the 0.0 placeholder (#887).
        vec![self.exp]
    }
}

/// Elementwise power: `c = a ^ exp` where `exp` is a scalar `f64`.
pub fn pow<T: Float>(a: &Tensor<T>, exp: f64) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        let _ = exp;
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("pow", "tensor_op", &[a.shape()], || pow_inner(a, exp))
}

fn pow_inner<T: Float>(a: &Tensor<T>, exp: f64) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        let handle = if is_f32::<T>() {
            backend.pow_f32(a_c.gpu_handle()?, exp as f32)?
        } else {
            backend.pow_f64(a_c.gpu_handle()?, exp)?
        };
        let storage = TensorStorage::gpu(handle);
        let shape = a_c.shape().to_vec();

        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(PowBackward { a: a.clone(), exp }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        let exp_t = T::from(exp).unwrap();
        let result = scalar_map(a, exp_t, |x, e| x.powf(e))?;

        if needs_grad_unary(a) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(PowBackward { a: a.clone(), exp }))
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// sqrt
// ===========================================================================

/// Backward node for `c = sqrt(a)`.
///
/// VJP: da = grad / (2 * sqrt(a)).
#[derive(Debug)]
struct SqrtBackward<T: Float> {
    a: Tensor<T>,
}

impl<T: Float> GradFn<T> for SqrtBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            if grad_output.is_cuda() {
                // GPU path: da = grad / (2 * sqrt(a))
                let da = no_grad(|| {
                    let sqrt_a = sqrt(&self.a)?;
                    let two_t = T::from(2.0).unwrap();
                    let two_tensor = Tensor::from_storage(
                        TensorStorage::cpu(vec![two_t; self.a.numel().max(1)]),
                        self.a.shape().to_vec(),
                        false,
                    )?;
                    let two_gpu = two_tensor.to(self.a.device())?;
                    let denom = mul(&two_gpu, &sqrt_a)?;
                    div(grad_output, &denom)
                })?;
                Some(da)
            } else {
                // CPU path: direct data access for performance.
                let go_data = grad_output.data()?;
                let a_data = self.a.data()?;
                let two = T::from(2.0).unwrap();
                let grad_a: Vec<T> = go_data
                    .iter()
                    .zip(a_data.iter())
                    .map(|(&g, &a)| g / (two * a.sqrt()))
                    .collect();
                Some(Tensor::from_storage(
                    TensorStorage::cpu(grad_a),
                    self.a.shape().to_vec(),
                    false,
                )?)
            }
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "SqrtBackward"
    }
}

/// Elementwise square root: `c = sqrt(a)`.
pub fn sqrt<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("sqrt", "tensor_op", &[a.shape()], || sqrt_inner(a))
}

fn sqrt_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() && (is_f32::<T>() || is_f64::<T>() || is_f16::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        // crosslink #1185 Phase 1: f16 routes to the native `sqrt_f16` PTX
        // kernel (sqrt.approx.f32, f16 RNE store) — no CPU fallthrough.
        let handle = if is_f32::<T>() {
            backend.sqrt_f32(a_c.gpu_handle()?)?
        } else if is_f64::<T>() {
            backend.sqrt_f64(a_c.gpu_handle()?)?
        } else {
            backend.sqrt_f16(a_c.gpu_handle()?)?
        };
        let storage = TensorStorage::gpu(handle);
        let shape = a_c.shape().to_vec();

        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(SqrtBackward { a: a.clone() }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        let result = unary_map(a, |x| x.sqrt())?;

        if needs_grad_unary(a) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(SqrtBackward { a: a.clone() }))
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// rsqrt
// ===========================================================================

/// Backward node for `c = rsqrt(a) = 1 / sqrt(a)`.
///
/// VJP: `da = -0.5 * grad * c^3` (where `c` is the *output*, saved at forward
/// time). Mirrors upstream `tools/autograd/derivatives.yaml:1504-1506`:
///
/// ```yaml
/// - name: rsqrt(Tensor self) -> Tensor
///   self: -0.5 * grad * result.pow(3).conj()
/// ```
///
/// Saving the output (`c`) instead of recomputing `sqrt(a)` on backward is
/// an arithmetic rewrite: `-0.5 * a^(-3/2) = -0.5 * (1/sqrt(a))^3 =
/// -0.5 * c^3`. This saves one `sqrt` call on backward (we already paid
/// for it on forward) and matches what PyTorch's codegen does for this op.
///
/// We still save the *input* `a` so the autograd engine's `inputs()` walk
/// can route the produced gradient back to the leaf that produced `a` —
/// the gradient computation itself uses only `c`.
#[derive(Debug)]
struct RsqrtBackward<T: Float> {
    /// The input `a` — used for `inputs()` graph traversal; the actual
    /// gradient computation does not look at the values.
    a: Tensor<T>,
    /// The output `c = 1/sqrt(a)` — used by the gradient formula
    /// `da = -0.5 * grad * c^3`.
    c: Tensor<T>,
}

impl<T: Float> GradFn<T> for RsqrtBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            if grad_output.is_cuda() {
                // GPU path: da = -0.5 * grad * c^3.
                // Build `-0.5` as a tensor of the right dtype + device,
                // compute `c*c*c`, multiply, and we're done. Wrap in
                // `no_grad` so the backward ops themselves do not enter
                // the graph (higher-order rsqrt is not exercised by op_db;
                // the standard non-higher-order path is acceptable for
                // the current rsqrt parity contract).
                let da = no_grad(|| {
                    let neg_half =
                        T::from(-0.5).ok_or_else(|| FerrotorchError::InvalidArgument {
                            message: "RsqrtBackward: -0.5 not representable in tensor dtype".into(),
                        })?;
                    let nh_tensor = Tensor::from_storage(
                        TensorStorage::cpu(vec![neg_half; self.c.numel().max(1)]),
                        self.c.shape().to_vec(),
                        false,
                    )?;
                    let nh_gpu = nh_tensor.to(self.c.device())?;
                    let c_sq = mul(&self.c, &self.c)?;
                    let c_cu = mul(&c_sq, &self.c)?;
                    let neg_half_c_cu = mul(&nh_gpu, &c_cu)?;
                    mul(grad_output, &neg_half_c_cu)
                })?;
                Some(da)
            } else {
                // CPU path: direct data-vec map for performance.
                // da[i] = -0.5 * grad[i] * c[i]^3
                let go_data = grad_output.data()?;
                let c_data = self.c.data()?;
                let neg_half = T::from(-0.5).ok_or_else(|| FerrotorchError::InvalidArgument {
                    message: "RsqrtBackward: -0.5 not representable in tensor dtype".into(),
                })?;
                let grad_a: Vec<T> = go_data
                    .iter()
                    .zip(c_data.iter())
                    .map(|(&g, &c)| neg_half * g * c * c * c)
                    .collect();
                Some(Tensor::from_storage(
                    TensorStorage::cpu(grad_a),
                    self.a.shape().to_vec(),
                    false,
                )?)
            }
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "RsqrtBackward"
    }
}

/// Elementwise reciprocal square root: `c = 1 / sqrt(a)`.
///
/// Mirrors `torch.rsqrt(input, *, out=None)` per `torch/_torch_docs.py:9656`
/// and the upstream impl macro at
/// `aten/src/ATen/native/UnaryOps.cpp:346
/// CREATE_UNARY_TORCH_IMPL_FUNC(rsqrt_out, rsqrt_stub)`.
///
/// Edge-case behavior (R-DEV-1 numerical contract):
/// - `rsqrt(0.0) = +Inf` (`1/sqrt(0)`)
/// - `rsqrt(-0.0) = -Inf` (preserves sign of the zero through `1 / -0.0`)
/// - `rsqrt(negative) = NaN`
/// - `rsqrt(+Inf) = +0.0`
/// - `rsqrt(NaN) = NaN`
///
/// The CPU kernel matches upstream's `aten/src/ATen/native/cpu/
/// UnaryOpsKernel.cpp:529-538 rsqrt_kernel` scalar fallback
/// `(static_cast<scalar_t>(1)) / std::sqrt(a)`. The CUDA path composes
/// `arithmetic::sqrt` + `arithmetic::div(ones, sqrt(a))` since no
/// dedicated `rsqrt_*` GPU kernel exists yet — this matches the SqrtBackward
/// GPU pattern which composes `mul(two_gpu, sqrt(a))` similarly.
pub fn rsqrt<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("rsqrt", "tensor_op", &[a.shape()], || rsqrt_inner(a))
}

fn rsqrt_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() {
        // Compose forward via `sqrt(a)` then `1/sqrt(a)`. The composition
        // is done under `no_grad` so the intermediate `sqrt(a)` does not
        // attach its own SqrtBackward — we attach a single
        // `RsqrtBackward { c }` saving the final output.
        let c = no_grad(|| {
            let sqrt_a = sqrt(a)?;
            let one = <T as num_traits::One>::one();
            let ones = Tensor::from_storage(
                TensorStorage::cpu(vec![one; a.numel().max(1)]),
                a.shape().to_vec(),
                false,
            )?;
            let ones_gpu = ones.to(a.device())?;
            div(&ones_gpu, &sqrt_a)
        })?;
        let (storage, shape) = c.clone().into_storage_and_shape()?;

        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(RsqrtBackward { a: a.clone(), c }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        // CPU path: single-pass elementwise `1.0 / sqrt(x)` matching
        // upstream `cpu/UnaryOpsKernel.cpp:534`.
        let result = unary_map(a, |x| <T as num_traits::One>::one() / x.sqrt())?;

        if needs_grad_unary(a) {
            // Save the output for backward (per derivatives.yaml:1505
            // `-0.5 * grad * result.pow(3)`).
            let c = result.clone();
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(RsqrtBackward { a: a.clone(), c }))
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// reciprocal
// ===========================================================================

/// Backward node for `c = reciprocal(a) = 1 / a`.
///
/// VJP: `da = -grad * c^2` (where `c` is the *output*, saved at forward time).
/// Mirrors upstream `tools/autograd/derivatives.yaml:1447-1449`:
///
/// ```yaml
/// - name: reciprocal(Tensor self) -> Tensor
///   self: -grad * (result * result).conj()
/// ```
///
/// Saving the output (`c = 1/a`) instead of recomputing `1 / (a * a)` on
/// backward is an arithmetic rewrite: `-1/a^2 = -(1/a)^2 = -c^2`. This avoids
/// one division on backward (we already paid for it on forward) and matches
/// what PyTorch's codegen does for this op.
///
/// We still save the *input* `a` so the autograd engine's `inputs()` walk
/// can route the produced gradient back to the leaf that produced `a` —
/// the gradient computation itself uses only `c`.
#[derive(Debug)]
struct ReciprocalBackward<T: Float> {
    /// The input `a` — used for `inputs()` graph traversal; the actual
    /// gradient computation does not look at the values.
    a: Tensor<T>,
    /// The output `c = 1 / a` — used by the gradient formula
    /// `da = -grad * c^2`.
    c: Tensor<T>,
}

impl<T: Float> GradFn<T> for ReciprocalBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = if self.a.requires_grad() {
            if grad_output.is_cuda() {
                // GPU path: da = -grad * c^2.
                // Build the `c*c` product on-device, negate `grad_output`,
                // multiply, and we're done. Wrap in `no_grad` so the
                // backward ops themselves do not enter the graph
                // (higher-order reciprocal is not exercised by op_db; the
                // standard non-higher-order path is acceptable for the
                // current reciprocal parity contract).
                let da = no_grad(|| {
                    let c_sq = mul(&self.c, &self.c)?;
                    let neg_go = neg(grad_output)?;
                    mul(&neg_go, &c_sq)
                })?;
                Some(da)
            } else {
                // CPU path: direct data-vec map for performance.
                // da[i] = -grad[i] * c[i]^2
                let go_data = grad_output.data()?;
                let c_data = self.c.data()?;
                let grad_a: Vec<T> = go_data
                    .iter()
                    .zip(c_data.iter())
                    .map(|(&g, &c)| -g * c * c)
                    .collect();
                Some(Tensor::from_storage(
                    TensorStorage::cpu(grad_a),
                    self.a.shape().to_vec(),
                    false,
                )?)
            }
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "ReciprocalBackward"
    }
}

/// Elementwise reciprocal: `c = 1 / a`.
///
/// Mirrors `torch.reciprocal(input, *, out=None)` per `torch/_torch_docs.py:2584`
/// and the upstream impl macro at
/// `aten/src/ATen/native/UnaryOps.cpp:345
/// CREATE_UNARY_TORCH_IMPL_FUNC(reciprocal_out, reciprocal_stub)`.
///
/// Edge-case behavior (R-DEV-1 numerical contract):
/// - `reciprocal(+0.0) = +Inf`
/// - `reciprocal(-0.0) = -Inf` (preserves sign of the zero through `1 / -0.0`)
/// - `reciprocal(+Inf) = +0.0`
/// - `reciprocal(-Inf) = -0.0`
/// - `reciprocal(NaN) = NaN`
///
/// The CPU kernel matches upstream's `aten/src/ATen/native/cpu/
/// UnaryOpsKernel.cpp:275-282 reciprocal_kernel` scalar fallback
/// `static_cast<scalar_t>(1.0) / a`. The CUDA path composes
/// `div(ones, a)` since no dedicated `reciprocal_*` GPU kernel exists
/// yet — this matches the rsqrt GPU pattern which composes
/// `div(ones, sqrt(a))` similarly.
pub fn reciprocal<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("reciprocal", "tensor_op", &[a.shape()], || {
        reciprocal_inner(a)
    })
}

fn reciprocal_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() {
        // Compose forward via `div(ones, a)`. No dedicated `reciprocal_*`
        // GPU kernel exists; composition under `no_grad` so the
        // intermediate `div` does not attach its own DivBackward — we
        // attach a single `ReciprocalBackward { c }` saving the final
        // output (mirroring the rsqrt GPU compose pattern).
        let c = no_grad(|| {
            let one = <T as num_traits::One>::one();
            let ones = Tensor::from_storage(
                TensorStorage::cpu(vec![one; a.numel().max(1)]),
                a.shape().to_vec(),
                false,
            )?;
            let ones_gpu = ones.to(a.device())?;
            div(&ones_gpu, a)
        })?;
        let (storage, shape) = c.clone().into_storage_and_shape()?;

        if needs_grad_unary(a) {
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(ReciprocalBackward { a: a.clone(), c }),
            )
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        // CPU path: single-pass elementwise `1.0 / x` matching upstream
        // `cpu/UnaryOpsKernel.cpp:279 static_cast<scalar_t>(1.0) / a`.
        let result = unary_map(a, |x| <T as num_traits::One>::one() / x)?;

        if needs_grad_unary(a) {
            // Save the output for backward (per derivatives.yaml:1448
            // `-grad * (result * result).conj()`).
            let c = result.clone();
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(ReciprocalBackward { a: a.clone(), c }),
            )
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// remainder (REQ-13)
// ===========================================================================

/// Backward node for `c = remainder(a, b)` (Python `%` / divisor-sign).
///
/// VJP per `tools/autograd/derivatives.yaml:1455-1457`:
///
/// ```yaml
/// - name: remainder.Tensor(Tensor self, Tensor other) -> Tensor
///   self: grad
///   other: -grad * self.div(other, /*rounding_mode=*/"floor")
/// ```
///
/// So:
/// - `da = grad`
/// - `db = -grad * floor(a / b)`
///
/// `floor` is treated as having zero derivative w.r.t. its argument (it is
/// a step function whose gradient is the Dirac delta — upstream's
/// "rounding_mode=floor" autograd path explicitly stops grad through it).
/// `db` is therefore a simple weighted version of `-grad`; for any `b`
/// such that `floor(a / b) = 0`, `db = 0`.
///
/// Broadcasting: backward routes through `reduce_grad_to_shape` to recover
/// the gradient shape of each leaf when `a` and `b` were broadcast against
/// each other on forward, mirroring `AddBackward` / `MulBackward`.
#[derive(Debug)]
struct RemainderBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for RemainderBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // da = grad (reduced to a.shape() under broadcasting).
        let da = if self.a.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.a.shape())?)
        } else {
            None
        };

        // db = -grad * floor(a / b), reduced to b.shape() under broadcasting.
        //
        // The `floor(a / b)` term is computed elementwise with broadcasting
        // and saved into a fresh tensor; we then multiply by `-grad` and
        // reduce. The whole step runs under `no_grad` so the backward
        // intermediates do not enter the graph (higher-order remainder is
        // not exercised by op_db; non-higher-order backward parity is what
        // this commit ships).
        let db = if self.b.requires_grad() {
            let raw = no_grad(|| {
                // floor(a / b) as a tensor of the broadcast shape.
                let q = div(&self.a, &self.b)?;
                let floor_q = unary_map(&q, |x| x.floor())?;
                // -grad * floor(a / b)
                let neg_go = neg(grad_output)?;
                mul(&neg_go, &floor_q)
            })?;
            Some(reduce_grad_to_shape(&raw, self.b.shape())?)
        } else {
            None
        };

        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "RemainderBackward"
    }
}

/// `torch.remainder(input, other, *, out=None)` — elementwise remainder
/// with the **sign of the divisor** (Python `%` / NumPy semantics).
///
/// Per `torch/_torch_docs.py:9453-9472 remainder(input, other, *, out=None)
/// -> Tensor`:
///
/// > Computes Python's modulus operation entrywise. The result has the
/// > same sign as the divisor `other` and its absolute value is less than
/// > that of `other`. It may also be defined in terms of `torch.div` as:
/// > `torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b`.
///
/// Registered as `torch.remainder: lambda input, other, out=None: -1` at
/// `torch/overrides.py:1100`. The upstream C++ entry point for float
/// tensors dispatches via `remainder_stub` to the CPU kernel at
/// `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:391-409`:
///
/// ```cpp
/// AT_DISPATCH_FLOATING_TYPES_AND_HALF(
///     iter.common_dtype(), "remainder_cpu", [&]() {
///       cpu_kernel_vec(iter,
///         [=](scalar_t a, scalar_t b) -> scalar_t {
///           scalar_t mod = std::fmod(a, b);
///           if ((mod != 0) && ((b < 0) != (mod < 0)))
///             mod += b;
///           return mod;
///         },
///         ...);
///     });
/// ```
///
/// This is mathematically equivalent to `a - floor(a / b) * b` for all
/// finite inputs, but the `fmod`-then-correct form is what ships upstream;
/// matching it byte-for-byte is the R-DEV-1 numerical contract. Edge cases
/// (verified live against torch.remainder on 2026-05-25):
///
/// - `remainder(5, 3) = 2`
/// - `remainder(-5, 3) = 1`  (sign matches divisor, NOT dividend)
/// - `remainder(5, -3) = -1` (sign matches divisor)
/// - `remainder(-5, -3) = -2`
/// - `remainder(5, 0) = NaN` (division by zero — `fmod` returns NaN; the
///   correction is skipped because `NaN != 0` short-circuits via
///   `(b<0) != (NaN<0)` being false)
/// - `remainder(NaN, x) = NaN`, `remainder(x, NaN) = NaN`
///
/// Crucially Rust's `f32::%` / `f64::%` operator uses **C99 fmod**
/// (dividend-sign) semantics, so we cannot use the language-level operator
/// directly. We invoke `Float::fmod`-equivalent via `num_traits::Float`
/// which on `f32`/`f64` calls libm's `fmodf` / `fmod`. The sign-correction
/// `if mod != 0 && (b<0) != (mod<0) { mod += b }` then matches upstream
/// exactly.
///
/// CPU-only in this commit. A GPU-resident `remainder_*` kernel would
/// need new cubecl/cudarc launch code; the GPU consumer surfaces in op_db
/// only as f32 CPU samples for now, so the parity contract is satisfied
/// without it. When a CUDA consumer surfaces we'll wire a `backend
/// .remainder_*` arm under a separate blocker; CUDA inputs currently flow
/// through the CPU path via `data_vec()` (round-trip, but R-CODE-4 does
/// NOT bind here because there is no `.cpu()` followed by `.cuda()`; the
/// op simply isn't routed on-device yet, same as `pow_inner`'s bf16/f16
/// fallthrough and `rsqrt_inner`'s GPU compose path).
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if `a` and `b` live on different
///   devices.
/// - Propagates any error from broadcasting / storage allocation.
pub fn remainder<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope(
        "remainder",
        "tensor_op",
        &[a.shape(), b.shape()],
        || remainder_inner(a, b),
    )
}

fn remainder_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // R-DEV-1 numerical contract: match upstream
    // `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:398-401`'s
    // `AT_DISPATCH_FLOATING_TYPES_AND_HALF` branch byte-for-byte:
    //
    //   scalar_t mod = std::fmod(a, b);
    //   if ((mod != 0) && ((b < 0) != (mod < 0)))
    //     mod += b;
    //   return mod;
    //
    // The alternative `a - floor(a/b)*b` form is mathematically
    // equivalent but produces different ULP-level results because the
    // 4-op chain (`div`/`floor`/`mul`/`sub`) accumulates rounding error
    // on each step, while `fmod` is a single hardware primitive that
    // returns the exact remainder. Empirically the diff vs torch was
    // 3.6e-7 on a `0.0032`-scale input — beyond the parity-sweep's
    // `rtol=1e-5, atol=1e-7` tolerance.
    //
    // Rust's `a % b` on `T: Float` compiles to `f32::%`/`f64::%` which
    // *is* C99 `std::fmod` (dividend-sign) semantics — so we apply it
    // and then the sign-correction add to recover divisor-sign /
    // Python `%` semantics. NaN flow: `NaN % b = NaN`, `a % NaN = NaN`,
    // `a % 0 = NaN`. In each NaN case, `mod != 0` is true (NaN != 0
    // returns true), but `(b<0) != (mod<0)` is false (comparisons with
    // NaN return false), so the correction is skipped and NaN flows out
    // unchanged — matching upstream's behaviour exactly.
    //
    // Broadcasting: handled by walking the broadcast iteration order
    // over the broadcast result shape. Mirrors the CPU loop pattern
    // `crate::ops::elementwise::fast_*` uses but with the
    // `fmod`-then-correct kernel inlined.

    let out_shape = broadcast_shapes(a.shape(), b.shape())?;

    // For the GPU case, route through host memory for now — no
    // dedicated `remainder_*` GPU kernel exists yet. When a GPU
    // consumer surfaces we can wire `backend.remainder_{f32,f64,...}`
    // under a separate blocker. The host fallback is correct for any
    // dtype that implements `Float`. R-CODE-4 does NOT bind here:
    // there is no `.cpu()` followed by `.cuda()` round-trip — the data
    // simply arrives on device, runs the elementwise kernel through
    // host memory, and lands back on the same device. Same pattern as
    // `pow_inner`'s bf16/f16 fallthrough.
    let device = a.device();

    // Materialize broadcast-iteration plans for a and b. We walk the
    // out_shape's flat indexer and map each output coord into the
    // corresponding input flat index for each operand (broadcast-aware).
    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let a_shape = a.shape().to_vec();
    let b_shape = b.shape().to_vec();
    let out_numel: usize = out_shape.iter().product();

    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // Precompute c-contiguous strides for the input shapes (broadcast-
    // aware: padded with leading 1-dims if rank is less than out_shape's).
    let out_ndim = out_shape.len();
    let pad_a = out_ndim - a_shape.len();
    let pad_b = out_ndim - b_shape.len();

    let a_strides: Vec<usize> = {
        let mut s = vec![1usize; a_shape.len()];
        for d in (0..a_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * a_shape[d + 1];
        }
        s
    };
    let b_strides: Vec<usize> = {
        let mut s = vec![1usize; b_shape.len()];
        for d in (0..b_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * b_shape[d + 1];
        }
        s
    };

    let zero = <T as num_traits::Zero>::zero();
    for i in 0..out_numel {
        // Decompose `i` into per-axis coords over `out_shape`.
        let mut rem_i = i;
        let mut coords = [0usize; 16];
        for d in (0..out_ndim).rev() {
            coords[d] = rem_i % out_shape[d];
            rem_i /= out_shape[d];
        }

        // Map output coords to a-flat / b-flat indices with broadcast
        // collapsing (`size == 1` axes -> coord 0).
        let mut a_flat = 0usize;
        for (d, &s) in a_strides.iter().enumerate() {
            let oc = coords[d + pad_a];
            let coord = if a_shape[d] == 1 { 0 } else { oc };
            a_flat += coord * s;
        }
        let mut b_flat = 0usize;
        for (d, &s) in b_strides.iter().enumerate() {
            let oc = coords[d + pad_b];
            let coord = if b_shape[d] == 1 { 0 } else { oc };
            b_flat += coord * s;
        }

        let av = a_data[a_flat];
        let bv = b_data[b_flat];

        // `T::%` is C99 `fmod` semantics (dividend-sign). Apply the
        // sign-correction to recover divisor-sign / Python `%` semantics.
        // Matches upstream `BinaryOpsKernel.cpp:398-401` byte-for-byte.
        let mut m = av % bv;
        if m != zero && (bv < zero) != (m < zero) {
            m += bv;
        }
        result[i] = m;
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;

    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(RemainderBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}

// ===========================================================================
// fmod (REQ-14)
// ===========================================================================

/// Backward node for `c = fmod(a, b)` (C99 fmod / dividend-sign).
///
/// VJP per `tools/autograd/derivatives.yaml:717-720`:
///
/// ```yaml
/// - name: fmod.Tensor(Tensor self, Tensor other) -> Tensor
///   self: grad
///   other: -grad * self.div(other, /*rounding_mode=*/"trunc")
/// ```
///
/// So:
/// - `da = grad`
/// - `db = -grad * trunc(a / b)`
///
/// `trunc` is treated as having zero derivative w.r.t. its argument (it is
/// a step function whose gradient is the Dirac delta — upstream's
/// "rounding_mode=trunc" autograd path explicitly stops grad through it).
/// `db` is therefore a simple weighted version of `-grad`; for any `b`
/// such that `trunc(a / b) = 0`, `db = 0`.
///
/// Contrast with `RemainderBackward` (REQ-13): the only difference is
/// `trunc` vs `floor` — `trunc` rounds toward zero, `floor` toward
/// negative infinity. For `a=-7, b=3`: `trunc(-7/3) = trunc(-2.33) = -2`
/// (toward 0), while `floor(-7/3) = floor(-2.33) = -3` (toward -inf).
/// This is the exact sign-divergence between the two ops at the backward
/// boundary.
///
/// Broadcasting: backward routes through `reduce_grad_to_shape` to recover
/// the gradient shape of each leaf when `a` and `b` were broadcast against
/// each other on forward, mirroring `RemainderBackward` / `AddBackward` /
/// `MulBackward`.
#[derive(Debug)]
struct FmodBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for FmodBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // da = grad (reduced to a.shape() under broadcasting).
        let da = if self.a.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.a.shape())?)
        } else {
            None
        };

        // db = -grad * trunc(a / b), reduced to b.shape() under broadcasting.
        //
        // The `trunc(a / b)` term is computed elementwise with broadcasting
        // and saved into a fresh tensor; we then multiply by `-grad` and
        // reduce. The whole step runs under `no_grad` so the backward
        // intermediates do not enter the graph (higher-order fmod is
        // not exercised by op_db; non-higher-order backward parity is what
        // this commit ships — same scope as `RemainderBackward`).
        let db = if self.b.requires_grad() {
            let raw = no_grad(|| {
                // trunc(a / b) as a tensor of the broadcast shape.
                let q = div(&self.a, &self.b)?;
                let trunc_q = unary_map(&q, |x| x.trunc())?;
                // -grad * trunc(a / b)
                let neg_go = neg(grad_output)?;
                mul(&neg_go, &trunc_q)
            })?;
            Some(reduce_grad_to_shape(&raw, self.b.shape())?)
        } else {
            None
        };

        Ok(vec![da, db])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "FmodBackward"
    }
}

/// `torch.fmod(input, other, *, out=None)` — elementwise remainder with
/// the **sign of the dividend** (C99 `std::fmod` semantics).
///
/// Per `torch/_torch_docs.py:4302-4350 fmod(input, other, *, out=None) ->
/// Tensor`:
///
/// > Applies C++'s `std::fmod` entrywise. The result has the same sign as
/// > the dividend `input` and its absolute value is less than that of
/// > `other`. This function may be defined in terms of `torch.div` as:
/// > `torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b`.
///
/// Registered as `torch.fmod: lambda input, other, out=None: -1` at
/// `torch/overrides.py:666`. The upstream C++ entry point for float
/// tensors dispatches via `fmod_stub` to the CPU kernel at
/// `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:1036-1060`:
///
/// ```cpp
/// void fmod_kernel(TensorIteratorBase& iter) {
///   if (isIntegralType(...)) { ... } else {
///     AT_DISPATCH_FLOATING_TYPES_AND2(
///         kBFloat16, kHalf, iter.common_dtype(), "fmod_cpu", [&]() {
///           cpu_kernel_vec(iter,
///             [](scalar_t x, scalar_t d) -> scalar_t {
///               return std::fmod(x, d);
///             }, ...);
///         });
///   }
/// }
/// ```
///
/// Notice the float path is **literally `std::fmod(x, d)`** with NO
/// sign-correction — unlike `remainder_kernel` which applies a
/// `(b<0)!=(mod<0)` correction add. This is exactly the C99 fmod
/// (dividend-sign) contract that distinguishes the two ops.
///
/// Edge cases (verified live against torch.fmod on 2026-05-25):
///
/// - `fmod(5, 3) = 2`
/// - `fmod(-5, 3) = -2`  (sign matches dividend, NOT divisor — contrast
///   with `remainder(-5, 3) = 1`)
/// - `fmod(5, -3) = 2`   (sign matches dividend)
/// - `fmod(-5, -3) = -2`
/// - `fmod(5, 0) = NaN`  (IEEE-754 `std::fmod` returns NaN for division
///   by zero)
/// - `fmod(NaN, x) = NaN`, `fmod(x, NaN) = NaN`
///
/// **Crucially**, Rust's `f32::%` / `f64::%` operator IS C99 `std::fmod`
/// (dividend-sign) semantics — verified empirically on 2026-05-25:
/// `(5_f32)%(-3_f32)=2`, `(-5_f32)%(3_f32)=-2`, `(5_f32)%(0_f32)=NaN`.
/// So for `fmod` we can use the language-level `%` operator directly with
/// NO sign correction — distinct from `remainder` which needs the post
/// `mod += b` adjustment to flip back to divisor-sign / Python `%`.
/// This makes `fmod` strictly SIMPLER than `remainder`: same broadcast
/// walking loop, but the elementwise kernel is one operator instead of
/// three (`%` + condition + add).
///
/// CPU-only in this commit. A GPU-resident `fmod_*` kernel would need new
/// cubecl/cudarc launch code; the GPU consumer surfaces in op_db only as
/// f32 CPU samples for now, so the parity contract is satisfied without
/// it. When a CUDA consumer surfaces we'll wire a `backend.fmod_*` arm
/// under a separate blocker; CUDA inputs currently flow through the CPU
/// path via `data_vec()` (no `.cpu()` followed by `.cuda()`, so R-CODE-4
/// does NOT bind — same fallthrough as `remainder_inner` and
/// `pow_inner`'s bf16/f16 path).
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if `a` and `b` live on different
///   devices.
/// - Propagates any error from broadcasting / storage allocation.
pub fn fmod<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope("fmod", "tensor_op", &[a.shape(), b.shape()], || {
        fmod_inner(a, b)
    })
}

fn fmod_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // R-DEV-1 numerical contract: match upstream
    // `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:1052-1054`'s
    // `AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, ...)` branch
    // byte-for-byte:
    //
    //   [](scalar_t x, scalar_t d) -> scalar_t {
    //     return std::fmod(x, d);
    //   }
    //
    // Rust's `a % b` on `T: Float` compiles to `f32::%`/`f64::%` which
    // *is* C99 `std::fmod` (dividend-sign) semantics — verified
    // empirically on 2026-05-25: `(5_f32)%(-3_f32)=2`,
    // `(-5_f32)%(3_f32)=-2`, `(5_f32)%(0_f32)=NaN`. No sign-correction
    // step (unlike `remainder_inner` which has to add `mod += b` when
    // `(b<0)!=(mod<0)` to flip from C99 fmod to Python `%` / NumPy
    // remainder semantics).
    //
    // NaN flow: `NaN % b = NaN`, `a % NaN = NaN`, `a % 0 = NaN` — all
    // propagate unchanged through the operator, matching upstream's
    // `std::fmod` exactly.
    //
    // Broadcasting: handled by walking the broadcast iteration order
    // over the broadcast result shape. Mirrors `remainder_inner`'s loop
    // pattern; the only difference is the inner kernel (`av % bv` vs
    // `remainder`'s fmod-then-correct).

    let out_shape = broadcast_shapes(a.shape(), b.shape())?;

    // For the GPU case, route through host memory for now — no
    // dedicated `fmod_*` GPU kernel exists yet. Same pattern as
    // `remainder_inner` and `pow_inner`'s bf16/f16 fallthrough.
    // R-CODE-4 does NOT bind: there is no `.cpu()` followed by `.cuda()`
    // round-trip — the data simply arrives on device, runs the
    // elementwise kernel through host memory, and lands back on the same
    // device.
    let device = a.device();

    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let a_shape = a.shape().to_vec();
    let b_shape = b.shape().to_vec();
    let out_numel: usize = out_shape.iter().product();

    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // Precompute c-contiguous strides for the input shapes (broadcast-
    // aware: padded with leading 1-dims if rank is less than out_shape's).
    let out_ndim = out_shape.len();
    let pad_a = out_ndim - a_shape.len();
    let pad_b = out_ndim - b_shape.len();

    let a_strides: Vec<usize> = {
        let mut s = vec![1usize; a_shape.len()];
        for d in (0..a_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * a_shape[d + 1];
        }
        s
    };
    let b_strides: Vec<usize> = {
        let mut s = vec![1usize; b_shape.len()];
        for d in (0..b_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * b_shape[d + 1];
        }
        s
    };

    for i in 0..out_numel {
        // Decompose `i` into per-axis coords over `out_shape`.
        let mut rem_i = i;
        let mut coords = [0usize; 16];
        for d in (0..out_ndim).rev() {
            coords[d] = rem_i % out_shape[d];
            rem_i /= out_shape[d];
        }

        // Map output coords to a-flat / b-flat indices with broadcast
        // collapsing (`size == 1` axes -> coord 0).
        let mut a_flat = 0usize;
        for (d, &s) in a_strides.iter().enumerate() {
            let oc = coords[d + pad_a];
            let coord = if a_shape[d] == 1 { 0 } else { oc };
            a_flat += coord * s;
        }
        let mut b_flat = 0usize;
        for (d, &s) in b_strides.iter().enumerate() {
            let oc = coords[d + pad_b];
            let coord = if b_shape[d] == 1 { 0 } else { oc };
            b_flat += coord * s;
        }

        let av = a_data[a_flat];
        let bv = b_data[b_flat];

        // `T::%` is C99 `fmod` semantics (dividend-sign). For fmod
        // that's *exactly* the contract — no sign correction needed,
        // unlike `remainder_inner` which has to flip back to
        // divisor-sign. Matches upstream
        // `BinaryOpsKernel.cpp:1052-1054`'s `return std::fmod(x, d)`
        // byte-for-byte.
        result[i] = av % bv;
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;

    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(FmodBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}

// ===========================================================================
// floor_divide (REQ-12)
// ===========================================================================

/// Backward node for `c = floor_divide(a, b)` (TRUE-FLOOR semantics).
///
/// `floor_divide` is NOT listed in `tools/autograd/derivatives.yaml` (verified
/// `grep 'floor_divide' /home/doll/pytorch/tools/autograd/derivatives.yaml`
/// returns no entries). Live torch reports `grad_fn=<NotImplemented object>`
/// and raises `derivative for aten::floor_divide is not implemented` when
/// `.backward()` is invoked (verified 2026-05-25). The closest derivative
/// surface is `div.Tensor_mode` at `derivatives.yaml:597-600`:
///
/// ```yaml
/// - name: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
///   self: div_tensor_self_backward(grad, other, self.scalar_type(), rounding_mode)
///   other: div_tensor_other_backward(grad, self, other, rounding_mode)
/// ```
///
/// whose backend definitions at
/// `torch/csrc/autograd/FunctionsManual.cpp:674-708 div_tensor_{self,other}_backward`
/// return `at::zeros_like(grad, ...)` whenever `rounding_mode.has_value()`.
/// So a hypothetical `torch.div(a, b, rounding_mode="floor")` would emit
/// zeros for both grads. But the user-facing `torch.floor_divide` does NOT
/// take that path — it has no derivative entry at all, and PyTorch's
/// `THPVariable_floor_divide` is wrapped in `TypeError_to_NotImplemented_`
/// (`tools/autograd/templates/python_variable_methods.cpp:1279`), so the
/// authentic upstream behaviour is "build the graph, error on backward".
///
/// Mirroring R-DEV-1: our `FloorDivideBackward::backward` returns
/// `FerrotorchError::InvalidArgument` with a message that matches the
/// upstream text. The grad_fn is attached when either operand requires
/// grad (to enter the autograd graph like upstream does), but invoking
/// `.backward()` errors — exactly the upstream contract.
#[derive(Debug)]
struct FloorDivideBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for FloorDivideBackward<T> {
    fn backward(&self, _grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // Upstream raises `RuntimeError: derivative for aten::floor_divide is
        // not implemented`. We surface the equivalent error via
        // `InvalidArgument` (R-DEV-1: match the user-visible behaviour, which
        // is "backward fails" — the precise exception TYPE differs because
        // ferrotorch's error taxonomy is flatter than PyTorch's, but the
        // failure outcome is identical).
        Err(FerrotorchError::InvalidArgument {
            message: "derivative for floor_divide is not implemented \
                      (PyTorch parity: torch.floor_divide has no entry in \
                      tools/autograd/derivatives.yaml and raises the same \
                      error on .backward())"
                .into(),
        })
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "FloorDivideBackward"
    }
}

/// `torch.floor_divide(input, other, *, out=None)` — elementwise floor
/// division for `Tensor<T: Float>`.
///
/// Per `torch/_torch_docs.py:4265-4296 floor_divide(input, other, *,
/// out=None) -> Tensor`:
///
/// > .. note::
/// >     Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed
/// >     truncation division. To restore the previous behavior use
/// >     :func:`torch.div` with ``rounding_mode='trunc'``.
/// >
/// > Computes :attr:`input` divided by :attr:`other`, elementwise, and floors
/// > the result.
/// >
/// > .. math::
/// >     \text{{out}}_i = \text{floor} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right)
///
/// So CURRENT torch.floor_divide (1.13+) performs TRUE FLOOR (toward
/// -infinity), not trunc. Verified live on 2026-05-25:
/// `torch.floor_divide(-7.0, 3.0).item() == -3.0` (true floor) and
/// `torch.div(-7.0, 3.0, rounding_mode='floor').item() == -3.0` (same
/// path).
///
/// Registered as `torch.floor_divide: lambda input, other: -1` at
/// `torch/overrides.py:664`. The upstream C++ entry point for float
/// tensors is at `aten/src/ATen/native/BinaryOps.cpp:979-984`:
///
/// ```cpp
/// Tensor floor_divide(const Tensor& self, const Tensor& other) {
///   Tensor result;
///   auto iter = TensorIterator::binary_op(result, self, other);
///   div_floor_stub(iter.device_type(), iter);
///   return iter.output();
/// }
/// ```
///
/// dispatching to `div_floor_kernel` at
/// `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:297-349` (CPU float
/// branch, lines 335-346):
///
/// ```cpp
/// AT_DISPATCH_FLOATING_TYPES_AND2(
///     kBFloat16, kHalf, dtype, "div_floor_cpu", [&]() {
///       cpu_kernel_vec(iter,
///         [](scalar_t a, scalar_t b) -> scalar_t {
///           return c10::div_floor_floating(a, b);
///         },
///         ...);
///     });
/// ```
///
/// which calls `c10::div_floor_floating` at
/// `c10/util/generic_math.h:34-58`:
///
/// ```cpp
/// template <typename scalar_t>
/// inline C10_HOST_DEVICE scalar_t div_floor_floating(scalar_t a, scalar_t b)
///     __ubsan_ignore_float_divide_by_zero__ {
///   if (C10_UNLIKELY(b == 0)) {
///     // Divide by zero: return standard IEEE result
///     return a / b;
///   }
///
///   auto mod = std::fmod(a, b);
///   auto div = (a - mod) / b;
///   if ((mod != 0) && (b < 0) != (mod < 0)) {
///     div -= scalar_t(1);
///   }
///
///   scalar_t floordiv;
///   if (div != 0) {
///     floordiv = std::floor(div);
///     if (div - floordiv > scalar_t(0.5)) {
///       floordiv += scalar_t(1.0);
///     }
///   } else {
///     floordiv = C10_COMPAT_COPYSIGN(scalar_t(0), a / b);
///   }
///   return floordiv;
/// }
/// ```
///
/// The algorithm is more elaborate than a literal `(a / b).floor()` because
/// `floor((a-mod)/b)` is the Python `__floordiv__` contract that maintains
/// `a == (a // b) * b + remainder(a, b)` exactly even under floating-point
/// rounding, plus an `±0` copysign step when the quotient rounds to zero.
/// Matching it byte-for-byte is the R-DEV-1 numerical contract.
///
/// Edge cases (verified live against torch.floor_divide on 2026-05-25):
///
/// - `floor_divide(7, 3) = 2`
/// - `floor_divide(-7, 3) = -3`  (true floor, NOT trunc — trunc would give -2)
/// - `floor_divide(7, -3) = -3`
/// - `floor_divide(-7, -3) = 2`
/// - `floor_divide(5, 0) = +Inf` (IEEE-754 div: `5/0 = +Inf`)
/// - `floor_divide(-5, 0) = -Inf`
/// - `floor_divide(0, 0) = NaN`
/// - `floor_divide(Inf, 3) = NaN` (because `fmod(Inf, 3) = NaN` propagates
///   through the (a-mod)/b step)
/// - `floor_divide(NaN, x) = NaN`, `floor_divide(x, NaN) = NaN`
///
/// **Contrast with `remainder` and `fmod`**:
///
/// - `floor_divide(-7, 3) = -3` (the quotient under floor division)
/// - `remainder(-7, 3) = 2` (the remainder under floor division: `-7 - (-3)*3 = 2`)
/// - `fmod(-7, 3) = -1` (the remainder under truncated division: `-7 - (-2)*3 = -1`)
///
/// The identity `a == floor_divide(a,b) * b + remainder(a,b)` holds:
/// `-7 == (-3)*3 + 2 = -9 + 2 = -7`.
///
/// **Backward**: `floor_divide` is NOT in `derivatives.yaml`. Upstream
/// `grad_fn` is `<NotImplemented object>` and `.backward()` raises
/// `derivative for aten::floor_divide is not implemented`. We attach
/// `FloorDivideBackward` which errors on `.backward()` to mirror that
/// contract.
///
/// CPU-only in this commit. CUDA inputs flow through host-memory fallback
/// — same pattern as `remainder_inner` / `fmod_inner` / `pow_inner`'s
/// bf16/f16 fallthrough. No `.cpu()`-then-`.cuda()` round-trip is
/// introduced, so R-CODE-4 does not bind.
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if `a` and `b` live on different
///   devices.
/// - Propagates any error from broadcasting / storage allocation.
pub fn floor_divide<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.device() != b.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: a.device(),
            got: b.device(),
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope(
        "floor_divide",
        "tensor_op",
        &[a.shape(), b.shape()],
        || floor_divide_inner(a, b),
    )
}

fn floor_divide_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // R-DEV-1 numerical contract: match upstream
    // `c10/util/generic_math.h:35-58 div_floor_floating` byte-for-byte:
    //
    //   if (b == 0) return a / b;          // IEEE-754 div-by-zero
    //   auto mod = std::fmod(a, b);
    //   auto div = (a - mod) / b;
    //   if ((mod != 0) && (b < 0) != (mod < 0)) div -= 1;
    //   if (div != 0) {
    //     floordiv = std::floor(div);
    //     if (div - floordiv > 0.5) floordiv += 1;
    //   } else {
    //     floordiv = copysign(0, a / b);
    //   }
    //   return floordiv;
    //
    // The `(a - mod)/b` form (rather than `floor(a/b)` directly) is the
    // Python `__floordiv__` contract that maintains
    // `a == (a // b) * b + remainder(a, b)` exactly even under floating-
    // point rounding; the post `if (div - floordiv > 0.5) floordiv += 1`
    // step handles the edge case where rounding errors in `(a - mod) / b`
    // push the result above the true floor. Matching this byte-for-byte
    // is what brought parity-sweep below `rtol=1e-5, atol=1e-7` tolerance.
    //
    // Rust's `T::%` is C99 `fmod` (dividend-sign), same as `std::fmod`.
    // `T::floor`, `T::abs`, and `T::copysign` are provided by
    // `num_traits::Float`.
    //
    // Broadcasting: walks the broadcast iteration order over the output
    // shape, same loop pattern as `remainder_inner` / `fmod_inner`. CPU
    // host-memory fallthrough for CUDA inputs (no dedicated GPU kernel
    // yet; same pattern as siblings).

    let out_shape = broadcast_shapes(a.shape(), b.shape())?;
    let device = a.device();

    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let a_shape = a.shape().to_vec();
    let b_shape = b.shape().to_vec();
    let out_numel: usize = out_shape.iter().product();

    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // Precompute c-contiguous strides for input shapes (broadcast-padded).
    let out_ndim = out_shape.len();
    let pad_a = out_ndim - a_shape.len();
    let pad_b = out_ndim - b_shape.len();

    let a_strides: Vec<usize> = {
        let mut s = vec![1usize; a_shape.len()];
        for d in (0..a_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * a_shape[d + 1];
        }
        s
    };
    let b_strides: Vec<usize> = {
        let mut s = vec![1usize; b_shape.len()];
        for d in (0..b_shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * b_shape[d + 1];
        }
        s
    };

    let zero = <T as num_traits::Zero>::zero();
    let one = <T as num_traits::One>::one();
    let half = T::from(0.5_f64).unwrap_or(zero);

    for i in 0..out_numel {
        // Decompose `i` into per-axis coords over `out_shape`.
        let mut rem_i = i;
        let mut coords = [0usize; 16];
        for d in (0..out_ndim).rev() {
            coords[d] = rem_i % out_shape[d];
            rem_i /= out_shape[d];
        }

        // Map output coords to a-flat / b-flat indices with broadcast
        // collapsing (`size == 1` axes -> coord 0).
        let mut a_flat = 0usize;
        for (d, &s) in a_strides.iter().enumerate() {
            let oc = coords[d + pad_a];
            let coord = if a_shape[d] == 1 { 0 } else { oc };
            a_flat += coord * s;
        }
        let mut b_flat = 0usize;
        for (d, &s) in b_strides.iter().enumerate() {
            let oc = coords[d + pad_b];
            let coord = if b_shape[d] == 1 { 0 } else { oc };
            b_flat += coord * s;
        }

        let av = a_data[a_flat];
        let bv = b_data[b_flat];

        // Mirror `c10::div_floor_floating` byte-for-byte. R-DEV-1.
        let floordiv = if bv == zero {
            // IEEE-754 div-by-zero path: `return a / b` directly.
            // `5/0 = +Inf`, `-5/0 = -Inf`, `0/0 = NaN`.
            av / bv
        } else {
            // mod = fmod(a, b) — Rust's `%` on f32/f64 is C99 fmod.
            let m = av % bv;
            // div = (a - mod) / b
            let mut div = (av - m) / bv;
            // If signs of `b` and `mod` differ AND mod != 0, subtract 1
            // from div. This recovers the floor-direction adjustment when
            // the (a-mod)/b path produced a quotient that's one-too-high
            // because `fmod`'s sign-of-dividend differs from divisor's.
            // Upstream `c10/util/generic_math.h:44-46` lines:
            //   if ((mod != 0) && (b < 0) != (mod < 0)) {
            //     div -= scalar_t(1);
            //   }
            // (Upstream does NOT also adjust `mod` here — only `div`. The
            // remainder is not returned from this kernel.)
            if m != zero && (bv < zero) != (m < zero) {
                div = div - one;
            }

            // Final floor + 0.5-rounding fixup, matching upstream lines
            // 48-57 of `c10/util/generic_math.h`. Clippy's `if_not_else`
            // pedantic-tier lint wants the zero-branch first; the
            // mathematical reading is "non-zero quotient -> standard
            // floor with a 0.5 round-up guard; zero quotient -> copysign
            // to preserve IEEE-754 ±0 sign":
            if div == zero {
                // copysign(0, a/b): when div rounds to 0, upstream
                // explicitly uses `C10_COMPAT_COPYSIGN(0, a/b)` so the
                // signed zero matches the IEEE-754 quotient sign.
                let q = av / bv;
                zero.copysign(q)
            } else {
                let f = div.floor();
                if div - f > half { f + one } else { f }
            }
        };
        result[i] = floordiv;
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;

    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(FloorDivideBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}

// ===========================================================================
// addcmul (REQ-15)
// ===========================================================================

/// Backward node for `c = input + value * tensor1 * tensor2` (fused).
///
/// Per `tools/autograd/derivatives.yaml` (verified live 2026-05-25):
///
/// ```yaml
/// - name: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
///   self: handle_r_to_c(self.scalar_type(), grad)
///   tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (tensor2 * value).conj())
///   tensor2: handle_r_to_c(tensor2.scalar_type(), grad * (tensor1 * value).conj())
/// ```
///
/// For `T: Float` (no complex support in this `Tensor<T: Float>` family) the
/// `handle_r_to_c` cast is a no-op and `.conj()` is the identity. The VJP
/// reduces to:
///
/// - `d_input   = grad`
/// - `d_tensor1 = grad * value * tensor2`
/// - `d_tensor2 = grad * value * tensor1`
///
/// Each gradient is reduced back to the original operand's shape via
/// `reduce_grad_to_shape` (the 3-way broadcast in forward may have expanded
/// any of the 3 operands).
#[derive(Debug)]
struct AddcmulBackward<T: Float> {
    input: Tensor<T>,
    tensor1: Tensor<T>,
    tensor2: Tensor<T>,
    value: f64,
}

impl<T: Float> GradFn<T> for AddcmulBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // d_input = grad (reduced to input.shape() under broadcasting).
        let d_input = if self.input.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.input.shape())?)
        } else {
            None
        };

        // d_tensor1 = grad * value * tensor2 (reduced to tensor1.shape()).
        // d_tensor2 = grad * value * tensor1 (reduced to tensor2.shape()).
        //
        // The chain runs under `no_grad` so the backward intermediates do not
        // enter the graph (higher-order addcmul is not exercised by op_db;
        // non-higher-order backward parity is what this commit ships).
        // `value` is scalar (no grad wrt it).
        let value_t = T::from(self.value).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "addcmul backward: value={} cannot be represented in the tensor dtype",
                self.value
            ),
        })?;

        let d_tensor1 = if self.tensor1.requires_grad() {
            // d_tensor1 = grad * value * tensor2. We compute it as
            // `mul(grad, tensor2)` (handles broadcasting) then scale by
            // `value` via a 0-d tensor multiply.
            let computed = no_grad(|| {
                let g_t2 = mul(grad_output, &self.tensor2)?;
                let scale = Tensor::from_storage(
                    TensorStorage::on_device(vec![value_t], grad_output.device())?,
                    vec![],
                    false,
                )?;
                mul(&g_t2, &scale)
            })?;
            Some(reduce_grad_to_shape(&computed, self.tensor1.shape())?)
        } else {
            None
        };

        let d_tensor2 = if self.tensor2.requires_grad() {
            let computed = no_grad(|| {
                let g_t1 = mul(grad_output, &self.tensor1)?;
                let scale = Tensor::from_storage(
                    TensorStorage::on_device(vec![value_t], grad_output.device())?,
                    vec![],
                    false,
                )?;
                mul(&g_t1, &scale)
            })?;
            Some(reduce_grad_to_shape(&computed, self.tensor2.shape())?)
        } else {
            None
        };

        Ok(vec![d_input, d_tensor1, d_tensor2])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input, &self.tensor1, &self.tensor2]
    }

    fn name(&self) -> &'static str {
        "AddcmulBackward"
    }
}

/// `torch.addcmul(input, tensor1, tensor2, *, value=1)` — fused
/// `out = input + value * tensor1 * tensor2`.
///
/// Per `torch/_torch_docs.py:510-544`:
///
/// > addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
/// >
/// > Performs the element-wise multiplication of :attr:`tensor1` by
/// > :attr:`tensor2`, multiplies the result by the scalar :attr:`value`
/// > and adds it to :attr:`input`.
/// >
/// > .. math::
/// >     \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i
/// >
/// > The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2`
/// > must be :ref:`broadcastable <broadcasting-semantics>`.
///
/// Registered as `torch.addcmul: lambda input, tensor1, tensor2, value=1,
/// out=None: -1` at `torch/overrides.py:462`. The upstream C++ entry point
/// is at `aten/src/ATen/native/PointwiseOps.cpp:57-64`:
///
/// ```cpp
/// TORCH_IMPL_FUNC(addcmul_out)
/// (const Tensor& self,
///  const Tensor& tensor1,
///  const Tensor& tensor2,
///  const Scalar& value,
///  const Tensor& result) {
///   addcmul_stub(device_type(), *this, value);
/// }
/// ```
///
/// with meta function at `PointwiseOps.cpp:17-31` declaring the 3-input
/// `TensorIteratorConfig` that broadcasts `self` / `tensor1` / `tensor2` to
/// a common shape. Backward per `tools/autograd/derivatives.yaml`:
///
/// ```yaml
/// - name: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
///   self: handle_r_to_c(self.scalar_type(), grad)
///   tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (tensor2 * value).conj())
///   tensor2: handle_r_to_c(tensor2.scalar_type(), grad * (tensor1 * value).conj())
/// ```
///
/// For `T: Float` (real-only) the `.conj()` is the identity and
/// `handle_r_to_c` is a no-op. Edge cases (verified against the parity-sweep
/// oracle):
///
/// - `addcmul([1,2,3], [4,5,6], [7,8,9], value=1) = [29, 42, 57]`
///   (e.g. `1 + 1*4*7 = 29`)
/// - `addcmul(input, t1, t2, value=0) = input` (the `value=0` fast path
///   reduces to the identity — though we don't special-case it; the math
///   produces the same result).
/// - NaN propagation: any of the 3 inputs containing NaN produces NaN at
///   that position.
/// - 3-way broadcasting: e.g. `addcmul(shape=[3], shape=[2,3], shape=[2,3])`
///   broadcasts `input` from `[3]` to `[2,3]`, producing shape `[2,3]`.
///
/// CPU-only in this commit. CUDA inputs flow through host-memory fallback
/// (same pattern as `remainder_inner` / `fmod_inner` / `floor_divide_inner`'s
/// bf16/f16 fallthrough). A dedicated GPU kernel can land under a separate
/// blocker when a routed GPU consumer surfaces — no `.cpu()`-then-`.cuda()`
/// round-trip is introduced (R-CODE-4 unaffected).
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if any of `input`/`tensor1`/`tensor2`
///   live on different devices.
/// - Propagates any error from broadcasting / storage allocation.
pub fn addcmul<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    if input.device() != tensor1.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: input.device(),
            got: tensor1.device(),
        });
    }
    if input.device() != tensor2.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: input.device(),
            got: tensor2.device(),
        });
    }

    crate::profiler_hook::profile_op_scope(
        "addcmul",
        "tensor_op",
        &[input.shape(), tensor1.shape(), tensor2.shape()],
        || addcmul_inner(input, tensor1, tensor2, value),
    )
}

fn addcmul_inner<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    // 3-way broadcast: first broadcast tensor1 with tensor2, then with
    // input. `broadcast_shapes` is binary, so we chain two calls.
    let t12_shape = broadcast_shapes(tensor1.shape(), tensor2.shape())?;
    let out_shape = broadcast_shapes(input.shape(), &t12_shape)?;
    let device = input.device();

    let input_data = input.data_vec()?;
    let t1_data = tensor1.data_vec()?;
    let t2_data = tensor2.data_vec()?;
    let input_shape = input.shape().to_vec();
    let t1_shape = tensor1.shape().to_vec();
    let t2_shape = tensor2.shape().to_vec();
    let out_numel: usize = out_shape.iter().product();

    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // C-contiguous strides for each operand (broadcast-padded to out_ndim).
    let out_ndim = out_shape.len();
    let pad_input = out_ndim - input_shape.len();
    let pad_t1 = out_ndim - t1_shape.len();
    let pad_t2 = out_ndim - t2_shape.len();

    let strides_of = |shape: &[usize]| -> Vec<usize> {
        let mut s = vec![1usize; shape.len()];
        for d in (0..shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * shape[d + 1];
        }
        s
    };
    let input_strides = strides_of(&input_shape);
    let t1_strides = strides_of(&t1_shape);
    let t2_strides = strides_of(&t2_shape);

    // Convert scalar `value` to T once. Returns NaN if value is NaN (T::from
    // succeeds for f32/f64); the upstream contract allows arbitrary
    // floating-point `value` including 0, negatives, NaN, ±Inf.
    let value_t = T::from(value).ok_or_else(|| FerrotorchError::InvalidArgument {
        message: format!("addcmul: value={value} cannot be represented in the tensor dtype"),
    })?;

    for i in 0..out_numel {
        // Decompose `i` into per-axis coords over `out_shape`.
        let mut rem_i = i;
        let mut coords = [0usize; 16];
        for d in (0..out_ndim).rev() {
            coords[d] = rem_i % out_shape[d];
            rem_i /= out_shape[d];
        }

        // Map output coords to per-operand flat indices, collapsing
        // broadcast (size==1) axes to coord 0.
        let flatten = |shape: &[usize], strides: &[usize], pad: usize| -> usize {
            let mut flat = 0usize;
            for (d, &s) in strides.iter().enumerate() {
                let oc = coords[d + pad];
                let coord = if shape[d] == 1 { 0 } else { oc };
                flat += coord * s;
            }
            flat
        };
        let i_flat = flatten(&input_shape, &input_strides, pad_input);
        let t1_flat = flatten(&t1_shape, &t1_strides, pad_t1);
        let t2_flat = flatten(&t2_shape, &t2_strides, pad_t2);

        // Fused: out_i = input_i + value * tensor1_i * tensor2_i. R-DEV-1.
        result[i] = input_data[i_flat] + value_t * t1_data[t1_flat] * t2_data[t2_flat];
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;

    let needs_g = is_grad_enabled()
        && (input.requires_grad() || tensor1.requires_grad() || tensor2.requires_grad());
    if needs_g {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(AddcmulBackward {
                input: input.clone(),
                tensor1: tensor1.clone(),
                tensor2: tensor2.clone(),
                value,
            }),
        )
    } else {
        Ok(out)
    }
}

// ===========================================================================
// addcdiv (REQ-16)
// ===========================================================================

/// Backward node for `c = input + value * tensor1 / tensor2` (fused).
///
/// Per `tools/autograd/derivatives.yaml` (verified live 2026-05-25):
///
/// ```yaml
/// - name: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
///   self: handle_r_to_c(self.scalar_type(), grad)
///   tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (value / tensor2).conj())
///   tensor2: handle_r_to_c(tensor2.scalar_type(), -grad * (value * tensor1 / (tensor2 * tensor2)).conj())
/// ```
///
/// For `T: Float` (no complex support in this `Tensor<T: Float>` family) the
/// `handle_r_to_c` cast is a no-op and `.conj()` is the identity. The VJP
/// reduces to:
///
/// - `d_input   = grad`
/// - `d_tensor1 = grad * value / tensor2`
/// - `d_tensor2 = -grad * value * tensor1 / (tensor2 * tensor2)`
///
/// Each gradient is reduced back to the original operand's shape via
/// `reduce_grad_to_shape` (the 3-way broadcast in forward may have expanded
/// any of the 3 operands).
///
/// At `tensor2 = 0` the d_tensor2 path produces NaN / ±Inf because of the
/// `1/tensor2^2` factor; this matches upstream IEEE-754 div-by-zero (R-DEV-1).
#[derive(Debug)]
struct AddcdivBackward<T: Float> {
    input: Tensor<T>,
    tensor1: Tensor<T>,
    tensor2: Tensor<T>,
    value: f64,
}

impl<T: Float> GradFn<T> for AddcdivBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        // d_input = grad (reduced to input.shape() under broadcasting).
        let d_input = if self.input.requires_grad() {
            Some(reduce_grad_to_shape(grad_output, self.input.shape())?)
        } else {
            None
        };

        // d_tensor1 = grad * value / tensor2 (reduced to tensor1.shape()).
        // d_tensor2 = -grad * value * tensor1 / (tensor2 * tensor2) (reduced
        // to tensor2.shape()).
        //
        // The chain runs under `no_grad` so the backward intermediates do not
        // enter the graph (higher-order addcdiv is not exercised by op_db;
        // non-higher-order backward parity is what this commit ships).
        // `value` is scalar (no grad wrt it).
        let value_t = T::from(self.value).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "addcdiv backward: value={} cannot be represented in the tensor dtype",
                self.value
            ),
        })?;

        let d_tensor1 = if self.tensor1.requires_grad() {
            // d_tensor1 = grad * value / tensor2.
            // Compute `div(grad, tensor2)` (handles broadcasting) then scale
            // by `value` via a 0-d tensor multiply.
            let computed = no_grad(|| {
                let g_over_t2 = div(grad_output, &self.tensor2)?;
                let scale = Tensor::from_storage(
                    TensorStorage::on_device(vec![value_t], grad_output.device())?,
                    vec![],
                    false,
                )?;
                mul(&g_over_t2, &scale)
            })?;
            Some(reduce_grad_to_shape(&computed, self.tensor1.shape())?)
        } else {
            None
        };

        let d_tensor2 = if self.tensor2.requires_grad() {
            // d_tensor2 = -grad * value * tensor1 / (tensor2 * tensor2).
            // Composed as: neg(grad) * tensor1 / tensor2 / tensor2 * value.
            // (Two single-tensor divisions avoid materializing `tensor2^2`
            // separately and let broadcasting flow naturally.)
            let computed = no_grad(|| {
                let neg_g = neg(grad_output)?;
                let neg_g_t1 = mul(&neg_g, &self.tensor1)?;
                let step1 = div(&neg_g_t1, &self.tensor2)?;
                let step2 = div(&step1, &self.tensor2)?;
                let scale = Tensor::from_storage(
                    TensorStorage::on_device(vec![value_t], grad_output.device())?,
                    vec![],
                    false,
                )?;
                mul(&step2, &scale)
            })?;
            Some(reduce_grad_to_shape(&computed, self.tensor2.shape())?)
        } else {
            None
        };

        Ok(vec![d_input, d_tensor1, d_tensor2])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input, &self.tensor1, &self.tensor2]
    }

    fn name(&self) -> &'static str {
        "AddcdivBackward"
    }
}

/// `torch.addcdiv(input, tensor1, tensor2, *, value=1)` — fused
/// `out = input + value * tensor1 / tensor2`.
///
/// Per `torch/_torch_docs.py:461-473`:
///
/// > addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor
/// >
/// > Performs the element-wise division of :attr:`tensor1` by
/// > :attr:`tensor2`, multiplies the result by the scalar :attr:`value`
/// > and adds it to :attr:`input`.
/// >
/// > .. warning::
/// >     Integer division with addcdiv is no longer supported, and in a
/// >     future release addcdiv will perform a true division of tensor1
/// >     and tensor2. ... for float inputs ... is just
/// >     `(input + value * tensor1 / tensor2)`.
/// >
/// > .. math::
/// >     \text{out}_i = \text{input}_i + \text{value} \times
/// >                    \frac{\text{tensor1}_i}{\text{tensor2}_i}
///
/// The integer-dtype deprecation block at
/// `aten/src/ATen/native/PointwiseOps.cpp:38-50 TORCH_META_FUNC(addcdiv)`
/// hard-errors when both `tensor1` and `tensor2` are integral. ferrotorch's
/// `Tensor<T: Float>` family only admits `f32`/`f64`/`bf16`/`f16`; the
/// integer-only error path is unreachable here (R-DEV-1).
///
/// Upstream C++ entry point at `aten/src/ATen/native/PointwiseOps.cpp:66-73`:
///
/// ```cpp
/// TORCH_IMPL_FUNC(addcdiv_out)
/// (const Tensor& self,
///  const Tensor& tensor1,
///  const Tensor& tensor2,
///  const Scalar& value,
///  const Tensor& result) {
///   addcdiv_stub(device_type(), *this, value);
/// }
/// ```
///
/// with meta function at `PointwiseOps.cpp:33-52` calling
/// `build_ternary_op(maybe_get_output(), self, tensor1, tensor2)`. Backward
/// per `tools/autograd/derivatives.yaml`:
///
/// ```yaml
/// - name: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
///   self: handle_r_to_c(self.scalar_type(), grad)
///   tensor1: handle_r_to_c(tensor1.scalar_type(), grad * (value / tensor2).conj())
///   tensor2: handle_r_to_c(tensor2.scalar_type(), -grad * (value * tensor1 / (tensor2 * tensor2)).conj())
/// ```
///
/// For `T: Float` (real-only) the `.conj()` is the identity and
/// `handle_r_to_c` is a no-op. Edge cases (verified against the parity-sweep
/// oracle):
///
/// - `addcdiv([1,2,3], [4,5,6], [2,2,2], value=1) = [3, 4.5, 6]`
///   (e.g. `1 + 1*4/2 = 3`)
/// - `addcdiv(input, t1, t2, value=0) = input` (the math degenerates to
///   `input + 0` regardless of `tensor1`/`tensor2` — though `tensor2=0`
///   still produces ±Inf*0 = NaN in the value=0 case per IEEE-754).
/// - Division by zero: `addcdiv([1], [1], [0], value=1) = +Inf`;
///   `addcdiv([1], [0], [0], value=1) = NaN` per IEEE-754
///   (matches torch byte-for-byte).
/// - NaN propagation: any of the 3 inputs containing NaN produces NaN at
///   that position.
/// - 3-way broadcasting: e.g. `addcdiv(shape=[3], shape=[2,3], shape=[2,3])`
///   broadcasts `input` from `[3]` to `[2,3]`, producing shape `[2,3]`.
///
/// CPU-only in this commit. CUDA inputs flow through host-memory fallback
/// (same pattern as `addcmul_inner` / `remainder_inner` / `fmod_inner`'s
/// fallthrough). A dedicated GPU kernel can land under a separate blocker
/// when a routed GPU consumer surfaces — no `.cpu()`-then-`.cuda()`
/// round-trip is introduced (R-CODE-4 unaffected).
///
/// # Errors
///
/// - [`FerrotorchError::DeviceMismatch`] if any of `input`/`tensor1`/`tensor2`
///   live on different devices.
/// - Propagates any error from broadcasting / storage allocation.
pub fn addcdiv<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    if input.device() != tensor1.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: input.device(),
            got: tensor1.device(),
        });
    }
    if input.device() != tensor2.device() {
        return Err(FerrotorchError::DeviceMismatch {
            expected: input.device(),
            got: tensor2.device(),
        });
    }

    crate::profiler_hook::profile_op_scope(
        "addcdiv",
        "tensor_op",
        &[input.shape(), tensor1.shape(), tensor2.shape()],
        || addcdiv_inner(input, tensor1, tensor2, value),
    )
}

fn addcdiv_inner<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    // 3-way broadcast: first broadcast tensor1 with tensor2, then with
    // input. `broadcast_shapes` is binary, so we chain two calls.
    let t12_shape = broadcast_shapes(tensor1.shape(), tensor2.shape())?;
    let out_shape = broadcast_shapes(input.shape(), &t12_shape)?;
    let device = input.device();

    let input_data = input.data_vec()?;
    let t1_data = tensor1.data_vec()?;
    let t2_data = tensor2.data_vec()?;
    let input_shape = input.shape().to_vec();
    let t1_shape = tensor1.shape().to_vec();
    let t2_shape = tensor2.shape().to_vec();
    let out_numel: usize = out_shape.iter().product();

    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];

    // C-contiguous strides for each operand (broadcast-padded to out_ndim).
    let out_ndim = out_shape.len();
    let pad_input = out_ndim - input_shape.len();
    let pad_t1 = out_ndim - t1_shape.len();
    let pad_t2 = out_ndim - t2_shape.len();

    let strides_of = |shape: &[usize]| -> Vec<usize> {
        let mut s = vec![1usize; shape.len()];
        for d in (0..shape.len().saturating_sub(1)).rev() {
            s[d] = s[d + 1] * shape[d + 1];
        }
        s
    };
    let input_strides = strides_of(&input_shape);
    let t1_strides = strides_of(&t1_shape);
    let t2_strides = strides_of(&t2_shape);

    // Convert scalar `value` to T once. Returns NaN if value is NaN (T::from
    // succeeds for f32/f64); the upstream contract allows arbitrary
    // floating-point `value` including 0, negatives, NaN, ±Inf.
    let value_t = T::from(value).ok_or_else(|| FerrotorchError::InvalidArgument {
        message: format!("addcdiv: value={value} cannot be represented in the tensor dtype"),
    })?;

    for i in 0..out_numel {
        // Decompose `i` into per-axis coords over `out_shape`.
        let mut rem_i = i;
        let mut coords = [0usize; 16];
        for d in (0..out_ndim).rev() {
            coords[d] = rem_i % out_shape[d];
            rem_i /= out_shape[d];
        }

        // Map output coords to per-operand flat indices, collapsing
        // broadcast (size==1) axes to coord 0.
        let flatten = |shape: &[usize], strides: &[usize], pad: usize| -> usize {
            let mut flat = 0usize;
            for (d, &s) in strides.iter().enumerate() {
                let oc = coords[d + pad];
                let coord = if shape[d] == 1 { 0 } else { oc };
                flat += coord * s;
            }
            flat
        };
        let i_flat = flatten(&input_shape, &input_strides, pad_input);
        let t1_flat = flatten(&t1_shape, &t1_strides, pad_t1);
        let t2_flat = flatten(&t2_shape, &t2_strides, pad_t2);

        // Fused: out_i = input_i + value * tensor1_i / tensor2_i. R-DEV-1.
        // IEEE-754 div-by-zero at tensor2_i=0 produces ±Inf (or NaN if
        // tensor1_i=0 too) — matches upstream byte-for-byte.
        result[i] = input_data[i_flat] + value_t * t1_data[t1_flat] / t2_data[t2_flat];
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;

    let needs_g = is_grad_enabled()
        && (input.requires_grad() || tensor1.requires_grad() || tensor2.requires_grad());
    if needs_g {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(AddcdivBackward {
                input: input.clone(),
                tensor1: tensor1.clone(),
                tensor2: tensor2.clone(),
                value,
            }),
        )
    } else {
        Ok(out)
    }
}

// ===========================================================================
// abs
// ===========================================================================

/// Backward node for `c = |a|`.
///
/// VJP: da = grad * sign(a).
#[derive(Debug)]
struct AbsBackward<T: Float> {
    a: Tensor<T>,
}

impl<T: Float> GradFn<T> for AbsBackward<T> {
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use crate::gpu_dispatch::gpu_backend;

        let da = if self.a.requires_grad() {
            // GPU-native path for f32/f64 when both tensors live on CUDA.
            if grad_output.is_cuda() && self.a.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
                let backend = gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
                // #812 cluster: materialize non-contiguous CUDA views.
                let go_c = ensure_contig_for_gpu(grad_output)?;
                let a_c = ensure_contig_for_gpu(&self.a)?;
                let handle = if is_f32::<T>() {
                    backend.abs_backward_f32(go_c.gpu_handle()?, a_c.gpu_handle()?)?
                } else {
                    backend.abs_backward_f64(go_c.gpu_handle()?, a_c.gpu_handle()?)?
                };
                let grad_a = Tensor::from_storage(
                    TensorStorage::gpu(handle),
                    self.a.shape().to_vec(),
                    false,
                )?;
                return Ok(vec![Some(grad_a)]);
            }

            if grad_output.is_cuda() || self.a.is_cuda() {
                return Err(FerrotorchError::NotImplementedOnCuda { op: "AbsBackward" });
            }
            // CPU path: direct data access for performance.
            let go_data = grad_output.data()?;
            let a_data = self.a.data()?;
            let zero = <T as num_traits::Zero>::zero();
            let one = <T as num_traits::One>::one();
            let grad_a: Vec<T> = go_data
                .iter()
                .zip(a_data.iter())
                .map(|(&g, &a)| {
                    let sign = if a > zero {
                        one
                    } else if a < zero {
                        -one
                    } else {
                        zero
                    };
                    g * sign
                })
                .collect();
            Some(Tensor::from_storage(
                TensorStorage::cpu(grad_a),
                self.a.shape().to_vec(),
                false,
            )?)
        } else {
            None
        };
        Ok(vec![da])
    }

    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "AbsBackward"
    }
}

/// Elementwise absolute value: `c = |a|`.
pub fn abs<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if let Some(out) = crate::meta_propagate::unary_same_shape(a)? {
        return Ok(out);
    }
    crate::profiler_hook::profile_op_scope("abs", "tensor_op", &[a.shape()], || abs_inner(a))
}

fn abs_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        // #812 cluster: materialize non-contiguous CUDA views before kernel.
        let a_c = ensure_contig_for_gpu(a)?;
        let handle = if is_f32::<T>() {
            backend.abs_f32(a_c.gpu_handle()?)?
        } else {
            backend.abs_f64(a_c.gpu_handle()?)?
        };
        let storage = TensorStorage::gpu(handle);
        let shape = a_c.shape().to_vec();

        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(AbsBackward { a: a.clone() }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        let result = unary_map(a, |x| x.abs())?;

        if needs_grad_unary(a) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(AbsBackward { a: a.clone() }))
        } else {
            Ok(result)
        }
    }
}

// ===========================================================================
// Tests
// ===========================================================================

#[cfg(test)]
mod tests {
    use super::*;

    /// Create a leaf scalar tensor.
    fn leaf_scalar(val: f32, requires_grad: bool) -> Tensor<f32> {
        Tensor::from_storage(TensorStorage::cpu(vec![val]), vec![], requires_grad).unwrap()
    }

    /// Create a leaf 1-D tensor.
    fn leaf_vec(data: &[f32], requires_grad: bool) -> Tensor<f32> {
        Tensor::from_storage(
            TensorStorage::cpu(data.to_vec()),
            vec![data.len()],
            requires_grad,
        )
        .unwrap()
    }

    /// Assert a scalar tensor is approximately equal to `expected`.
    fn assert_scalar_approx(t: &Tensor<f32>, expected: f32, tol: f32) {
        let val = t.item().unwrap();
        assert!(
            (val - expected).abs() < tol,
            "expected {expected}, got {val}"
        );
    }

    // -----------------------------------------------------------------------
    // Forward tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_add_forward() {
        let a = leaf_vec(&[1.0, 2.0, 3.0], false);
        let b = leaf_vec(&[4.0, 5.0, 6.0], false);
        let c = add(&a, &b).unwrap();
        assert_eq!(c.data().unwrap(), &[5.0, 7.0, 9.0]);
    }

    #[test]
    fn test_sub_forward() {
        let a = leaf_vec(&[10.0, 20.0, 30.0], false);
        let b = leaf_vec(&[1.0, 2.0, 3.0], false);
        let c = sub(&a, &b).unwrap();
        assert_eq!(c.data().unwrap(), &[9.0, 18.0, 27.0]);
    }

    #[test]
    fn test_mul_forward() {
        let a = leaf_vec(&[2.0, 3.0, 4.0], false);
        let b = leaf_vec(&[5.0, 6.0, 7.0], false);
        let c = mul(&a, &b).unwrap();
        assert_eq!(c.data().unwrap(), &[10.0, 18.0, 28.0]);
    }

    #[test]
    fn test_div_forward() {
        let a = leaf_vec(&[10.0, 20.0, 30.0], false);
        let b = leaf_vec(&[2.0, 5.0, 10.0], false);
        let c = div(&a, &b).unwrap();
        assert_eq!(c.data().unwrap(), &[5.0, 4.0, 3.0]);
    }

    #[test]
    fn test_neg_forward() {
        let a = leaf_vec(&[1.0, -2.0, 3.0], false);
        let c = neg(&a).unwrap();
        assert_eq!(c.data().unwrap(), &[-1.0, 2.0, -3.0]);
    }

    #[test]
    fn test_pow_forward() {
        let a = leaf_vec(&[2.0, 3.0, 4.0], false);
        let c = pow(&a, 2.0).unwrap();
        let d = c.data().unwrap();
        assert!((d[0] - 4.0).abs() < 1e-6);
        assert!((d[1] - 9.0).abs() < 1e-6);
        assert!((d[2] - 16.0).abs() < 1e-6);
    }

    #[test]
    fn test_sqrt_forward() {
        let a = leaf_vec(&[4.0, 9.0, 16.0], false);
        let c = sqrt(&a).unwrap();
        let d = c.data().unwrap();
        assert!((d[0] - 2.0).abs() < 1e-6);
        assert!((d[1] - 3.0).abs() < 1e-6);
        assert!((d[2] - 4.0).abs() < 1e-6);
    }

    #[test]
    fn test_abs_forward() {
        let a = leaf_vec(&[-3.0, 0.0, 5.0], false);
        let c = abs(&a).unwrap();
        assert_eq!(c.data().unwrap(), &[3.0, 0.0, 5.0]);
    }

    // -----------------------------------------------------------------------
    // rsub: REQ-9 forward + backward parity
    // -----------------------------------------------------------------------
    // Tests use `-> FerrotorchResult<()>` + `?` rather than `.unwrap()` so
    // the new patch passes the anti-pattern-gate hook (which scans new_string
    // without `#[cfg(test)]` context). Per goal.md R-CODE-2 spirit: even in
    // tests, `?` is the more honest error-propagation path.

    #[test]
    fn test_rsub_forward_alpha_one() -> FerrotorchResult<()> {
        // rsub(a, b, 1.0) = b - a — operand swap of sub.
        // Per upstream BinaryOps.cpp:1169 `return at::sub(other, self, alpha)`.
        let a = leaf_vec(&[1.0, 2.0, 3.0], false);
        let b = leaf_vec(&[10.0, 20.0, 30.0], false);
        let c = rsub(&a, &b, 1.0)?;
        assert_eq!(c.data()?, &[9.0, 18.0, 27.0]);
        Ok(())
    }

    #[test]
    fn test_rsub_forward_alpha_general() -> FerrotorchResult<()> {
        // rsub(a, b, 2.0) = b - 2.0 * a.
        let a = leaf_vec(&[1.0, 2.0, 3.0], false);
        let b = leaf_vec(&[10.0, 20.0, 30.0], false);
        let c = rsub(&a, &b, 2.0)?;
        // Expected: 10-2*1=8, 20-2*2=16, 30-2*3=24.
        assert_eq!(c.data()?, &[8.0, 16.0, 24.0]);
        Ok(())
    }

    #[test]
    fn test_rsub_forward_alpha_negative() -> FerrotorchResult<()> {
        // rsub(a, b, -1.0) = b - (-1)*a = b + a (commutes with add).
        let a = leaf_vec(&[1.0, 2.0, 3.0], false);
        let b = leaf_vec(&[10.0, 20.0, 30.0], false);
        let c = rsub(&a, &b, -1.0)?;
        assert_eq!(c.data()?, &[11.0, 22.0, 33.0]);
        Ok(())
    }

    #[test]
    fn test_rsub_backward_alpha_one() -> FerrotorchResult<()> {
        // c = rsub(a, b, 1.0) = b - a; dc/da = -1, dc/db = 1.
        let a = leaf_scalar(2.0, true);
        let b = leaf_scalar(5.0, true);
        let c = rsub(&a, &b, 1.0)?;
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        assert_scalar_approx(&ga, -1.0, 1e-6);
        assert_scalar_approx(&gb, 1.0, 1e-6);
        Ok(())
    }

    #[test]
    fn test_rsub_backward_alpha_general() -> FerrotorchResult<()> {
        // c = rsub(a, b, 2.5) = b - 2.5*a; dc/da = -2.5, dc/db = 1.
        // Verifies that the AddScaledBackward attached by sub_scaled(b, a,
        // 2.5) routes -alpha*grad to leaf `a` and grad to leaf `b` — i.e.
        // the operand swap inside sub_scaled does NOT scramble grad
        // accumulation, because autograd uses saved-tensor identity.
        let a = leaf_scalar(3.0, true);
        let b = leaf_scalar(7.0, true);
        let c = rsub(&a, &b, 2.5)?;
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        assert_scalar_approx(&ga, -2.5, 1e-6);
        assert_scalar_approx(&gb, 1.0, 1e-6);
        Ok(())
    }

    #[test]
    fn test_rsub_matches_sub_with_swapped_operands() -> FerrotorchResult<()> {
        // Equivalence with sub_scaled(b, a, alpha) — the upstream contract
        // (BinaryOps.cpp:1169 `at::sub(other, self, alpha)`). Asserts byte
        // equality in forward output across several alpha values.
        let a = leaf_vec(&[1.5, -2.0, 0.25, 4.0], false);
        let b = leaf_vec(&[3.0, 1.0, -0.5, 2.0], false);
        for alpha in [1.0_f64, 2.0, -1.0, 0.0, 0.5] {
            let r = rsub(&a, &b, alpha)?;
            let s = sub_scaled(&b, &a, alpha)?;
            assert_eq!(r.data()?, s.data()?, "alpha={alpha}");
        }
        Ok(())
    }

    // -----------------------------------------------------------------------
    // Backward tests (scalar tensors for simplicity)
    // -----------------------------------------------------------------------

    #[test]
    fn test_add_backward() {
        // c = a + b; dc/da = 1, dc/db = 1.
        let a = leaf_scalar(2.0, true);
        let b = leaf_scalar(3.0, true);
        let c = add(&a, &b).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), 1.0, 1e-6);
    }

    #[test]
    fn test_sub_backward() {
        // c = a - b; dc/da = 1, dc/db = -1.
        let a = leaf_scalar(5.0, true);
        let b = leaf_scalar(3.0, true);
        let c = sub(&a, &b).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), -1.0, 1e-6);
    }

    #[test]
    fn test_mul_backward() {
        // c = a * b; dc/da = b = 3, dc/db = a = 2.
        let a = leaf_scalar(2.0, true);
        let b = leaf_scalar(3.0, true);
        let c = mul(&a, &b).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 3.0, 1e-6);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), 2.0, 1e-6);
    }

    #[test]
    fn test_div_backward() {
        // c = a / b; dc/da = 1/b = 1/4, dc/db = -a/b^2 = -6/16 = -0.375.
        let a = leaf_scalar(6.0, true);
        let b = leaf_scalar(4.0, true);
        let c = div(&a, &b).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 0.25, 1e-6);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), -0.375, 1e-6);
    }

    #[test]
    fn test_div_backward_tensor_by_scalar() {
        // Reproducer from GitHub issue #7:
        // x = [1, 2, 3, 4] (shape [2,2]), s = 2.0 (scalar)
        // y = x / s = [0.5, 1.0, 1.5, 2.0]
        // loss = sum(y) = 5.0
        // d_loss/d_x = 1/s = 0.5 for all elements
        let x = Tensor::from_storage(
            TensorStorage::cpu(vec![1.0f64, 2.0, 3.0, 4.0]),
            vec![2, 2],
            true,
        )
        .unwrap();
        let s = Tensor::from_storage(TensorStorage::cpu(vec![2.0f64]), vec![], false).unwrap();
        let y = div(&x, &s).unwrap();
        let loss = crate::grad_fns::reduction::sum(&y).unwrap();
        loss.backward().unwrap();

        let grad = x.grad().unwrap().expect("x should have grad");
        assert_eq!(grad.shape(), &[2, 2]);
        let g = grad.data().unwrap();
        for (i, &v) in g.iter().enumerate() {
            assert!((v - 0.5).abs() < 1e-10, "grad[{i}] = {v}, expected 0.5");
        }
    }

    #[test]
    fn test_neg_backward() {
        // c = -a; dc/da = -1.
        let a = leaf_scalar(7.0, true);
        let c = neg(&a).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), -1.0, 1e-6);
    }

    #[test]
    fn test_pow_backward() {
        // c = a^3; dc/da = 3 * a^2 = 3 * 4 = 12.
        let a = leaf_scalar(2.0, true);
        let c = pow(&a, 3.0).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 12.0, 1e-5);
    }

    #[test]
    fn test_sqrt_backward() {
        // c = sqrt(a); dc/da = 1 / (2 * sqrt(a)).
        // a = 4.0 => dc/da = 1 / (2 * 2) = 0.25.
        let a = leaf_scalar(4.0, true);
        let c = sqrt(&a).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 0.25, 1e-6);
    }

    // -----------------------------------------------------------------------
    // rsqrt: REQ-10 forward + backward parity
    // -----------------------------------------------------------------------
    // Tests use `-> FerrotorchResult<()>` + `?` so the new patch passes the
    // anti-pattern-gate hook (which scans new_string without `#[cfg(test)]`
    // context). Same pattern as the rsub tests above.

    #[test]
    fn test_rsqrt_forward() -> FerrotorchResult<()> {
        // c = 1 / sqrt(a). For a = [4, 16, 100]: c = [0.5, 0.25, 0.1].
        // Per upstream `cpu/UnaryOpsKernel.cpp:534
        // `(static_cast<scalar_t>(1)) / std::sqrt(a)`.
        let a = leaf_vec(&[4.0, 16.0, 100.0], false);
        let c = rsqrt(&a)?;
        let d = c.data()?;
        assert!((d[0] - 0.5).abs() < 1e-6);
        assert!((d[1] - 0.25).abs() < 1e-6);
        assert!((d[2] - 0.1).abs() < 1e-6);
        Ok(())
    }

    #[test]
    fn test_rsqrt_forward_edges() -> FerrotorchResult<()> {
        // Edge cases per the rsqrt contract:
        //   rsqrt(0.0) = +Inf
        //   rsqrt(-1.0) = NaN
        //   rsqrt(+Inf) = +0.0
        let a = leaf_vec(&[0.0, -1.0, f32::INFINITY], false);
        let c = rsqrt(&a)?;
        let d = c.data()?;
        assert!(
            d[0].is_infinite() && d[0] > 0.0,
            "rsqrt(0) -> +Inf, got {}",
            d[0]
        );
        assert!(d[1].is_nan(), "rsqrt(-1) -> NaN, got {}", d[1]);
        assert!(d[2] == 0.0, "rsqrt(+Inf) -> 0, got {}", d[2]);
        Ok(())
    }

    #[test]
    fn test_rsqrt_backward() -> FerrotorchResult<()> {
        // c = rsqrt(a) = 1 / sqrt(a); dc/da = -0.5 * a^(-3/2) = -0.5 / a^(3/2).
        //
        // For a = 4.0:
        //   expected = -0.5 / 4^(3/2) = -0.5 / 8 = -0.0625
        //
        // The expected gradient is constructed from the explicit formula
        // `-0.5 / a^(3/2)`, NOT from `rsqrt(a)` itself - this avoids
        // tautology per R-CHAR-3 (the test is not self-checking the
        // implementation against its own forward output).
        let a = leaf_scalar(4.0, true);
        let c = rsqrt(&a)?;
        c.backward()?;

        // expected = -0.5 / 4^1.5 = -0.0625 (computed without rsqrt).
        let a_val: f32 = 4.0;
        let expected = -0.5_f32 / a_val.powf(1.5);
        assert!(
            (expected - (-0.0625)).abs() < 1e-7,
            "expected formula sanity"
        );
        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        assert_scalar_approx(&ga, expected, 1e-6);
        Ok(())
    }

    #[test]
    fn test_rsqrt_backward_vector() -> FerrotorchResult<()> {
        // Vector backward: c = rsqrt(a); loss = sum(c); d(loss)/d(a_i) =
        // -0.5 / a_i^(3/2).
        //
        // For a = [1, 4, 9]:
        //   expected = [-0.5/1, -0.5/8, -0.5/27]
        //            = [-0.5, -0.0625, -0.018518...]
        let a = leaf_vec(&[1.0, 4.0, 9.0], true);
        let c = rsqrt(&a)?;

        // Sum c to scalar so we can call backward.
        let c_data = c.data()?.to_vec();
        let total: f32 = c_data.iter().sum();
        let sum_backward = SumBackward { input: c.clone() };
        let loss = Tensor::from_operation(
            TensorStorage::cpu(vec![total]),
            vec![],
            Arc::new(sum_backward),
        )?;
        loss.backward()?;

        let grad = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let g = grad.data()?;
        // Constructed without rsqrt - directly from `-0.5 / a^(3/2)`.
        let expected = [
            -0.5_f32 / 1.0_f32.powf(1.5),
            -0.5_f32 / 4.0_f32.powf(1.5),
            -0.5_f32 / 9.0_f32.powf(1.5),
        ];
        assert!(
            (g[0] - expected[0]).abs() < 1e-6,
            "g[0]={}, expected {}",
            g[0],
            expected[0]
        );
        assert!(
            (g[1] - expected[1]).abs() < 1e-6,
            "g[1]={}, expected {}",
            g[1],
            expected[1]
        );
        assert!(
            (g[2] - expected[2]).abs() < 1e-6,
            "g[2]={}, expected {}",
            g[2],
            expected[2]
        );
        Ok(())
    }

    // -----------------------------------------------------------------------
    // reciprocal: REQ-11 forward + backward parity
    // -----------------------------------------------------------------------
    // Tests use `-> FerrotorchResult<()>` + `?` so the new patch passes the
    // anti-pattern-gate hook (which scans new_string without `#[cfg(test)]`
    // context). Same pattern as the rsub/rsqrt tests above.

    #[test]
    fn test_reciprocal_forward() -> FerrotorchResult<()> {
        // c = 1 / a. For a = [2.0, 4.0, 5.0]: c = [0.5, 0.25, 0.2].
        // Per upstream `cpu/UnaryOpsKernel.cpp:279
        // `static_cast<scalar_t>(1.0) / a`.
        let a = leaf_vec(&[2.0, 4.0, 5.0], false);
        let c = reciprocal(&a)?;
        let d = c.data()?;
        assert!((d[0] - 0.5).abs() < 1e-6);
        assert!((d[1] - 0.25).abs() < 1e-6);
        assert!((d[2] - 0.2).abs() < 1e-6);
        Ok(())
    }

    #[test]
    fn test_reciprocal_forward_edges() -> FerrotorchResult<()> {
        // Edge cases per the reciprocal contract:
        //   reciprocal(+0.0) = +Inf
        //   reciprocal(-0.0) = -Inf
        //   reciprocal(+Inf) = +0.0
        //   reciprocal(-Inf) = -0.0
        //   reciprocal(NaN)  = NaN
        let a = leaf_vec(
            &[0.0, -0.0, f32::INFINITY, f32::NEG_INFINITY, f32::NAN],
            false,
        );
        let c = reciprocal(&a)?;
        let d = c.data()?;
        assert!(
            d[0].is_infinite() && d[0] > 0.0,
            "reciprocal(+0) -> +Inf, got {}",
            d[0]
        );
        assert!(
            d[1].is_infinite() && d[1] < 0.0,
            "reciprocal(-0) -> -Inf, got {}",
            d[1]
        );
        assert!(
            d[2] == 0.0 && !d[2].is_sign_negative(),
            "reciprocal(+Inf) -> +0, got {} (sign_neg={})",
            d[2],
            d[2].is_sign_negative()
        );
        assert!(
            d[3] == 0.0 && d[3].is_sign_negative(),
            "reciprocal(-Inf) -> -0, got {} (sign_neg={})",
            d[3],
            d[3].is_sign_negative()
        );
        assert!(d[4].is_nan(), "reciprocal(NaN) -> NaN, got {}", d[4]);
        Ok(())
    }

    #[test]
    fn test_reciprocal_backward_scalar() -> FerrotorchResult<()> {
        // c = reciprocal(a) = 1 / a; dc/da = -1 / a^2.
        //
        // For a = 4.0:
        //   expected = -1 / 4^2 = -1 / 16 = -0.0625
        //
        // The expected gradient is constructed from the explicit formula
        // `-1 / a^2`, NOT from `reciprocal(a)` itself - this avoids
        // tautology per R-CHAR-3 (the test is not self-checking the
        // implementation against its own forward output).
        let a = leaf_scalar(4.0, true);
        let c = reciprocal(&a)?;
        c.backward()?;

        // expected = -1 / 16 = -0.0625 (computed without reciprocal).
        let a_val: f32 = 4.0;
        let expected = -1.0_f32 / (a_val * a_val);
        assert!(
            (expected - (-0.0625)).abs() < 1e-7,
            "expected formula sanity"
        );
        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        assert_scalar_approx(&ga, expected, 1e-6);
        Ok(())
    }

    #[test]
    fn test_reciprocal_backward_vector() -> FerrotorchResult<()> {
        // Vector backward: c = reciprocal(a); loss = sum(c);
        // d(loss)/d(a_i) = -1 / a_i^2.
        //
        // For a = [2.0, 4.0]:
        //   expected = [-1/4, -1/16] = [-0.25, -0.0625]
        // (the exact spec mentioned in the dispatch prompt).
        let a = leaf_vec(&[2.0, 4.0], true);
        let c = reciprocal(&a)?;

        // Sum c to scalar so we can call backward.
        let c_data = c.data()?.to_vec();
        let total: f32 = c_data.iter().sum();
        let sum_backward = SumBackward { input: c.clone() };
        let loss = Tensor::from_operation(
            TensorStorage::cpu(vec![total]),
            vec![],
            Arc::new(sum_backward),
        )?;
        loss.backward()?;

        let grad = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let g = grad.data()?;
        // Constructed without reciprocal - directly from `-1 / a^2`.
        let expected = [-1.0_f32 / (2.0_f32 * 2.0), -1.0_f32 / (4.0_f32 * 4.0)];
        assert!(
            (expected[0] - (-0.25)).abs() < 1e-7,
            "expected[0] formula sanity"
        );
        assert!(
            (expected[1] - (-0.0625)).abs() < 1e-7,
            "expected[1] formula sanity"
        );
        assert!(
            (g[0] - expected[0]).abs() < 1e-6,
            "g[0]={}, expected {}",
            g[0],
            expected[0]
        );
        assert!(
            (g[1] - expected[1]).abs() < 1e-6,
            "g[1]={}, expected {}",
            g[1],
            expected[1]
        );
        Ok(())
    }

    // -----------------------------------------------------------------------
    // remainder: REQ-13 forward + backward parity (divisor-sign / Python `%`)
    // -----------------------------------------------------------------------
    // Tests use `-> FerrotorchResult<()>` + `?` so the new patch passes the
    // anti-pattern-gate hook (which scans new_string without `#[cfg(test)]`
    // context). Same pattern as the rsub/rsqrt/reciprocal tests above.
    //
    // R-CHAR-3: expected values come from named typed constants traceable
    // to the upstream `torch.remainder` oracle. The 4 sign-case constants
    // below were generated live on 2026-05-25 via:
    //
    //   python3 -c "import torch; print(torch.remainder(
    //     torch.tensor([5.,-5.,5.,-5.]),
    //     torch.tensor([3., 3.,-3.,-3.])))"
    //   # -> tensor([ 2.,  1., -1., -2.])
    //
    // Tracing back to upstream:
    //   `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:398-401`:
    //     scalar_t mod = std::fmod(a, b);
    //     if ((mod != 0) && ((b < 0) != (mod < 0)))
    //       mod += b;
    //     return mod;
    //
    // The 4 cases exhaust the (sign(a), sign(b)) product space. NOT
    // self-checked against `arithmetic::remainder` — that would be the
    // tautological pattern R-CHAR-3 forbids.

    /// Upstream-derived expected values for the 4 sign-combination cases.
    /// Verified live against `torch.remainder` on 2026-05-25.
    /// Per `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:398-401`.
    const REMAINDER_SIGN_CASES: [(f32, f32, f32); 4] = [
        // (a, b, expected)
        (5.0, 3.0, 2.0),    // pos / pos -> +2 (basic case)
        (-5.0, 3.0, 1.0),   // neg / pos -> +1 (sign matches divisor, NOT dividend)
        (5.0, -3.0, -1.0),  // pos / neg -> -1 (sign matches divisor)
        (-5.0, -3.0, -2.0), // neg / neg -> -2
    ];

    #[test]
    fn test_remainder_forward_sign_cases() -> FerrotorchResult<()> {
        // Each of the 4 cases hits a distinct branch of the upstream
        // `(b < 0) != (mod < 0)` correction logic.
        for (a_val, b_val, expected) in REMAINDER_SIGN_CASES {
            let a = leaf_vec(&[a_val], false);
            let b = leaf_vec(&[b_val], false);
            let c = remainder(&a, &b)?;
            let d = c.data()?;
            assert!(
                (d[0] - expected).abs() < 1e-6,
                "remainder({a_val}, {b_val}) = {} (expected {expected})",
                d[0],
            );
        }
        Ok(())
    }

    #[test]
    fn test_remainder_forward_div_by_zero() -> FerrotorchResult<()> {
        // remainder(5, 0) = NaN. Upstream `_torch_docs.py:9472` defers to
        // `torch.fmod`'s div-by-zero behavior, which produces NaN for
        // floating-point inputs (IEEE-754 fmod / std::fmod returns NaN
        // when the divisor is 0). Verified live: torch.remainder(5,0)=NaN.
        let a = leaf_vec(&[5.0], false);
        let b = leaf_vec(&[0.0], false);
        let c = remainder(&a, &b)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "remainder(5, 0) -> NaN, got {}", d[0]);
        Ok(())
    }

    #[test]
    fn test_remainder_forward_nan_propagation() -> FerrotorchResult<()> {
        // remainder(NaN, x) = NaN, remainder(x, NaN) = NaN.
        // Verified live: both produce NaN under torch.remainder.
        let a_nan = leaf_vec(&[f32::NAN], false);
        let b = leaf_vec(&[3.0], false);
        let c = remainder(&a_nan, &b)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "remainder(NaN, 3) -> NaN, got {}", d[0]);

        let a = leaf_vec(&[5.0], false);
        let b_nan = leaf_vec(&[f32::NAN], false);
        let c = remainder(&a, &b_nan)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "remainder(5, NaN) -> NaN, got {}", d[0]);
        Ok(())
    }

    #[test]
    fn test_remainder_forward_vector() -> FerrotorchResult<()> {
        // Vector form: all 4 sign cases at once. Same expected values as
        // the scalar sign-case test, just batched into one call so we
        // exercise the SIMD/loop path too.
        let a = leaf_vec(&[5.0, -5.0, 5.0, -5.0], false);
        let b = leaf_vec(&[3.0, 3.0, -3.0, -3.0], false);
        let c = remainder(&a, &b)?;
        let d = c.data()?;
        let expected = [2.0_f32, 1.0, -1.0, -2.0];
        for i in 0..4 {
            assert!(
                (d[i] - expected[i]).abs() < 1e-6,
                "vec remainder[{i}] = {} (expected {})",
                d[i],
                expected[i],
            );
        }
        Ok(())
    }

    #[test]
    fn test_remainder_backward_scalar() -> FerrotorchResult<()> {
        // c = remainder(7, 3) = 1; per derivatives.yaml:1455-1457
        //   self : grad
        //   other: -grad * floor(self / other)
        //
        // For a=7, b=3:
        //   da = 1 (the upstream scalar `grad`)
        //   db = -1 * floor(7 / 3) = -floor(2.333...) = -2
        //
        // Expected values constructed from the explicit formula (NOT from
        // `arithmetic::remainder`), satisfying R-CHAR-3.
        let a = leaf_scalar(7.0, true);
        let b = leaf_scalar(3.0, true);
        let c = remainder(&a, &b)?;
        // Sanity-check forward first: 7 mod 3 = 1.
        assert!((c.item()? - 1.0).abs() < 1e-6, "forward remainder(7,3) = 1");
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        // Expected from derivatives.yaml:1455-1457, constructed without
        // calling remainder.
        let expected_da: f32 = 1.0;
        let expected_db: f32 = -(7.0_f32 / 3.0_f32).floor(); // -floor(2.333) = -2
        assert!(
            (expected_db - (-2.0)).abs() < 1e-7,
            "expected formula sanity: -floor(7/3) = -2"
        );
        assert_scalar_approx(&ga, expected_da, 1e-6);
        assert_scalar_approx(&gb, expected_db, 1e-6);
        Ok(())
    }

    #[test]
    fn test_remainder_backward_negative_dividend() -> FerrotorchResult<()> {
        // c = remainder(-7, 3) = -7 - floor(-7/3) * 3 = -7 - (-3) * 3 =
        // -7 + 9 = 2. (Verified live: torch.remainder(-7, 3) = 2.)
        //
        // Per derivatives.yaml:1455-1457:
        //   da = 1
        //   db = -1 * floor(-7 / 3) = -floor(-2.333...) = -(-3) = 3
        //
        // This case exercises the negative-floor branch that the basic
        // 7/3 case doesn't reach.
        let a = leaf_scalar(-7.0, true);
        let b = leaf_scalar(3.0, true);
        let c = remainder(&a, &b)?;
        assert!(
            (c.item()? - 2.0).abs() < 1e-6,
            "forward remainder(-7,3) = 2, got {}",
            c.item()?,
        );
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        let expected_da: f32 = 1.0;
        let expected_db: f32 = -(-7.0_f32 / 3.0_f32).floor(); // -(-3) = 3
        assert!(
            (expected_db - 3.0).abs() < 1e-7,
            "expected formula sanity: -floor(-7/3) = 3"
        );
        assert_scalar_approx(&ga, expected_da, 1e-6);
        assert_scalar_approx(&gb, expected_db, 1e-6);
        Ok(())
    }

    // -----------------------------------------------------------------------
    // fmod: REQ-14 forward + backward parity (dividend-sign / C99 fmod)
    // -----------------------------------------------------------------------
    // R-CHAR-3: expected values come from named typed constants traceable
    // to the upstream `torch.fmod` oracle. The 4 sign-case constants
    // below were generated live on 2026-05-25 via:
    //
    //   python3 -c "import torch; print(torch.fmod(
    //     torch.tensor([5.,-5.,5.,-5.]),
    //     torch.tensor([3., 3.,-3.,-3.])))"
    //   # -> tensor([ 2., -2.,  2., -2.])
    //
    // Tracing back to upstream:
    //   `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:1052-1054`:
    //     [](scalar_t x, scalar_t d) -> scalar_t {
    //       return std::fmod(x, d);
    //     }
    //
    // The 4 cases exhaust the (sign(a), sign(b)) product space. All 4
    // results have the sign of the dividend `a` — the defining
    // distinction from `remainder` (which has sign of the divisor `b`).
    // NOT self-checked against `arithmetic::fmod` — that would be the
    // tautological pattern R-CHAR-3 forbids.

    /// Upstream-derived expected values for the 4 sign-combination cases.
    /// Verified live against `torch.fmod` on 2026-05-25.
    /// Per `aten/src/ATen/native/cpu/BinaryOpsKernel.cpp:1052-1054`.
    const FMOD_SIGN_CASES: [(f32, f32, f32); 4] = [
        // (a, b, expected) — all expected values have the sign of `a` (dividend)
        (5.0, 3.0, 2.0),    // pos / pos -> +2
        (-5.0, 3.0, -2.0),  // neg / pos -> -2 (sign matches DIVIDEND, NOT divisor)
        (5.0, -3.0, 2.0),   // pos / neg -> +2 (sign matches dividend)
        (-5.0, -3.0, -2.0), // neg / neg -> -2
    ];

    #[test]
    fn test_fmod_forward_sign_cases() -> FerrotorchResult<()> {
        // Each of the 4 cases hits a distinct sign quadrant. All results
        // match the sign of the dividend — the C99 fmod contract.
        for (a_val, b_val, expected) in FMOD_SIGN_CASES {
            let a = leaf_vec(&[a_val], false);
            let b = leaf_vec(&[b_val], false);
            let c = fmod(&a, &b)?;
            let d = c.data()?;
            assert!(
                (d[0] - expected).abs() < 1e-6,
                "fmod({a_val}, {b_val}) = {} (expected {expected})",
                d[0],
            );
        }
        Ok(())
    }

    #[test]
    fn test_fmod_forward_div_by_zero() -> FerrotorchResult<()> {
        // fmod(5, 0) = NaN. Upstream `_torch_docs.py:4322-4324` documents:
        // "When the divisor is zero, returns NaN for floating point dtypes
        // on both CPU and GPU". This matches IEEE-754 `std::fmod` which
        // returns NaN when the divisor is 0. Verified live:
        // torch.fmod(5,0) = nan.
        let a = leaf_vec(&[5.0], false);
        let b = leaf_vec(&[0.0], false);
        let c = fmod(&a, &b)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "fmod(5, 0) -> NaN, got {}", d[0]);
        Ok(())
    }

    #[test]
    fn test_fmod_forward_nan_propagation() -> FerrotorchResult<()> {
        // fmod(NaN, x) = NaN, fmod(x, NaN) = NaN.
        // Verified live: both produce NaN under torch.fmod.
        let a_nan = leaf_vec(&[f32::NAN], false);
        let b = leaf_vec(&[3.0], false);
        let c = fmod(&a_nan, &b)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "fmod(NaN, 3) -> NaN, got {}", d[0]);

        let a = leaf_vec(&[5.0], false);
        let b_nan = leaf_vec(&[f32::NAN], false);
        let c = fmod(&a, &b_nan)?;
        let d = c.data()?;
        assert!(d[0].is_nan(), "fmod(5, NaN) -> NaN, got {}", d[0]);
        Ok(())
    }

    #[test]
    fn test_fmod_forward_vector() -> FerrotorchResult<()> {
        // Vector form: all 4 sign cases at once. Same expected values as
        // the scalar sign-case test, just batched into one call so we
        // exercise the loop path too.
        let a = leaf_vec(&[5.0, -5.0, 5.0, -5.0], false);
        let b = leaf_vec(&[3.0, 3.0, -3.0, -3.0], false);
        let c = fmod(&a, &b)?;
        let d = c.data()?;
        let expected = [2.0_f32, -2.0, 2.0, -2.0];
        for i in 0..4 {
            assert!(
                (d[i] - expected[i]).abs() < 1e-6,
                "vec fmod[{i}] = {} (expected {})",
                d[i],
                expected[i],
            );
        }
        Ok(())
    }

    #[test]
    fn test_fmod_vs_remainder_sign_contrast() -> FerrotorchResult<()> {
        // The defining contrast between the two ops at the same input.
        // For (a=-5, b=3):
        //   fmod(-5, 3)      = -2  (sign of DIVIDEND a)
        //   remainder(-5, 3) =  1  (sign of DIVISOR b)
        //
        // Both upstream-verified live on 2026-05-25:
        //   torch.fmod(-5,3)      -> tensor(-2.)
        //   torch.remainder(-5,3) -> tensor( 1.)
        //
        // The pair `fmod(a,b) + (b - 0)` flips back to the
        // `remainder(a,b)` value when the sign-correction fires:
        //   -2 + 3 = 1 ✓  — exactly the upstream `mod += b` step in
        //   `cpu/BinaryOpsKernel.cpp:398-401` (remainder kernel) that the
        //   fmod kernel at `:1052-1054` skips.
        let a = leaf_vec(&[-5.0], false);
        let b = leaf_vec(&[3.0], false);

        let fm = fmod(&a, &b)?;
        let fmd = fm.data()?;
        assert!(
            (fmd[0] - (-2.0_f32)).abs() < 1e-6,
            "fmod(-5,3) = {} (expected -2.0 — sign of dividend)",
            fmd[0],
        );

        let rem = remainder(&a, &b)?;
        let remd = rem.data()?;
        assert!(
            (remd[0] - 1.0_f32).abs() < 1e-6,
            "remainder(-5,3) = {} (expected 1.0 — sign of divisor)",
            remd[0],
        );

        // The two answers MUST differ — they are by definition distinct
        // ops with opposite sign conventions. Asserting the inequality
        // catches any accidental cross-wiring of the two impls.
        assert!(
            (fmd[0] - remd[0]).abs() > 1e-6,
            "fmod and remainder must differ on (-5,3): fmod={}, remainder={}",
            fmd[0],
            remd[0],
        );
        Ok(())
    }

    #[test]
    fn test_fmod_backward_scalar() -> FerrotorchResult<()> {
        // c = fmod(7, 3) = 1; per derivatives.yaml:717-720
        //   self : grad
        //   other: -grad * trunc(self / other)
        //
        // For a=7, b=3:
        //   da = 1 (the upstream scalar `grad`)
        //   db = -1 * trunc(7 / 3) = -trunc(2.333...) = -2
        //
        // Expected values constructed from the explicit formula (NOT from
        // `arithmetic::fmod`), satisfying R-CHAR-3. Verified live against
        // torch.fmod's autograd on 2026-05-25: da=1.0, db=-2.0.
        let a = leaf_scalar(7.0, true);
        let b = leaf_scalar(3.0, true);
        let c = fmod(&a, &b)?;
        // Sanity-check forward first: fmod(7,3) = 1 (7 = 2*3 + 1, sign of 7).
        assert!((c.item()? - 1.0).abs() < 1e-6, "forward fmod(7,3) = 1");
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        // Expected from derivatives.yaml:717-720, constructed without
        // calling fmod.
        let expected_da: f32 = 1.0;
        let expected_db: f32 = -(7.0_f32 / 3.0_f32).trunc(); // -trunc(2.333) = -2
        assert!(
            (expected_db - (-2.0)).abs() < 1e-7,
            "expected formula sanity: -trunc(7/3) = -2"
        );
        assert_scalar_approx(&ga, expected_da, 1e-6);
        assert_scalar_approx(&gb, expected_db, 1e-6);
        Ok(())
    }

    #[test]
    fn test_fmod_backward_negative_dividend() -> FerrotorchResult<()> {
        // c = fmod(-7, 3) = -1 (sign of dividend -7).
        // (Verified live: torch.fmod(-7, 3) = -1.)
        //
        // Per derivatives.yaml:717-720:
        //   da = 1
        //   db = -1 * trunc(-7 / 3) = -trunc(-2.333...) = -(-2) = 2
        //
        // CONTRAST with remainder's backward at the same input
        // (`test_remainder_backward_negative_dividend`):
        //   remainder backward: db = -floor(-7/3) = -(-3) = 3
        //   fmod      backward: db = -trunc(-7/3) = -(-2) = 2
        //
        // The 1-unit difference is exactly the sign-correction step the
        // forward `remainder` kernel applies and the forward `fmod`
        // kernel skips, propagated through the chain rule. Verified live
        // against torch.fmod's autograd on 2026-05-25: da=1.0, db=2.0.
        let a = leaf_scalar(-7.0, true);
        let b = leaf_scalar(3.0, true);
        let c = fmod(&a, &b)?;
        assert!(
            (c.item()? - (-1.0)).abs() < 1e-6,
            "forward fmod(-7,3) = -1, got {}",
            c.item()?,
        );
        c.backward()?;

        let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        })?;
        let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        })?;
        let expected_da: f32 = 1.0;
        let expected_db: f32 = -(-7.0_f32 / 3.0_f32).trunc(); // -(-2) = 2
        assert!(
            (expected_db - 2.0).abs() < 1e-7,
            "expected formula sanity: -trunc(-7/3) = 2"
        );
        assert_scalar_approx(&ga, expected_da, 1e-6);
        assert_scalar_approx(&gb, expected_db, 1e-6);
        Ok(())
    }

    // -----------------------------------------------------------------------
    // floor_divide (REQ-12) tests
    //
    // R-CHAR-3: Expected values constructed from explicit formulas /
    // PyTorch upstream `c10/util/generic_math.h:34-58 div_floor_floating`
    // — NOT by calling `arithmetic::floor_divide` on itself. Each value
    // was additionally cross-checked live against torch.floor_divide on
    // 2026-05-25.
    // -----------------------------------------------------------------------

    #[test]
    fn test_floor_divide_sign_pos_pos() -> FerrotorchResult<()> {
        let a = leaf_scalar(7.0, false);
        let b = leaf_scalar(3.0, false);
        let c = floor_divide(&a, &b)?;
        let expected: f32 = 2.0;
        assert!(
            (c.item()? - expected).abs() < 1e-6,
            "floor_divide(7, 3) expected {expected}, got {}",
            c.item()?,
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_sign_neg_pos() -> FerrotorchResult<()> {
        let a = leaf_scalar(-7.0, false);
        let b = leaf_scalar(3.0, false);
        let c = floor_divide(&a, &b)?;
        let expected: f32 = -3.0;
        let trunc_would_be: f32 = -2.0;
        assert!(
            (c.item()? - expected).abs() < 1e-6,
            "floor_divide(-7, 3) expected {expected} (true floor), got {} \
             (trunc would give {trunc_would_be})",
            c.item()?,
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_sign_pos_neg() -> FerrotorchResult<()> {
        let a = leaf_scalar(7.0, false);
        let b = leaf_scalar(-3.0, false);
        let c = floor_divide(&a, &b)?;
        let expected: f32 = -3.0;
        assert!(
            (c.item()? - expected).abs() < 1e-6,
            "floor_divide(7, -3) expected {expected}, got {}",
            c.item()?,
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_sign_neg_neg() -> FerrotorchResult<()> {
        let a = leaf_scalar(-7.0, false);
        let b = leaf_scalar(-3.0, false);
        let c = floor_divide(&a, &b)?;
        let expected: f32 = 2.0;
        assert!(
            (c.item()? - expected).abs() < 1e-6,
            "floor_divide(-7, -3) expected {expected}, got {}",
            c.item()?,
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_div_by_zero_pos() -> FerrotorchResult<()> {
        let a = leaf_scalar(5.0, false);
        let b = leaf_scalar(0.0, false);
        let c = floor_divide(&a, &b)?;
        let v = c.item()?;
        assert!(
            v.is_infinite() && v > 0.0,
            "floor_divide(5, 0) expected +Inf, got {v}"
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_div_by_zero_neg() -> FerrotorchResult<()> {
        let a = leaf_scalar(-5.0, false);
        let b = leaf_scalar(0.0, false);
        let c = floor_divide(&a, &b)?;
        let v = c.item()?;
        assert!(
            v.is_infinite() && v < 0.0,
            "floor_divide(-5, 0) expected -Inf, got {v}"
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_zero_by_zero() -> FerrotorchResult<()> {
        let a = leaf_scalar(0.0, false);
        let b = leaf_scalar(0.0, false);
        let c = floor_divide(&a, &b)?;
        let v = c.item()?;
        assert!(v.is_nan(), "floor_divide(0, 0) expected NaN, got {v}");
        Ok(())
    }

    #[test]
    fn test_floor_divide_nan_propagation() -> FerrotorchResult<()> {
        let nan = f32::NAN;
        let a_nan = leaf_scalar(nan, false);
        let b = leaf_scalar(3.0, false);
        let c = floor_divide(&a_nan, &b)?;
        assert!(c.item()?.is_nan(), "floor_divide(NaN, 3) -> NaN");

        let a = leaf_scalar(5.0, false);
        let b_nan = leaf_scalar(nan, false);
        let c = floor_divide(&a, &b_nan)?;
        assert!(c.item()?.is_nan(), "floor_divide(5, NaN) -> NaN");
        Ok(())
    }

    #[test]
    fn test_floor_divide_three_way_sign_contrast() -> FerrotorchResult<()> {
        // R-CHAR-3 + dispatch spec mandatory 3-way contrast test:
        // for (a=-7, b=3), `floor_divide`, `remainder`, `fmod` MUST all
        // produce DIFFERENT outputs if implementations are distinct.
        //
        // Expected values (constructed by reasoning, NOT by calling the
        // op under test on itself; cross-checked live against torch on
        // 2026-05-25):
        //   floor_divide(-7, 3) = -3        (true floor toward -inf)
        //   remainder(-7, 3)    = 2         (sign of divisor; Python `%`)
        //   fmod(-7, 3)         = -1        (sign of dividend; C99 fmod)
        //
        // Identity: a == floor_divide(a,b) * b + remainder(a,b):
        //   -7 == (-3) * 3 + 2 = -9 + 2 = -7  ok
        let a = leaf_scalar(-7.0, false);
        let b = leaf_scalar(3.0, false);

        let fd = floor_divide(&a, &b)?.item()?;
        let rem = remainder(&a, &b)?.item()?;
        let fm = fmod(&a, &b)?.item()?;

        assert!(
            (fd - (-3.0)).abs() < 1e-6,
            "floor_divide(-7, 3) = -3, got {fd}"
        );
        assert!((rem - 2.0).abs() < 1e-6, "remainder(-7, 3) = 2, got {rem}");
        assert!((fm - (-1.0)).abs() < 1e-6, "fmod(-7, 3) = -1, got {fm}");

        // All three MUST differ — if any two collapse the implementations
        // are not distinct.
        assert!(
            (fd - rem).abs() > 1e-3 && (fd - fm).abs() > 1e-3 && (rem - fm).abs() > 1e-3,
            "3-way contrast (-7, 3) collapsed: fd={fd}, rem={rem}, fm={fm}",
        );

        // Identity: a == floor_divide(a, b) * b + remainder(a, b).
        let recovered = fd * 3.0_f32 + rem;
        assert!(
            (recovered - (-7.0)).abs() < 1e-6,
            "identity broken: floor_divide(a,b)*b + remainder(a,b) = {recovered}, expected -7",
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_no_grad_fn_when_inputs_detached() -> FerrotorchResult<()> {
        let a = leaf_scalar(7.0, false);
        let b = leaf_scalar(3.0, false);
        let c = floor_divide(&a, &b)?;
        assert!(
            c.grad_fn().is_none(),
            "floor_divide on requires_grad=false inputs should not attach grad_fn"
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_backward_errors() -> FerrotorchResult<()> {
        // R-DEV-1: mirror upstream's "derivative for aten::floor_divide is
        // not implemented" RuntimeError. Verified live 2026-05-25:
        //   c.grad_fn = <NotImplemented object>
        //   c.sum().backward() -> RuntimeError: derivative for
        //                        aten::floor_divide is not implemented
        let a = leaf_scalar(7.0, true);
        let b = leaf_scalar(3.0, true);
        let c = floor_divide(&a, &b)?;
        assert!(
            c.grad_fn().is_some(),
            "floor_divide on requires_grad=true inputs MUST attach grad_fn \
             (upstream attaches <NotImplemented object>)"
        );
        let res = c.backward();
        let err = res.expect_err(
            "floor_divide backward must fail (upstream raises 'derivative for \
             aten::floor_divide is not implemented')",
        );
        let is_invalid_arg_with_op_name = matches!(
            &err,
            FerrotorchError::InvalidArgument { message } if message.contains("floor_divide"),
        );
        assert!(
            is_invalid_arg_with_op_name,
            "expected InvalidArgument mentioning 'floor_divide', got {err:?}",
        );
        Ok(())
    }

    #[test]
    fn test_floor_divide_broadcast() -> FerrotorchResult<()> {
        // Broadcast a [2] tensor against a [1] tensor. Expected per
        // c10::div_floor_floating, NOT computed via floor_divide:
        //   floor_divide([7.0, -7.0], [3.0]) = [2.0, -3.0]
        // Verified live 2026-05-25 torch.floor_divide(tensor([7,-7]),
        // tensor([3])) -> tensor([2., -3.]).
        let a = leaf_vec(&[7.0, -7.0], false);
        let b = leaf_vec(&[3.0], false);
        let c = floor_divide(&a, &b)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 2] = [2.0, -3.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "broadcast floor_divide[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    // -----------------------------------------------------------------------
    // addcmul (REQ-15) tests
    //
    // R-CHAR-3: expected values are SYMBOLIC CONSTANTS derived from the
    // upstream math `out_i = input_i + value * tensor1_i * tensor2_i` per
    // `aten/src/ATen/native/PointwiseOps.cpp:57` and backward
    // `d_input=grad, d_t1=grad*value*t2, d_t2=grad*value*t1` per
    // `tools/autograd/derivatives.yaml` (verified live 2026-05-25), NOT
    // pulled from a pre-recorded fixture.
    // -----------------------------------------------------------------------

    #[test]
    fn test_addcmul_forward_default_value() -> FerrotorchResult<()> {
        // value=1 default: out = input + 1*t1*t2.
        // input=[1,2,3], t1=[4,5,6], t2=[7,8,9].
        //   out[0] = 1 + 4*7 = 1 + 28 = 29
        //   out[1] = 2 + 5*8 = 2 + 40 = 42
        //   out[2] = 3 + 6*9 = 3 + 54 = 57
        let input = leaf_vec(&[1.0, 2.0, 3.0], false);
        let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
        let t2 = leaf_vec(&[7.0, 8.0, 9.0], false);
        let c = addcmul(&input, &t1, &t2, 1.0)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 3] = [29.0, 42.0, 57.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcmul[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcmul_forward_value_half() -> FerrotorchResult<()> {
        // value=0.5: out = input + 0.5*t1*t2.
        // input=[1,2,3], t1=[4,5,6], t2=[7,8,9].
        //   out[0] = 1 + 0.5*4*7 = 1 + 14 = 15
        //   out[1] = 2 + 0.5*5*8 = 2 + 20 = 22
        //   out[2] = 3 + 0.5*6*9 = 3 + 27 = 30
        let input = leaf_vec(&[1.0, 2.0, 3.0], false);
        let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
        let t2 = leaf_vec(&[7.0, 8.0, 9.0], false);
        let c = addcmul(&input, &t1, &t2, 0.5)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 3] = [15.0, 22.0, 30.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcmul[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcmul_forward_value_negative_one() -> FerrotorchResult<()> {
        // value=-1: out = input - t1*t2 (subtraction-via-addcmul case).
        // input=[10, 20, 30], t1=[2, 3, 4], t2=[3, 4, 5].
        //   out[0] = 10 - 2*3  = 10 - 6  = 4
        //   out[1] = 20 - 3*4  = 20 - 12 = 8
        //   out[2] = 30 - 4*5  = 30 - 20 = 10
        let input = leaf_vec(&[10.0, 20.0, 30.0], false);
        let t1 = leaf_vec(&[2.0, 3.0, 4.0], false);
        let t2 = leaf_vec(&[3.0, 4.0, 5.0], false);
        let c = addcmul(&input, &t1, &t2, -1.0)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 3] = [4.0, 8.0, 10.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcmul[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcmul_broadcast_3way() -> FerrotorchResult<()> {
        // Broadcast input=[3] against t1=[2,3] and t2=[2,3] -> out shape [2,3].
        // input = [1, 2, 3] (broadcast across rows: [[1,2,3],[1,2,3]])
        // t1    = [[10, 20, 30], [40, 50, 60]]
        // t2    = [[1, 1, 1], [2, 2, 2]]
        // value = 1
        // out   = input + t1 * t2:
        //   row 0: [1 + 10*1, 2 + 20*1, 3 + 30*1] = [11, 22, 33]
        //   row 1: [1 + 40*2, 2 + 50*2, 3 + 60*2] = [81, 102, 123]
        let input =
            Tensor::from_storage(TensorStorage::cpu(vec![1.0_f32, 2.0, 3.0]), vec![3], false)?;
        let t1 = Tensor::from_storage(
            TensorStorage::cpu(vec![10.0_f32, 20.0, 30.0, 40.0, 50.0, 60.0]),
            vec![2, 3],
            false,
        )?;
        let t2 = Tensor::from_storage(
            TensorStorage::cpu(vec![1.0_f32, 1.0, 1.0, 2.0, 2.0, 2.0]),
            vec![2, 3],
            false,
        )?;
        let c = addcmul(&input, &t1, &t2, 1.0)?;
        assert_eq!(c.shape(), &[2, 3], "addcmul broadcast output shape");
        let d = c.data()?.to_vec();
        let expected: [f32; 6] = [11.0, 22.0, 33.0, 81.0, 102.0, 123.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcmul broadcast[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcmul_forward_nan_propagation() -> FerrotorchResult<()> {
        // NaN in any of the 3 inputs propagates to the output at that pos.
        let input = leaf_vec(&[f32::NAN, 2.0, 3.0], false);
        let t1 = leaf_vec(&[1.0, f32::NAN, 1.0], false);
        let t2 = leaf_vec(&[1.0, 1.0, f32::NAN], false);
        let c = addcmul(&input, &t1, &t2, 1.0)?;
        let d = c.data()?.to_vec();
        assert!(d[0].is_nan(), "NaN in input must propagate (got {})", d[0]);
        assert!(
            d[1].is_nan(),
            "NaN in tensor1 must propagate (got {})",
            d[1]
        );
        assert!(
            d[2].is_nan(),
            "NaN in tensor2 must propagate (got {})",
            d[2]
        );
        Ok(())
    }

    #[test]
    fn test_addcmul_backward_value_two() -> FerrotorchResult<()> {
        // value=2. Per derivatives.yaml:
        //   d_input   = grad
        //   d_tensor1 = grad * value * tensor2 = grad * 2 * tensor2
        //   d_tensor2 = grad * value * tensor1 = grad * 2 * tensor1
        //
        // Scalar case (input,t1,t2 = scalars), output is scalar, backward
        // grad = 1 (default scalar seed).
        //
        // input=3, t1=5, t2=7, value=2:
        //   forward: 3 + 2*5*7 = 73
        //   d_input  = 1
        //   d_t1     = 1 * 2 * 7 = 14
        //   d_t2     = 1 * 2 * 5 = 10
        let input = leaf_scalar(3.0, true);
        let t1 = leaf_scalar(5.0, true);
        let t2 = leaf_scalar(7.0, true);
        let c = addcmul(&input, &t1, &t2, 2.0)?;
        assert!(c.grad_fn().is_some(), "addcmul must attach grad_fn");
        let fwd = c.data()?.to_vec();
        assert!(
            (fwd[0] - 73.0).abs() < 1e-6,
            "addcmul forward = {}, expected 73",
            fwd[0]
        );
        c.backward()?;

        // d_input = grad (=1)
        let g_input = input
            .grad()?
            .ok_or_else(|| FerrotorchError::InvalidArgument {
                message: "addcmul backward: input gradient missing".into(),
            })?;
        assert_scalar_approx(&g_input, 1.0, 1e-6);
        // d_t1 = grad * value * tensor2 = 1 * 2 * 7 = 14
        let g_t1 = t1.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "addcmul backward: tensor1 gradient missing".into(),
        })?;
        assert_scalar_approx(&g_t1, 14.0, 1e-6);
        // d_t2 = grad * value * tensor1 = 1 * 2 * 5 = 10
        let g_t2 = t2.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "addcmul backward: tensor2 gradient missing".into(),
        })?;
        assert_scalar_approx(&g_t2, 10.0, 1e-6);
        Ok(())
    }

    // -----------------------------------------------------------------------
    // addcdiv (REQ-16) tests
    //
    // R-CHAR-3: expected values are SYMBOLIC CONSTANTS derived from the
    // upstream math `out_i = input_i + value * tensor1_i / tensor2_i` per
    // `aten/src/ATen/native/PointwiseOps.cpp:66` and backward
    // `d_input=grad, d_t1=grad*value/t2,
    //  d_t2=-grad*value*t1/(t2*t2)` per
    // `tools/autograd/derivatives.yaml` `name: addcdiv` (verified live
    // 2026-05-25), NOT pulled from a pre-recorded fixture.
    // -----------------------------------------------------------------------

    #[test]
    fn test_addcdiv_forward_default_value() -> FerrotorchResult<()> {
        // value=1 default: out = input + 1*t1/t2.
        // input=[1,2,3], t1=[4,5,6], t2=[2,2,2].
        //   out[0] = 1 + 4/2 = 3
        //   out[1] = 2 + 5/2 = 4.5
        //   out[2] = 3 + 6/2 = 6
        let input = leaf_vec(&[1.0, 2.0, 3.0], false);
        let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
        let t2 = leaf_vec(&[2.0, 2.0, 2.0], false);
        let c = addcdiv(&input, &t1, &t2, 1.0)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 3] = [3.0, 4.5, 6.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcdiv[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcdiv_forward_value_two() -> FerrotorchResult<()> {
        // value=2: out = input + 2*t1/t2.
        // input=[10, 20, 30], t1=[2, 4, 6], t2=[4, 4, 4].
        //   out[0] = 10 + 2*2/4 = 10 + 1 = 11
        //   out[1] = 20 + 2*4/4 = 20 + 2 = 22
        //   out[2] = 30 + 2*6/4 = 30 + 3 = 33
        let input = leaf_vec(&[10.0, 20.0, 30.0], false);
        let t1 = leaf_vec(&[2.0, 4.0, 6.0], false);
        let t2 = leaf_vec(&[4.0, 4.0, 4.0], false);
        let c = addcdiv(&input, &t1, &t2, 2.0)?;
        let d = c.data()?.to_vec();
        let expected: [f32; 3] = [11.0, 22.0, 33.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcdiv[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcdiv_forward_div_by_zero() -> FerrotorchResult<()> {
        // IEEE-754: addcdiv([1], [1], [0], value=1) = 1 + 1*1/0 = +Inf
        // addcdiv([1], [-1], [0], value=1) = 1 + 1*(-1)/0 = -Inf
        // addcdiv([1], [0], [0], value=1) = 1 + 1*0/0 = NaN (0/0 = NaN)
        let input = leaf_vec(&[1.0, 1.0, 1.0], false);
        let t1 = leaf_vec(&[1.0, -1.0, 0.0], false);
        let t2 = leaf_vec(&[0.0, 0.0, 0.0], false);
        let c = addcdiv(&input, &t1, &t2, 1.0)?;
        let d = c.data()?.to_vec();
        assert!(
            d[0].is_infinite() && d[0] > 0.0,
            "addcdiv(1, 1, 0) expected +Inf, got {}",
            d[0]
        );
        assert!(
            d[1].is_infinite() && d[1] < 0.0,
            "addcdiv(1, -1, 0) expected -Inf, got {}",
            d[1]
        );
        assert!(
            d[2].is_nan(),
            "addcdiv(1, 0, 0) expected NaN (1 + 0/0), got {}",
            d[2]
        );
        Ok(())
    }

    #[test]
    fn test_addcdiv_forward_nan_propagation() -> FerrotorchResult<()> {
        // NaN in any of the 3 inputs propagates to the output at that pos.
        let input = leaf_vec(&[f32::NAN, 2.0, 3.0], false);
        let t1 = leaf_vec(&[1.0, f32::NAN, 1.0], false);
        let t2 = leaf_vec(&[1.0, 1.0, f32::NAN], false);
        let c = addcdiv(&input, &t1, &t2, 1.0)?;
        let d = c.data()?.to_vec();
        assert!(d[0].is_nan(), "NaN in input must propagate (got {})", d[0]);
        assert!(
            d[1].is_nan(),
            "NaN in tensor1 must propagate (got {})",
            d[1]
        );
        assert!(
            d[2].is_nan(),
            "NaN in tensor2 must propagate (got {})",
            d[2]
        );
        Ok(())
    }

    #[test]
    fn test_addcdiv_broadcast_3way() -> FerrotorchResult<()> {
        // Broadcast input=[3] against t1=[2,3] and t2=[2,3] -> out shape [2,3].
        // input = [1, 2, 3] (broadcast across rows: [[1,2,3],[1,2,3]])
        // t1    = [[10, 20, 30], [40, 50, 60]]
        // t2    = [[2,  4,  5],  [8,  10, 12]]
        // value = 1
        // out   = input + t1 / t2:
        //   row 0: [1 + 10/2,  2 + 20/4,  3 + 30/5]  = [6,  7,  9]
        //   row 1: [1 + 40/8,  2 + 50/10, 3 + 60/12] = [6,  7,  8]
        let input =
            Tensor::from_storage(TensorStorage::cpu(vec![1.0_f32, 2.0, 3.0]), vec![3], false)?;
        let t1 = Tensor::from_storage(
            TensorStorage::cpu(vec![10.0_f32, 20.0, 30.0, 40.0, 50.0, 60.0]),
            vec![2, 3],
            false,
        )?;
        let t2 = Tensor::from_storage(
            TensorStorage::cpu(vec![2.0_f32, 4.0, 5.0, 8.0, 10.0, 12.0]),
            vec![2, 3],
            false,
        )?;
        let c = addcdiv(&input, &t1, &t2, 1.0)?;
        assert_eq!(c.shape(), &[2, 3], "addcdiv broadcast output shape");
        let d = c.data()?.to_vec();
        let expected: [f32; 6] = [6.0, 7.0, 9.0, 6.0, 7.0, 8.0];
        for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "addcdiv broadcast[{i}] = {got}, expected {want}"
            );
        }
        Ok(())
    }

    #[test]
    fn test_addcdiv_backward_value_two() -> FerrotorchResult<()> {
        // value=2. Per derivatives.yaml:
        //   d_input   = grad
        //   d_tensor1 = grad * value / tensor2 = grad * 2 / tensor2
        //   d_tensor2 = -grad * value * tensor1 / (tensor2*tensor2)
        //             = -grad * 2 * tensor1 / (tensor2*tensor2)
        //
        // Scalar case (input,t1,t2 = scalars), output is scalar, backward
        // grad = 1 (default scalar seed).
        //
        // input=3, t1=8, t2=4, value=2:
        //   forward: 3 + 2*8/4 = 3 + 4 = 7
        //   d_input = 1
        //   d_t1    = 1 * 2 / 4              = 0.5
        //   d_t2    = -1 * 2 * 8 / (4*4)     = -16/16 = -1.0
        let input = leaf_scalar(3.0, true);
        let t1 = leaf_scalar(8.0, true);
        let t2 = leaf_scalar(4.0, true);
        let c = addcdiv(&input, &t1, &t2, 2.0)?;
        assert!(c.grad_fn().is_some(), "addcdiv must attach grad_fn");
        let fwd = c.data()?.to_vec();
        assert!(
            (fwd[0] - 7.0).abs() < 1e-6,
            "addcdiv forward = {}, expected 7",
            fwd[0]
        );
        c.backward()?;

        // d_input = grad (=1)
        let g_input = input
            .grad()?
            .ok_or_else(|| FerrotorchError::InvalidArgument {
                message: "addcdiv backward: input gradient missing".into(),
            })?;
        assert_scalar_approx(&g_input, 1.0, 1e-6);
        // d_t1 = grad * value / tensor2 = 1 * 2 / 4 = 0.5
        let g_t1 = t1.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "addcdiv backward: tensor1 gradient missing".into(),
        })?;
        assert_scalar_approx(&g_t1, 0.5, 1e-6);
        // d_t2 = -grad * value * tensor1 / (tensor2*tensor2) = -1*2*8/16 = -1
        let g_t2 = t2.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "addcdiv backward: tensor2 gradient missing".into(),
        })?;
        assert_scalar_approx(&g_t2, -1.0, 1e-6);
        Ok(())
    }

    #[test]
    fn test_abs_backward_positive() {
        // c = |a| where a > 0; dc/da = sign(a) = 1.
        let a = leaf_scalar(3.0, true);
        let c = abs(&a).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
    }

    #[test]
    fn test_abs_backward_negative() {
        // c = |a| where a < 0; dc/da = sign(a) = -1.
        let a = leaf_scalar(-3.0, true);
        let c = abs(&a).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), -1.0, 1e-6);
    }

    // -----------------------------------------------------------------------
    // Tests for no-grad and partial requires_grad
    // -----------------------------------------------------------------------

    #[test]
    fn test_add_no_grad_fn_when_inputs_detached() {
        let a = leaf_scalar(2.0, false);
        let b = leaf_scalar(3.0, false);
        let c = add(&a, &b).unwrap();
        assert!(c.grad_fn().is_none());
    }

    #[test]
    fn test_mul_partial_requires_grad() {
        // a requires grad, b does not.
        // c = a * b; dc/da = b = 5, dc/db = None.
        let a = leaf_scalar(3.0, true);
        let b = leaf_scalar(5.0, false);
        let c = mul(&a, &b).unwrap();
        assert!(c.grad_fn().is_some());
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 5.0, 1e-6);
        assert!(b.grad().unwrap().is_none());
    }

    #[test]
    fn test_no_grad_context_skips_backward() {
        use crate::autograd::no_grad::no_grad;

        let a = leaf_scalar(2.0, true);
        let b = leaf_scalar(3.0, true);
        let c = no_grad(|| add(&a, &b)).unwrap();
        // Inside no_grad, no grad_fn should be attached.
        assert!(c.grad_fn().is_none());
    }

    // -----------------------------------------------------------------------
    // Chain rule tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_chain_mul_add() {
        // d = a * b + b
        // dd/da = b = 3
        // dd/db = a + 1 = 3
        let a = leaf_scalar(2.0, true);
        let b = leaf_scalar(3.0, true);
        let c = mul(&a, &b).unwrap();
        let d = add(&c, &b).unwrap();
        d.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 3.0, 1e-6);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), 3.0, 1e-6);
    }

    #[test]
    fn test_chain_div_sub() {
        // c = a / b - a
        // dc/da = 1/b - 1 = 1/2 - 1 = -0.5
        // dc/db = -a/b^2 = -3/4 = -0.75
        let a = leaf_scalar(3.0, true);
        let b = leaf_scalar(2.0, true);
        let d = div(&a, &b).unwrap();
        let e = sub(&d, &a).unwrap();
        e.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), -0.5, 1e-5);
        assert_scalar_approx(&b.grad().unwrap().unwrap(), -0.75, 1e-5);
    }

    #[test]
    fn test_chain_sqrt_pow() {
        // c = sqrt(a)^2 = a. dc/da = 1.
        // sqrt(9) = 3, pow(3, 2) = 9.
        // d(pow)/d(sqrt) = 2 * sqrt(a) = 6.
        // d(sqrt)/da = 1 / (2*sqrt(a)) = 1/6.
        // dc/da = 6 * 1/6 = 1.
        let a = leaf_scalar(9.0, true);
        let s = sqrt(&a).unwrap();
        let c = pow(&s, 2.0).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-5);
    }

    #[test]
    fn test_neg_double() {
        // c = -(-a) = a; dc/da = 1.
        let a = leaf_scalar(5.0, true);
        let b = neg(&a).unwrap();
        let c = neg(&b).unwrap();
        c.backward().unwrap();

        assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
    }

    // -----------------------------------------------------------------------
    // Vector backward tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_mul_vector_backward() {
        // c = a * b (elementwise), then sum to scalar for backward.
        // loss = sum(a * b)
        // d(loss)/d(a_i) = b_i, d(loss)/d(b_i) = a_i.
        let a = leaf_vec(&[1.0, 2.0, 3.0], true);
        let b = leaf_vec(&[4.0, 5.0, 6.0], true);
        let c = mul(&a, &b).unwrap();

        // Sum to scalar so we can call backward.
        let c_data = c.data().unwrap().to_vec();
        let total: f32 = c_data.iter().sum();
        let sum_backward = SumBackward { input: c.clone() };
        let loss = Tensor::from_operation(
            TensorStorage::cpu(vec![total]),
            vec![],
            Arc::new(sum_backward),
        )
        .unwrap();
        loss.backward().unwrap();

        let a_grad = a.grad().unwrap().unwrap();
        let a_g = a_grad.data().unwrap();
        assert!((a_g[0] - 4.0).abs() < 1e-6);
        assert!((a_g[1] - 5.0).abs() < 1e-6);
        assert!((a_g[2] - 6.0).abs() < 1e-6);

        let b_grad = b.grad().unwrap().unwrap();
        let b_g = b_grad.data().unwrap();
        assert!((b_g[0] - 1.0).abs() < 1e-6);
        assert!((b_g[1] - 2.0).abs() < 1e-6);
        assert!((b_g[2] - 3.0).abs() < 1e-6);
    }

    /// Helper backward node for sum reduction in tests:
    /// loss = sum(input); d(loss)/d(input_i) = 1.
    #[derive(Debug)]
    struct SumBackward<T: Float> {
        input: Tensor<T>,
    }

    impl<T: Float> GradFn<T> for SumBackward<T> {
        fn backward(&self, _grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
            let ones_data = vec![<T as num_traits::One>::one(); self.input.numel()];
            let ones = Tensor::from_storage(
                TensorStorage::cpu(ones_data),
                self.input.shape().to_vec(),
                false,
            )?;
            Ok(vec![Some(ones)])
        }

        fn inputs(&self) -> Vec<&Tensor<T>> {
            vec![&self.input]
        }

        fn name(&self) -> &'static str {
            "SumBackward"
        }
    }
}