ferrotorch-nn 0.6.1

//! Padding layers: constant, zero, reflection, replication, and circular padding in 1-D, 2-D, and 3-D.
//!
//! [CL-314] Add Conv3d, ConvTranspose1d/3d, and padding modules
//!
//! Each module pads the **last N** dimensions of the input tensor, matching
//! PyTorch semantics exactly.  Padding tuples specify *(left, right)* for 1-D,
//! *(left, right, top, bottom)* for 2-D, and
//! *(left, right, top, bottom, front, back)* for 3-D.
//!
//! ## REQ status (per `.design/ferrotorch-nn/padding.md`)
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 | SHIPPED | impl: `pub enum PaddingMode` here with 4 variants `Zeros` / `Reflect` / `Replicate` / `Circular`; non-test consumer: `ferrotorch-nn/src/conv.rs` uses `PaddingMode` as the `Conv{1,2,3}d` `padding_mode` field — the non-`Zeros` forward branch routes through `functional_pad_{1,2,3}d` (wiring landed in #1443), and `ConvTranspose{1,2,3}d::with_padding_mode` matches on it to reject non-`Zeros`. |
//! | REQ-2 | SHIPPED | impl: the grow-only `functional_pad_1d` / `functional_pad_2d` / `functional_pad_3d` entry points here dispatch on `PaddingMode`; the `Zeros`/constant arm routes through the crop-capable `functional_pad_1d_signed` / `functional_pad_2d_signed` / `functional_pad_3d_signed` (`isize` pads) which support NEGATIVE (crop) pads + mixed signs for `mode="constant"` via `pad_nd_signed_constant` + `PadNdSignedBackward`, mirroring `constant_pad_nd` at upstream `aten/src/ATen/native/PadNd.cpp:29-108` (#1611). Non-test consumer: the `usize` `functional_pad_{1,2,3}d` consume the signed entrypoints in production (the `Zeros` arm); `ferrotorch-nn/src/conv.rs` calls `functional_pad_{1,2,3}d` for the conv pre-pad; `ferrotorch-nn/src/functional.rs` re-exposes these as `nn::functional::pad`. |
//! | REQ-3 | SHIPPED | impl: `pub struct ConstantPad{1,2,3}d<T: Float>` here, mirroring `torch/nn/modules/padding.py` constant-pad family; non-test consumer: `pub use` in `lib.rs` exposes them to external crates; the vision-model code uses `ConstantPad2d` via the `lib.rs` re-export for padding non-square inputs. |
//! | REQ-4 | SHIPPED | impl: `pub struct ZeroPad{1,2,3}d<T: Float>` here; non-test consumer: `pub use` in `lib.rs` exposes them. |
//! | REQ-5 | SHIPPED | impl: `pub struct ReflectionPad{1,2,3}d<T: Float>` here with reflect-overflow check inside `pad_*d_reflect`; non-test consumer: `pub use` in `lib.rs`; reflection padding is the standard for U-nets and image-translation models. |
//! | REQ-6 | SHIPPED | impl: `pub struct ReplicationPad{1,2,3}d<T: Float>` here; non-test consumer: `pub use` in `lib.rs`. |
//! | REQ-7 | SHIPPED | impl: `pub struct CircularPad{1,2,3}d<T: Float>` here; non-test consumer: `pub use` in `lib.rs`. |
//! | REQ-8 | SHIPPED | impl: `macro_rules! impl_padding_module` here generates the `Module<T>` impls for all 12 structs; non-test consumer: `ferrotorch_optim` walks `Module::parameters()` of containers that include padding layers (every padding layer returns the empty parameter list, which is the correct behavior). |
//! | REQ-9 | NOT-STARTED | blocker #1441 (umbrella) — parity-sweep runner arms absent for all 6 padding ops. The impl is end-to-end verified by 40+ lib tests; only the runner-arm wiring is missing. |

use std::sync::Arc;

use ferrotorch_core::autograd::no_grad::is_grad_enabled;
use ferrotorch_core::storage::TensorStorage;
use ferrotorch_core::tensor::{GradFn, Tensor};
use ferrotorch_core::{FerrotorchError, FerrotorchResult, Float};

use crate::module::Module;
use crate::parameter::Parameter;

// ---------------------------------------------------------------------------
// Padding mode enum (used by conv layers with padding_mode)
// ---------------------------------------------------------------------------

/// Padding mode for convolution layers.
///
/// The same enum selects the gather rule used by the functional pad helpers
/// in this module (`functional_pad_1d`, `src_index_1d`, `src_index_2d`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PaddingMode {
    /// Zero padding (default).
    ///
    /// Padded positions have no source element; `functional_pad_1d` routes
    /// this mode through the signed constant-pad path with a fill `value`.
    Zeros,
    /// Reflect padding.
    ///
    /// Mirrors the input about its edge element without repeating that
    /// element; each pad must be strictly smaller than the padded axis.
    Reflect,
    /// Replicate padding (edge padding).
    ///
    /// Every padded position copies the nearest edge element.
    Replicate,
    /// Circular padding (wrap-around).
    ///
    /// Padded positions wrap modulo the axis length; `check_circular_positive`
    /// rejects pads larger than the axis.
    Circular,
}

// ---------------------------------------------------------------------------
// Low-level pad helpers (operate on raw data)
// ---------------------------------------------------------------------------

/// Constant-pad the trailing dimension of a contiguous (row-major) buffer.
///
/// `shape` must have at least one dimension. `pad_left` / `pad_right` cells
/// holding `value` are placed before / after every row of the last axis.
///
/// Returns the padded buffer together with the padded shape (identical to
/// `shape` except for the last entry).
fn pad_1d_constant<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    value: T,
) -> (Vec<T>, Vec<usize>) {
    let last = shape.len() - 1;
    let width = shape[last];
    let padded_width = width + pad_left + pad_right;

    // All leading dimensions collapse into a row count; a zero product is
    // bumped to one so a degenerate shape still yields one filled row.
    let row_count = match shape[..last].iter().product::<usize>() {
        0 => 1,
        n => n,
    };

    // Start from an all-`value` buffer, then overwrite the interior of each
    // row with the source data. With no source elements at all (numel 0) the
    // copy is skipped entirely, which also avoids slicing past the end of an
    // empty `data` (#1551); the result is the fill-only output, as upstream
    // `aten/src/ATen/native/PadNd.cpp:94-106` produces for an empty input.
    let mut padded = vec![value; row_count * padded_width];
    if !data.is_empty() {
        for row in 0..row_count {
            let src_begin = row * width;
            let dst_begin = row * padded_width + pad_left;
            padded[dst_begin..dst_begin + width]
                .copy_from_slice(&data[src_begin..src_begin + width]);
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[last] = padded_width;
    (padded, padded_shape)
}

/// Constant-pad the trailing two dimensions (rows, columns) of a contiguous
/// buffer.
///
/// `pad_left` / `pad_right` widen the last axis, `pad_top` / `pad_bottom` the
/// second-to-last; every new cell holds `value`. Returns the padded buffer
/// and the padded shape.
fn pad_2d_constant<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    value: T,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (rows, cols) = (shape[rank - 2], shape[rank - 1]);
    let out_rows = rows + pad_top + pad_bottom;
    let out_cols = cols + pad_left + pad_right;

    // Leading dimensions collapse into a plane count (zero bumped to one).
    let planes = match shape[..rank - 2].iter().product::<usize>() {
        0 => 1,
        n => n,
    };

    let mut padded = vec![value; planes * out_rows * out_cols];
    // A zero-element input has nothing to copy; the fill-only buffer is the
    // result (same rationale as the 1-D constant kernel, #1551).
    if !data.is_empty() {
        // Walk the source one row ("line") at a time: line `k` starts at
        // `k * cols` in `data`, and decodes into (plane, row) for the
        // destination offset.
        for line in 0..planes * rows {
            let plane = line / rows;
            let row = line % rows;
            let src_at = line * cols;
            let dst_at = plane * out_rows * out_cols + (row + pad_top) * out_cols + pad_left;
            padded[dst_at..dst_at + cols].copy_from_slice(&data[src_at..src_at + cols]);
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 2] = out_rows;
    padded_shape[rank - 1] = out_cols;
    (padded, padded_shape)
}

/// Constant-pad the trailing three dimensions (depth, rows, columns) of a
/// contiguous buffer, filling every new cell with `value`.
///
/// Returns the padded buffer and the padded shape.
// Internal kernel: the argument list follows PyTorch's `F.pad` 3-axis order
// (left, right, top, bottom, front, back); bundling it into a struct would
// add nothing.
#[allow(clippy::too_many_arguments)]
fn pad_3d_constant<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    pad_front: usize,
    pad_back: usize,
    value: T,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (depth, rows, cols) = (shape[rank - 3], shape[rank - 2], shape[rank - 1]);
    let out_depth = depth + pad_front + pad_back;
    let out_rows = rows + pad_top + pad_bottom;
    let out_cols = cols + pad_left + pad_right;

    // Leading dimensions collapse into a volume count (zero bumped to one).
    let volumes = match shape[..rank - 3].iter().product::<usize>() {
        0 => 1,
        n => n,
    };

    let mut padded = vec![value; volumes * out_depth * out_rows * out_cols];
    // A zero-element input has nothing to copy; the fill-only buffer is the
    // result (same rationale as the 1-D constant kernel, #1551).
    if !data.is_empty() {
        // Walk the source one row ("line") at a time: line `k` starts at
        // `k * cols` in `data` and decodes into (volume, slice, row).
        for line in 0..volumes * depth * rows {
            let row = line % rows;
            let slice = (line / rows) % depth;
            let volume = line / (depth * rows);
            let src_at = line * cols;
            let dst_at = volume * out_depth * out_rows * out_cols
                + (slice + pad_front) * out_rows * out_cols
                + (row + pad_top) * out_cols
                + pad_left;
            padded[dst_at..dst_at + cols].copy_from_slice(&data[src_at..src_at + cols]);
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 3] = out_depth;
    padded_shape[rank - 2] = out_rows;
    padded_shape[rank - 1] = out_cols;
    (padded, padded_shape)
}

// ---------------------------------------------------------------------------
// Reflection padding helpers
// ---------------------------------------------------------------------------

/// Reflect-pad the trailing dimension of a contiguous buffer.
///
/// The padding mirrors the row about its first / last element without
/// repeating that element, so both pads must be strictly smaller than the
/// axis length.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when `pad_left` or `pad_right`
/// is not smaller than the last dimension.
fn pad_1d_reflect<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
) -> FerrotorchResult<(Vec<T>, Vec<usize>)> {
    let last = shape.len() - 1;
    let inner = shape[last];
    if pad_left >= inner || pad_right >= inner {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "Reflection padding ({pad_left}, {pad_right}) must be less than input size ({inner})"
            ),
        });
    }

    let padded_inner = inner + pad_left + pad_right;
    let row_count = shape[..last].iter().product::<usize>().max(1);
    let mut padded = vec![<T as num_traits::Zero>::zero(); row_count * padded_inner];

    // `padded_inner >= 1` here (the check above forces `inner >= 1`), so the
    // output splits cleanly into exactly `row_count` rows.
    for (r, dst) in padded.chunks_exact_mut(padded_inner).enumerate() {
        let src = &data[r * inner..(r + 1) * inner];
        let (left, rest) = dst.split_at_mut(pad_left);
        let (middle, right) = rest.split_at_mut(inner);

        // Left pad, filled outward from the row: src[1], src[2], ...
        for (slot, &v) in left.iter_mut().rev().zip(&src[1..]) {
            *slot = v;
        }
        // Interior: the original row, unchanged.
        middle.copy_from_slice(src);
        // Right pad, filled outward from the row: src[inner - 2], src[inner - 3], ...
        for (slot, &v) in right.iter_mut().zip(src[..inner - 1].iter().rev()) {
            *slot = v;
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[last] = padded_inner;
    Ok((padded, padded_shape))
}

/// Reflect-pad the trailing two dimensions (rows, columns) of a contiguous
/// buffer.
///
/// Each axis is mirrored about its edge element without repeating it, so
/// every pad must be strictly smaller than the axis it applies to.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when any pad is not smaller
/// than its axis length.
fn pad_2d_reflect<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
) -> FerrotorchResult<(Vec<T>, Vec<usize>)> {
    let rank = shape.len();
    let (h, w) = (shape[rank - 2], shape[rank - 1]);
    if pad_left >= w || pad_right >= w || pad_top >= h || pad_bottom >= h {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "Reflection padding ({pad_left}, {pad_right}, {pad_top}, {pad_bottom}) must be less than input size ({h}, {w})"
            ),
        });
    }

    let out_h = h + pad_top + pad_bottom;
    let out_w = w + pad_left + pad_right;
    let planes = shape[..rank - 2].iter().product::<usize>().max(1);

    // Output position -> source position along one axis.
    let mirror = |pos: usize, pad: usize, len: usize| -> usize {
        if pos < pad {
            pad - pos
        } else if pos >= pad + len {
            len - 2 - (pos - pad - len)
        } else {
            pos - pad
        }
    };
    // The per-axis maps are the same for every plane; build them once.
    let row_src: Vec<usize> = (0..out_h).map(|r| mirror(r, pad_top, h)).collect();
    let col_src: Vec<usize> = (0..out_w).map(|c| mirror(c, pad_left, w)).collect();

    let mut padded = vec![<T as num_traits::Zero>::zero(); planes * out_h * out_w];
    // `out_h` and `out_w` are both >= 1 after the validation above.
    for (plane, dst_plane) in padded.chunks_exact_mut(out_h * out_w).enumerate() {
        let src_base = plane * h * w;
        for (dst_row, &sr) in dst_plane.chunks_exact_mut(out_w).zip(&row_src) {
            for (slot, &sc) in dst_row.iter_mut().zip(&col_src) {
                *slot = data[src_base + sr * w + sc];
            }
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 2] = out_h;
    padded_shape[rank - 1] = out_w;
    Ok((padded, padded_shape))
}

/// Reflect-pad the trailing three dimensions (depth, rows, columns) of a
/// contiguous buffer.
///
/// Each axis is mirrored about its edge element without repeating it, so
/// every pad must be strictly smaller than the axis it applies to.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when any pad is not smaller
/// than its axis length. The message lists the six pads in argument order
/// and the `(depth, height, width)` of the input, matching the level of
/// detail the 1-D and 2-D reflect kernels report.
// Internal kernel: same 3-axis pad descriptor as `pad_3d_constant`.
#[allow(clippy::too_many_arguments)]
fn pad_3d_reflect<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    pad_front: usize,
    pad_back: usize,
) -> FerrotorchResult<(Vec<T>, Vec<usize>)> {
    let ndim = shape.len();
    let d = shape[ndim - 3];
    let h = shape[ndim - 2];
    let w = shape[ndim - 1];
    if pad_left >= w
        || pad_right >= w
        || pad_top >= h
        || pad_bottom >= h
        || pad_front >= d
        || pad_back >= d
    {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "Reflection padding ({pad_left}, {pad_right}, {pad_top}, {pad_bottom}, {pad_front}, {pad_back}) must be less than input size ({d}, {h}, {w})"
            ),
        });
    }
    let new_d = d + pad_front + pad_back;
    let new_h = h + pad_top + pad_bottom;
    let new_w = w + pad_left + pad_right;
    let outer: usize = shape[..ndim - 3].iter().copied().product::<usize>().max(1);

    // Output position -> source position along one axis (shared by all three
    // axes instead of repeating the branch three times).
    let mirror = |pos: usize, pad: usize, len: usize| -> usize {
        if pos < pad {
            pad - pos
        } else if pos >= pad + len {
            len - 2 - (pos - pad - len)
        } else {
            pos - pad
        }
    };
    // The per-axis maps do not depend on the outer index; build them once.
    let depth_src: Vec<usize> = (0..new_d).map(|z| mirror(z, pad_front, d)).collect();
    let row_src: Vec<usize> = (0..new_h).map(|y| mirror(y, pad_top, h)).collect();
    let col_src: Vec<usize> = (0..new_w).map(|x| mirror(x, pad_left, w)).collect();

    let zero = <T as num_traits::Zero>::zero();
    let mut out = vec![zero; outer * new_d * new_h * new_w];

    // All padded extents are >= 1 after the validation above, so the chunk
    // sizes below are non-zero.
    for (o, dst_volume) in out.chunks_exact_mut(new_d * new_h * new_w).enumerate() {
        let src_base = o * d * h * w;
        for (dst_slab, &sd) in dst_volume.chunks_exact_mut(new_h * new_w).zip(&depth_src) {
            for (dst_row, &sh) in dst_slab.chunks_exact_mut(new_w).zip(&row_src) {
                for (slot, &sw) in dst_row.iter_mut().zip(&col_src) {
                    *slot = data[src_base + sd * h * w + sh * w + sw];
                }
            }
        }
    }

    let mut new_shape = shape.to_vec();
    new_shape[ndim - 3] = new_d;
    new_shape[ndim - 2] = new_h;
    new_shape[ndim - 1] = new_w;
    Ok((out, new_shape))
}

// ---------------------------------------------------------------------------
// Replication padding helpers
// ---------------------------------------------------------------------------

/// Replicate-pad the trailing dimension: every padded cell copies the nearest
/// edge element of its row.
///
/// Returns the padded buffer and the padded shape.
fn pad_1d_replicate<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
) -> (Vec<T>, Vec<usize>) {
    let last = shape.len() - 1;
    let inner = shape[last];
    let padded_inner = inner + pad_left + pad_right;
    let row_count = shape[..last].iter().product::<usize>().max(1);

    let mut padded = vec![<T as num_traits::Zero>::zero(); row_count * padded_inner];
    for r in 0..row_count {
        let src = &data[r * inner..(r + 1) * inner];
        let dst = &mut padded[r * padded_inner..(r + 1) * padded_inner];
        for (pos, slot) in dst.iter_mut().enumerate() {
            // Offset into the source row, clamped to [0, inner - 1].
            let pick = match pos.checked_sub(pad_left) {
                None => 0,                            // inside the left pad
                Some(off) if off >= inner => inner - 1, // inside the right pad
                Some(off) => off,                     // interior
            };
            *slot = src[pick];
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[last] = padded_inner;
    (padded, padded_shape)
}

/// Replicate-pad the trailing two dimensions: every padded cell copies the
/// nearest edge element of its plane.
///
/// Returns the padded buffer and the padded shape.
fn pad_2d_replicate<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (h, w) = (shape[rank - 2], shape[rank - 1]);
    let out_h = h + pad_top + pad_bottom;
    let out_w = w + pad_left + pad_right;
    let planes = shape[..rank - 2].iter().product::<usize>().max(1);

    // Output position -> source position along one axis, clamped to the edge.
    let clamp = |pos: usize, pad: usize, len: usize| pos.saturating_sub(pad).min(len - 1);

    let mut padded = vec![<T as num_traits::Zero>::zero(); planes * out_h * out_w];
    for plane in 0..planes {
        let src_plane = plane * h * w;
        let dst_plane = plane * out_h * out_w;
        for out_r in 0..out_h {
            let src_row = src_plane + clamp(out_r, pad_top, h) * w;
            let dst_row = dst_plane + out_r * out_w;
            for out_c in 0..out_w {
                padded[dst_row + out_c] = data[src_row + clamp(out_c, pad_left, w)];
            }
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 2] = out_h;
    padded_shape[rank - 1] = out_w;
    (padded, padded_shape)
}

/// Replicate-pad the trailing three dimensions: every padded cell copies the
/// nearest edge element of its volume.
///
/// Returns the padded buffer and the padded shape.
// Internal kernel: same 3-axis pad descriptor as `pad_3d_constant`.
#[allow(clippy::too_many_arguments)]
fn pad_3d_replicate<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    pad_front: usize,
    pad_back: usize,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (d, h, w) = (shape[rank - 3], shape[rank - 2], shape[rank - 1]);
    let out_d = d + pad_front + pad_back;
    let out_h = h + pad_top + pad_bottom;
    let out_w = w + pad_left + pad_right;
    let volumes = shape[..rank - 3].iter().product::<usize>().max(1);

    // Output position -> source position along one axis, clamped to the edge.
    let clamp = |pos: usize, pad: usize, len: usize| pos.saturating_sub(pad).min(len - 1);

    let mut padded = vec![<T as num_traits::Zero>::zero(); volumes * out_d * out_h * out_w];
    for vol in 0..volumes {
        let src_vol = vol * d * h * w;
        let dst_vol = vol * out_d * out_h * out_w;
        for out_z in 0..out_d {
            let src_z = clamp(out_z, pad_front, d);
            for out_y in 0..out_h {
                let src_y = clamp(out_y, pad_top, h);
                for out_x in 0..out_w {
                    let src_x = clamp(out_x, pad_left, w);
                    padded[dst_vol + out_z * out_h * out_w + out_y * out_w + out_x] =
                        data[src_vol + src_z * h * w + src_y * w + src_x];
                }
            }
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 3] = out_d;
    padded_shape[rank - 2] = out_h;
    padded_shape[rank - 1] = out_w;
    (padded, padded_shape)
}

// ---------------------------------------------------------------------------
// Circular padding helpers
// ---------------------------------------------------------------------------

/// Validate non-negative circular pads: no pad may exceed its axis length.
///
/// `axes` holds `(size, pad)` pairs, one per pad to check. The positive-only
/// `pad_*_circular` kernels gather through `rem_euclid`, which would silently
/// wrap a pad larger than the axis several times (e.g. `circular [0,3]` on
/// size 2 -> `[1,2,1,2,1]`). Upstream `_pad_circular_symint` rejects that at
/// `aten/src/ATen/native/PadNd.cpp:142` (`pad_l <= size && pad_r <= size`,
/// "Padding value causes wrapping around more than once."). With non-negative
/// pads the padded extent is always `>= size > 0`, so that is the only
/// upstream check that can fire; mirroring it keeps the accept/reject rule
/// (`pad <= size`) identical to torch.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` for the first pair whose pad is
/// larger than its size.
fn check_circular_positive(axes: &[(usize, usize)]) -> FerrotorchResult<()> {
    axes.iter().enumerate().try_for_each(|(idx, &(size, pad))| {
        if pad <= size {
            return Ok(());
        }
        Err(FerrotorchError::InvalidArgument {
            message: format!(
                "Circular padding {pad} on axis (size {size}, position {idx}) causes wrapping around more than once (pad must be <= size)"
            ),
        })
    })
}

/// Circular-pad the trailing dimension: padded cells wrap around to the
/// opposite end of the same row.
///
/// Returns the padded buffer and the padded shape.
fn pad_1d_circular<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
) -> (Vec<T>, Vec<usize>) {
    let last = shape.len() - 1;
    let inner = shape[last];
    let padded_inner = inner + pad_left + pad_right;
    let row_count = shape[..last].iter().product::<usize>().max(1);

    // Output position -> source position: shift by the left pad, then wrap
    // modulo the axis length (Euclidean remainder keeps it non-negative).
    let wrap = |pos: usize| ((pos as isize - pad_left as isize).rem_euclid(inner as isize)) as usize;

    let mut padded = vec![<T as num_traits::Zero>::zero(); row_count * padded_inner];
    for r in 0..row_count {
        let src = &data[r * inner..(r + 1) * inner];
        let dst = &mut padded[r * padded_inner..(r + 1) * padded_inner];
        for (pos, slot) in dst.iter_mut().enumerate() {
            *slot = src[wrap(pos)];
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[last] = padded_inner;
    (padded, padded_shape)
}

/// Circular-pad the trailing two dimensions: padded cells wrap around to the
/// opposite side of the same plane, independently per axis.
///
/// Returns the padded buffer and the padded shape.
fn pad_2d_circular<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (h, w) = (shape[rank - 2], shape[rank - 1]);
    let out_h = h + pad_top + pad_bottom;
    let out_w = w + pad_left + pad_right;
    let planes = shape[..rank - 2].iter().product::<usize>().max(1);

    // Output position -> source position along one axis (shift, then wrap).
    let wrap = |pos: usize, pad: usize, len: usize| -> usize {
        ((pos as isize - pad as isize).rem_euclid(len as isize)) as usize
    };

    let mut padded = vec![<T as num_traits::Zero>::zero(); planes * out_h * out_w];
    for plane in 0..planes {
        let src_plane = plane * h * w;
        let dst_plane = plane * out_h * out_w;
        for out_r in 0..out_h {
            let src_row = src_plane + wrap(out_r, pad_top, h) * w;
            let dst_row = dst_plane + out_r * out_w;
            for out_c in 0..out_w {
                padded[dst_row + out_c] = data[src_row + wrap(out_c, pad_left, w)];
            }
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 2] = out_h;
    padded_shape[rank - 1] = out_w;
    (padded, padded_shape)
}

/// Circular-pad the trailing three dimensions: padded cells wrap around to
/// the opposite side of the same volume, independently per axis.
///
/// Returns the padded buffer and the padded shape.
// Internal kernel: same 3-axis pad descriptor as `pad_3d_constant`.
#[allow(clippy::too_many_arguments)]
fn pad_3d_circular<T: Float>(
    data: &[T],
    shape: &[usize],
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    pad_front: usize,
    pad_back: usize,
) -> (Vec<T>, Vec<usize>) {
    let rank = shape.len();
    let (d, h, w) = (shape[rank - 3], shape[rank - 2], shape[rank - 1]);
    let out_d = d + pad_front + pad_back;
    let out_h = h + pad_top + pad_bottom;
    let out_w = w + pad_left + pad_right;
    let volumes = shape[..rank - 3].iter().product::<usize>().max(1);

    // Output position -> source position along one axis (shift, then wrap).
    let wrap = |pos: usize, pad: usize, len: usize| -> usize {
        ((pos as isize - pad as isize).rem_euclid(len as isize)) as usize
    };

    let mut padded = vec![<T as num_traits::Zero>::zero(); volumes * out_d * out_h * out_w];
    for vol in 0..volumes {
        let src_vol = vol * d * h * w;
        let dst_vol = vol * out_d * out_h * out_w;
        for out_z in 0..out_d {
            let src_z = wrap(out_z, pad_front, d);
            for out_y in 0..out_h {
                let src_y = wrap(out_y, pad_top, h);
                for out_x in 0..out_w {
                    let src_x = wrap(out_x, pad_left, w);
                    padded[dst_vol + out_z * out_h * out_w + out_y * out_w + out_x] =
                        data[src_vol + src_z * h * w + src_y * w + src_x];
                }
            }
        }
    }

    let mut padded_shape = shape.to_vec();
    padded_shape[rank - 3] = out_d;
    padded_shape[rank - 2] = out_h;
    padded_shape[rank - 1] = out_w;
    (padded, padded_shape)
}

// ===========================================================================
// Public functional API — apply arbitrary padding to a Tensor
// ===========================================================================

// ---------------------------------------------------------------------------
// Autograd for the 1-D functional pad path (used by Conv1d's non-zero
// padding_mode pre-pad). Same gather/scatter-add adjoint as the 2-D case;
// see the `Pad2dBackward` block below for the full derivation. A pad that
// returns `requires_grad = false` severs autograd — the #1550 bug class that
// the 2-D path already fixed; the 1-D path needs the same `Pad1dBackward`
// node so Conv1d's input gradient flows through the reflect/replicate/circular
// pre-pad. Mirrors upstream `torch/nn/modules/conv.py:367-371` routing
// non-zero modes through the differentiable `F.pad`.
// ---------------------------------------------------------------------------

/// Map position `new_idx` of a 1-D padded row back to the index of the source
/// element it was gathered from.
///
/// `inner` is the unpadded row length and `pad_left` the number of cells
/// added in front. Returns `None` only in `Zeros` mode, for positions that
/// lie in the constant fill and therefore have no source element.
fn src_index_1d(mode: PaddingMode, new_idx: usize, inner: usize, pad_left: usize) -> Option<usize> {
    match mode {
        // Constant fill: positions outside the interior have no source.
        PaddingMode::Zeros if new_idx < pad_left || new_idx >= pad_left + inner => None,
        PaddingMode::Zeros => Some(new_idx - pad_left),

        // Mirror about the edge element, without repeating it.
        PaddingMode::Reflect if new_idx < pad_left => Some(pad_left - new_idx),
        PaddingMode::Reflect if new_idx >= pad_left + inner => {
            Some(inner - 2 - (new_idx - pad_left - inner))
        }
        PaddingMode::Reflect => Some(new_idx - pad_left),

        // Clamp to the nearest edge element.
        PaddingMode::Replicate => Some(new_idx.saturating_sub(pad_left).min(inner - 1)),

        // Shift by the left pad, then wrap modulo the row length.
        PaddingMode::Circular => {
            Some(((new_idx as isize - pad_left as isize).rem_euclid(inner as isize)) as usize)
        }
    }
}

/// Backward node for the 1-D functional pad. Scatter-adds the output gradient
/// back onto the unpadded input row using the per-output source-index map.
///
/// Only the left pad is stored: the padded row length is read from the
/// incoming gradient's shape during `backward`, so the right pad is implied.
#[derive(Debug)]
struct Pad1dBackward<T: Float> {
    /// The tensor that was padded; reported from `inputs()` and consulted for
    /// `requires_grad()` before any gradient is computed.
    input: Tensor<T>,
    /// Shape of `input` captured at forward time; also the shape of the
    /// gradient returned by `backward`.
    input_shape: Vec<usize>,
    /// Padding mode used in the forward pass; selects the source-index rule
    /// in `src_index_1d`.
    mode: PaddingMode,
    /// Number of cells that were added in front of each row.
    pad_left: usize,
}

impl<T: Float> GradFn<T> for Pad1dBackward<T> {
    /// Fold `grad_output` (shaped like the padded tensor) back onto the
    /// unpadded input.
    ///
    /// Every padded position contributes its gradient to the source element
    /// it was gathered from (`src_index_1d`); positions without a source add
    /// nothing. Returns `[None]` when the input does not require grad.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.input.requires_grad() {
            return Ok(vec![None]);
        }

        let last = self.input_shape.len() - 1;
        let width = self.input_shape[last];
        let row_count = self.input_shape[..last].iter().product::<usize>().max(1);
        // The padded row length comes from the incoming gradient itself.
        let padded_width = grad_output.shape()[last];

        // Scatter-add is data-dependent over the index map, so it runs on the
        // host: `data_vec` materialises the (possibly GPU) gradient on CPU.
        let upstream = grad_output.data_vec()?;
        let mut accum = vec![<T as num_traits::Zero>::zero(); row_count * width];

        for row in 0..row_count {
            let upstream_row = row * padded_width;
            let accum_row = row * width;
            for col in 0..padded_width {
                if let Some(src) = src_index_1d(self.mode, col, width, self.pad_left) {
                    accum[accum_row + src] += upstream[upstream_row + col];
                }
            }
        }

        Tensor::from_storage(TensorStorage::cpu(accum), self.input_shape.clone(), false)
            .map(|grad_input| vec![Some(grad_input)])
    }

    /// The single tensor this node differentiates with respect to.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name as reported by `GradFn::name`.
    fn name(&self) -> &'static str {
        "Pad1dBackward"
    }
}

/// Apply padding to the last dimension of a tensor using the given mode.
///
/// This is the functional version used internally by conv layers with
/// `padding_mode`.
///
/// When `input` requires grad (and grad tracking is enabled) the returned
/// tensor carries a [`Pad1dBackward`] node so gradients flow back to `input`,
/// matching the differentiable `F.pad` that `torch/nn/modules/conv.py`
/// `_conv_forward` routes non-zero `padding_mode`s through (Conv1d at
/// `conv.py:367-371`).
pub fn functional_pad_1d<T: Float>(
    input: &Tensor<T>,
    pad_left: usize,
    pad_right: usize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    // `Zeros` is the runner's mapping for torch `mode="constant"`; route it
    // through the crop-capable signed constant path — the single source of
    // truth for constant padding, mirroring torch dispatching `mode="constant"`
    // through `constant_pad_nd` (`aten/src/ATen/native/PadNd.cpp:214-215`). For
    // a non-negative `usize` pad the signed forward is byte-identical to the old
    // `pad_1d_constant` and its `PadNdSignedBackward` scatter-add equals the old
    // `Pad1dBackward` adjoint; the `value` fill (#1553) is preserved.
    if mode == PaddingMode::Zeros {
        return functional_pad_1d_signed(input, pad_left as isize, pad_right as isize, mode, value);
    }

    let data = input.data_vec()?;
    let shape = input.shape();
    let input_shape = shape.to_vec();
    // The `Zeros` (constant) arm is dispatched above through the crop-capable
    // signed path; the remaining gather modes never crop and keep their
    // existing positive-only helpers + `Pad1dBackward` adjoint.
    let (out_data, new_shape) = match mode {
        PaddingMode::Reflect => pad_1d_reflect(&data, shape, pad_left, pad_right)?,
        PaddingMode::Replicate => pad_1d_replicate(&data, shape, pad_left, pad_right),
        PaddingMode::Circular => {
            let inner = shape[shape.len() - 1];
            check_circular_positive(&[(inner, pad_left), (inner, pad_right)])?;
            pad_1d_circular(&data, shape, pad_left, pad_right)
        }
        PaddingMode::Zeros => {
            return functional_pad_1d_signed(
                input,
                pad_left as isize,
                pad_right as isize,
                mode,
                value,
            );
        }
    };

    // Grad path: attach Pad1dBackward so the autograd graph stays connected.
    // Without this the prior `from_storage(.., false)` severed it (#1550 bug
    // class), and Conv1d's input gradient would not flow through the non-zero
    // padding_mode pre-pad.
    if is_grad_enabled() && input.requires_grad() {
        let grad_fn = Arc::new(Pad1dBackward {
            input: input.clone(),
            input_shape,
            mode,
            pad_left,
        });
        return Tensor::from_operation(TensorStorage::cpu(out_data), new_shape, grad_fn);
    }

    Tensor::from_storage(TensorStorage::cpu(out_data), new_shape, false)
}

// ---------------------------------------------------------------------------
// Autograd for the 2-D functional pad path (used by Conv2d's non-zero
// padding_mode pre-pad). Every pad mode is a pure *gather*:
//   out[k] = input[src_index_2d(k)]   (or 0 for the out-of-bounds Zeros case).
// The adjoint (VJP) of a gather is a scatter-add into the unpadded input:
//   grad_input[src_index_2d(k)] += grad_output[k].
// This single rule is correct for ALL modes — Zeros (interior crop, padded
// outputs have no source so contribute nothing), Reflect (the reflected
// boundary source indices repeat, so their grads fold/accumulate back onto
// the mirrored interior positions), Replicate (the edge source index repeats,
// summing into the edge), and Circular (wrapped source indices accumulate
// around). Mirrors upstream `torch/nn/modules/conv.py:367-371` routing
// non-zero modes through the differentiable `F.pad`.
// ---------------------------------------------------------------------------

/// Map position `(new_row, new_col)` of a 2-D padded plane back to the linear
/// index `sr * w + sc` of the source element it was gathered from.
///
/// `h` / `w` are the unpadded plane extents; `pad_top` / `pad_left` the cells
/// added before each axis. Returns `None` only in `Zeros` mode, for positions
/// that lie in the constant fill and therefore have no source element.
fn src_index_2d(
    mode: PaddingMode,
    new_row: usize,
    new_col: usize,
    h: usize,
    w: usize,
    pad_left: usize,
    pad_top: usize,
) -> Option<usize> {
    // Both axes follow the same per-mode rule; resolve one axis at a time.
    fn along_axis(mode: PaddingMode, pos: usize, len: usize, pad: usize) -> Option<usize> {
        match mode {
            // Constant fill: positions outside the interior have no source.
            PaddingMode::Zeros if pos < pad || pos >= pad + len => None,
            PaddingMode::Zeros => Some(pos - pad),

            // Mirror about the edge element, without repeating it.
            PaddingMode::Reflect if pos < pad => Some(pad - pos),
            PaddingMode::Reflect if pos >= pad + len => Some(len - 2 - (pos - pad - len)),
            PaddingMode::Reflect => Some(pos - pad),

            // Clamp to the nearest edge element.
            PaddingMode::Replicate => Some(pos.saturating_sub(pad).min(len - 1)),

            // Shift by the leading pad, then wrap modulo the axis length.
            PaddingMode::Circular => {
                Some(((pos as isize - pad as isize).rem_euclid(len as isize)) as usize)
            }
        }
    }

    // Rows are resolved first, then columns; either may bail out with `None`.
    let src_row = along_axis(mode, new_row, h, pad_top)?;
    let src_col = along_axis(mode, new_col, w, pad_left)?;
    Some(src_row * w + src_col)
}

/// Backward node for the 2-D functional pad. Scatter-adds the output gradient
/// back onto the unpadded input plane using the per-output source-index map
/// ([`src_index_2d`]).
#[derive(Debug)]
struct Pad2dBackward<T: Float> {
    /// The unpadded forward input; the only autograd input of this node.
    input: Tensor<T>,
    /// Shape of `input` captured at forward time; the last two entries are
    /// read as `(h, w)`.
    input_shape: Vec<usize>,
    /// Padding mode the forward gather used.
    mode: PaddingMode,
    /// Low-side pad on the last axis.
    pad_left: usize,
    /// Low-side pad on the second-to-last axis.
    pad_top: usize,
}

impl<T: Float> GradFn<T> for Pad2dBackward<T> {
    /// Computes the gradient w.r.t. the unpadded input.
    ///
    /// Returns `vec![None]` when the input does not require grad; otherwise a
    /// single tensor of shape `input_shape` holding
    /// `grad_input[src(k)] += grad_output[k]` for every output element `k`
    /// that has a source.
    ///
    /// # Errors
    ///
    /// Propagates failures from `grad_output.data_vec()` and from building the
    /// result tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.input.requires_grad() {
            return Ok(vec![None]);
        }
        let ndim = self.input_shape.len();
        let h = self.input_shape[ndim - 2];
        let w = self.input_shape[ndim - 1];
        // Number of independent `h x w` planes. The product over an empty
        // slice is already 1, so no clamp is needed. Clamping to at least 1
        // would turn a zero-sized leading dim (e.g. an empty batch) into one
        // phantom plane: the loop below would then index the empty `go` buffer
        // out of bounds and `grad_in` would not match `input_shape`.
        let outer: usize = self.input_shape[..ndim - 2].iter().product();

        let go_shape = grad_output.shape();
        let new_h = go_shape[ndim - 2];
        let new_w = go_shape[ndim - 1];

        // The backward runs on host: scatter-add is data-dependent over the
        // index map. `data_vec` materialises the (possibly GPU) grad to CPU.
        let go = grad_output.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut grad_in = vec![zero; outer * h * w];

        for o in 0..outer {
            let go_base = o * new_h * new_w;
            let gi_base = o * h * w;
            for nr in 0..new_h {
                for nc in 0..new_w {
                    // Outputs without a source (constant fill) contribute nothing.
                    if let Some(src) =
                        src_index_2d(self.mode, nr, nc, h, w, self.pad_left, self.pad_top)
                    {
                        grad_in[gi_base + src] += go[go_base + nr * new_w + nc];
                    }
                }
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grad_in), self.input_shape.clone(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The single autograd input: the unpadded forward tensor.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name reported to the autograd graph.
    fn name(&self) -> &'static str {
        "Pad2dBackward"
    }
}

/// Pad the last two dimensions of `input` according to `mode`.
///
/// * `Zeros` (torch `mode="constant"`) is delegated unchanged to
///   `functional_pad_2d_signed`, which is the only path that consumes `value`.
/// * `Reflect`, `Replicate` and `Circular` copy the data to host and gather
///   through the matching `pad_2d_*` helper; circular pads are first checked
///   with `check_circular_positive`.
///
/// When `input` requires grad (and grad tracking is enabled) the returned
/// tensor carries a [`Pad2dBackward`] node so gradients flow back to `input`,
/// matching the differentiable `F.pad` that `torch/nn/modules/conv.py`
/// `_conv_forward` routes non-zero `padding_mode`s through. Otherwise a plain
/// non-tracking tensor is returned.
///
/// # Errors
///
/// Propagates any error from the delegated signed path, from
/// `input.data_vec()`, from `pad_2d_reflect`, from `check_circular_positive`,
/// or from constructing the output tensor.
pub fn functional_pad_2d<T: Float>(
    input: &Tensor<T>,
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    let (out_data, new_shape) = match mode {
        // Constant fill goes through the crop-capable signed entry point
        // before any host copy of the data is made.
        PaddingMode::Zeros => {
            return functional_pad_2d_signed(
                input,
                pad_left as isize,
                pad_right as isize,
                pad_top as isize,
                pad_bottom as isize,
                mode,
                value,
            );
        }
        PaddingMode::Reflect => {
            let host = input.data_vec()?;
            pad_2d_reflect(
                &host,
                input.shape(),
                pad_left,
                pad_right,
                pad_top,
                pad_bottom,
            )?
        }
        PaddingMode::Replicate => {
            let host = input.data_vec()?;
            pad_2d_replicate(
                &host,
                input.shape(),
                pad_left,
                pad_right,
                pad_top,
                pad_bottom,
            )
        }
        PaddingMode::Circular => {
            let host = input.data_vec()?;
            let dims = input.shape();
            let rank = dims.len();
            let height = dims[rank - 2];
            let width = dims[rank - 1];
            // Each entry pairs an axis extent with one pad applied to it.
            check_circular_positive(&[
                (width, pad_left),
                (width, pad_right),
                (height, pad_top),
                (height, pad_bottom),
            ])?;
            pad_2d_circular(&host, dims, pad_left, pad_right, pad_top, pad_bottom)
        }
    };

    // No tracking requested: return a detached tensor.
    if !(is_grad_enabled() && input.requires_grad()) {
        return Tensor::from_storage(TensorStorage::cpu(out_data), new_shape, false);
    }

    // Keep the autograd graph connected through the pad (#1550).
    let grad_fn = Arc::new(Pad2dBackward {
        input: input.clone(),
        input_shape: input.shape().to_vec(),
        mode,
        pad_left,
        pad_top,
    });
    Tensor::from_operation(TensorStorage::cpu(out_data), new_shape, grad_fn)
}

// ---------------------------------------------------------------------------
// Autograd for the 3-D functional pad path (used by Conv3d's non-zero
// padding_mode pre-pad). Same gather/scatter-add adjoint as the 1-D / 2-D
// cases; see the `Pad2dBackward` block above for the full derivation. Without
// the backward node, the pad returns `requires_grad = false` and severs
// autograd — the #1550 bug class. Mirrors upstream `torch/nn/modules/conv.py`
// `Conv3d._conv_forward` (`conv.py:717-721`) routing non-zero modes through
// the differentiable `F.pad`.
// ---------------------------------------------------------------------------

/// Map an output coordinate `(nd, nh, nw)` of a 3-D pad back to the flat
/// offset `sd*H*W + sh*W + sw` of the element it is gathered from inside one
/// `d x h x w` source volume.
///
/// Returns `None` only under [`PaddingMode::Zeros`], when the coordinate lies
/// in the constant-fill border on any axis. Axes are resolved in the order
/// depth, height, width.
// Internal helper: three coordinates, three extents and three low-side pads
// are all needed, hence the long parameter list.
#[allow(clippy::too_many_arguments)]
fn src_index_3d(
    mode: PaddingMode,
    nd: usize,
    nh: usize,
    nw: usize,
    d: usize,
    h: usize,
    w: usize,
    pad_left: usize,
    pad_top: usize,
    pad_front: usize,
) -> Option<usize> {
    // One resolver shared by all three axes: `out_pos` is the padded
    // coordinate, `extent` the unpadded length, `lead` the low-side pad.
    let resolve = |out_pos: usize, extent: usize, lead: usize| -> Option<usize> {
        match mode {
            PaddingMode::Zeros => {
                // Only the interior window has a source element.
                if out_pos < lead || out_pos >= lead + extent {
                    None
                } else {
                    Some(out_pos - lead)
                }
            }
            PaddingMode::Reflect => Some(if out_pos < lead {
                // Low border mirrors around the first element.
                lead - out_pos
            } else if out_pos < lead + extent {
                out_pos - lead
            } else {
                // High border mirrors around the last element.
                extent - 2 - (out_pos - lead - extent)
            }),
            PaddingMode::Replicate => Some(out_pos.saturating_sub(lead).min(extent - 1)),
            PaddingMode::Circular => {
                let shifted = out_pos as isize - lead as isize;
                Some(shifted.rem_euclid(extent as isize) as usize)
            }
        }
    };

    let depth = resolve(nd, d, pad_front)?;
    let row = resolve(nh, h, pad_top)?;
    let col = resolve(nw, w, pad_left)?;
    Some(depth * h * w + row * w + col)
}

/// Backward node for the 3-D functional pad. Scatter-adds the output gradient
/// back onto the unpadded input volume using the per-output source-index map
/// ([`src_index_3d`]).
#[derive(Debug)]
struct Pad3dBackward<T: Float> {
    /// The unpadded forward input; the only autograd input of this node.
    input: Tensor<T>,
    /// Shape of `input` captured at forward time; the last three entries are
    /// read as `(d, h, w)`.
    input_shape: Vec<usize>,
    /// Padding mode the forward gather used.
    mode: PaddingMode,
    /// Low-side pad on the last axis.
    pad_left: usize,
    /// Low-side pad on the second-to-last axis.
    pad_top: usize,
    /// Low-side pad on the third-to-last axis.
    pad_front: usize,
}

impl<T: Float> GradFn<T> for Pad3dBackward<T> {
    /// Computes the gradient w.r.t. the unpadded input.
    ///
    /// Returns `vec![None]` when the input does not require grad; otherwise a
    /// single tensor of shape `input_shape` holding
    /// `grad_input[src(k)] += grad_output[k]` for every output element `k`
    /// that has a source.
    ///
    /// # Errors
    ///
    /// Propagates failures from `grad_output.data_vec()` and from building the
    /// result tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.input.requires_grad() {
            return Ok(vec![None]);
        }
        let ndim = self.input_shape.len();
        let d = self.input_shape[ndim - 3];
        let h = self.input_shape[ndim - 2];
        let w = self.input_shape[ndim - 1];
        // Number of independent `d x h x w` volumes. The product over an empty
        // slice is already 1, so no clamp is needed. Clamping to at least 1
        // would turn a zero-sized leading dim (e.g. an empty batch) into one
        // phantom volume: the loop below would then index the empty `go`
        // buffer out of bounds and `grad_in` would not match `input_shape`.
        let outer: usize = self.input_shape[..ndim - 3].iter().product();

        let go_shape = grad_output.shape();
        let new_d = go_shape[ndim - 3];
        let new_h = go_shape[ndim - 2];
        let new_w = go_shape[ndim - 1];

        // The backward runs on host: scatter-add is data-dependent over the
        // index map. `data_vec` materialises the (possibly GPU) grad to CPU.
        let go = grad_output.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut grad_in = vec![zero; outer * d * h * w];

        for o in 0..outer {
            let go_base = o * new_d * new_h * new_w;
            let gi_base = o * d * h * w;
            for ndp in 0..new_d {
                for nhp in 0..new_h {
                    for nwp in 0..new_w {
                        // Outputs without a source (constant fill) contribute
                        // nothing.
                        if let Some(src) = src_index_3d(
                            self.mode,
                            ndp,
                            nhp,
                            nwp,
                            d,
                            h,
                            w,
                            self.pad_left,
                            self.pad_top,
                            self.pad_front,
                        ) {
                            grad_in[gi_base + src] +=
                                go[go_base + ndp * new_h * new_w + nhp * new_w + nwp];
                        }
                    }
                }
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grad_in), self.input_shape.clone(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The single autograd input: the unpadded forward tensor.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name reported to the autograd graph.
    fn name(&self) -> &'static str {
        "Pad3dBackward"
    }
}

/// Pad the last three dimensions of `input` according to `mode`.
///
/// * `Zeros` (torch `mode="constant"`) is delegated unchanged to
///   `functional_pad_3d_signed`, which is the only path that consumes `value`.
/// * `Reflect`, `Replicate` and `Circular` copy the data to host and gather
///   through the matching `pad_3d_*` helper; circular pads are first checked
///   with `check_circular_positive`.
///
/// When `input` requires grad (and grad tracking is enabled) the returned
/// tensor carries a [`Pad3dBackward`] node so gradients flow back to `input`,
/// matching the differentiable `F.pad` that `torch/nn/modules/conv.py`
/// `Conv3d._conv_forward` routes non-zero `padding_mode`s through
/// (`conv.py:717-721`). Otherwise a plain non-tracking tensor is returned.
///
/// # Errors
///
/// Propagates any error from the delegated signed path, from
/// `input.data_vec()`, from `pad_3d_reflect`, from `check_circular_positive`,
/// or from constructing the output tensor.
// Public API: mirrors PyTorch's `torch.nn.functional.pad` for the 3-axis case
// (input + 6 pad amounts + mode + value), so the argument count is kept for
// parity with the upstream reference.
#[allow(clippy::too_many_arguments)]
pub fn functional_pad_3d<T: Float>(
    input: &Tensor<T>,
    pad_left: usize,
    pad_right: usize,
    pad_top: usize,
    pad_bottom: usize,
    pad_front: usize,
    pad_back: usize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    let (out_data, new_shape) = match mode {
        // Constant fill goes through the crop-capable signed entry point
        // before any host copy of the data is made.
        PaddingMode::Zeros => {
            return functional_pad_3d_signed(
                input,
                pad_left as isize,
                pad_right as isize,
                pad_top as isize,
                pad_bottom as isize,
                pad_front as isize,
                pad_back as isize,
                mode,
                value,
            );
        }
        PaddingMode::Reflect => {
            let host = input.data_vec()?;
            pad_3d_reflect(
                &host,
                input.shape(),
                pad_left,
                pad_right,
                pad_top,
                pad_bottom,
                pad_front,
                pad_back,
            )?
        }
        PaddingMode::Replicate => {
            let host = input.data_vec()?;
            pad_3d_replicate(
                &host,
                input.shape(),
                pad_left,
                pad_right,
                pad_top,
                pad_bottom,
                pad_front,
                pad_back,
            )
        }
        PaddingMode::Circular => {
            let host = input.data_vec()?;
            let dims = input.shape();
            let rank = dims.len();
            let depth = dims[rank - 3];
            let height = dims[rank - 2];
            let width = dims[rank - 1];
            // Each entry pairs an axis extent with one pad applied to it.
            check_circular_positive(&[
                (width, pad_left),
                (width, pad_right),
                (height, pad_top),
                (height, pad_bottom),
                (depth, pad_front),
                (depth, pad_back),
            ])?;
            pad_3d_circular(
                &host, dims, pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back,
            )
        }
    };

    // No tracking requested: return a detached tensor.
    if !(is_grad_enabled() && input.requires_grad()) {
        return Tensor::from_storage(TensorStorage::cpu(out_data), new_shape, false);
    }

    // Keep the autograd graph connected through the pad (#1550 bug class), so
    // the gradient reaches `input` through the non-zero padding_mode pre-pad.
    let grad_fn = Arc::new(Pad3dBackward {
        input: input.clone(),
        input_shape: input.shape().to_vec(),
        mode,
        pad_left,
        pad_top,
        pad_front,
    });
    Tensor::from_operation(TensorStorage::cpu(out_data), new_shape, grad_fn)
}

// ===========================================================================
// Signed (crop-capable) functional pad — torch `constant_pad_nd` negative pad
// ===========================================================================
//
// `torch.nn.functional.pad` accepts NEGATIVE pad amounts: a negative pad on a
// side CROPS (removes) `|pad|` elements from that side instead of adding. ALL
// modes support this — upstream `aten/src/ATen/native/PadNd.cpp:207-242`
// (`_pad_enum_symint`) routes `constant` through `constant_pad_nd` (which
// narrows for negatives) and `reflect`/`replicate`/`circular` straight to the
// native `reflection_pad*` / `replication_pad*` / `_pad_circular` kernels, which
// compute `output = input + pad_l + pad_r` (a negative pad narrows the side)
// and gather with offset `max(0,-pad) - max(0,pad)` (ReflectionPad.cpp:46,
// PaddingKernel.cpp:63-65, PadNd.cpp:158-159). The non-constant modes still
// reject a non-zero `value` (PadNd.cpp:217-219). This signed-constant gather is
// the `PaddingMode::Zeros` forward; the other modes compose crop-then-pad
// (`functional_pad_nd_signed`), which is byte-identical to their native kernels.
//
// Forward (PadNd.cpp:29-108): for each padded dim with signed pads `(lo, hi)`
// the cropped input is `narrow(i, -lo, size+lo)` (when `lo<0`) then
// `narrow(i, 0, size'+hi)` (when `hi<0`); the output of size
// `new = size + lo + hi` is `fill_(value)`d and the cropped input copied into
// the `max(lo,0)` offset window (PadNd.cpp:94-106). Equivalently, an output
// index `o` reads source index `s = o - lo`: when `0 <= s < size` it is real
// data, otherwise (only possible for the POSITIVE-pad region) it is the `value`
// fill. This one rule handles MIXED signs per-dim correctly.
//
// Over-crop: torch's `narrow` rejects a negative length
// ("narrow(): length must be non-negative", from PadNd.cpp:49 / :54), and
// PadNd.cpp:76 `TORCH_CHECK(new_dim >= 0)`. We mirror BOTH: a left crop may not
// exceed `size`, and a right crop may not exceed the post-left-crop size — i.e.
// `size + min(lo,0) >= 0` AND `size + min(lo,0) + min(hi,0) >= 0`. A net size of
// exactly 0 is allowed (torch returns an empty dim, e.g. `F.pad(x3, [-1,-2])`).
//
// Backward: the adjoint of a crop-or-pad gather is a scatter-add into the
// (full, original-size) input — `grad_input[o - lo] += grad_output[o]` for the
// in-bounds outputs. Cropped-away positions receive no contribution (grad 0),
// matching torch's `constant_pad_nd` backward being itself a `constant_pad_nd`
// with negated pads.

/// Source lookup for one axis of a signed (pad-or-crop) constant pad.
///
/// `new_idx` is the coordinate in the padded/cropped output, `size` the
/// original axis length and `lo` the signed low-side pad. The candidate source
/// coordinate is `new_idx - lo`; it is returned only when it lies inside
/// `0..size`. `None` marks an output position with no source element (the
/// constant-fill region).
#[inline]
fn signed_axis_src(new_idx: usize, size: usize, lo: isize) -> Option<usize> {
    match new_idx as isize - lo {
        // Before the start of the source axis.
        shifted if shifted < 0 => None,
        // At or past the end of the source axis.
        shifted if (shifted as usize) >= size => None,
        shifted => Some(shifted as usize),
    }
}

/// Check one axis' signed pads and compute the resulting axis length.
///
/// Negative pads crop. The low-side crop is applied first and the high-side
/// crop is applied to what remains, mirroring torch's sequential `narrow`
/// calls; positive pads are then added back on top. A result of exactly `0`
/// is accepted.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when the low-side crop alone
/// exceeds `size`, or when both crops together would leave a negative length
/// (torch's "narrow(): length must be non-negative" /
/// `TORCH_CHECK(new_dim >= 0)`). `axis_label` is only used in the message.
fn signed_axis_new_size(
    size: usize,
    lo: isize,
    hi: isize,
    axis_label: &str,
) -> FerrotorchResult<usize> {
    // Step 1 (PadNd.cpp:49): remove the low-side crop, if any.
    let after_left = size as isize + lo.min(0);
    if after_left < 0 {
        let message = format!(
            "constant pad: negative padding {lo} on {axis_label} crops more than the dimension size {size} (narrow length would be negative)"
        );
        return Err(FerrotorchError::InvalidArgument { message });
    }

    // Step 2 (PadNd.cpp:54): remove the high-side crop from the remainder.
    let after_right = after_left + hi.min(0);
    if after_right < 0 {
        let message = format!(
            "constant pad: negative padding ({lo}, {hi}) on {axis_label} crops more than the dimension size {size}, resulting in a negative output size"
        );
        return Err(FerrotorchError::InvalidArgument { message });
    }

    // Step 3: grow by whichever pads are positive.
    let grown = after_right + lo.max(0) + hi.max(0);
    Ok(grown as usize)
}

/// Crop-capable constant pad over the trailing `pads.len()` dimensions.
///
/// `pads` holds one signed `(lo, hi)` pair per padded axis, ordered from the
/// LAST axis inward: `pads[0]` applies to dim `ndim - 1`, `pads[1]` to dim
/// `ndim - 2`, and so on. A negative entry crops that side instead of growing
/// it. `data` is the row-major contents of a tensor of shape `shape`.
///
/// Returns the row-major output buffer together with its shape. Output
/// positions with no source element keep `value`.
///
/// # Errors
///
/// Forwards the error from [`signed_axis_new_size`] when a crop removes more
/// than an axis holds.
fn pad_nd_signed_constant<T: Float>(
    data: &[T],
    shape: &[usize],
    pads: &[(isize, isize)],
    value: T,
) -> FerrotorchResult<(Vec<T>, Vec<usize>)> {
    let ndim = shape.len();
    let npad = pads.len();

    // Validate every padded axis and record its new extent.
    let mut new_shape = shape.to_vec();
    for (k, &(lo, hi)) in pads.iter().enumerate() {
        let dim = ndim - 1 - k;
        new_shape[dim] = signed_axis_new_size(shape[dim], lo, hi, &format!("dimension {dim}"))?;
    }

    // Start from an output that is entirely `value`; real data overwrites it.
    let out_len: usize = new_shape.iter().copied().product();
    let mut out = vec![value; out_len];

    // Degenerate input (numel 0 — e.g. shape `[0, 3]`): there is nothing to
    // gather, so the value-filled output is the result. This mirrors upstream
    // `aten/src/ATen/native/PadNd.cpp:94-106` (`fill_` then a no-op `copy_`)
    // and keeps the loop below from indexing into the empty `data` (#1551).
    if data.is_empty() {
        return Ok((out, new_shape));
    }

    // Leading (unpadded) dims are walked as independent slices.
    let first_padded = ndim - npad;
    let outer: usize = shape[..first_padded]
        .iter()
        .copied()
        .product::<usize>()
        .max(1);
    let in_inner: usize = shape[first_padded..].iter().product();
    let out_inner: usize = new_shape[first_padded..].iter().product();

    // Decode a flat offset inside one output slice (last axis fastest) and
    // translate it to the flat offset inside the matching input slice.
    // `None` means some axis falls in the constant-fill region.
    let locate = |flat: usize| -> Option<usize> {
        let mut remaining = flat;
        let mut offset = 0usize;
        let mut stride = 1usize;
        for (k, &(lo, _)) in pads.iter().enumerate() {
            let dim = ndim - 1 - k;
            let extent = new_shape[dim];
            let coord = remaining % extent;
            remaining /= extent;
            let src = signed_axis_src(coord, shape[dim], lo)?;
            offset += src * stride;
            stride *= shape[dim];
        }
        Some(offset)
    };

    for slice in 0..outer {
        let in_base = slice * in_inner;
        let out_base = slice * out_inner;
        for flat in 0..out_inner {
            if let Some(src) = locate(flat) {
                out[out_base + flat] = data[in_base + src];
            }
            // Otherwise the `value` fill already in place stays.
        }
    }

    Ok((out, new_shape))
}

/// Backward node for the signed (crop-capable) constant functional pad. The
/// adjoint of the crop/pad gather is a scatter-add into the original-size
/// input: `grad_input[o - lo] += grad_output[o]` for in-bounds outputs. Cropped
/// positions get no contribution (grad 0). Mirrors torch's `constant_pad_nd`
/// backward (itself a `constant_pad_nd` with negated pads).
#[derive(Debug)]
struct PadNdSignedBackward<T: Float> {
    /// The unpadded forward input; the only autograd input of this node.
    input: Tensor<T>,
    /// Shape of `input` captured at forward time.
    input_shape: Vec<usize>,
    /// `(lo, hi)` per padded axis, ordered LAST axis first (same as the forward).
    pads: Vec<(isize, isize)>,
}

impl<T: Float> GradFn<T> for PadNdSignedBackward<T> {
    /// Computes the gradient w.r.t. the unpadded input.
    ///
    /// Returns `vec![None]` when the input does not require grad; otherwise a
    /// single tensor of shape `input_shape`. Every output element whose source
    /// coordinate ([`signed_axis_src`]) is in bounds on all padded axes adds
    /// its gradient to that source position; all other input positions stay 0.
    ///
    /// # Errors
    ///
    /// Propagates failures from `grad_output.data_vec()` and from building the
    /// result tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.input.requires_grad() {
            return Ok(vec![None]);
        }
        let ndim = self.input_shape.len();
        let npad = self.pads.len();
        let first_padded = ndim - npad;
        // Number of independent leading slices. The product over an empty
        // slice is already 1, so no clamp is needed. Clamping to at least 1
        // would turn a zero-sized leading dim (e.g. input shape `[0, 3]`) into
        // one phantom slice: the loop below would then index the empty `go`
        // buffer out of bounds and `grad_in` would not match `input_shape`.
        let outer: usize = self.input_shape[..first_padded].iter().product();
        let in_inner: usize = self.input_shape[first_padded..].iter().product();

        let go_shape = grad_output.shape();
        let out_inner: usize = go_shape[first_padded..].iter().product();

        // The backward runs on host: scatter-add is data-dependent over the
        // index map. `data_vec` materialises the (possibly GPU) grad to CPU.
        let go = grad_output.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut grad_in = vec![zero; outer * in_inner];

        for o in 0..outer {
            let in_base = o * in_inner;
            let out_base = o * out_inner;
            for flat in 0..out_inner {
                // Decode `flat` into per-axis output coords (last axis
                // fastest) and accumulate the matching input offset.
                let mut rem = flat;
                let mut src_lin = 0usize;
                let mut src_stride = 1usize;
                let mut missing = false;
                for k in 0..npad {
                    let dim = ndim - 1 - k;
                    let axis_new = go_shape[dim];
                    let coord = rem % axis_new;
                    rem /= axis_new;
                    let lo = self.pads[k].0;
                    match signed_axis_src(coord, self.input_shape[dim], lo) {
                        Some(s) => {
                            src_lin += s * src_stride;
                            src_stride *= self.input_shape[dim];
                        }
                        None => {
                            // Constant-fill output: no source, no gradient.
                            missing = true;
                            break;
                        }
                    }
                }
                if !missing {
                    grad_in[in_base + src_lin] += go[out_base + flat];
                }
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grad_in), self.input_shape.clone(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The single autograd input: the unpadded forward tensor.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name reported to the autograd graph.
    fn name(&self) -> &'static str {
        "PadNdSignedBackward"
    }
}

/// Dispatch a grow-only pad to the 1-D / 2-D / 3-D `usize` entry points.
///
/// `pads` is ordered LAST axis first, one `(lo, hi)` pair per padded axis, and
/// is expected to contain only non-negative amounts; each value is cast to
/// `usize` as-is, so this function performs no sign check of its own. `mode`
/// and `value` are forwarded untouched, so reflect/replicate/circular keep the
/// gather and autograd behaviour of `functional_pad_{1,2,3}d`.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when `pads` does not describe
/// 1, 2 or 3 axes; otherwise forwards the delegate's result.
fn functional_pad_nd_positive<T: Float>(
    input: &Tensor<T>,
    pads: &[(isize, isize)],
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    let grow = |amount: isize| amount as usize;
    match *pads {
        [(left, right)] => functional_pad_1d(input, grow(left), grow(right), mode, value),
        [(left, right), (top, bottom)] => functional_pad_2d(
            input,
            grow(left),
            grow(right),
            grow(top),
            grow(bottom),
            mode,
            value,
        ),
        [(left, right), (top, bottom), (front, back)] => functional_pad_3d(
            input,
            grow(left),
            grow(right),
            grow(top),
            grow(bottom),
            grow(front),
            grow(back),
            mode,
            value,
        ),
        _ => {
            let other = pads.len();
            Err(FerrotorchError::InvalidArgument {
                message: format!("functional_pad_nd_signed supports 1-3 padded dims, got {other}"),
            })
        }
    }
}

/// Reflect-mode source index for one axis, following upstream
/// `aten/src/ATen/native/cpu/PaddingKernel.cpp:63-80`.
///
/// * `j` — position in the output axis.
/// * `size` — length of the ORIGINAL input axis.
/// * `pad` — signed pad on the LOW side (negative crops).
///
/// The position is first mirrored into the window `[pad, size + pad)` and then
/// shifted by `offset = max(0, -pad) - max(0, pad)` so the result indexes the
/// original input directly. Because the original axis is read, a positive pad
/// next to a cropped side can still reach elements a crop-first pass would
/// have discarded. The caller's reflect legality check (`|pad| < size` per
/// side) is what keeps the result inside `0..size`; no check is done here.
#[inline]
fn reflect_axis_src(j: usize, size: usize, pad: isize) -> usize {
    let out_pos = j as isize;
    let window_end = size as isize + pad;
    let offset = (-pad).max(0) - pad.max(0);

    let mirrored = if out_pos < pad {
        // Left of the window: reflect around its first element.
        2 * pad - out_pos
    } else if out_pos < window_end {
        out_pos
    } else {
        // Right of the window: reflect around its last element.
        2 * (window_end - 1) - out_pos
    };
    (mirrored + offset) as usize
}

/// Replicate-mode source index for one axis, following upstream
/// `ReplicationPad::index` (`aten/src/ATen/native/cpu/PaddingKernel.cpp:84-95`).
///
/// * `j` — position in the output axis.
/// * `size` — length of the ORIGINAL input axis.
/// * `pad` — signed pad on the LOW side (negative crops).
///
/// The position is clamped into the window `[pad, size + pad - 1]` — anything
/// before it sticks to the left boundary, anything after it to the right
/// boundary — and then shifted by `offset = max(0, -pad) - max(0, pad)` so the
/// result indexes the original input directly. Resolving against the ORIGINAL
/// axis means an over-crop that leaves a zero-size axis still reads the
/// preserved edge element, with no `inner - 1` underflow (#1625, R-CODE-2).
/// The caller is expected to guarantee `size >= 1`.
#[inline]
fn replicate_axis_src(j: usize, size: usize, pad: isize) -> usize {
    let out_pos = j as isize;
    let first = pad;
    let last = size as isize + pad - 1;
    let offset = (-pad).max(0) - pad.max(0);

    let clamped = if out_pos < first {
        first
    } else if out_pos > last {
        last
    } else {
        out_pos
    };
    (clamped + offset) as usize
}

/// Circular-mode source index for one axis, mirroring the slice-copy gather of
/// `_pad_circular` (`aten/src/ATen/native/PadNd.cpp:148-187`).
///
/// * `j` — position in the output axis.
/// * `size` — length of the ORIGINAL input axis.
/// * `(lo, hi)` — signed pads on the low / high side (negative crops).
///
/// Upstream writes the (possibly cropped) center first and then fills the left
/// pad from the END of the output and the right pad from its START, so a wrap
/// reads from the CROPPED center rather than a plain modulo of the original
/// axis. This function performs the same two steps: map `j` onto a center
/// position, then map that center position onto the input.
///
/// The return value is the RAW signed index into the original input. It can
/// fall outside `0..size` for an illegal pad combination, so callers must
/// validate before casting to `usize`. Intended for `out_w >= 1` only.
#[inline]
fn circular_axis_src(j: usize, size: usize, lo: isize, hi: isize) -> isize {
    let pos = j as isize;
    let out_w = size as isize + lo + hi;
    let grow_lo = lo.max(0);
    let grow_hi = hi.max(0);
    let crop_lo = lo.min(0).abs();

    // Step 1: fold wrap regions back onto the already-written center.
    let center = if pos < grow_lo {
        // Left wrap (`pad_l > 0`): out[0..lo] <- out[out_w-lo-hi_pos .. out_w-hi_pos].
        out_w - lo - grow_hi + pos
    } else if pos >= out_w - grow_hi {
        // Right wrap (`pad_r > 0`): out[out_w-hi .. out_w] <- out[lo_pos .. lo_pos+hi].
        grow_lo + (pos - (out_w - hi))
    } else {
        pos
    };

    // Step 2: center -> input, i.e. in[max(-lo,0) + (center - max(lo,0))].
    crop_lo + (center - grow_lo)
}

/// Validates the signed circular pads of ONE axis and returns the resulting
/// axis extent `size + lo + hi`.
///
/// Only the two shape-loop checks of `_pad_circular_symint`
/// (`aten/src/ATen/native/PadNd.cpp:140-145`) are reproduced here; legality of
/// the later center/wrap `copy_`s is not this function's concern.
///
/// * A pad strictly greater than `size` on either side would wrap around more
///   than once (`:140-142`) and is rejected.
/// * A negative net extent (`:143-145`) is rejected. A net extent of exactly
///   `0` is accepted and yields an empty dimension.
///
/// `dim` is used only to label the error message.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` for either violation above.
fn circular_axis_legality(
    size: usize,
    lo: isize,
    hi: isize,
    dim: usize,
) -> FerrotorchResult<usize> {
    let extent = size as isize;

    // `:140-142` — the larger of the two pads must still fit inside the axis.
    if lo.max(hi) > extent {
        let message = format!(
            "Circular padding ({lo}, {hi}) causes wrapping around more than once on dimension {dim} (size {size})"
        );
        return Err(FerrotorchError::InvalidArgument { message });
    }

    // `:143-145` — zero is a legal (empty) extent, anything below is not.
    let net_extent = extent + lo + hi;
    if net_extent >= 0 {
        Ok(net_extent as usize)
    } else {
        let message = format!(
            "Circular padding ({lo}, {hi}) on dimension {dim} of size {size} results in a negative output size {out_w} (empty dimension)",
            out_w = net_extent
        );
        Err(FerrotorchError::InvalidArgument { message })
    }
}

/// Resolves a `tensor.slice(dim, start, end)` request over an axis of `length`
/// elements into a clamped half-open `[start, end)` range, following torch's
/// `slice_symint` normalization: a negative bound is offset by `length`, both
/// bounds are clamped into `[0, length]`, and an inverted range collapses to
/// the empty range at `start`.
#[inline]
fn circular_slice_range(length: isize, start: isize, end: isize) -> (usize, usize) {
    // Negative indices count from the end of the axis.
    let from_end = |idx: isize| if idx < 0 { idx + length } else { idx };
    let first = from_end(start);
    let last = from_end(end);

    let first = first.clamp(0, length);
    // `max(first)` turns an inverted range into an empty one.
    let last = last.clamp(0, length).max(first);
    (first as usize, last as usize)
}

/// HOLISTIC faithful simulation of torch's `_pad_circular_symint` allocate-then-
/// copy algorithm (`aten/src/ATen/native/PadNd.cpp:148-187`) over the last
/// `npad` dims of ONE outer batch block. This replaces the prior per-axis
/// wrap-OOB / center-copy pre-validation (which rejected an axis whose ISOLATED
/// wrap was OOB even when a SIBLING axis had already emptied the whole output —
/// the #1628 cross-axis net-zero divergence). Instead of validating each axis in
/// isolation, we reproduce torch's exact sequence on the full N-D output buffer:
///
/// - `:148` `auto out = self.new_empty_symint(out_shape)` — a buffer with an
///   `init` mask (all `false`); uninitialized cells are tracked so an over-crop
///   that leaves a final cell unwritten is detected as the R-DEV-6 carve-out.
/// - `:154-161` ONE center `copy_`: narrow `out` and `self` on every padded dim
///   by `slice(dim, max(pad,0), …)` / `slice(dim, max(-pad,0), …)`, then copy.
///   `copy_` errors unless the source broadcasts to the destination shape (per
///   dim: sizes equal OR source size 1); a mismatch is a torch `RuntimeError`.
/// - `:169-187` the left/right wrap `copy_`s, each reading from `out` LIVE
///   (`in_slice = out.slice_symint(...)` aliases the buffer being written, so a
///   wrap reads cells the center or an earlier wrap just wrote — `:163-165`
///   "Corners will be written more than once"). Same broadcast-legality gate.
///
/// Because the wraps read from `out`, an axis whose isolated wrap would be OOB
/// is harmless when a different axis emptied the output (every `copy_` is then a
/// no-op over the empty extent), and torch's well-defined cross-axis wraps that
/// the prior per-axis check rejected are now reproduced byte-for-byte. After all
/// copies, any cell still uninitialized means torch read uninitialized memory
/// there (no reproducible byte-for-byte contract — R-DEV-6); ferrotorch rejects
/// such cases cleanly rather than returning nondeterministic garbage (R-CODE-2:
/// no panic). The legality `:140-145` is already enforced by
/// [`circular_axis_legality`] before this runs.
///
/// # Arguments
///
/// - `in_block` — flat input cells of this block, indexed with row-major
///   strides derived from `in_inner_shape`.
/// - `in_inner_shape` / `out_inner_shape` — per-dim extents of the block before
///   and after padding; both are indexed over the same `ninner` dims.
/// - `pads` — signed `(left, right)` pads, ordered LAST padded axis first
///   (entry `k` targets inner dim `ninner - 1 - k`).
///
/// # Returns
///
/// The flat, row-major output block of `out_inner_shape.iter().product()`
/// cells.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when
/// - a center or wrap copy is not broadcastable (per dim the source extent is
///   neither equal to the destination extent nor `1`),
/// - a live wrap copy has overlapping, non-identical source/destination windows
///   that both form a contiguous run, or
/// - any output cell is still uninitialized after all copies.
fn circular_slicecopy_block<T: Float>(
    in_block: &[T],
    in_inner_shape: &[usize],
    out_inner_shape: &[usize],
    pads: &[(isize, isize)],
) -> FerrotorchResult<Vec<T>> {
    let npad = pads.len();
    let ninner = in_inner_shape.len();
    let out_total: usize = out_inner_shape.iter().product();
    let zero = <T as num_traits::Zero>::zero();
    // `out` is zero-filled only as a placeholder; `init` records which cells a
    // copy has actually written (or inherited from an initialized source).
    let mut out = vec![zero; out_total];
    let mut init = vec![false; out_total];

    // Row-major strides for the inner (padded-region) coordinate space.
    let mut in_strides = vec![1usize; ninner];
    let mut out_strides = vec![1usize; ninner];
    for d in (0..ninner.saturating_sub(1)).rev() {
        in_strides[d] = in_strides[d + 1] * in_inner_shape[d + 1];
        out_strides[d] = out_strides[d + 1] * out_inner_shape[d + 1];
    }

    // `pads` is ordered LAST padded axis first; the padded inner dims are the
    // trailing `npad` dims of the inner block. Inner-dim index for pad entry `k`
    // (which targets axis `dim = ninner - 1 - k`).
    let pad_for_inner_dim = |d: usize| -> (isize, isize) {
        // d in [ninner-npad, ninner-1] -> k = ninner - 1 - d
        pads[ninner - 1 - d]
    };

    // Per-dim half-open copy windows for the dst (`out`) and src.
    // `copy_block` copies `src[src_win]` (broadcast) into `out[dst_win]`,
    // propagating the init mask, and returns Err on a broadcast-illegal `copy_`.
    // `dst_win`/`src_win` are `(start,end)` per inner dim.
    //
    // When `read_data` is `Some`, the source is a SEPARATE buffer (the original
    // input, for the center copy — `read_strides` indexes it). When `read_data`
    // is `None`, the source is `out` ITSELF, read LIVE in the same pass: this
    // mirrors torch's `:169-187` wrap `copy_`s where `in_slice = out.slice(...)`
    // aliases the very `out` buffer being written (`read_strides` indexes `out`).
    // Iterating in row-major dst order, a wrap cell can therefore read a cell the
    // center (or an earlier dst cell in this same wrap) just wrote, deterministi-
    // cally propagating a narrow center band exactly as torch does (#1629).
    //
    // `read_init` is an optional init mask for a separate `read_data` buffer;
    // when it is `None` every cell of that buffer counts as initialized. Every
    // call in this function passes `None`.
    //
    // The argument count is kept flat (hence the lint allow) so the helper can
    // borrow `out`/`init` mutably while the windows and strides stay shared.
    #[allow(clippy::too_many_arguments)]
    fn copy_block<T: Float>(
        out: &mut [T],
        init: &mut [bool],
        read_data: Option<&[T]>,
        read_init: Option<&[bool]>,
        ninner: usize,
        out_strides: &[usize],
        read_strides: &[usize],
        dst_win: &[(usize, usize)],
        src_win: &[(usize, usize)],
    ) -> FerrotorchResult<()> {
        // Broadcast-legality (torch `copy_`): per dim, dst extent must equal src
        // extent OR src extent must be 1. Otherwise torch raises.
        let mut dst_ext = vec![0usize; ninner];
        let mut src_ext = vec![0usize; ninner];
        for d in 0..ninner {
            dst_ext[d] = dst_win[d].1 - dst_win[d].0;
            src_ext[d] = src_win[d].1 - src_win[d].0;
            if dst_ext[d] != src_ext[d] && src_ext[d] != 1 {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "Circular padding: a slice copy of source extent {} into destination extent {} is not broadcastable on inner dim {d} (torch raises a size-mismatch here)",
                        src_ext[d], dst_ext[d]
                    ),
                });
            }
        }
        let total: usize = dst_ext.iter().product();
        if total == 0 {
            return Ok(()); // no-op over an empty extent (`:148` empty out_shape)
        }
        // torch `copy_` memory-overlap gate (live-read wraps only). When the wrap
        // reads from `out` itself (`read_data is None`), torch's `copy_` raises
        // `RuntimeError: ... refer to a single memory location` when the source
        // and destination slices each form a CONTIGUOUS memory run AND those runs
        // overlap by a non-identity offset (MEM_OVERLAP_YES). A wrap slices a
        // SINGLE dim `wd` (all other dims full-extent); its dst/src each form a
        // contiguous run iff every dim MORE MAJOR than `wd` has extent 1 (else the
        // slice repeats once per major index → strided, and torch's overlap
        // detector returns "too hard" and proceeds with the well-defined band
        // propagation, #1629). An EXACT-identity window pair is a self-copy no-op
        // torch always allows; disjoint windows never overlap. We mirror torch's
        // raise as a clean `Err` (R-CODE-2: never a panic).
        if read_data.is_none() {
            // First dim on which the two windows differ; `None` means the
            // windows are identical on every dim (a pure self-copy).
            let mut wrap_dim: Option<usize> = None;
            for d in 0..ninner {
                if dst_win[d] != src_win[d] {
                    // a wrap differs on exactly one (the wrap) dim
                    wrap_dim = Some(d);
                    break;
                }
            }
            if let Some(wd) = wrap_dim {
                // contiguous run iff every more-major dim is collapsed to extent 1
                let runs_contiguous = (0..wd).all(|d| dst_ext[d] == 1);
                let ds = dst_win[wd];
                let ss = src_win[wd];
                let overlap = ds.0 < ss.1 && ss.0 < ds.1; // half-open range overlap
                let identical = ds == ss;
                if runs_contiguous && overlap && !identical {
                    return Err(FerrotorchError::InvalidArgument {
                        message:
                            "Circular padding: torch's wrap copy_ would read and write a single memory location over a contiguous slice (RuntimeError: some elements of the input and written-to tensor refer to a single memory location); ferrotorch rejects rather than fabricate (R-DEV-6)"
                                .to_string(),
                    });
                }
            }
        }
        // Iterate every dst coordinate; map to the (broadcast) src coordinate.
        let mut coord = vec![0usize; ninner];
        for _ in 0..total {
            let mut dst_off = 0usize;
            let mut src_off = 0usize;
            for d in 0..ninner {
                let dc = dst_win[d].0 + coord[d];
                dst_off += dc * out_strides[d];
                // A source extent of 1 is broadcast: always read its only cell.
                let sc = if src_ext[d] == 1 {
                    src_win[d].0
                } else {
                    src_win[d].0 + coord[d]
                };
                src_off += sc * read_strides[d];
            }
            // LIVE read: when `read_data` is `None` the source IS `out`/`init`
            // (torch's wrap `in_slice = out.slice(...)`), so we read the current
            // value at `src_off` — including a cell written earlier in this very
            // pass — before overwriting `dst_off`.
            let (v, src_inited) = match (read_data, read_init) {
                (Some(rd), ri) => (rd[src_off], ri.map(|m| m[src_off]).unwrap_or(true)),
                (None, _) => (out[src_off], init[src_off]),
            };
            out[dst_off] = v;
            // The dst cell inherits the source's init state, so copying from an
            // unwritten cell leaves (or makes) the destination uninitialized.
            init[dst_off] = src_inited;
            // advance coord (row-major over dst extents)
            let mut d = ninner;
            while d > 0 {
                d -= 1;
                coord[d] += 1;
                if coord[d] < dst_ext[d] {
                    break;
                }
                coord[d] = 0;
            }
        }
        Ok(())
    }

    // `:154-161` — the single CENTER copy. Build dst/src windows per inner dim.
    let mut dst_win = vec![(0usize, 0usize); ninner];
    let mut src_win = vec![(0usize, 0usize); ninner];
    for d in 0..ninner {
        let out_len = out_inner_shape[d] as isize;
        let in_len = in_inner_shape[d] as isize;
        if d < ninner - npad {
            // Non-padded inner dim: full extent on both sides.
            dst_win[d] = (0, out_inner_shape[d]);
            src_win[d] = (0, in_inner_shape[d]);
        } else {
            // Positive pads narrow the destination window; negative pads
            // (crops) narrow the source window.
            let (pl, pr) = pad_for_inner_dim(d);
            dst_win[d] = circular_slice_range(out_len, pl.max(0), out_len - pr.max(0));
            src_win[d] = circular_slice_range(in_len, (-pl).max(0), in_len - (-pr).max(0));
        }
    }
    copy_block(
        &mut out,
        &mut init,
        Some(in_block),
        None,
        ninner,
        &out_strides,
        &in_strides,
        &dst_win,
        &src_win,
    )?;

    // `:169-187` — the left/right wrap copies, each reading from `out` LIVE.
    // torch's `in_slice = out.slice_symint(...)` (`:176`/`:184`) aliases the SAME
    // `out` buffer the loop is writing, and `:163-165` is explicit that corners
    // are written more than once across the sequence. So each wrap reads the
    // CURRENT `out` (including cells the center or an earlier wrap just wrote),
    // deterministically propagating a narrow over-cropped center band exactly as
    // torch does (#1629). We pass `read_data = None` so `copy_block` reads `out`/
    // `init` in place — NOT a pre-copy snapshot. Cells torch never writes stay
    // uninit and are caught by the leftover-uninit R-DEV-6 check below.
    for (k, &(pl, pr)) in pads.iter().enumerate() {
        // i in torch is k counted from the FIRST padded axis; torch's `dim` is
        // the inner dim `ninner - npad + k`. Our `pads` is last-axis-first, so
        // entry k targets inner dim `ninner - 1 - k`. torch iterates i=0..npad
        // over `pad[2*i]` (first-axis-first); the set of (dim,pl,pr) visited is
        // identical, only the order differs — and torch's wraps on distinct dims
        // are order-independent for the WELL-DEFINED cases (the order-dependent
        // overlapping ones land in the R-DEV-6 leftover-uninit reject either way).
        let dim = ninner - 1 - k;
        let out_len = out_inner_shape[dim] as isize;
        if pl > 0 {
            // Left wrap: every dim spans the full output except `dim`, where
            // out[0..pl] is filled from the tail of the center region.
            let mut dwin = vec![(0usize, 0usize); ninner];
            let mut swin = vec![(0usize, 0usize); ninner];
            for d in 0..ninner {
                dwin[d] = (0, out_inner_shape[d]);
                swin[d] = (0, out_inner_shape[d]);
            }
            dwin[dim] = circular_slice_range(out_len, 0, pl);
            swin[dim] =
                circular_slice_range(out_len, out_len - pl - pr.max(0), out_len - pr.max(0));
            copy_block(
                &mut out,
                &mut init,
                None,
                None,
                ninner,
                &out_strides,
                &out_strides,
                &dwin,
                &swin,
            )?;
        }
        if pr > 0 {
            // Right wrap: out[out_len-pr..out_len] is filled from the head of
            // the center region.
            let mut dwin = vec![(0usize, 0usize); ninner];
            let mut swin = vec![(0usize, 0usize); ninner];
            for d in 0..ninner {
                dwin[d] = (0, out_inner_shape[d]);
                swin[d] = (0, out_inner_shape[d]);
            }
            dwin[dim] = circular_slice_range(out_len, out_len - pr, out_len);
            swin[dim] = circular_slice_range(out_len, pl.max(0), pl.max(0) + pr);
            copy_block(
                &mut out,
                &mut init,
                None,
                None,
                ninner,
                &out_strides,
                &out_strides,
                &dwin,
                &swin,
            )?;
        }
    }

    // R-DEV-6: if any output cell is still uninitialized, torch read freed /
    // uninitialized memory there (a mixed-sign over-crop where the cropped
    // center is narrower than the wrap, or an overlapping `copy_`). There is no
    // reproducible byte-for-byte contract, so ferrotorch rejects cleanly rather
    // than emit nondeterministic garbage (R-CODE-2: no panic).
    if init.iter().any(|&b| !b) {
        return Err(FerrotorchError::InvalidArgument {
            message:
                "Circular padding crops the center below the wrap width, so torch reads uninitialized memory (no byte-for-byte contract; R-DEV-6)"
                    .to_string(),
        });
    }
    Ok(out)
}

/// One `copy_` operation in torch's circular forward sequence, recorded as the
/// list of `(dst_offset, src_offset)` cell pairs it touches over the inner
/// (padded-region) flat buffer. `from_input` distinguishes the center copy
/// (source is the ORIGINAL input buffer, `PadNd.cpp:154-161`) from a wrap copy
/// (source is the LIVE `out` buffer, `:169-187`) — the two backprop into
/// different grad buffers.
struct CircularCopyOp {
    /// `true` for the center copy, whose source offsets index the input block;
    /// `false` for a wrap copy, whose source offsets index the output block.
    from_input: bool,
    /// One `(dst_offset, src_offset)` pair per copied cell. The destination
    /// offset always indexes the flat output block.
    pairs: Vec<(usize, usize)>,
}

/// Input gradient of the circular slice-copy forward for ONE outer batch block,
/// obtained by transposing the forward `copy_` sequence.
///
/// `_pad_circular` (`PadNd.cpp:148-187`) is a composition of `new_empty`,
/// `slice` and `copy_`, so its gradient is the reverse replay of those copies.
/// Each `dst.copy_(src)` backprops as "add the destination gradient to the
/// source, then clear the destination gradient" (the copy overwrote whatever
/// the destination held before). The copies are recorded in forward order
/// (center first, then per pad entry the left wrap and the right wrap) and
/// replayed last-to-first:
///
/// * the center copy reads the ORIGINAL input, so its gradient lands in the
///   returned input-gradient buffer;
/// * a wrap copy reads the LIVE output, so its gradient is folded back into the
///   working output-gradient buffer and reaches the input through an earlier
///   copy.
///
/// A copy over an empty window contributes no cells and therefore no gradient.
///
/// `go_block` is this block's output gradient (flat, row-major over
/// `out_inner_shape`); `pads` is ordered LAST padded axis first. The result is
/// this block's input gradient (flat, row-major over `in_inner_shape`).
fn circular_slicecopy_backward_block<T: Float>(
    go_block: &[T],
    in_inner_shape: &[usize],
    out_inner_shape: &[usize],
    pads: &[(isize, isize)],
) -> Vec<T> {
    let rank = in_inner_shape.len();
    let padded_axes = pads.len();
    let input_cells: usize = in_inner_shape.iter().product();

    // Row-major strides over `rank` dims for a given extent list.
    let row_major_strides = |extents: &[usize]| -> Vec<usize> {
        let mut strides = vec![1usize; rank];
        for axis in (0..rank.saturating_sub(1)).rev() {
            strides[axis] = strides[axis + 1] * extents[axis + 1];
        }
        strides
    };
    let in_strides = row_major_strides(in_inner_shape);
    let out_strides = row_major_strides(out_inner_shape);

    // Lists the `(dst_off, src_off)` cells of one copy in row-major destination
    // order. A source extent of 1 is broadcast (always its single cell).
    // `src_strides` addresses the input for the center copy and the output for
    // a wrap copy; the destination is always the output.
    let cells_of = |dst_win: &[(usize, usize)],
                    src_win: &[(usize, usize)],
                    src_strides: &[usize]|
     -> Vec<(usize, usize)> {
        let dst_ext: Vec<usize> = dst_win.iter().map(|&(lo, hi)| hi - lo).collect();
        let src_ext: Vec<usize> = src_win.iter().map(|&(lo, hi)| hi - lo).collect();
        let cell_count: usize = dst_ext.iter().product();
        let mut cells = Vec::with_capacity(cell_count);
        for linear in 0..cell_count {
            // Decompose the linear destination index, fastest axis last.
            let mut rem = linear;
            let mut dst_off = 0usize;
            let mut src_off = 0usize;
            for axis in (0..rank).rev() {
                let step = rem % dst_ext[axis];
                rem /= dst_ext[axis];
                dst_off += (dst_win[axis].0 + step) * out_strides[axis];
                let src_coord = if src_ext[axis] == 1 {
                    src_win[axis].0
                } else {
                    src_win[axis].0 + step
                };
                src_off += src_coord * src_strides[axis];
            }
            cells.push((dst_off, src_off));
        }
        cells
    };

    // `:154-161` — center copy windows: positive pads narrow the destination,
    // negative pads (crops) narrow the source.
    let mut center_dst: Vec<(usize, usize)> = Vec::with_capacity(rank);
    let mut center_src: Vec<(usize, usize)> = Vec::with_capacity(rank);
    for axis in 0..rank {
        let out_extent = out_inner_shape[axis];
        let in_extent = in_inner_shape[axis];
        if axis < rank - padded_axes {
            center_dst.push((0, out_extent));
            center_src.push((0, in_extent));
        } else {
            let (pl, pr) = pads[rank - 1 - axis];
            let out_len = out_extent as isize;
            let in_len = in_extent as isize;
            center_dst.push(circular_slice_range(out_len, pl.max(0), out_len - pr.max(0)));
            center_src.push(circular_slice_range(in_len, (-pl).max(0), in_len - (-pr).max(0)));
        }
    }
    let mut ops = vec![CircularCopyOp {
        from_input: true,
        pairs: cells_of(&center_dst, &center_src, &in_strides),
    }];

    // `:169-187` — wrap copies, recorded in the same order as the forward. Each
    // wrap spans the full output on every dim except the wrapped one.
    let whole_output: Vec<(usize, usize)> =
        out_inner_shape[..rank].iter().map(|&n| (0, n)).collect();
    for (k, &(pl, pr)) in pads.iter().enumerate() {
        let axis = rank - 1 - k;
        let axis_len = out_inner_shape[axis] as isize;
        if pl > 0 {
            let mut dst = whole_output.clone();
            let mut src = whole_output.clone();
            dst[axis] = circular_slice_range(axis_len, 0, pl);
            src[axis] =
                circular_slice_range(axis_len, axis_len - pl - pr.max(0), axis_len - pr.max(0));
            ops.push(CircularCopyOp {
                from_input: false,
                pairs: cells_of(&dst, &src, &out_strides),
            });
        }
        if pr > 0 {
            let mut dst = whole_output.clone();
            let mut src = whole_output.clone();
            dst[axis] = circular_slice_range(axis_len, axis_len - pr, axis_len);
            src[axis] = circular_slice_range(axis_len, pl.max(0), pl.max(0) + pr);
            ops.push(CircularCopyOp {
                from_input: false,
                pairs: cells_of(&dst, &src, &out_strides),
            });
        }
    }

    // Reverse replay. `working` starts as the incoming output gradient.
    let zero = <T as num_traits::Zero>::zero();
    let mut working = go_block.to_vec();
    let mut grad_in = vec![zero; input_cells];
    for op in ops.iter().rev() {
        if op.from_input {
            // Source is the input: move each destination gradient across.
            for &(dst, src) in &op.pairs {
                grad_in[src] += std::mem::replace(&mut working[dst], zero);
            }
        } else {
            // Source is the output itself. Snapshot every destination gradient
            // first, clear the whole destination window, and only then add the
            // snapshots to their sources, so a source cell that also lies in
            // the destination window receives its contribution after clearing.
            let pending: Vec<(usize, T)> = op
                .pairs
                .iter()
                .map(|&(dst, src)| (src, working[dst]))
                .collect();
            for &(dst, _) in &op.pairs {
                working[dst] = zero;
            }
            for (src, grad) in pending {
                working[src] += grad;
            }
        }
    }
    grad_in
}

/// Per-axis source-index resolver: maps output position `j` on an axis of
/// original extent `size`, padded by the signed `(lo, hi)`, to the input index
/// it reads, by dispatching on `mode` to the matching per-mode index helper.
///
/// Every arm yields a bare `usize`. The circular helper returns a signed raw
/// index that is cast with `as usize` here without any range check, so a
/// caller must only reach that arm with pads for which the raw index lies in
/// `0..size`.
#[inline]
fn signed_mode_axis_src(mode: PaddingMode, j: usize, size: usize, lo: isize, hi: isize) -> usize {
    match mode {
        // Constant padding is expected to be gathered elsewhere; this arm only
        // clamps the shifted position into the axis so the result stays
        // in-bounds should it ever run.
        PaddingMode::Zeros => {
            let shifted = j as isize - lo;
            let last = size as isize - 1;
            shifted.clamp(0, last) as usize
        }
        PaddingMode::Circular => {
            let raw = circular_axis_src(j, size, lo, hi);
            raw as usize
        }
        PaddingMode::Replicate => replicate_axis_src(j, size, lo),
        PaddingMode::Reflect => reflect_axis_src(j, size, lo),
    }
}

/// Crop-capable reflect/replicate/circular pad over the last `npad` dimensions
/// using the unified index map against the ORIGINAL input window.
///
/// # Arguments
///
/// - `data` — flat row-major input of shape `shape`.
/// - `shape` — input shape; the trailing `pads.len()` dims are padded, the
///   leading dims are treated as independent outer blocks.
/// - `pads` — signed `[(lo, hi), ...]` ordered LAST padded axis first. Output
///   extent per axis is `size + lo + hi` (negative pads narrow).
/// - `mode` — the padding mode.
///
/// # Returns
///
/// `(output_data, output_shape)`, output flat and row-major. When a leading
/// (non-padded) dim has size 0 — e.g. an empty batch — the per-axis checks
/// still run and the result is an empty buffer with the padded shape.
///
/// # Errors
///
/// Returns `FerrotorchError::InvalidArgument` when
/// - `pads` names more axes than `shape` has dimensions,
/// - reflect: a pad is `>= size` on its axis (SIGNED check against the
///   ORIGINAL size, mirroring `aten/src/ATen/native/ReflectionPad.cpp:48-49`),
/// - replicate: a padded input axis has size 0,
/// - reflect/replicate: the RANK-DEPENDENT net-zero rule is violated — 1-D
///   requires output `>= 1`, while 2-D/3-D allow a per-axis net-zero (empty
///   `[..,0,..]`) so long as one padded axis survives
///   (`ReflectionPad.cpp:251`/`:152`, `ReplicationPadding.cpp:114`),
/// - circular: `circular_axis_legality` or `circular_slicecopy_block` rejects
///   the pads.
///
/// Replicate gathers with the boundary clamp of `ReplicationPad::index`
/// (`cpu/PaddingKernel.cpp:84-95`), so an over-crop to a zero-size axis never
/// underflows (#1625).
fn pad_nd_signed_reflect_circular<T: Float>(
    data: &[T],
    shape: &[usize],
    pads: &[(isize, isize)],
    mode: PaddingMode,
) -> FerrotorchResult<(Vec<T>, Vec<usize>)> {
    let ndim = shape.len();
    let npad = pads.len();
    // Every pad entry must name an existing trailing axis; otherwise the
    // `ndim - 1 - k` axis arithmetic below would underflow (R-CODE-2: Err, not
    // a panic).
    if npad > ndim {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "padding specifies {npad} axes but the input has only {ndim} dimensions"
            ),
        });
    }
    let mut new_shape = shape.to_vec();
    // Reflect's net-zero output rule is RANK-DEPENDENT (matches torch's per-rank
    // meta functions): 1-D `reflection_pad1d` requires `output_w >= 1`
    // (`aten/src/ATen/native/ReflectionPad.cpp:60-65`) so a net-zero axis Errs,
    // but 2-D `reflection_pad2d` (`:251`) and 3-D `reflection_pad3d` (`:152`)
    // require only `output_w >= 1 || output_h >= 1 (|| output_d >= 1)`, allowing
    // an INDIVIDUAL spatial axis to be net-zero (an empty `[..,0,..]` tensor) as
    // long as at least one spatial axis survives. Replicate has the identical
    // rank-dependent shape: `replication_pad1d` requires `owidth >= 1`
    // (`ReplicationPadding.cpp:49`) while `replication_pad2d`/`3d` use the same
    // OR (`:114`). So per-axis we reject a net-zero ONLY when `npad == 1` (the
    // 1-D kernel); for `npad >= 2` a single axis may be 0, and a final guard
    // below enforces that not ALL spatial axes are 0. (#1626)
    let per_axis_min: isize = isize::from(npad == 1);
    for (k, &(lo, hi)) in pads.iter().enumerate() {
        let dim = ndim - 1 - k;
        let size = shape[dim] as isize;
        // Reflect: torch's check is SIGNED, not absolute
        // (`aten/src/ATen/native/ReflectionPad.cpp:48-49`):
        // `TORCH_CHECK(pad_l < input_w && pad_r < input_w, ...)`. A NEGATIVE
        // (crop) pad is always `< input_w`, so torch only rejects POSITIVE pads
        // whose magnitude reaches `>= input_w`. Replicate has NO such
        // `pad < input` check upstream (`ReplicationPadding.cpp` only guards the
        // output extent), so this rejection is reflect-only.
        if mode == PaddingMode::Reflect && (lo >= size || hi >= size) {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "Reflection padding ({lo}, {hi}) must be less than input size ({size}) on dimension {dim}"
                ),
            });
        }
        // Replicate requires a non-empty ORIGINAL axis (the clamp gathers a real
        // boundary element). torch's `check_valid_input` rejects a zero-size
        // input plane, so size 0 here is impossible for a valid call; guard
        // defensively to keep the clamp index in `0..size`.
        if mode == PaddingMode::Replicate && size == 0 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "Replication padding cannot replicate an empty input dimension {dim} (size 0)"
                ),
            });
        }
        // Circular: torch's `_pad_circular_symint` is allocate-then-copy
        // (`aten/src/ATen/native/PadNd.cpp:140-187`). The PER-AXIS legality is
        // ONLY `:142` (reject `pad > size`, wraps more than once) and `:144`
        // (reject a negative net extent; allow exactly `0` → an empty dim) —
        // `circular_axis_legality`. The center copy (`:158-161`) and the wrap
        // gather (`:169-187`) operate on slices of the FULL `:148 new_empty`
        // output, so they are validated SEPARATELY below, gated on the WHOLE
        // output being non-empty (any `out_i == 0` ⇒ every `copy_` no-ops ⇒
        // torch returns the empty tensor without materializing ANY wrap index,
        // #1628). Reflect/Replicate use the rank-dependent `per_axis_min` reject.
        let new_size: usize = if mode == PaddingMode::Circular {
            circular_axis_legality(shape[dim], lo, hi, dim)?
        } else {
            let n = size + lo + hi;
            if n < per_axis_min {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "padding ({lo}, {hi}) on dimension {dim} of size {size} yields output size {n} below the minimum {per_axis_min} for this rank"
                    ),
                });
            }
            n as usize
        };
        new_shape[dim] = new_size;
    }

    // 2-D/3-D reflect & replicate: at least one padded spatial axis must survive
    // (`output_w >= 1 || output_h >= 1 (|| output_d >= 1)`,
    // `ReflectionPad.cpp:251`/`:152`, `ReplicationPadding.cpp:114`). When every
    // padded axis collapsed to 0, torch Errs "input is too small".
    if npad >= 2
        && matches!(mode, PaddingMode::Reflect | PaddingMode::Replicate)
        && pads
            .iter()
            .enumerate()
            .all(|(k, _)| new_shape[ndim - 1 - k] == 0)
    {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "{mode:?} padding collapses every padded spatial axis to size 0 (torch requires at least one >= 1)"
            ),
        });
    }

    let first_padded = ndim - npad;
    // Number of independent outer blocks. The product of an EMPTY dim list is
    // already 1, so no lower clamp is needed; and a genuine 0 (e.g. an empty
    // batch) must stay 0 so the gathers below are skipped instead of indexing
    // into the empty `data` / `out` buffers.
    let outer: usize = shape[..first_padded].iter().product();
    let in_inner: usize = shape[first_padded..].iter().product();
    let out_inner: usize = new_shape[first_padded..].iter().product();
    let zero = <T as num_traits::Zero>::zero();
    let new_total: usize = new_shape.iter().copied().product();
    let mut out = vec![zero; new_total];

    // CIRCULAR: HOLISTIC allocate-then-copy (`PadNd.cpp:148-187`) per outer
    // batch, mirroring torch's `:148 new_empty(out_shape)` + center/wrap `copy_`
    // sequence on the FULL N-D output. This replaces the prior per-axis wrap-OOB
    // pre-validation + per-axis gather, which rejected an axis whose ISOLATED
    // wrap was OOB even when a SIBLING axis had already emptied the whole output
    // (the #1628 cross-axis net-zero divergence). The simulator reproduces the
    // empty short-circuit (any `out_i == 0` ⇒ every `copy_` no-ops ⇒ empty
    // tensor), the cross-axis well-defined wraps, AND the R-DEV-6 over-crop
    // rejection (leftover-uninit ⇒ Err, never a panic) in one faithful pass.
    if mode == PaddingMode::Circular {
        let in_inner_shape = &shape[first_padded..];
        let out_inner_shape = &new_shape[first_padded..];
        for o in 0..outer {
            let in_block = &data[o * in_inner..(o + 1) * in_inner];
            let out_block =
                circular_slicecopy_block(in_block, in_inner_shape, out_inner_shape, pads)?;
            out[o * out_inner..(o + 1) * out_inner].copy_from_slice(&out_block);
        }
        return Ok((out, new_shape));
    }

    // REFLECT / REPLICATE: the unified original-window per-axis gather
    // (`cpu/PaddingKernel.cpp:63-105`). Each output index reads a real input
    // element via the mode's per-axis index resolver.
    for o in 0..outer {
        let in_base = o * in_inner;
        let out_base = o * out_inner;
        for flat in 0..out_inner {
            // Peel coordinates off `flat` from the LAST axis inward, which is
            // the same order `pads` is stored in.
            let mut rem = flat;
            let mut src_lin = 0usize;
            let mut src_stride = 1usize;
            for k in 0..npad {
                let dim = ndim - 1 - k;
                let axis_new = new_shape[dim];
                let coord = rem % axis_new;
                rem /= axis_new;
                let (lo, hi) = pads[k];
                let s = signed_mode_axis_src(mode, coord, shape[dim], lo, hi);
                src_lin += s * src_stride;
                src_stride *= shape[dim];
            }
            out[out_base + flat] = data[in_base + src_lin];
        }
    }

    Ok((out, new_shape))
}

/// Backward node for the signed (crop-capable) non-constant pads produced by
/// `functional_pad_nd_signed`, i.e. reflect, replicate and circular.
///
/// The adjoint of the forward gather is a scatter-add into the original-size
/// input (`grad_input[src(o)] += grad_output[o]`), matching torch's
/// `reflection_pad*_backward` / `_pad_circular` backward.
#[derive(Debug)]
struct PadNdSignedModeBackward<T: Float> {
    /// The forward input; it is the single autograd input of this node.
    input: Tensor<T>,
    /// Shape of `input` captured at forward time; the gradient is emitted
    /// with exactly this shape.
    input_shape: Vec<usize>,
    /// Padding mode used by the forward pass (selects the index mapping that
    /// is replayed during backward).
    mode: PaddingMode,
    /// `(lo, hi)` per padded axis, ordered LAST axis first (same as the forward).
    pads: Vec<(isize, isize)>,
}

impl<T: Float> GradFn<T> for PadNdSignedModeBackward<T> {
    /// Scatter-adds `grad_output` back onto the original (un-padded) input.
    ///
    /// Returns `[None]` when the input does not require grad; otherwise a
    /// single gradient tensor shaped like `input_shape`. Every forward read
    /// `out[o] = in[src(o)]` becomes `grad_in[src(o)] += grad_out[o]`, so an
    /// input cell that was read several times accumulates all of its
    /// contributions.
    ///
    /// # Errors
    /// Propagates failures from reading `grad_output` or building the result
    /// tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.input.requires_grad() {
            return Ok(vec![None]);
        }
        let ndim = self.input_shape.len();
        let npad = self.pads.len();
        let first_padded = ndim - npad;
        // Number of independent leading (un-padded) blocks. The product of an
        // EMPTY prefix is already 1, so no clamp is needed. Clamping a genuine
        // zero (a zero-length leading axis) up to 1 would fabricate a block
        // that does not exist: `grad_in` would no longer match `input_shape`
        // and the loops below would index past the end of the empty `go`.
        let outer: usize = self.input_shape[..first_padded].iter().product();
        let in_inner: usize = self.input_shape[first_padded..].iter().product();

        let go_shape = grad_output.shape();
        let out_inner: usize = go_shape[first_padded..].iter().product();

        let go = grad_output.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut grad_in = vec![zero; outer * in_inner];

        if self.mode == PaddingMode::Circular {
            // CIRCULAR backward is the scatter-add TRANSPOSE of the holistic
            // forward slice-copy (#1629), NOT the old per-axis gather, which
            // returned an out-of-range source index for an over-cropped axis
            // (#1631). `circular_slicecopy_backward_block` replays the forward
            // output→source mapping per block; cells the forward read more
            // than once accumulate their grads, and over-cropped / empty
            // outputs contribute nothing (no OOB, no panic — R-CODE-2).
            let in_inner_shape = &self.input_shape[first_padded..];
            let out_inner_shape = &go_shape[first_padded..];
            for o in 0..outer {
                let in_base = o * in_inner;
                let out_base = o * out_inner;
                let go_block = &go[out_base..out_base + out_inner];
                let gi_block = circular_slicecopy_backward_block(
                    go_block,
                    in_inner_shape,
                    out_inner_shape,
                    &self.pads,
                );
                for (i, &v) in gi_block.iter().enumerate() {
                    grad_in[in_base + i] += v;
                }
            }
        } else {
            // REFLECT / REPLICATE: replay the forward per-axis index resolver
            // for every output element and accumulate into its source cell.
            for o in 0..outer {
                let in_base = o * in_inner;
                let out_base = o * out_inner;
                for flat in 0..out_inner {
                    // Decompose the flat output offset into per-axis
                    // coordinates, last axis first, while building the
                    // matching linear source offset.
                    let mut rem = flat;
                    let mut src_lin = 0usize;
                    let mut src_stride = 1usize;
                    for (k, &(lo, hi)) in self.pads.iter().enumerate() {
                        let dim = ndim - 1 - k;
                        let axis_new = go_shape[dim];
                        let coord = rem % axis_new;
                        rem /= axis_new;
                        let s =
                            signed_mode_axis_src(self.mode, coord, self.input_shape[dim], lo, hi);
                        src_lin += s * src_stride;
                        src_stride *= self.input_shape[dim];
                    }
                    grad_in[in_base + src_lin] += go[out_base + flat];
                }
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grad_in), self.input_shape.clone(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The single forward input this node differentiates with respect to.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Static node name used for graph diagnostics.
    fn name(&self) -> &'static str {
        "PadNdSignedModeBackward"
    }
}

/// Shared signed-pad driver for the 1-D/2-D/3-D public entrypoints. `pads` is
/// ordered LAST padded axis first; a negative amount crops that side.
///
/// Dispatch:
///
/// * Reflect/replicate/circular with NO negative pad: forwarded unchanged to
///   `functional_pad_nd_positive` (pure mode-pad).
/// * Reflect/replicate/circular WITH a negative (crop) pad: live torch 2.11
///   does NOT reject it and does NOT crop first — `_pad_enum` dispatches
///   straight to the native `reflection_pad*` / `replication_pad*` /
///   `_pad_circular` kernels, which compute `output = input + pad_l + pad_r`
///   and resolve every output index against the ORIGINAL input window with
///   offset `max(0,-pad) - max(0,pad)`
///   (`aten/src/ATen/native/ReflectionPad.cpp:46`,
///   `aten/src/ATen/native/cpu/PaddingKernel.cpp:63-95`,
///   `aten/src/ATen/native/PadNd.cpp:158-159`). We do the same through
///   `pad_nd_signed_reflect_circular` and attach `PadNdSignedModeBackward`
///   (gather → scatter-add adjoint). Oracle examples: `reflect [-1,0]` on
///   `[1,2,3,4,5]` -> `[2,3,4,5]`; `replicate [1,-1]` -> `[1,1,2,3,4]` grad
///   `[2,1,1,1,0]`; `circular [-1,0]` -> `[2,3,4,5]` grad `[0,1,1,1,1]`;
///   `reflect [-3,2]` on `[1,2,3,4]` -> `[4,3,2]` (a crop-first pass would
///   have discarded the elements the positive side reads).
/// * `PaddingMode::Zeros` (torch `mode="constant"`): narrows/extends via
///   `pad_nd_signed_constant` and attaches `PadNdSignedBackward`; over-cropping
///   is reported by that helper, matching torch (`PadNd.cpp:221-242`).
///
/// # Errors
/// `InvalidArgument` when `pads` targets more axes than the input has, plus
/// whatever the selected kernel reports.
fn functional_pad_nd_signed<T: Float>(
    input: &Tensor<T>,
    pads: &[(isize, isize)],
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    let is_constant = mode == PaddingMode::Zeros;
    let has_negative = pads.iter().any(|&(lo, hi)| lo < 0 || hi < 0);

    if !is_constant && !has_negative {
        // All-non-negative under a non-constant mode: pure mode-pad.
        return functional_pad_nd_positive(input, pads, mode, value);
    }

    // Both remaining paths read the host data and need the same rank check.
    let data = input.data_vec()?;
    let shape = input.shape();
    if pads.len() > shape.len() {
        return Err(FerrotorchError::InvalidArgument {
            message: format!(
                "pad targets {} dims but input has only {} dims",
                pads.len(),
                shape.len()
            ),
        });
    }

    if is_constant {
        let (out_data, new_shape) = pad_nd_signed_constant(&data, shape, pads, value)?;
        // Grad path: attach PadNdSignedBackward so autograd stays connected
        // (same #1550 bug class the positive-only paths fixed).
        if is_grad_enabled() && input.requires_grad() {
            let grad_fn = Arc::new(PadNdSignedBackward {
                input: input.clone(),
                input_shape: shape.to_vec(),
                pads: pads.to_vec(),
            });
            return Tensor::from_operation(TensorStorage::cpu(out_data), new_shape, grad_fn);
        }
        return Tensor::from_storage(TensorStorage::cpu(out_data), new_shape, false);
    }

    // Reflect/Replicate/Circular with a crop. Replicate in particular MUST use
    // the original-window clamp rather than crop-then-pad: a crop that reduces
    // an axis to size 0 used to feed a zero-size axis to `pad_*_replicate`,
    // which computed `inner - 1` and PANICKED. torch's `ReplicationPad::index`
    // (`PaddingKernel.cpp:84-95`) clamps against the ORIGINAL window, so an
    // over-crop still reads the preserved boundary element (#1620 #1621 #1625,
    // R-CODE-2).
    let (out_data, new_shape) = pad_nd_signed_reflect_circular(&data, shape, pads, mode)?;
    if is_grad_enabled() && input.requires_grad() {
        let grad_fn = Arc::new(PadNdSignedModeBackward {
            input: input.clone(),
            input_shape: shape.to_vec(),
            mode,
            pads: pads.to_vec(),
        });
        return Tensor::from_operation(TensorStorage::cpu(out_data), new_shape, grad_fn);
    }
    Tensor::from_storage(TensorStorage::cpu(out_data), new_shape, false)
}

/// Apply crop-capable padding to the last dimension of a tensor. Unlike
/// [`functional_pad_1d`] (which takes `usize`), the pad amounts are SIGNED: a
/// negative value crops `|pad|` elements off that side, mirroring
/// `torch.nn.functional.pad(input, [left, right], mode=..., value=...)`
/// with negative `left`/`right` (`aten/src/ATen/native/PadNd.cpp:29-108`).
///
/// Negative (crop) pads are supported under EVERY mode: `Zeros` narrows via the
/// signed-constant gather, while reflect/replicate/circular resolve each output
/// index against the ORIGINAL input window (they do not crop first), matching
/// torch's native kernels, which compute `output = input + pad_l + pad_r`
/// directly. Under `Zeros`, over-cropping (removing more than the dimension
/// holds) returns `InvalidArgument`, mirroring torch's
/// "narrow(): length must be non-negative" (`PadNd.cpp:221-242`).
///
/// `value` is the fill used by `Zeros` (constant) mode.
///
/// # Errors
/// Forwards any error from the shared signed-pad driver.
pub fn functional_pad_1d_signed<T: Float>(
    input: &Tensor<T>,
    pad_left: isize,
    pad_right: isize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    functional_pad_nd_signed(input, &[(pad_left, pad_right)], mode, value)
}

/// Crop-capable padding for the last 2 dimensions. Signed analogue of
/// [`functional_pad_2d`]; see [`functional_pad_1d_signed`] for the crop
/// semantics under each padding mode.
///
/// `pad_left`/`pad_right` apply to the last axis (W) and
/// `pad_top`/`pad_bottom` to the second-to-last axis (H); a negative amount
/// crops that side.
///
/// # Errors
/// Forwards any error from the shared signed-pad driver.
pub fn functional_pad_2d_signed<T: Float>(
    input: &Tensor<T>,
    pad_left: isize,
    pad_right: isize,
    pad_top: isize,
    pad_bottom: isize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    // `pads` is LAST axis (W: left/right) first, then 2nd-last (H: top/bottom).
    functional_pad_nd_signed(
        input,
        &[(pad_left, pad_right), (pad_top, pad_bottom)],
        mode,
        value,
    )
}

/// Crop-capable padding for the last 3 dimensions. Signed analogue of
/// [`functional_pad_3d`]; see [`functional_pad_1d_signed`] for the crop
/// semantics under each padding mode.
///
/// `pad_left`/`pad_right` apply to the last axis (W), `pad_top`/`pad_bottom`
/// to H, and `pad_front`/`pad_back` to D; a negative amount crops that side.
///
/// # Errors
/// Forwards any error from the shared signed-pad driver.
// Public API: matches `torch.nn.functional.pad`'s 3-axis layout
// (left, right, top, bottom, front, back) — 6 signed pad amounts.
#[allow(clippy::too_many_arguments)]
pub fn functional_pad_3d_signed<T: Float>(
    input: &Tensor<T>,
    pad_left: isize,
    pad_right: isize,
    pad_top: isize,
    pad_bottom: isize,
    pad_front: isize,
    pad_back: isize,
    mode: PaddingMode,
    value: T,
) -> FerrotorchResult<Tensor<T>> {
    // LAST axis (W) first, then H, then D (front/back).
    functional_pad_nd_signed(
        input,
        &[
            (pad_left, pad_right),
            (pad_top, pad_bottom),
            (pad_front, pad_back),
        ],
        mode,
        value,
    )
}

// ===========================================================================
// Macro to reduce boilerplate for Module implementations on padding layers
// ===========================================================================

// Generates the `Module<T>` impl shared by every padding layer. The target
// type must expose `fn pad(&self, &Tensor<T>) -> FerrotorchResult<Tensor<T>>`
// and a `training: bool` field; padding layers own no parameters, so every
// parameter accessor yields an empty list.
macro_rules! impl_padding_module {
    ($name:ident) => {
        impl<T: Float> Module<T> for $name<T> {
            // Forward is exactly the layer's own `pad`.
            fn forward(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
                self.pad(input)
            }

            // No learnable state.
            fn parameters(&self) -> Vec<&Parameter<T>> {
                Vec::new()
            }

            fn parameters_mut(&mut self) -> Vec<&mut Parameter<T>> {
                Vec::new()
            }

            fn named_parameters(&self) -> Vec<(String, &Parameter<T>)> {
                Vec::new()
            }

            // The training flag is only stored and reported back.
            fn train(&mut self) {
                self.training = true;
            }

            fn eval(&mut self) {
                self.training = false;
            }

            fn is_training(&self) -> bool {
                self.training
            }
        }
    };
}

// ===========================================================================
// ConstantPad1d / ConstantPad2d / ConstantPad3d
// ===========================================================================

/// Pads the last dimension of the input tensor with a constant value.
///
/// # Shape
/// - Input: `[*, L]`
/// - Output: `[*, L + pad_left + pad_right]`
#[derive(Debug)]
pub struct ConstantPad1d<T: Float> {
    /// Padding `(left, right)`.
    pub padding: (usize, usize),
    /// Constant fill value.
    pub value: T,
    training: bool,
}

impl<T: Float> ConstantPad1d<T> {
    pub fn new(padding: (usize, usize), value: T) -> Self {
        Self {
            padding,
            value,
            training: true,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        let data = input.data_vec()?;
        let (out, new_shape) = pad_1d_constant(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.value,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ConstantPad1d);

/// Pads the last 2 dimensions with a constant value.
///
/// # Shape
/// - Input: `[*, H, W]`
/// - Output: `[*, H + top + bottom, W + left + right]`
#[derive(Debug)]
pub struct ConstantPad2d<T: Float> {
    /// Padding `(left, right, top, bottom)`.
    pub padding: (usize, usize, usize, usize),
    /// Constant fill value.
    pub value: T,
    training: bool,
}

impl<T: Float> ConstantPad2d<T> {
    pub fn new(padding: (usize, usize, usize, usize), value: T) -> Self {
        Self {
            padding,
            value,
            training: true,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ConstantPad2d expects at least 2-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_2d_constant(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            self.value,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ConstantPad2d);

/// Pads the last 3 dimensions with a constant value.
///
/// # Shape
/// - Input: `[*, D, H, W]`
/// - Output: `[*, D + front + back, H + top + bottom, W + left + right]`
#[derive(Debug)]
pub struct ConstantPad3d<T: Float> {
    /// Padding `(left, right, top, bottom, front, back)`.
    pub padding: (usize, usize, usize, usize, usize, usize),
    /// Constant fill value.
    pub value: T,
    training: bool,
}

impl<T: Float> ConstantPad3d<T> {
    pub fn new(padding: (usize, usize, usize, usize, usize, usize), value: T) -> Self {
        Self {
            padding,
            value,
            training: true,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ConstantPad3d expects at least 3-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_3d_constant(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            self.padding.4,
            self.padding.5,
            self.value,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ConstantPad3d);

// ===========================================================================
// ZeroPad1d / ZeroPad2d / ZeroPad3d
// ===========================================================================

/// Pads the last dimension with zeros.
#[derive(Debug)]
pub struct ZeroPad1d<T: Float> {
    pub padding: (usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ZeroPad1d<T> {
    pub fn new(padding: (usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        let data = input.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let (out, new_shape) =
            pad_1d_constant(&data, input.shape(), self.padding.0, self.padding.1, zero);
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ZeroPad1d);

/// Pads the last 2 dimensions with zeros.
#[derive(Debug)]
pub struct ZeroPad2d<T: Float> {
    pub padding: (usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ZeroPad2d<T> {
    pub fn new(padding: (usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ZeroPad2d expects at least 2-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let (out, new_shape) = pad_2d_constant(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            zero,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ZeroPad2d);

/// Pads the last 3 dimensions with zeros.
#[derive(Debug)]
pub struct ZeroPad3d<T: Float> {
    pub padding: (usize, usize, usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ZeroPad3d<T> {
    pub fn new(padding: (usize, usize, usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ZeroPad3d expects at least 3-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let zero = <T as num_traits::Zero>::zero();
        let (out, new_shape) = pad_3d_constant(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            self.padding.4,
            self.padding.5,
            zero,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ZeroPad3d);

// ===========================================================================
// ReflectionPad1d / ReflectionPad2d / ReflectionPad3d
// ===========================================================================

/// Pads the last dimension using reflection of the input boundary.
#[derive(Debug)]
pub struct ReflectionPad1d<T: Float> {
    pub padding: (usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReflectionPad1d<T> {
    pub fn new(padding: (usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        let data = input.data_vec()?;
        let (out, new_shape) =
            pad_1d_reflect(&data, input.shape(), self.padding.0, self.padding.1)?;
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReflectionPad1d);

/// Pads the last 2 dimensions using reflection.
#[derive(Debug)]
pub struct ReflectionPad2d<T: Float> {
    pub padding: (usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReflectionPad2d<T> {
    pub fn new(padding: (usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ReflectionPad2d expects at least 2-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_2d_reflect(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
        )?;
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReflectionPad2d);

/// Pads the last 3 dimensions using reflection.
#[derive(Debug)]
pub struct ReflectionPad3d<T: Float> {
    pub padding: (usize, usize, usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReflectionPad3d<T> {
    pub fn new(padding: (usize, usize, usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ReflectionPad3d expects at least 3-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_3d_reflect(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            self.padding.4,
            self.padding.5,
        )?;
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReflectionPad3d);

// ===========================================================================
// ReplicationPad1d / ReplicationPad2d / ReplicationPad3d
// ===========================================================================

/// Pads the last dimension by replicating the edge values.
#[derive(Debug)]
pub struct ReplicationPad1d<T: Float> {
    pub padding: (usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReplicationPad1d<T> {
    pub fn new(padding: (usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        let data = input.data_vec()?;
        let (out, new_shape) =
            pad_1d_replicate(&data, input.shape(), self.padding.0, self.padding.1);
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReplicationPad1d);

/// Pads the last 2 dimensions by replicating edge values.
#[derive(Debug)]
pub struct ReplicationPad2d<T: Float> {
    pub padding: (usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReplicationPad2d<T> {
    pub fn new(padding: (usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ReplicationPad2d expects at least 2-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_2d_replicate(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReplicationPad2d);

/// Pads the last 3 dimensions by replicating edge values.
#[derive(Debug)]
pub struct ReplicationPad3d<T: Float> {
    pub padding: (usize, usize, usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> ReplicationPad3d<T> {
    pub fn new(padding: (usize, usize, usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() < 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "ReplicationPad3d expects at least 3-D input, got {:?}",
                    input.shape()
                ),
            });
        }
        let data = input.data_vec()?;
        let (out, new_shape) = pad_3d_replicate(
            &data,
            input.shape(),
            self.padding.0,
            self.padding.1,
            self.padding.2,
            self.padding.3,
            self.padding.4,
            self.padding.5,
        );
        Tensor::from_storage(TensorStorage::cpu(out), new_shape, false)
    }
}

impl_padding_module!(ReplicationPad3d);

// ===========================================================================
// CircularPad — wraps data circularly (periodic boundary conditions)
// ===========================================================================

/// 1-D circular padding: wraps the input circularly.
///
/// Input: [N, C, W]. Pads the W dimension with circular (periodic) values.
/// Modelled on PyTorch's `nn.CircularPad1d`. Source columns are taken modulo
/// `W`, so a pad larger than `W` wraps around more than once here.
///
/// NOTE(review): the output is built with `Tensor::from_storage(.., false)`,
/// so it does not appear to be attached to the autograd graph — confirm
/// whether this layer is meant to be differentiable.
#[derive(Debug, Clone)]
pub struct CircularPad1d<T: Float> {
    /// Padding `(left, right)`.
    pub padding: (usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> CircularPad1d<T> {
    /// Creates a layer wrapping `padding.0` / `padding.1` columns onto the
    /// left / right of W. The layer starts in training mode.
    pub fn new(padding: (usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    /// Builds the `[N, C, W + left + right]` circularly padded tensor.
    ///
    /// # Errors
    /// * `InvalidArgument` when the input is not 3-D, or when W is zero-length
    ///   while the output is non-empty (there is nothing to wrap from; the
    ///   modulo would otherwise divide by zero and panic).
    /// * `NotImplementedOnCuda` for CUDA tensors.
    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() != 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CircularPad1d: expected 3-D input [N,C,W], got {:?}",
                    input.shape()
                ),
            });
        }
        if input.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "CircularPad1d",
            });
        }
        let shape = input.shape();
        let (n, c, w) = (shape[0], shape[1], shape[2]);
        let (pl, pr) = self.padding;
        let new_w = w + pl + pr;
        let data = input.data()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut out = vec![zero; n * c * new_w];

        if !out.is_empty() {
            if w == 0 {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "CircularPad1d: cannot circularly pad a zero-length axis, got {:?}",
                        shape
                    ),
                });
            }
            // The output→source column map is identical for every row, so it
            // is computed once instead of once per (batch, channel).
            let src_w: Vec<usize> = (0..new_w)
                .map(|ow| (ow as isize - pl as isize).rem_euclid(w as isize) as usize)
                .collect();
            for row in 0..n * c {
                let in_base = row * w;
                let out_base = row * new_w;
                for (ow, &iw) in src_w.iter().enumerate() {
                    out[out_base + ow] = data[in_base + iw];
                }
            }
        }

        Tensor::from_storage(TensorStorage::cpu(out), vec![n, c, new_w], false)
    }
}

impl<T: Float> Default for CircularPad1d<T> {
    /// A layer with no padding on either side.
    fn default() -> Self {
        Self::new((0, 0))
    }
}

impl_padding_module!(CircularPad1d);

/// 2-D circular padding. Input: [N, C, H, W].
/// Modelled on PyTorch's `nn.CircularPad2d`. Source rows/columns are taken
/// modulo `H`/`W`, so a pad larger than the axis wraps around more than once
/// here.
///
/// NOTE(review): the output is built with `Tensor::from_storage(.., false)`,
/// so it does not appear to be attached to the autograd graph — confirm
/// whether this layer is meant to be differentiable.
#[derive(Debug, Clone)]
pub struct CircularPad2d<T: Float> {
    /// Padding `(left, right, top, bottom)`.
    pub padding: (usize, usize, usize, usize),
    training: bool,
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> CircularPad2d<T> {
    /// Creates a layer wrapping W by `(left, right)` and H by `(top, bottom)`.
    /// The layer starts in training mode.
    pub fn new(padding: (usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    /// Builds the `[N, C, H + top + bottom, W + left + right]` circularly
    /// padded tensor.
    ///
    /// # Errors
    /// * `InvalidArgument` when the input is not 4-D, or when H or W is
    ///   zero-length while the output is non-empty (there is nothing to wrap
    ///   from; the modulo would otherwise divide by zero and panic).
    /// * `NotImplementedOnCuda` for CUDA tensors.
    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() != 4 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CircularPad2d: expected 4-D input [N,C,H,W], got {:?}",
                    input.shape()
                ),
            });
        }
        if input.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "CircularPad2d",
            });
        }
        let shape = input.shape();
        let (n, c, h, w) = (shape[0], shape[1], shape[2], shape[3]);
        let (pl, pr, pt, pb) = self.padding;
        let new_h = h + pt + pb;
        let new_w = w + pl + pr;
        let data = input.data()?;
        let zero = <T as num_traits::Zero>::zero();
        let mut out = vec![zero; n * c * new_h * new_w];

        if !out.is_empty() {
            if h == 0 || w == 0 {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "CircularPad2d: cannot circularly pad a zero-length axis, got {:?}",
                        shape
                    ),
                });
            }
            // Output→source index along one axis: shift by the leading pad,
            // then wrap into `[0, len)`.
            let wrap = |out_idx: usize, before: usize, len: usize| -> usize {
                (out_idx as isize - before as isize).rem_euclid(len as isize) as usize
            };
            // Both maps are identical for every (batch, channel) plane, so
            // they are computed once up front.
            let src_h: Vec<usize> = (0..new_h).map(|oh| wrap(oh, pt, h)).collect();
            let src_w: Vec<usize> = (0..new_w).map(|ow| wrap(ow, pl, w)).collect();

            for plane in 0..n * c {
                let in_plane = plane * h * w;
                let out_plane = plane * new_h * new_w;
                for (oh, &ih) in src_h.iter().enumerate() {
                    let in_row = in_plane + ih * w;
                    let out_row = out_plane + oh * new_w;
                    for (ow, &iw) in src_w.iter().enumerate() {
                        out[out_row + ow] = data[in_row + iw];
                    }
                }
            }
        }

        Tensor::from_storage(TensorStorage::cpu(out), vec![n, c, new_h, new_w], false)
    }
}

impl<T: Float> Default for CircularPad2d<T> {
    /// A layer with no padding on any side.
    fn default() -> Self {
        Self::new((0, 0, 0, 0))
    }
}

impl_padding_module!(CircularPad2d);

/// 3-D circular padding. Input: [N, C, D, H, W].
/// Matches PyTorch's `nn.CircularPad3d`.
#[derive(Debug, Clone)]
pub struct CircularPad3d<T: Float> {
    /// Padding `(left, right, top, bottom, front, back)`: two amounts each
    /// for W, H and D, in that order.
    pub padding: (usize, usize, usize, usize, usize, usize),
    /// Training-mode flag stored for the `Module` impl.
    training: bool,
    /// Ties the layer to the element type `T` without storing a value.
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Float> CircularPad3d<T> {
    /// Creates a 3-D circular padding layer.
    ///
    /// `padding` is `(left, right, top, bottom, front, back)`: `left`/`right`
    /// extend the width, `top`/`bottom` the height, and `front`/`back` the
    /// depth dimension. The layer starts in training mode.
    pub fn new(padding: (usize, usize, usize, usize, usize, usize)) -> Self {
        Self {
            padding,
            training: true,
            _phantom: std::marker::PhantomData,
        }
    }

    /// Pads a CPU `[N, C, D, H, W]` tensor by wrapping around D, H and W.
    ///
    /// Output element `(od, oh, ow)` is read from source
    /// `((od - front) mod D, (oh - top) mod H, (ow - left) mod W)`, so padding
    /// larger than the input size simply wraps around more than once.
    /// NOTE(review): upstream PyTorch is believed to reject circular padding
    /// that wraps more than once — confirm whether this layer should too.
    ///
    /// The result is built with the `from_storage` grad flag set to `false`.
    ///
    /// # Errors
    ///
    /// * `InvalidArgument` if the input is not 5-D.
    /// * `NotImplementedOnCuda` if the input lives on a CUDA device.
    /// * `InvalidArgument` if the output would be non-empty but D, H or W is
    ///   zero: there is nothing to wrap around (this previously panicked on a
    ///   zero divisor in `rem_euclid`).
    /// * Any error returned by `input.data()`.
    fn pad(&self, input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
        if input.ndim() != 5 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CircularPad3d: expected 5-D input [N,C,D,H,W], got {:?}",
                    input.shape()
                ),
            });
        }
        if input.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "CircularPad3d",
            });
        }
        let shape = input.shape();
        let (n, c, d, h, w) = (shape[0], shape[1], shape[2], shape[3], shape[4]);
        let (pl, pr, pt, pb, pf, pk) = self.padding;
        let (new_d, new_h, new_w) = (d + pf + pk, h + pt + pb, w + pl + pr);
        let data = input.data()?;

        let out_len = n * c * new_d * new_h * new_w;
        if out_len == 0 {
            // Nothing to copy; return the correctly-shaped empty tensor
            // without ever evaluating a modulo.
            return Tensor::from_storage(
                TensorStorage::cpu(Vec::<T>::new()),
                vec![n, c, new_d, new_h, new_w],
                false,
            );
        }
        if d == 0 || h == 0 || w == 0 {
            // A non-empty output needs at least one source element along every
            // spatial axis to wrap; `rem_euclid(0)` would panic.
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CircularPad3d: cannot circularly pad an empty dimension, got {:?} with padding {:?}",
                    shape, self.padding
                ),
            });
        }

        // The wrapped source index along each axis does not depend on the
        // batch or channel, so compute each map once.
        let src_depths: Vec<usize> = (0..new_d)
            .map(|od| (od as isize - pf as isize).rem_euclid(d as isize) as usize)
            .collect();
        let src_rows: Vec<usize> = (0..new_h)
            .map(|oh| (oh as isize - pt as isize).rem_euclid(h as isize) as usize)
            .collect();
        let src_cols: Vec<usize> = (0..new_w)
            .map(|ow| (ow as isize - pl as isize).rem_euclid(w as isize) as usize)
            .collect();

        // Emit the output in row-major order: volume (batch * C + channel),
        // then output depth, row, and column.
        let mut out = Vec::with_capacity(out_len);
        for volume in 0..n * c {
            let volume_base = volume * d * h * w;
            for &id in &src_depths {
                let depth_base = volume_base + id * h * w;
                for &ih in &src_rows {
                    let row_base = depth_base + ih * w;
                    out.extend(src_cols.iter().map(|&iw| data[row_base + iw]));
                }
            }
        }

        Tensor::from_storage(
            TensorStorage::cpu(out),
            vec![n, c, new_d, new_h, new_w],
            false,
        )
    }
}

/// The default 3-D circular padding layer adds no padding on any side.
impl<T: Float> Default for CircularPad3d<T> {
    fn default() -> Self {
        // (left, right, top, bottom, front, back) all zero.
        let no_padding = (0, 0, 0, 0, 0, 0);
        Self::new(no_padding)
    }
}

// Expands the `impl_padding_module!` macro (defined outside this section) for
// `CircularPad3d`.
// NOTE(review): presumably this supplies the `Module` impl whose `forward`
// delegates to the private `pad` method — confirm against the macro definition.
impl_padding_module!(CircularPad3d);

// ===========================================================================
// Tests
// ===========================================================================

#[cfg(test)]
mod tests {
    use super::*;
    use crate::module::Module;

    /// Test helper: builds a CPU `f32` tensor from `data` with the given
    /// `shape`, passing `false` as the `from_storage` grad flag. Panics if
    /// construction fails.
    fn t(data: &[f32], shape: &[usize]) -> Tensor<f32> {
        let storage = TensorStorage::cpu(data.to_vec());
        let dims = shape.to_vec();
        Tensor::from_storage(storage, dims, false).unwrap()
    }

    /// Asserts that `actual` and `expected` have the same length and agree
    /// element-wise to within `tol` (strict `<`).
    ///
    /// Exactly equal elements always pass, so matching infinities compare as
    /// close even though `inf - inf` is NaN. A NaN on either side never
    /// matches.
    ///
    /// # Panics
    ///
    /// Panics on a length mismatch or on the first element pair that differs
    /// by `tol` or more, reporting the offending index and both values.
    fn assert_close(actual: &[f32], expected: &[f32], tol: f32) {
        assert_eq!(
            actual.len(),
            expected.len(),
            "length mismatch: {} vs {}",
            actual.len(),
            expected.len()
        );
        for (i, (&a, &e)) in actual.iter().zip(expected).enumerate() {
            // Check exact equality first: the difference of two equal
            // infinities is NaN, which would fail the tolerance comparison.
            assert!(
                a == e || (a - e).abs() < tol,
                "index {i}: actual={a} expected={e}"
            );
        }
    }

    // -----------------------------------------------------------------------
    // ConstantPad1d
    // -----------------------------------------------------------------------

    /// `ConstantPad1d` with padding (2, 3) and fill 9.0 on a `[1, 1, 3]` input.
    #[test]
    fn test_constant_pad1d_basic() {
        let src = t(&[1.0, 2.0, 3.0], &[1, 1, 3]);
        let layer = ConstantPad1d::<f32>::new((2, 3), 9.0);
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 8]);
        // Two fill values before the data, three after.
        let want = [9.0, 9.0, 1.0, 2.0, 3.0, 9.0, 9.0, 9.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ZeroPad1d
    // -----------------------------------------------------------------------

    /// `ZeroPad1d` with padding (1, 2) on a rank-1 `[3]` input.
    #[test]
    fn test_zero_pad1d() {
        let src = t(&[1.0, 2.0, 3.0], &[3]);
        let layer = ZeroPad1d::<f32>::new((1, 2));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[6]);
        // One zero before the data, two after.
        let want = [0.0, 1.0, 2.0, 3.0, 0.0, 0.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ZeroPad2d
    // -----------------------------------------------------------------------

    /// `ZeroPad2d` with a one-element border on a `[1, 1, 2, 2]` input.
    #[test]
    fn test_zero_pad2d() {
        let src = t(&[1.0, 2.0, 3.0, 4.0], &[1, 1, 2, 2]);
        let layer = ZeroPad2d::<f32>::new((1, 1, 1, 1));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 4, 4]);
        // The 2x2 source sits in the middle of a 4x4 grid of zeros.
        #[rustfmt::skip]
        let want = [
            0.0, 0.0, 0.0, 0.0,
            0.0, 1.0, 2.0, 0.0,
            0.0, 3.0, 4.0, 0.0,
            0.0, 0.0, 0.0, 0.0,
        ];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ZeroPad3d
    // -----------------------------------------------------------------------

    /// `ZeroPad3d` with a one-element border grows every spatial dim of a
    /// `[1, 1, 2, 2, 2]` input by two (shape-only check).
    #[test]
    fn test_zero_pad3d_shape() {
        let ones = [1.0_f32; 2 * 2 * 2];
        let src = t(&ones, &[1, 1, 2, 2, 2]);
        let layer = ZeroPad3d::<f32>::new((1, 1, 1, 1, 1, 1));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 4, 4, 4]);
    }

    // -----------------------------------------------------------------------
    // ReflectionPad1d
    // -----------------------------------------------------------------------

    /// `ReflectionPad1d` with padding (2, 2) on `[1, 2, 3, 4]`.
    #[test]
    fn test_reflection_pad1d() {
        let src = t(&[1.0, 2.0, 3.0, 4.0], &[4]);
        let layer = ReflectionPad1d::<f32>::new((2, 2));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[8]);
        // Mirrored without repeating the edge: [3, 2 | 1, 2, 3, 4 | 3, 2].
        let want = [3.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 2.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    /// Reflection padding of 4 on a length-3 input (pad >= size) must error.
    #[test]
    fn test_reflection_pad1d_too_large() {
        let src = t(&[1.0, 2.0, 3.0], &[3]);
        let layer = ReflectionPad1d::<f32>::new((4, 0));
        let result = layer.forward(&src);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // ReflectionPad2d
    // -----------------------------------------------------------------------

    /// `ReflectionPad2d` with a one-element border on a 3x3 grid; only the
    /// top-left output corner is checked.
    #[test]
    fn test_reflection_pad2d() {
        #[rustfmt::skip]
        let grid = [
            1.0, 2.0, 3.0,
            4.0, 5.0, 6.0,
            7.0, 8.0, 9.0,
        ];
        let src = t(&grid, &[1, 1, 3, 3]);
        let layer = ReflectionPad2d::<f32>::new((1, 1, 1, 1));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 5, 5]);
        let values = padded.data().unwrap();
        // Output (0, 0) mirrors source (1, 1), which holds 5.0.
        assert_close(&values[..1], &[5.0], 1e-7);
    }

    // -----------------------------------------------------------------------
    // ReplicationPad1d
    // -----------------------------------------------------------------------

    /// `ReplicationPad1d` with padding (2, 3) repeats the edge values.
    #[test]
    fn test_replication_pad1d() {
        let src = t(&[1.0, 2.0, 3.0], &[3]);
        let layer = ReplicationPad1d::<f32>::new((2, 3));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[8]);
        // First element repeated twice on the left, last three times on the right.
        let want = [1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ReplicationPad2d
    // -----------------------------------------------------------------------

    /// `ReplicationPad2d` with a one-element border on a 2x2 grid.
    #[test]
    fn test_replication_pad2d() {
        #[rustfmt::skip]
        let grid = [
            1.0, 2.0,
            3.0, 4.0,
        ];
        let src = t(&grid, &[1, 1, 2, 2]);
        let layer = ReplicationPad2d::<f32>::new((1, 1, 1, 1));
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 4, 4]);
        // Every border cell copies its nearest source cell.
        #[rustfmt::skip]
        let want = [
            1.0, 1.0, 2.0, 2.0,
            1.0, 1.0, 2.0, 2.0,
            3.0, 3.0, 4.0, 4.0,
            3.0, 3.0, 4.0, 4.0,
        ];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ConstantPad2d
    // -----------------------------------------------------------------------

    /// `ConstantPad2d` with a one-element border of -1.0 on a rank-2 `[2, 2]`
    /// input.
    #[test]
    fn test_constant_pad2d() {
        let src = t(&[5.0, 6.0, 7.0, 8.0], &[2, 2]);
        let layer = ConstantPad2d::<f32>::new((1, 1, 1, 1), -1.0);
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[4, 4]);
        // The 2x2 source is framed by the fill value.
        #[rustfmt::skip]
        let want = [
            -1.0, -1.0, -1.0, -1.0,
            -1.0, 5.0, 6.0, -1.0,
            -1.0, 7.0, 8.0, -1.0,
            -1.0, -1.0, -1.0, -1.0,
        ];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // ConstantPad3d
    // -----------------------------------------------------------------------

    /// `ConstantPad3d` with padding (1, 2) on each spatial axis grows a
    /// `[1, 1, 3, 4, 5]` input to `[1, 1, 6, 7, 8]` (shape-only check).
    #[test]
    fn test_constant_pad3d_shape() {
        let ones: Vec<f32> = vec![1.0; 3 * 4 * 5];
        let src = t(&ones, &[1, 1, 3, 4, 5]);
        let layer = ConstantPad3d::<f32>::new((1, 2, 1, 2, 1, 2), 0.0);
        let padded = layer.forward(&src).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 6, 7, 8]);
    }

    // -----------------------------------------------------------------------
    // Circular padding (1D)
    // -----------------------------------------------------------------------

    /// `pad_1d_circular` on `[1, 2, 3, 4]` with one element on the left and
    /// two on the right.
    #[test]
    fn test_circular_pad_1d() {
        let src = [1.0f32, 2.0, 3.0, 4.0];
        let (wrapped, wrapped_shape) = pad_1d_circular(&src, &[4], 1, 2);
        assert_eq!(wrapped_shape, &[7]);
        // Wraps around both ends: [4 | 1, 2, 3, 4 | 1, 2].
        let want = [4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0];
        assert_close(&wrapped, &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // Padding mode enum
    // -----------------------------------------------------------------------

    /// `PaddingMode` compares equal to the same variant and unequal to a
    /// different one.
    #[test]
    fn test_padding_mode_eq() {
        let zeros = PaddingMode::Zeros;
        assert_eq!(zeros, PaddingMode::Zeros);
        assert_ne!(zeros, PaddingMode::Reflect);
    }

    // -----------------------------------------------------------------------
    // Module trait: no parameters
    // -----------------------------------------------------------------------

    /// A padding layer exposes no parameters, named or unnamed.
    #[test]
    fn test_padding_module_no_params() {
        let layer = ZeroPad2d::<f32>::new((1, 1, 1, 1));
        let unnamed = layer.parameters();
        assert!(unnamed.is_empty());
        let named = layer.named_parameters();
        assert!(named.is_empty());
    }

    /// A padding layer starts in training mode and toggles with `eval` /
    /// `train`.
    #[test]
    fn test_padding_module_train_eval() {
        let mut layer = ReflectionPad1d::<f32>::new((1, 1));
        // Freshly constructed: training.
        assert!(layer.is_training());

        layer.eval();
        assert!(!layer.is_training());

        layer.train();
        assert!(layer.is_training());
    }

    // -----------------------------------------------------------------------
    // Degenerate (numel-0) constant pad — regression for #1551.
    //
    // op_db emits pad samples whose input has an empty data buffer paired
    // with a non-empty *declared* last dim (e.g. shape `[0, 3]`: numel 0,
    // inner 3). Previously `pad_{1,2,3}d_constant` forced rows/outer to 1 and
    // then read `inner`/`w` elements from the empty `data` slice, panicking
    // with "range end index N out of range for slice of length 0" at the
    // `copy_from_slice`. Upstream `torch.nn.functional.pad`
    // (`aten/src/ATen/native/PadNd.cpp:94-106`) allocates the padded output,
    // `fill_(value)`s it, then `copy_`s the (empty) source — a no-op — so the
    // result is the correctly-shaped, value-filled tensor. These assert the
    // fixed behaviour: no panic + correct output shape on numel-0 input.
    // -----------------------------------------------------------------------

    /// `pad_1d_constant` on declared shape `[0, 3]` with an empty buffer must
    /// not panic and must report the padded shape.
    #[test]
    fn test_constant_pad1d_empty_numel_no_panic() {
        // numel is 0 even though the last dim is declared as 3.
        let (filled, padded_shape) = pad_1d_constant::<f32>(&[], &[0, 3], 2, 3, 7.0);
        // Last dim grows 3 -> 3 + 2 + 3 = 8; the leading 0-sized dim is kept.
        assert_eq!(padded_shape, vec![0, 8]);
        // Whatever was produced holds only the fill value: no source was copied.
        assert!(!filled.iter().any(|&v| v != 7.0));
    }

    /// `pad_2d_constant` on declared shape `[0, 2, 3]` (h = 2, w = 3) with an
    /// empty buffer must not panic and must report the padded shape.
    #[test]
    fn test_constant_pad2d_empty_numel_no_panic() {
        let (filled, padded_shape) = pad_2d_constant::<f32>(&[], &[0, 2, 3], 1, 1, 1, 1, 5.0);
        // h and w each grow by two; the leading 0-sized dim is kept.
        assert_eq!(padded_shape, vec![0, 4, 5]);
        assert!(!filled.iter().any(|&v| v != 5.0));
    }

    /// `pad_3d_constant` on declared shape `[0, 2, 2, 3]` (d = 2, h = 2,
    /// w = 3) with an empty buffer must not panic and must report the padded
    /// shape.
    #[test]
    fn test_constant_pad3d_empty_numel_no_panic() {
        let (filled, padded_shape) =
            pad_3d_constant::<f32>(&[], &[0, 2, 2, 3], 1, 1, 1, 1, 1, 1, 3.0);
        // d, h and w each grow by two; the leading 0-sized dim is kept.
        assert_eq!(padded_shape, vec![0, 4, 4, 5]);
        assert!(!filled.iter().any(|&v| v != 3.0));
    }

    // -----------------------------------------------------------------------
    // Regression: `functional_pad_{1,2,3}d` constant-mode must use `value`.
    //
    // The runner maps torch `mode="constant"` -> `PaddingMode::Zeros` and passes
    // the `value` kwarg through. Pre-fix the `Zeros` arm hardcoded `T::zero()`
    // and dropped `value` (`let _ = value;`), so `F.pad(x, p, "constant", 2.0)`
    // filled 0 instead of 2 — 256 parity-sweep failures (ferrotorch=0 vs
    // torch=2). Upstream `aten/src/ATen/native/PadNd.cpp:94` does
    // `output.fill_(value)` before copying the source. #1553.
    // -----------------------------------------------------------------------

    /// `functional_pad_1d` in `Zeros` (constant) mode must fill with the
    /// supplied `value` (2.0), not with 0.0.
    #[test]
    fn test_functional_pad_1d_constant_uses_value() {
        let src = t(&[1.0, 2.0, 3.0], &[1, 1, 3]);
        let padded = functional_pad_1d(&src, 1, 1, PaddingMode::Zeros, 2.0).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 5]);
        // The first and last cells are padding and must hold the fill value.
        let want = [2.0, 1.0, 2.0, 3.0, 2.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    /// `functional_pad_2d` in `Zeros` (constant) mode must fill the border
    /// with the supplied `value` (2.0), not with 0.0.
    #[test]
    fn test_functional_pad_2d_constant_uses_value() {
        // 1x1x2x2 source padded by one on every side: (left, right, top, bottom).
        let src = t(&[1.0, 2.0, 3.0, 4.0], &[1, 1, 2, 2]);
        let padded = functional_pad_2d(&src, 1, 1, 1, 1, PaddingMode::Zeros, 2.0).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 4, 4]);
        #[rustfmt::skip]
        let want = [
            2.0, 2.0, 2.0, 2.0,
            2.0, 1.0, 2.0, 2.0,
            2.0, 3.0, 4.0, 2.0,
            2.0, 2.0, 2.0, 2.0,
        ];
        let values = padded.data().unwrap();
        assert_close(values, &want, 1e-7);
        // No cell anywhere in the output may be 0: the border is the fill value.
        assert!(!padded.data().unwrap().iter().any(|&v| v == 0.0));
    }

    /// `functional_pad_3d` in `Zeros` (constant) mode must fill with the
    /// supplied `value` (2.0), not with 0.0.
    #[test]
    fn test_functional_pad_3d_constant_uses_value() {
        // Single-voxel source; only left/right are padded (by one each).
        let src = t(&[5.0], &[1, 1, 1, 1, 1]);
        let padded =
            functional_pad_3d(&src, 1, 1, 0, 0, 0, 0, PaddingMode::Zeros, 2.0).unwrap();
        assert_eq!(padded.shape(), &[1, 1, 1, 1, 3]);
        let want = [2.0, 5.0, 2.0];
        assert_close(padded.data().unwrap(), &want, 1e-7);
    }

    // -----------------------------------------------------------------------
    // Autograd-aware functional pad (Pad1dBackward / Pad3dBackward) — #1443.
    //
    // These are the pre-pad helpers Conv1d/Conv3d route non-zero padding_modes
    // through; a pad returning requires_grad=false severs autograd (the #1550
    // bug class the 2-D path already fixed). Expected gradients are from a live
    // PyTorch 2.11 `F.pad(...).sum().backward()` oracle (R-CHAR-3); the oracle
    // script is in the #1443 commit body.
    // -----------------------------------------------------------------------

    /// Test helper: builds a CPU `f32` leaf tensor that requires grad from
    /// `data` with the given `shape`. Panics if construction fails.
    fn leaf(data: &[f32], shape: &[usize]) -> Tensor<f32> {
        let storage = TensorStorage::cpu(data.to_vec());
        let dims = shape.to_vec();
        Tensor::from_storage(storage, dims, true).unwrap()
    }

    /// Reflect-mode `functional_pad_1d` must attach `Pad1dBackward` and
    /// accumulate the output gradient back onto the source row.
    /// torch: `F.pad([1,2,3,4], (2,2), 'reflect')` gives
    /// `[3,2,1,2,3,4,3,2]`, and `sum().backward()` yields grad_input
    /// `[1,3,3,1]`.
    #[test]
    fn test_functional_pad_1d_reflect_backward_matches_torch() {
        let x = leaf(&[1.0, 2.0, 3.0, 4.0], &[1, 1, 4]);
        let y = functional_pad_1d(&x, 2, 2, PaddingMode::Reflect, 0.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 8]);

        // The padded output must stay connected to the autograd graph.
        let node = y.grad_fn();
        assert!(
            node.is_some(),
            "functional_pad_1d Reflect lost grad_fn — would sever Conv1d autograd (#1550 class)"
        );
        assert_eq!(node.unwrap().name(), "Pad1dBackward");

        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want = [1.0, 3.0, 3.0, 1.0];
        assert_close(grad.data().unwrap(), &want, 1e-5);
    }

    /// Circular-mode `functional_pad_3d` must attach `Pad3dBackward`.
    /// torch: padding a 2x2x2 volume by (1,1,1,1,1,1) circularly reads every
    /// source cell exactly 8 times, so an all-ones grad_output backpropagates
    /// to a uniform gradient of 8.
    #[test]
    fn test_functional_pad_3d_circular_backward_matches_torch() {
        let volume: Vec<f32> = (1..=8).map(|v| v as f32).collect();
        let x = leaf(&volume, &[1, 1, 2, 2, 2]);
        let y = functional_pad_3d(&x, 1, 1, 1, 1, 1, 1, PaddingMode::Circular, 0.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 4, 4, 4]);

        let node = y.grad_fn();
        assert!(node.is_some());
        assert_eq!(node.unwrap().name(), "Pad3dBackward");

        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want = [8.0; 8];
        assert_close(grad.data().unwrap(), &want, 1e-5);
    }

    // -----------------------------------------------------------------------
    // Negative (crop) padding — `torch.nn.functional.pad` with negative pad
    // amounts CROPS that side instead of adding. For the constant
    // (`PaddingMode::Zeros`) path, upstream
    // `aten/src/ATen/native/PadNd.cpp:29-108` (`constant_pad_nd`) narrows the
    // input for negative pads, fills the output with `value`, and copies the
    // cropped input into the positive-pad window (#1611). Reflect/replicate/
    // circular also crop on negative pads rather than rejecting them
    // (`PadNd.cpp:221-242`); see
    // `test_functional_pad_signed_negative_non_constant_crops` (#1620).
    //
    // All expected forward + backward (sum().backward()) values below are from
    // a live PyTorch 2.11 oracle (R-CHAR-3); the deriving script is in the
    // #1611 commit body. Each block names the exact `F.pad(...)` call it pins.
    // -----------------------------------------------------------------------

    /// Crop one element from each end. torch:
    /// `F.pad(torch.tensor([[[1,2,3,4,5]]]), [-1,-1], "constant")` gives
    /// `[2,3,4]`, and `sum().backward()` yields grad_input `[0,1,1,1,0]`.
    #[test]
    fn test_functional_pad_1d_signed_crop_both_matches_torch() {
        let x = leaf(&[1.0, 2.0, 3.0, 4.0, 5.0], &[1, 1, 5]);
        let y = functional_pad_1d_signed(&x, -1, -1, PaddingMode::Zeros, 0.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 3]);
        let want_forward = [2.0, 3.0, 4.0];
        assert_close(y.data().unwrap(), &want_forward, 1e-7);
        assert_eq!(y.grad_fn().unwrap().name(), "PadNdSignedBackward");

        // The cropped-away ends receive no gradient.
        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want_grad = [0.0, 1.0, 1.0, 1.0, 0.0];
        assert_close(grad.data().unwrap(), &want_grad, 1e-7);
    }

    /// Mixed-sign 1-D pad. torch:
    /// `F.pad(torch.tensor([[[1,2,3,4]]]), [-1,2], "constant", value=9)`
    /// crops one element from the start and appends two fill values, giving
    /// `[2,3,4,9,9]`; `sum().backward()` yields grad_input `[0,1,1,1]`.
    #[test]
    fn test_functional_pad_1d_signed_mixed_matches_torch() {
        let x = leaf(&[1.0, 2.0, 3.0, 4.0], &[1, 1, 4]);
        let y = functional_pad_1d_signed(&x, -1, 2, PaddingMode::Zeros, 9.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 5]);
        let want_forward = [2.0, 3.0, 4.0, 9.0, 9.0];
        assert_close(y.data().unwrap(), &want_forward, 1e-7);

        // Only the surviving source elements receive gradient.
        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want_grad = [0.0, 1.0, 1.0, 1.0];
        assert_close(grad.data().unwrap(), &want_grad, 1e-7);
    }

    /// 2-D crop: torch `F.pad(3x3, [-1,0, 0,-1], "constant")` crops the left
    /// column (last dim, left pad = -1) and the bottom row (2nd-last dim,
    /// bottom pad = -1) -> 2x2 [[2,3],[5,6]];
    /// sum().backward() grad = [[0,1,1],[0,1,1],[0,0,0]] (flattened).
    #[test]
    fn test_functional_pad_2d_signed_crop_matches_torch() {
        #[rustfmt::skip]
        let x = leaf(&[
            1.0, 2.0, 3.0,
            4.0, 5.0, 6.0,
            7.0, 8.0, 9.0,
        ], &[1, 1, 3, 3]);
        let y = functional_pad_2d_signed(&x, -1, 0, 0, -1, PaddingMode::Zeros, 0.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 2, 2]);
        assert_close(y.data().unwrap(), &[2.0, 3.0, 5.0, 6.0], 1e-7);
        let sum = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&sum).unwrap();
        let g = x.grad().unwrap().expect("grad must be populated");
        // Cropped cells (left column, bottom row) receive zero gradient.
        assert_close(
            g.data().unwrap(),
            &[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
            1e-7,
        );
    }

    /// Mixed-sign 2-D pad. torch:
    /// `F.pad(2x3, [-1,2, 1,-1], "constant", value=7)` crops one column from
    /// the left and appends two fill columns (last dim), then prepends one
    /// fill row and crops the bottom row (2nd-last dim), giving the 2x4
    /// result [[7,7,7,7],[2,3,7,7]]; `sum().backward()` yields
    /// grad = [[0,1,1],[0,0,0]] (flattened).
    #[test]
    fn test_functional_pad_2d_signed_mixed_matches_torch() {
        #[rustfmt::skip]
        let grid = [
            1.0, 2.0, 3.0,
            4.0, 5.0, 6.0,
        ];
        let x = leaf(&grid, &[1, 1, 2, 3]);
        let y = functional_pad_2d_signed(&x, -1, 2, 1, -1, PaddingMode::Zeros, 7.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 2, 4]);
        #[rustfmt::skip]
        let want_forward = [
            7.0, 7.0, 7.0, 7.0,
            2.0, 3.0, 7.0, 7.0,
        ];
        assert_close(y.data().unwrap(), &want_forward, 1e-7);

        // Only source cells that survive the crop receive gradient.
        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want_grad = [0.0, 1.0, 1.0, 0.0, 0.0, 0.0];
        assert_close(grad.data().unwrap(), &want_grad, 1e-7);
    }

    /// 3-D crop: torch `F.pad(2x2x2 [1..8], [-1,0, 0,-1, -1,0], "constant")`
    /// (W crop left, H crop bottom, D crop front) -> 1x1x1 [6];
    /// sum().backward() grad = [0,0,0,0,0,1,0,0].
    #[test]
    fn test_functional_pad_3d_signed_crop_matches_torch() {
        let x_data: Vec<f32> = (1..=8).map(|v| v as f32).collect();
        let x = leaf(&x_data, &[1, 1, 2, 2, 2]);
        let y = functional_pad_3d_signed(&x, -1, 0, 0, -1, -1, 0, PaddingMode::Zeros, 0.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 1, 1, 1]);
        // The surviving voxel is (d=1, h=0, w=1), flat index 5, value 6.
        assert_close(y.data().unwrap(), &[6.0], 1e-7);
        let sum = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&sum).unwrap();
        let g = x.grad().unwrap().expect("grad must be populated");
        assert_close(
            g.data().unwrap(),
            &[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            1e-7,
        );
    }

    /// Mixed-sign 3-D pad including positive adds. torch:
    /// `F.pad(2x2x2 [1..8], [1,-1, 0,1, -1,2], "constant", value=3)` gives a
    /// 3x3x2 result; `sum().backward()` yields grad = [0,0,0,0,1,0,1,0].
    #[test]
    fn test_functional_pad_3d_signed_mixed_matches_torch() {
        let volume: Vec<f32> = (1..=8).map(|v| v as f32).collect();
        let x = leaf(&volume, &[1, 1, 2, 2, 2]);
        let y =
            functional_pad_3d_signed(&x, 1, -1, 0, 1, -1, 2, PaddingMode::Zeros, 3.0).unwrap();
        assert_eq!(y.shape(), &[1, 1, 3, 3, 2]);
        // Only source values 5 and 7 survive; every other cell is the fill 3.
        #[rustfmt::skip]
        let want_forward = [
            3.0, 5.0, 3.0, 7.0, 3.0, 3.0, 3.0, 3.0, 3.0,
            3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0,
        ];
        assert_close(y.data().unwrap(), &want_forward, 1e-7);

        let total = ferrotorch_core::grad_fns::reduction::sum(&y).unwrap();
        ferrotorch_core::backward(&total).unwrap();
        let grad = x.grad().unwrap().expect("grad must be populated");
        let want_grad = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0];
        assert_close(grad.data().unwrap(), &want_grad, 1e-7);
    }

    /// Over-cropping must error. torch raises
    /// `RuntimeError: narrow(): length must be non-negative` when one side
    /// crops more than the dim holds (`F.pad([[[1,2,3]]], [-4,0])`) or when
    /// the combined net size is negative (`F.pad([[[1,2,3]]], [-2,-2])`);
    /// ferrotorch is expected to return an error (`InvalidArgument`) instead.
    #[test]
    fn test_functional_pad_1d_signed_over_crop_errors() {
        let x = t(&[1.0, 2.0, 3.0], &[1, 1, 3]);
        // (left, right, failure message) for each over-crop scenario on a
        // length-3 input:
        //   * left crops 4 from size 3;
        //   * left 2 + right 2 leaves a net size of -1;
        //   * left 1 leaves size 2, then right 3 leaves -1.
        let cases = [
            (-4, 0, "single-side over-crop must error like torch narrow()"),
            (-2, -2, "combined net-negative crop must error like torch"),
            (-1, -3, "right-after-left over-crop must error like torch"),
        ];
        for (left, right, why) in cases {
            let outcome = functional_pad_1d_signed(&x, left, right, PaddingMode::Zeros, 0.0);
            assert!(outcome.is_err(), "{why}");
        }
    }

    /// Cropping a dim down to exactly zero is valid in torch:
    /// `F.pad([[[1,2,3]]], [-1,-2])` returns shape `[1,1,0]`. ferrotorch must
    /// return the same empty result rather than an error.
    #[test]
    fn test_functional_pad_1d_signed_net_zero_empty_dim_matches_torch() {
        let src = t(&[1.0, 2.0, 3.0], &[1, 1, 3]);
        let cropped = functional_pad_1d_signed(&src, -1, -2, PaddingMode::Zeros, 0.0).unwrap();
        assert_eq!(cropped.shape(), &[1, 1, 0]);
        let values = cropped.data().unwrap();
        assert!(values.is_empty());
    }

    /// A negative pad under a non-constant mode crops instead of erroring.
    /// Live torch 2.11's `_pad_enum` dispatches reflect/replicate/circular
    /// straight to the native kernels, which narrow for negative pads
    /// (`PadNd.cpp:221-242`). With `[-1, 0]` on `[1,2,3,4]` the positive part
    /// of the pad is zero, so every mode performs a pure left crop. torch:
    /// `F.pad([[[1.,2.,3.,4.]]], [-1,0], mode=<m>)` -> shape [1,1,3],
    /// `[2,3,4]` for reflect/replicate/circular alike (#1620).
    #[test]
    fn test_functional_pad_signed_negative_non_constant_crops() {
        let x = t(&[1.0, 2.0, 3.0, 4.0], &[1, 1, 4]);
        let non_constant_modes = [
            PaddingMode::Reflect,
            PaddingMode::Replicate,
            PaddingMode::Circular,
        ];
        for mode in non_constant_modes {
            let y = match functional_pad_1d_signed(&x, -1, 0, mode, 0.0) {
                Ok(cropped) => cropped,
                Err(_) => panic!("negative pad under {mode:?} must crop, not error"),
            };
            assert_eq!(
                y.shape(),
                &[1, 1, 3],
                "{mode:?} crops left -> shape [1,1,3]"
            );
            let want = [2.0, 3.0, 4.0];
            assert_close(y.data().unwrap(), &want, 1e-7);
        }
    }

    /// Delegation invariant: with non-negative pads the signed entry point
    /// must produce exactly what the positive-only `functional_pad_1d` does,
    /// so the signed path can be the single source of truth for constant
    /// padding without changing conv.rs's production behaviour. torch:
    /// `F.pad([[[1,2,3]]], [1,1], "constant", value=2)` -> [2,1,2,3,2].
    #[test]
    fn test_functional_pad_1d_signed_nonneg_equals_positive_path() {
        let src = t(&[1.0, 2.0, 3.0], &[1, 1, 3]);
        let via_signed = functional_pad_1d_signed(&src, 1, 1, PaddingMode::Zeros, 2.0).unwrap();
        let via_unsigned = functional_pad_1d(&src, 1, 1, PaddingMode::Zeros, 2.0).unwrap();
        assert_eq!(via_signed.shape(), via_unsigned.shape());

        // Both paths agree with each other and with the torch oracle.
        let signed_values = via_signed.data().unwrap();
        assert_close(signed_values, via_unsigned.data().unwrap(), 1e-7);
        let want = [2.0, 1.0, 2.0, 3.0, 2.0];
        assert_close(via_signed.data().unwrap(), &want, 1e-7);
    }
}