ferrotorch-nn 0.6.1

//! Loss functions for training neural networks.
//!
//! Unlike layers, loss functions are **not** `Module<T>`. They are callable
//! structs with a `forward(&self, pred, target) -> FerrotorchResult<Tensor<T>>`
//! method. Each loss attaches a backward node to the returned tensor when
//! gradient tracking is enabled.
//!
//! ## REQ status (per `.design/ferrotorch-nn/loss.md`)
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 | SHIPPED | `pub struct MSELoss { pub reduction: Reduction }` + `MSEBackward<T>` mirrors `torch/nn/modules/loss.py:566-630`; consumed by `ferrotorch-optim/src/sgd.rs:524,831` MSE training-loop harness. Runner-arm gap tracked by #1444. |
//! | REQ-2 | SHIPPED | `pub struct CrossEntropyLoss { pub reduction, pub label_smoothing }` + numerically stable log-softmax + `CrossEntropyBackward<T>` mirrors `torch/nn/modules/loss.py:1197-1406`; consumed via `ferrotorch-nn/src/lib.rs:213` re-export and `ferrotorch_nn::prelude::CrossEntropyLoss` at `lib.rs:289`. Runner-arm: #1444. |
//! | REQ-3 | SHIPPED | `pub struct BCEWithLogitsLoss { pub reduction, pub pos_weight }` using the `softplus`-stable form + `BCEWithLogitsBackward<T>` mirrors `torch/nn/modules/loss.py:718-845`; consumed via `lib.rs:213,289` re-exports. Runner-arm: #1444. |
//! | REQ-4 | SHIPPED | `pub struct BCELoss { pub reduction }` + `BCEBackward<T>` mirrors `torch/nn/modules/loss.py:632-717`; consumed via `lib.rs:213,289` re-exports. Runner-arm: #1444. |
//! | REQ-5 | SHIPPED | `pub struct L1Loss { pub reduction }` + `L1Backward<T>` mirrors `torch/nn/modules/loss.py:65-134`; consumed via `lib.rs:213,289` re-exports. Runner-arm: #1444. |
//! | REQ-6 | SHIPPED | `pub struct NLLLoss { pub reduction, pub ignore_index, pub weight }` + `NLLBackward<T>` mirrors `torch/nn/modules/loss.py:135-273`; consumed via `lib.rs:213,289` re-exports. Runner-arm: #1444. |
//! | REQ-7 | SHIPPED | `pub struct KLDivLoss { pub reduction, pub log_target }` + `KLDivBackward<T>` mirrors `torch/nn/modules/loss.py:463-565`; consumed via `lib.rs:214` re-export. Runner-arm: #1444. |
//! | REQ-8 | SHIPPED | `pub struct SmoothL1Loss { pub reduction, pub beta }` mirrors `torch/nn/modules/loss.py:987-1079`; consumed via `lib.rs:215` re-export. Runner-arm: #1444. |
//! | REQ-9 | SHIPPED | `pub struct HuberLoss { pub reduction, pub delta }` + `HuberBackward<T>` mirrors `torch/nn/modules/loss.py:1080-1149`; consumed via `lib.rs:213` re-export. Runner-arm: #1444. |
//! | REQ-10 | SHIPPED | `pub struct PoissonNLLLoss { pub reduction, pub log_input, pub full, pub eps }` + `PoissonNLLBackward<T>` mirrors `torch/nn/modules/loss.py:286-375`; consumed via `lib.rs:214` re-export. Runner-arm: #1444. |
//! | REQ-11 | SHIPPED | `pub struct GaussianNLLLoss { pub reduction, pub full, pub eps }` with eps-clamp on `var` + `GaussianNLLBackward<T>` mirrors `torch/nn/modules/loss.py:376-462`; consumed via `lib.rs:213` re-export. Runner-arm: #1444. |
//! | REQ-12 | SHIPPED | `pub struct HingeEmbeddingLoss { pub reduction, pub margin }` + `HingeEmbeddingBackward<T>` mirrors `torch/nn/modules/loss.py:846-923`; consumed via `lib.rs:214` re-export. Runner-arm: #1444. |
//! | REQ-13 | SHIPPED | `pub struct MarginRankingLoss { pub reduction, pub margin }` + `MarginRankingBackward<T>` with `forward_pair(x1, x2, y)` mirrors `torch/nn/modules/loss.py:1694-1760`; consumed via `lib.rs:214` re-export. Runner-arm: #1444. |
//! | REQ-14 | SHIPPED | `pub struct TripletMarginLoss { pub reduction, pub margin, pub p, pub swap, pub eps }` + `TripletMarginBackward<T>` with `forward_triplet(anchor, positive, negative)` mirrors `torch/nn/modules/loss.py:1857-1966`; consumed via `lib.rs:216` re-export. Runner-arm: #1444. |
//! | REQ-15 | SHIPPED | `pub struct CosineEmbeddingLoss { pub reduction, pub margin }` + `CosineEmbeddingBackward<T>` with `forward_pair(x1, x2, y)` mirrors `torch/nn/modules/loss.py:1622-1693`; consumed via `lib.rs:213` re-export. Runner-arm: #1444. |
//! | REQ-16 | SHIPPED | `pub struct CTCLoss { pub blank, pub reduction, pub zero_infinity }` + `CTCBackward<T>` (forward + backward DP) mirrors `torch/nn/modules/loss.py:2102-2245`; consumed via `lib.rs:213` re-export. Runner-arm: #1444. |
//! | REQ-17 | SHIPPED | `pub struct MultiMarginLoss` + `MultiMarginBackward<T>` and `pub struct MultiLabelSoftMarginLoss` + `MultiLabelSoftMarginBackward<T>` mirror `torch/nn/modules/loss.py:1566-1622, 1761-1857`; consumed via `lib.rs:215` re-export. |
//! | REQ-18 | SHIPPED | Every loss returns `Tensor::from_operation(..., grad_fn)` when `is_grad_enabled() && pred.requires_grad()`; the hand-written backward nodes (`MSEBackward`, `CrossEntropyBackward`, `BCEWithLogitsBackward`, `BCEBackward`, `L1Backward`, `NLLBackward`, `KLDivBackward`, `HuberBackward`, `PoissonNLLBackward`, `GaussianNLLBackward`, `HingeEmbeddingBackward`, `MarginRankingBackward`, `TripletMarginBackward`, `CosineEmbeddingBackward`, `CTCBackward`, `MultiMarginBackward`, `MultiLabelSoftMarginBackward`) divide by N for `Reduction::Mean`; consumed by `ferrotorch-optim/src/sgd.rs:524,831` MSE production training loop. |
//! | REQ-19 | SHIPPED | Every `forward` opens with `autocast_guard("<op_name>")` from `ferrotorch_core::autograd::autocast_ops` (`"mse_loss"`, `"cross_entropy"`, `"bce_with_logits"`, `"bce"`, `"l1_loss"`, `"nll_loss"`, `"kl_div"`, `"smooth_l1"`, `"huber"`, `"poisson_nll"`, `"gaussian_nll"`, `"hinge_embedding"`, `"margin_ranking"`, `"triplet_margin"`, `"cosine_embedding"`, `"ctc"`); consumed by the autocast policy table classifying every loss as `FullPrecision`. |

use std::sync::Arc;

use ferrotorch_core::Float;
use ferrotorch_core::autograd::autocast_ops::autocast_guard;
use ferrotorch_core::autograd::no_grad::is_grad_enabled;
use ferrotorch_core::error::{FerrotorchError, FerrotorchResult};
use ferrotorch_core::ops::elementwise::{binary_map, mean, sum, unary_map};
use ferrotorch_core::storage::TensorStorage;
use ferrotorch_core::tensor::{GradFn, Tensor};
use num_traits::{One, Zero};

use crate::module::Reduction;

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Collapse an elementwise loss tensor according to `reduction`.
///
/// `Reduction::None` returns a clone of `unreduced`; `Reduction::Mean` and
/// `Reduction::Sum` delegate to the `mean` / `sum` elementwise ops and
/// propagate their errors.
fn apply_reduction<T: Float>(
    unreduced: &Tensor<T>,
    reduction: Reduction,
) -> FerrotorchResult<Tensor<T>> {
    let reduced = match reduction {
        Reduction::Sum => sum(unreduced)?,
        Reduction::Mean => mean(unreduced)?,
        Reduction::None => unreduced.clone(),
    };
    Ok(reduced)
}

// ===========================================================================
// MSELoss
// ===========================================================================

/// Mean Squared Error loss.
///
/// `loss_i = (pred_i - target_i)^2`, then the chosen reduction is applied.
///
/// Construct via [`MSELoss::new`] / [`MSELoss::default`]; new fields may be
/// added in minor releases (`#[non_exhaustive]`). Direct field access via
/// `loss.reduction` continues to work.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MSELoss {
    /// How the elementwise squared errors are combined into the returned
    /// tensor (`Reduction::None`, `Reduction::Mean`, or `Reduction::Sum`).
    pub reduction: Reduction,
}

impl MSELoss {
    /// Create an MSE loss that combines elementwise errors with `reduction`.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Compute the squared-error loss between `pred` and `target`.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"mse_loss"`).
    ///
    /// When gradient tracking is enabled and `pred` requires grad, the
    /// returned tensor carries an `MSEBackward` node; otherwise the plain
    /// reduced tensor is returned.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` when the two shapes differ,
    /// and propagates any error from the elementwise / reduction ops.
    pub fn forward<T: Float>(
        &self,
        pred: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("mse_loss");

        let (pred_shape, target_shape) = (pred.shape(), target.shape());
        if pred_shape != target_shape {
            let message = format!(
                "MSELoss: pred shape {:?} != target shape {:?}",
                pred_shape, target_shape
            );
            return Err(FerrotorchError::ShapeMismatch { message });
        }

        // Residual first, then square, then reduce.
        let squared_error = binary_map(pred, target, |p, t| p - t)
            .and_then(|residual| unary_map(&residual, |x| x * x))?;
        let loss = apply_reduction(&squared_error, self.reduction)?;

        // No autograd node needed: hand back the plain value.
        if !is_grad_enabled() || !pred.requires_grad() {
            return Ok(loss);
        }

        let grad_fn = Arc::new(MSEBackward {
            pred: pred.clone(),
            target: target.clone(),
            reduction: self.reduction,
        });
        let storage = TensorStorage::cpu(loss.data_vec()?);
        Tensor::from_operation(storage, loss.shape().to_vec(), grad_fn)
    }
}

impl Default for MSELoss {
    /// Mean-reduced MSE loss.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `MSELoss`.
///
/// `grad_pred = 2 * (pred - target) * grad_output / n` (mean reduction)
/// `grad_pred = 2 * (pred - target) * grad_output`     (sum reduction)
/// `grad_pred = 2 * (pred - target) * grad_output`     (no reduction, elementwise)
#[derive(Debug)]
struct MSEBackward<T: Float> {
    /// Prediction tensor from the forward pass; the only input that receives
    /// a gradient from this node.
    pred: Tensor<T>,
    /// Target tensor from the forward pass; used to rebuild the residual.
    target: Tensor<T>,
    /// Reduction used in the forward pass; `Mean` adds the division by `n`.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for MSEBackward<T> {
    /// Compute `2 * (pred - target) * grad_output`, additionally divided by
    /// the element count of `pred` for `Reduction::Mean`.
    ///
    /// All arithmetic runs inside `no_grad`; the result is returned as the
    /// single gradient for `pred`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use ferrotorch_core::autograd::no_grad::no_grad;
        use ferrotorch_core::creation::scalar;
        use ferrotorch_core::grad_fns::arithmetic::{div, mul, sub};

        let grad_pred = no_grad(|| {
            let residual = sub(&self.pred, &self.target)?;
            // Scalar constants are moved to the device `pred` lives on.
            let two = scalar(T::from(2.0).unwrap())?.to(self.pred.device())?;
            let doubled = mul(&residual, &two)?;
            let upstream_scaled = mul(&doubled, grad_output)?;
            match self.reduction {
                Reduction::Mean => {
                    let numel: usize = self.pred.shape().iter().product();
                    let count = scalar(T::from(numel).unwrap())?.to(self.pred.device())?;
                    div(&upstream_scaled, &count)
                }
                Reduction::Sum | Reduction::None => Ok(upstream_scaled),
            }
        })?;
        Ok(vec![Some(grad_pred)])
    }

    /// Only `pred` is reported as a differentiable input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.pred]
    }

    /// Node name used to identify this backward function.
    fn name(&self) -> &'static str {
        "MSEBackward"
    }
}

// ===========================================================================
// CrossEntropyLoss
// ===========================================================================

/// Cross-entropy loss combining log-softmax and NLL.
///
/// Expects logits `[B, C]` and integer class targets `[B]` (stored as floats,
/// e.g. `0.0`, `1.0`, `2.0`).
///
/// With label smoothing `ls`:
/// ```text
/// loss = (1 - ls) * nll + ls * (-log_probs.mean(dim=-1))
/// ```
///
/// Construct via [`CrossEntropyLoss::new`] / [`CrossEntropyLoss::default`];
/// `#[non_exhaustive]` reserves the right to add fields in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrossEntropyLoss {
    /// How the per-sample losses (one per batch row) are combined.
    pub reduction: Reduction,
    /// Smoothing weight `ls` from the formula above; the smoothing term is
    /// only applied when this is greater than `0.0`.
    pub label_smoothing: f64,
}

impl CrossEntropyLoss {
    /// Create a cross-entropy loss.
    ///
    /// `label_smoothing` is the weight `ls` given to the uniform-over-classes
    /// term; `0.0` disables smoothing.
    pub fn new(reduction: Reduction, label_smoothing: f64) -> Self {
        Self {
            reduction,
            label_smoothing,
        }
    }

    /// Compute cross-entropy loss.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"cross_entropy"`).
    ///
    /// `logits` must be `[B, C]` and `targets` must be `[B]`, holding class
    /// indices stored as floats. The log-softmax is computed per row with the
    /// row maximum subtracted for numerical stability.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::InvalidArgument` if `logits` is not 2-D, if it has
    ///   zero classes while the batch is non-empty, or if any target is not a
    ///   valid class index in `[0, C)` (negative, NaN, or too large).
    /// - `FerrotorchError::ShapeMismatch` if `targets` is not of shape `[B]`.
    /// - Any error propagated from reading tensor data or building tensors.
    pub fn forward<T: Float>(
        &self,
        logits: &Tensor<T>,
        targets: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("cross_entropy");

        let shape = logits.shape();
        if shape.len() != 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CrossEntropyLoss: expected 2D logits [B, C], got shape {:?}",
                    shape
                ),
            });
        }
        let batch = shape[0];
        let classes = shape[1];

        if targets.shape() != [batch] {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "CrossEntropyLoss: target shape {:?} does not match batch size {}",
                    targets.shape(),
                    batch,
                ),
            });
        }

        // A non-empty batch with no classes has no row maximum to take and no
        // valid target; reject it instead of panicking on an empty row.
        if batch > 0 && classes == 0 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CrossEntropyLoss: logits shape {:?} has zero classes",
                    shape
                ),
            });
        }

        let logits_data = logits.data_vec()?;
        let targets_data = targets.data_vec()?;

        // Validate every target up front. Previously an unconvertible target
        // (negative / NaN) silently became class 0 and a target >= C indexed
        // into a neighbouring row (or past the end of the buffer).
        let target_classes = targets_data
            .iter()
            .enumerate()
            .map(|(b, t)| match t.to_usize() {
                Some(class) if class < classes => Ok(class),
                _ => Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "CrossEntropyLoss: target at batch index {} is not a valid class index in [0, {})",
                        b, classes
                    ),
                }),
            })
            .collect::<FerrotorchResult<Vec<usize>>>()?;

        let ls = T::from(self.label_smoothing).unwrap();
        let one = <T as One>::one();

        // Compute log_softmax along dim=-1 (the class dimension).
        let mut log_probs = vec![<T as Zero>::zero(); batch * classes];
        let mut softmax_out = vec![<T as Zero>::zero(); batch * classes];

        for b in 0..batch {
            let base = b * classes;
            // Numerical stability: subtract max.
            let mut max_val = logits_data[base];
            for c in 1..classes {
                if logits_data[base + c] > max_val {
                    max_val = logits_data[base + c];
                }
            }
            let mut sum_exp = <T as Zero>::zero();
            for c in 0..classes {
                let e = (logits_data[base + c] - max_val).exp();
                softmax_out[base + c] = e;
                sum_exp += e;
            }
            let log_sum = sum_exp.ln();
            for c in 0..classes {
                softmax_out[base + c] = softmax_out[base + c] / sum_exp;
                log_probs[base + c] = logits_data[base + c] - max_val - log_sum;
            }
        }

        // Compute per-sample loss.
        let mut losses = vec![<T as Zero>::zero(); batch];
        for (b, &target_class) in target_classes.iter().enumerate() {
            let base = b * classes;

            // NLL component: -log_probs[target_class]
            let nll = -log_probs[base + target_class];

            if self.label_smoothing > 0.0 {
                // Smooth component: -mean(log_probs along class dim)
                let mut sum_lp = <T as Zero>::zero();
                for c in 0..classes {
                    sum_lp += log_probs[base + c];
                }
                let smooth = -sum_lp / T::from(classes).unwrap();
                losses[b] = (one - ls) * nll + ls * smooth;
            } else {
                losses[b] = nll;
            }
        }

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && logits.requires_grad() {
            // The backward node reuses the softmax probabilities computed above.
            let softmax_tensor =
                Tensor::from_storage(TensorStorage::cpu(softmax_out), vec![batch, classes], false)?;
            let grad_fn = Arc::new(CrossEntropyBackward {
                logits: logits.clone(),
                targets: targets.clone(),
                softmax: softmax_tensor,
                label_smoothing: self.label_smoothing,
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for CrossEntropyLoss {
    /// Mean-reduced cross-entropy with label smoothing disabled.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            label_smoothing: 0.0,
        }
    }
}

/// Backward for `CrossEntropyLoss`.
///
/// Gradient through log_softmax + NLL:
/// `grad_logits[b, c] = softmax[b, c] - one_hot[b, c]` (for mean: divided by B)
///
/// With label smoothing `ls`:
/// `grad_logits[b, c] = (1 - ls) * (softmax[b, c] - one_hot[b, c])
///                     + ls * (softmax[b, c] - 1/C)`
/// Simplifies to: `softmax[b, c] - ((1 - ls) * one_hot[b, c] + ls / C)`
#[derive(Debug)]
struct CrossEntropyBackward<T: Float> {
    /// Logits from the forward pass; the only input that receives a gradient.
    logits: Tensor<T>,
    /// Class indices stored as floats, one per batch row.
    targets: Tensor<T>,
    /// Softmax probabilities `[B, C]` computed during the forward pass.
    softmax: Tensor<T>,
    /// Label-smoothing weight `ls` used in the forward pass.
    label_smoothing: f64,
    /// Reduction used in the forward pass; selects how `grad_output` is
    /// indexed and scaled per row.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for CrossEntropyBackward<T> {
    /// Compute `(softmax - target_distribution) * scale` for every logit,
    /// where `target_distribution = (1 - ls) * one_hot + ls / C`.
    ///
    /// `scale` is `grad_output[0] / B` for `Mean`, `grad_output[0]` for `Sum`
    /// and `grad_output[b]` for `None`.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "CrossEntropy backward" }`
    /// when `grad_output` lives on a CUDA device, since this implementation
    /// reads tensor data on the host. Also propagates any `FerrotorchError`
    /// from intermediate tensor reads or from building the result tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let shape = self.logits.shape();
        let (batch, classes) = (shape[0], shape[1]);
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "CrossEntropy backward",
            });
        }

        let probs = self.softmax.data()?;
        let target_values = self.targets.data()?;
        let upstream = grad_output.data()?;

        let zero = <T as Zero>::zero();
        let one = <T as One>::one();
        let smoothing = T::from(self.label_smoothing).unwrap();
        let uniform = T::from(1.0).unwrap() / T::from(classes).unwrap();

        let mut grad_logits = vec![zero; batch * classes];
        for b in 0..batch {
            let target_class = target_values[b].to_usize().unwrap_or(0);

            // Upstream gradient for this row, including the 1/B of the mean.
            let scale = match self.reduction {
                Reduction::None => upstream[b],
                Reduction::Sum => upstream[0],
                Reduction::Mean => upstream[0] / T::from(batch).unwrap(),
            };

            let row_start = b * classes;
            for c in 0..classes {
                let one_hot = if c == target_class { one } else { zero };
                // Smoothed target distribution: (1 - ls) * one_hot + ls / C.
                let target_dist = (one - smoothing) * one_hot + smoothing * uniform;
                grad_logits[row_start + c] = (probs[row_start + c] - target_dist) * scale;
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grad_logits), shape.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `logits` is reported as a differentiable input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.logits]
    }

    /// Node name used to identify this backward function.
    fn name(&self) -> &'static str {
        "CrossEntropyBackward"
    }
}

// ===========================================================================
// BCEWithLogitsLoss
// ===========================================================================

/// Binary cross-entropy loss with logits (numerically stable).
///
/// ```text
/// loss = max(x, 0) - x*y + log(1 + exp(-|x|))
/// ```
///
/// Backward: `grad = sigmoid(x) - y`
///
/// `#[non_exhaustive]`: construct via [`BCEWithLogitsLoss::new`] /
/// [`BCEWithLogitsLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct BCEWithLogitsLoss {
    /// How the elementwise losses are combined into the returned tensor.
    pub reduction: Reduction,
}

impl BCEWithLogitsLoss {
    /// Create a BCE-with-logits loss using the given reduction.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Compute the binary cross-entropy of raw `logits` against `targets`.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"bce_with_logits"`).
    ///
    /// Each element is evaluated as `max(x, 0) - x*y + log(1 + exp(-|x|))`,
    /// which never exponentiates a positive number.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` when the two shapes differ,
    /// and propagates errors from tensor reads, construction and reduction.
    pub fn forward<T: Float>(
        &self,
        logits: &Tensor<T>,
        targets: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("bce_with_logits");

        let shape = logits.shape();
        if shape != targets.shape() {
            let message = format!(
                "BCEWithLogitsLoss: logits shape {:?} != targets shape {:?}",
                shape,
                targets.shape()
            );
            return Err(FerrotorchError::ShapeMismatch { message });
        }

        let xs = logits.data_vec()?;
        let ys = targets.data_vec()?;
        let zero = <T as Zero>::zero();
        let one = <T as One>::one();

        let mut elementwise = Vec::with_capacity(xs.len());
        for (&x, &y) in xs.iter().zip(ys.iter()) {
            // max(x, 0) and |x|, both decided by the same sign test.
            let positive_part = if x > zero { x } else { zero };
            let magnitude = if x > zero { x } else { -x };
            elementwise.push(positive_part - x * y + (one + (-magnitude).exp()).ln());
        }

        let unreduced =
            Tensor::from_storage(TensorStorage::cpu(elementwise), shape.to_vec(), false)?;
        let loss = apply_reduction(&unreduced, self.reduction)?;

        // No autograd node needed: hand back the plain value.
        if !is_grad_enabled() || !logits.requires_grad() {
            return Ok(loss);
        }

        let grad_fn = Arc::new(BCEWithLogitsBackward {
            logits: logits.clone(),
            targets: targets.clone(),
            reduction: self.reduction,
        });
        let storage = TensorStorage::cpu(loss.data_vec()?);
        Tensor::from_operation(storage, loss.shape().to_vec(), grad_fn)
    }
}

impl Default for BCEWithLogitsLoss {
    /// Mean-reduced BCE-with-logits loss.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `BCEWithLogitsLoss`.
///
/// `grad = (sigmoid(x) - y) * grad_output`
#[derive(Debug)]
struct BCEWithLogitsBackward<T: Float> {
    /// Logits from the forward pass; the only input that receives a gradient.
    logits: Tensor<T>,
    /// Targets from the forward pass, subtracted from `sigmoid(logits)`.
    targets: Tensor<T>,
    /// Reduction used in the forward pass; `Mean` adds a division by the
    /// element count of `logits`.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for BCEWithLogitsBackward<T> {
    /// Compute `(sigmoid(logits) - targets) * grad_output`, additionally
    /// divided by the element count of `logits` for `Reduction::Mean`.
    ///
    /// All arithmetic runs inside `no_grad`; the result is returned as the
    /// single gradient for `logits`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use ferrotorch_core::autograd::no_grad::no_grad;
        use ferrotorch_core::creation::scalar;
        use ferrotorch_core::grad_fns::activation::sigmoid;
        use ferrotorch_core::grad_fns::arithmetic::{div, mul, sub};

        let grad_logits = no_grad(|| {
            let probs = sigmoid(&self.logits)?;
            let residual = sub(&probs, &self.targets)?;
            let upstream_scaled = mul(&residual, grad_output)?;
            match self.reduction {
                Reduction::Mean => {
                    let numel: usize = self.logits.shape().iter().product();
                    // The divisor is moved to the device `logits` lives on.
                    let count = scalar(T::from(numel).unwrap())?.to(self.logits.device())?;
                    div(&upstream_scaled, &count)
                }
                Reduction::Sum | Reduction::None => Ok(upstream_scaled),
            }
        })?;
        Ok(vec![Some(grad_logits)])
    }

    /// Only `logits` is reported as a differentiable input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.logits]
    }

    /// Node name used to identify this backward function.
    fn name(&self) -> &'static str {
        "BCEWithLogitsBackward"
    }
}

// ===========================================================================
// HuberLoss
// ===========================================================================

/// Huber loss (smooth L1).
///
/// Quadratic for small errors and linear for large ones:
///
/// ```text
/// if |error| < delta:  0.5 * error^2
/// else:                delta * (|error| - 0.5 * delta)
/// ```
///
/// `#[non_exhaustive]`: construct via [`HuberLoss::new`] /
/// [`HuberLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct HuberLoss {
    /// How the elementwise losses are combined into the returned tensor.
    pub reduction: Reduction,
    /// Threshold on `|error|` that separates the quadratic branch from the
    /// linear branch in the formula above.
    pub delta: f64,
}

impl HuberLoss {
    pub fn new(reduction: Reduction, delta: f64) -> Self {
        Self { reduction, delta }
    }

    pub fn forward<T: Float>(
        &self,
        pred: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        if pred.shape() != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "HuberLoss: pred shape {:?} != target shape {:?}",
                    pred.shape(),
                    target.shape()
                ),
            });
        }

        let pred_data = pred.data_vec()?;
        let target_data = target.data_vec()?;
        let delta = T::from(self.delta).unwrap();
        let half = T::from(0.5).unwrap();

        let loss_data: Vec<T> = pred_data
            .iter()
            .zip(target_data.iter())
            .map(|(&p, &t)| {
                let error = p - t;
                let abs_error = error.abs();
                if abs_error < delta {
                    half * error * error
                } else {
                    delta * (abs_error - half * delta)
                }
            })
            .collect();

        let unreduced =
            Tensor::from_storage(TensorStorage::cpu(loss_data), pred.shape().to_vec(), false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && pred.requires_grad() {
            let grad_fn = Arc::new(HuberBackward {
                pred: pred.clone(),
                target: target.clone(),
                delta: self.delta,
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for HuberLoss {
    /// Mean-reduced Huber loss with `delta = 1.0`.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            delta: 1.0,
        }
    }
}

/// Backward for `HuberLoss`.
///
/// ```text
/// if |error| < delta:  grad * error
/// else:                grad * delta * sign(error)
/// ```
#[derive(Debug)]
struct HuberBackward<T: Float> {
    /// Prediction tensor from the forward pass; the only input that receives
    /// a gradient from this node.
    pred: Tensor<T>,
    /// Target tensor from the forward pass; used to rebuild the error.
    target: Tensor<T>,
    /// Threshold from the forward pass; the error is clamped to
    /// `[-delta, delta]` to form the gradient.
    delta: f64,
    /// Reduction used in the forward pass; `Mean` adds a division by the
    /// element count of `pred`.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for HuberBackward<T> {
    /// Compute `clamp(pred - target, -delta, delta) * grad_output`,
    /// additionally divided by the element count of `pred` for
    /// `Reduction::Mean`.
    ///
    /// All arithmetic runs inside `no_grad`; the result is returned as the
    /// single gradient for `pred`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use ferrotorch_core::autograd::no_grad::no_grad;
        use ferrotorch_core::creation::scalar;
        use ferrotorch_core::grad_fns::arithmetic::{div, mul, sub};
        use ferrotorch_core::grad_fns::transcendental::clamp;

        let bound = T::from(self.delta).unwrap();
        let grad_pred = no_grad(|| {
            let residual = sub(&self.pred, &self.target)?;
            // Clamping the error gives `error` inside the quadratic region
            // and `±delta` outside it.
            let bounded = clamp(&residual, -bound, bound)?;
            let upstream_scaled = mul(&bounded, grad_output)?;
            match self.reduction {
                Reduction::Mean => {
                    let numel: usize = self.pred.shape().iter().product();
                    let count = scalar(T::from(numel).unwrap())?.to(self.pred.device())?;
                    div(&upstream_scaled, &count)
                }
                Reduction::Sum | Reduction::None => Ok(upstream_scaled),
            }
        })?;
        Ok(vec![Some(grad_pred)])
    }

    /// Only `pred` is reported as a differentiable input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.pred]
    }

    /// Node name used to identify this backward function.
    fn name(&self) -> &'static str {
        "HuberBackward"
    }
}

// ===========================================================================
// KLDivLoss
// ===========================================================================

/// Kullback-Leibler divergence loss.
///
/// Expects **log-probabilities** as `input` and **probabilities** as `target`:
///
/// ```text
/// loss_i = target_i * (log(target_i) - input_i)
/// ```
///
/// This matches PyTorch's `KLDivLoss` with `log_target=False`. The caller
/// is responsible for passing log-probabilities (e.g., from `LogSoftmax`).
///
/// Note: entries where `target_i == 0` contribute zero loss (0 * log(0) is
/// treated as 0 following the convention).
///
/// `#[non_exhaustive]`: construct via [`KLDivLoss::new`] /
/// [`KLDivLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct KLDivLoss {
    /// How the elementwise divergence terms are combined into the returned
    /// tensor.
    pub reduction: Reduction,
}

impl KLDivLoss {
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        if input.shape() != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "KLDivLoss: input shape {:?} != target shape {:?}",
                    input.shape(),
                    target.shape()
                ),
            });
        }

        let input_data = input.data_vec()?;
        let target_data = target.data_vec()?;
        let zero = <T as Zero>::zero();

        // KL(target || input) = sum(target * (log(target) - input))
        // where log(0) * 0 = 0 by convention.
        let loss_data: Vec<T> = input_data
            .iter()
            .zip(target_data.iter())
            .map(|(&inp, &tgt)| {
                if tgt > zero {
                    tgt * (tgt.ln() - inp)
                } else {
                    zero
                }
            })
            .collect();

        let unreduced =
            Tensor::from_storage(TensorStorage::cpu(loss_data), input.shape().to_vec(), false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && input.requires_grad() {
            let grad_fn = Arc::new(KLDivBackward {
                input: input.clone(),
                target: target.clone(),
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for KLDivLoss {
    /// Mean-reduced KL-divergence loss.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `KLDivLoss`.
///
/// `grad_input = -target * grad_output` (since d/d(input) of target*(log(target) - input) = -target)
#[derive(Debug)]
struct KLDivBackward<T: Float> {
    /// Log-probability input from the forward pass; the only input that
    /// receives a gradient. Its shape and device are used for the mean divisor.
    input: Tensor<T>,
    /// Target probabilities from the forward pass; negated to form the gradient.
    target: Tensor<T>,
    /// Reduction used in the forward pass; `Mean` adds a division by the
    /// element count of `input`.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for KLDivBackward<T> {
    /// Compute `-target * grad_output`, additionally divided by the element
    /// count of `input` for `Reduction::Mean`.
    ///
    /// All arithmetic runs inside `no_grad`; the result is returned as the
    /// single gradient for `input`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use ferrotorch_core::autograd::no_grad::no_grad;
        use ferrotorch_core::creation::scalar;
        use ferrotorch_core::grad_fns::arithmetic::{div, mul, neg};

        let grad_wrt_input = no_grad(|| {
            let negated_target = neg(&self.target)?;
            let upstream_scaled = mul(&negated_target, grad_output)?;
            match self.reduction {
                Reduction::Mean => {
                    let numel: usize = self.input.shape().iter().product();
                    // The divisor is moved to the device `input` lives on.
                    let count = scalar(T::from(numel).unwrap())?.to(self.input.device())?;
                    div(&upstream_scaled, &count)
                }
                Reduction::Sum | Reduction::None => Ok(upstream_scaled),
            }
        })?;
        Ok(vec![Some(grad_wrt_input)])
    }

    /// Only `input` is reported as a differentiable input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name used to identify this backward function.
    fn name(&self) -> &'static str {
        "KLDivBackward"
    }
}

// ===========================================================================
// CosineEmbeddingLoss
// ===========================================================================

/// Cosine embedding loss for measuring similarity between pairs.
///
/// For positive pairs (y = 1):
/// ```text
/// loss = 1 - cos(x1, x2)
/// ```
///
/// For negative pairs (y = -1):
/// ```text
/// loss = max(0, cos(x1, x2) - margin)
/// ```
///
/// `x1` and `x2` must have the same shape. `y` must be a 1-D tensor of
/// `1.0` or `-1.0` values with length equal to the batch size (first dim).
///
/// `#[non_exhaustive]`: construct via [`CosineEmbeddingLoss::new`] /
/// [`CosineEmbeddingLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CosineEmbeddingLoss {
    /// Reduction mode stored by [`CosineEmbeddingLoss::new`].
    pub reduction: Reduction,
    /// Margin subtracted from the cosine similarity for negative pairs
    /// (`y = -1`) in the formula above.
    pub margin: f64,
}

impl CosineEmbeddingLoss {
    /// Creates a loss with the given `reduction` mode and negative-pair `margin`.
    pub fn new(reduction: Reduction, margin: f64) -> Self {
        Self { reduction, margin }
    }

    /// Evaluates the cosine embedding loss for one or more vector pairs.
    ///
    /// - `x1`, `x2`: tensors of identical shape, either `[B, D]` or `[D]`
    ///   (a single pair).
    /// - `y`: one label per pair. A value greater than zero selects the
    ///   positive-pair branch; any other value selects the negative-pair branch.
    ///
    /// The cosine similarity of a pair is taken as `0` when the product of the
    /// two vector norms is not positive. Per-pair losses are computed on the
    /// host and then combined with `apply_reduction`. When gradient tracking is
    /// enabled and either `x1` or `x2` requires gradients, the returned tensor
    /// carries a `CosineEmbeddingBackward` node.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::ShapeMismatch` if `x1` and `x2` differ in shape, or
    ///   if `y` does not hold exactly one element per pair.
    /// - `FerrotorchError::InvalidArgument` if the inputs are not 1-D or 2-D.
    /// - Any error propagated from reading tensor data or applying the reduction.
    ///
    /// # Panics
    ///
    /// Panics if `margin` cannot be converted to `T`.
    pub fn forward_pair<T: Float>(
        &self,
        x1: &Tensor<T>,
        x2: &Tensor<T>,
        y: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        if x1.shape() != x2.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "CosineEmbeddingLoss: x1 shape {:?} != x2 shape {:?}",
                    x1.shape(),
                    x2.shape()
                ),
            });
        }

        let lhs = x1.data_vec()?;
        let rhs = x2.data_vec()?;
        let labels = y.data_vec()?;
        let zero = <T as Zero>::zero();
        let one = <T as One>::one();
        let margin_t = T::from(self.margin).unwrap();

        // A 1-D input is treated as a batch containing a single pair.
        let dims = x1.shape();
        let (batch, feat) = match dims.len() {
            1 => (1, dims[0]),
            2 => (dims[0], dims[1]),
            _ => {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "CosineEmbeddingLoss: expected 1D or 2D input, got shape {:?}",
                        dims
                    ),
                });
            }
        };

        if labels.len() != batch {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "CosineEmbeddingLoss: y length {} != batch size {}",
                    labels.len(),
                    batch
                ),
            });
        }

        let losses: Vec<T> = (0..batch)
            .map(|row| {
                let start = row * feat;
                let u = &lhs[start..start + feat];
                let v = &rhs[start..start + feat];

                // Accumulate the dot product and both squared norms in one pass.
                let mut dot = zero;
                let mut u_sq = zero;
                let mut v_sq = zero;
                for (&a, &b) in u.iter().zip(v.iter()) {
                    dot += a * b;
                    u_sq += a * a;
                    v_sq += b * b;
                }

                let denom = u_sq.sqrt() * v_sq.sqrt();
                let cos_sim = if denom > zero { dot / denom } else { zero };

                if labels[row] > zero {
                    // Positive pair: 1 - cos.
                    one - cos_sim
                } else {
                    // Negative pair: hinge on (cos - margin).
                    let excess = cos_sim - margin_t;
                    if excess > zero { excess } else { zero }
                }
            })
            .collect();

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        let track_grad = is_grad_enabled() && (x1.requires_grad() || x2.requires_grad());
        if !track_grad {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(CosineEmbeddingBackward {
            x1: x1.clone(),
            x2: x2.clone(),
            y: y.clone(),
            margin: self.margin,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for CosineEmbeddingLoss {
    /// Mean reduction with a margin of `0.0`.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            margin: 0.0,
        }
    }
}

/// Backward for `CosineEmbeddingLoss`.
///
/// For positive pairs (y = 1):
/// ```text
/// d(loss)/d(x1_f) = -(x2_f / (||x1|| * ||x2||) - cos_sim * x1_f / ||x1||^2)
/// d(loss)/d(x2_f) = -(x1_f / (||x1|| * ||x2||) - cos_sim * x2_f / ||x2||^2)
/// ```
///
/// For negative pairs (y = -1) where `cos(x1, x2) - margin > 0`:
/// ```text
/// d(loss)/d(x1_f) = x2_f / (||x1|| * ||x2||) - cos_sim * x1_f / ||x1||^2
/// d(loss)/d(x2_f) = x1_f / (||x1|| * ||x2||) - cos_sim * x2_f / ||x2||^2
/// ```
#[derive(Debug)]
struct CosineEmbeddingBackward<T: Float> {
    /// First input of the forward pass (receives a gradient).
    x1: Tensor<T>,
    /// Second input of the forward pass (receives a gradient).
    x2: Tensor<T>,
    /// Pair labels from the forward pass; a value `> 0` marks a positive pair.
    y: Tensor<T>,
    /// Negative-pair margin used by the forward pass.
    margin: f64,
    /// Reduction used by the forward pass; determines how `grad_output` is
    /// distributed over the pairs.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for CosineEmbeddingBackward<T> {
    /// Computes the gradients with respect to `x1` and `x2`.
    ///
    /// Pairs whose norm product is not positive, and negative pairs whose
    /// hinge is inactive (`cos - margin <= 0`), keep a zero gradient.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "CosineEmbedding backward" }`
    /// when `grad_output` lives on a CUDA device — the per-sample cosine-similarity
    /// gradient kernel has no GPU counterpart yet. Move `grad_output` to CPU
    /// explicitly to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let shape = self.x1.shape();
        // A 1-D input is a single pair; otherwise rows are pairs.
        let (batch, feat) = if shape.len() == 1 {
            (1, shape[0])
        } else {
            (shape[0], shape[1])
        };

        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "CosineEmbedding backward",
            });
        }

        let lhs = self.x1.data()?;
        let rhs = self.x2.data()?;
        let labels = self.y.data()?;
        let upstream = grad_output.data()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();

        let mut grad_x1 = vec![zero; batch * feat];
        let mut grad_x2 = vec![zero; batch * feat];

        for row in 0..batch {
            let start = row * feat;

            // Upstream gradient seen by this pair under the chosen reduction.
            let scale = match self.reduction {
                Reduction::Mean => upstream[0] / T::from(batch).unwrap(),
                Reduction::Sum => upstream[0],
                Reduction::None => upstream[row],
            };

            let u = &lhs[start..start + feat];
            let v = &rhs[start..start + feat];

            // Recompute the cosine similarity for this pair.
            let mut dot = zero;
            let mut u_sq = zero;
            let mut v_sq = zero;
            for (&a, &b) in u.iter().zip(v.iter()) {
                dot += a * b;
                u_sq += a * a;
                v_sq += b * b;
            }
            let denom = u_sq.sqrt() * v_sq.sqrt();
            if denom <= zero {
                continue;
            }
            let cos_sim = dot / denom;

            // Positive pairs contribute -d(cos); negative pairs contribute
            // +d(cos) only while the hinge is active.
            let sign = if labels[row] > zero {
                -<T as One>::one()
            } else if cos_sim - margin_t > zero {
                <T as One>::one()
            } else {
                continue;
            };

            let g1_row = &mut grad_x1[start..start + feat];
            let g2_row = &mut grad_x2[start..start + feat];
            for ((g1, g2), (&a, &b)) in g1_row
                .iter_mut()
                .zip(g2_row.iter_mut())
                .zip(u.iter().zip(v.iter()))
            {
                // d(cos)/d(x1_f) = x2_f / (||x1|| * ||x2||) - cos * x1_f / ||x1||^2
                *g1 = sign * (b / denom - cos_sim * a / u_sq) * scale;
                // d(cos)/d(x2_f) = x1_f / (||x1|| * ||x2||) - cos * x2_f / ||x2||^2
                *g2 = sign * (a / denom - cos_sim * b / v_sq) * scale;
            }
        }

        let grad_x1_tensor =
            Tensor::from_storage(TensorStorage::cpu(grad_x1), shape.to_vec(), false)?;
        let grad_x2_tensor =
            Tensor::from_storage(TensorStorage::cpu(grad_x2), shape.to_vec(), false)?;
        Ok(vec![Some(grad_x1_tensor), Some(grad_x2_tensor)])
    }

    /// Tracked inputs, in the same order as the gradients returned by `backward`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.x1, &self.x2]
    }

    /// Node name reported for this backward function.
    fn name(&self) -> &'static str {
        "CosineEmbeddingBackward"
    }
}

// ===========================================================================
// L1Loss
// ===========================================================================

/// L1 (Mean Absolute Error) loss.
///
/// ```text
/// loss_i = |pred_i - target_i|
/// ```
///
/// Then the chosen reduction is applied.
///
/// Matches `torch.nn.L1Loss`.
///
/// `#[non_exhaustive]`: construct via [`L1Loss::new`] / [`L1Loss::default`];
/// new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct L1Loss {
    /// Reduction applied to the elementwise absolute differences.
    pub reduction: Reduction,
}

impl L1Loss {
    /// Creates an L1 loss with the given `reduction` mode.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Compute L1 loss between `pred` and `target`, which must share a shape.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"l1_loss"`).
    ///
    /// When gradient tracking is enabled and `pred` requires gradients, the
    /// returned tensor carries an `L1Backward` node.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` if the shapes differ, and
    /// propagates any error from the elementwise maps, the reduction, or
    /// tensor construction.
    pub fn forward<T: Float>(
        &self,
        pred: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("l1_loss");

        let (pred_shape, target_shape) = (pred.shape(), target.shape());
        if pred_shape != target_shape {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "L1Loss: pred shape {:?} != target shape {:?}",
                    pred_shape, target_shape
                ),
            });
        }

        // |pred - target|, elementwise, then reduce.
        let residual = binary_map(pred, target, |p, t| p - t)?;
        let magnitude = unary_map(&residual, |r| r.abs())?;
        let reduced = apply_reduction(&magnitude, self.reduction)?;

        let track_grad = is_grad_enabled() && pred.requires_grad();
        if !track_grad {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(L1Backward {
            pred: pred.clone(),
            target: target.clone(),
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for L1Loss {
    /// Mean reduction.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `L1Loss`.
///
/// `grad_pred = sign(pred - target) * grad_output / n` (mean reduction)
/// `grad_pred = sign(pred - target) * grad_output`     (sum reduction)
/// `grad_pred = sign(pred - target) * grad_output`     (no reduction, elementwise)
///
/// `sign(0)` is defined as `0` to match PyTorch behavior.
#[derive(Debug)]
struct L1Backward<T: Float> {
    /// Predictions from the forward pass (the only input that receives a gradient).
    pred: Tensor<T>,
    /// Targets from the forward pass.
    target: Tensor<T>,
    /// Reduction used by the forward pass.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for L1Backward<T> {
    /// Computes the gradient with respect to `pred`.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "L1 backward" }`
    /// when `grad_output` lives on a CUDA device — the elementwise sign gradient
    /// kernel has no GPU counterpart yet. Move `grad_output` to CPU explicitly to
    /// run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "L1 backward" });
        }

        let preds = self.pred.data()?;
        let targets = self.target.data()?;
        let upstream = grad_output.data()?;
        let count = T::from(preds.len()).unwrap();

        // Three-valued sign: +1, -1, and 0 for anything that is neither
        // strictly positive nor strictly negative.
        let sign = |delta: T| -> T {
            let zero = <T as Zero>::zero();
            if delta > zero {
                <T as One>::one()
            } else if delta < zero {
                -<T as One>::one()
            } else {
                zero
            }
        };

        // sign(pred - target) for every element, shared by all reductions.
        let signs = preds
            .iter()
            .zip(targets.iter())
            .map(|(&p, &t)| sign(p - t));

        let result: Vec<T> = match self.reduction {
            Reduction::Mean => {
                let go = upstream[0];
                signs.map(|s| s * go / count).collect()
            }
            Reduction::Sum => {
                let go = upstream[0];
                signs.map(|s| s * go).collect()
            }
            Reduction::None => signs
                .zip(upstream.iter())
                .map(|(s, &g)| s * g)
                .collect(),
        };

        let grad_input = Tensor::from_storage(
            TensorStorage::cpu(result),
            self.pred.shape().to_vec(),
            false,
        )?;
        Ok(vec![Some(grad_input)])
    }

    /// The only tracked input is `pred`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.pred]
    }

    /// Node name reported for this backward function.
    fn name(&self) -> &'static str {
        "L1Backward"
    }
}

// ===========================================================================
// NLLLoss
// ===========================================================================

/// Negative log-likelihood loss.
///
/// Takes **log-probabilities** of shape `[B, C]` and integer class targets
/// `[B]` (stored as floats, e.g. `0.0`, `1.0`, `2.0`).
///
/// ```text
/// loss_b = -log_probs[b, target[b]]
/// ```
///
/// Supports an optional `ignore_index`: samples whose target equals this
/// value are excluded from the loss computation. When using `Reduction::Mean`,
/// the denominator is the count of non-ignored samples.
///
/// Matches `torch.nn.NLLLoss`.
///
/// `#[non_exhaustive]`: construct via [`NLLLoss::new`] / [`NLLLoss::default`];
/// new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct NLLLoss {
    /// Reduction applied to the per-sample losses.
    pub reduction: Reduction,
    /// If set, class indices equal to this value are ignored.
    pub ignore_index: Option<isize>,
}

impl NLLLoss {
    /// Creates an NLL loss with the given `reduction` mode and optional
    /// `ignore_index`.
    pub fn new(reduction: Reduction, ignore_index: Option<isize>) -> Self {
        Self {
            reduction,
            ignore_index,
        }
    }

    /// Compute NLL loss.
    ///
    /// # Arguments
    ///
    /// * `log_probs` - Log-probabilities of shape `[B, C]`.
    /// * `targets` - Class indices of shape `[B]`, stored as floats.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"nll_loss"`).
    ///
    /// Samples whose target equals `ignore_index` contribute a loss of `0` and
    /// are not counted in the `Reduction::Mean` denominator. If every sample is
    /// ignored, the mean is defined as `0`. An empty batch yields a scalar `0`
    /// (or an empty `[0]` tensor for `Reduction::None`) without a backward node.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::InvalidArgument` if `log_probs` is not 2-D, if a
    ///   target is not convertible to an integer (NaN, infinite, or out of the
    ///   `isize` range), or if a non-ignored target is negative or `>= C`.
    /// - `FerrotorchError::ShapeMismatch` if `targets` is not of shape `[B]`.
    /// - Any error propagated from tensor reads, summation, or construction.
    pub fn forward<T: Float>(
        &self,
        log_probs: &Tensor<T>,
        targets: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("nll_loss");

        let shape = log_probs.shape();
        if shape.len() != 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "NLLLoss: expected 2D log_probs [B, C], got shape {:?}",
                    shape
                ),
            });
        }
        let batch = shape[0];
        let classes = shape[1];

        if targets.shape() != [batch] {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "NLLLoss: target shape {:?} does not match batch size {}",
                    targets.shape(),
                    batch,
                ),
            });
        }

        if batch == 0 {
            // Empty batch: return scalar zero for Mean/Sum, empty [0] for None.
            return match self.reduction {
                Reduction::None => Tensor::from_storage(TensorStorage::cpu(vec![]), vec![0], false),
                _ => Tensor::from_storage(
                    TensorStorage::cpu(vec![<T as Zero>::zero()]),
                    vec![],
                    false,
                ),
            };
        }

        let lp_data = log_probs.data_vec()?;
        let targets_data = targets.data_vec()?;

        let mut losses = vec![<T as Zero>::zero(); batch];
        let mut valid_count: usize = 0;

        for (b, loss) in losses.iter_mut().enumerate() {
            // A target that cannot be converted to an integer (NaN, infinity,
            // out of range) is rejected instead of being silently treated as
            // class 0.
            let target_idx = targets_data[b].to_isize().ok_or_else(|| {
                FerrotorchError::InvalidArgument {
                    message: format!(
                        "NLLLoss: target at batch element {} is not a finite integer class index",
                        b
                    ),
                }
            })?;

            // Ignored samples keep loss = 0 and are not counted.
            if self.ignore_index == Some(target_idx) {
                continue;
            }

            // Check the sign before casting so that negative targets are
            // reported with their real value rather than a wrapped `usize`.
            if target_idx < 0 || target_idx as usize >= classes {
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "NLLLoss: target index {} is out of range for {} classes at batch element {}",
                        target_idx, classes, b
                    ),
                });
            }
            let target_class = target_idx as usize;

            *loss = -lp_data[b * classes + target_class];
            valid_count += 1;
        }

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;

        // Apply reduction, but for Mean we need to use valid_count instead of batch.
        let reduced = match self.reduction {
            Reduction::None => unreduced,
            Reduction::Sum => sum(&unreduced)?,
            Reduction::Mean => {
                if valid_count == 0 {
                    // All samples ignored: return 0.
                    Tensor::from_storage(
                        TensorStorage::cpu(vec![<T as Zero>::zero()]),
                        vec![],
                        false,
                    )?
                } else {
                    let s = sum(&unreduced)?;
                    let s_data = s.data_vec()?;
                    let mean_val = s_data[0] / T::from(valid_count).unwrap();
                    Tensor::from_storage(TensorStorage::cpu(vec![mean_val]), vec![], false)?
                }
            }
        };

        if is_grad_enabled() && log_probs.requires_grad() {
            let grad_fn = Arc::new(NLLBackward {
                log_probs: log_probs.clone(),
                targets: targets.clone(),
                reduction: self.reduction,
                ignore_index: self.ignore_index,
                valid_count,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for NLLLoss {
    /// Mean reduction with no ignored class index.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            ignore_index: None,
        }
    }
}

/// Backward for `NLLLoss`.
///
/// `grad_log_probs[b, c] = 0` for `c != target[b]`
/// `grad_log_probs[b, target[b]] = -1 * scale`
///
/// where `scale = grad_output / valid_count` (mean) or `grad_output` (sum).
/// With no reduction, `scale` is the per-sample entry `grad_output[b]`.
/// Rows whose target equals `ignore_index` stay all-zero.
#[derive(Debug)]
struct NLLBackward<T: Float> {
    /// Log-probabilities from the forward pass (the only input that receives a gradient).
    log_probs: Tensor<T>,
    /// Class indices from the forward pass, stored as floats.
    targets: Tensor<T>,
    /// Reduction used by the forward pass.
    reduction: Reduction,
    /// Class index whose samples are skipped, if any.
    ignore_index: Option<isize>,
    /// Number of non-ignored samples counted by the forward pass; the mean
    /// reduction divides by this value.
    valid_count: usize,
}

impl<T: Float> GradFn<T> for NLLBackward<T> {
    /// Computes the gradient with respect to `log_probs`.
    ///
    /// Each non-ignored row receives `-scale` at its target column and zero
    /// everywhere else.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "NLL backward" }`
    /// when `grad_output` lives on a CUDA device — the per-target one-hot scatter
    /// has no GPU counterpart yet. Move `grad_output` to CPU explicitly to run
    /// this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let shape = self.log_probs.shape();
        let (batch, classes) = (shape[0], shape[1]);

        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "NLL backward" });
        }

        let labels = self.targets.data()?;
        let upstream = grad_output.data()?;
        let zero = <T as Zero>::zero();

        let mut result = vec![zero; batch * classes];

        for row in 0..batch {
            let class_idx = labels[row].to_isize().unwrap_or(0);

            // Ignored rows keep an all-zero gradient.
            if self.ignore_index == Some(class_idx) {
                continue;
            }

            let scale = match self.reduction {
                // Mean divides by the number of non-ignored samples.
                Reduction::Mean if self.valid_count > 0 => {
                    upstream[0] / T::from(self.valid_count).unwrap()
                }
                Reduction::Mean => zero,
                Reduction::Sum => upstream[0],
                Reduction::None => upstream[row],
            };

            result[row * classes + class_idx as usize] = -scale;
        }

        let grad_input = Tensor::from_storage(TensorStorage::cpu(result), shape.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The only tracked input is `log_probs`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.log_probs]
    }

    /// Node name reported for this backward function.
    fn name(&self) -> &'static str {
        "NLLBackward"
    }
}

// ===========================================================================
// SmoothL1Loss
// ===========================================================================

/// Smooth L1 loss, an alias for [`HuberLoss`] with `delta = 1.0`.
///
/// ```text
/// if |error| < 1:  0.5 * error^2
/// else:            |error| - 0.5
/// ```
///
/// This is the same as PyTorch's `SmoothL1Loss` with `beta=1.0`.
///
/// `#[non_exhaustive]`: construct via [`SmoothL1Loss::new`] /
/// [`SmoothL1Loss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SmoothL1Loss {
    /// Reduction forwarded to the underlying [`HuberLoss`].
    pub reduction: Reduction,
}

impl SmoothL1Loss {
    /// Creates a Smooth L1 loss with the given `reduction` mode.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Computes the loss by building a [`HuberLoss`] with `delta = 1.0` and
    /// the same reduction, then forwarding `pred` and `target` to it.
    ///
    /// # Errors
    ///
    /// Propagates whatever `HuberLoss::forward` returns.
    pub fn forward<T: Float>(
        &self,
        pred: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        HuberLoss::new(self.reduction, 1.0).forward(pred, target)
    }
}

impl Default for SmoothL1Loss {
    /// Mean reduction.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

// ===========================================================================
// BCELoss
// ===========================================================================

/// Binary cross-entropy loss.
///
/// Expects **probabilities** (after sigmoid) as `input` and binary targets in `{0, 1}`.
///
/// ```text
/// loss_i = -(target_i * log(input_i) + (1 - target_i) * log(1 - input_i))
/// ```
///
/// **Important**: unlike [`BCEWithLogitsLoss`], this does **not** apply sigmoid
/// internally. Inputs must be in `[0, 1]`. For numerical stability, values are
/// clamped to `[eps, 1 - eps]` with a small `eps` (nominally `1e-12`).
///
/// Matches `torch.nn.BCELoss`.
///
/// `#[non_exhaustive]`: construct via [`BCELoss::new`] / [`BCELoss::default`];
/// new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct BCELoss {
    /// Reduction applied to the elementwise losses.
    pub reduction: Reduction,
}

impl BCELoss {
    /// Creates a BCE loss with the given `reduction` mode.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Compute BCE loss.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"bce_loss"`).
    ///
    /// Each input value is clamped to `[eps, 1 - eps]` before the logarithms
    /// are taken. `eps` is `1e-12`, widened to the machine epsilon of `T` when
    /// `1e-12` is smaller than that. Without the widening `1 - eps` rounds back
    /// to `1` (this happens for `f32`), so an input of exactly `1` would produce
    /// `ln(0)` and a NaN or infinite loss.
    ///
    /// When gradient tracking is enabled and `input` requires gradients, the
    /// returned tensor carries a `BCEBackward` node.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` if `input` and `target` differ
    /// in shape, and propagates any error from tensor reads, the reduction, or
    /// tensor construction.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("bce_loss");

        if input.shape() != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "BCELoss: input shape {:?} != target shape {:?}",
                    input.shape(),
                    target.shape()
                ),
            });
        }

        let input_data = input.data_vec()?;
        let target_data = target.data_vec()?;
        let one = <T as One>::one();

        // Use the nominal 1e-12 where the type can resolve it next to 1.0;
        // otherwise fall back to the machine epsilon so that `1 - eps < 1`.
        let nominal_eps = T::from(1e-12).unwrap();
        let machine_eps = T::epsilon();
        let eps = if nominal_eps > machine_eps {
            nominal_eps
        } else {
            machine_eps
        };
        let one_m_eps = one - eps;

        // Clamp for numerical stability.
        let clamp = |x: T| -> T {
            if x < eps {
                eps
            } else if x > one_m_eps {
                one_m_eps
            } else {
                x
            }
        };

        let loss_data: Vec<T> = input_data
            .iter()
            .zip(target_data.iter())
            .map(|(&x, &y)| {
                let xc = clamp(x);
                -(y * xc.ln() + (one - y) * (one - xc).ln())
            })
            .collect();

        let unreduced =
            Tensor::from_storage(TensorStorage::cpu(loss_data), input.shape().to_vec(), false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && input.requires_grad() {
            let grad_fn = Arc::new(BCEBackward {
                input: input.clone(),
                target: target.clone(),
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for BCELoss {
    /// Mean reduction.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `BCELoss`.
///
/// `grad = (-target / input + (1 - target) / (1 - input)) * grad_output`
///
/// Under mean reduction the result is additionally divided by the number of
/// input elements. `input` is clamped before it is used as a divisor.
#[derive(Debug)]
struct BCEBackward<T: Float> {
    /// Probabilities from the forward pass (the only input that receives a gradient).
    input: Tensor<T>,
    /// Targets from the forward pass.
    target: Tensor<T>,
    /// Reduction used by the forward pass.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for BCEBackward<T> {
    /// Computes the gradient with respect to `input`.
    ///
    /// The local derivative is `-y / x + (1 - y) / (1 - x)`, where `x` is the
    /// input clamped to `[eps, 1 - eps]`. `eps` is `1e-12`, widened to the
    /// machine epsilon of `T` when `1e-12` is smaller than that. Without the
    /// widening `1 - eps` rounds back to `1` (this happens for `f32`), so an
    /// input of exactly `1` would divide by zero.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "BCE backward" }`
    /// when `grad_output` lives on a CUDA device — the elementwise BCE gradient
    /// kernel has no GPU counterpart yet. Move `grad_output` to CPU explicitly
    /// to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "BCE backward" });
        }
        let input_data = self.input.data()?;
        let target_data = self.target.data()?;
        let grad_data = grad_output.data()?;
        let one = <T as One>::one();
        let n = T::from(input_data.len()).unwrap();

        // Use the nominal 1e-12 where the type can resolve it next to 1.0;
        // otherwise fall back to the machine epsilon so that `1 - eps < 1`.
        let nominal_eps = T::from(1e-12).unwrap();
        let machine_eps = T::epsilon();
        let eps = if nominal_eps > machine_eps {
            nominal_eps
        } else {
            machine_eps
        };
        let one_m_eps = one - eps;

        // d(loss_i)/d(input_i) with the input clamped away from 0 and 1.
        let local_grad = |x: T, y: T| -> T {
            let xc = if x < eps {
                eps
            } else if x > one_m_eps {
                one_m_eps
            } else {
                x
            };
            -y / xc + (one - y) / (one - xc)
        };

        let result: Vec<T> = match self.reduction {
            Reduction::Mean => {
                let go = grad_data[0];
                input_data
                    .iter()
                    .zip(target_data.iter())
                    .map(|(&x, &y)| local_grad(x, y) * go / n)
                    .collect()
            }
            Reduction::Sum => {
                let go = grad_data[0];
                input_data
                    .iter()
                    .zip(target_data.iter())
                    .map(|(&x, &y)| local_grad(x, y) * go)
                    .collect()
            }
            Reduction::None => input_data
                .iter()
                .zip(target_data.iter())
                .zip(grad_data.iter())
                .map(|((&x, &y), &g)| local_grad(x, y) * g)
                .collect(),
        };

        let grad_input = Tensor::from_storage(
            TensorStorage::cpu(result),
            self.input.shape().to_vec(),
            false,
        )?;
        Ok(vec![Some(grad_input)])
    }

    /// The only tracked input is `input`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Node name reported for this backward function.
    fn name(&self) -> &'static str {
        "BCEBackward"
    }
}

// ===========================================================================
// TripletMarginLoss
// ===========================================================================

/// Triplet margin loss for metric learning.
///
/// ```text
/// loss = max(0, d(anchor, positive) - d(anchor, negative) + margin)
/// ```
///
/// where `d(x, y) = ||x - y||_p` is the Lp distance. Default `p = 2`.
///
/// Matches `torch.nn.TripletMarginLoss`.
///
/// `#[non_exhaustive]`: construct via [`TripletMarginLoss::new`] /
/// [`TripletMarginLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TripletMarginLoss {
    /// Reduction applied to the per-triplet losses.
    pub reduction: Reduction,
    /// Margin added to the distance difference before it is floored at zero.
    pub margin: f64,
    /// Exponent of the Lp distance.
    pub p: f64,
}

impl TripletMarginLoss {
    /// Creates a triplet margin loss with the given `reduction` mode, `margin`,
    /// and distance exponent `p`.
    pub fn new(reduction: Reduction, margin: f64, p: f64) -> Self {
        Self {
            reduction,
            margin,
            p,
        }
    }

    /// Compute triplet margin loss.
    ///
    /// - `anchor`: tensor of shape `[B, D]`.
    /// - `positive`: tensor of shape `[B, D]`.
    /// - `negative`: tensor of shape `[B, D]`.
    ///
    /// Per-triplet losses are computed on the host and combined with
    /// `apply_reduction`. A `TripletMarginBackward` node is attached only when
    /// gradient tracking is enabled and `anchor` requires gradients.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::ShapeMismatch` if the three shapes are not identical.
    /// - `FerrotorchError::InvalidArgument` if the inputs are not 2-D.
    /// - Any error propagated from tensor reads, the reduction, or construction.
    ///
    /// # Panics
    ///
    /// Panics if `margin`, `p`, or `1 / p` cannot be converted to `T`.
    pub fn forward<T: Float>(
        &self,
        anchor: &Tensor<T>,
        positive: &Tensor<T>,
        negative: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        let shapes_agree =
            anchor.shape() == positive.shape() && anchor.shape() == negative.shape();
        if !shapes_agree {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "TripletMarginLoss: shape mismatch: anchor {:?}, positive {:?}, negative {:?}",
                    anchor.shape(),
                    positive.shape(),
                    negative.shape()
                ),
            });
        }

        let dims = anchor.shape();
        if dims.len() != 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "TripletMarginLoss: expected 2D input [B, D], got shape {:?}",
                    dims
                ),
            });
        }
        let (batch, feat) = (dims[0], dims[1]);

        let anchors = anchor.data_vec()?;
        let positives = positive.data_vec()?;
        let negatives = negative.data_vec()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();
        let p_val = T::from(self.p).unwrap();
        let inv_p = T::from(1.0 / self.p).unwrap();

        let losses: Vec<T> = (0..batch)
            .map(|row| {
                let start = row * feat;
                let a_row = &anchors[start..start + feat];
                let p_row = &positives[start..start + feat];
                let n_row = &negatives[start..start + feat];

                // Sum of |a - p|^p and |a - n|^p over the feature axis.
                let mut pos_acc = zero;
                let mut neg_acc = zero;
                for ((&a, &pv), &nv) in a_row.iter().zip(p_row.iter()).zip(n_row.iter()) {
                    pos_acc += (a - pv).abs().powf(p_val);
                    neg_acc += (a - nv).abs().powf(p_val);
                }

                // Lp distances, then the hinge on their difference plus margin.
                let hinge = pos_acc.powf(inv_p) - neg_acc.powf(inv_p) + margin_t;
                if hinge > zero { hinge } else { zero }
            })
            .collect();

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        let track_grad = is_grad_enabled() && anchor.requires_grad();
        if !track_grad {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(TripletMarginBackward {
            anchor: anchor.clone(),
            positive: positive.clone(),
            negative: negative.clone(),
            margin: self.margin,
            p: self.p,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for TripletMarginLoss {
    /// Mean reduction, `margin = 1.0`, `p = 2.0`.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            margin: 1.0,
            p: 2.0,
        }
    }
}

/// Backward for `TripletMarginLoss`.
///
/// Only the anchor receives a gradient: this node reports the anchor as its
/// single input and returns one gradient tensor with the anchor's shape. The
/// positive and negative tensors are retained so the distances can be
/// recomputed during the backward pass.
///
/// With `d_pos = ||a - p||_p` and `d_neg = ||a - n||_p`, a batch row
/// contributes only when `d_pos - d_neg + margin > 0`:
/// ```text
/// grad_anchor[i] = scale * ( sign(a_i - p_i) * |a_i - p_i|^(p-1) / d_pos^(p-1)
///                          - sign(a_i - n_i) * |a_i - n_i|^(p-1) / d_neg^(p-1) )
/// ```
/// where `scale` is the upstream gradient (divided by the batch size for
/// `Reduction::Mean`). Rows with an inactive hinge keep a zero gradient.
#[derive(Debug)]
struct TripletMarginBackward<T: Float> {
    /// Anchor input; the only tensor that receives a gradient.
    anchor: Tensor<T>,
    /// Positive example, used to recompute `d_pos`.
    positive: Tensor<T>,
    /// Negative example, used to recompute `d_neg`.
    negative: Tensor<T>,
    /// Hinge margin added to `d_pos - d_neg`.
    margin: f64,
    /// Exponent of the p-norm distance.
    p: f64,
    /// Reduction applied in the forward pass; selects how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for TripletMarginBackward<T> {
    /// Computes the gradient of the triplet margin loss with respect to the
    /// anchor.
    ///
    /// For each batch row the p-norm distances to the positive and negative
    /// examples are recomputed. Rows whose hinge `d_pos - d_neg + margin` is
    /// not strictly positive contribute nothing. For active rows every anchor
    /// coordinate receives
    /// `scale * (d||a-p||_p/da_i - d||a-n||_p/da_i)`, where `scale` is the
    /// upstream gradient (divided by the batch size under `Reduction::Mean`).
    ///
    /// A coordinate whose difference is exactly zero contributes a zero
    /// (sub)gradient. Rust's `signum(0.0)` is `1.0`, which would otherwise
    /// inject a spurious `±1` for `p = 1` (and an infinity for `p < 1`).
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "TripletMargin backward" }`
    /// when `grad_output` lives on a CUDA device — the per-batch margin
    /// gradient routing has no GPU counterpart yet. Move `grad_output` to CPU
    /// explicitly to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let shape = self.anchor.shape();
        let batch = shape[0];
        let feat = shape[1];
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "TripletMargin backward",
            });
        }
        let anchor_data = self.anchor.data()?;
        let positive_data = self.positive.data()?;
        let negative_data = self.negative.data()?;
        let grad_data = grad_output.data()?;

        let zero = <T as Zero>::zero();
        let p_val = T::from(self.p).unwrap();
        let margin_t = T::from(self.margin).unwrap();
        let inv_p = T::from(1.0 / self.p).unwrap();
        let p_m1 = p_val - <T as One>::one();
        // Distances at or below this are treated as zero to avoid dividing by ~0.
        let eps = T::from(1e-12).unwrap();
        let batch_t = T::from(batch).unwrap();

        // d(||x||_p)/d(x_i) = sign(x_i) * |x_i|^(p-1) / ||x||_p^(p-1).
        // A zero coordinate is mapped to a zero subgradient explicitly because
        // `signum(0.0)` returns 1.0 rather than 0.0.
        let norm_grad = |diff: T, dist: T| -> T {
            if dist > eps && diff != zero {
                diff.signum() * diff.abs().powf(p_m1) / dist.powf(p_m1)
            } else {
                zero
            }
        };

        let mut result = vec![zero; batch * feat];

        for b in 0..batch {
            let base = b * feat;

            // Recompute both p-norm distances for this row.
            let mut dist_pos = zero;
            let mut dist_neg = zero;
            for f in 0..feat {
                let dp = (anchor_data[base + f] - positive_data[base + f]).abs();
                let dn = (anchor_data[base + f] - negative_data[base + f]).abs();
                dist_pos += dp.powf(p_val);
                dist_neg += dn.powf(p_val);
            }
            dist_pos = dist_pos.powf(inv_p);
            dist_neg = dist_neg.powf(inv_p);

            let triplet_val = dist_pos - dist_neg + margin_t;
            if triplet_val <= zero {
                // Hinge is zero — no gradient.
                continue;
            }

            let scale = match self.reduction {
                Reduction::Mean => grad_data[0] / batch_t,
                Reduction::Sum => grad_data[0],
                Reduction::None => grad_data[b],
            };

            for f in 0..feat {
                let diff_pos = anchor_data[base + f] - positive_data[base + f];
                let diff_neg = anchor_data[base + f] - negative_data[base + f];
                result[base + f] =
                    (norm_grad(diff_pos, dist_pos) - norm_grad(diff_neg, dist_neg)) * scale;
            }
        }

        let grad_input = Tensor::from_storage(TensorStorage::cpu(result), shape.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// Only the anchor participates in the autograd graph.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.anchor]
    }

    fn name(&self) -> &'static str {
        "TripletMarginBackward"
    }
}

// ===========================================================================
// MarginRankingLoss
// ===========================================================================

/// Margin ranking loss.
///
/// Given inputs `x1`, `x2` and label `y` (1 or -1):
///
/// ```text
/// loss = max(0, -y * (x1 - x2) + margin)
/// ```
///
/// With `y = 1` the loss is zero once `x1` exceeds `x2` by at least `margin`;
/// with `y = -1` the roles of `x1` and `x2` are swapped.
///
/// Matches `torch.nn.MarginRankingLoss`.
///
/// `#[non_exhaustive]`: construct via [`MarginRankingLoss::new`] /
/// [`MarginRankingLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MarginRankingLoss {
    /// How the element-wise losses are combined into the returned tensor.
    pub reduction: Reduction,
    /// Margin added inside the hinge.
    pub margin: f64,
}

impl MarginRankingLoss {
    /// Creates a margin ranking loss with the given reduction mode and margin.
    pub fn new(reduction: Reduction, margin: f64) -> Self {
        Self { reduction, margin }
    }

    /// Compute margin ranking loss.
    ///
    /// - `x1`: 1-D tensor of shape `[N]`.
    /// - `x2`: 1-D tensor of shape `[N]`.
    /// - `y`: 1-D tensor of shape `[N]` with values `1.0` or `-1.0`.
    ///
    /// The element-wise hinge `max(0, -y * (x1 - x2) + margin)` is evaluated
    /// and then reduced according to `self.reduction`. A backward node is
    /// attached only when gradients are enabled and `x1` requires grad.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` unless all three tensors have
    /// identical shapes, and propagates errors from tensor reads/creation.
    pub fn forward<T: Float>(
        &self,
        x1: &Tensor<T>,
        x2: &Tensor<T>,
        y: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        let shapes_agree = x1.shape() == x2.shape() && x1.shape() == y.shape();
        if !shapes_agree {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "MarginRankingLoss: shape mismatch: x1 {:?}, x2 {:?}, y {:?}",
                    x1.shape(),
                    x2.shape(),
                    y.shape()
                ),
            });
        }

        let lhs = x1.data_vec()?;
        let rhs = x2.data_vec()?;
        let labels = y.data_vec()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();

        // Element-wise hinge; values that are not strictly positive clamp to zero.
        let mut hinge: Vec<T> = Vec::with_capacity(lhs.len());
        for ((&a, &b), &label) in lhs.iter().zip(rhs.iter()).zip(labels.iter()) {
            let raw = -label * (a - b) + margin_t;
            hinge.push(if raw > zero { raw } else { zero });
        }

        let per_element =
            Tensor::from_storage(TensorStorage::cpu(hinge), x1.shape().to_vec(), false)?;
        let reduced = apply_reduction(&per_element, self.reduction)?;

        if !(is_grad_enabled() && x1.requires_grad()) {
            return Ok(reduced);
        }

        // Re-wrap the reduced value so it carries the backward node.
        let grad_fn = Arc::new(MarginRankingBackward {
            x1: x1.clone(),
            x2: x2.clone(),
            y: y.clone(),
            margin: self.margin,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for MarginRankingLoss {
    /// Mean reduction with a zero margin.
    fn default() -> Self {
        let reduction = Reduction::Mean;
        let margin = 0.0;
        Self::new(reduction, margin)
    }
}

/// Backward for `MarginRankingLoss`.
///
/// When `-y * (x1 - x2) + margin > 0`:
/// `grad_x1 = -y * grad_output`
/// (additionally divided by the number of elements for `Reduction::Mean`);
/// otherwise the gradient is zero.
///
/// Only `x1` is reported as an input of this node, so only `x1` receives a
/// gradient.
#[derive(Debug)]
struct MarginRankingBackward<T: Float> {
    /// First input; the tensor that receives the gradient.
    x1: Tensor<T>,
    /// Second input, needed to re-evaluate the hinge condition.
    x2: Tensor<T>,
    /// Labels (`1` or `-1`).
    y: Tensor<T>,
    /// Margin used in the forward pass.
    margin: f64,
    /// Reduction used in the forward pass; selects how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for MarginRankingBackward<T> {
    /// Gradient of the margin ranking loss with respect to `x1`.
    ///
    /// Elements whose hinge `-y * (x1 - x2) + margin` is strictly positive
    /// receive `-y` times the upstream gradient (divided by the element count
    /// under `Reduction::Mean`); all other elements receive zero.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "MarginRanking backward" }`
    /// when `grad_output` lives on a CUDA device — the pairwise margin gradient
    /// kernel has no GPU counterpart yet. Move `grad_output` to CPU explicitly
    /// to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "MarginRanking backward",
            });
        }
        let lhs = self.x1.data()?;
        let rhs = self.x2.data()?;
        let labels = self.y.data()?;
        let upstream = grad_output.data()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();
        let n = T::from(lhs.len()).unwrap();

        // True where the forward hinge was strictly positive.
        let hinge_active = |a: T, b: T, label: T| -label * (a - b) + margin_t > zero;
        // Fresh iterator over (x1, x2, y) element triples.
        let triples = || lhs.iter().zip(rhs.iter()).zip(labels.iter());

        let mut grads: Vec<T> = Vec::with_capacity(lhs.len());
        match self.reduction {
            Reduction::Mean => {
                let go = upstream[0];
                for ((&a, &b), &label) in triples() {
                    grads.push(if hinge_active(a, b, label) {
                        -label * go / n
                    } else {
                        zero
                    });
                }
            }
            Reduction::Sum => {
                let go = upstream[0];
                for ((&a, &b), &label) in triples() {
                    grads.push(if hinge_active(a, b, label) {
                        -label * go
                    } else {
                        zero
                    });
                }
            }
            Reduction::None => {
                // One upstream gradient per element.
                for (((&a, &b), &label), &g) in triples().zip(upstream.iter()) {
                    grads.push(if hinge_active(a, b, label) {
                        -label * g
                    } else {
                        zero
                    });
                }
            }
        }

        let grad_input =
            Tensor::from_storage(TensorStorage::cpu(grads), self.x1.shape().to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `x1` participates in the autograd graph.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.x1]
    }

    fn name(&self) -> &'static str {
        "MarginRankingBackward"
    }
}

// ===========================================================================
// CTCLoss
// ===========================================================================

/// Connectionist Temporal Classification loss.
///
/// Computes the CTC loss between a continuous (unsegmented) time series and a
/// target sequence. Used in speech recognition and OCR where alignment between
/// input and output is unknown.
///
/// Arguments of [`CTCLoss::forward`]:
///
/// - `log_probs`: Log-probabilities of shape `[T, B, C]` (time, batch, classes).
/// - `targets`: 1-D tensor of concatenated target sequences.
/// - `input_lengths`: Length of each input sequence in the batch.
/// - `target_lengths`: Length of each target sequence in the batch.
///
/// Configuration:
///
/// - `blank`: Index of the blank label (default 0).
///
/// Matches `torch.nn.CTCLoss`.
///
/// `#[non_exhaustive]`: construct via [`CTCLoss::new`] / [`CTCLoss::default`];
/// new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CTCLoss {
    /// How the per-sample losses are combined into the returned tensor.
    pub reduction: Reduction,
    /// Class index used as the CTC blank label.
    pub blank: usize,
    /// When `true`, per-sample losses that are `+inf` or NaN are replaced by
    /// zero before reduction.
    pub zero_infinity: bool,
}

impl CTCLoss {
    /// Creates a CTC loss.
    ///
    /// * `reduction` - How per-sample losses are combined.
    /// * `blank` - Class index used as the blank label.
    /// * `zero_infinity` - When `true`, infinite or NaN per-sample losses
    ///   (for example an input too short to be aligned with its target) are
    ///   replaced by zero.
    pub fn new(reduction: Reduction, blank: usize, zero_infinity: bool) -> Self {
        Self {
            reduction,
            blank,
            zero_infinity,
        }
    }

    /// Compute CTC loss using the forward (alpha) recursion of the
    /// forward-backward algorithm.
    ///
    /// # Arguments
    ///
    /// * `log_probs` - Log-probabilities `[T, B, C]`.
    /// * `targets` - Concatenated target labels (1-D).
    /// * `input_lengths` - Input sequence lengths `[B]`. Lengths larger than
    ///   `T` are clamped to `T`.
    /// * `target_lengths` - Target sequence lengths `[B]`.
    ///
    /// The per-sample loss is `-log P(target | input)`. A sample for which no
    /// valid alignment exists (including a zero-length input with a non-empty
    /// target) yields `+inf`, or `0` when `zero_infinity` is set. Per-sample
    /// losses are then reduced according to `self.reduction`.
    ///
    /// NOTE(review): the reduction is delegated to `apply_reduction`; whether
    /// `Mean` normalises by target length like PyTorch cannot be confirmed
    /// from this function alone — verify against `apply_reduction`.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::InvalidArgument` when
    /// - `log_probs` is not 3-D,
    /// - `input_lengths` / `target_lengths` do not have one entry per batch item,
    /// - `blank` is not a valid class index,
    /// - `targets` holds fewer labels than `target_lengths` sums to, or
    /// - a target label is not a valid class index (negative, NaN, or `>= C`).
    ///
    /// Also propagates any `FerrotorchError` from tensor reads/creation.
    pub fn forward<T: Float>(
        &self,
        log_probs: &Tensor<T>,
        targets: &Tensor<T>,
        input_lengths: &[usize],
        target_lengths: &[usize],
    ) -> FerrotorchResult<Tensor<T>> {
        let shape = log_probs.shape();
        if shape.len() != 3 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CTCLoss: expected 3D log_probs [T, B, C], got shape {:?}",
                    shape
                ),
            });
        }

        let max_t = shape[0];
        let batch = shape[1];
        let num_classes = shape[2];

        if input_lengths.len() != batch || target_lengths.len() != batch {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CTCLoss: batch size {} but input_lengths.len()={}, target_lengths.len()={}",
                    batch,
                    input_lengths.len(),
                    target_lengths.len()
                ),
            });
        }

        // An out-of-range blank would index into a neighbouring row of
        // `log_probs` (or past its end), so reject it up front.
        if self.blank >= num_classes {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CTCLoss: blank index {} is out of range for {} classes",
                    self.blank, num_classes
                ),
            });
        }

        let lp_data = log_probs.data_vec()?;
        let targets_data = targets.data_vec()?;

        let total_target_len: usize = target_lengths.iter().sum();
        if targets_data.len() < total_target_len {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "CTCLoss: targets has {} elements but target_lengths sum to {}",
                    targets_data.len(),
                    total_target_len
                ),
            });
        }

        // Finite stand-in for log(0); anything below `infeasible_below` is
        // treated as "no probability mass" (same convention as `log_add_exp`).
        let neg_inf = T::from(-1e30).unwrap();
        let infeasible_below = T::from(-1e29).unwrap();
        let zero = <T as Zero>::zero();
        let time_stride = batch * num_classes;

        let mut losses = vec![zero; batch];
        let mut target_offset = 0usize;

        for (b, loss) in losses.iter_mut().enumerate() {
            let t_len = input_lengths[b].min(max_t);
            let s_len = target_lengths[b];
            let row = b * num_classes;

            // Extract and validate this sample's targets.
            let mut tgt: Vec<usize> = Vec::with_capacity(s_len);
            for (i, &raw) in targets_data[target_offset..target_offset + s_len]
                .iter()
                .enumerate()
            {
                match raw.to_usize() {
                    Some(label) if label < num_classes => tgt.push(label),
                    _ => {
                        return Err(FerrotorchError::InvalidArgument {
                            message: format!(
                                "CTCLoss: target at position {} (sample {}) is not a valid class index in 0..{}",
                                target_offset + i,
                                b,
                                num_classes
                            ),
                        });
                    }
                }
            }
            target_offset += s_len;

            if s_len == 0 {
                // Empty target: loss = -log_prob(blank at every timestep).
                let mut log_prob_blank = zero;
                for t in 0..t_len {
                    log_prob_blank += lp_data[t * time_stride + row + self.blank];
                }
                *loss = -log_prob_blank;
                continue;
            }

            if t_len == 0 {
                // Non-empty target but no input frames: no alignment exists.
                *loss = if self.zero_infinity {
                    zero
                } else {
                    T::infinity()
                };
                continue;
            }

            // Build extended label sequence with blanks: [blank, s0, blank, s1, blank, ...]
            let ext_len = 2 * s_len + 1;
            let mut ext_labels = vec![self.blank; ext_len];
            for (i, &label) in tgt.iter().enumerate() {
                ext_labels[2 * i + 1] = label;
            }

            // Forward pass: `prev[s]` holds alpha[t-1][s], the log-prob of
            // emitting ext_labels[0..=s] in time 0..=t-1. Only the previous
            // row is needed, so two rolling rows suffice.
            let mut prev = vec![neg_inf; ext_len];
            // t = 0: a path may start on the leading blank or the first label
            // (ext_len >= 3 here because s_len >= 1).
            prev[0] = lp_data[row + ext_labels[0]];
            prev[1] = lp_data[row + ext_labels[1]];
            let mut curr = vec![neg_inf; ext_len];

            for t in 1..t_len {
                let frame = t * time_stride + row;
                for s in 0..ext_len {
                    let mut log_sum = prev[s];
                    if s >= 1 {
                        log_sum = log_add_exp(log_sum, prev[s - 1]);
                    }
                    // Skipping the intermediate blank is only allowed between
                    // two different non-blank labels.
                    if s >= 2 && ext_labels[s] != self.blank && ext_labels[s] != ext_labels[s - 2] {
                        log_sum = log_add_exp(log_sum, prev[s - 2]);
                    }
                    curr[s] = log_sum + lp_data[frame + ext_labels[s]];
                }
                std::mem::swap(&mut prev, &mut curr);
            }

            // Total log-probability: log_add_exp of the last two states.
            let log_prob = log_add_exp(prev[ext_len - 1], prev[ext_len - 2]);

            // The sentinel is finite, so an infeasible alignment shows up as
            // roughly -1e30 rather than -inf; map it to a true infinity so
            // `zero_infinity` can act on it.
            let loss_val = if log_prob < infeasible_below {
                T::infinity()
            } else {
                -log_prob
            };

            *loss = if self.zero_infinity && (loss_val == T::infinity() || loss_val.is_nan()) {
                zero
            } else {
                loss_val
            };
        }

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && log_probs.requires_grad() {
            let grad_fn = Arc::new(CTCBackward {
                log_probs: log_probs.clone(),
                targets: targets.clone(),
                input_lengths: input_lengths.to_vec(),
                target_lengths: target_lengths.to_vec(),
                blank: self.blank,
                zero_infinity: self.zero_infinity,
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for CTCLoss {
    /// Mean reduction, blank label at index 0, infinite losses left untouched.
    fn default() -> Self {
        let reduction = Reduction::Mean;
        let blank = 0;
        let zero_infinity = false;
        Self::new(reduction, blank, zero_infinity)
    }
}

/// Numerically stable `log(exp(a) + exp(b))`.
///
/// The larger operand is factored out so the exponential never overflows:
/// `hi + ln(1 + exp(lo - hi))`. Values below `-1e29` are treated as the
/// "log of zero" sentinel: when even the larger operand is below that
/// threshold it is returned unchanged.
fn log_add_exp<T: Float>(a: T, b: T) -> T {
    let (hi, lo) = if a > b { (a, b) } else { (b, a) };
    let sentinel_cutoff = T::from(-1e29).unwrap();
    if hi < sentinel_cutoff {
        // Both operands represent zero probability.
        return hi;
    }
    hi + (lo - hi).exp().ln_1p()
}

/// Backward for `CTCLoss`.
///
/// Uses the full forward-backward algorithm to compute gradients w.r.t.
/// `log_probs`. `alpha[t][s]` includes the emission at time `t`, `beta[t][s]`
/// does not, so `alpha[t][s] + beta[t][s]` is the log-probability of all
/// valid paths passing through extended state `s` at time `t`. For each
/// `(t, b, c)` inside the sample's input length:
///
/// ```text
/// grad[t, b, c] = -scale * (1/P) * sum_{s: ext[s]==c} exp(alpha[t][s] + beta[t][s])
/// ```
///
/// where `P = exp(log_prob_total)` is the total path probability and `scale`
/// is the upstream gradient (divided by the batch size for `Reduction::Mean`).
/// For an empty target the gradient is `-scale` at the blank class of every
/// timestep.
#[derive(Debug)]
struct CTCBackward<T: Float> {
    /// Log-probabilities `[T, B, C]`; the only tensor that receives a gradient.
    log_probs: Tensor<T>,
    /// Concatenated target labels.
    targets: Tensor<T>,
    /// Per-sample input lengths.
    input_lengths: Vec<usize>,
    /// Per-sample target lengths.
    target_lengths: Vec<usize>,
    /// Index of the blank label.
    blank: usize,
    /// Whether samples with an infinite/NaN loss are skipped (zero gradient).
    zero_infinity: bool,
    /// Reduction used in the forward pass; selects how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for CTCBackward<T> {
    /// Gradient of the CTC loss with respect to `log_probs`.
    ///
    /// For every sample the alpha (forward) and beta (backward) tables are
    /// recomputed and combined into per-class occupation probabilities; the
    /// gradient is their negation scaled by the upstream gradient. Timesteps
    /// beyond a sample's input length keep a zero gradient, as do samples
    /// with a non-empty target and a zero effective input length.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "CTC backward" }`
    /// when `grad_output` lives on a CUDA device — the forward/backward dynamic
    /// programming kernel for connectionist temporal classification has no GPU
    /// counterpart yet. Move `grad_output` to CPU explicitly to run this
    /// backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "CTC backward" });
        }

        let shape = self.log_probs.shape();
        let max_t = shape[0];
        let batch = shape[1];
        let num_classes = shape[2];
        let total_size = max_t * batch * num_classes;

        let lp_data = self.log_probs.data()?;
        let targets_data = self.targets.data()?;
        let grad_data = grad_output.data()?;
        // Finite stand-in for log(0); values below `threshold` count as zero mass.
        let neg_inf = T::from(-1e30).unwrap();
        let threshold = T::from(-1e29).unwrap();
        let zero = <T as Zero>::zero();

        let mut result = vec![zero; total_size];
        let mut target_offset = 0usize;

        for b in 0..batch {
            let t_len = self.input_lengths[b].min(max_t);
            let s_len = self.target_lengths[b];

            let tgt: Vec<usize> = (0..s_len)
                .map(|i| targets_data[target_offset + i].to_usize().unwrap_or(0))
                .collect();
            target_offset += s_len;

            let go_scale = match self.reduction {
                Reduction::Mean => grad_data[0] / T::from(batch).unwrap(),
                Reduction::Sum => grad_data[0],
                Reduction::None => grad_data[b],
            };

            if s_len == 0 {
                // Empty target: grad = -1 at blank for each timestep.
                for t in 0..t_len {
                    let idx = t * batch * num_classes + b * num_classes + self.blank;
                    result[idx] = -go_scale;
                }
                continue;
            }

            if t_len == 0 {
                // No input frames: there is nothing to differentiate, and the
                // alpha/beta tables below would be empty.
                continue;
            }

            let ext_len = 2 * s_len + 1;
            let mut ext_labels = vec![self.blank; ext_len];
            for i in 0..s_len {
                ext_labels[2 * i + 1] = tgt[i];
            }

            // Forward pass (alpha).
            let mut alpha = vec![vec![neg_inf; ext_len]; t_len];
            alpha[0][0] = lp_data[b * num_classes + ext_labels[0]];
            if ext_len > 1 {
                alpha[0][1] = lp_data[b * num_classes + ext_labels[1]];
            }
            for t in 1..t_len {
                for s in 0..ext_len {
                    let lp_val = lp_data[t * batch * num_classes + b * num_classes + ext_labels[s]];
                    let mut log_sum = alpha[t - 1][s];
                    if s >= 1 {
                        log_sum = log_add_exp(log_sum, alpha[t - 1][s - 1]);
                    }
                    if s >= 2 && ext_labels[s] != self.blank && ext_labels[s] != ext_labels[s - 2] {
                        log_sum = log_add_exp(log_sum, alpha[t - 1][s - 2]);
                    }
                    alpha[t][s] = log_sum + lp_val;
                }
            }

            let log_prob =
                log_add_exp(alpha[t_len - 1][ext_len - 1], alpha[t_len - 1][ext_len - 2]);

            // An infeasible alignment shows up as the finite sentinel (about
            // -1e30), not as a true -inf, so compare against the threshold.
            if self.zero_infinity && (log_prob < threshold || log_prob.is_nan()) {
                continue;
            }

            // Backward pass (beta).
            // beta[t][s] = log P(observing labels l'[s..] from time t onward),
            // where alpha INCLUDES the emission at time t but beta does NOT.
            // This avoids double-counting: alpha[t][s] + beta[t][s] =
            // log P(path passes through state s at time t).
            //
            // Initialization: beta[T-1][s] = 0 for valid ending states.
            // Recurrence: beta[t][s] = log_add_exp over successors s' of
            //   (y_{t+1}^{l'[s']} + beta[t+1][s']).
            let mut beta = vec![vec![neg_inf; ext_len]; t_len];
            beta[t_len - 1][ext_len - 1] = zero;
            if ext_len > 1 {
                beta[t_len - 1][ext_len - 2] = zero;
            }
            for t in (0..t_len.saturating_sub(1)).rev() {
                for s in (0..ext_len).rev() {
                    // Successor s (same state): y_{t+1}^{l'[s]} + beta[t+1][s]
                    let lp_s =
                        lp_data[(t + 1) * batch * num_classes + b * num_classes + ext_labels[s]];
                    let mut log_sum = lp_s + beta[t + 1][s];
                    if s + 1 < ext_len {
                        let lp_s1 = lp_data
                            [(t + 1) * batch * num_classes + b * num_classes + ext_labels[s + 1]];
                        log_sum = log_add_exp(log_sum, lp_s1 + beta[t + 1][s + 1]);
                    }
                    if s + 2 < ext_len
                        && ext_labels[s] != self.blank
                        && ext_labels[s] != ext_labels[s + 2]
                    {
                        let lp_s2 = lp_data
                            [(t + 1) * batch * num_classes + b * num_classes + ext_labels[s + 2]];
                        log_sum = log_add_exp(log_sum, lp_s2 + beta[t + 1][s + 2]);
                    }
                    beta[t][s] = log_sum;
                }
            }

            // Accumulate gradients.
            // alpha[t][s] includes emission, beta[t][s] does not, so
            // alpha[t][s] + beta[t][s] = log P(all paths through state s at time t).
            //
            // d(-log P)/d(log_probs[t,b,c]) = -(1/P) * dP/d(log_probs[t,b,c])
            // Since y_t^c = exp(log_probs[t,b,c]):
            //   dP/d(log_probs[t,b,c]) = sum_{s:l'[s]=c} exp(alpha[t][s] + beta[t][s])
            //
            // So: grad[t,b,c] = -exp(log_ab_per_class[c] - log_prob)
            for t in 0..t_len {
                let mut log_ab_per_class = vec![neg_inf; num_classes];
                for s in 0..ext_len {
                    let c = ext_labels[s];
                    let ab = alpha[t][s] + beta[t][s];
                    log_ab_per_class[c] = log_add_exp(log_ab_per_class[c], ab);
                }

                let base_idx = t * batch * num_classes + b * num_classes;
                for (c, &log_ab) in log_ab_per_class.iter().enumerate() {
                    let occupation = if log_ab > threshold {
                        (log_ab - log_prob).exp()
                    } else {
                        zero
                    };
                    result[base_idx + c] = -occupation * go_scale;
                }
            }
        }

        let grad_input = Tensor::from_storage(TensorStorage::cpu(result), shape.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `log_probs` participates in the autograd graph.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.log_probs]
    }

    fn name(&self) -> &'static str {
        "CTCBackward"
    }
}

// ===========================================================================
// PoissonNLLLoss
// ===========================================================================

/// Negative log-likelihood loss with Poisson distribution.
///
/// Target is expected to be a count (non-negative integer or float).
///
/// ```text
/// loss = exp(input) - target * input      (if log_input=true, the default)
/// loss = input - target * log(input+eps)  (if log_input=false)
/// ```
///
/// Matches `torch.nn.PoissonNLLLoss`.
///
/// `#[non_exhaustive]`: construct via [`PoissonNLLLoss::new`] /
/// [`PoissonNLLLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct PoissonNLLLoss {
    /// How the element-wise losses are combined into the returned tensor.
    pub reduction: Reduction,
    /// `true` when `input` holds the log of the Poisson rate; `false` when it
    /// holds the rate itself.
    pub log_input: bool,
    /// Small constant added to `input` before taking the logarithm; only used
    /// when `log_input` is `false`.
    pub eps: f64,
}

impl PoissonNLLLoss {
    /// Creates a Poisson NLL loss with the given reduction, input
    /// interpretation (`log_input`) and logarithm stabiliser (`eps`).
    pub fn new(reduction: Reduction, log_input: bool, eps: f64) -> Self {
        Self {
            reduction,
            log_input,
            eps,
        }
    }

    /// Compute Poisson NLL loss.
    ///
    /// Evaluates the element-wise loss (`exp(x) - y * x` for log-space input,
    /// `x - y * ln(x + eps)` otherwise) and reduces it according to
    /// `self.reduction`. A backward node is attached when gradients are
    /// enabled and `input` requires grad.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` when `input` and `target`
    /// differ in shape, and propagates errors from tensor reads/creation.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        if input.shape() != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "PoissonNLLLoss: input shape {:?} != target shape {:?}",
                    input.shape(),
                    target.shape()
                ),
            });
        }

        let predictions = input.data_vec()?;
        let counts = target.data_vec()?;
        let eps_t = T::from(self.eps).unwrap();

        // The formula is chosen once; the per-element closures stay branch-free.
        let pairs = predictions.iter().zip(counts.iter());
        let loss_data: Vec<T> = if self.log_input {
            // loss = exp(x) - y * x
            pairs.map(|(&x, &y)| x.exp() - y * x).collect()
        } else {
            // loss = x - y * log(x + eps)
            pairs.map(|(&x, &y)| x - y * (x + eps_t).ln()).collect()
        };

        let per_element =
            Tensor::from_storage(TensorStorage::cpu(loss_data), input.shape().to_vec(), false)?;
        let reduced = apply_reduction(&per_element, self.reduction)?;

        if !(is_grad_enabled() && input.requires_grad()) {
            return Ok(reduced);
        }

        // Re-wrap the reduced value so it carries the backward node.
        let grad_fn = Arc::new(PoissonNLLBackward {
            input: input.clone(),
            target: target.clone(),
            log_input: self.log_input,
            eps: self.eps,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for PoissonNLLLoss {
    /// Mean reduction, log-space input, `eps = 1e-8`.
    fn default() -> Self {
        let reduction = Reduction::Mean;
        let log_input = true;
        let eps = 1e-8;
        Self::new(reduction, log_input, eps)
    }
}

/// Backward for `PoissonNLLLoss`.
///
/// ```text
/// log_input=true:  grad = (exp(input) - target) * grad_output
/// log_input=false: grad = (1 - target / (input + eps)) * grad_output
/// ```
///
/// For `Reduction::Mean` the result is additionally divided by the number of
/// elements of `input`. Only `input` receives a gradient.
#[derive(Debug)]
struct PoissonNLLBackward<T: Float> {
    /// Forward input; the tensor that receives the gradient.
    input: Tensor<T>,
    /// Target counts.
    target: Tensor<T>,
    /// Whether `input` was interpreted as a log-rate in the forward pass.
    log_input: bool,
    /// Stabiliser added to `input` in the `log_input = false` formula.
    eps: f64,
    /// Reduction used in the forward pass.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for PoissonNLLBackward<T> {
    /// Gradient of the Poisson NLL loss with respect to `input`.
    ///
    /// Built from tensor operations executed under `no_grad`, with scalar
    /// constants moved to the device of `input`. The local derivative is
    /// multiplied by `grad_output` and, for `Reduction::Mean`, divided by the
    /// number of input elements.
    ///
    /// # Errors
    ///
    /// Propagates any `FerrotorchError` raised by the tensor operations.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use ferrotorch_core::autograd::no_grad::no_grad;
        use ferrotorch_core::grad_fns::arithmetic::{add, div, mul, sub};
        use ferrotorch_core::grad_fns::transcendental::exp;

        let device = self.input.device();
        let grad_input = no_grad(|| {
            // Local derivative of the element-wise loss.
            let d_loss = if self.log_input {
                // exp(input) - target
                sub(&exp(&self.input)?, &self.target)?
            } else {
                // 1 - target / (input + eps)
                let eps =
                    ferrotorch_core::creation::scalar(T::from(self.eps).unwrap())?.to(device)?;
                let one = ferrotorch_core::creation::scalar(<T as One>::one())?.to(device)?;
                let shifted = add(&self.input, &eps)?;
                let ratio = div(&self.target, &shifted)?;
                sub(&one, &ratio)?
            };

            let chained = mul(&d_loss, grad_output)?;
            if matches!(self.reduction, Reduction::Mean) {
                // Mean reduction spreads the upstream gradient over all elements.
                let count = self.input.shape().iter().product::<usize>();
                let n = ferrotorch_core::creation::scalar(T::from(count).unwrap())?.to(device)?;
                div(&chained, &n)
            } else {
                Ok(chained)
            }
        })?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `input` participates in the autograd graph.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    fn name(&self) -> &'static str {
        "PoissonNLLBackward"
    }
}

// ===========================================================================
// MultiMarginLoss
// ===========================================================================

/// Multi-class margin loss (hinge loss for classification).
///
/// For each sample with true class `y`:
///
/// ```text
/// loss = (1/C) * sum_{j != y} max(0, margin - x[y] + x[j])^p
/// ```
///
/// where `p` is 1 or 2 (default 1).
///
/// Matches `torch.nn.MultiMarginLoss`.
///
/// `#[non_exhaustive]`: construct via [`MultiMarginLoss::new`] /
/// [`MultiMarginLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MultiMarginLoss {
    /// How the per-sample losses are combined into the returned tensor.
    pub reduction: Reduction,
    /// Hinge exponent. The forward pass squares the hinge when `p == 2` and
    /// uses it unchanged for any other value.
    pub p: usize,
    /// Margin required between the true-class score and every other score.
    pub margin: f64,
}

impl MultiMarginLoss {
    /// Creates a multi-class margin loss.
    ///
    /// * `reduction` - How per-sample losses are combined.
    /// * `p` - Hinge exponent; `2` squares each hinge term, any other value
    ///   leaves it linear.
    /// * `margin` - Required gap between the true-class score and the others.
    pub fn new(reduction: Reduction, p: usize, margin: f64) -> Self {
        Self {
            reduction,
            p,
            margin,
        }
    }

    /// Compute multi-margin loss.
    ///
    /// - `input`: `[B, C]` (scores for each class).
    /// - `target`: `[B]` (class indices, stored as floats).
    ///
    /// Each sample's loss is the sum over the non-target classes of the
    /// positive hinge terms `margin - x[y] + x[j]` (squared when `p == 2`),
    /// divided by the number of classes. Per-sample losses are then reduced
    /// according to `self.reduction`.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::InvalidArgument` when `input` is not 2-D, or when a
    ///   target value is not a valid class index (negative, NaN, or `>= C`).
    /// - `FerrotorchError::ShapeMismatch` when `target` is not of shape `[B]`.
    ///
    /// Also propagates any `FerrotorchError` from tensor reads/creation.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        let shape = input.shape();
        if shape.len() != 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "MultiMarginLoss: expected 2D input [B, C], got shape {:?}",
                    shape
                ),
            });
        }

        let batch = shape[0];
        let classes = shape[1];

        if target.shape() != [batch] {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "MultiMarginLoss: target shape {:?} does not match batch size {}",
                    target.shape(),
                    batch,
                ),
            });
        }

        let input_data = input.data_vec()?;
        let target_data = target.data_vec()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();
        let inv_c = T::from(1.0).unwrap() / T::from(classes).unwrap();
        let squared = self.p == 2;

        let mut losses = vec![zero; batch];

        for (b, loss) in losses.iter_mut().enumerate() {
            let base = b * classes;

            // Reject invalid class indices instead of silently reading the
            // wrong score (or panicking on an out-of-bounds access).
            let y = match target_data[b].to_usize() {
                Some(idx) if idx < classes => idx,
                _ => {
                    return Err(FerrotorchError::InvalidArgument {
                        message: format!(
                            "MultiMarginLoss: target at index {} is not a valid class index in 0..{}",
                            b, classes
                        ),
                    });
                }
            };
            let x_y = input_data[base + y];
            let mut sample_loss = zero;

            for j in 0..classes {
                if j == y {
                    continue;
                }
                let val = margin_t - x_y + input_data[base + j];
                if val > zero {
                    sample_loss += if squared { val * val } else { val };
                }
            }

            *loss = sample_loss * inv_c;
        }

        let unreduced = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&unreduced, self.reduction)?;

        if is_grad_enabled() && input.requires_grad() {
            let grad_fn = Arc::new(MultiMarginBackward {
                input: input.clone(),
                target: target.clone(),
                p: self.p,
                margin: self.margin,
                reduction: self.reduction,
            });
            Tensor::from_operation(
                TensorStorage::cpu(reduced.data_vec()?),
                reduced.shape().to_vec(),
                grad_fn,
            )
        } else {
            Ok(reduced)
        }
    }
}

impl Default for MultiMarginLoss {
    /// Default configuration: mean reduction with the integer argument `1` and
    /// the float argument `1.0` (presumably `p` and `margin`, in that order —
    /// confirm against `MultiMarginLoss::new`).
    fn default() -> Self {
        let p = 1;
        let margin = 1.0;
        Self::new(Reduction::Mean, p, margin)
    }
}

/// Backward for `MultiMarginLoss`.
///
/// Keeps the forward operands and hyper-parameters so the per-class hinge
/// terms can be recomputed when the gradient is requested.
#[derive(Debug)]
struct MultiMarginBackward<T: Float> {
    /// Input scores; `backward` reads `shape()[0]` as the batch size and
    /// `shape()[1]` as the class count.
    input: Tensor<T>,
    /// Per-sample target class, stored as floats and converted with `to_usize`.
    target: Tensor<T>,
    /// Hinge exponent: `2` selects the squared hinge, any other value the
    /// linear hinge.
    p: usize,
    /// Margin added to `x_j - x_y` before the hinge is applied.
    margin: f64,
    /// Reduction used by the forward pass; decides how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for MultiMarginBackward<T> {
    /// Computes the gradient of the multi-margin loss with respect to `input`.
    ///
    /// For every sample `b` with target class `y` and every class `j != y`
    /// whose hinge term `margin - x_y + x_j` is strictly positive:
    ///
    /// ```text
    /// g_j        = 2 * (margin - x_y + x_j)   if p == 2, else 1
    /// d/d(x_j)   =  g_j / C * scale
    /// d/d(x_y)  -=  g_j / C * scale
    /// ```
    ///
    /// where `scale` is `grad_output[0] / B` for `Mean`, `grad_output[0]` for
    /// `Sum` and `grad_output[b]` for `None`.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "MultiMargin backward" }`
    /// when `grad_output` lives on a CUDA device — the per-class hinge gradient
    /// kernel has no GPU counterpart yet. Move `grad_output` to CPU explicitly
    /// to run this backward.
    ///
    /// Returns `FerrotorchError::InvalidArgument` when a target class index is
    /// `>= C`. Without this check the index `b * C + y` would address another
    /// sample's row (or run past the end of the buffer for the last sample).
    ///
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let shape = self.input.shape();
        let batch = shape[0];
        let classes = shape[1];

        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "MultiMargin backward",
            });
        }
        let input_data = self.input.data()?;
        let target_data = self.target.data()?;
        let grad_data = grad_output.data()?;

        let zero = <T as Zero>::zero();
        let one = <T as One>::one();
        let two = T::from(2.0).unwrap();
        let margin_t = T::from(self.margin).unwrap();
        let inv_c = one / T::from(classes).unwrap();

        let mut result = vec![zero; batch * classes];

        for b in 0..batch {
            let base = b * classes;
            // Same float -> index conversion as the forward pass: values that
            // cannot be represented as `usize` fall back to class 0.
            let y = target_data[b].to_usize().unwrap_or(0);
            if y >= classes {
                // An out-of-range class would make `base + y` point into a
                // different sample's row; report it instead of corrupting the
                // gradient or panicking on the last row.
                return Err(FerrotorchError::InvalidArgument {
                    message: format!(
                        "MultiMarginLoss: target class {} at batch index {} is out of range for {} classes",
                        y, b, classes
                    ),
                });
            }
            let x_y = input_data[base + y];

            let scale = match self.reduction {
                Reduction::Mean => grad_data[0] / T::from(batch).unwrap(),
                Reduction::Sum => grad_data[0],
                Reduction::None => grad_data[b],
            };

            let mut grad_y = zero;
            for j in 0..classes {
                if j == y {
                    continue;
                }
                let val = margin_t - x_y + input_data[base + j];
                if val > zero {
                    // d/d(x_j) of max(0, margin - x_y + x_j)^p
                    let g_j = if self.p == 2 { two * val } else { one };
                    let contribution = g_j * inv_c * scale;
                    result[base + j] = contribution;
                    // d/d(x_y) accumulates -g_j
                    grad_y = grad_y - contribution;
                }
            }
            result[base + y] += grad_y;
        }

        let grad_input = Tensor::from_storage(TensorStorage::cpu(result), shape.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// The only differentiable operand is `input`; `target` receives no gradient.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    fn name(&self) -> &'static str {
        "MultiMarginBackward"
    }
}

// ===========================================================================
// MultiLabelSoftMarginLoss
// ===========================================================================

/// Multi-label one-versus-all loss based on max-entropy.
///
/// For each element:
///
/// ```text
/// loss = -(target * log(sigma(input)) + (1 - target) * log(1 - sigma(input)))
/// ```
///
/// This is equivalent to `BCEWithLogitsLoss` applied independently per label,
/// then averaged over the class dimension (sum divided by `C`) and reduced
/// over the batch.
///
/// Matches `torch.nn.MultiLabelSoftMarginLoss`.
///
/// `#[non_exhaustive]`: construct via [`MultiLabelSoftMarginLoss::new`] /
/// [`MultiLabelSoftMarginLoss::default`]; new fields may be added in minor
/// releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MultiLabelSoftMarginLoss {
    /// How the per-sample losses (one per batch row) are combined.
    pub reduction: Reduction,
}

impl MultiLabelSoftMarginLoss {
    /// Creates the loss with the given batch `reduction`.
    pub fn new(reduction: Reduction) -> Self {
        Self { reduction }
    }

    /// Compute multi-label soft margin loss.
    ///
    /// - `input`: `[B, C]` (raw logits).
    /// - `target`: `[B, C]` (binary labels, 0 or 1).
    ///
    /// Each sample's loss is the mean over its `C` classes of the
    /// BCE-with-logits term; the `[B]` vector of sample losses is then passed
    /// through `apply_reduction`.
    ///
    /// # Errors
    ///
    /// - `FerrotorchError::InvalidArgument` when `input` is not 2-D.
    /// - `FerrotorchError::ShapeMismatch` when `input` and `target` shapes differ.
    /// - Any error raised while reading tensor data or building the result.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        let dims = input.shape();
        if dims.len() != 2 {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "MultiLabelSoftMarginLoss: expected 2D input [B, C], got shape {:?}",
                    dims
                ),
            });
        }

        if dims != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "MultiLabelSoftMarginLoss: input shape {:?} != target shape {:?}",
                    dims,
                    target.shape()
                ),
            });
        }

        let (batch, classes) = (dims[0], dims[1]);
        let logits = input.data_vec()?;
        let labels = target.data_vec()?;
        let zero = <T as Zero>::zero();
        let one = <T as One>::one();
        let inv_c = one / T::from(classes).unwrap();

        // One entry per sample: class-wise BCE-with-logits terms accumulated
        // left to right, then scaled by 1/C.
        let losses: Vec<T> = (0..batch)
            .map(|b| {
                let row = b * classes..(b + 1) * classes;
                let total = logits[row.clone()]
                    .iter()
                    .zip(&labels[row])
                    .fold(zero, |acc, (&x, &y)| {
                        // Stable form: max(x, 0) - x*y + log(1 + exp(-|x|)).
                        let (relu_x, abs_x) = if x > zero { (x, x) } else { (zero, -x) };
                        acc + (relu_x - x * y + (one + (-abs_x).exp()).ln())
                    });
                total * inv_c
            })
            .collect();

        let per_sample = Tensor::from_storage(TensorStorage::cpu(losses), vec![batch], false)?;
        let reduced = apply_reduction(&per_sample, self.reduction)?;

        // Without gradient tracking the reduced tensor is returned as-is.
        if !(is_grad_enabled() && input.requires_grad()) {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(MultiLabelSoftMarginBackward {
            input: input.clone(),
            target: target.clone(),
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for MultiLabelSoftMarginLoss {
    /// Default configuration: mean reduction over the batch.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
        }
    }
}

/// Backward for `MultiLabelSoftMarginLoss`.
///
/// `grad = (sigmoid(input) - target) / C * scale`
///
/// `C` is `input.shape()[1]` and `scale` is derived from `grad_output`
/// according to `reduction`.
#[derive(Debug)]
struct MultiLabelSoftMarginBackward<T: Float> {
    /// Raw logits from the forward pass; indexed as `[B, C]` in `backward`.
    input: Tensor<T>,
    /// Labels from the forward pass, read element-for-element alongside `input`.
    target: Tensor<T>,
    /// Reduction used by the forward pass; decides how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for MultiLabelSoftMarginBackward<T> {
    /// Produces `(sigmoid(x) - y) / C * scale` for every logit, where `scale`
    /// is `grad_output[0] / B` (`Mean`), `grad_output[0]` (`Sum`) or
    /// `grad_output[b]` (`None`).
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "MultiLabelSoftMargin backward" }`
    /// when `grad_output` lives on a CUDA device — the elementwise sigmoid-based
    /// gradient kernel has no GPU counterpart yet. Move `grad_output` to CPU
    /// explicitly to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let dims = self.input.shape();
        let (batch, classes) = (dims[0], dims[1]);

        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "MultiLabelSoftMargin backward",
            });
        }
        let logits = self.input.data()?;
        let labels = self.target.data()?;
        let upstream = grad_output.data()?;
        let one = <T as One>::one();
        let inv_c = one / T::from(classes).unwrap();

        let mut grads = vec![<T as Zero>::zero(); batch * classes];

        for b in 0..batch {
            // Upstream factor for this sample, per the forward reduction.
            let scale = match self.reduction {
                Reduction::Mean => upstream[0] / T::from(batch).unwrap(),
                Reduction::Sum => upstream[0],
                Reduction::None => upstream[b],
            };

            let row = b * classes..(b + 1) * classes;
            let cells = grads[row.clone()]
                .iter_mut()
                .zip(&logits[row.clone()])
                .zip(&labels[row]);
            for ((cell, &x), &y) in cells {
                let sig = one / (one + (-x).exp());
                *cell = (sig - y) * inv_c * scale;
            }
        }

        let grad_input = Tensor::from_storage(TensorStorage::cpu(grads), dims.to_vec(), false)?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `input` receives a gradient.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    fn name(&self) -> &'static str {
        "MultiLabelSoftMarginBackward"
    }
}

// ===========================================================================
// HingeEmbeddingLoss
// ===========================================================================

/// Hinge embedding loss for learning non-linear embeddings or semi-supervised
/// learning.
///
/// ```text
/// loss = x                    if y == 1
/// loss = max(0, margin - x)   if y == -1
/// ```
///
/// The implementation branches on the sign of `y`: any label `> 0` takes the
/// first case and every other label takes the second.
///
/// Matches `torch.nn.HingeEmbeddingLoss`.
///
/// `#[non_exhaustive]`: construct via [`HingeEmbeddingLoss::new`] /
/// [`HingeEmbeddingLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct HingeEmbeddingLoss {
    /// How the element-wise losses are combined.
    pub reduction: Reduction,
    /// Margin used in the negative-label branch, `max(0, margin - x)`.
    pub margin: f64,
}

impl HingeEmbeddingLoss {
    /// Creates the loss with the given `reduction` and `margin`.
    pub fn new(reduction: Reduction, margin: f64) -> Self {
        Self { reduction, margin }
    }

    /// Compute hinge embedding loss.
    ///
    /// - `input`: distance or similarity values.
    /// - `y`: labels, `1.0` for positive, `-1.0` for negative.
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` when `input` and `y` have
    /// different shapes, and propagates any error from tensor reads or
    /// construction.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        y: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        if input.shape() != y.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "HingeEmbeddingLoss: input shape {:?} != y shape {:?}",
                    input.shape(),
                    y.shape()
                ),
            });
        }

        let xs = input.data_vec()?;
        let labels = y.data_vec()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();

        let mut loss_data: Vec<T> = Vec::with_capacity(xs.len());
        for (&x, &label) in xs.iter().zip(labels.iter()) {
            let term = if label > zero {
                // Positive pair: the loss is the input itself.
                x
            } else {
                // Negative pair: hinge on the remaining margin.
                let slack = margin_t - x;
                if slack > zero { slack } else { zero }
            };
            loss_data.push(term);
        }

        let elementwise =
            Tensor::from_storage(TensorStorage::cpu(loss_data), input.shape().to_vec(), false)?;
        let reduced = apply_reduction(&elementwise, self.reduction)?;

        // Without gradient tracking the reduced tensor is returned as-is.
        if !(is_grad_enabled() && input.requires_grad()) {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(HingeEmbeddingBackward {
            input: input.clone(),
            y: y.clone(),
            margin: self.margin,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for HingeEmbeddingLoss {
    /// Default configuration: mean reduction with a margin of `1.0`.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            margin: 1.0,
        }
    }
}

/// Backward for `HingeEmbeddingLoss`.
///
/// ```text
/// y == 1:   grad = 1 * grad_output
/// y == -1:  grad = -1 * grad_output  if margin - x > 0, else 0
/// ```
///
/// As in the forward pass, the positive case is selected by `y > 0` and the
/// negative case by everything else. For `Reduction::Mean` the upstream
/// gradient is additionally divided by the number of input elements.
#[derive(Debug)]
struct HingeEmbeddingBackward<T: Float> {
    /// Forward input values `x`.
    input: Tensor<T>,
    /// Labels paired element-wise with `input`.
    y: Tensor<T>,
    /// Margin of the negative-label hinge.
    margin: f64,
    /// Reduction used by the forward pass; decides how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for HingeEmbeddingBackward<T> {
    /// Routes the upstream gradient through the hinge: `+g` for positive
    /// labels, `-g` for negative labels whose margin is still violated, `0`
    /// otherwise. `g` is `grad_output[0] / N` (`Mean`), `grad_output[0]`
    /// (`Sum`) or the matching element of `grad_output` (`None`).
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "HingeEmbedding backward" }`
    /// when `grad_output` lives on a CUDA device — the hinge gradient kernel has
    /// no GPU counterpart yet. Move `grad_output` to CPU explicitly to run this
    /// backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "HingeEmbedding backward",
            });
        }
        let xs = self.input.data()?;
        let labels = self.y.data()?;
        let upstream = grad_output.data()?;
        let zero = <T as Zero>::zero();
        let margin_t = T::from(self.margin).unwrap();
        let n = T::from(xs.len()).unwrap();

        // Local derivative rule shared by every reduction mode; `g` is the
        // already-scaled upstream gradient for the element.
        let local = |x: T, label: T, g: T| -> T {
            if label > zero {
                g
            } else if margin_t - x > zero {
                -g
            } else {
                zero
            }
        };

        let pairs = xs.iter().zip(labels.iter());
        let result: Vec<T> = match self.reduction {
            Reduction::Mean => {
                let g = upstream[0] / n;
                pairs.map(|(&x, &label)| local(x, label, g)).collect()
            }
            Reduction::Sum => {
                let g = upstream[0];
                pairs.map(|(&x, &label)| local(x, label, g)).collect()
            }
            Reduction::None => pairs
                .zip(upstream.iter())
                .map(|((&x, &label), &g)| local(x, label, g))
                .collect(),
        };

        let grad_input = Tensor::from_storage(
            TensorStorage::cpu(result),
            self.input.shape().to_vec(),
            false,
        )?;
        Ok(vec![Some(grad_input)])
    }

    /// Only `input` receives a gradient; the labels `y` do not.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    fn name(&self) -> &'static str {
        "HingeEmbeddingBackward"
    }
}

// ===========================================================================
// GaussianNLLLoss
// ===========================================================================

/// Gaussian negative log-likelihood loss.
///
/// Models the target as drawn from a Gaussian with predicted mean and variance:
///
/// ```text
/// loss = 0.5 * (log(var) + (input - target)^2 / var + log(2*pi))
/// ```
///
/// The `log(2*pi)` constant is included when `full` is `true` (default `false`),
/// matching `torch.nn.GaussianNLLLoss`.
///
/// - `input`: predicted mean, any shape.
/// - `target`: observed values, same shape as `input`.
/// - `var`: predicted variance, same shape as `input` (must be positive).
///
/// The `eps` parameter clamps variance from below for numerical stability:
/// any variance smaller than `eps` is replaced by `eps` before the loss is
/// evaluated.
///
/// `#[non_exhaustive]`: construct via [`GaussianNLLLoss::new`] /
/// [`GaussianNLLLoss::default`]; new fields may be added in minor releases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct GaussianNLLLoss {
    /// How the element-wise losses are combined.
    pub reduction: Reduction,
    /// When `true`, adds the constant `0.5 * log(2*pi)` to every element.
    pub full: bool,
    /// Lower bound applied to the variance before it is used.
    pub eps: f64,
}

impl GaussianNLLLoss {
    /// Creates the loss from its reduction mode, the `full` flag and the
    /// variance floor `eps`.
    pub fn new(reduction: Reduction, full: bool, eps: f64) -> Self {
        Self {
            reduction,
            full,
            eps,
        }
    }

    /// Compute Gaussian NLL loss.
    ///
    /// Participates in autocast: classified as `FullPrecision` (`"gaussian_nll_loss"`).
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::ShapeMismatch` when `target` or `var` does not
    /// have the same shape as `input`, and propagates any error from tensor
    /// reads or construction.
    pub fn forward<T: Float>(
        &self,
        input: &Tensor<T>,
        target: &Tensor<T>,
        var: &Tensor<T>,
    ) -> FerrotorchResult<Tensor<T>> {
        autocast_guard("gaussian_nll_loss");

        let dims = input.shape();

        if dims != target.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "GaussianNLLLoss: input shape {:?} != target shape {:?}",
                    dims,
                    target.shape()
                ),
            });
        }

        if dims != var.shape() {
            return Err(FerrotorchError::ShapeMismatch {
                message: format!(
                    "GaussianNLLLoss: input shape {:?} != var shape {:?}",
                    dims,
                    var.shape()
                ),
            });
        }

        let means = input.data_vec()?;
        let observed = target.data_vec()?;
        let variances = var.data_vec()?;
        let half = T::from(0.5).unwrap();
        let eps_t = T::from(self.eps).unwrap();
        let log_2pi = T::from((2.0 * std::f64::consts::PI).ln()).unwrap();

        let mut loss_data: Vec<T> = Vec::with_capacity(means.len());
        let triples = means.iter().zip(observed.iter()).zip(variances.iter());
        for ((&mean, &obs), &raw_var) in triples {
            // Floor the variance at eps so log and division stay finite.
            let v = if raw_var < eps_t { eps_t } else { raw_var };
            let residual = mean - obs;
            let core = half * (v.ln() + residual * residual / v);
            loss_data.push(if self.full {
                core + half * log_2pi
            } else {
                core
            });
        }

        let elementwise = Tensor::from_storage(TensorStorage::cpu(loss_data), dims.to_vec(), false)?;
        let reduced = apply_reduction(&elementwise, self.reduction)?;

        // A backward node is attached when either the mean or the variance is tracked.
        let tracked = is_grad_enabled() && (input.requires_grad() || var.requires_grad());
        if !tracked {
            return Ok(reduced);
        }

        let grad_fn = Arc::new(GaussianNLLBackward {
            input: input.clone(),
            target: target.clone(),
            var: var.clone(),
            eps: self.eps,
            reduction: self.reduction,
        });
        Tensor::from_operation(
            TensorStorage::cpu(reduced.data_vec()?),
            reduced.shape().to_vec(),
            grad_fn,
        )
    }
}

impl Default for GaussianNLLLoss {
    /// Default configuration: mean reduction, no `log(2*pi)` constant, and a
    /// variance floor of `1e-6`.
    fn default() -> Self {
        Self {
            reduction: Reduction::Mean,
            full: false,
            eps: 1e-6,
        }
    }
}

/// Backward for `GaussianNLLLoss`.
///
/// ```text
/// d(loss)/d(input) = (input - target) / var
/// d(loss)/d(var)   = 0.5 * (1/var - (input - target)^2 / var^2)
/// ```
///
/// `var` in both formulas is the variance after flooring at `eps`, the same
/// clamp the forward pass applies.
#[derive(Debug)]
struct GaussianNLLBackward<T: Float> {
    /// Predicted mean from the forward pass.
    input: Tensor<T>,
    /// Observed values from the forward pass.
    target: Tensor<T>,
    /// Predicted variance from the forward pass (unclamped).
    var: Tensor<T>,
    /// Variance floor re-applied inside `backward`.
    eps: f64,
    /// Reduction used by the forward pass; decides how `grad_output` is scaled.
    reduction: Reduction,
}

impl<T: Float> GradFn<T> for GaussianNLLBackward<T> {
    /// Returns two gradients, in the order reported by [`Self::inputs`]: one
    /// for the predicted mean and one for the predicted variance. Both use
    /// the variance floored at `eps` and are multiplied by the upstream scale
    /// (`grad_output[0] / N`, `grad_output[0]` or `grad_output[i]`).
    ///
    /// # Errors
    ///
    /// Returns `FerrotorchError::NotImplementedOnCuda { op: "GaussianNLL backward" }`
    /// when `grad_output` lives on a CUDA device — the per-element Gaussian NLL
    /// gradient kernel has no GPU counterpart yet. Move `grad_output` to CPU
    /// explicitly to run this backward.
    /// Also propagates any `FerrotorchError` from intermediate tensor reads.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if grad_output.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda {
                op: "GaussianNLL backward",
            });
        }

        let means = self.input.data()?;
        let observed = self.target.data()?;
        let variances = self.var.data()?;
        let upstream = grad_output.data()?;
        let n = means.len();
        let half = T::from(0.5).unwrap();
        let eps_t = T::from(self.eps).unwrap();
        let one = <T as One>::one();

        // Upstream factor for element `i`, per the forward reduction.
        let scale_at = |i: usize| -> T {
            match self.reduction {
                Reduction::Mean => upstream[0] / T::from(n).unwrap(),
                Reduction::Sum => upstream[0],
                Reduction::None => upstream[i],
            }
        };

        let mut grad_mean: Vec<T> = Vec::with_capacity(n);
        let mut grad_variance: Vec<T> = Vec::with_capacity(n);

        for i in 0..n {
            let scale = scale_at(i);

            // Same variance floor as the forward pass.
            let raw_var = variances[i];
            let v = if raw_var < eps_t { eps_t } else { raw_var };
            let residual = means[i] - observed[i];

            // d(loss)/d(input) = (input - target) / var
            grad_mean.push(residual / v * scale);

            // d(loss)/d(var) = 0.5 * (1/var - diff^2 / var^2)
            grad_variance.push(half * (one / v - residual * residual / (v * v)) * scale);
        }

        let dims = self.input.shape().to_vec();
        let grad_input_tensor =
            Tensor::from_storage(TensorStorage::cpu(grad_mean), dims.clone(), false)?;
        let grad_var_tensor =
            Tensor::from_storage(TensorStorage::cpu(grad_variance), dims, false)?;
        Ok(vec![Some(grad_input_tensor), Some(grad_var_tensor)])
    }

    /// Differentiable operands: the predicted mean, then the predicted variance.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input, &self.var]
    }

    fn name(&self) -> &'static str {
        "GaussianNLLBackward"
    }
}

// ===========================================================================
// Tests
// ===========================================================================

#[cfg(test)]
#[allow(clippy::needless_range_loop)]
mod tests {
    use super::*;
    use ferrotorch_core::autograd::graph::backward;
    use ferrotorch_core::storage::TensorStorage;

    /// Helper: builds a 1-D leaf tensor from `vals` with gradient tracking on.
    fn leaf_vec(vals: &[f64]) -> Tensor<f64> {
        let shape = vec![vals.len()];
        let storage = TensorStorage::cpu(vals.to_vec());
        Tensor::from_storage(storage, shape, true).unwrap()
    }

    /// Helper: builds a 1-D tensor from `vals` with gradient tracking off (for targets).
    fn target_vec(vals: &[f64]) -> Tensor<f64> {
        let shape = vec![vals.len()];
        let storage = TensorStorage::cpu(vals.to_vec());
        Tensor::from_storage(storage, shape, false).unwrap()
    }

    /// Helper: builds a leaf tensor of the given `shape` from `vals` with
    /// gradient tracking on.
    fn leaf_2d(vals: &[f64], shape: &[usize]) -> Tensor<f64> {
        let storage = TensorStorage::cpu(vals.to_vec());
        Tensor::from_storage(storage, shape.to_vec(), true).unwrap()
    }

    // -----------------------------------------------------------------------
    // MSELoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_mse_forward_mean() {
        // Every residual is -0.5, so each squared error is 0.25 and so is the mean.
        let criterion = MSELoss::new(Reduction::Mean);
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let out = criterion.forward(&pred, &target).unwrap();
        assert!(out.is_scalar());
        let got = out.item().unwrap();
        assert!(
            (got - 0.25).abs() < 1e-7,
            "MSE mean: expected 0.25, got {}",
            got
        );
    }

    #[test]
    fn test_mse_forward_sum() {
        // Three squared errors of 0.25 each add up to 0.75.
        let criterion = MSELoss::new(Reduction::Sum);
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let got = criterion.forward(&pred, &target).unwrap().item().unwrap();
        assert!(
            (got - 0.75).abs() < 1e-7,
            "MSE sum: expected 0.75, got {}",
            got
        );
    }

    #[test]
    fn test_mse_forward_none() {
        // Without reduction the output keeps one squared error (0.25) per element.
        let criterion = MSELoss::new(Reduction::None);
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let out = criterion.forward(&pred, &target).unwrap();
        assert_eq!(out.shape(), &[3]);
        let d = out.data().unwrap();
        for (i, &value) in d[..3].iter().enumerate() {
            assert!(
                (value - 0.25).abs() < 1e-7,
                "MSE none[{}]: expected 0.25, got {}",
                i,
                value
            );
        }
    }

    #[test]
    fn test_mse_backward_mean() {
        let criterion = MSELoss::new(Reduction::Mean);
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        // grad = 2 * (pred - target) / n = 2 * (-0.5) / 3 = -1/3
        let expected = -1.0 / 3.0;
        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for (i, &gi) in g[..3].iter().enumerate() {
            assert!(
                (gi - expected).abs() < 1e-7,
                "MSE grad[{}]: expected {}, got {}",
                i,
                expected,
                gi
            );
        }
    }

    #[test]
    fn test_mse_backward_sum() {
        let criterion = MSELoss::new(Reduction::Sum);
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        // With sum reduction there is no 1/n factor: grad = 2 * (-0.5) = -1.0
        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for (i, &gi) in g[..3].iter().enumerate() {
            assert!(
                (gi - (-1.0)).abs() < 1e-7,
                "MSE sum grad[{}]: expected -1.0, got {}",
                i,
                gi
            );
        }
    }

    #[test]
    fn test_mse_zero_loss() {
        // Identical prediction and target must give (numerically) zero loss.
        let values = [1.0, 2.0, 3.0];
        let pred = leaf_vec(&values);
        let target = target_vec(&values);
        let out = MSELoss::default().forward(&pred, &target).unwrap();
        let value = out.item().unwrap();
        assert!(value.abs() < 1e-10);
    }

    // -----------------------------------------------------------------------
    // CrossEntropyLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_cross_entropy_forward_mean() {
        // 2 samples, 3 classes: both rows are [1, 2, 3]; targets pick class 2 and class 0.
        let logits = leaf_2d(&[1.0, 2.0, 3.0, 1.0, 2.0, 3.0], &[2, 3]);
        let targets = target_vec(&[2.0, 0.0]);
        let criterion = CrossEntropyLoss::default();
        let out = criterion.forward(&logits, &targets).unwrap();

        // Stable log-softmax: shift by the row max (3), so the shifted logits
        // are [-2, -1, 0] and log_softmax[c] = logits[c] - max - log(sum_exp).
        let log_sum = ((-2.0_f64).exp() + (-1.0_f64).exp() + 1.0).ln();
        let lsm = [1.0_f64, 2.0, 3.0].map(|logit| logit - 3.0 - log_sum);
        // Sample 0 contributes -lsm[2], sample 1 contributes -lsm[0].
        let expected = (-lsm[2] + (-lsm[0])) / 2.0;

        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-6,
            "CE mean: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_cross_entropy_forward_sum() {
        let logits = leaf_2d(&[1.0, 2.0, 3.0, 1.0, 2.0, 3.0], &[2, 3]);
        let targets = target_vec(&[2.0, 0.0]);
        let criterion = CrossEntropyLoss::new(Reduction::Sum, 0.0);
        let out = criterion.forward(&logits, &targets).unwrap();

        // Same log-softmax as the mean case; the two sample losses are added
        // without dividing by the batch size.
        let log_sum = ((-2.0_f64).exp() + (-1.0_f64).exp() + 1.0).ln();
        let lsm = [1.0_f64, 2.0, 3.0].map(|logit| logit - 3.0 - log_sum);
        let expected = -lsm[2] + (-lsm[0]);

        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-6,
            "CE sum: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_cross_entropy_forward_none() {
        let logits = leaf_2d(&[1.0, 2.0, 3.0, 1.0, 2.0, 3.0], &[2, 3]);
        let targets = target_vec(&[2.0, 0.0]);
        let criterion = CrossEntropyLoss::new(Reduction::None, 0.0);
        let out = criterion.forward(&logits, &targets).unwrap();

        // One loss per sample is returned.
        assert_eq!(out.shape(), &[2]);
        let d = out.data().unwrap();

        let log_sum = ((-2.0_f64).exp() + (-1.0_f64).exp() + 1.0).ln();
        let lsm = [1.0_f64, 2.0, 3.0].map(|logit| logit - 3.0 - log_sum);

        // Sample 0 targets class 2, sample 1 targets class 0.
        let expected_first = -lsm[2];
        let expected_second = -lsm[0];
        assert!((d[0] - expected_first).abs() < 1e-6);
        assert!((d[1] - expected_second).abs() < 1e-6);
    }

    #[test]
    fn test_cross_entropy_backward_mean() {
        // A single sample keeps the gradient check simple (batch_size = 1).
        let logits = leaf_2d(&[1.0, 2.0, 3.0], &[1, 3]);
        let targets = target_vec(&[1.0]);
        let out = CrossEntropyLoss::default()
            .forward(&logits, &targets)
            .unwrap();
        backward(&out).unwrap();

        // softmax([1, 2, 3])
        let exps = [1.0_f64, 2.0, 3.0].map(f64::exp);
        let sum_exp = exps[0] + exps[1] + exps[2];
        let sm = exps.map(|e| e / sum_exp);
        // grad = (softmax - one_hot) / batch_size, with the target at class 1.
        let expected = [sm[0] - 0.0, sm[1] - 1.0, sm[2] - 0.0];

        let grad = logits.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for (i, (&got, &want)) in g[..3].iter().zip(expected.iter()).enumerate() {
            assert!(
                (got - want).abs() < 1e-6,
                "CE grad[{}]: expected {}, got {}",
                i,
                want,
                got
            );
        }
    }

    #[test]
    fn test_cross_entropy_label_smoothing() {
        let ls = 0.1;
        let logits = leaf_2d(&[1.0, 2.0, 3.0], &[1, 3]);
        let targets = target_vec(&[2.0]);
        let criterion = CrossEntropyLoss::new(Reduction::Mean, ls);
        let out = criterion.forward(&logits, &targets).unwrap();

        // Reference value: max = 3, shifted logits = [-2, -1, 0].
        let max_val = 3.0_f64;
        let log_sum = ((-2.0_f64).exp() + (-1.0_f64).exp() + 1.0).ln();
        let lsm = [1.0_f64, 2.0, 3.0].map(|logit| logit - max_val - log_sum);
        // Blend the target-class NLL with the uniform average over all classes.
        let nll = -lsm[2];
        let smooth = -(lsm[0] + lsm[1] + lsm[2]) / 3.0;
        let expected = (1.0 - ls) * nll + ls * smooth;

        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-6,
            "CE label smoothing: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_cross_entropy_large_logits_stability() {
        // Logits around 1000 would overflow a naive exp(); the result must stay finite.
        let logits = leaf_2d(&[1000.0, 1001.0, 999.0], &[1, 3]);
        let targets = target_vec(&[1.0]);
        let criterion = CrossEntropyLoss::default();
        let val = criterion.forward(&logits, &targets).unwrap().item().unwrap();
        assert!(
            val.is_finite(),
            "CE with large logits produced non-finite: {}",
            val
        );

        // After shifting by the max (1001) the logits are [-1, 0, -2], so
        // Z = e^(-1) + 1 + e^(-2) and log_softmax[1] = 0 - log(Z).
        let z = (-1.0_f64).exp() + 1.0 + (-2.0_f64).exp();
        let log_softmax_target = 1001.0 - 1001.0 - z.ln();
        let expected = -log_softmax_target; // nll for target 1
        assert!(
            (val - expected).abs() < 1e-5,
            "CE large logits: expected {}, got {}",
            expected,
            val
        );
    }

    #[test]
    fn test_cross_entropy_negative_logits_stability() {
        // Very negative logits must not underflow into a non-finite loss.
        let logits = leaf_2d(&[-1000.0, -999.0, -1001.0], &[1, 3]);
        let targets = target_vec(&[1.0]);
        let criterion = CrossEntropyLoss::default();
        let val = criterion.forward(&logits, &targets).unwrap().item().unwrap();
        assert!(
            val.is_finite(),
            "CE with large negative logits produced non-finite: {}",
            val
        );
    }

    // -----------------------------------------------------------------------
    // BCEWithLogitsLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_bce_forward_mean() {
        // Logits x = [0, 0], labels y = [1, 0]. Per element:
        //   max(0,0) - 0*y + log(1 + exp(0)) = log(2)
        // for either label, so the mean is log(2) as well.
        let logits = leaf_vec(&[0.0, 0.0]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = BCEWithLogitsLoss::new(Reduction::Mean);
        let out = criterion.forward(&logits, &targets).unwrap();

        let expected = 2.0_f64.ln();
        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-7,
            "BCE mean: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_bce_forward_sum() {
        // Two zero logits each cost ln(2); sum reduction adds them.
        let logits = leaf_vec(&[0.0, 0.0]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = BCEWithLogitsLoss::new(Reduction::Sum);
        let out = criterion.forward(&logits, &targets).unwrap();

        let expected = 2.0 * 2.0_f64.ln();
        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-7,
            "BCE sum: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_bce_forward_none() {
        // Without reduction the output keeps the input shape and every
        // element equals ln(2) for zero logits.
        let logits = leaf_vec(&[0.0, 0.0]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = BCEWithLogitsLoss::new(Reduction::None);
        let out = criterion.forward(&logits, &targets).unwrap();
        assert_eq!(out.shape(), &[2]);

        let per_elem = out.data().unwrap();
        let ln2 = 2.0_f64.ln();
        let first_err = (per_elem[0] - ln2).abs();
        let second_err = (per_elem[1] - ln2).abs();
        assert!(first_err < 1e-7);
        assert!(second_err < 1e-7);
    }

    #[test]
    fn test_bce_backward_mean() {
        // Expected gradient: (sigmoid(x) - y) / n with sigmoid(0) = 0.5, n = 2
        //   element 0: (0.5 - 1.0) / 2 = -0.25
        //   element 1: (0.5 - 0.0) / 2 =  0.25
        let logits = leaf_vec(&[0.0, 0.0]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = BCEWithLogitsLoss::new(Reduction::Mean);
        let out = criterion.forward(&logits, &targets).unwrap();
        backward(&out).unwrap();

        let grad = logits.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let (g0, g1) = (g[0], g[1]);
        assert!(
            (g0 + 0.25).abs() < 1e-7,
            "BCE grad[0]: expected -0.25, got {}",
            g0
        );
        assert!(
            (g1 - 0.25).abs() < 1e-7,
            "BCE grad[1]: expected 0.25, got {}",
            g1
        );
    }

    #[test]
    fn test_bce_numerical_stability_large_positive() {
        // A large positive logit with a positive label must not overflow and
        // must give a loss that is essentially zero.
        let logits = leaf_vec(&[100.0]);
        let targets = target_vec(&[1.0]);
        let loss = BCEWithLogitsLoss::new(Reduction::Mean);
        let out = loss.forward(&logits, &targets).unwrap();
        let val = out.item().unwrap();
        assert!(
            val.is_finite(),
            "BCE large positive logit: non-finite {}",
            val
        );
        // loss = max(100,0) - 100*1 + log(1+exp(-100)) ~ 0 + ~0 = ~0
        // Compare the magnitude: a one-sided `val < 1e-10` would also accept
        // an arbitrarily negative (i.e. wrong) loss.
        assert!(
            val.abs() < 1e-10,
            "BCE large positive logit: expected ~0, got {}",
            val
        );
    }

    #[test]
    fn test_bce_numerical_stability_large_negative() {
        // A large negative logit with a zero label must not overflow and
        // must give a loss that is essentially zero.
        let logits = leaf_vec(&[-100.0]);
        let targets = target_vec(&[0.0]);
        let loss = BCEWithLogitsLoss::new(Reduction::Mean);
        let out = loss.forward(&logits, &targets).unwrap();
        let val = out.item().unwrap();
        assert!(
            val.is_finite(),
            "BCE large negative logit: non-finite {}",
            val
        );
        // loss = max(-100,0) - (-100)*0 + log(1+exp(-100)) ~ 0 + 0 + ~0
        // Compare the magnitude: a one-sided `val < 1e-10` would also accept
        // an arbitrarily negative (i.e. wrong) loss.
        assert!(
            val.abs() < 1e-10,
            "BCE large negative logit: expected ~0, got {}",
            val
        );
    }

    // -----------------------------------------------------------------------
    // HuberLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_huber_forward_quadratic_region() {
        // |error| = 0.3 is below the default delta of 1.0, so the quadratic
        // branch applies: 0.5 * 0.3^2 = 0.045.
        let pred = leaf_vec(&[1.3]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 0.045).abs() < 1e-7,
            "Huber quadratic: expected 0.045, got {}",
            got
        );
    }

    #[test]
    fn test_huber_forward_linear_region() {
        // |error| = 2.0 is at or above delta = 1.0, so the linear branch
        // applies: 1.0 * (2.0 - 0.5) = 1.5.
        let pred = leaf_vec(&[3.0]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 1.5).abs() < 1e-7,
            "Huber linear: expected 1.5, got {}",
            got
        );
    }

    #[test]
    fn test_huber_forward_sum() {
        // One element in the quadratic region (0.045) plus one in the linear
        // region (1.5), added together by sum reduction.
        let pred = leaf_vec(&[1.3, 3.0]);
        let target = target_vec(&[1.0, 1.0]);
        let criterion = HuberLoss::new(Reduction::Sum, 1.0);
        let out = criterion.forward(&pred, &target).unwrap();

        let expected = 0.045 + 1.5;
        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-7,
            "Huber sum: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_huber_forward_none() {
        // No reduction: the output has one loss per element, in input order.
        let pred = leaf_vec(&[1.3, 3.0]);
        let target = target_vec(&[1.0, 1.0]);
        let criterion = HuberLoss::new(Reduction::None, 1.0);
        let out = criterion.forward(&pred, &target).unwrap();
        assert_eq!(out.shape(), &[2]);

        let per_elem = out.data().unwrap();
        let quadratic_err = (per_elem[0] - 0.045).abs();
        let linear_err = (per_elem[1] - 1.5).abs();
        assert!(quadratic_err < 1e-7);
        assert!(linear_err < 1e-7);
    }

    #[test]
    fn test_huber_backward_quadratic() {
        // Quadratic region (error = 0.3 < delta = 1.0) under mean reduction
        // with n = 1: grad = error / n = 0.3.
        let pred = leaf_vec(&[1.3]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let g0 = g[0];
        assert!(
            (g0 - 0.3).abs() < 1e-7,
            "Huber quadratic grad: expected 0.3, got {}",
            g0
        );
    }

    #[test]
    fn test_huber_backward_linear() {
        // Linear region (error = 2.0 >= delta = 1.0):
        // grad = delta * sign(error) / n = 1.0 / 1 = 1.0.
        let pred = leaf_vec(&[3.0]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let g0 = g[0];
        assert!(
            (g0 - 1.0).abs() < 1e-7,
            "Huber linear grad: expected 1.0, got {}",
            g0
        );
    }

    #[test]
    fn test_huber_backward_negative_error() {
        // error = -2.0 lands in the linear region with a negative sign:
        // grad = delta * sign(-2.0) / n = -1.0.
        let pred = leaf_vec(&[-1.0]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let g0 = g[0];
        assert!(
            (g0 + 1.0).abs() < 1e-7,
            "Huber negative error grad: expected -1.0, got {}",
            g0
        );
    }

    #[test]
    fn test_huber_custom_delta() {
        // One loss object with delta = 0.5 is probed on both sides of the
        // threshold.
        let criterion = HuberLoss::new(Reduction::Mean, 0.5);

        // error = 0.3 (< 0.5): quadratic, loss = 0.5 * 0.09 = 0.045
        let pred = leaf_vec(&[1.3]);
        let target = target_vec(&[1.0]);
        let out = criterion.forward(&pred, &target).unwrap();
        let quadratic = out.item().unwrap();
        assert!(
            (quadratic - 0.045).abs() < 1e-7,
            "Huber custom delta quadratic: expected 0.045, got {}",
            quadratic
        );

        // error = 1.0 (>= 0.5): linear, loss = 0.5 * (1.0 - 0.25) = 0.375
        let pred2 = leaf_vec(&[2.0]);
        let target2 = target_vec(&[1.0]);
        let out2 = criterion.forward(&pred2, &target2).unwrap();
        let linear = out2.item().unwrap();
        assert!(
            (linear - 0.375).abs() < 1e-7,
            "Huber custom delta linear: expected 0.375, got {}",
            linear
        );
    }

    #[test]
    fn test_huber_zero_loss() {
        // Identical prediction and target must give a loss of (numerically) zero.
        let pred = leaf_vec(&[1.0, 2.0]);
        let target = target_vec(&[1.0, 2.0]);
        let out = HuberLoss::default().forward(&pred, &target).unwrap();
        let magnitude = out.item().unwrap().abs();
        assert!(magnitude < 1e-10);
    }

    // -----------------------------------------------------------------------
    // no_grad disables backward nodes
    // -----------------------------------------------------------------------

    #[test]
    fn test_mse_no_grad() {
        // A forward pass executed inside `no_grad` must return a tensor
        // without a grad_fn.
        ferrotorch_core::no_grad(|| {
            let pred = leaf_vec(&[1.0, 2.0]);
            let target = target_vec(&[1.5, 2.5]);
            let criterion = MSELoss::default();
            let out = criterion.forward(&pred, &target).unwrap();
            let detached = out.grad_fn().is_none();
            assert!(
                detached,
                "MSELoss inside no_grad should not attach grad_fn"
            );
        });
    }

    #[test]
    fn test_ce_no_grad() {
        // A forward pass executed inside `no_grad` must return a tensor
        // without a grad_fn.
        ferrotorch_core::no_grad(|| {
            let logits = leaf_2d(&[1.0, 2.0, 3.0], &[1, 3]);
            let targets = target_vec(&[0.0]);
            let criterion = CrossEntropyLoss::default();
            let out = criterion.forward(&logits, &targets).unwrap();
            let detached = out.grad_fn().is_none();
            assert!(
                detached,
                "CrossEntropyLoss inside no_grad should not attach grad_fn"
            );
        });
    }

    #[test]
    fn test_bce_no_grad() {
        // A forward pass executed inside `no_grad` must return a tensor
        // without a grad_fn.
        ferrotorch_core::no_grad(|| {
            let logits = leaf_vec(&[0.0, 1.0]);
            let targets = target_vec(&[1.0, 0.0]);
            let criterion = BCEWithLogitsLoss::default();
            let out = criterion.forward(&logits, &targets).unwrap();
            let detached = out.grad_fn().is_none();
            assert!(
                detached,
                "BCEWithLogitsLoss inside no_grad should not attach grad_fn"
            );
        });
    }

    #[test]
    fn test_huber_no_grad() {
        // A forward pass executed inside `no_grad` must return a tensor
        // without a grad_fn.
        ferrotorch_core::no_grad(|| {
            let pred = leaf_vec(&[1.0]);
            let target = target_vec(&[2.0]);
            let criterion = HuberLoss::default();
            let out = criterion.forward(&pred, &target).unwrap();
            let detached = out.grad_fn().is_none();
            assert!(
                detached,
                "HuberLoss inside no_grad should not attach grad_fn"
            );
        });
    }

    // -----------------------------------------------------------------------
    // Shape mismatch errors
    // -----------------------------------------------------------------------

    #[test]
    fn test_mse_shape_mismatch() {
        // Prediction has 2 elements, target has 3: forward must return Err.
        let pred = leaf_vec(&[1.0, 2.0]);
        let target = target_vec(&[1.0, 2.0, 3.0]);
        let criterion = MSELoss::default();
        let result = criterion.forward(&pred, &target);
        assert!(result.is_err());
    }

    #[test]
    fn test_bce_shape_mismatch() {
        // One logit against two targets: forward must return Err.
        let logits = leaf_vec(&[0.0]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = BCEWithLogitsLoss::default();
        let result = criterion.forward(&logits, &targets);
        assert!(result.is_err());
    }

    #[test]
    fn test_huber_shape_mismatch() {
        // Three predictions against a single target: forward must return Err.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.0]);
        let criterion = HuberLoss::default();
        let result = criterion.forward(&pred, &target);
        assert!(result.is_err());
    }

    #[test]
    fn test_ce_logits_wrong_dims() {
        // Cross-entropy needs 2-D logits; a 1-D tensor must be rejected.
        let logits = leaf_vec(&[1.0, 2.0, 3.0]);
        let targets = target_vec(&[1.0]);
        let criterion = CrossEntropyLoss::default();
        let result = criterion.forward(&logits, &targets);
        assert!(result.is_err());
    }

    #[test]
    fn test_ce_target_batch_mismatch() {
        // Logits carry a batch of 1 while the targets carry a batch of 2.
        let logits = leaf_2d(&[1.0, 2.0, 3.0], &[1, 3]);
        let targets = target_vec(&[0.0, 1.0]);
        let criterion = CrossEntropyLoss::default();
        let result = criterion.forward(&logits, &targets);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // KLDivLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_kl_div_forward_mean() {
        // NOTE(review): despite the `_mean` suffix, this test constructs the
        // loss with `Reduction::Sum` and its failure message says "KL sum".
        //
        // Input holds log-probabilities, target holds probabilities:
        //   target = [0.25, 0.75], input = [ln(0.5), ln(0.5)]
        //   KL = 0.25 * (ln(0.25) - ln(0.5)) + 0.75 * (ln(0.75) - ln(0.5))
        //      = 0.25 * ln(0.5) + 0.75 * ln(1.5)
        let ln_q = 0.5_f64.ln();
        let input = leaf_vec(&[ln_q, ln_q]);
        let target = target_vec(&[0.25, 0.75]);
        let criterion = KLDivLoss::new(Reduction::Sum);
        let out = criterion.forward(&input, &target).unwrap();

        let expected = 0.25 * (0.25_f64.ln() - ln_q) + 0.75 * (0.75_f64.ln() - ln_q);
        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-7,
            "KL sum: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_kl_div_zero_target_contributes_zero() {
        // target = [0, 1], input = [ln(0.5), ln(0.5)].
        // The zero-probability entry must add nothing, leaving
        //   KL = 1 * (ln(1) - ln(0.5)) = ln(2).
        let ln_half = 0.5_f64.ln();
        let input = leaf_vec(&[ln_half, ln_half]);
        let target = target_vec(&[0.0, 1.0]);
        let criterion = KLDivLoss::new(Reduction::Sum);
        let out = criterion.forward(&input, &target).unwrap();

        let expected = 2.0_f64.ln();
        let got = out.item().unwrap();
        assert!(
            (got - expected).abs() < 1e-7,
            "KL zero target: expected {}, got {}",
            expected,
            got
        );
    }

    #[test]
    fn test_kl_div_identical_distributions() {
        // If the input is exactly log(target), the divergence is zero.
        let target = target_vec(&[0.3, 0.7]);
        let input = leaf_vec(&[0.3_f64.ln(), 0.7_f64.ln()]);
        let criterion = KLDivLoss::new(Reduction::Sum);
        let out = criterion.forward(&input, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            got.abs() < 1e-10,
            "KL same dist: expected ~0, got {}",
            got
        );
    }

    #[test]
    fn test_kl_div_backward() {
        // Under sum reduction the gradient w.r.t. the input is -target.
        let ln_half = 0.5_f64.ln();
        let input = leaf_vec(&[ln_half, ln_half]);
        let target = target_vec(&[0.25, 0.75]);
        let criterion = KLDivLoss::new(Reduction::Sum);
        let out = criterion.forward(&input, &target).unwrap();
        backward(&out).unwrap();

        let grad = input.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let (g0, g1) = (g[0], g[1]);
        assert!(
            (g0 + 0.25).abs() < 1e-7,
            "KL grad[0]: expected -0.25, got {}",
            g0
        );
        assert!(
            (g1 + 0.75).abs() < 1e-7,
            "KL grad[1]: expected -0.75, got {}",
            g1
        );
    }

    #[test]
    fn test_kl_div_shape_mismatch() {
        // Two inputs against one target: forward must return Err.
        let input = leaf_vec(&[0.0, 0.0]);
        let target = target_vec(&[0.5]);
        let criterion = KLDivLoss::default();
        let result = criterion.forward(&input, &target);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // CosineEmbeddingLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_cosine_embedding_positive_pair() {
        // Orthogonal vectors labelled as a positive pair (y = 1):
        // cos(x1, x2) = 0, so loss = 1 - 0 = 1.
        let x1 = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let x2 = leaf_2d(&[0.0, 1.0], &[1, 2]);
        let y = target_vec(&[1.0]);
        let criterion = CosineEmbeddingLoss::default();
        let out = criterion.forward_pair(&x1, &x2, &y).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 1.0).abs() < 1e-7,
            "cosine positive orthogonal: expected 1.0, got {}",
            got
        );
    }

    #[test]
    fn test_cosine_embedding_positive_identical() {
        // Identical vectors labelled positive: cos = 1, so loss = 0.
        let x1 = leaf_2d(&[1.0, 1.0], &[1, 2]);
        let x2 = leaf_2d(&[1.0, 1.0], &[1, 2]);
        let y = target_vec(&[1.0]);
        let criterion = CosineEmbeddingLoss::default();
        let out = criterion.forward_pair(&x1, &x2, &y).unwrap();

        let got = out.item().unwrap();
        assert!(
            got.abs() < 1e-7,
            "cosine positive identical: expected 0.0, got {}",
            got
        );
    }

    #[test]
    fn test_cosine_embedding_negative_pair() {
        // Identical vectors labelled negative (y = -1) with margin = 0.5:
        // cos = 1.0, so loss = max(0, 1.0 - 0.5) = 0.5.
        let x1 = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let x2 = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let y = target_vec(&[-1.0]);
        let criterion = CosineEmbeddingLoss::new(Reduction::Mean, 0.5);
        let out = criterion.forward_pair(&x1, &x2, &y).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 0.5).abs() < 1e-7,
            "cosine negative same: expected 0.5, got {}",
            got
        );
    }

    #[test]
    fn test_cosine_embedding_negative_orthogonal() {
        // Orthogonal vectors labelled negative with margin = 0.0:
        // cos = 0.0, so loss = max(0, 0.0 - 0.0) = 0.
        let x1 = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let x2 = leaf_2d(&[0.0, 1.0], &[1, 2]);
        let y = target_vec(&[-1.0]);
        let criterion = CosineEmbeddingLoss::new(Reduction::Mean, 0.0);
        let out = criterion.forward_pair(&x1, &x2, &y).unwrap();

        let got = out.item().unwrap();
        assert!(
            got.abs() < 1e-7,
            "cosine negative orthogonal: expected 0.0, got {}",
            got
        );
    }

    #[test]
    fn test_cosine_embedding_shape_mismatch() {
        // x1 is [1, 2] while x2 is [1, 3]: forward_pair must return Err.
        let x1 = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let x2 = leaf_2d(&[1.0, 0.0, 0.0], &[1, 3]);
        let y = target_vec(&[1.0]);
        let criterion = CosineEmbeddingLoss::default();
        let result = criterion.forward_pair(&x1, &x2, &y);
        assert!(result.is_err());
    }

    // -----------------------------------------------------------------------
    // SmoothL1Loss
    // -----------------------------------------------------------------------

    #[test]
    fn test_smooth_l1_forward_quadratic() {
        // |error| = 0.3 (< 1.0) uses the quadratic branch: 0.5 * 0.09 = 0.045.
        let pred = leaf_vec(&[1.3]);
        let target = target_vec(&[1.0]);
        let criterion = SmoothL1Loss::default();
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 0.045).abs() < 1e-7,
            "SmoothL1 quadratic: expected 0.045, got {}",
            got
        );
    }

    #[test]
    fn test_smooth_l1_forward_linear() {
        // |error| = 2.0 (>= 1.0) uses the linear branch: 1.0 * (2.0 - 0.5) = 1.5.
        let pred = leaf_vec(&[3.0]);
        let target = target_vec(&[1.0]);
        let criterion = SmoothL1Loss::default();
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 1.5).abs() < 1e-7,
            "SmoothL1 linear: expected 1.5, got {}",
            got
        );
    }

    #[test]
    fn test_smooth_l1_matches_huber() {
        // Cross-check: on the same inputs, SmoothL1Loss and HuberLoss with
        // delta = 1.0 must agree under sum reduction.
        let pred = leaf_vec(&[0.5, 2.0, -1.0]);
        let target = target_vec(&[1.0, 0.0, 0.5]);

        let s_out = SmoothL1Loss::new(Reduction::Sum)
            .forward(&pred, &target)
            .unwrap();
        let h_out = HuberLoss::new(Reduction::Sum, 1.0)
            .forward(&pred, &target)
            .unwrap();

        let s_val = s_out.item().unwrap();
        let h_val = h_out.item().unwrap();
        assert!(
            (s_val - h_val).abs() < 1e-10,
            "SmoothL1 and Huber(1.0) diverge: {} vs {}",
            s_val,
            h_val
        );
    }

    #[test]
    fn test_smooth_l1_zero_loss() {
        // Identical prediction and target must give a loss of (numerically) zero.
        let pred = leaf_vec(&[1.0, 2.0]);
        let target = target_vec(&[1.0, 2.0]);
        let out = SmoothL1Loss::default().forward(&pred, &target).unwrap();
        let magnitude = out.item().unwrap().abs();
        assert!(magnitude < 1e-10);
    }

    // -------------------------------------------------------------------
    // autocast_guard integration: loss forwards fire the guard
    // -------------------------------------------------------------------

    #[test]
    fn test_mse_loss_fires_autocast_guard_when_enabled() {
        // MSELoss::forward must record exactly one "mse_loss" event, of
        // category FullPrecision, when run inside an autocast region, and
        // none outside it.
        // NOTE(review): the debug flag enabled below is never reset in this
        // test — confirm it cannot leak into other tests.
        use ferrotorch_core::autograd::autocast::{AutocastDtype, autocast, set_autocast_debug};
        use ferrotorch_core::autograd::autocast_ops::{AutocastCategory, drain_autocast_events};
        set_autocast_debug(true);

        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);

        // Baseline outside autocast: clear stale events, run, expect nothing.
        drain_autocast_events();
        let _ = MSELoss::new(Reduction::Mean)
            .forward(&pred, &target)
            .unwrap();
        let outside = drain_autocast_events();
        assert!(outside.is_empty());

        // Inside autocast: the forward call must log a single event.
        autocast(AutocastDtype::F16, || {
            drain_autocast_events();
            let _ = MSELoss::new(Reduction::Mean)
                .forward(&pred, &target)
                .unwrap();
            let events = drain_autocast_events();
            assert_eq!(events.len(), 1);
            let recorded = &events[0];
            assert_eq!(recorded.op, "mse_loss");
            assert_eq!(recorded.category, AutocastCategory::FullPrecision);
        });
    }

    #[test]
    fn test_cross_entropy_fires_autocast_guard_when_enabled() {
        // CrossEntropyLoss::forward must record exactly one "cross_entropy"
        // event, of category FullPrecision, when run inside an autocast
        // region, and none outside it.
        // NOTE(review): the debug flag enabled below is never reset in this
        // test — confirm it cannot leak into other tests.
        use ferrotorch_core::autograd::autocast::{AutocastDtype, autocast, set_autocast_debug};
        use ferrotorch_core::autograd::autocast_ops::{AutocastCategory, drain_autocast_events};
        set_autocast_debug(true);

        // Two samples over three classes; targets are class indices.
        let logits = leaf_2d(&[1.0, 2.0, 3.0, 1.0, 2.0, 3.0], &[2, 3]);
        let targets = target_vec(&[2.0, 0.0]);

        // Baseline outside autocast: clear stale events, run, expect nothing.
        drain_autocast_events();
        let _ = CrossEntropyLoss::new(Reduction::Mean, 0.0)
            .forward(&logits, &targets)
            .unwrap();
        let outside = drain_autocast_events();
        assert!(outside.is_empty());

        // Inside autocast: the forward call must log a single event.
        autocast(AutocastDtype::BF16, || {
            drain_autocast_events();
            let _ = CrossEntropyLoss::new(Reduction::Mean, 0.0)
                .forward(&logits, &targets)
                .unwrap();
            let events = drain_autocast_events();
            assert_eq!(events.len(), 1);
            let recorded = &events[0];
            assert_eq!(recorded.op, "cross_entropy");
            assert_eq!(recorded.category, AutocastCategory::FullPrecision);
        });
    }

    #[test]
    fn test_bce_with_logits_fires_autocast_guard_when_enabled() {
        // BCEWithLogitsLoss::forward must record exactly one
        // "bce_with_logits" event, of category FullPrecision, when run inside
        // an autocast region, and none outside it.
        // NOTE(review): the debug flag enabled below is never reset in this
        // test — confirm it cannot leak into other tests.
        use ferrotorch_core::autograd::autocast::{AutocastDtype, autocast, set_autocast_debug};
        use ferrotorch_core::autograd::autocast_ops::{AutocastCategory, drain_autocast_events};
        set_autocast_debug(true);

        let logits = leaf_vec(&[0.5, -0.5, 1.0]);
        let targets = target_vec(&[1.0, 0.0, 1.0]);

        // Baseline outside autocast: clear stale events, run, expect nothing.
        drain_autocast_events();
        let _ = BCEWithLogitsLoss::new(Reduction::Mean)
            .forward(&logits, &targets)
            .unwrap();
        let outside = drain_autocast_events();
        assert!(outside.is_empty());

        // Inside autocast: the forward call must log a single event.
        autocast(AutocastDtype::F16, || {
            drain_autocast_events();
            let _ = BCEWithLogitsLoss::new(Reduction::Mean)
                .forward(&logits, &targets)
                .unwrap();
            let events = drain_autocast_events();
            assert_eq!(events.len(), 1);
            let recorded = &events[0];
            assert_eq!(recorded.op, "bce_with_logits");
            assert_eq!(recorded.category, AutocastCategory::FullPrecision);
        });
    }

    // -----------------------------------------------------------------------
    // L1Loss
    // -----------------------------------------------------------------------

    #[test]
    fn test_l1_forward_mean() {
        // Every |pred - target| equals 0.5, so the mean is 0.5 and the
        // result is a scalar.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let criterion = L1Loss::new(Reduction::Mean);
        let out = criterion.forward(&pred, &target).unwrap();
        assert!(out.is_scalar());

        let got = out.item().unwrap();
        assert!(
            (got - 0.5).abs() < 1e-7,
            "L1 mean: expected 0.5, got {}",
            got
        );
    }

    #[test]
    fn test_l1_forward_sum() {
        // Three absolute differences of 0.5 sum to 1.5.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let criterion = L1Loss::new(Reduction::Sum);
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 1.5).abs() < 1e-7,
            "L1 sum: expected 1.5, got {}",
            got
        );
    }

    #[test]
    fn test_l1_forward_none() {
        // No reduction: the output has shape [3] and every entry is 0.5.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let criterion = L1Loss::new(Reduction::None);
        let out = criterion.forward(&pred, &target).unwrap();
        assert_eq!(out.shape(), &[3]);

        let per_elem = out.data().unwrap();
        for i in 0..3 {
            let got = per_elem[i];
            assert!(
                (got - 0.5).abs() < 1e-7,
                "L1 none[{}]: expected 0.5, got {}",
                i,
                got
            );
        }
    }

    #[test]
    fn test_l1_backward_mean() {
        // Each prediction is 0.5 below its target, so
        // grad = sign(pred - target) / n = sign(-0.5) / 3 = -1/3 everywhere.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.5, 2.5, 3.5]);
        let criterion = L1Loss::new(Reduction::Mean);
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let expected = -1.0 / 3.0;
        for i in 0..3 {
            let got = g[i];
            assert!(
                (got - expected).abs() < 1e-7,
                "L1 backward mean[{}]: expected {}, got {}",
                i,
                expected,
                got
            );
        }
    }

    #[test]
    fn test_l1_backward_sum() {
        // Each prediction is 0.5 above its target, so under sum reduction
        // grad = sign(pred - target) * 1 = sign(0.5) = 1 everywhere.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[0.5, 1.5, 2.5]);
        let criterion = L1Loss::new(Reduction::Sum);
        let out = criterion.forward(&pred, &target).unwrap();
        backward(&out).unwrap();

        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for i in 0..3 {
            let got = g[i];
            assert!(
                (got - 1.0).abs() < 1e-7,
                "L1 backward sum[{}]: expected 1.0, got {}",
                i,
                got
            );
        }
    }

    #[test]
    fn test_l1_shape_mismatch() {
        // Prediction has 2 elements, target has 3: forward must return Err.
        let pred = leaf_vec(&[1.0, 2.0]);
        let target = target_vec(&[1.0, 2.0, 3.0]);
        let criterion = L1Loss::default();
        let result = criterion.forward(&pred, &target);
        assert!(result.is_err());
    }

    #[test]
    fn test_l1_zero_diff() {
        // With pred == target the loss is 0, and since sign(0) = 0 the
        // gradient must be zero for every element as well.
        let pred = leaf_vec(&[1.0, 2.0, 3.0]);
        let target = target_vec(&[1.0, 2.0, 3.0]);
        let criterion = L1Loss::new(Reduction::Mean);
        let out = criterion.forward(&pred, &target).unwrap();
        let loss_magnitude = (out.item().unwrap()).abs();
        assert!(loss_magnitude < 1e-10);

        backward(&out).unwrap();
        let grad = pred.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for i in 0..3 {
            let got = g[i];
            assert!(
                got.abs() < 1e-10,
                "L1 zero diff grad[{}] should be 0, got {}",
                i,
                got
            );
        }
    }

    #[test]
    fn test_l1_mixed_signs() {
        // One prediction above its target and one below:
        // |3-1| = 2 and |1-3| = 2, so the mean is 2.
        let pred = leaf_vec(&[3.0, 1.0]);
        let target = target_vec(&[1.0, 3.0]);
        let criterion = L1Loss::new(Reduction::Mean);
        let out = criterion.forward(&pred, &target).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 2.0).abs() < 1e-7,
            "L1 mixed: expected 2.0, got {}",
            got
        );
    }

    // -----------------------------------------------------------------------
    // NLLLoss
    // -----------------------------------------------------------------------

    #[test]
    fn test_nll_forward_mean() {
        // log_probs is [2, 3]; sample 0 targets class 1 (-0.5) and sample 1
        // targets class 0 (-0.8).
        // loss = -(-0.5 + -0.8) / 2 = 0.65, returned as a scalar.
        let log_probs = leaf_2d(&[-1.5, -0.5, -2.0, -0.8, -1.2, -1.0], &[2, 3]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = NLLLoss::default();
        let out = criterion.forward(&log_probs, &targets).unwrap();
        assert!(out.is_scalar());

        let got = out.item().unwrap();
        assert!(
            (got - 0.65).abs() < 1e-7,
            "NLL mean: expected 0.65, got {}",
            got
        );
    }

    #[test]
    fn test_nll_forward_sum() {
        // Selected log-probs are -0.5 and -0.8, so the summed loss is
        // 0.5 + 0.8 = 1.3.
        let log_probs = leaf_2d(&[-1.5, -0.5, -2.0, -0.8, -1.2, -1.0], &[2, 3]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = NLLLoss::new(Reduction::Sum, None);
        let out = criterion.forward(&log_probs, &targets).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 1.3).abs() < 1e-7,
            "NLL sum: expected 1.3, got {}",
            got
        );
    }

    #[test]
    fn test_nll_forward_none() {
        // No reduction: one loss per sample — 0.5 for sample 0 and 0.8 for
        // sample 1.
        let log_probs = leaf_2d(&[-1.5, -0.5, -2.0, -0.8, -1.2, -1.0], &[2, 3]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = NLLLoss::new(Reduction::None, None);
        let out = criterion.forward(&log_probs, &targets).unwrap();
        assert_eq!(out.shape(), &[2]);

        let per_sample = out.data().unwrap();
        let (first, second) = (per_sample[0], per_sample[1]);
        assert!(
            (first - 0.5).abs() < 1e-7,
            "NLL none[0]: expected 0.5, got {}",
            first
        );
        assert!(
            (second - 0.8).abs() < 1e-7,
            "NLL none[1]: expected 0.8, got {}",
            second
        );
    }

    #[test]
    fn test_nll_backward_mean() {
        // Only the targeted entries get gradient: grad[0, 1] = -1/2 and
        // grad[1, 0] = -1/2; every other entry stays 0.
        let log_probs = leaf_2d(&[-1.5, -0.5, -2.0, -0.8, -1.2, -1.0], &[2, 3]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = NLLLoss::default();
        let out = criterion.forward(&log_probs, &targets).unwrap();
        backward(&out).unwrap();

        let grad = log_probs.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        let expected = [0.0, -0.5, 0.0, -0.5, 0.0, 0.0];
        for (i, &want) in expected.iter().enumerate() {
            let got = g[i];
            assert!(
                (got - want).abs() < 1e-7,
                "NLL backward mean[{}]: expected {}, got {}",
                i,
                want,
                got
            );
        }
    }

    #[test]
    fn test_nll_ignore_index() {
        // Class 0 is ignored, so only sample 0 (target = 1) contributes:
        // loss = -(-0.5) / 1 = 0.5.
        let log_probs = leaf_2d(&[-1.5, -0.5, -2.0, -0.8, -1.2, -1.0], &[2, 3]);
        let targets = target_vec(&[1.0, 0.0]);
        let criterion = NLLLoss::new(Reduction::Mean, Some(0));
        let out = criterion.forward(&log_probs, &targets).unwrap();

        let got = out.item().unwrap();
        assert!(
            (got - 0.5).abs() < 1e-7,
            "NLL ignore_index mean: expected 0.5, got {}",
            got
        );
    }

    /// When every target equals `ignore_index`, the mean loss is expected to
    /// be zero.
    #[test]
    fn test_nll_ignore_index_all_ignored() {
        let scores = leaf_2d(&[-1.5, -0.5, -0.8, -1.2], &[2, 2]);
        // Both samples target class 0, which is the ignored class.
        let labels = target_vec(&[0.0, 0.0]);
        let criterion = NLLLoss::new(Reduction::Mean, Some(0));
        let out = criterion.forward(&scores, &labels).unwrap();
        let value = out.item().unwrap();
        assert!(
            value.abs() < 1e-10,
            "NLL all ignored: expected 0, got {}",
            value
        );
    }

    /// A 1-D `log_probs` tensor must be rejected by `NLLLoss::forward`.
    #[test]
    fn test_nll_wrong_log_probs_shape() {
        let flat_scores = leaf_vec(&[-0.5, -1.0, -1.5]);
        let labels = target_vec(&[1.0]);
        let criterion = NLLLoss::default();
        let outcome = criterion.forward(&flat_scores, &labels);
        assert!(outcome.is_err());
    }

    /// Three targets for a batch of two samples must produce an error.
    #[test]
    fn test_nll_target_shape_mismatch() {
        let scores = leaf_2d(&[-0.5, -1.0, -1.5, -0.8, -1.2, -1.0], &[2, 3]);
        let too_many_labels = target_vec(&[1.0, 0.0, 2.0]);
        let criterion = NLLLoss::default();
        let outcome = criterion.forward(&scores, &too_many_labels);
        assert!(outcome.is_err());
    }

    /// A target index beyond the number of classes must produce an error.
    #[test]
    fn test_nll_target_out_of_range() {
        // One sample with only two classes, but the target asks for class 5.
        let scores = leaf_2d(&[-0.5, -1.0], &[1, 2]);
        let bad_label = target_vec(&[5.0]);
        let criterion = NLLLoss::default();
        let outcome = criterion.forward(&scores, &bad_label);
        assert!(outcome.is_err());
    }

    /// An empty batch (shape `[0, 3]`) must succeed and yield a zero loss.
    #[test]
    fn test_nll_empty_batch() {
        let scores = leaf_2d(&[], &[0, 3]);
        let labels = target_vec(&[]);
        let criterion = NLLLoss::default();
        let out = criterion.forward(&scores, &labels).unwrap();
        let value = out.item().unwrap();
        assert!(value.abs() < 1e-10);
    }

    /// Backward through sum-reduced `NLLLoss`: each target entry is expected
    /// to receive a gradient of -1.
    #[test]
    fn test_nll_backward_sum() {
        let scores = leaf_2d(&[-1.0, -2.0, -3.0, -4.0], &[2, 2]);
        let labels = target_vec(&[0.0, 1.0]);
        let criterion = NLLLoss::new(Reduction::Sum, None);
        let loss_value = criterion.forward(&scores, &labels).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = scores.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // Entries [0, 0] and [1, 1] are the targets; the rest stay at zero.
        let want = [-1.0, 0.0, 0.0, -1.0];
        for (idx, &target_grad) in want.iter().enumerate() {
            assert!(
                (got[idx] - target_grad).abs() < 1e-7,
                "NLL backward sum[{}]: expected {}, got {}",
                idx,
                target_grad,
                got[idx]
            );
        }
    }

    /// Backward with `ignore_index`: ignored samples are expected to receive
    /// no gradient and the mean is taken over the remaining samples.
    #[test]
    fn test_nll_backward_with_ignore() {
        let scores = leaf_2d(&[-1.0, -2.0, -3.0, -4.0], &[2, 2]);
        let labels = target_vec(&[0.0, 1.0]);
        // Target 0 is ignored, leaving sample 1 (target = 1) as the sole
        // contributor.
        let criterion = NLLLoss::new(Reduction::Mean, Some(0));
        let loss_value = criterion.forward(&scores, &labels).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = scores.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // Expected: grad[1, 1] = -1 / 1 = -1 and zeros everywhere else.
        let want = [0.0, 0.0, 0.0, -1.0];
        for (idx, &target_grad) in want.iter().enumerate() {
            assert!(
                (got[idx] - target_grad).abs() < 1e-7,
                "NLL backward ignore[{}]: expected {}, got {}",
                idx,
                target_grad,
                got[idx]
            );
        }
    }

    // -----------------------------------------------------------------------
    // BCELoss (probability input, not logits)
    // -----------------------------------------------------------------------

    /// Mean-reduced `BCELoss` on probability inputs against binary labels.
    #[test]
    fn test_bce_loss_forward_mean() {
        // Inputs are probabilities (e.g. sigmoid outputs), targets are 0/1.
        let probs = leaf_vec(&[0.8, 0.4, 0.6]);
        let labels = target_vec(&[1.0, 0.0, 1.0]);
        let criterion = BCELoss::new(Reduction::Mean);
        let out = criterion.forward(&probs, &labels).unwrap();

        // Per-element loss is -ln(p) for label 1 and -ln(1 - p) for label 0:
        //   [0]: -ln(0.8)
        //   [1]: -ln(1 - 0.4) = -ln(0.6)
        //   [2]: -ln(0.6)
        let neg_ln = |p: f64| -(p.ln());
        let expected = (neg_ln(0.8) + neg_ln(0.6) + neg_ln(0.6)) / 3.0;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-7,
            "BCELoss mean: expected {}, got {}",
            expected,
            value
        );
    }

    /// Sum-reduced `BCELoss` must add the per-element losses without averaging.
    #[test]
    fn test_bce_loss_forward_sum() {
        let probs = leaf_vec(&[0.8, 0.4]);
        let labels = target_vec(&[1.0, 0.0]);
        let criterion = BCELoss::new(Reduction::Sum);
        let out = criterion.forward(&probs, &labels).unwrap();

        // -ln(0.8) for the positive label plus -ln(1 - 0.4) for the negative.
        let positive_term = -(0.8_f64.ln());
        let negative_term = -(0.6_f64.ln());
        let expected = positive_term + negative_term;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-7,
            "BCELoss sum: expected {}, got {}",
            expected,
            value
        );
    }

    /// `BCELoss` with `Reduction::None` must keep the input shape and expose
    /// each element's loss.
    #[test]
    fn test_bce_loss_forward_none() {
        let probs = leaf_vec(&[0.8, 0.4]);
        let labels = target_vec(&[1.0, 0.0]);
        let criterion = BCELoss::new(Reduction::None);
        let per_element = criterion.forward(&probs, &labels).unwrap();
        assert_eq!(per_element.shape(), &[2]);

        let values = per_element.data().unwrap();
        // Label 1 with p = 0.8 gives -ln(0.8); label 0 with p = 0.4 gives
        // -ln(1 - 0.4) = -ln(0.6).
        let want_first = -(0.8_f64.ln());
        let want_second = -(0.6_f64.ln());
        assert!(
            (values[0] - want_first).abs() < 1e-7,
            "BCELoss none[0]: expected {}, got {}",
            want_first,
            values[0]
        );
        assert!(
            (values[1] - want_second).abs() < 1e-7,
            "BCELoss none[1]: expected {}, got {}",
            want_second,
            values[1]
        );
    }

    /// Gradient of mean-reduced `BCELoss` with respect to the probabilities.
    #[test]
    fn test_bce_loss_backward_mean() {
        let probs = leaf_vec(&[0.8, 0.4]);
        let labels = target_vec(&[1.0, 0.0]);
        let criterion = BCELoss::new(Reduction::Mean);
        let loss_value = criterion.forward(&probs, &labels).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = probs.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // Expected per-element gradient, divided by n = 2:
        //   grad[0] = (-1 / 0.8) / 2 = -0.625
        //   grad[1] = ( 1 / 0.6) / 2 ≈ 0.8333
        assert!(
            (got[0] - (-0.625)).abs() < 1e-5,
            "BCELoss backward[0]: expected -0.625, got {}",
            got[0]
        );
        let want_second = 1.0 / 0.6 / 2.0;
        assert!(
            (got[1] - want_second).abs() < 1e-5,
            "BCELoss backward[1]: expected {}, got {}",
            want_second,
            got[1]
        );
    }

    /// Input and target of different lengths must be rejected by `BCELoss`.
    #[test]
    fn test_bce_loss_shape_mismatch() {
        let probs = leaf_vec(&[0.5, 0.5]);
        let short_labels = target_vec(&[1.0]);
        let criterion = BCELoss::default();
        let outcome = criterion.forward(&probs, &short_labels);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // TripletMarginLoss
    // -----------------------------------------------------------------------

    /// A negative that is far enough from the anchor must give zero loss.
    #[test]
    fn test_triplet_margin_forward_mean() {
        // anchor = [0, 0], positive = [1, 0], negative = [3, 0]
        // => d_pos = 1.0, d_neg = 3.0; with margin = 1.0 the hinge is
        //    max(0, 1 - 3 + 1) = max(0, -1) = 0.
        let anchor = leaf_2d(&[0.0, 0.0], &[1, 2]);
        let near = Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let far = Tensor::from_storage(TensorStorage::cpu(vec![3.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let criterion = TripletMarginLoss::default();
        let out = criterion.forward(&anchor, &near, &far).unwrap();
        let value = out.item().unwrap();
        assert!(
            value.abs() < 1e-7,
            "Triplet loss should be 0 when negative is far, got {}",
            value
        );
    }

    /// A negative closer than the positive must activate the hinge.
    #[test]
    fn test_triplet_margin_forward_active() {
        // anchor = [0, 0], positive = [2, 0], negative = [1, 0]
        // => d_pos = 2.0, d_neg = 1.0; with margin = 1.0 the loss is
        //    max(0, 2 - 1 + 1) = 2.0.
        let anchor = leaf_2d(&[0.0, 0.0], &[1, 2]);
        let pos = Tensor::from_storage(TensorStorage::cpu(vec![2.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let neg = Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let criterion = TripletMarginLoss::default();
        let out = criterion.forward(&anchor, &pos, &neg).unwrap();
        let value = out.item().unwrap();
        assert!(
            (value - 2.0).abs() < 1e-7,
            "Triplet loss: expected 2.0, got {}",
            value
        );
    }

    /// Mean reduction over a batch in which only one triplet is active.
    #[test]
    fn test_triplet_margin_batch() {
        // Two triplets sharing a zero anchor: the first violates the margin,
        // the second does not.
        let anchor = leaf_2d(&[0.0, 0.0, 0.0, 0.0], &[2, 2]);
        let pos_storage = TensorStorage::cpu(vec![2.0, 0.0, 1.0, 0.0]);
        let pos = Tensor::from_storage(pos_storage, vec![2, 2], false).unwrap();
        let neg_storage = TensorStorage::cpu(vec![1.0, 0.0, 5.0, 0.0]);
        let neg = Tensor::from_storage(neg_storage, vec![2, 2], false).unwrap();

        let criterion = TripletMarginLoss::new(Reduction::Mean, 1.0, 2.0);
        let out = criterion.forward(&anchor, &pos, &neg).unwrap();
        // Sample 0: d_pos = 2, d_neg = 1 => max(0, 2 - 1 + 1) = 2
        // Sample 1: d_pos = 1, d_neg = 5 => max(0, 1 - 5 + 1) = 0
        // Mean over the batch = 1.0
        let value = out.item().unwrap();
        assert!(
            (value - 1.0).abs() < 1e-7,
            "Triplet batch mean: expected 1.0, got {}",
            value
        );
    }

    /// A positive whose shape differs from the anchor must be rejected.
    #[test]
    fn test_triplet_margin_shape_mismatch() {
        let anchor = leaf_2d(&[0.0, 0.0], &[1, 2]);
        // The positive is [1, 3] while anchor and negative are [1, 2].
        let wide_pos =
            Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0, 0.0]), vec![1, 3], false)
                .unwrap();
        let neg = Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let criterion = TripletMarginLoss::default();
        let outcome = criterion.forward(&anchor, &wide_pos, &neg);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // MarginRankingLoss
    // -----------------------------------------------------------------------

    /// Correctly ranked pairs with zero margin must give zero loss.
    #[test]
    fn test_margin_ranking_forward_mean() {
        // x1 = [2.0, 0.5], x2 = [1.0, 1.0], y = [1.0, -1.0], margin = 0.0.
        // Per-sample loss is max(0, -y * (x1 - x2) + margin):
        //   sample 0: max(0, -1 * (2 - 1) + 0)   = max(0, -1)   = 0
        //   sample 1: max(0,  1 * (0.5 - 1) + 0) = max(0, -0.5) = 0
        let first = leaf_vec(&[2.0, 0.5]);
        let second = target_vec(&[1.0, 1.0]);
        let sign = target_vec(&[1.0, -1.0]);
        let criterion = MarginRankingLoss::new(Reduction::Mean, 0.0);
        let out = criterion.forward(&first, &second, &sign).unwrap();
        let value = out.item().unwrap();
        assert!(
            value.abs() < 1e-7,
            "MarginRanking: expected 0, got {}",
            value
        );
    }

    /// With margin 1.0 both samples fall inside the margin and contribute.
    #[test]
    fn test_margin_ranking_forward_with_margin() {
        let first = leaf_vec(&[1.0, 0.5]);
        let second = target_vec(&[0.5, 1.0]);
        let sign = target_vec(&[1.0, 1.0]);
        let criterion = MarginRankingLoss::new(Reduction::Mean, 1.0);
        let out = criterion.forward(&first, &second, &sign).unwrap();
        // sample 0: max(0, -1 * (1 - 0.5) + 1) = 0.5
        // sample 1: max(0, -1 * (0.5 - 1) + 1) = 1.5
        // mean over both = 1.0
        let value = out.item().unwrap();
        assert!(
            (value - 1.0).abs() < 1e-7,
            "MarginRanking: expected 1.0, got {}",
            value
        );
    }

    /// Gradient with respect to `x1` when the ranking hinge is active.
    #[test]
    fn test_margin_ranking_backward() {
        let first = leaf_vec(&[0.5]);
        let second = target_vec(&[1.0]);
        let sign = target_vec(&[1.0]);
        let criterion = MarginRankingLoss::new(Reduction::Mean, 1.0);
        // Forward value: max(0, -1 * (0.5 - 1) + 1) = 1.5, so the hinge is on.
        let loss_value = criterion.forward(&first, &second, &sign).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = first.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // Active hinge => expected grad = -y / n = -1.0 / 1 = -1.0.
        assert!(
            (got[0] - (-1.0)).abs() < 1e-7,
            "MarginRanking backward: expected -1, got {}",
            got[0]
        );
    }

    /// `x2` shorter than `x1` and `y` must be rejected.
    #[test]
    fn test_margin_ranking_shape_mismatch() {
        let first = leaf_vec(&[1.0, 2.0]);
        let short_second = target_vec(&[1.0]);
        let sign = target_vec(&[1.0, -1.0]);
        let criterion = MarginRankingLoss::default();
        let outcome = criterion.forward(&first, &short_second, &sign);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // CTCLoss
    // -----------------------------------------------------------------------

    /// CTC loss on log-probabilities that strongly favour one alignment must
    /// be small and non-negative.
    #[test]
    fn test_ctc_simple() {
        // T=3, B=1, C=3 (blank = 0, labels 1 and 2); the target is [1, 2].
        // Step t favours class t, i.e. the alignment [blank, 1, 2]: the
        // favoured entry gets -0.1 and every other entry -10.0.
        const STEPS: usize = 3;
        const CLASSES: usize = 3;
        let mut lp = vec![-10.0_f64; STEPS * CLASSES];
        for t in 0..STEPS {
            // Row-major [T, B=1, C] layout: element (t, c) sits at t * C + c.
            lp[t * CLASSES + t] = -0.1;
        }

        let log_probs = Tensor::from_storage(TensorStorage::cpu(lp), vec![3, 1, 3], false).unwrap();
        let labels = target_vec(&[1.0, 2.0]);
        let criterion = CTCLoss::default();
        let out = criterion.forward(&log_probs, &labels, &[3], &[2]).unwrap();
        // The favoured path has total log-probability 3 * -0.1 = -0.3, so the
        // loss is only bounded here rather than pinned to an exact value.
        let value = out.item().unwrap();
        assert!(
            value < 1.0,
            "CTC: expected low loss for aligned input, got {}",
            value
        );
        assert!(
            value >= 0.0,
            "CTC: loss should be non-negative, got {}",
            value
        );
    }

    /// With an empty target the loss is expected to equal the negated sum of
    /// the blank log-probabilities over time.
    #[test]
    fn test_ctc_empty_target() {
        // Shape [T=2, B=1, C=3]; class 0 holds -0.5 at t=0 and -0.3 at t=1.
        let frames = vec![-0.5_f64, -10.0, -10.0, -0.3, -10.0, -10.0];
        let log_probs =
            Tensor::from_storage(TensorStorage::cpu(frames), vec![2, 1, 3], false).unwrap();
        let no_labels = target_vec(&[]);
        let criterion = CTCLoss::new(Reduction::Mean, 0, false);
        let out = criterion.forward(&log_probs, &no_labels, &[2], &[0]).unwrap();
        // Expected: -(-0.5 + -0.3) = 0.8.
        let value = out.item().unwrap();
        assert!(
            (value - 0.8).abs() < 1e-7,
            "CTC empty target: expected 0.8, got {}",
            value
        );
    }

    /// A 1-D `log_probs` tensor (instead of 3-D) must be rejected by `CTCLoss`.
    #[test]
    fn test_ctc_wrong_shape() {
        let flat_log_probs = leaf_vec(&[0.0, 0.0, 0.0]);
        let labels = target_vec(&[1.0]);
        let criterion = CTCLoss::default();
        let outcome = criterion.forward(&flat_log_probs, &labels, &[3], &[1]);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // PoissonNLLLoss
    // -----------------------------------------------------------------------

    /// Default `PoissonNLLLoss` (log-space input): loss[i] = exp(x) - y * x.
    #[test]
    fn test_poisson_nll_forward_log_input() {
        let log_rates = leaf_vec(&[0.0, 1.0, 2.0]);
        let counts = target_vec(&[1.0, 2.0, 3.0]);
        let criterion = PoissonNLLLoss::default(); // log_input=true
        let out = criterion.forward(&log_rates, &counts).unwrap();

        // Per-element terms:
        //   exp(0) - 1 * 0 = 1.0
        //   exp(1) - 2 * 1 ≈ 0.718
        //   exp(2) - 3 * 2 ≈ 1.389
        let term = |x: f64, y: f64| x.exp() - y * x;
        let expected = (term(0.0, 1.0) + term(1.0, 2.0) + term(2.0, 3.0)) / 3.0;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-6,
            "Poisson NLL: expected {}, got {}",
            expected,
            value
        );
    }

    /// `PoissonNLLLoss` with `log_input = false`:
    /// loss[i] = x - y * log(x + eps).
    #[test]
    fn test_poisson_nll_forward_no_log_input() {
        let rates = leaf_vec(&[1.0, 2.0, 3.0]);
        let counts = target_vec(&[1.0, 2.0, 3.0]);
        let eps = 1e-8;
        let criterion = PoissonNLLLoss::new(Reduction::Mean, false, eps);
        let out = criterion.forward(&rates, &counts).unwrap();

        // Mirror the formula element by element, then average the three terms.
        let term = |x: f64, y: f64| x - y * (x + eps).ln();
        let expected = (term(1.0, 1.0) + term(2.0, 2.0) + term(3.0, 3.0)) / 3.0;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-6,
            "Poisson NLL no_log: expected {}, got {}",
            expected,
            value
        );
    }

    /// Gradient of the default `PoissonNLLLoss` for a single element.
    #[test]
    fn test_poisson_nll_backward() {
        let log_rate = leaf_vec(&[1.0]);
        let count = target_vec(&[2.0]);
        let criterion = PoissonNLLLoss::default();
        let loss_value = criterion.forward(&log_rate, &count).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = log_rate.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // d/dx (exp(x) - 2x) = exp(x) - 2, evaluated at x = 1 => e - 2.
        let want = 1.0_f64.exp() - 2.0;
        assert!(
            (got[0] - want).abs() < 1e-6,
            "Poisson backward: expected {}, got {}",
            want,
            got[0]
        );
    }

    /// Input and target of different lengths must be rejected.
    #[test]
    fn test_poisson_nll_shape_mismatch() {
        let rates = leaf_vec(&[1.0, 2.0]);
        let short_counts = target_vec(&[1.0]);
        let criterion = PoissonNLLLoss::default();
        let outcome = criterion.forward(&rates, &short_counts);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // MultiMarginLoss
    // -----------------------------------------------------------------------

    /// When the target class beats every other class by at least the margin,
    /// the multi-margin loss must be zero.
    #[test]
    fn test_multi_margin_forward_mean() {
        // B=1, C=3, input = [1, 3, 2], target = 1:
        //   loss = (1/3) * sum_{j != 1} max(0, 1 - x[1] + x[j])
        //        = (1/3) * (max(0, 1 - 3 + 1) + max(0, 1 - 3 + 2))
        //        = (1/3) * (0 + 0) = 0
        let scores = leaf_2d(&[1.0, 3.0, 2.0], &[1, 3]);
        let label = target_vec(&[1.0]);
        let criterion = MultiMarginLoss::default();
        let out = criterion.forward(&scores, &label).unwrap();
        let value = out.item().unwrap();
        assert!(
            value.abs() < 1e-7,
            "MultiMargin: expected 0, got {}",
            value
        );
    }

    /// Both non-target classes violate the margin and contribute to the loss.
    #[test]
    fn test_multi_margin_forward_active() {
        // input = [2, 1, 3], target = 1:
        //   loss = (1/3) * (max(0, 1 - 1 + 2) + max(0, 1 - 1 + 3))
        //        = (1/3) * (2 + 3) = 5/3
        let scores = leaf_2d(&[2.0, 1.0, 3.0], &[1, 3]);
        let label = target_vec(&[1.0]);
        let criterion = MultiMarginLoss::default();
        let out = criterion.forward(&scores, &label).unwrap();
        let want = 5.0 / 3.0;
        let value = out.item().unwrap();
        assert!(
            (value - want).abs() < 1e-7,
            "MultiMargin active: expected {}, got {}",
            want,
            value
        );
    }

    /// With `p = 2` each active hinge term is squared before averaging.
    #[test]
    fn test_multi_margin_p2() {
        // input = [2, 1, 3], target = 1, p = 2:
        //   loss = (1/3) * (max(0, 1 - 1 + 2)^2 + max(0, 1 - 1 + 3)^2)
        //        = (1/3) * (4 + 9) = 13/3
        let scores = leaf_2d(&[2.0, 1.0, 3.0], &[1, 3]);
        let label = target_vec(&[1.0]);
        let criterion = MultiMarginLoss::new(Reduction::Mean, 2, 1.0);
        let out = criterion.forward(&scores, &label).unwrap();
        let want = 13.0 / 3.0;
        let value = out.item().unwrap();
        assert!(
            (value - want).abs() < 1e-7,
            "MultiMargin p=2: expected {}, got {}",
            want,
            value
        );
    }

    /// Gradient of `MultiMarginLoss` (p = 1, sum reduction) when both
    /// non-target hinges are active.
    #[test]
    fn test_multi_margin_backward() {
        let scores = leaf_2d(&[2.0, 1.0, 3.0], &[1, 3]);
        let label = target_vec(&[1.0]);
        let criterion = MultiMarginLoss::new(Reduction::Sum, 1, 1.0);
        let loss_value = criterion.forward(&scores, &label).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = scores.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // For p = 1, every active non-target class j gets +1/C and the target
        // class gets minus the sum of those contributions:
        //   j = 0: hinge = max(0, 1 - 1 + 2) = 2 > 0 => grad[0] = 1/3
        //   j = 2: hinge = max(0, 1 - 1 + 3) = 3 > 0 => grad[2] = 1/3
        //   target: grad[1] = -(1/3 + 1/3) = -2/3
        let one_third = 1.0 / 3.0;
        let neg_two_thirds = -2.0 / 3.0;
        assert!(
            (got[0] - one_third).abs() < 1e-7,
            "MultiMargin grad[0]: expected 1/3, got {}",
            got[0]
        );
        assert!(
            (got[1] - neg_two_thirds).abs() < 1e-7,
            "MultiMargin grad[1]: expected -2/3, got {}",
            got[1]
        );
        assert!(
            (got[2] - one_third).abs() < 1e-7,
            "MultiMargin grad[2]: expected 1/3, got {}",
            got[2]
        );
    }

    /// A 1-D input (instead of 2-D) must be rejected by `MultiMarginLoss`.
    #[test]
    fn test_multi_margin_wrong_shape() {
        let flat_scores = leaf_vec(&[1.0, 2.0, 3.0]);
        let label = target_vec(&[1.0]);
        let criterion = MultiMarginLoss::default();
        let outcome = criterion.forward(&flat_scores, &label);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // MultiLabelSoftMarginLoss
    // -----------------------------------------------------------------------

    /// Forward value of `MultiLabelSoftMarginLoss` for one sample, two classes.
    #[test]
    fn test_multi_label_soft_margin_forward() {
        // B=1, C=2, input = [2.0, -1.0], target = [1.0, 0.0].
        // Per-class BCE-with-logits, max(x, 0) - x * t + log(1 + exp(-|x|)):
        //   c=0: 2 - 2 * 1 + log(1 + exp(-2)) = log(1 + exp(-2))
        //   c=1: 0 - (-1) * 0 + log(1 + exp(-1)) = log(1 + exp(-1))
        // The sample loss averages the two class terms.
        let logits = leaf_2d(&[2.0, -1.0], &[1, 2]);
        let labels = Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let criterion = MultiLabelSoftMarginLoss::default();
        let out = criterion.forward(&logits, &labels).unwrap();

        let log1p_exp_neg = |z: f64| (1.0 + (-z).exp()).ln();
        let expected = (log1p_exp_neg(2.0) + log1p_exp_neg(1.0)) / 2.0;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-6,
            "MultiLabelSoftMargin: expected {}, got {}",
            expected,
            value
        );
    }

    /// Gradient of `MultiLabelSoftMarginLoss` at zero logits.
    #[test]
    fn test_multi_label_soft_margin_backward() {
        let logits = leaf_2d(&[0.0, 0.0], &[1, 2]);
        let labels = Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], false)
            .unwrap();
        let criterion = MultiLabelSoftMarginLoss::new(Reduction::Sum);
        let loss_value = criterion.forward(&logits, &labels).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = logits.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // sigmoid(0) = 0.5, and each class term is divided by C = 2:
        //   grad[0] = (0.5 - 1) / 2 = -0.25
        //   grad[1] = (0.5 - 0) / 2 =  0.25
        assert!(
            (got[0] - (-0.25)).abs() < 1e-6,
            "MLSM backward[0]: expected -0.25, got {}",
            got[0]
        );
        assert!(
            (got[1] - 0.25).abs() < 1e-6,
            "MLSM backward[1]: expected 0.25, got {}",
            got[1]
        );
    }

    /// A target with a different class count than the input must be rejected.
    #[test]
    fn test_multi_label_soft_margin_shape_mismatch() {
        let logits = leaf_2d(&[1.0, 2.0], &[1, 2]);
        // The target is [1, 3] while the input is [1, 2].
        let wide_labels =
            Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0, 0.0]), vec![1, 3], false)
                .unwrap();
        let criterion = MultiLabelSoftMarginLoss::default();
        let outcome = criterion.forward(&logits, &wide_labels);
        assert!(outcome.is_err());
    }

    // -----------------------------------------------------------------------
    // HingeEmbeddingLoss
    // -----------------------------------------------------------------------

    /// Mean-reduced `HingeEmbeddingLoss` with one positive and one inactive
    /// negative sample.
    #[test]
    fn test_hinge_embedding_forward_mean() {
        // input = [0.5, 2.0], y = [1.0, -1.0], margin = 1.0:
        //   loss[0] = 0.5                    (y = 1: the input itself)
        //   loss[1] = max(0, 1.0 - 2.0) = 0  (y = -1: input beyond the margin)
        //   mean    = 0.25
        let distances = leaf_vec(&[0.5, 2.0]);
        let signs = target_vec(&[1.0, -1.0]);
        let criterion = HingeEmbeddingLoss::default();
        let out = criterion.forward(&distances, &signs).unwrap();
        let value = out.item().unwrap();
        assert!(
            (value - 0.25).abs() < 1e-7,
            "HingeEmbedding mean: expected 0.25, got {}",
            value
        );
    }

    /// A negative sample inside the margin must contribute `margin - x`.
    #[test]
    fn test_hinge_embedding_negative_active() {
        // input = [0.3], y = [-1.0], margin = 1.0 => max(0, 1.0 - 0.3) = 0.7.
        let distances = leaf_vec(&[0.3]);
        let signs = target_vec(&[-1.0]);
        let criterion = HingeEmbeddingLoss::new(Reduction::Mean, 1.0);
        let out = criterion.forward(&distances, &signs).unwrap();
        let value = out.item().unwrap();
        assert!(
            (value - 0.7).abs() < 1e-7,
            "HingeEmbedding active: expected 0.7, got {}",
            value
        );
    }

    /// Gradient of mean-reduced `HingeEmbeddingLoss` for a positive sample
    /// and an active negative sample.
    #[test]
    fn test_hinge_embedding_backward() {
        let distances = leaf_vec(&[0.5, 0.3]);
        let signs = target_vec(&[1.0, -1.0]);
        let criterion = HingeEmbeddingLoss::new(Reduction::Mean, 1.0);
        let loss_value = criterion.forward(&distances, &signs).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = distances.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        // Both entries are divided by n = 2:
        //   y =  1                   => grad[0] =  1 / 2 =  0.5
        //   y = -1, margin - x = 0.7 => grad[1] = -1 / 2 = -0.5
        assert!(
            (got[0] - 0.5).abs() < 1e-7,
            "HingeEmb backward[0]: expected 0.5, got {}",
            got[0]
        );
        assert!(
            (got[1] - (-0.5)).abs() < 1e-7,
            "HingeEmb backward[1]: expected -0.5, got {}",
            got[1]
        );
    }

    /// Input and label tensors of different lengths must be rejected.
    #[test]
    fn test_hinge_embedding_shape_mismatch() {
        let distances = leaf_vec(&[0.5, 0.3]);
        let short_signs = target_vec(&[1.0]);
        let criterion = HingeEmbeddingLoss::default();
        let outcome = criterion.forward(&distances, &short_signs);
        assert!(outcome.is_err());
    }

    /// Exercises the `Sum` and `None` reductions on the same inputs.
    #[test]
    fn test_hinge_embedding_all_reductions() {
        let distances = leaf_vec(&[0.5, 0.3]);
        let signs = target_vec(&[1.0, -1.0]);
        // With margin = 1.0 the per-element losses are [0.5, 0.7].

        // Sum reduction: 0.5 + 0.7 = 1.2.
        let sum_criterion = HingeEmbeddingLoss::new(Reduction::Sum, 1.0);
        let summed = sum_criterion.forward(&distances, &signs).unwrap();
        let total = summed.item().unwrap();
        assert!(
            (total - 1.2).abs() < 1e-7,
            "HingeEmb sum: expected 1.2, got {}",
            total
        );

        // No reduction: the output keeps one entry per input element.
        let none_criterion = HingeEmbeddingLoss::new(Reduction::None, 1.0);
        let unreduced = none_criterion.forward(&distances, &signs).unwrap();
        assert_eq!(unreduced.shape(), &[2]);
        let values = unreduced.data().unwrap();
        assert!((values[0] - 0.5).abs() < 1e-7);
        assert!((values[1] - 0.7).abs() < 1e-7);
    }

    // -----------------------------------------------------------------------
    // GaussianNLLLoss
    // -----------------------------------------------------------------------

    /// Default `GaussianNLLLoss` forward value, compared against a mean of
    /// the hand-computed per-element terms.
    #[test]
    fn test_gaussian_nll_forward_mean() {
        // input = [1.0, 2.0], target = [1.5, 2.5], var = [1.0, 2.0]:
        //   loss[0] = 0.5 * (ln(1.0) + (1.0 - 1.5)^2 / 1.0) = 0.5 * (0 + 0.25)
        //   loss[1] = 0.5 * (ln(2.0) + (2.0 - 2.5)^2 / 2.0) = 0.5 * (ln 2 + 0.125)
        let mean_pred = leaf_vec(&[1.0, 2.0]);
        let observed = target_vec(&[1.5, 2.5]);
        let variance = target_vec(&[1.0, 2.0]);
        let criterion = GaussianNLLLoss::default();
        let out = criterion.forward(&mean_pred, &observed, &variance).unwrap();

        let first_term = 0.5 * (0.0 + 0.25);
        let second_term = 0.5 * (2.0_f64.ln() + 0.125);
        let expected = (first_term + second_term) / 2.0;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-7,
            "GaussianNLL mean: expected {}, got {}",
            expected,
            value
        );
    }

    /// Sum-reduced `GaussianNLLLoss` must add the per-element terms.
    #[test]
    fn test_gaussian_nll_forward_sum() {
        let mean_pred = leaf_vec(&[1.0, 2.0]);
        let observed = target_vec(&[1.5, 2.5]);
        let variance = target_vec(&[1.0, 2.0]);
        let criterion = GaussianNLLLoss::new(Reduction::Sum, false, 1e-6);
        let out = criterion.forward(&mean_pred, &observed, &variance).unwrap();

        // Per-element terms: 0.5 * (ln(var) + diff^2 / var).
        let first_term = 0.5 * (0.0 + 0.25);
        let second_term = 0.5 * (2.0_f64.ln() + 0.125);
        let expected = first_term + second_term;
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-7,
            "GaussianNLL sum: expected {}, got {}",
            expected,
            value
        );
    }

    /// `GaussianNLLLoss` with `Reduction::None` must expose each element's
    /// loss and keep the input shape.
    #[test]
    fn test_gaussian_nll_forward_none() {
        let mean_pred = leaf_vec(&[1.0, 2.0]);
        let observed = target_vec(&[1.5, 2.5]);
        let variance = target_vec(&[1.0, 2.0]);
        let criterion = GaussianNLLLoss::new(Reduction::None, false, 1e-6);
        let per_element = criterion.forward(&mean_pred, &observed, &variance).unwrap();
        assert_eq!(per_element.shape(), &[2]);

        let values = per_element.data().unwrap();
        // Per-element terms: 0.5 * (ln(var) + diff^2 / var).
        let want_first = 0.5 * (0.0 + 0.25);
        let want_second = 0.5 * (2.0_f64.ln() + 0.125);
        assert!(
            (values[0] - want_first).abs() < 1e-7,
            "GaussianNLL none[0]: expected {}, got {}",
            want_first,
            values[0]
        );
        assert!(
            (values[1] - want_second).abs() < 1e-7,
            "GaussianNLL none[1]: expected {}, got {}",
            want_second,
            values[1]
        );
    }

    /// With `full = true` the loss gains a constant 0.5 * ln(2 * pi) per element.
    #[test]
    fn test_gaussian_nll_full_mode() {
        // input == target and var = 1, so only the constant term remains:
        //   loss = 0.5 * (ln(1) + 0 + ln(2 * pi)) = 0.5 * ln(2 * pi)
        let mean_pred = leaf_vec(&[0.0]);
        let observed = target_vec(&[0.0]);
        let variance = target_vec(&[1.0]);
        let criterion = GaussianNLLLoss::new(Reduction::Mean, true, 1e-6);
        let out = criterion.forward(&mean_pred, &observed, &variance).unwrap();

        let two_pi = 2.0 * std::f64::consts::PI;
        let expected = 0.5 * two_pi.ln();
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-7,
            "GaussianNLL full: expected {}, got {}",
            expected,
            value
        );
    }

    /// Gradient of `GaussianNLLLoss` with respect to the predicted mean.
    #[test]
    fn test_gaussian_nll_backward_input() {
        // d(loss)/d(input) = (input - target) / var.
        // input = 2.0, target = 1.0, var = 4.0 => 1.0 / 4.0 = 0.25 (mean, n = 1).
        let mean_pred = leaf_vec(&[2.0]);
        let observed = target_vec(&[1.0]);
        let variance = target_vec(&[4.0]);
        let criterion = GaussianNLLLoss::new(Reduction::Mean, false, 1e-6);
        let loss_value = criterion.forward(&mean_pred, &observed, &variance).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = mean_pred.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        let want = (2.0 - 1.0) / 4.0;
        assert!(
            (got[0] - want).abs() < 1e-7,
            "GaussianNLL backward input: expected {}, got {}",
            want,
            got[0]
        );
    }

    /// Gradient of `GaussianNLLLoss` with respect to the variance tensor.
    #[test]
    fn test_gaussian_nll_backward_var() {
        // d(loss)/d(var) = 0.5 * (1 / var - diff^2 / var^2).
        // input = 2.0, target = 1.0, var = 4.0
        //   => 0.5 * (1/4 - 1/16) = 0.5 * (0.25 - 0.0625) = 0.09375
        let mean_pred = leaf_vec(&[2.0]);
        let observed = target_vec(&[1.0]);
        // Built directly with the trailing flag set to `true`, so that
        // `grad()` can be read back from this tensor after the backward pass.
        let variance = Tensor::from_storage(TensorStorage::cpu(vec![4.0]), vec![1], true).unwrap();
        let criterion = GaussianNLLLoss::new(Reduction::Mean, false, 1e-6);
        let loss_value = criterion.forward(&mean_pred, &observed, &variance).unwrap();
        backward(&loss_value).unwrap();

        let grad_tensor = variance.grad().unwrap().unwrap();
        let got = grad_tensor.data().unwrap();
        let want = 0.5 * (1.0 / 4.0 - 1.0 / 16.0);
        assert!(
            (got[0] - want).abs() < 1e-7,
            "GaussianNLL backward var: expected {}, got {}",
            want,
            got[0]
        );
    }

    /// A zero variance is expected to be clamped up to `eps` before use.
    #[test]
    fn test_gaussian_nll_eps_clamp() {
        let mean_pred = leaf_vec(&[1.0]);
        let observed = target_vec(&[1.0]);
        let zero_variance = target_vec(&[0.0]);
        let eps = 1e-6;
        let criterion = GaussianNLLLoss::new(Reduction::Mean, false, eps);
        let out = criterion.forward(&mean_pred, &observed, &zero_variance).unwrap();

        // input == target, so the squared-error term vanishes and only
        // 0.5 * ln(eps) remains.
        let expected = 0.5 * eps.ln();
        let value = out.item().unwrap();
        assert!(
            (value - expected).abs() < 1e-5,
            "GaussianNLL eps clamp: expected {}, got {}",
            expected,
            value
        );
    }

    #[test]
    fn test_gaussian_nll_shape_mismatch() {
        // The target has one element while input and var have two, so
        // forward must report an error instead of producing a loss.
        let criterion = GaussianNLLLoss::default();
        let prediction = leaf_vec(&[1.0, 2.0]);
        let short_target = target_vec(&[1.0]);
        let variance = target_vec(&[1.0, 1.0]);

        let outcome = criterion.forward(&prediction, &short_target, &variance);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_gaussian_nll_var_shape_mismatch() {
        // Input and target agree (two elements each) but var has only one
        // element; forward must report an error.
        let criterion = GaussianNLLLoss::default();
        let prediction = leaf_vec(&[1.0, 2.0]);
        let observed = target_vec(&[1.0, 2.0]);
        let short_variance = target_vec(&[1.0]);

        let outcome = criterion.forward(&prediction, &observed, &short_variance);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_gaussian_nll_zero_loss() {
        // With input == target and var == 1 both terms vanish:
        //   loss = 0.5 * (ln(1) + 0 / 1) = 0.5 * (0 + 0) = 0
        let criterion = GaussianNLLLoss::default();
        let prediction = leaf_vec(&[1.0, 2.0]);
        let observed = target_vec(&[1.0, 2.0]);
        let unit_variance = target_vec(&[1.0, 1.0]);

        let nll = criterion
            .forward(&prediction, &observed, &unit_variance)
            .unwrap();
        let value = nll.item().unwrap();
        assert!(
            value.abs() < 1e-10,
            "GaussianNLL zero loss: expected ~0, got {}",
            value
        );
    }

    // -----------------------------------------------------------------------
    // CosineEmbeddingLoss backward
    // -----------------------------------------------------------------------

    #[test]
    fn test_cosine_embedding_backward_positive() {
        // Positive pair (y = 1): loss = 1 - cos(x1, x2).
        // x1 = [3, 4], x2 = [4, 3]  =>  ||x1|| = ||x2|| = 5, dot = 24,
        // cos = 24/25 = 0.96, loss = 0.04.
        let criterion = CosineEmbeddingLoss::new(Reduction::Sum, 0.0);
        let first = leaf_2d(&[3.0, 4.0], &[1, 2]);
        let second =
            Tensor::from_storage(TensorStorage::cpu(vec![4.0, 3.0]), vec![1, 2], true).unwrap();
        let label = target_vec(&[1.0]);
        let out = criterion.forward_pair(&first, &second, &label).unwrap();

        // Forward value.
        let cos_val = 24.0 / 25.0;
        let expected_loss = 1.0 - cos_val;
        let loss_value = out.item().unwrap();
        assert!(
            (loss_value - expected_loss).abs() < 1e-7,
            "CosEmb positive: expected {}, got {}",
            expected_loss,
            loss_value
        );

        backward(&out).unwrap();

        // d(loss)/d(x1_f) = -(x2_f/(||x1||*||x2||) - cos*x1_f/||x1||^2)
        //   paired with component 4 of the other vector:
        //     -(4/25 - 0.96*3/25) = -(0.16 - 0.1152) = -0.0448
        //   paired with component 3 of the other vector:
        //     -(3/25 - 0.96*4/25) = -(0.12 - 0.1536) =  0.0336
        // The two inputs mirror each other, so x2's gradient is x1's with the
        // components swapped.
        let grad_against_four = -(4.0 / 25.0 - cos_val * 3.0 / 25.0);
        let grad_against_three = -(3.0 / 25.0 - cos_val * 4.0 / 25.0);

        let first_grad = first.grad().unwrap().unwrap();
        let g1 = first_grad.data().unwrap();
        assert!(
            (g1[0] - grad_against_four).abs() < 1e-7,
            "CosEmb backward x1[0]: expected {}, got {}",
            grad_against_four,
            g1[0]
        );
        assert!(
            (g1[1] - grad_against_three).abs() < 1e-7,
            "CosEmb backward x1[1]: expected {}, got {}",
            grad_against_three,
            g1[1]
        );

        let second_grad = second.grad().unwrap().unwrap();
        let g2 = second_grad.data().unwrap();
        assert!(
            (g2[0] - grad_against_three).abs() < 1e-7,
            "CosEmb backward x2[0]: expected {}, got {}",
            grad_against_three,
            g2[0]
        );
        assert!(
            (g2[1] - grad_against_four).abs() < 1e-7,
            "CosEmb backward x2[1]: expected {}, got {}",
            grad_against_four,
            g2[1]
        );
    }

    #[test]
    fn test_cosine_embedding_backward_negative_active() {
        // Negative pair (y = -1) with margin = 0.5 and identical inputs:
        // x1 = x2 = [1, 0]  =>  cos = 1.0, loss = max(0, 1.0 - 0.5) = 0.5,
        // so the hinge is active and the gradient is +d(cos)/d(x).
        let criterion = CosineEmbeddingLoss::new(Reduction::Sum, 0.5);
        let first = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let second =
            Tensor::from_storage(TensorStorage::cpu(vec![1.0, 0.0]), vec![1, 2], true).unwrap();
        let label = target_vec(&[-1.0]);

        let out = criterion.forward_pair(&first, &second, &label).unwrap();
        let loss_value = out.item().unwrap();
        assert!(
            (loss_value - 0.5).abs() < 1e-7,
            "CosEmb negative active: expected 0.5, got {}",
            loss_value
        );
        backward(&out).unwrap();

        // With parallel unit vectors the cosine sits at its maximum, so its
        // derivative is zero in both components:
        //   d(cos)/d(x1_0) = x2_0/(||x1||*||x2||) - cos*x1_0/||x1||^2 = 1/1 - 1*1/1 = 0
        //   d(cos)/d(x1_1) = x2_1/(||x1||*||x2||) - cos*x1_1/||x1||^2 = 0/1 - 1*0/1 = 0
        let first_grad = first.grad().unwrap().unwrap();
        let g1 = first_grad.data().unwrap();
        let magnitude_0 = g1[0].abs();
        let magnitude_1 = g1[1].abs();
        assert!(
            magnitude_0 < 1e-7,
            "CosEmb neg backward x1[0]: expected 0, got {}",
            g1[0]
        );
        assert!(
            magnitude_1 < 1e-7,
            "CosEmb neg backward x1[1]: expected 0, got {}",
            g1[1]
        );
    }

    #[test]
    fn test_cosine_embedding_backward_negative_inactive() {
        // Negative pair (y = -1) with margin = 0.0 and orthogonal inputs:
        // x1 = [1, 0], x2 = [0, 1]  =>  cos = 0, loss = max(0, 0 - 0) = 0.
        // The hinge is not active, so no gradient should reach x1.
        let criterion = CosineEmbeddingLoss::new(Reduction::Sum, 0.0);
        let first = leaf_2d(&[1.0, 0.0], &[1, 2]);
        let second =
            Tensor::from_storage(TensorStorage::cpu(vec![0.0, 1.0]), vec![1, 2], true).unwrap();
        let label = target_vec(&[-1.0]);

        let out = criterion.forward_pair(&first, &second, &label).unwrap();
        backward(&out).unwrap();

        let first_grad = first.grad().unwrap().unwrap();
        let g1 = first_grad.data().unwrap();
        let both_zero = g1[0].abs() < 1e-7 && g1[1].abs() < 1e-7;
        assert!(
            both_zero,
            "CosEmb inactive neg: expected zero grad, got {:?}",
            g1
        );
    }

    // -----------------------------------------------------------------------
    // CTCLoss backward
    // -----------------------------------------------------------------------

    #[test]
    fn test_ctc_backward_gradients_sum_to_zero() {
        // Checks a structural invariant of the CTC gradient on a single
        // sequence (T = 3 timesteps, batch = 1, C = 3 classes, blank = 0,
        // target = [1, 2], Reduction::Sum).
        //
        // NOTE: the test name is historical. A "sums to zero per timestep"
        // property only holds for the gradient w.r.t. the logits *before* a
        // log-softmax. Here the gradient is taken w.r.t. `log_probs` directly.
        // Assuming forward returns the plain negative log-likelihood
        // -ln p(target | log_probs), every valid alignment visits exactly one
        // class per timestep, so
        //     sum_c d(loss)/d(log_probs[t, c]) = -1   for every t.
        // The previous version of this test only checked finiteness, which
        // would not have caught a mis-scaled or partially missing gradient.
        let steps = 3;
        let classes = 3;
        let lp = vec![
            -0.1_f64, -5.0, -5.0, // t=0: blank likely
            -5.0, -0.1, -5.0, // t=1: class 1 likely
            -5.0, -5.0, -0.1, // t=2: class 2 likely
        ];

        let log_probs = Tensor::from_storage(TensorStorage::cpu(lp), vec![3, 1, 3], true).unwrap();
        let targets = target_vec(&[1.0, 2.0]);
        let loss = CTCLoss::new(Reduction::Sum, 0, false);
        let out = loss.forward(&log_probs, &targets, &[3], &[2]).unwrap();
        backward(&out).unwrap();

        let grad = log_probs.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for t in 0..steps {
            let mut step_sum = 0.0_f64;
            for c in 0..classes {
                // Batch size is 1, so the flat offset of [t, 0, c] is t * C + c.
                let i = t * classes + c;
                assert!(g[i].is_finite(), "CTC grad[{}] is not finite: {}", i, g[i]);
                step_sum += g[i];
            }
            assert!(
                (step_sum - (-1.0)).abs() < 1e-6,
                "CTC grad at t={} should sum to -1 over classes, got {}",
                t,
                step_sum
            );
        }
    }

    #[test]
    fn test_ctc_backward_empty_target() {
        // Empty target: the only valid alignment emits blank at every
        // timestep, so loss = -sum_t log_probs[t, blank]. The gradient is
        // therefore -1 at the blank class of each timestep and 0 everywhere
        // else.
        //
        // Every entry of the gradient is checked. The earlier version only
        // looked at one non-blank position, leaving the remaining three
        // unverified.
        let steps = 2;
        let classes = 3;
        let lp = vec![-0.5_f64, -10.0, -10.0, -0.3, -10.0, -10.0];
        let log_probs = Tensor::from_storage(TensorStorage::cpu(lp), vec![2, 1, 3], true).unwrap();
        let targets = target_vec(&[]);
        let loss = CTCLoss::new(Reduction::Sum, 0, false);
        let out = loss.forward(&log_probs, &targets, &[2], &[0]).unwrap();
        backward(&out).unwrap();

        let grad = log_probs.grad().unwrap().unwrap();
        let g = grad.data().unwrap();
        for t in 0..steps {
            // Batch size is 1, so timestep t starts at flat offset t * C;
            // blank is class 0.
            let row = t * classes;
            assert!(
                (g[row] - (-1.0)).abs() < 1e-7,
                "CTC empty target grad[t={},blank]: expected -1, got {}",
                t,
                g[row]
            );
            // Non-blank positions should be 0.
            for c in 1..classes {
                assert!(
                    g[row + c].abs() < 1e-7,
                    "CTC empty target grad[t={},c={}]: expected 0, got {}",
                    t,
                    c,
                    g[row + c]
                );
            }
        }
    }

    #[test]
    fn test_ctc_backward_no_grad() {
        // A forward pass executed inside `no_grad` must produce an output
        // without a backward node, even though the input tensor itself is
        // created with its trailing flag set to `true`.
        ferrotorch_core::no_grad(|| {
            let criterion = CTCLoss::default();
            let flat_log_probs = vec![-0.5_f64; 3 * 2];
            let log_probs =
                Tensor::from_storage(TensorStorage::cpu(flat_log_probs), vec![3, 1, 2], true)
                    .unwrap();
            let labels = target_vec(&[1.0]);

            let out = criterion
                .forward(&log_probs, &labels, &[3], &[1])
                .unwrap();
            let has_no_grad_fn = out.grad_fn().is_none();
            assert!(
                has_no_grad_fn,
                "CTCLoss inside no_grad should not attach grad_fn"
            );
        });
    }

    #[test]
    fn test_ctc_backward_numerical_gradient() {
        // Compares the gradient produced by backward() against a central
        // finite-difference estimate, one log-prob entry at a time.
        let base_lp = vec![
            -0.5_f64, -1.0, -2.0, // t=0
            -1.0, -0.5, -2.0, // t=1
            -2.0, -1.0, -0.5, // t=2
        ];
        let eps = 1e-5;

        // Analytical gradient from a single tracked forward/backward pass.
        let tracked =
            Tensor::from_storage(TensorStorage::cpu(base_lp.clone()), vec![3, 1, 3], true).unwrap();
        let labels = target_vec(&[1.0, 2.0]);
        let criterion = CTCLoss::new(Reduction::Sum, 0, false);
        let tracked_loss = criterion.forward(&tracked, &labels, &[3], &[2]).unwrap();
        backward(&tracked_loss).unwrap();

        let tracked_grad = tracked.grad().unwrap().unwrap();
        let analytical = tracked_grad.data_vec().unwrap();

        // Evaluates the loss for an arbitrary set of log-probs. The probe
        // tensor is created with the trailing flag `false`, and a fresh
        // criterion and target tensor are built for every evaluation.
        let loss_at = |values: Vec<f64>| -> f64 {
            let probe =
                Tensor::from_storage(TensorStorage::cpu(values), vec![3, 1, 3], false).unwrap();
            let probe_labels = target_vec(&[1.0, 2.0]);
            let probe_loss = CTCLoss::new(Reduction::Sum, 0, false)
                .forward(&probe, &probe_labels, &[3], &[2])
                .unwrap();
            probe_loss.item().unwrap()
        };

        // Central differences: (f(x + eps) - f(x - eps)) / (2 * eps).
        for idx in 0..base_lp.len() {
            let mut nudged_up = base_lp.clone();
            nudged_up[idx] += eps;
            let loss_up = loss_at(nudged_up);

            let mut nudged_down = base_lp.clone();
            nudged_down[idx] -= eps;
            let loss_down = loss_at(nudged_down);

            let numerical = (loss_up - loss_down) / (2.0 * eps);
            let mismatch = (analytical[idx] - numerical).abs();
            assert!(
                mismatch < 1e-4,
                "CTC grad[{}]: analytical={}, numerical={}",
                idx,
                analytical[idx],
                numerical,
            );
        }
    }
}