use std::any::TypeId;
use std::sync::Arc;
use crate::autograd::no_grad::is_grad_enabled;
use crate::dtype::Float;
use crate::error::{FerrotorchError, FerrotorchResult};
use crate::gpu_dispatch::gpu_backend;
use crate::ops::elementwise::unary_map;
use crate::storage::TensorStorage;
use crate::tensor::{GradFn, Tensor};
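/// Returns true when the element type `T` is `f32`, the only dtype the GPU kernels in this module accept.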
#[inline]
fn is_f32<T: Float>() -> bool {
TypeId::of::<T>() == TypeId::of::<f32>()
}
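/// Backward node for ReLU: dL/dx = dL/dy where x > 0, and 0 elsewhere.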
#[derive(Debug)]
pub struct ReluBackward<T: Float> {
input: Tensor<T>,
}
impl<T: Float> ReluBackward<T> {
pub fn new(input: Tensor<T>) -> Self {
Self { input }
}
}
impl<T: Float> GradFn<T> for ReluBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
if grad_output.is_cuda() && is_f32::<T>() {
let backend = gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
let result_h = backend.relu_backward_f32(
grad_output.gpu_handle()?,
self.input.gpu_handle()?,
)?;
let grad_input = Tensor::from_storage(
TensorStorage::gpu(result_h),
self.input.shape().to_vec(),
false,
)?;
return Ok(vec![Some(grad_input)]);
}
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let zero = <T as num_traits::Zero>::zero();
let result: Vec<T> = input_data
.iter()
.zip(grad_data.iter())
.map(|(&x, &g)| if x > zero { g } else { zero })
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"ReluBackward"
}
}
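/// Backward node for sigmoid, using the saved output s = sigmoid(x): dL/dx = dL/dy * s * (1 - s).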
#[derive(Debug)]
pub struct SigmoidBackward<T: Float> {
input: Tensor<T>,
output: Tensor<T>,
}
impl<T: Float> SigmoidBackward<T> {
pub fn new(input: Tensor<T>, output: Tensor<T>) -> Self {
Self { input, output }
}
}
impl<T: Float> GradFn<T> for SigmoidBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_output = if self.output.is_cuda() { self.output.cpu()? } else { self.output.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let s_data = cpu_output.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let result: Vec<T> = s_data
.iter()
.zip(grad_data.iter())
.map(|(&s, &g)| g * s * (one - s))
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"SigmoidBackward"
}
}
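/// Backward node for tanh, using the saved output t = tanh(x): dL/dx = dL/dy * (1 - t^2).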
#[derive(Debug)]
pub struct TanhBackward<T: Float> {
input: Tensor<T>,
output: Tensor<T>,
}
impl<T: Float> TanhBackward<T> {
pub fn new(input: Tensor<T>, output: Tensor<T>) -> Self {
Self { input, output }
}
}
impl<T: Float> GradFn<T> for TanhBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_output = if self.output.is_cuda() { self.output.cpu()? } else { self.output.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let t_data = cpu_output.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let result: Vec<T> = t_data
.iter()
.zip(grad_data.iter())
.map(|(&t, &g)| g * (one - t * t))
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"TanhBackward"
}
}
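/// Selects the GELU formulation: the exact erf-based form, the tanh approximation, or the sigmoid approximation x * sigmoid(1.702 * x).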
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum GeluApproximate {
#[default]
None,
Tanh,
Sigmoid,
}
impl std::fmt::Display for GeluApproximate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
GeluApproximate::None => write!(f, "none"),
GeluApproximate::Tanh => write!(f, "tanh"),
GeluApproximate::Sigmoid => write!(f, "sigmoid"),
}
}
}
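/// Backward node for GELU; the derivative matches the `approximate` mode used in the forward pass.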
#[derive(Debug)]
pub struct GeluBackward<T: Float> {
input: Tensor<T>,
approximate: GeluApproximate,
}
impl<T: Float> GeluBackward<T> {
pub fn new(input: Tensor<T>, approximate: GeluApproximate) -> Self {
Self { input, approximate }
}
}
impl<T: Float> GradFn<T> for GeluBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
if self.approximate == GeluApproximate::Sigmoid
&& grad_output.is_cuda()
&& is_f32::<T>()
{
let backend = gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
let result_h = backend.gelu_backward_f32(
grad_output.gpu_handle()?,
self.input.gpu_handle()?,
)?;
let grad_input = Tensor::from_storage(
TensorStorage::gpu(result_h),
self.input.shape().to_vec(),
false,
)?;
return Ok(vec![Some(grad_input)]);
}
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let result: Vec<T> = match self.approximate {
GeluApproximate::None => {
let sqrt_2 = T::from(std::f64::consts::SQRT_2).unwrap();
let inv_sqrt_2pi = T::from(1.0 / (2.0 * std::f64::consts::PI).sqrt()).unwrap();
let half = T::from(0.5).unwrap();
input_data.iter().zip(grad_data.iter()).map(|(&x, &g)| {
let cdf = half * (one + erf_approx(x / sqrt_2));
let pdf = inv_sqrt_2pi * (-(x * x) / (one + one)).exp();
g * (cdf + x * pdf)
}).collect()
}
GeluApproximate::Tanh => {
let half = T::from(0.5).unwrap();
let sqrt_2_over_pi = T::from((2.0 / std::f64::consts::PI).sqrt()).unwrap();
let c = T::from(0.044715).unwrap();
let c3 = T::from(3.0 * 0.044715).unwrap();
input_data.iter().zip(grad_data.iter()).map(|(&x, &g)| {
let x3 = x * x * x;
let inner = sqrt_2_over_pi * (x + c * x3);
let tanh_inner = inner.tanh();
let dtanh = one - tanh_inner * tanh_inner;
let d_inner = sqrt_2_over_pi * (one + c3 * x * x);
g * (half * (one + tanh_inner) + half * x * dtanh * d_inner)
}).collect()
}
GeluApproximate::Sigmoid => {
let k = T::from(1.702).unwrap();
input_data.iter().zip(grad_data.iter()).map(|(&x, &g)| {
let s = one / (one + (-k * x).exp());
g * (s + k * x * s * (one - s))
}).collect()
}
};
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"GeluBackward"
}
}
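/// Polynomial approximation of the error function (Abramowitz & Stegun formula 7.1.26), accurate to roughly 1.5e-7.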
fn erf_approx<T: Float>(x: T) -> T {
let one = <T as num_traits::One>::one();
let zero = <T as num_traits::Zero>::zero();
let sign = if x < zero { -one } else { one };
let x = if x < zero { -x } else { x };
let p = T::from(0.3275911).unwrap();
let a1 = T::from(0.254829592).unwrap();
let a2 = T::from(-0.284496736).unwrap();
let a3 = T::from(1.421413741).unwrap();
let a4 = T::from(-1.453152027).unwrap();
let a5 = T::from(1.061405429).unwrap();
let t = one / (one + p * x);
let t2 = t * t;
let t3 = t2 * t;
let t4 = t3 * t;
let t5 = t4 * t;
let poly = a1 * t + a2 * t2 + a3 * t3 + a4 * t4 + a5 * t5;
sign * (one - poly * (-x * x).exp())
}
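/// Backward node for SiLU/Swish: with s = sigmoid(x), dL/dx = dL/dy * (s + x * s * (1 - s)).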
#[derive(Debug)]
pub struct SiluBackward<T: Float> {
input: Tensor<T>,
}
impl<T: Float> SiluBackward<T> {
pub fn new(input: Tensor<T>) -> Self {
Self { input }
}
}
impl<T: Float> GradFn<T> for SiluBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let result: Vec<T> = input_data
.iter()
.zip(grad_data.iter())
.map(|(&x, &g)| {
let s = one / (one + (-x).exp());
g * (s + x * s * (one - s))
})
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"SiluBackward"
}
}
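/// Backward node for softmax over the last dimension, using the saved output s: per row, dL/dx = s * (dL/dy - <dL/dy, s>).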
#[derive(Debug)]
pub struct SoftmaxBackward<T: Float> {
input: Tensor<T>,
output: Tensor<T>,
}
impl<T: Float> SoftmaxBackward<T> {
pub fn new(input: Tensor<T>, output: Tensor<T>) -> Self {
Self { input, output }
}
}
impl<T: Float> GradFn<T> for SoftmaxBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_output = if self.output.is_cuda() { self.output.cpu()? } else { self.output.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let s_data = cpu_output.data()?;
let grad_data = cpu_go.data()?;
let shape = self.output.shape();
if shape.is_empty() {
let zero = <T as num_traits::Zero>::zero();
let grad_input =
Tensor::from_storage(TensorStorage::cpu(vec![zero]), vec![], false)?;
return Ok(vec![Some(grad_input)]);
}
let last_dim = *shape.last().unwrap();
let outer = s_data.len() / last_dim.max(1);
let mut result = vec![<T as num_traits::Zero>::zero(); s_data.len()];
for i in 0..outer {
let base = i * last_dim;
let mut dot = <T as num_traits::Zero>::zero();
for j in 0..last_dim {
dot = dot + grad_data[base + j] * s_data[base + j];
}
for j in 0..last_dim {
result[base + j] = s_data[base + j] * (grad_data[base + j] - dot);
}
}
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"SoftmaxBackward"
}
}
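/// Backward node for log-softmax over the last dimension: per row, dL/dx = dL/dy - softmax(x) * sum(dL/dy).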
#[derive(Debug)]
pub struct LogSoftmaxBackward<T: Float> {
input: Tensor<T>,
softmax_output: Tensor<T>,
}
impl<T: Float> LogSoftmaxBackward<T> {
pub fn new(input: Tensor<T>, softmax_output: Tensor<T>) -> Self {
Self {
input,
softmax_output,
}
}
}
impl<T: Float> GradFn<T> for LogSoftmaxBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_sm = if self.softmax_output.is_cuda() { self.softmax_output.cpu()? } else { self.softmax_output.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let sm_data = cpu_sm.data()?;
let grad_data = cpu_go.data()?;
let shape = self.input.shape();
if shape.is_empty() {
let zero = <T as num_traits::Zero>::zero();
let grad_input =
Tensor::from_storage(TensorStorage::cpu(vec![zero]), vec![], false)?;
return Ok(vec![Some(grad_input)]);
}
let last_dim = *shape.last().unwrap();
let outer = sm_data.len() / last_dim.max(1);
let mut result = vec![<T as num_traits::Zero>::zero(); sm_data.len()];
for i in 0..outer {
let base = i * last_dim;
let mut sum_grad = <T as num_traits::Zero>::zero();
for j in 0..last_dim {
sum_grad = sum_grad + grad_data[base + j];
}
for j in 0..last_dim {
result[base + j] = grad_data[base + j] - sm_data[base + j] * sum_grad;
}
}
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"LogSoftmaxBackward"
}
}
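/// ReLU forward: max(x, 0). Uses the GPU kernel for f32 CUDA tensors, otherwise an element-wise CPU map.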
pub fn relu<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
if input.is_cuda() && is_f32::<T>() {
let backend = gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
let handle = backend.relu_f32(input.gpu_handle()?)?;
let storage = TensorStorage::gpu(handle);
let shape = input.shape().to_vec();
if is_grad_enabled() && input.requires_grad() {
let grad_fn = Arc::new(ReluBackward::new(input.clone()));
Tensor::from_operation(storage, shape, grad_fn)
} else {
Tensor::from_storage(storage, shape, false)
}
} else {
let zero = <T as num_traits::Zero>::zero();
let output = unary_map(input, |x| if x > zero { x } else { zero })?;
if is_grad_enabled() && input.requires_grad() {
let grad_fn = Arc::new(ReluBackward::new(input.clone()));
let (storage, shape) = output.into_storage_and_shape()?;
Tensor::from_operation(storage, shape, grad_fn)
} else {
Ok(output)
}
}
}
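/// Sigmoid forward: 1 / (1 + exp(-x)). For f32, exp is evaluated with the SIMD kernel; CUDA inputs are copied to the CPU and the result is moved back to the input device.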
pub fn sigmoid<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let cpu_input = if input.is_cuda() { input.cpu()? } else { input.clone() };
let output = if is_f32::<T>() {
let data = cpu_input.data()?;
let n = data.len();
// SAFETY: is_f32::<T>() guarantees T is exactly f32, so reinterpreting the slice is sound.
let inp: &[f32] = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
let neg: Vec<f32> = inp.iter().map(|&x| -x).collect();
let mut exp_out = vec![0.0f32; n];
ferray_ufunc::kernels::simd_f32::exp_f32(&neg, &mut exp_out);
let result: Vec<T> = exp_out.iter().map(|&e| T::from(1.0f32 / (1.0 + e)).unwrap()).collect();
Tensor::from_storage(TensorStorage::cpu(result), input.shape().to_vec(), false)?
} else {
let one = <T as num_traits::One>::one();
unary_map(&cpu_input, |x| one / (one + (-x).exp()))?
};
let device = input.device();
if is_grad_enabled() && input.requires_grad() {
let result = Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(SigmoidBackward::new(input.clone(), output.clone())),
)?;
if device.is_cuda() { result.to(device) } else { Ok(result) }
} else {
if device.is_cuda() { output.to(device) } else { Ok(output) }
}
}
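/// Tanh forward, computed element-wise via `unary_map`.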
pub fn tanh<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let output = unary_map(input, |x| x.tanh())?;
if is_grad_enabled() && input.requires_grad() {
let result = Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(TanhBackward::new(input.clone(), output.clone())),
)?;
Ok(result)
} else {
Ok(output)
}
}
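/// GELU forward with an explicit approximation mode. Only the sigmoid approximation has a GPU kernel (f32 CUDA tensors); everything else falls back to the element-wise `unary_map` path.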
pub fn gelu_with<T: Float>(
input: &Tensor<T>,
approximate: GeluApproximate,
) -> FerrotorchResult<Tensor<T>> {
if approximate == GeluApproximate::Sigmoid && input.is_cuda() && is_f32::<T>() {
if let Some(backend) = gpu_backend() {
let handle = backend.gelu_f32(input.gpu_handle()?)?;
return if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::gpu(handle),
input.shape().to_vec(),
Arc::new(GeluBackward::new(input.clone(), approximate)),
)
} else {
Tensor::from_storage(TensorStorage::gpu(handle), input.shape().to_vec(), false)
};
}
}
let one = <T as num_traits::One>::one();
let output = match approximate {
GeluApproximate::None => {
let sqrt_2 = T::from(std::f64::consts::SQRT_2).unwrap();
let half = T::from(0.5).unwrap();
unary_map(input, |x| {
x * half * (one + erf_approx(x / sqrt_2))
})?
}
GeluApproximate::Tanh => {
let half = T::from(0.5).unwrap();
let sqrt_2_over_pi = T::from((2.0 / std::f64::consts::PI).sqrt()).unwrap();
let c = T::from(0.044715).unwrap();
unary_map(input, |x| {
let inner = sqrt_2_over_pi * (x + c * x * x * x);
half * x * (one + inner.tanh())
})?
}
GeluApproximate::Sigmoid => {
let k = T::from(1.702).unwrap();
unary_map(input, |x| {
let s = one / (one + (-k * x).exp());
x * s
})?
}
};
if is_grad_enabled() && input.requires_grad() {
let (storage, shape) = output.into_storage_and_shape()?;
Tensor::from_operation(
storage,
shape,
Arc::new(GeluBackward::new(input.clone(), approximate)),
)
} else {
Ok(output)
}
}
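/// GELU forward using the default formulation (exact, erf-based).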
pub fn gelu<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
gelu_with(input, GeluApproximate::default())
}
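/// SiLU/Swish forward: x * sigmoid(x), computed element-wise.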
pub fn silu<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let one = <T as num_traits::One>::one();
let output = unary_map(input, |x| {
let s = one / (one + (-x).exp());
x * s
})?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(SiluBackward::new(input.clone())),
)
} else {
Ok(output)
}
}
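/// Softmax over the last dimension, with max-subtraction for numerical stability on the CPU path. f32 CUDA tensors use the GPU kernel.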
pub fn softmax<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let shape = input.shape().to_vec();
if input.is_cuda() && is_f32::<T>() {
if let Some(backend) = gpu_backend() {
let last_dim = *shape.last().unwrap_or(&1);
let rows = input.numel() / last_dim.max(1);
let handle = backend.softmax_f32(input.gpu_handle()?, rows, last_dim)?;
return if is_grad_enabled() && input.requires_grad() {
let cache_handle = backend.clone_buffer(&handle)?;
let output_cache = Tensor::from_storage(
TensorStorage::gpu(cache_handle),
shape.clone(),
false,
)?;
Tensor::from_operation(
TensorStorage::gpu(handle),
shape,
Arc::new(SoftmaxBackward::new(input.clone(), output_cache)),
)
} else {
Tensor::from_storage(TensorStorage::gpu(handle), shape, false)
};
}
}
let data = input.data()?;
let result = if shape.is_empty() {
vec![<T as num_traits::One>::one()]
} else {
let last_dim = *shape.last().unwrap();
let outer = data.len() / last_dim.max(1);
let mut out = vec![<T as num_traits::Zero>::zero(); data.len()];
for i in 0..outer {
let base = i * last_dim;
let mut max_val = data[base];
for j in 1..last_dim {
if data[base + j] > max_val {
max_val = data[base + j];
}
}
let mut sum_exp = <T as num_traits::Zero>::zero();
for j in 0..last_dim {
let e = (data[base + j] - max_val).exp();
out[base + j] = e;
sum_exp = sum_exp + e;
}
for j in 0..last_dim {
out[base + j] = out[base + j] / sum_exp;
}
}
out
};
let output =
Tensor::from_storage(TensorStorage::cpu(result), shape, false)?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(SoftmaxBackward::new(input.clone(), output.clone())),
)
} else {
Ok(output)
}
}
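/// Log-softmax over the last dimension, computed on the CPU. The plain softmax is computed alongside and saved for the backward pass.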
pub fn log_softmax<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let cpu_input = if input.is_cuda() { input.cpu()? } else { input.clone() };
let data = cpu_input.data()?;
let shape = input.shape();
let (sm_vec, lsm_vec) = if shape.is_empty() {
(
vec![<T as num_traits::One>::one()],
vec![<T as num_traits::Zero>::zero()],
)
} else {
let last_dim = *shape.last().unwrap();
let outer = data.len() / last_dim.max(1);
let mut sm = vec![<T as num_traits::Zero>::zero(); data.len()];
let mut lsm = vec![<T as num_traits::Zero>::zero(); data.len()];
for i in 0..outer {
let base = i * last_dim;
let mut max_val = data[base];
for j in 1..last_dim {
if data[base + j] > max_val {
max_val = data[base + j];
}
}
let mut sum_exp = <T as num_traits::Zero>::zero();
for j in 0..last_dim {
let e = (data[base + j] - max_val).exp();
sm[base + j] = e;
sum_exp = sum_exp + e;
}
let log_sum = sum_exp.ln();
for j in 0..last_dim {
sm[base + j] = sm[base + j] / sum_exp;
lsm[base + j] = data[base + j] - max_val - log_sum;
}
}
(sm, lsm)
};
let softmax_tensor =
Tensor::from_storage(TensorStorage::cpu(sm_vec), shape.to_vec(), false)?;
let output =
Tensor::from_storage(TensorStorage::cpu(lsm_vec), shape.to_vec(), false)?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(LogSoftmaxBackward::new(input.clone(), softmax_tensor)),
)
} else {
Ok(output)
}
}
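/// Backward node for softplus: dL/dx = dL/dy * sigmoid(beta * x), passing the gradient through unchanged once beta * x exceeds the threshold.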
#[derive(Debug)]
pub struct SoftplusBackward<T: Float> {
input: Tensor<T>,
beta: f64,
threshold: f64,
}
impl<T: Float> SoftplusBackward<T> {
pub fn new(input: Tensor<T>, beta: f64, threshold: f64) -> Self {
Self {
input,
beta,
threshold,
}
}
}
impl<T: Float> GradFn<T> for SoftplusBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let beta = T::from(self.beta).unwrap();
let threshold = T::from(self.threshold).unwrap();
let result: Vec<T> = input_data
.iter()
.zip(grad_data.iter())
.map(|(&x, &g)| {
let bx = beta * x;
if bx > threshold {
g
} else {
let sig = one / (one + (-bx).exp());
g * sig
}
})
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"SoftplusBackward"
}
}
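/// Softplus forward: ln(1 + exp(beta * x)) / beta, switching to the identity once beta * x exceeds `threshold` to avoid overflow.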
pub fn softplus<T: Float>(
input: &Tensor<T>,
beta: f64,
threshold: f64,
) -> FerrotorchResult<Tensor<T>> {
let one = <T as num_traits::One>::one();
let beta_t = T::from(beta).unwrap();
let threshold_t = T::from(threshold).unwrap();
let output = unary_map(input, |x| {
let bx = beta_t * x;
if bx > threshold_t {
x
} else {
(one + bx.exp()).ln() / beta_t
}
})?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(SoftplusBackward::new(input.clone(), beta, threshold)),
)
} else {
Ok(output)
}
}
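/// Backward node for ELU: dL/dx = dL/dy for x > 0, and dL/dy * alpha * exp(x) otherwise.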
#[derive(Debug)]
pub struct EluBackward<T: Float> {
input: Tensor<T>,
alpha: f64,
}
impl<T: Float> EluBackward<T> {
pub fn new(input: Tensor<T>, alpha: f64) -> Self {
Self { input, alpha }
}
}
impl<T: Float> GradFn<T> for EluBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let zero = <T as num_traits::Zero>::zero();
let alpha = T::from(self.alpha).unwrap();
let result: Vec<T> = input_data
.iter()
.zip(grad_data.iter())
.map(|(&x, &g)| {
if x > zero {
g
} else {
g * alpha * x.exp()
}
})
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"EluBackward"
}
}
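/// ELU forward: x for x > 0, alpha * (exp(x) - 1) otherwise.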
pub fn elu<T: Float>(input: &Tensor<T>, alpha: f64) -> FerrotorchResult<Tensor<T>> {
let zero = <T as num_traits::Zero>::zero();
let one = <T as num_traits::One>::one();
let alpha_t = T::from(alpha).unwrap();
let output = unary_map(input, |x| {
if x > zero {
x
} else {
alpha_t * (x.exp() - one)
}
})?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(EluBackward::new(input.clone(), alpha)),
)
} else {
Ok(output)
}
}
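/// Backward node for Mish: with sp = softplus(x), t = tanh(sp), and sig = sigmoid(x), dL/dx = dL/dy * (t + x * sig * (1 - t^2)).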
#[derive(Debug)]
pub struct MishBackward<T: Float> {
input: Tensor<T>,
}
impl<T: Float> MishBackward<T> {
pub fn new(input: Tensor<T>) -> Self {
Self { input }
}
}
impl<T: Float> GradFn<T> for MishBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let cpu_input = if self.input.is_cuda() { self.input.cpu()? } else { self.input.clone() };
let cpu_go = if grad_output.is_cuda() { grad_output.cpu()? } else { grad_output.clone() };
let input_data = cpu_input.data()?;
let grad_data = cpu_go.data()?;
let one = <T as num_traits::One>::one();
let result: Vec<T> = input_data
.iter()
.zip(grad_data.iter())
.map(|(&x, &g)| {
let sp = (one + x.exp()).ln();
let t = sp.tanh();
let sig = one / (one + (-x).exp());
let dmish = t + x * sig * (one - t * t);
g * dmish
})
.collect();
let grad_input = Tensor::from_storage(
TensorStorage::cpu(result),
self.input.shape().to_vec(),
false,
)?;
let grad_input = if self.input.is_cuda() { grad_input.to(self.input.device())? } else { grad_input };
Ok(vec![Some(grad_input)])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.input]
}
fn name(&self) -> &'static str {
"MishBackward"
}
}
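/// Mish forward: x * tanh(softplus(x)), computed element-wise.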
pub fn mish<T: Float>(input: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
let one = <T as num_traits::One>::one();
let output = unary_map(input, |x| {
let sp = (one + x.exp()).ln();
x * sp.tanh()
})?;
if is_grad_enabled() && input.requires_grad() {
Tensor::from_operation(
TensorStorage::cpu(output.data()?.to_vec()),
output.shape().to_vec(),
Arc::new(MishBackward::new(input.clone())),
)
} else {
Ok(output)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::autograd::graph::backward;
use crate::storage::TensorStorage;
fn leaf_scalar(val: f64) -> Tensor<f64> {
Tensor::from_storage(TensorStorage::cpu(vec![val]), vec![], true).unwrap()
}
fn leaf_vec(vals: &[f64]) -> Tensor<f64> {
Tensor::from_storage(
TensorStorage::cpu(vals.to_vec()),
vec![vals.len()],
true,
)
.unwrap()
}
fn numerical_grad_scalar(f: impl Fn(f64) -> f64, x: f64) -> f64 {
let h = 1e-5;
(f(x + h) - f(x - h)) / (2.0 * h)
}
#[test]
fn test_relu_forward_positive() {
let x = leaf_scalar(2.0);
let y = relu(&x).unwrap();
assert!((y.item().unwrap() - 2.0).abs() < 1e-7);
}
#[test]
fn test_relu_forward_negative() {
let x = leaf_scalar(-3.0);
let y = relu(&x).unwrap();
assert!((y.item().unwrap()).abs() < 1e-7);
}
#[test]
fn test_relu_backward_positive() {
let x = leaf_scalar(2.0);
let y = relu(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
(grad.item().unwrap() - 1.0).abs() < 1e-6,
"relu grad at x=2: expected 1.0, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_relu_backward_negative() {
let x = leaf_scalar(-1.5);
let y = relu(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
grad.item().unwrap().abs() < 1e-6,
"relu grad at x=-1.5: expected 0.0, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_relu_forward_vector() {
let x = leaf_vec(&[-1.0, 0.5, 2.0, -0.3]);
let y = relu(&x).unwrap();
let y_data = y.data().unwrap();
assert!((y_data[0] - 0.0).abs() < 1e-7);
assert!((y_data[1] - 0.5).abs() < 1e-7);
assert!((y_data[2] - 2.0).abs() < 1e-7);
assert!((y_data[3] - 0.0).abs() < 1e-7);
}
#[test]
fn test_sigmoid_forward() {
let x = leaf_scalar(0.0);
let y = sigmoid(&x).unwrap();
assert!((y.item().unwrap() - 0.5).abs() < 1e-7);
}
#[test]
fn test_sigmoid_backward() {
let x = leaf_scalar(0.0);
let y = sigmoid(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
(grad.item().unwrap() - 0.25).abs() < 1e-6,
"sigmoid grad at x=0: expected 0.25, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_sigmoid_backward_nonzero() {
let val = 1.0_f64;
let x = leaf_scalar(val);
let y = sigmoid(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| 1.0 / (1.0 + (-v).exp()), val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-5,
"sigmoid grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_tanh_forward() {
let x = leaf_scalar(0.0);
let y = tanh(&x).unwrap();
assert!(y.item().unwrap().abs() < 1e-7);
}
#[test]
fn test_tanh_backward_at_zero() {
let x = leaf_scalar(0.0);
let y = tanh(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
(grad.item().unwrap() - 1.0).abs() < 1e-6,
"tanh grad at x=0: expected 1.0, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_tanh_backward_nonzero() {
let val = 0.8_f64;
let x = leaf_scalar(val);
let y = tanh(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| v.tanh(), val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-5,
"tanh grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_gelu_forward_zero() {
for mode in [GeluApproximate::None, GeluApproximate::Tanh, GeluApproximate::Sigmoid] {
let x = leaf_scalar(0.0);
let y = gelu_with(&x, mode).unwrap();
assert!(y.item().unwrap().abs() < 1e-7, "gelu({mode}) at 0 should be 0");
}
}
#[test]
fn test_gelu_exact_forward_values() {
let x = leaf_scalar(1.0);
let y = gelu_with(&x, GeluApproximate::None).unwrap();
let val = y.item().unwrap();
assert!(
(val - 0.8413).abs() < 1e-3,
"exact gelu(1.0) ≈ 0.8413, got {val}"
);
let x = leaf_scalar(-1.0);
let y = gelu_with(&x, GeluApproximate::None).unwrap();
let val = y.item().unwrap();
assert!(
(val - (-0.1587)).abs() < 1e-3,
"exact gelu(-1.0) ≈ -0.1587, got {val}"
);
}
#[test]
fn test_gelu_tanh_forward_values() {
let x = leaf_scalar(1.0);
let y = gelu_with(&x, GeluApproximate::Tanh).unwrap();
let val = y.item().unwrap();
assert!(
(val - 0.8412).abs() < 2e-3,
"tanh gelu(1.0) ≈ 0.8412, got {val}"
);
}
#[test]
fn test_gelu_sigmoid_forward_values() {
let x = leaf_scalar(1.0);
let y = gelu_with(&x, GeluApproximate::Sigmoid).unwrap();
let val = y.item().unwrap();
let expected = 1.0 / (1.0 + (-1.702_f64).exp());
assert!(
(val - expected).abs() < 1e-5,
"sigmoid gelu(1.0) ≈ {expected}, got {val}"
);
}
#[test]
fn test_gelu_backward_exact() {
let val = 1.0_f64;
let x = leaf_scalar(val);
let y = gelu_with(&x, GeluApproximate::None).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let sqrt_2 = std::f64::consts::SQRT_2;
let cdf = 0.5 * (1.0 + erf_approx(v / sqrt_2));
v * cdf
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"exact gelu grad at x={val}: expected {expected}, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_gelu_backward_tanh() {
let val = 1.0_f64;
let x = leaf_scalar(val);
let y = gelu_with(&x, GeluApproximate::Tanh).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let sqrt_2_over_pi = (2.0 / std::f64::consts::PI).sqrt();
let inner = sqrt_2_over_pi * (v + 0.044715 * v * v * v);
0.5 * v * (1.0 + inner.tanh())
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"tanh gelu grad at x={val}: expected {expected}, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_gelu_backward_sigmoid() {
let val = 1.0_f64;
let x = leaf_scalar(val);
let y = gelu_with(&x, GeluApproximate::Sigmoid).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let k = 1.702_f64;
let expected = numerical_grad_scalar(
|v| {
let s = 1.0 / (1.0 + (-k * v).exp());
v * s
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"sigmoid gelu grad at x={val}: expected {expected}, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_gelu_default_is_exact() {
let x = leaf_scalar(1.0);
let y_default = gelu(&x).unwrap();
let x2 = leaf_scalar(1.0);
let y_exact = gelu_with(&x2, GeluApproximate::None).unwrap();
assert!(
(y_default.item().unwrap() - y_exact.item().unwrap()).abs() < 1e-10,
"default gelu should match exact mode"
);
}
#[test]
fn test_silu_forward_zero() {
let x = leaf_scalar(0.0);
let y = silu(&x).unwrap();
assert!(y.item().unwrap().abs() < 1e-7);
}
#[test]
fn test_silu_backward() {
let val = 1.5_f64;
let x = leaf_scalar(val);
let y = silu(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let s = 1.0 / (1.0 + (-v).exp());
v * s
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"silu grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_softmax_forward_1d() {
let x = leaf_vec(&[1.0, 2.0, 3.0]);
let y = softmax(&x).unwrap();
let d = y.data().unwrap();
let total: f64 = d.iter().copied().sum();
assert!(
(total - 1.0).abs() < 1e-7,
"softmax sum: expected 1.0, got {}",
total
);
assert!(d[0] < d[1]);
assert!(d[1] < d[2]);
}
#[test]
fn test_softmax_backward_1d() {
let vals = [1.0_f64, 2.0, 3.0];
let x = leaf_vec(&vals);
let y = softmax(&x).unwrap();
let y_data = y.data().unwrap().to_vec();
let grad_output = Tensor::from_storage(
TensorStorage::cpu(vec![1.0, 0.0, 0.0]),
vec![3],
false,
)
.unwrap();
let bwd = SoftmaxBackward::new(x.clone(), y.clone());
let grads = bwd.backward(&grad_output).unwrap();
let gx = grads[0].as_ref().unwrap().data().unwrap().to_vec();
let s0 = y_data[0];
let s1 = y_data[1];
let s2 = y_data[2];
let expected = [s0 * (1.0 - s0), s0 * (0.0 - s1), s0 * (0.0 - s2)];
for (i, (&got, &exp)) in gx.iter().zip(expected.iter()).enumerate() {
assert!(
(got - exp).abs() < 1e-7,
"softmax grad[{}]: expected {}, got {}",
i,
exp,
got
);
}
}
#[test]
fn test_log_softmax_forward_1d() {
let x = leaf_vec(&[1.0, 2.0, 3.0]);
let y = log_softmax(&x).unwrap();
let d = y.data().unwrap();
let total: f64 = d.iter().map(|&v| v.exp()).sum();
assert!(
(total - 1.0).abs() < 1e-7,
"exp(log_softmax) sum: expected 1.0, got {}",
total
);
}
#[test]
fn test_log_softmax_backward_1d() {
let vals = [1.0_f64, 2.0, 3.0];
let x = leaf_vec(&vals);
let x_nograd = Tensor::from_storage(
TensorStorage::cpu(vals.to_vec()),
vec![3],
false,
)
.unwrap();
let sm = softmax(&x_nograd).unwrap();
let sm_data = sm.data().unwrap().to_vec();
let grad_output = Tensor::from_storage(
TensorStorage::cpu(vec![1.0, 0.0, 0.0]),
vec![3],
false,
)
.unwrap();
let bwd = LogSoftmaxBackward::new(x.clone(), sm);
let grads = bwd.backward(&grad_output).unwrap();
let gx = grads[0].as_ref().unwrap().data().unwrap().to_vec();
let expected = [1.0 - sm_data[0], 0.0 - sm_data[1], 0.0 - sm_data[2]];
for (i, (&got, &exp)) in gx.iter().zip(expected.iter()).enumerate() {
assert!(
(got - exp).abs() < 1e-7,
"log_softmax grad[{}]: expected {}, got {}",
i,
exp,
got
);
}
}
#[test]
fn test_relu_no_grad() {
crate::autograd::no_grad::no_grad(|| {
let x = leaf_scalar(2.0);
let y = relu(&x).unwrap();
assert!(
y.grad_fn().is_none(),
"relu inside no_grad should not attach grad_fn"
);
});
}
#[test]
fn test_sigmoid_no_grad() {
crate::autograd::no_grad::no_grad(|| {
let x = leaf_scalar(1.0);
let y = sigmoid(&x).unwrap();
assert!(
y.grad_fn().is_none(),
"sigmoid inside no_grad should not attach grad_fn"
);
});
}
#[test]
fn test_softplus_forward_zero() {
let x = leaf_scalar(0.0);
let y = softplus(&x, 1.0, 20.0).unwrap();
assert!(
(y.item().unwrap() - 2.0_f64.ln()).abs() < 1e-7,
"softplus(0) = {}, expected {}",
y.item().unwrap(),
2.0_f64.ln()
);
}
#[test]
fn test_softplus_forward_large() {
let x = leaf_scalar(25.0);
let y = softplus(&x, 1.0, 20.0).unwrap();
assert!(
(y.item().unwrap() - 25.0).abs() < 1e-5,
"softplus(25) = {}, expected 25.0",
y.item().unwrap()
);
}
#[test]
fn test_softplus_backward_at_zero() {
let x = leaf_scalar(0.0);
let y = softplus(&x, 1.0, 20.0).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
(grad.item().unwrap() - 0.5).abs() < 1e-6,
"softplus grad at x=0: expected 0.5, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_softplus_backward_positive() {
let val = 2.0_f64;
let x = leaf_scalar(val);
let y = softplus(&x, 1.0, 20.0).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| (1.0 + v.exp()).ln(), val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"softplus grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_softplus_backward_negative() {
let val = -1.5_f64;
let x = leaf_scalar(val);
let y = softplus(&x, 1.0, 20.0).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| (1.0 + v.exp()).ln(), val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"softplus grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_softplus_backward_custom_beta() {
let val = 1.0_f64;
let beta = 2.0_f64;
let x = leaf_scalar(val);
let y = softplus(&x, beta, 20.0).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| (1.0 + (beta * v).exp()).ln() / beta, val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"softplus grad at x={}, beta={}: expected {}, got {}",
val,
beta,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_softplus_backward_vector() {
let x = leaf_vec(&[-2.0, -0.5, 0.0, 1.0, 3.0]);
let y = softplus(&x, 1.0, 20.0).unwrap();
let sum = crate::grad_fns::reduction::sum(&y).unwrap();
backward(&sum).unwrap();
let grad = x.grad().unwrap().unwrap();
let grad_data = grad.data().unwrap();
for (i, &val) in [-2.0_f64, -0.5, 0.0, 1.0, 3.0].iter().enumerate() {
let expected = numerical_grad_scalar(|v| (1.0 + v.exp()).ln(), val);
assert!(
(grad_data[i] - expected).abs() < 1e-4,
"softplus grad[{}] at x={}: expected {}, got {}",
i,
val,
expected,
grad_data[i]
);
}
}
#[test]
fn test_softplus_no_grad() {
crate::autograd::no_grad::no_grad(|| {
let x = leaf_scalar(1.0);
let y = softplus(&x, 1.0, 20.0).unwrap();
assert!(
y.grad_fn().is_none(),
"softplus inside no_grad should not attach grad_fn"
);
});
}
#[test]
fn test_elu_forward_positive() {
let x = leaf_scalar(2.0);
let y = elu(&x, 1.0).unwrap();
assert!((y.item().unwrap() - 2.0).abs() < 1e-7);
}
#[test]
fn test_elu_forward_negative() {
let x = leaf_scalar(-1.0);
let y = elu(&x, 1.0).unwrap();
let expected = (-1.0_f64).exp() - 1.0;
assert!(
(y.item().unwrap() - expected).abs() < 1e-7,
"elu(-1) = {}, expected {}",
y.item().unwrap(),
expected
);
}
#[test]
fn test_elu_backward_positive() {
let x = leaf_scalar(2.0);
let y = elu(&x, 1.0).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
assert!(
(grad.item().unwrap() - 1.0).abs() < 1e-6,
"elu grad at x=2: expected 1.0, got {}",
grad.item().unwrap()
);
}
#[test]
fn test_elu_backward_negative() {
let val = -1.0_f64;
let alpha = 1.0_f64;
let x = leaf_scalar(val);
let y = elu(&x, alpha).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(|v| if v > 0.0 { v } else { alpha * (v.exp() - 1.0) }, val);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"elu grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_elu_backward_custom_alpha() {
let val = -0.5_f64;
let alpha = 2.0_f64;
let x = leaf_scalar(val);
let y = elu(&x, alpha).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = alpha * val.exp();
assert!(
(grad.item().unwrap() - expected).abs() < 1e-5,
"elu grad at x={}, alpha={}: expected {}, got {}",
val,
alpha,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_elu_no_grad() {
crate::autograd::no_grad::no_grad(|| {
let x = leaf_scalar(1.0);
let y = elu(&x, 1.0).unwrap();
assert!(
y.grad_fn().is_none(),
"elu inside no_grad should not attach grad_fn"
);
});
}
#[test]
fn test_mish_forward_zero() {
let x = leaf_scalar(0.0);
let y = mish(&x).unwrap();
assert!(y.item().unwrap().abs() < 1e-7);
}
#[test]
fn test_mish_forward_positive() {
let x = leaf_scalar(20.0);
let y = mish(&x).unwrap();
assert!(
(y.item().unwrap() - 20.0).abs() < 0.01,
"mish(20) = {}, expected ~20",
y.item().unwrap()
);
}
#[test]
fn test_mish_backward_at_zero() {
let x = leaf_scalar(0.0);
let y = mish(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let sp = (1.0 + v.exp()).ln();
v * sp.tanh()
},
0.0,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"mish grad at x=0: expected {}, got {}",
expected,
grad.item().unwrap()
);
}
#[test]
fn test_mish_backward_positive() {
let val = 1.5_f64;
let x = leaf_scalar(val);
let y = mish(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let sp = (1.0 + v.exp()).ln();
v * sp.tanh()
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"mish grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_mish_backward_negative() {
let val = -1.0_f64;
let x = leaf_scalar(val);
let y = mish(&x).unwrap();
backward(&y).unwrap();
let grad = x.grad().unwrap().unwrap();
let expected = numerical_grad_scalar(
|v| {
let sp = (1.0 + v.exp()).ln();
v * sp.tanh()
},
val,
);
assert!(
(grad.item().unwrap() - expected).abs() < 1e-4,
"mish grad at x={}: expected {}, got {}",
val,
expected,
grad.item().unwrap()
);
}
#[test]
fn test_mish_no_grad() {
crate::autograd::no_grad::no_grad(|| {
let x = leaf_scalar(1.0);
let y = mish(&x).unwrap();
assert!(
y.grad_fn().is_none(),
"mish inside no_grad should not attach grad_fn"
);
});
}
}