use std::any::TypeId;
use std::sync::Arc;
use crate::autograd::no_grad::{is_grad_enabled, no_grad};
use crate::dtype::Float;
use crate::error::{FerrotorchError, FerrotorchResult};
use crate::ops::elementwise::{fast_add, fast_mul, scalar_map, unary_map};
use crate::shape::broadcast_shapes;
use crate::storage::TensorStorage;
use crate::tensor::{GradFn, Tensor};
/// Reports whether the generic element type `T` is exactly `f64`.
#[inline]
fn is_f64<T: Float>() -> bool {
    let probe = TypeId::of::<T>();
    probe == TypeId::of::<f64>()
}
/// Reports whether the generic element type `T` is exactly `f32`.
#[inline]
fn is_f32<T: Float>() -> bool {
    let probe = TypeId::of::<T>();
    probe == TypeId::of::<f32>()
}
/// Reports whether the generic element type `T` is exactly `half::bf16`.
#[inline]
fn is_bf16<T: Float>() -> bool {
    let probe = TypeId::of::<T>();
    probe == TypeId::of::<half::bf16>()
}
/// Reports whether the generic element type `T` is exactly `half::f16`.
#[inline]
fn is_f16<T: Float>() -> bool {
    let probe = TypeId::of::<T>();
    probe == TypeId::of::<half::f16>()
}
/// Produces a tensor that GPU kernels in this file can consume as a flat buffer.
///
/// A cheap clone is returned when `t` is not on CUDA, or when it is contiguous, starts at
/// storage offset 0 and covers its whole storage. Otherwise, for rank <= 8 and `f32`/`f64`
/// elements, the backend's strided-copy kernel is tried. If no backend is registered, the
/// dtype is something else, or the kernel reports an error, the function falls back to
/// `Tensor::contiguous`.
///
/// # Errors
/// Propagates failures from `gpu_handle`, `Tensor::from_storage` and `Tensor::contiguous`.
#[inline]
fn ensure_contig_for_gpu<T: Float>(t: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    if !t.is_cuda()
        || (t.is_contiguous() && t.storage_offset() == 0 && t.numel() == t.storage_len())
    {
        return Ok(t.clone());
    }

    // The backend is only looked up when the rank is within the strided-copy limit.
    let strided_backend = if t.shape().len() <= 8 {
        crate::gpu_dispatch::gpu_backend()
    } else {
        None
    };

    if let Some(backend) = strided_backend {
        let src = t.gpu_handle()?;
        let dims = t.shape().to_vec();
        let strides = t.strides().to_vec();
        let start = t.storage_offset();

        // Kernel errors are deliberately dropped: the generic path below is the fallback.
        let copied = if is_f32::<T>() {
            backend.strided_copy_f32(src, &dims, &strides, start).ok()
        } else if is_f64::<T>() {
            backend.strided_copy_f64(src, &dims, &strides, start).ok()
        } else {
            None
        };

        if let Some(handle) = copied {
            return Tensor::from_storage(TensorStorage::gpu(handle), dims, false);
        }
    }

    t.contiguous()
}
/// True when gradient recording is enabled and at least one operand requires grad.
#[inline]
fn needs_grad<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> bool {
    if !is_grad_enabled() {
        return false;
    }
    a.requires_grad() || b.requires_grad()
}
/// True when gradient recording is enabled and the single operand requires grad.
#[inline]
fn needs_grad_unary<T: Float>(a: &Tensor<T>) -> bool {
    if !is_grad_enabled() {
        return false;
    }
    a.requires_grad()
}
/// Reduces a gradient of broadcast shape back to `target_shape` by summing over the
/// dimensions that were expanded in the forward pass.
///
/// Behaviour, in order:
/// * identical shapes: `grad` is returned as a clone;
/// * empty `target_shape` (scalar): the full sum of `grad` is returned;
/// * equal element counts: the data is re-labelled with `target_shape` (no summation);
/// * otherwise leading extra dimensions and every dimension where the target has size 1
///   are summed away.
///
/// The CUDA `f32`/`f64` path and the CPU path apply the same shortcut and the same rank
/// validation. Other dtypes on CUDA are rejected.
///
/// # Errors
/// * `DeviceUnavailable` if `grad` is on CUDA but no GPU backend is registered.
/// * `NotImplementedOnCuda` for CUDA tensors whose dtype is neither `f32` nor `f64`.
/// * `ShapeMismatch` when `grad` has fewer dimensions than `target_shape` and the element
///   counts differ.
/// * Any error propagated from the backend or tensor constructors.
pub(crate) fn reduce_grad_to_shape<T: Float>(
    grad: &Tensor<T>,
    target_shape: &[usize],
) -> FerrotorchResult<Tensor<T>> {
    let grad_shape = grad.shape();
    if grad_shape == target_shape {
        return Ok(grad.clone());
    }
    if target_shape.is_empty() {
        return crate::grad_fns::reduction::sum(grad);
    }

    let grad_ndim = grad_shape.len();
    let target_ndim = target_shape.len();
    let grad_numel: usize = grad_shape.iter().product();
    let target_numel: usize = target_shape.iter().product();

    // Shared by the GPU and CPU paths so both report the same diagnostic.
    let rank_error = || FerrotorchError::ShapeMismatch {
        message: format!(
            "reduce_grad_to_shape: gradient has {grad_ndim} dim(s) but target has {target_ndim} dim(s) ({grad_shape:?} -> {target_shape:?}). \
             Standard broadcasting backward requires grad_ndim >= target_ndim."
        ),
    };

    if grad.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        let grad_c = ensure_contig_for_gpu(grad)?;
        let mut handle = backend.clone_buffer(grad_c.gpu_handle()?)?;

        // Same element count: nothing to sum, only the shape label changes. Without this
        // shortcut a lower-rank gradient (e.g. [3] -> [1, 3]) would be summed incorrectly.
        if grad_numel == target_numel {
            return Tensor::from_storage(TensorStorage::gpu(handle), target_shape.to_vec(), false);
        }
        if grad_ndim < target_ndim {
            return Err(rank_error());
        }

        let mut current_shape = grad_c.shape().to_vec();

        // Collapse the leading dimensions that the target does not have.
        while current_shape.len() > target_ndim {
            handle = if is_f32::<T>() {
                backend.sum_axis_f32(&handle, &current_shape, 0)?
            } else {
                backend.sum_axis_f64(&handle, &current_shape, 0)?
            };
            current_shape.remove(0);
        }

        // Sum every remaining axis that was broadcast from size 1.
        for axis in 0..target_ndim {
            if target_shape[axis] == 1 && current_shape[axis] > 1 {
                handle = if is_f32::<T>() {
                    backend.sum_axis_f32(&handle, &current_shape, axis)?
                } else {
                    backend.sum_axis_f64(&handle, &current_shape, axis)?
                };
                current_shape[axis] = 1;
            }
        }
        return Tensor::from_storage(TensorStorage::gpu(handle), target_shape.to_vec(), false);
    }

    if grad.is_cuda() {
        return Err(FerrotorchError::NotImplementedOnCuda {
            op: "broadcast_grad",
        });
    }

    let grad_data = grad.data()?;
    if grad_numel == target_numel {
        return Tensor::from_storage(
            TensorStorage::cpu(grad_data.to_vec()),
            target_shape.to_vec(),
            false,
        );
    }
    if grad_ndim < target_ndim {
        return Err(rank_error());
    }

    let mut result = vec![<T as num_traits::Zero>::zero(); target_numel.max(1)];

    // Row-major strides of the target shape.
    let mut target_strides = vec![1usize; target_ndim];
    for td in (0..target_ndim.saturating_sub(1)).rev() {
        target_strides[td] = target_strides[td + 1] * target_shape[td + 1];
    }

    // Number of leading gradient dimensions that have no counterpart in the target.
    let offset = grad_ndim - target_ndim;

    // Sized from the actual rank (a fixed-size buffer would panic for high-rank gradients)
    // and reused across iterations; every slot is overwritten before it is read.
    let mut coords = vec![0usize; grad_ndim];
    for (i, &grad_val) in grad_data.iter().enumerate() {
        let mut rem = i;
        for d in (0..grad_ndim).rev() {
            coords[d] = rem % grad_shape[d];
            rem /= grad_shape[d];
        }

        let mut flat = 0usize;
        for (td, &target_stride) in target_strides.iter().enumerate() {
            // Broadcast (size-1) target dimensions always map to index 0.
            let coord = if target_shape[td] == 1 {
                0
            } else {
                coords[td + offset]
            };
            flat += coord * target_stride;
        }
        result[flat] += grad_val;
    }

    Tensor::from_storage(TensorStorage::cpu(result), target_shape.to_vec(), false)
}
/// Autograd node recorded by `add`; keeps both operands so their shapes are available when
/// the upstream gradient is reduced back to each input.
#[derive(Debug)]
struct AddBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for AddBackward<T> {
    /// Hands `grad_output`, reduced to the operand's shape, to each operand that requires
    /// grad; operands that do not require grad get `None`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = self
            .a
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.a.shape()))
            .transpose()?;
        let db = self
            .b
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.b.shape()))
            .transpose()?;
        Ok(vec![da, db])
    }

    /// The two operands, in `(a, b)` order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "AddBackward"
    }
}
/// Adds `a` and `b`.
///
/// Both operands must live on the same device. If `meta_propagate::binary_broadcast`
/// yields a tensor it is returned directly; otherwise the work is done by `add_inner`
/// inside an `"add"` profiler scope.
///
/// # Errors
/// `DeviceMismatch` when the operands are on different devices, plus anything propagated
/// from the callee functions.
pub fn add<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (dev_a, dev_b) = (a.device(), b.device());
    if dev_a != dev_b {
        return Err(FerrotorchError::DeviceMismatch {
            expected: dev_a,
            got: dev_b,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope(
            "add",
            "tensor_op",
            &[a.shape(), b.shape()],
            || add_inner(a, b),
        ),
    }
}
/// Addition kernel dispatch shared by `add` and `add_scaled`.
///
/// CPU tensors go through `fast_add`. CUDA tensors are first made buffer-compatible with
/// `ensure_contig_for_gpu`, then dispatched by dtype to either the same-shape or the
/// broadcasting backend kernel. An `AddBackward` node is attached when `needs_grad` says so.
fn add_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // ---- CPU path ----
    if !a.is_cuda() {
        let sum = fast_add(a, b)?;
        if !needs_grad(a, b) {
            return Ok(sum);
        }
        let (storage, shape) = sum.into_storage_and_shape()?;
        return Tensor::from_operation(
            storage,
            shape,
            Arc::new(AddBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        );
    }

    // ---- CUDA path ----
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let lhs = ensure_contig_for_gpu(a)?;
    let rhs = ensure_contig_for_gpu(b)?;

    let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
        if lhs.shape() == rhs.shape() {
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "add",
                f32 => backend.add_f32(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f64 => backend.add_f64(lhs.gpu_handle()?, rhs.gpu_handle()?),
                bf16 => backend.add_bf16_bf16(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f16 => backend.add_f16(lhs.gpu_handle()?, rhs.gpu_handle()?),
            )?;
            (h, lhs.shape().to_vec())
        } else {
            let out_shape = broadcast_shapes(lhs.shape(), rhs.shape())?;
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "broadcast_add",
                f32 => backend.broadcast_add_f32(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f64 => backend.broadcast_add_f64(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                bf16 => backend.broadcast_add_bf16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f16 => backend.broadcast_add_f16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
            )?;
            (h, out_shape)
        };

    let storage = TensorStorage::gpu(handle);
    if !needs_grad(a, b) {
        return Tensor::from_storage(storage, out_shape, false);
    }
    Tensor::from_operation(
        storage,
        out_shape,
        Arc::new(AddBackward {
            a: a.clone(),
            b: b.clone(),
        }),
    )
}
/// Autograd node for `a + alpha * b`; stores both operands and the scale factor.
#[derive(Debug)]
struct AddScaledBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
    alpha: f64,
}

impl<T: Float> GradFn<T> for AddScaledBackward<T> {
    /// `a` receives `grad_output` reduced to its shape; `b` receives `alpha * grad_output`
    /// reduced to its shape. Operands that do not require grad get `None`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = match self.a.requires_grad() {
            true => Some(reduce_grad_to_shape(grad_output, self.a.shape())?),
            false => None,
        };

        let db = match self.b.requires_grad() {
            false => None,
            true => {
                let Some(alpha_t) = num_traits::cast::cast::<f64, T>(self.alpha) else {
                    return Err(FerrotorchError::InvalidArgument {
                        message: format!(
                            "AddScaledBackward: alpha {} not representable in tensor dtype",
                            self.alpha
                        ),
                    });
                };
                // The scaling itself must not be recorded in the graph.
                let scaled = no_grad(|| scale_tensor(grad_output, alpha_t))?;
                Some(reduce_grad_to_shape(&scaled, self.b.shape())?)
            }
        };

        Ok(vec![da, db])
    }

    /// The two operands, in `(a, b)` order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "AddScaledBackward"
    }

    /// Exposes `alpha` as the node's only scalar argument.
    fn scalar_args(&self) -> Vec<f64> {
        vec![self.alpha]
    }
}
/// Multiplies every element of `t` by the scalar `alpha`, without attaching a grad node on
/// the CUDA path (the result is built with `requires_grad = false`).
///
/// CPU tensors use `scalar_map`. CUDA tensors are made buffer-compatible and dispatched to
/// the backend `scale_*` kernel for `f32`, `f64`, `bf16` or `f16`; the half-precision
/// kernels take the factor as `f32`.
///
/// # Errors
/// * `DeviceUnavailable` if `t` is on CUDA and no GPU backend is registered.
/// * `InvalidArgument` if `alpha` cannot be converted to the kernel's scalar type. This was
///   previously reported as `DeviceUnavailable`, which misdescribed the failure.
/// * `NotImplementedOnCuda` for any other dtype on CUDA.
fn scale_tensor<T: Float>(t: &Tensor<T>, alpha: T) -> FerrotorchResult<Tensor<T>> {
    if !t.is_cuda() {
        return scalar_map(t, alpha, |x, s| x * s);
    }

    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let tc = ensure_contig_for_gpu(t)?;

    let as_f32 = |v: T| -> FerrotorchResult<f32> {
        num_traits::cast::cast(v).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: "scale_tensor: scale factor not representable as f32".into(),
        })
    };

    let handle = if is_f32::<T>() {
        backend.scale_f32(tc.gpu_handle()?, as_f32(alpha)?)?
    } else if is_f64::<T>() {
        let s: f64 =
            num_traits::cast::cast(alpha).ok_or_else(|| FerrotorchError::InvalidArgument {
                message: "scale_tensor: scale factor not representable as f64".into(),
            })?;
        backend.scale_f64(tc.gpu_handle()?, s)?
    } else if is_bf16::<T>() {
        backend.scale_bf16_bf16(tc.gpu_handle()?, as_f32(alpha)?)?
    } else if is_f16::<T>() {
        backend.scale_f16(tc.gpu_handle()?, as_f32(alpha)?)?
    } else {
        return Err(FerrotorchError::NotImplementedOnCuda { op: "scale_tensor" });
    };

    Tensor::from_storage(TensorStorage::gpu(handle), tc.shape().to_vec(), false)
}
/// Validates that `out` may be overwritten by an `*_out` operation.
///
/// Rejects tensors that carry a `grad_fn` and leaf tensors with `requires_grad = true`;
/// `op_name` is used as the prefix of the error message.
///
/// # Errors
/// `InvalidArgument` in either rejected case.
fn check_out_allowed<T: Float>(out: &Tensor<T>, op_name: &str) -> FerrotorchResult<()> {
    let producer = out.grad_fn().map(|gf| gf.name());
    if producer.is_some() {
        let message = format!(
            "{op_name}: `out` tensor is part of the computation graph \
             (has grad_fn = {:?}); cannot write into it",
            producer,
        );
        return Err(FerrotorchError::InvalidArgument { message });
    }

    let tracked_leaf = out.requires_grad() && out.is_leaf();
    if tracked_leaf {
        let message = format!(
            "{op_name}: `out` is a leaf tensor with requires_grad=true; \
             the write would not be tracked by autograd"
        );
        return Err(FerrotorchError::InvalidArgument { message });
    }

    Ok(())
}
/// Writes `a + b` into `out`; implemented as `add_scaled_out` with a scale factor of one.
pub fn add_out<T: Float>(out: &Tensor<T>, a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<()> {
    const UNIT_ALPHA: f64 = 1.0;
    add_scaled_out(out, a, b, UNIT_ALPHA)
}
/// Computes `a + alpha * b` with gradient recording disabled and stores the result in `out`.
///
/// `out` must pass `check_out_allowed`, and `a`, `b` and `out` must share a device. When
/// `out` already has the broadcast shape only its storage is replaced; otherwise both its
/// storage and shape are replaced.
///
/// # Errors
/// `InvalidArgument` from `check_out_allowed`, `DeviceMismatch` for differing devices, and
/// anything propagated from broadcasting, the addition, or the storage update.
pub fn add_scaled_out<T: Float>(
    out: &Tensor<T>,
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<()> {
    check_out_allowed(out, "add_scaled_out")?;

    // `b` is checked before `out`, both against `a`'s device.
    for other in [b, out] {
        if a.device() != other.device() {
            return Err(FerrotorchError::DeviceMismatch {
                expected: a.device(),
                got: other.device(),
            });
        }
    }

    let broadcast_shape = broadcast_shapes(a.shape(), b.shape())?;
    let computed = no_grad(|| add_scaled(a, b, alpha))?;
    let (storage, result_shape) = computed.into_storage_and_shape()?;

    let shape_already_matches = out.shape() == broadcast_shape.as_slice();
    if shape_already_matches {
        // NOTE(review): the safety contract of `update_storage` is not visible in this
        // file; `check_out_allowed` above has excluded graph-attached tensors — confirm
        // that this is sufficient.
        unsafe { out.update_storage(storage)? };
        return Ok(());
    }

    debug_assert_eq!(result_shape, broadcast_shape);
    // NOTE(review): same caveat as above for `update_storage_and_shape`.
    unsafe { out.update_storage_and_shape(storage, broadcast_shape)? };
    Ok(())
}
/// Computes `a + alpha * b`.
///
/// `alpha == 1.0` delegates to `add`. Otherwise the operands must share a device; a tensor
/// yielded by `meta_propagate::binary_broadcast` is returned directly; and the remaining
/// work runs inside an `"add_scaled"` profiler scope. There the fused GPU kernel is tried
/// first, falling back to scaling `b` and adding, both with gradient recording disabled.
/// An `AddScaledBackward` node is attached when `needs_grad` says so.
///
/// # Errors
/// `DeviceMismatch` for differing devices, `InvalidArgument` when `alpha` cannot be
/// converted to `T`, plus anything propagated from the kernels.
pub fn add_scaled<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Tensor<T>> {
    #[allow(clippy::float_cmp)]
    let alpha_is_one = alpha == 1.0;
    if alpha_is_one {
        return add(a, b);
    }

    let (dev_a, dev_b) = (a.device(), b.device());
    if dev_a != dev_b {
        return Err(FerrotorchError::DeviceMismatch {
            expected: dev_a,
            got: dev_b,
        });
    }

    if let Some(out) = crate::meta_propagate::binary_broadcast(a, b)? {
        return Ok(out);
    }

    crate::profiler_hook::profile_op_scope(
        "add_scaled",
        "tensor_op",
        &[a.shape(), b.shape()],
        || {
            // Raw (graph-free) result: fused kernel if available, otherwise scale-then-add.
            let raw = match maybe_add_scaled_fused_gpu(a, b, alpha)? {
                Some(fused) => fused,
                None => {
                    let alpha_t: T = num_traits::cast::cast(alpha).ok_or_else(|| {
                        FerrotorchError::InvalidArgument {
                            message: format!(
                                "add_scaled: alpha {alpha} not representable in tensor dtype"
                            ),
                        }
                    })?;
                    let b_scaled = no_grad(|| scale_tensor(b, alpha_t))?;
                    no_grad(|| add_inner(a, &b_scaled))?
                }
            };

            if !needs_grad(a, b) {
                return Ok(raw);
            }
            let (storage, shape) = raw.into_storage_and_shape()?;
            Tensor::from_operation(
                storage,
                shape,
                Arc::new(AddScaledBackward {
                    a: a.clone(),
                    b: b.clone(),
                    alpha,
                }),
            )
        },
    )
}
/// Attempts the fused `a + alpha * b` GPU kernel.
///
/// Returns `Ok(None)` when the fused path does not apply: either operand is not on CUDA,
/// the dtype is neither `f32` nor `f64`, `alpha` is not finite, no backend is registered,
/// or the operand shapes differ after `ensure_contig_for_gpu`. Otherwise returns the
/// result tensor, built with `requires_grad = false`.
fn maybe_add_scaled_fused_gpu<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Option<Tensor<T>>> {
    let both_on_gpu = a.is_cuda() && b.is_cuda();
    let dtype_supported = is_f32::<T>() || is_f64::<T>();
    if !both_on_gpu || !dtype_supported || !alpha.is_finite() {
        return Ok(None);
    }

    let backend = match crate::gpu_dispatch::gpu_backend() {
        Some(backend) => backend,
        None => return Ok(None),
    };

    let lhs = ensure_contig_for_gpu(a)?;
    let rhs = ensure_contig_for_gpu(b)?;
    if lhs.shape() != rhs.shape() {
        // Broadcasting is left to the unfused fallback.
        return Ok(None);
    }

    let out_shape = lhs.shape().to_vec();
    let handle = if is_f32::<T>() {
        backend.add_scaled_f32(lhs.gpu_handle()?, rhs.gpu_handle()?, alpha)?
    } else {
        backend.add_scaled_f64(lhs.gpu_handle()?, rhs.gpu_handle()?, alpha)?
    };

    let fused = Tensor::from_storage(TensorStorage::gpu(handle), out_shape, false)?;
    Ok(Some(fused))
}
/// Computes `a - b`, expressed as a scaled subtraction with a factor of one.
pub fn sub<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    const UNIT_ALPHA: f64 = 1.0;
    sub_scaled(a, b, UNIT_ALPHA)
}
/// Computes `a - alpha * b` by delegating to `add_scaled` with the negated factor.
pub fn sub_scaled<T: Float>(
    a: &Tensor<T>,
    b: &Tensor<T>,
    alpha: f64,
) -> FerrotorchResult<Tensor<T>> {
    let negated = -alpha;
    add_scaled(a, b, negated)
}
/// Reversed subtraction: computes `b - alpha * a` by swapping the operands of `sub_scaled`.
pub fn rsub<T: Float>(a: &Tensor<T>, b: &Tensor<T>, alpha: f64) -> FerrotorchResult<Tensor<T>> {
    let (minuend, subtrahend) = (b, a);
    sub_scaled(minuend, subtrahend, alpha)
}
/// Autograd node recorded by `mul`; keeps both operands because each operand's gradient
/// is the upstream gradient times the other operand.
#[derive(Debug)]
struct MulBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for MulBackward<T> {
    /// `da = grad_output * b` and `db = grad_output * a`, each reduced to the operand's
    /// shape. When `grad_output` itself requires grad or has a `grad_fn`, the products are
    /// computed with recording left as-is; otherwise they run under `no_grad`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let keep_graph = grad_output.requires_grad() || grad_output.grad_fn().is_some();

        let grad_times = |other: &Tensor<T>| -> FerrotorchResult<Tensor<T>> {
            if keep_graph {
                mul(grad_output, other)
            } else {
                no_grad(|| mul(grad_output, other))
            }
        };

        let da = match self.a.requires_grad() {
            true => {
                let raw = grad_times(&self.b)?;
                Some(reduce_grad_to_shape(&raw, self.a.shape())?)
            }
            false => None,
        };
        let db = match self.b.requires_grad() {
            true => {
                let raw = grad_times(&self.a)?;
                Some(reduce_grad_to_shape(&raw, self.b.shape())?)
            }
            false => None,
        };

        Ok(vec![da, db])
    }

    /// The two operands, in `(a, b)` order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "MulBackward"
    }
}
/// Multiplies `a` and `b`.
///
/// Both operands must live on the same device. If `meta_propagate::binary_broadcast`
/// yields a tensor it is returned directly; otherwise the work is done by `mul_inner`
/// inside a `"mul"` profiler scope.
///
/// # Errors
/// `DeviceMismatch` when the operands are on different devices, plus anything propagated
/// from the callee functions.
pub fn mul<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (dev_a, dev_b) = (a.device(), b.device());
    if dev_a != dev_b {
        return Err(FerrotorchError::DeviceMismatch {
            expected: dev_a,
            got: dev_b,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope(
            "mul",
            "tensor_op",
            &[a.shape(), b.shape()],
            || mul_inner(a, b),
        ),
    }
}
/// Multiplication kernel dispatch used by `mul`.
///
/// CPU tensors go through `fast_mul`. CUDA tensors are first made buffer-compatible with
/// `ensure_contig_for_gpu`, then dispatched by dtype to either the same-shape or the
/// broadcasting backend kernel. A `MulBackward` node is attached when `needs_grad` says so.
fn mul_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // ---- CPU path ----
    if !a.is_cuda() {
        let product = fast_mul(a, b)?;
        if !needs_grad(a, b) {
            return Ok(product);
        }
        let (storage, shape) = product.into_storage_and_shape()?;
        return Tensor::from_operation(
            storage,
            shape,
            Arc::new(MulBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        );
    }

    // ---- CUDA path ----
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let lhs = ensure_contig_for_gpu(a)?;
    let rhs = ensure_contig_for_gpu(b)?;

    let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
        if lhs.shape() == rhs.shape() {
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "mul",
                f32 => backend.mul_f32(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f64 => backend.mul_f64(lhs.gpu_handle()?, rhs.gpu_handle()?),
                bf16 => backend.mul_bf16_bf16(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f16 => backend.mul_f16(lhs.gpu_handle()?, rhs.gpu_handle()?),
            )?;
            (h, lhs.shape().to_vec())
        } else {
            let out_shape = broadcast_shapes(lhs.shape(), rhs.shape())?;
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "broadcast_mul",
                f32 => backend.broadcast_mul_f32(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f64 => backend.broadcast_mul_f64(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                bf16 => backend.broadcast_mul_bf16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f16 => backend.broadcast_mul_f16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
            )?;
            (h, out_shape)
        };

    let storage = TensorStorage::gpu(handle);
    if !needs_grad(a, b) {
        return Tensor::from_storage(storage, out_shape, false);
    }
    Tensor::from_operation(
        storage,
        out_shape,
        Arc::new(MulBackward {
            a: a.clone(),
            b: b.clone(),
        }),
    )
}
/// Autograd node recorded by `div` for `a / b`; keeps both operands.
#[derive(Debug)]
struct DivBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for DivBackward<T> {
    /// `da = grad_output / b` and `db = (-grad_output * a) / (b * b)`, each computed under
    /// `no_grad` and reduced to the operand's shape.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = match self.a.requires_grad() {
            false => None,
            true => {
                let quotient = no_grad(|| div(grad_output, &self.b))?;
                Some(reduce_grad_to_shape(&quotient, self.a.shape())?)
            }
        };

        let db = match self.b.requires_grad() {
            false => None,
            true => {
                let unreduced = no_grad(|| {
                    let negated = neg(grad_output)?;
                    let numerator = mul(&negated, &self.a)?;
                    let denominator = mul(&self.b, &self.b)?;
                    div(&numerator, &denominator)
                })?;
                Some(reduce_grad_to_shape(&unreduced, self.b.shape())?)
            }
        };

        Ok(vec![da, db])
    }

    /// The two operands, in `(a, b)` order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "DivBackward"
    }
}
/// Divides `a` by `b`.
///
/// Both operands must live on the same device. If `meta_propagate::binary_broadcast`
/// yields a tensor it is returned directly; otherwise the work is done by `div_inner`
/// inside a `"div"` profiler scope.
///
/// # Errors
/// `DeviceMismatch` when the operands are on different devices, plus anything propagated
/// from the callee functions.
pub fn div<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (dev_a, dev_b) = (a.device(), b.device());
    if dev_a != dev_b {
        return Err(FerrotorchError::DeviceMismatch {
            expected: dev_a,
            got: dev_b,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope(
            "div",
            "tensor_op",
            &[a.shape(), b.shape()],
            || div_inner(a, b),
        ),
    }
}
/// Division kernel dispatch used by `div`.
///
/// The GPU branch is taken for CUDA tensors of dtype `f32`, `f64`, `bf16` or `f16`;
/// everything else goes through `fast_div`. A `DivBackward` node is attached when
/// `needs_grad` says so.
fn div_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let gpu_dtype = is_f32::<T>() || is_f64::<T>() || is_bf16::<T>() || is_f16::<T>();

    // ---- fallback path (CPU, or a dtype without a GPU kernel) ----
    if !(a.is_cuda() && gpu_dtype) {
        let quotient = crate::ops::elementwise::fast_div(a, b)?;
        if !needs_grad(a, b) {
            return Ok(quotient);
        }
        let (storage, shape) = quotient.into_storage_and_shape()?;
        return Tensor::from_operation(
            storage,
            shape,
            Arc::new(DivBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        );
    }

    // ---- CUDA path ----
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let lhs = ensure_contig_for_gpu(a)?;
    let rhs = ensure_contig_for_gpu(b)?;

    let (handle, out_shape): (crate::gpu_dispatch::GpuBufferHandle, Vec<usize>) =
        if lhs.shape() == rhs.shape() {
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "div",
                f32 => backend.div_f32(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f64 => backend.div_f64(lhs.gpu_handle()?, rhs.gpu_handle()?),
                bf16 => backend.div_bf16_bf16(lhs.gpu_handle()?, rhs.gpu_handle()?),
                f16 => backend.div_f16(lhs.gpu_handle()?, rhs.gpu_handle()?),
            )?;
            (h, lhs.shape().to_vec())
        } else {
            let out_shape = broadcast_shapes(lhs.shape(), rhs.shape())?;
            let h: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
                T,
                "broadcast_div",
                f32 => backend.broadcast_div_f32(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f64 => backend.broadcast_div_f64(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                bf16 => backend.broadcast_div_bf16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
                f16 => backend.broadcast_div_f16(
                    lhs.gpu_handle()?,
                    rhs.gpu_handle()?,
                    lhs.shape(),
                    rhs.shape(),
                    &out_shape,
                ),
            )?;
            (h, out_shape)
        };

    let storage = TensorStorage::gpu(handle);
    if !needs_grad(a, b) {
        return Tensor::from_storage(storage, out_shape, false);
    }
    Tensor::from_operation(
        storage,
        out_shape,
        Arc::new(DivBackward {
            a: a.clone(),
            b: b.clone(),
        }),
    )
}
/// Autograd node recorded by `neg`; keeps the operand.
#[derive(Debug)]
struct NegBackward<T: Float> {
    a: Tensor<T>,
}

impl<T: Float> GradFn<T> for NegBackward<T> {
    /// The gradient is the negated upstream gradient, computed under `no_grad`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.a.requires_grad() {
            return Ok(vec![None]);
        }
        let negated = no_grad(|| neg(grad_output))?;
        Ok(vec![Some(negated)])
    }

    /// The single operand.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "NegBackward"
    }
}
/// Negates `a` element-wise.
///
/// A tensor yielded by `meta_propagate::unary_same_shape` is returned directly; otherwise
/// `neg_inner` runs inside a `"neg"` profiler scope.
pub fn neg<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope("neg", "tensor_op", &[a.shape()], || {
            neg_inner(a)
        }),
    }
}
/// Negation kernel dispatch used by `neg`.
///
/// CPU tensors go through `unary_map`; CUDA tensors are dispatched by dtype to the backend
/// `neg_*` kernel. A `NegBackward` node is attached when `needs_grad_unary` says so.
fn neg_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // ---- CPU path ----
    if !a.is_cuda() {
        let negated = unary_map(a, |x| -x)?;
        if !needs_grad_unary(a) {
            return Ok(negated);
        }
        let (storage, shape) = negated.into_storage_and_shape()?;
        return Tensor::from_operation(storage, shape, Arc::new(NegBackward { a: a.clone() }));
    }

    // ---- CUDA path ----
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let src = ensure_contig_for_gpu(a)?;
    let handle: crate::gpu_dispatch::GpuBufferHandle = crate::dispatch_floating_dtype!(
        T,
        "neg",
        f32 => backend.neg_f32(src.gpu_handle()?),
        f64 => backend.neg_f64(src.gpu_handle()?),
        bf16 => backend.neg_bf16_bf16(src.gpu_handle()?),
        f16 => backend.neg_f16(src.gpu_handle()?),
    )?;

    let storage = TensorStorage::gpu(handle);
    let shape = src.shape().to_vec();
    if !needs_grad_unary(a) {
        return Tensor::from_storage(storage, shape, false);
    }
    Tensor::from_operation(storage, shape, Arc::new(NegBackward { a: a.clone() }))
}
#[derive(Debug)]
struct PowBackward<T: Float> {
a: Tensor<T>,
exp: f64,
}
impl<T: Float> GradFn<T> for PowBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let da = if self.a.requires_grad() {
if grad_output.requires_grad() || grad_output.grad_fn().is_some() {
let a_pow = pow(&self.a, self.exp - 1.0)?; let exp_t = T::from(self.exp).unwrap();
let exp_tensor = Tensor::from_storage(
TensorStorage::cpu(vec![exp_t; self.a.numel().max(1)]),
self.a.shape().to_vec(),
false,
)?;
let scaled = mul(&exp_tensor, &a_pow)?; Some(mul(grad_output, &scaled)?) } else if grad_output.is_cuda() {
let da = no_grad(|| {
let a_pow = pow(&self.a, self.exp - 1.0)?;
let exp_t = T::from(self.exp).unwrap();
let exp_tensor = Tensor::from_storage(
TensorStorage::cpu(vec![exp_t; self.a.numel().max(1)]),
self.a.shape().to_vec(),
false,
)?;
let exp_gpu = exp_tensor.to(self.a.device())?;
let scaled = mul(&exp_gpu, &a_pow)?;
mul(grad_output, &scaled)
})?;
Some(da)
} else {
let go_data = grad_output.data()?;
let a_data = self.a.data()?;
let exp_t = T::from(self.exp).unwrap();
let exp_m1 = T::from(self.exp - 1.0).unwrap();
let grad_a: Vec<T> = go_data
.iter()
.zip(a_data.iter())
.map(|(&g, &a)| g * exp_t * a.powf(exp_m1))
.collect();
Some(Tensor::from_storage(
TensorStorage::cpu(grad_a),
self.a.shape().to_vec(),
false,
)?)
}
} else {
None
};
Ok(vec![da])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.a]
}
fn name(&self) -> &'static str {
"PowBackward"
}
fn scalar_args(&self) -> Vec<f64> {
vec![self.exp]
}
}
/// Raises every element of `a` to the scalar power `exp`.
///
/// A tensor yielded by `meta_propagate::unary_same_shape` is returned directly (the
/// exponent is not used in that case); otherwise `pow_inner` runs inside a `"pow"`
/// profiler scope.
pub fn pow<T: Float>(a: &Tensor<T>, exp: f64) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(out) => {
            let _ = exp;
            Ok(out)
        }
        None => crate::profiler_hook::profile_op_scope("pow", "tensor_op", &[a.shape()], || {
            pow_inner(a, exp)
        }),
    }
}
/// Power kernel dispatch used by `pow`.
///
/// CUDA tensors of dtype `f32`/`f64` use the backend `pow_*` kernel; everything else goes
/// through `scalar_map` with `powf`. A `PowBackward` node is attached when
/// `needs_grad_unary` says so.
///
/// # Errors
/// * `DeviceUnavailable` if the GPU branch is taken and no backend is registered.
/// * `InvalidArgument` if `exp` cannot be converted to `T` (previously a panic).
/// * Anything propagated from the kernels or tensor constructors.
fn pow_inner<T: Float>(a: &Tensor<T>, exp: f64) -> FerrotorchResult<Tensor<T>> {
    if a.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
        let backend =
            crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
        let a_c = ensure_contig_for_gpu(a)?;
        let handle = if is_f32::<T>() {
            // The f32 kernel takes an f32 exponent; the narrowing is intentional.
            backend.pow_f32(a_c.gpu_handle()?, exp as f32)?
        } else {
            backend.pow_f64(a_c.gpu_handle()?, exp)?
        };
        let storage = TensorStorage::gpu(handle);
        let shape = a_c.shape().to_vec();
        if needs_grad_unary(a) {
            Tensor::from_operation(storage, shape, Arc::new(PowBackward { a: a.clone(), exp }))
        } else {
            Tensor::from_storage(storage, shape, false)
        }
    } else {
        let exp_t = T::from(exp).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!("pow: exponent {exp} not representable in tensor dtype"),
        })?;
        let result = scalar_map(a, exp_t, |x, e| x.powf(e))?;
        if needs_grad_unary(a) {
            let (storage, shape) = result.into_storage_and_shape()?;
            Tensor::from_operation(storage, shape, Arc::new(PowBackward { a: a.clone(), exp }))
        } else {
            Ok(result)
        }
    }
}
#[derive(Debug)]
struct SqrtBackward<T: Float> {
a: Tensor<T>,
}
impl<T: Float> GradFn<T> for SqrtBackward<T> {
fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
let da = if self.a.requires_grad() {
if grad_output.is_cuda() {
let da = no_grad(|| {
let sqrt_a = sqrt(&self.a)?;
let two_t = T::from(2.0).unwrap();
let two_tensor = Tensor::from_storage(
TensorStorage::cpu(vec![two_t; self.a.numel().max(1)]),
self.a.shape().to_vec(),
false,
)?;
let two_gpu = two_tensor.to(self.a.device())?;
let denom = mul(&two_gpu, &sqrt_a)?;
div(grad_output, &denom)
})?;
Some(da)
} else {
let go_data = grad_output.data()?;
let a_data = self.a.data()?;
let two = T::from(2.0).unwrap();
let grad_a: Vec<T> = go_data
.iter()
.zip(a_data.iter())
.map(|(&g, &a)| g / (two * a.sqrt()))
.collect();
Some(Tensor::from_storage(
TensorStorage::cpu(grad_a),
self.a.shape().to_vec(),
false,
)?)
}
} else {
None
};
Ok(vec![da])
}
fn inputs(&self) -> Vec<&Tensor<T>> {
vec![&self.a]
}
fn name(&self) -> &'static str {
"SqrtBackward"
}
}
/// Takes the element-wise square root of `a`.
///
/// A tensor yielded by `meta_propagate::unary_same_shape` is returned directly; otherwise
/// `sqrt_inner` runs inside a `"sqrt"` profiler scope.
pub fn sqrt<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope("sqrt", "tensor_op", &[a.shape()], || {
            sqrt_inner(a)
        }),
    }
}
/// Square-root kernel dispatch used by `sqrt`.
///
/// CUDA tensors of dtype `f32`, `f64` or `f16` use the backend `sqrt_*` kernel; everything
/// else goes through `unary_map`. A `SqrtBackward` node is attached when
/// `needs_grad_unary` says so.
fn sqrt_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let gpu_dtype = is_f32::<T>() || is_f64::<T>() || is_f16::<T>();

    // ---- fallback path ----
    if !(a.is_cuda() && gpu_dtype) {
        let roots = unary_map(a, |x| x.sqrt())?;
        if !needs_grad_unary(a) {
            return Ok(roots);
        }
        let (storage, shape) = roots.into_storage_and_shape()?;
        return Tensor::from_operation(storage, shape, Arc::new(SqrtBackward { a: a.clone() }));
    }

    // ---- CUDA path ----
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let src = ensure_contig_for_gpu(a)?;
    let handle = if is_f32::<T>() {
        backend.sqrt_f32(src.gpu_handle()?)?
    } else if is_f64::<T>() {
        backend.sqrt_f64(src.gpu_handle()?)?
    } else {
        backend.sqrt_f16(src.gpu_handle()?)?
    };

    let storage = TensorStorage::gpu(handle);
    let shape = src.shape().to_vec();
    if !needs_grad_unary(a) {
        return Tensor::from_storage(storage, shape, false);
    }
    Tensor::from_operation(storage, shape, Arc::new(SqrtBackward { a: a.clone() }))
}
/// Autograd node recorded by `rsqrt`; keeps the operand `a` and the forward result `c`.
#[derive(Debug)]
struct RsqrtBackward<T: Float> {
    a: Tensor<T>,
    c: Tensor<T>,
}

impl<T: Float> GradFn<T> for RsqrtBackward<T> {
    /// Computes `-0.5 * grad_output * c^3` from the cached forward result `c`.
    ///
    /// On CUDA the expression is built from tensor ops under `no_grad`; on CPU it is an
    /// element-wise loop over the raw data.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.a.requires_grad() {
            return Ok(vec![None]);
        }

        // Converts the constant on demand so each branch evaluates it where it always did.
        let neg_half_const = || -> FerrotorchResult<T> {
            T::from(-0.5).ok_or_else(|| FerrotorchError::InvalidArgument {
                message: "RsqrtBackward: -0.5 not representable in tensor dtype".into(),
            })
        };

        let da = if grad_output.is_cuda() {
            no_grad(|| {
                let neg_half = neg_half_const()?;
                let half_cpu = Tensor::from_storage(
                    TensorStorage::cpu(vec![neg_half; self.c.numel().max(1)]),
                    self.c.shape().to_vec(),
                    false,
                )?;
                let half_gpu = half_cpu.to(self.c.device())?;
                let squared = mul(&self.c, &self.c)?;
                let cubed = mul(&squared, &self.c)?;
                let factor = mul(&half_gpu, &cubed)?;
                mul(grad_output, &factor)
            })?
        } else {
            let upstream = grad_output.data()?;
            let forward = self.c.data()?;
            let neg_half = neg_half_const()?;
            let mut grad_a: Vec<T> = Vec::with_capacity(upstream.len().min(forward.len()));
            for (&g, &c) in upstream.iter().zip(forward.iter()) {
                grad_a.push(neg_half * g * c * c * c);
            }
            Tensor::from_storage(TensorStorage::cpu(grad_a), self.a.shape().to_vec(), false)?
        };

        Ok(vec![Some(da)])
    }

    /// The single operand (the cached result `c` is not an input).
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "RsqrtBackward"
    }
}
/// Computes the element-wise reciprocal square root of `a`.
///
/// A tensor yielded by `meta_propagate::unary_same_shape` is returned directly; otherwise
/// `rsqrt_inner` runs inside an `"rsqrt"` profiler scope.
pub fn rsqrt<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope("rsqrt", "tensor_op", &[a.shape()], || {
            rsqrt_inner(a)
        }),
    }
}
/// Reciprocal-square-root implementation used by `rsqrt`.
///
/// On CPU the value is `1 / sqrt(x)` via `unary_map`. On CUDA it is composed, under
/// `no_grad`, as `ones / sqrt(a)` with the ones tensor created on the CPU and moved to
/// `a`'s device. When grad is needed, the forward result is cached in `RsqrtBackward`.
fn rsqrt_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // ---- CPU path ----
    if !a.is_cuda() {
        let result = unary_map(a, |x| <T as num_traits::One>::one() / x.sqrt())?;
        if !needs_grad_unary(a) {
            return Ok(result);
        }
        let cached = result.clone();
        let (storage, shape) = result.into_storage_and_shape()?;
        return Tensor::from_operation(
            storage,
            shape,
            Arc::new(RsqrtBackward {
                a: a.clone(),
                c: cached,
            }),
        );
    }

    // ---- CUDA path ----
    let forward = no_grad(|| {
        let root = sqrt(a)?;
        let unit = <T as num_traits::One>::one();
        let ones_cpu = Tensor::from_storage(
            TensorStorage::cpu(vec![unit; a.numel().max(1)]),
            a.shape().to_vec(),
            false,
        )?;
        let ones_dev = ones_cpu.to(a.device())?;
        div(&ones_dev, &root)
    })?;

    let (storage, shape) = forward.clone().into_storage_and_shape()?;
    if !needs_grad_unary(a) {
        return Tensor::from_storage(storage, shape, false);
    }
    Tensor::from_operation(
        storage,
        shape,
        Arc::new(RsqrtBackward {
            a: a.clone(),
            c: forward,
        }),
    )
}
/// Autograd node recorded by `reciprocal`; keeps the operand `a` and the forward result `c`.
#[derive(Debug)]
struct ReciprocalBackward<T: Float> {
    a: Tensor<T>,
    c: Tensor<T>,
}

impl<T: Float> GradFn<T> for ReciprocalBackward<T> {
    /// Computes `-grad_output * c^2` from the cached forward result `c`.
    ///
    /// On CUDA the expression is built from tensor ops under `no_grad`; on CPU it is an
    /// element-wise loop over the raw data.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        if !self.a.requires_grad() {
            return Ok(vec![None]);
        }

        let da = if grad_output.is_cuda() {
            no_grad(|| {
                let squared = mul(&self.c, &self.c)?;
                let negated = neg(grad_output)?;
                mul(&negated, &squared)
            })?
        } else {
            let upstream = grad_output.data()?;
            let forward = self.c.data()?;
            let mut grad_a: Vec<T> = Vec::with_capacity(upstream.len().min(forward.len()));
            for (&g, &c) in upstream.iter().zip(forward.iter()) {
                grad_a.push(-g * c * c);
            }
            Tensor::from_storage(TensorStorage::cpu(grad_a), self.a.shape().to_vec(), false)?
        };

        Ok(vec![Some(da)])
    }

    /// The single operand (the cached result `c` is not an input).
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "ReciprocalBackward"
    }
}
/// Computes the element-wise reciprocal of `a`.
///
/// A tensor yielded by `meta_propagate::unary_same_shape` is returned directly; otherwise
/// `reciprocal_inner` runs inside a `"reciprocal"` profiler scope.
pub fn reciprocal<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(out) => Ok(out),
        None => crate::profiler_hook::profile_op_scope(
            "reciprocal",
            "tensor_op",
            &[a.shape()],
            || reciprocal_inner(a),
        ),
    }
}
/// Reciprocal implementation used by `reciprocal`.
///
/// On CPU the value is `1 / x` via `unary_map`. On CUDA it is composed, under `no_grad`,
/// as `ones / a` with the ones tensor created on the CPU and moved to `a`'s device. When
/// grad is needed, the forward result is cached in `ReciprocalBackward`.
fn reciprocal_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    // ---- CPU path ----
    if !a.is_cuda() {
        let result = unary_map(a, |x| <T as num_traits::One>::one() / x)?;
        if !needs_grad_unary(a) {
            return Ok(result);
        }
        let cached = result.clone();
        let (storage, shape) = result.into_storage_and_shape()?;
        return Tensor::from_operation(
            storage,
            shape,
            Arc::new(ReciprocalBackward {
                a: a.clone(),
                c: cached,
            }),
        );
    }

    // ---- CUDA path ----
    let forward = no_grad(|| {
        let unit = <T as num_traits::One>::one();
        let ones_cpu = Tensor::from_storage(
            TensorStorage::cpu(vec![unit; a.numel().max(1)]),
            a.shape().to_vec(),
            false,
        )?;
        let ones_dev = ones_cpu.to(a.device())?;
        div(&ones_dev, a)
    })?;

    let (storage, shape) = forward.clone().into_storage_and_shape()?;
    if !needs_grad_unary(a) {
        return Tensor::from_storage(storage, shape, false);
    }
    Tensor::from_operation(
        storage,
        shape,
        Arc::new(ReciprocalBackward {
            a: a.clone(),
            c: forward,
        }),
    )
}
/// Autograd node recorded for `remainder(a, b)`; keeps both operands.
#[derive(Debug)]
struct RemainderBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for RemainderBackward<T> {
    /// `da` is `grad_output` reduced to `a`'s shape; `db` is `-grad_output * floor(a / b)`,
    /// computed under `no_grad` and reduced to `b`'s shape.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let da = self
            .a
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.a.shape()))
            .transpose()?;

        let db = match self.b.requires_grad() {
            false => None,
            true => {
                let unreduced = no_grad(|| {
                    let quotient = div(&self.a, &self.b)?;
                    let floored = unary_map(&quotient, |x| x.floor())?;
                    let negated = neg(grad_output)?;
                    mul(&negated, &floored)
                })?;
                Some(reduce_grad_to_shape(&unreduced, self.b.shape())?)
            }
        };

        Ok(vec![da, db])
    }

    /// The two operands, in `(a, b)` order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "RemainderBackward"
    }
}
/// Element-wise, broadcasting modulo of `a` by `b` where a non-zero result takes the sign
/// of the divisor.
///
/// If `meta_propagate::binary_broadcast` yields a tensor, it is returned directly without
/// running the kernel. Otherwise `remainder_inner` runs inside a profiler scope labelled
/// `"remainder"`.
///
/// # Errors
/// * `FerrotorchError::DeviceMismatch` when `a` and `b` live on different devices.
/// * Any error from the meta-propagation hook or the kernel.
pub fn remainder<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (lhs_device, rhs_device) = (a.device(), b.device());
    if lhs_device != rhs_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: lhs_device,
            got: rhs_device,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(propagated) => Ok(propagated),
        None => crate::profiler_hook::profile_op_scope(
            "remainder",
            "tensor_op",
            &[a.shape(), b.shape()],
            || remainder_inner(a, b),
        ),
    }
}
/// Broadcasting kernel behind `remainder`: modulo whose non-zero results take the sign of
/// the divisor `b`.
///
/// Both operands are read through `data_vec()`, combined element by element, and the
/// result is placed on `a`'s device via `TensorStorage::on_device`. When gradients are
/// required the output is linked to a `RemainderBackward` node.
///
/// Operands of any rank are supported: each output index is decomposed on the fly against
/// broadcast-aware strides, so there is no fixed-size coordinate scratch buffer that a
/// high-rank tensor could overrun.
///
/// # Errors
/// Returns an error when the shapes are not broadcast-compatible, or when reading the
/// operands or constructing the output tensor fails.
fn remainder_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let out_shape = broadcast_shapes(a.shape(), b.shape())?;
    let device = a.device();
    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let out_numel: usize = out_shape.iter().product();
    let out_ndim = out_shape.len();

    // Row-major strides of `shape`, right-aligned to the output rank. Broadcast dims
    // (size 1, or missing on the left) get stride 0 so they never advance the index.
    let broadcast_strides = |shape: &[usize]| -> Vec<usize> {
        let pad = out_ndim - shape.len();
        let mut strides = vec![0usize; out_ndim];
        let mut running = 1usize;
        for (d, &extent) in shape.iter().enumerate().rev() {
            if extent != 1 {
                strides[d + pad] = running;
            }
            running *= extent;
        }
        strides
    };
    let a_strides = broadcast_strides(a.shape());
    let b_strides = broadcast_strides(b.shape());

    let zero = <T as num_traits::Zero>::zero();
    // At least one element is allocated even for an empty output, as before.
    let mut result = vec![zero; out_numel.max(1)];
    for (i, slot) in result.iter_mut().enumerate().take(out_numel) {
        // Peel coordinates off the linear output index from the innermost dim outwards,
        // accumulating each operand's flat offset as we go.
        let mut rem_i = i;
        let mut a_flat = 0usize;
        let mut b_flat = 0usize;
        for d in (0..out_ndim).rev() {
            let extent = out_shape[d];
            let coord = rem_i % extent;
            rem_i /= extent;
            a_flat += coord * a_strides[d];
            b_flat += coord * b_strides[d];
        }
        let av = a_data[a_flat];
        let bv = b_data[b_flat];
        // `%` truncates towards zero; shift by one divisor when the signs disagree so the
        // result follows the divisor's sign.
        let mut m = av % bv;
        if m != zero && (bv < zero) != (m < zero) {
            m += bv;
        }
        *slot = m;
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;
    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(RemainderBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}
/// Backward node for `fmod(a, b)`; keeps both forward operands.
#[derive(Debug)]
struct FmodBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for FmodBackward<T> {
    /// Produces `[d/da, d/db]`, each `None` when that operand does not require grad.
    ///
    /// * `d/da` is `grad_output` reduced to `a`'s shape.
    /// * `d/db` is `-grad_output * trunc(a / b)` reduced to `b`'s shape, computed under
    ///   `no_grad`.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let grad_a = self
            .a
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.a.shape()))
            .transpose()?;

        let grad_b = self
            .b
            .requires_grad()
            .then(|| -> FerrotorchResult<Tensor<T>> {
                let unreduced = no_grad(|| {
                    let quotient = div(&self.a, &self.b)?;
                    let truncated = unary_map(&quotient, |x| x.trunc())?;
                    let negated_grad = neg(grad_output)?;
                    mul(&negated_grad, &truncated)
                })?;
                reduce_grad_to_shape(&unreduced, self.b.shape())
            })
            .transpose()?;

        Ok(vec![grad_a, grad_b])
    }

    /// The forward operands, in the same order as the gradients from `backward`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "FmodBackward"
    }
}
/// Element-wise, broadcasting remainder of `a / b` computed with the plain `%` operator.
///
/// If `meta_propagate::binary_broadcast` yields a tensor, it is returned directly without
/// running the kernel. Otherwise `fmod_inner` runs inside a profiler scope labelled
/// `"fmod"`.
///
/// # Errors
/// * `FerrotorchError::DeviceMismatch` when `a` and `b` live on different devices.
/// * Any error from the meta-propagation hook or the kernel.
pub fn fmod<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (lhs_device, rhs_device) = (a.device(), b.device());
    if lhs_device != rhs_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: lhs_device,
            got: rhs_device,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(propagated) => Ok(propagated),
        None => crate::profiler_hook::profile_op_scope(
            "fmod",
            "tensor_op",
            &[a.shape(), b.shape()],
            || fmod_inner(a, b),
        ),
    }
}
/// Broadcasting kernel behind `fmod`: applies the `%` operator to each broadcast pair of
/// elements.
///
/// Both operands are read through `data_vec()`, combined element by element, and the
/// result is placed on `a`'s device via `TensorStorage::on_device`. When gradients are
/// required the output is linked to a `FmodBackward` node.
///
/// Operands of any rank are supported: each output index is decomposed on the fly against
/// broadcast-aware strides, so there is no fixed-size coordinate scratch buffer that a
/// high-rank tensor could overrun.
///
/// # Errors
/// Returns an error when the shapes are not broadcast-compatible, or when reading the
/// operands or constructing the output tensor fails.
fn fmod_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let out_shape = broadcast_shapes(a.shape(), b.shape())?;
    let device = a.device();
    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let out_numel: usize = out_shape.iter().product();
    let out_ndim = out_shape.len();

    // Row-major strides of `shape`, right-aligned to the output rank. Broadcast dims
    // (size 1, or missing on the left) get stride 0 so they never advance the index.
    let broadcast_strides = |shape: &[usize]| -> Vec<usize> {
        let pad = out_ndim - shape.len();
        let mut strides = vec![0usize; out_ndim];
        let mut running = 1usize;
        for (d, &extent) in shape.iter().enumerate().rev() {
            if extent != 1 {
                strides[d + pad] = running;
            }
            running *= extent;
        }
        strides
    };
    let a_strides = broadcast_strides(a.shape());
    let b_strides = broadcast_strides(b.shape());

    // At least one element is allocated even for an empty output, as before.
    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];
    for (i, slot) in result.iter_mut().enumerate().take(out_numel) {
        // Peel coordinates off the linear output index from the innermost dim outwards,
        // accumulating each operand's flat offset as we go.
        let mut rem_i = i;
        let mut a_flat = 0usize;
        let mut b_flat = 0usize;
        for d in (0..out_ndim).rev() {
            let extent = out_shape[d];
            let coord = rem_i % extent;
            rem_i /= extent;
            a_flat += coord * a_strides[d];
            b_flat += coord * b_strides[d];
        }
        *slot = a_data[a_flat] % b_data[b_flat];
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;
    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(FmodBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}
/// Graph node recorded for `floor_divide(a, b)`.
///
/// It keeps the operands so the graph stays connected, but differentiating through it is
/// deliberately unsupported: `backward` always returns an error.
#[derive(Debug)]
struct FloorDivideBackward<T: Float> {
    a: Tensor<T>,
    b: Tensor<T>,
}

impl<T: Float> GradFn<T> for FloorDivideBackward<T> {
    /// Always fails with `FerrotorchError::InvalidArgument`; the incoming gradient is
    /// ignored.
    fn backward(&self, _grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let message = String::from(
            "derivative for floor_divide is not implemented \
             (PyTorch parity: torch.floor_divide has no entry in \
             tools/autograd/derivatives.yaml and raises the same \
             error on .backward())",
        );
        Err(FerrotorchError::InvalidArgument { message })
    }

    /// The forward operands `a` and `b`, in that order.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a, &self.b]
    }

    fn name(&self) -> &'static str {
        "FloorDivideBackward"
    }
}
/// Element-wise, broadcasting floor division of `a` by `b`.
///
/// If `meta_propagate::binary_broadcast` yields a tensor, it is returned directly without
/// running the kernel. Otherwise `floor_divide_inner` runs inside a profiler scope
/// labelled `"floor_divide"`.
///
/// # Errors
/// * `FerrotorchError::DeviceMismatch` when `a` and `b` live on different devices.
/// * Any error from the meta-propagation hook or the kernel.
pub fn floor_divide<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let (lhs_device, rhs_device) = (a.device(), b.device());
    if lhs_device != rhs_device {
        return Err(FerrotorchError::DeviceMismatch {
            expected: lhs_device,
            got: rhs_device,
        });
    }
    match crate::meta_propagate::binary_broadcast(a, b)? {
        Some(propagated) => Ok(propagated),
        None => crate::profiler_hook::profile_op_scope(
            "floor_divide",
            "tensor_op",
            &[a.shape(), b.shape()],
            || floor_divide_inner(a, b),
        ),
    }
}
/// Broadcasting kernel behind `floor_divide`.
///
/// Per element pair `(av, bv)`:
/// * `bv == 0` yields the plain `av / bv` result.
/// * otherwise the quotient is rebuilt from the `%` remainder as `(av - m) / bv`, lowered
///   by one when the remainder and divisor disagree in sign, then floored (with a
///   round-up correction when the value sits more than one half above its floor). A zero
///   quotient keeps the sign of `av / bv`.
///
/// Both operands are read through `data_vec()` and the result is placed on `a`'s device
/// via `TensorStorage::on_device`. When gradients are required the output is linked to a
/// `FloorDivideBackward` node.
///
/// Operands of any rank are supported: each output index is decomposed on the fly against
/// broadcast-aware strides, so there is no fixed-size coordinate scratch buffer that a
/// high-rank tensor could overrun.
///
/// # Errors
/// Returns an error when the shapes are not broadcast-compatible, or when reading the
/// operands or constructing the output tensor fails.
fn floor_divide_inner<T: Float>(a: &Tensor<T>, b: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let out_shape = broadcast_shapes(a.shape(), b.shape())?;
    let device = a.device();
    let a_data = a.data_vec()?;
    let b_data = b.data_vec()?;
    let out_numel: usize = out_shape.iter().product();
    let out_ndim = out_shape.len();

    // Row-major strides of `shape`, right-aligned to the output rank. Broadcast dims
    // (size 1, or missing on the left) get stride 0 so they never advance the index.
    let broadcast_strides = |shape: &[usize]| -> Vec<usize> {
        let pad = out_ndim - shape.len();
        let mut strides = vec![0usize; out_ndim];
        let mut running = 1usize;
        for (d, &extent) in shape.iter().enumerate().rev() {
            if extent != 1 {
                strides[d + pad] = running;
            }
            running *= extent;
        }
        strides
    };
    let a_strides = broadcast_strides(a.shape());
    let b_strides = broadcast_strides(b.shape());

    let zero = <T as num_traits::Zero>::zero();
    let one = <T as num_traits::One>::one();
    let half = T::from(0.5_f64).unwrap_or(zero);
    // At least one element is allocated even for an empty output, as before.
    let mut result = vec![zero; out_numel.max(1)];
    for (i, slot) in result.iter_mut().enumerate().take(out_numel) {
        // Peel coordinates off the linear output index from the innermost dim outwards,
        // accumulating each operand's flat offset as we go.
        let mut rem_i = i;
        let mut a_flat = 0usize;
        let mut b_flat = 0usize;
        for d in (0..out_ndim).rev() {
            let extent = out_shape[d];
            let coord = rem_i % extent;
            rem_i /= extent;
            a_flat += coord * a_strides[d];
            b_flat += coord * b_strides[d];
        }
        let av = a_data[a_flat];
        let bv = b_data[b_flat];
        *slot = if bv == zero {
            // Division by zero: keep whatever plain float division produces.
            av / bv
        } else {
            let m = av % bv;
            let mut quotient = (av - m) / bv;
            if m != zero && (bv < zero) != (m < zero) {
                quotient = quotient - one;
            }
            if quotient == zero {
                // Preserve the sign of the true quotient on a zero result.
                zero.copysign(av / bv)
            } else {
                let floored = quotient.floor();
                if quotient - floored > half {
                    floored + one
                } else {
                    floored
                }
            }
        };
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;
    if needs_grad(a, b) {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(FloorDivideBackward {
                a: a.clone(),
                b: b.clone(),
            }),
        )
    } else {
        Ok(out)
    }
}
/// Backward node for `addcmul(input, tensor1, tensor2, value)`, i.e.
/// `input + value * tensor1 * tensor2`. Keeps all three operands and the scalar.
#[derive(Debug)]
struct AddcmulBackward<T: Float> {
    input: Tensor<T>,
    tensor1: Tensor<T>,
    tensor2: Tensor<T>,
    value: f64,
}

impl<T: Float> GradFn<T> for AddcmulBackward<T> {
    /// Produces `[d/d_input, d/d_tensor1, d/d_tensor2]`, each `None` when that operand
    /// does not require grad.
    ///
    /// * `d/d_input` is `grad_output`.
    /// * `d/d_tensor1` is `grad_output * tensor2 * value`.
    /// * `d/d_tensor2` is `grad_output * tensor1 * value`.
    ///
    /// Every gradient is reduced to the shape of the operand it belongs to.
    ///
    /// # Errors
    /// Returns `FerrotorchError::InvalidArgument` when `value` cannot be converted to `T`;
    /// the conversion is attempted regardless of which operands require grad.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let d_input = self
            .input
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.input.shape()))
            .transpose()?;

        let value_t = T::from(self.value).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "addcmul backward: value={} cannot be represented in the tensor dtype",
                self.value
            ),
        })?;

        // Multiplies `partial` by `value`, held in a 0-d tensor on grad_output's device.
        let scale_by_value = |partial: &Tensor<T>| -> FerrotorchResult<Tensor<T>> {
            let scale = Tensor::from_storage(
                TensorStorage::on_device(vec![value_t], grad_output.device())?,
                vec![],
                false,
            )?;
            mul(partial, &scale)
        };

        let d_tensor1 = if self.tensor1.requires_grad() {
            let unreduced = no_grad(|| {
                let grad_times_t2 = mul(grad_output, &self.tensor2)?;
                scale_by_value(&grad_times_t2)
            })?;
            Some(reduce_grad_to_shape(&unreduced, self.tensor1.shape())?)
        } else {
            None
        };

        let d_tensor2 = if self.tensor2.requires_grad() {
            let unreduced = no_grad(|| {
                let grad_times_t1 = mul(grad_output, &self.tensor1)?;
                scale_by_value(&grad_times_t1)
            })?;
            Some(reduce_grad_to_shape(&unreduced, self.tensor2.shape())?)
        } else {
            None
        };

        Ok(vec![d_input, d_tensor1, d_tensor2])
    }

    /// The forward operands, in the same order as the gradients from `backward`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input, &self.tensor1, &self.tensor2]
    }

    fn name(&self) -> &'static str {
        "AddcmulBackward"
    }
}
/// Computes `input + value * tensor1 * tensor2` element-wise with broadcasting.
///
/// The kernel (`addcmul_inner`) runs inside a profiler scope labelled `"addcmul"`.
///
/// # Errors
/// * `FerrotorchError::DeviceMismatch` when `tensor1` or `tensor2` (checked in that
///   order) is not on `input`'s device.
/// * Any error from the kernel.
pub fn addcmul<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    for operand in [tensor1, tensor2] {
        if input.device() != operand.device() {
            return Err(FerrotorchError::DeviceMismatch {
                expected: input.device(),
                got: operand.device(),
            });
        }
    }
    let operand_shapes = [input.shape(), tensor1.shape(), tensor2.shape()];
    crate::profiler_hook::profile_op_scope("addcmul", "tensor_op", &operand_shapes, || {
        addcmul_inner(input, tensor1, tensor2, value)
    })
}
/// Broadcasting kernel behind `addcmul`: `input + value * tensor1 * tensor2`.
///
/// The output shape is the broadcast of all three operand shapes. The operands are read
/// through `data_vec()`, combined element by element, and the result is placed on
/// `input`'s device via `TensorStorage::on_device`. When grad mode is enabled and any
/// operand requires grad, the output is linked to an `AddcmulBackward` node.
///
/// Operands of any rank are supported: each output index is decomposed on the fly against
/// broadcast-aware strides, so there is no fixed-size coordinate scratch buffer that a
/// high-rank tensor could overrun.
///
/// # Errors
/// * The shapes are not broadcast-compatible.
/// * `FerrotorchError::InvalidArgument` when `value` cannot be converted to `T`.
/// * Reading the operands or constructing the output tensor fails.
fn addcmul_inner<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    let t12_shape = broadcast_shapes(tensor1.shape(), tensor2.shape())?;
    let out_shape = broadcast_shapes(input.shape(), &t12_shape)?;
    let device = input.device();
    let input_data = input.data_vec()?;
    let t1_data = tensor1.data_vec()?;
    let t2_data = tensor2.data_vec()?;
    let out_numel: usize = out_shape.iter().product();
    let out_ndim = out_shape.len();

    // Row-major strides of `shape`, right-aligned to the output rank. Broadcast dims
    // (size 1, or missing on the left) get stride 0 so they never advance the index.
    let broadcast_strides = |shape: &[usize]| -> Vec<usize> {
        let pad = out_ndim - shape.len();
        let mut strides = vec![0usize; out_ndim];
        let mut running = 1usize;
        for (d, &extent) in shape.iter().enumerate().rev() {
            if extent != 1 {
                strides[d + pad] = running;
            }
            running *= extent;
        }
        strides
    };
    let input_strides = broadcast_strides(input.shape());
    let t1_strides = broadcast_strides(tensor1.shape());
    let t2_strides = broadcast_strides(tensor2.shape());

    let value_t = T::from(value).ok_or_else(|| FerrotorchError::InvalidArgument {
        message: format!("addcmul: value={value} cannot be represented in the tensor dtype"),
    })?;

    // At least one element is allocated even for an empty output, as before.
    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];
    for (i, slot) in result.iter_mut().enumerate().take(out_numel) {
        // Peel coordinates off the linear output index from the innermost dim outwards,
        // accumulating each operand's flat offset as we go.
        let mut rem_i = i;
        let mut i_flat = 0usize;
        let mut t1_flat = 0usize;
        let mut t2_flat = 0usize;
        for d in (0..out_ndim).rev() {
            let extent = out_shape[d];
            let coord = rem_i % extent;
            rem_i /= extent;
            i_flat += coord * input_strides[d];
            t1_flat += coord * t1_strides[d];
            t2_flat += coord * t2_strides[d];
        }
        *slot = input_data[i_flat] + value_t * t1_data[t1_flat] * t2_data[t2_flat];
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;
    let needs_g = is_grad_enabled()
        && (input.requires_grad() || tensor1.requires_grad() || tensor2.requires_grad());
    if needs_g {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(AddcmulBackward {
                input: input.clone(),
                tensor1: tensor1.clone(),
                tensor2: tensor2.clone(),
                value,
            }),
        )
    } else {
        Ok(out)
    }
}
/// Backward node for `addcdiv(input, tensor1, tensor2, value)`, i.e.
/// `input + value * tensor1 / tensor2`. Keeps all three operands and the scalar.
#[derive(Debug)]
struct AddcdivBackward<T: Float> {
    input: Tensor<T>,
    tensor1: Tensor<T>,
    tensor2: Tensor<T>,
    value: f64,
}

impl<T: Float> GradFn<T> for AddcdivBackward<T> {
    /// Produces `[d/d_input, d/d_tensor1, d/d_tensor2]`, each `None` when that operand
    /// does not require grad.
    ///
    /// * `d/d_input` is `grad_output`.
    /// * `d/d_tensor1` is `grad_output / tensor2 * value`.
    /// * `d/d_tensor2` is `-grad_output * tensor1 / tensor2 / tensor2 * value`.
    ///
    /// Every gradient is reduced to the shape of the operand it belongs to.
    ///
    /// # Errors
    /// Returns `FerrotorchError::InvalidArgument` when `value` cannot be converted to `T`;
    /// the conversion is attempted regardless of which operands require grad.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let d_input = self
            .input
            .requires_grad()
            .then(|| reduce_grad_to_shape(grad_output, self.input.shape()))
            .transpose()?;

        let value_t = T::from(self.value).ok_or_else(|| FerrotorchError::InvalidArgument {
            message: format!(
                "addcdiv backward: value={} cannot be represented in the tensor dtype",
                self.value
            ),
        })?;

        // Multiplies `partial` by `value`, held in a 0-d tensor on grad_output's device.
        let scale_by_value = |partial: &Tensor<T>| -> FerrotorchResult<Tensor<T>> {
            let scale = Tensor::from_storage(
                TensorStorage::on_device(vec![value_t], grad_output.device())?,
                vec![],
                false,
            )?;
            mul(partial, &scale)
        };

        let d_tensor1 = if self.tensor1.requires_grad() {
            let unreduced = no_grad(|| {
                let grad_over_t2 = div(grad_output, &self.tensor2)?;
                scale_by_value(&grad_over_t2)
            })?;
            Some(reduce_grad_to_shape(&unreduced, self.tensor1.shape())?)
        } else {
            None
        };

        let d_tensor2 = if self.tensor2.requires_grad() {
            let unreduced = no_grad(|| {
                let negated_grad = neg(grad_output)?;
                let numerator = mul(&negated_grad, &self.tensor1)?;
                // Two successive divisions rather than one division by tensor2 squared.
                let divided_once = div(&numerator, &self.tensor2)?;
                let divided_twice = div(&divided_once, &self.tensor2)?;
                scale_by_value(&divided_twice)
            })?;
            Some(reduce_grad_to_shape(&unreduced, self.tensor2.shape())?)
        } else {
            None
        };

        Ok(vec![d_input, d_tensor1, d_tensor2])
    }

    /// The forward operands, in the same order as the gradients from `backward`.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input, &self.tensor1, &self.tensor2]
    }

    fn name(&self) -> &'static str {
        "AddcdivBackward"
    }
}
/// Computes `input + value * tensor1 / tensor2` element-wise with broadcasting.
///
/// The kernel (`addcdiv_inner`) runs inside a profiler scope labelled `"addcdiv"`.
///
/// # Errors
/// * `FerrotorchError::DeviceMismatch` when `tensor1` or `tensor2` (checked in that
///   order) is not on `input`'s device.
/// * Any error from the kernel.
pub fn addcdiv<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    for operand in [tensor1, tensor2] {
        if input.device() != operand.device() {
            return Err(FerrotorchError::DeviceMismatch {
                expected: input.device(),
                got: operand.device(),
            });
        }
    }
    let operand_shapes = [input.shape(), tensor1.shape(), tensor2.shape()];
    crate::profiler_hook::profile_op_scope("addcdiv", "tensor_op", &operand_shapes, || {
        addcdiv_inner(input, tensor1, tensor2, value)
    })
}
/// Broadcasting kernel behind `addcdiv`: `input + value * tensor1 / tensor2`.
///
/// The output shape is the broadcast of all three operand shapes. The operands are read
/// through `data_vec()`, combined element by element, and the result is placed on
/// `input`'s device via `TensorStorage::on_device`. When grad mode is enabled and any
/// operand requires grad, the output is linked to an `AddcdivBackward` node.
///
/// Operands of any rank are supported: each output index is decomposed on the fly against
/// broadcast-aware strides, so there is no fixed-size coordinate scratch buffer that a
/// high-rank tensor could overrun.
///
/// # Errors
/// * The shapes are not broadcast-compatible.
/// * `FerrotorchError::InvalidArgument` when `value` cannot be converted to `T`.
/// * Reading the operands or constructing the output tensor fails.
fn addcdiv_inner<T: Float>(
    input: &Tensor<T>,
    tensor1: &Tensor<T>,
    tensor2: &Tensor<T>,
    value: f64,
) -> FerrotorchResult<Tensor<T>> {
    let t12_shape = broadcast_shapes(tensor1.shape(), tensor2.shape())?;
    let out_shape = broadcast_shapes(input.shape(), &t12_shape)?;
    let device = input.device();
    let input_data = input.data_vec()?;
    let t1_data = tensor1.data_vec()?;
    let t2_data = tensor2.data_vec()?;
    let out_numel: usize = out_shape.iter().product();
    let out_ndim = out_shape.len();

    // Row-major strides of `shape`, right-aligned to the output rank. Broadcast dims
    // (size 1, or missing on the left) get stride 0 so they never advance the index.
    let broadcast_strides = |shape: &[usize]| -> Vec<usize> {
        let pad = out_ndim - shape.len();
        let mut strides = vec![0usize; out_ndim];
        let mut running = 1usize;
        for (d, &extent) in shape.iter().enumerate().rev() {
            if extent != 1 {
                strides[d + pad] = running;
            }
            running *= extent;
        }
        strides
    };
    let input_strides = broadcast_strides(input.shape());
    let t1_strides = broadcast_strides(tensor1.shape());
    let t2_strides = broadcast_strides(tensor2.shape());

    let value_t = T::from(value).ok_or_else(|| FerrotorchError::InvalidArgument {
        message: format!("addcdiv: value={value} cannot be represented in the tensor dtype"),
    })?;

    // At least one element is allocated even for an empty output, as before.
    let mut result = vec![<T as num_traits::Zero>::zero(); out_numel.max(1)];
    for (i, slot) in result.iter_mut().enumerate().take(out_numel) {
        // Peel coordinates off the linear output index from the innermost dim outwards,
        // accumulating each operand's flat offset as we go.
        let mut rem_i = i;
        let mut i_flat = 0usize;
        let mut t1_flat = 0usize;
        let mut t2_flat = 0usize;
        for d in (0..out_ndim).rev() {
            let extent = out_shape[d];
            let coord = rem_i % extent;
            rem_i /= extent;
            i_flat += coord * input_strides[d];
            t1_flat += coord * t1_strides[d];
            t2_flat += coord * t2_strides[d];
        }
        *slot = input_data[i_flat] + value_t * t1_data[t1_flat] / t2_data[t2_flat];
    }

    let storage = TensorStorage::on_device(result, device)?;
    let out = Tensor::from_storage(storage, out_shape, false)?;
    let needs_g = is_grad_enabled()
        && (input.requires_grad() || tensor1.requires_grad() || tensor2.requires_grad());
    if needs_g {
        let (storage, shape) = out.into_storage_and_shape()?;
        Tensor::from_operation(
            storage,
            shape,
            Arc::new(AddcdivBackward {
                input: input.clone(),
                tensor1: tensor1.clone(),
                tensor2: tensor2.clone(),
                value,
            }),
        )
    } else {
        Ok(out)
    }
}
/// Backward node for `abs(a)`; keeps the forward input.
#[derive(Debug)]
struct AbsBackward<T: Float> {
    a: Tensor<T>,
}

impl<T: Float> GradFn<T> for AbsBackward<T> {
    /// Returns `[grad_output * sign(a)]`, or `[None]` when `a` does not require grad.
    ///
    /// * Both tensors on CUDA with `T` being `f32`/`f64`: dispatched to the GPU backend's
    ///   `abs_backward_f32` / `abs_backward_f64`.
    /// * Any other case involving a CUDA tensor is rejected.
    /// * Otherwise the product is computed element-wise on the host, with `sign` being
    ///   `1` for positive inputs, `-1` for negative inputs and `0` for everything else.
    ///
    /// # Errors
    /// * `FerrotorchError::DeviceUnavailable` when the GPU path is taken but no backend is
    ///   registered.
    /// * `FerrotorchError::NotImplementedOnCuda` for CUDA tensors not covered by the GPU
    ///   path.
    /// * Any error from reading tensor data or building the result tensor.
    fn backward(&self, grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        use crate::gpu_dispatch::gpu_backend;

        if !self.a.requires_grad() {
            return Ok(vec![None]);
        }

        if grad_output.is_cuda() && self.a.is_cuda() && (is_f32::<T>() || is_f64::<T>()) {
            let backend = gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
            let grad_contig = ensure_contig_for_gpu(grad_output)?;
            let input_contig = ensure_contig_for_gpu(&self.a)?;
            let handle = if is_f32::<T>() {
                backend.abs_backward_f32(grad_contig.gpu_handle()?, input_contig.gpu_handle()?)?
            } else {
                backend.abs_backward_f64(grad_contig.gpu_handle()?, input_contig.gpu_handle()?)?
            };
            let gpu_grad = Tensor::from_storage(
                TensorStorage::gpu(handle),
                self.a.shape().to_vec(),
                false,
            )?;
            return Ok(vec![Some(gpu_grad)]);
        }

        if grad_output.is_cuda() || self.a.is_cuda() {
            return Err(FerrotorchError::NotImplementedOnCuda { op: "AbsBackward" });
        }

        let upstream = grad_output.data()?;
        let inputs = self.a.data()?;
        let zero = <T as num_traits::Zero>::zero();
        let one = <T as num_traits::One>::one();
        // Explicit comparisons so that anything neither above nor below zero maps to 0.
        let sign_of = |x: T| {
            if x > zero {
                one
            } else if x < zero {
                -one
            } else {
                zero
            }
        };
        let host_grad: Vec<T> = upstream
            .iter()
            .zip(inputs.iter())
            .map(|(&g, &x)| g * sign_of(x))
            .collect();
        let grad_a = Tensor::from_storage(
            TensorStorage::cpu(host_grad),
            self.a.shape().to_vec(),
            false,
        )?;
        Ok(vec![Some(grad_a)])
    }

    /// The single forward input.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.a]
    }

    fn name(&self) -> &'static str {
        "AbsBackward"
    }
}
/// Element-wise absolute value of `a`.
///
/// If `meta_propagate::unary_same_shape` yields a tensor for `a`, that tensor is returned
/// as-is and no kernel runs. Otherwise `abs_inner` runs inside a profiler scope labelled
/// `"abs"`.
///
/// # Errors
/// Propagates any error raised by the meta-propagation hook or by the kernel.
pub fn abs<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    match crate::meta_propagate::unary_same_shape(a)? {
        Some(propagated) => Ok(propagated),
        None => crate::profiler_hook::profile_op_scope("abs", "tensor_op", &[a.shape()], || {
            abs_inner(a)
        }),
    }
}
/// Kernel behind `abs`.
///
/// * CUDA tensors with `T` being `f32`/`f64`: the input is made contiguous for the GPU
///   and passed to the backend's `abs_f32` / `abs_f64`.
/// * Everything else: `unary_map` with `x.abs()`.
///
/// When `a` needs a gradient the result is attached to an `AbsBackward` node holding `a`.
///
/// # Errors
/// `FerrotorchError::DeviceUnavailable` when the GPU path is taken but no backend is
/// registered, plus any error from the underlying kernels or tensor construction.
fn abs_inner<T: Float>(a: &Tensor<T>) -> FerrotorchResult<Tensor<T>> {
    let use_gpu_kernel = a.is_cuda() && (is_f32::<T>() || is_f64::<T>());
    if !use_gpu_kernel {
        let mapped = unary_map(a, |x| x.abs())?;
        if !needs_grad_unary(a) {
            return Ok(mapped);
        }
        let (storage, shape) = mapped.into_storage_and_shape()?;
        return Tensor::from_operation(storage, shape, Arc::new(AbsBackward { a: a.clone() }));
    }

    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let contiguous = ensure_contig_for_gpu(a)?;
    let handle = if is_f32::<T>() {
        backend.abs_f32(contiguous.gpu_handle()?)?
    } else {
        backend.abs_f64(contiguous.gpu_handle()?)?
    };
    let storage = TensorStorage::gpu(handle);
    let shape = contiguous.shape().to_vec();
    if needs_grad_unary(a) {
        let grad_fn = Arc::new(AbsBackward { a: a.clone() });
        Tensor::from_operation(storage, shape, grad_fn)
    } else {
        Tensor::from_storage(storage, shape, false)
    }
}
#[cfg(test)]
mod tests {
use super::*;
fn leaf_scalar(val: f32, requires_grad: bool) -> Tensor<f32> {
Tensor::from_storage(TensorStorage::cpu(vec![val]), vec![], requires_grad).unwrap()
}
fn leaf_vec(data: &[f32], requires_grad: bool) -> Tensor<f32> {
Tensor::from_storage(
TensorStorage::cpu(data.to_vec()),
vec![data.len()],
requires_grad,
)
.unwrap()
}
fn assert_scalar_approx(t: &Tensor<f32>, expected: f32, tol: f32) {
let val = t.item().unwrap();
assert!(
(val - expected).abs() < tol,
"expected {expected}, got {val}"
);
}
#[test]
fn test_add_forward() {
let a = leaf_vec(&[1.0, 2.0, 3.0], false);
let b = leaf_vec(&[4.0, 5.0, 6.0], false);
let c = add(&a, &b).unwrap();
assert_eq!(c.data().unwrap(), &[5.0, 7.0, 9.0]);
}
#[test]
fn test_sub_forward() {
let a = leaf_vec(&[10.0, 20.0, 30.0], false);
let b = leaf_vec(&[1.0, 2.0, 3.0], false);
let c = sub(&a, &b).unwrap();
assert_eq!(c.data().unwrap(), &[9.0, 18.0, 27.0]);
}
#[test]
fn test_mul_forward() {
let a = leaf_vec(&[2.0, 3.0, 4.0], false);
let b = leaf_vec(&[5.0, 6.0, 7.0], false);
let c = mul(&a, &b).unwrap();
assert_eq!(c.data().unwrap(), &[10.0, 18.0, 28.0]);
}
#[test]
fn test_div_forward() {
let a = leaf_vec(&[10.0, 20.0, 30.0], false);
let b = leaf_vec(&[2.0, 5.0, 10.0], false);
let c = div(&a, &b).unwrap();
assert_eq!(c.data().unwrap(), &[5.0, 4.0, 3.0]);
}
#[test]
fn test_neg_forward() {
let a = leaf_vec(&[1.0, -2.0, 3.0], false);
let c = neg(&a).unwrap();
assert_eq!(c.data().unwrap(), &[-1.0, 2.0, -3.0]);
}
#[test]
fn test_pow_forward() {
let a = leaf_vec(&[2.0, 3.0, 4.0], false);
let c = pow(&a, 2.0).unwrap();
let d = c.data().unwrap();
assert!((d[0] - 4.0).abs() < 1e-6);
assert!((d[1] - 9.0).abs() < 1e-6);
assert!((d[2] - 16.0).abs() < 1e-6);
}
#[test]
fn test_sqrt_forward() {
let a = leaf_vec(&[4.0, 9.0, 16.0], false);
let c = sqrt(&a).unwrap();
let d = c.data().unwrap();
assert!((d[0] - 2.0).abs() < 1e-6);
assert!((d[1] - 3.0).abs() < 1e-6);
assert!((d[2] - 4.0).abs() < 1e-6);
}
#[test]
fn test_abs_forward() {
let a = leaf_vec(&[-3.0, 0.0, 5.0], false);
let c = abs(&a).unwrap();
assert_eq!(c.data().unwrap(), &[3.0, 0.0, 5.0]);
}
#[test]
fn test_rsub_forward_alpha_one() -> FerrotorchResult<()> {
let a = leaf_vec(&[1.0, 2.0, 3.0], false);
let b = leaf_vec(&[10.0, 20.0, 30.0], false);
let c = rsub(&a, &b, 1.0)?;
assert_eq!(c.data()?, &[9.0, 18.0, 27.0]);
Ok(())
}
#[test]
fn test_rsub_forward_alpha_general() -> FerrotorchResult<()> {
let a = leaf_vec(&[1.0, 2.0, 3.0], false);
let b = leaf_vec(&[10.0, 20.0, 30.0], false);
let c = rsub(&a, &b, 2.0)?;
assert_eq!(c.data()?, &[8.0, 16.0, 24.0]);
Ok(())
}
#[test]
fn test_rsub_forward_alpha_negative() -> FerrotorchResult<()> {
let a = leaf_vec(&[1.0, 2.0, 3.0], false);
let b = leaf_vec(&[10.0, 20.0, 30.0], false);
let c = rsub(&a, &b, -1.0)?;
assert_eq!(c.data()?, &[11.0, 22.0, 33.0]);
Ok(())
}
#[test]
fn test_rsub_backward_alpha_one() -> FerrotorchResult<()> {
let a = leaf_scalar(2.0, true);
let b = leaf_scalar(5.0, true);
let c = rsub(&a, &b, 1.0)?;
c.backward()?;
let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "b.grad missing".into(),
})?;
assert_scalar_approx(&ga, -1.0, 1e-6);
assert_scalar_approx(&gb, 1.0, 1e-6);
Ok(())
}
#[test]
fn test_rsub_backward_alpha_general() -> FerrotorchResult<()> {
let a = leaf_scalar(3.0, true);
let b = leaf_scalar(7.0, true);
let c = rsub(&a, &b, 2.5)?;
c.backward()?;
let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
let gb = b.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "b.grad missing".into(),
})?;
assert_scalar_approx(&ga, -2.5, 1e-6);
assert_scalar_approx(&gb, 1.0, 1e-6);
Ok(())
}
#[test]
fn test_rsub_matches_sub_with_swapped_operands() -> FerrotorchResult<()> {
let a = leaf_vec(&[1.5, -2.0, 0.25, 4.0], false);
let b = leaf_vec(&[3.0, 1.0, -0.5, 2.0], false);
for alpha in [1.0_f64, 2.0, -1.0, 0.0, 0.5] {
let r = rsub(&a, &b, alpha)?;
let s = sub_scaled(&b, &a, alpha)?;
assert_eq!(r.data()?, s.data()?, "alpha={alpha}");
}
Ok(())
}
#[test]
fn test_add_backward() {
let a = leaf_scalar(2.0, true);
let b = leaf_scalar(3.0, true);
let c = add(&a, &b).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
assert_scalar_approx(&b.grad().unwrap().unwrap(), 1.0, 1e-6);
}
#[test]
fn test_sub_backward() {
let a = leaf_scalar(5.0, true);
let b = leaf_scalar(3.0, true);
let c = sub(&a, &b).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 1.0, 1e-6);
assert_scalar_approx(&b.grad().unwrap().unwrap(), -1.0, 1e-6);
}
#[test]
fn test_mul_backward() {
let a = leaf_scalar(2.0, true);
let b = leaf_scalar(3.0, true);
let c = mul(&a, &b).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 3.0, 1e-6);
assert_scalar_approx(&b.grad().unwrap().unwrap(), 2.0, 1e-6);
}
#[test]
fn test_div_backward() {
let a = leaf_scalar(6.0, true);
let b = leaf_scalar(4.0, true);
let c = div(&a, &b).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 0.25, 1e-6);
assert_scalar_approx(&b.grad().unwrap().unwrap(), -0.375, 1e-6);
}
#[test]
fn test_div_backward_tensor_by_scalar() {
let x = Tensor::from_storage(
TensorStorage::cpu(vec![1.0f64, 2.0, 3.0, 4.0]),
vec![2, 2],
true,
)
.unwrap();
let s = Tensor::from_storage(TensorStorage::cpu(vec![2.0f64]), vec![], false).unwrap();
let y = div(&x, &s).unwrap();
let loss = crate::grad_fns::reduction::sum(&y).unwrap();
loss.backward().unwrap();
let grad = x.grad().unwrap().expect("x should have grad");
assert_eq!(grad.shape(), &[2, 2]);
let g = grad.data().unwrap();
for (i, &v) in g.iter().enumerate() {
assert!((v - 0.5).abs() < 1e-10, "grad[{i}] = {v}, expected 0.5");
}
}
#[test]
fn test_neg_backward() {
let a = leaf_scalar(7.0, true);
let c = neg(&a).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), -1.0, 1e-6);
}
#[test]
fn test_pow_backward() {
let a = leaf_scalar(2.0, true);
let c = pow(&a, 3.0).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 12.0, 1e-5);
}
#[test]
fn test_sqrt_backward() {
let a = leaf_scalar(4.0, true);
let c = sqrt(&a).unwrap();
c.backward().unwrap();
assert_scalar_approx(&a.grad().unwrap().unwrap(), 0.25, 1e-6);
}
#[test]
fn test_rsqrt_forward() -> FerrotorchResult<()> {
let a = leaf_vec(&[4.0, 16.0, 100.0], false);
let c = rsqrt(&a)?;
let d = c.data()?;
assert!((d[0] - 0.5).abs() < 1e-6);
assert!((d[1] - 0.25).abs() < 1e-6);
assert!((d[2] - 0.1).abs() < 1e-6);
Ok(())
}
#[test]
fn test_rsqrt_forward_edges() -> FerrotorchResult<()> {
let a = leaf_vec(&[0.0, -1.0, f32::INFINITY], false);
let c = rsqrt(&a)?;
let d = c.data()?;
assert!(
d[0].is_infinite() && d[0] > 0.0,
"rsqrt(0) -> +Inf, got {}",
d[0]
);
assert!(d[1].is_nan(), "rsqrt(-1) -> NaN, got {}", d[1]);
assert!(d[2] == 0.0, "rsqrt(+Inf) -> 0, got {}", d[2]);
Ok(())
}
#[test]
fn test_rsqrt_backward() -> FerrotorchResult<()> {
let a = leaf_scalar(4.0, true);
let c = rsqrt(&a)?;
c.backward()?;
let a_val: f32 = 4.0;
let expected = -0.5_f32 / a_val.powf(1.5);
assert!(
(expected - (-0.0625)).abs() < 1e-7,
"expected formula sanity"
);
let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
assert_scalar_approx(&ga, expected, 1e-6);
Ok(())
}
#[test]
fn test_rsqrt_backward_vector() -> FerrotorchResult<()> {
let a = leaf_vec(&[1.0, 4.0, 9.0], true);
let c = rsqrt(&a)?;
let c_data = c.data()?.to_vec();
let total: f32 = c_data.iter().sum();
let sum_backward = SumBackward { input: c.clone() };
let loss = Tensor::from_operation(
TensorStorage::cpu(vec![total]),
vec![],
Arc::new(sum_backward),
)?;
loss.backward()?;
let grad = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
let g = grad.data()?;
let expected = [
-0.5_f32 / 1.0_f32.powf(1.5),
-0.5_f32 / 4.0_f32.powf(1.5),
-0.5_f32 / 9.0_f32.powf(1.5),
];
assert!(
(g[0] - expected[0]).abs() < 1e-6,
"g[0]={}, expected {}",
g[0],
expected[0]
);
assert!(
(g[1] - expected[1]).abs() < 1e-6,
"g[1]={}, expected {}",
g[1],
expected[1]
);
assert!(
(g[2] - expected[2]).abs() < 1e-6,
"g[2]={}, expected {}",
g[2],
expected[2]
);
Ok(())
}
#[test]
fn test_reciprocal_forward() -> FerrotorchResult<()> {
let a = leaf_vec(&[2.0, 4.0, 5.0], false);
let c = reciprocal(&a)?;
let d = c.data()?;
assert!((d[0] - 0.5).abs() < 1e-6);
assert!((d[1] - 0.25).abs() < 1e-6);
assert!((d[2] - 0.2).abs() < 1e-6);
Ok(())
}
#[test]
fn test_reciprocal_forward_edges() -> FerrotorchResult<()> {
let a = leaf_vec(
&[0.0, -0.0, f32::INFINITY, f32::NEG_INFINITY, f32::NAN],
false,
);
let c = reciprocal(&a)?;
let d = c.data()?;
assert!(
d[0].is_infinite() && d[0] > 0.0,
"reciprocal(+0) -> +Inf, got {}",
d[0]
);
assert!(
d[1].is_infinite() && d[1] < 0.0,
"reciprocal(-0) -> -Inf, got {}",
d[1]
);
assert!(
d[2] == 0.0 && !d[2].is_sign_negative(),
"reciprocal(+Inf) -> +0, got {} (sign_neg={})",
d[2],
d[2].is_sign_negative()
);
assert!(
d[3] == 0.0 && d[3].is_sign_negative(),
"reciprocal(-Inf) -> -0, got {} (sign_neg={})",
d[3],
d[3].is_sign_negative()
);
assert!(d[4].is_nan(), "reciprocal(NaN) -> NaN, got {}", d[4]);
Ok(())
}
#[test]
fn test_reciprocal_backward_scalar() -> FerrotorchResult<()> {
let a = leaf_scalar(4.0, true);
let c = reciprocal(&a)?;
c.backward()?;
let a_val: f32 = 4.0;
let expected = -1.0_f32 / (a_val * a_val);
assert!(
(expected - (-0.0625)).abs() < 1e-7,
"expected formula sanity"
);
let ga = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
assert_scalar_approx(&ga, expected, 1e-6);
Ok(())
}
#[test]
fn test_reciprocal_backward_vector() -> FerrotorchResult<()> {
let a = leaf_vec(&[2.0, 4.0], true);
let c = reciprocal(&a)?;
let c_data = c.data()?.to_vec();
let total: f32 = c_data.iter().sum();
let sum_backward = SumBackward { input: c.clone() };
let loss = Tensor::from_operation(
TensorStorage::cpu(vec![total]),
vec![],
Arc::new(sum_backward),
)?;
loss.backward()?;
let grad = a.grad()?.ok_or_else(|| FerrotorchError::InvalidArgument {
message: "a.grad missing".into(),
})?;
let g = grad.data()?;
let expected = [-1.0_f32 / (2.0_f32 * 2.0), -1.0_f32 / (4.0_f32 * 4.0)];
assert!(
(expected[0] - (-0.25)).abs() < 1e-7,
"expected[0] formula sanity"
);
assert!(
(expected[1] - (-0.0625)).abs() < 1e-7,
"expected[1] formula sanity"
);
assert!(
(g[0] - expected[0]).abs() < 1e-6,
"g[0]={}, expected {}",
g[0],
expected[0]
);
assert!(
(g[1] - expected[1]).abs() < 1e-6,
"g[1]={}, expected {}",
g[1],
expected[1]
);
Ok(())
}
const REMAINDER_SIGN_CASES: [(f32, f32, f32); 4] = [
(5.0, 3.0, 2.0), (-5.0, 3.0, 1.0), (5.0, -3.0, -1.0), (-5.0, -3.0, -2.0), ];
#[test]
fn test_remainder_forward_sign_cases() -> FerrotorchResult<()> {
for (a_val, b_val, expected) in REMAINDER_SIGN_CASES {
let a = leaf_vec(&[a_val], false);
let b = leaf_vec(&[b_val], false);
let c = remainder(&a, &b)?;
let d = c.data()?;
assert!(
(d[0] - expected).abs() < 1e-6,
"remainder({a_val}, {b_val}) = {} (expected {expected})",
d[0],
);
}
Ok(())
}
#[test]
fn test_remainder_forward_div_by_zero() -> FerrotorchResult<()> {
let a = leaf_vec(&[5.0], false);
let b = leaf_vec(&[0.0], false);
let c = remainder(&a, &b)?;
let d = c.data()?;
assert!(d[0].is_nan(), "remainder(5, 0) -> NaN, got {}", d[0]);
Ok(())
}
#[test]
fn test_remainder_forward_nan_propagation() -> FerrotorchResult<()> {
let a_nan = leaf_vec(&[f32::NAN], false);
let b = leaf_vec(&[3.0], false);
let c = remainder(&a_nan, &b)?;
let d = c.data()?;
assert!(d[0].is_nan(), "remainder(NaN, 3) -> NaN, got {}", d[0]);
let a = leaf_vec(&[5.0], false);
let b_nan = leaf_vec(&[f32::NAN], false);
let c = remainder(&a, &b_nan)?;
let d = c.data()?;
assert!(d[0].is_nan(), "remainder(5, NaN) -> NaN, got {}", d[0]);
Ok(())
}
/// Element-wise `remainder` over a four-element vector that covers every
/// sign combination of dividend and divisor.
///
/// The output length is asserted explicitly so that surplus or missing
/// elements are reported as such instead of going unchecked (or surfacing
/// as an index-out-of-bounds panic).
#[test]
fn test_remainder_forward_vector() -> FerrotorchResult<()> {
    let a = leaf_vec(&[5.0, -5.0, 5.0, -5.0], false);
    let b = leaf_vec(&[3.0, 3.0, -3.0, -3.0], false);
    let c = remainder(&a, &b)?;
    let d = c.data()?;
    let expected = [2.0_f32, 1.0, -1.0, -2.0];
    // Pin the element count first: `zip` below stops at the shorter side.
    assert_eq!(d.len(), expected.len(), "vec remainder output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "vec remainder[{i}] = {} (expected {})",
            got,
            want,
        );
    }
    Ok(())
}
/// Backward pass of `remainder(7, 3)`: the test expects `d/da = 1` and
/// `d/db = -floor(a / b) = -2`.
#[test]
fn test_remainder_backward_scalar() -> FerrotorchResult<()> {
    let a = leaf_scalar(7.0, true);
    let b = leaf_scalar(3.0, true);
    let c = remainder(&a, &b)?;
    let forward_err = (c.item()? - 1.0).abs();
    assert!(forward_err < 1e-6, "forward remainder(7,3) = 1");
    c.backward()?;

    let Some(ga) = a.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        });
    };
    let Some(gb) = b.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        });
    };

    let expected_da: f32 = 1.0;
    let expected_db: f32 = -(7.0_f32 / 3.0_f32).floor();
    // Guard against a typo in the reference formula itself.
    let formula_err = (expected_db - (-2.0)).abs();
    assert!(
        formula_err < 1e-7,
        "expected formula sanity: -floor(7/3) = -2"
    );
    assert_scalar_approx(&ga, expected_da, 1e-6);
    assert_scalar_approx(&gb, expected_db, 1e-6);
    Ok(())
}
/// Backward pass of `remainder(-7, 3)`: forward is 2, and the test expects
/// `d/da = 1` and `d/db = -floor(-7 / 3) = 3`.
#[test]
fn test_remainder_backward_negative_dividend() -> FerrotorchResult<()> {
    let a = leaf_scalar(-7.0, true);
    let b = leaf_scalar(3.0, true);
    let c = remainder(&a, &b)?;
    let forward_err = (c.item()? - 2.0).abs();
    assert!(
        forward_err < 1e-6,
        "forward remainder(-7,3) = 2, got {}",
        c.item()?,
    );
    c.backward()?;

    let Some(ga) = a.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        });
    };
    let Some(gb) = b.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        });
    };

    let expected_da: f32 = 1.0;
    let expected_db: f32 = -(-7.0_f32 / 3.0_f32).floor();
    // Guard against a typo in the reference formula itself.
    let formula_err = (expected_db - 3.0).abs();
    assert!(
        formula_err < 1e-7,
        "expected formula sanity: -floor(-7/3) = 3"
    );
    assert_scalar_approx(&ga, expected_da, 1e-6);
    assert_scalar_approx(&gb, expected_db, 1e-6);
    Ok(())
}
/// `(dividend, divisor, expected)` triples covering all four sign
/// combinations for `fmod`. The expected value takes the sign of the
/// dividend (truncated remainder).
const FMOD_SIGN_CASES: [(f32, f32, f32); 4] = [
    (5.0, 3.0, 2.0),    // + / +
    (-5.0, 3.0, -2.0),  // - / +
    (5.0, -3.0, 2.0),   // + / -
    (-5.0, -3.0, -2.0), // - / -
];
/// Runs `fmod` on one-element tensors for every `(dividend, divisor,
/// expected)` triple in `FMOD_SIGN_CASES` and checks the result to 1e-6.
#[test]
fn test_fmod_forward_sign_cases() -> FerrotorchResult<()> {
    for &(a_val, b_val, expected) in &FMOD_SIGN_CASES {
        let lhs = leaf_vec(&[a_val], false);
        let rhs = leaf_vec(&[b_val], false);
        let out = fmod(&lhs, &rhs)?;
        let values = out.data()?;
        let abs_err = (values[0] - expected).abs();
        assert!(
            abs_err < 1e-6,
            "fmod({a_val}, {b_val}) = {} (expected {expected})",
            values[0],
        );
    }
    Ok(())
}
/// `fmod(5, 0)` must succeed and yield NaN.
#[test]
fn test_fmod_forward_div_by_zero() -> FerrotorchResult<()> {
    let dividend = leaf_vec(&[5.0], false);
    let zero_divisor = leaf_vec(&[0.0], false);
    let out = fmod(&dividend, &zero_divisor)?;
    let values = out.data()?;
    let first = values[0];
    assert!(first.is_nan(), "fmod(5, 0) -> NaN, got {}", first);
    Ok(())
}
/// A NaN in either operand of `fmod` must come out as NaN.
#[test]
fn test_fmod_forward_nan_propagation() -> FerrotorchResult<()> {
    // NaN as the dividend.
    {
        let lhs = leaf_vec(&[f32::NAN], false);
        let rhs = leaf_vec(&[3.0], false);
        let out = fmod(&lhs, &rhs)?;
        let values = out.data()?;
        assert!(
            values[0].is_nan(),
            "fmod(NaN, 3) -> NaN, got {}",
            values[0]
        );
    }
    // NaN as the divisor.
    {
        let lhs = leaf_vec(&[5.0], false);
        let rhs = leaf_vec(&[f32::NAN], false);
        let out = fmod(&lhs, &rhs)?;
        let values = out.data()?;
        assert!(
            values[0].is_nan(),
            "fmod(5, NaN) -> NaN, got {}",
            values[0]
        );
    }
    Ok(())
}
/// Element-wise `fmod` over a four-element vector that covers every sign
/// combination of dividend and divisor.
///
/// The output length is asserted explicitly so that surplus or missing
/// elements are reported as such instead of going unchecked (or surfacing
/// as an index-out-of-bounds panic).
#[test]
fn test_fmod_forward_vector() -> FerrotorchResult<()> {
    let a = leaf_vec(&[5.0, -5.0, 5.0, -5.0], false);
    let b = leaf_vec(&[3.0, 3.0, -3.0, -3.0], false);
    let c = fmod(&a, &b)?;
    let d = c.data()?;
    let expected = [2.0_f32, -2.0, 2.0, -2.0];
    // Pin the element count first: `zip` below stops at the shorter side.
    assert_eq!(d.len(), expected.len(), "vec fmod output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "vec fmod[{i}] = {} (expected {})",
            got,
            want,
        );
    }
    Ok(())
}
/// On `(-5, 3)` the two modulo flavours must disagree: `fmod` gives -2
/// (sign of dividend) while `remainder` gives 1 (sign of divisor).
#[test]
fn test_fmod_vs_remainder_sign_contrast() -> FerrotorchResult<()> {
    let a = leaf_vec(&[-5.0], false);
    let b = leaf_vec(&[3.0], false);

    let fm = fmod(&a, &b)?;
    let fmod_val = fm.data()?[0];
    let fmod_err = (fmod_val - (-2.0_f32)).abs();
    assert!(
        fmod_err < 1e-6,
        "fmod(-5,3) = {} (expected -2.0 — sign of dividend)",
        fmod_val,
    );

    let rem = remainder(&a, &b)?;
    let rem_val = rem.data()?[0];
    let rem_err = (rem_val - 1.0_f32).abs();
    assert!(
        rem_err < 1e-6,
        "remainder(-5,3) = {} (expected 1.0 — sign of divisor)",
        rem_val,
    );

    // The whole point of the test: the two results must not coincide.
    let gap = (fmod_val - rem_val).abs();
    assert!(
        gap > 1e-6,
        "fmod and remainder must differ on (-5,3): fmod={}, remainder={}",
        fmod_val,
        rem_val,
    );
    Ok(())
}
/// Backward pass of `fmod(7, 3)`: the test expects `d/da = 1` and
/// `d/db = -trunc(a / b) = -2`.
#[test]
fn test_fmod_backward_scalar() -> FerrotorchResult<()> {
    let a = leaf_scalar(7.0, true);
    let b = leaf_scalar(3.0, true);
    let c = fmod(&a, &b)?;
    let forward_err = (c.item()? - 1.0).abs();
    assert!(forward_err < 1e-6, "forward fmod(7,3) = 1");
    c.backward()?;

    let Some(ga) = a.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        });
    };
    let Some(gb) = b.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        });
    };

    let expected_da: f32 = 1.0;
    let expected_db: f32 = -(7.0_f32 / 3.0_f32).trunc();
    // Guard against a typo in the reference formula itself.
    let formula_err = (expected_db - (-2.0)).abs();
    assert!(
        formula_err < 1e-7,
        "expected formula sanity: -trunc(7/3) = -2"
    );
    assert_scalar_approx(&ga, expected_da, 1e-6);
    assert_scalar_approx(&gb, expected_db, 1e-6);
    Ok(())
}
/// Backward pass of `fmod(-7, 3)`: forward is -1, and the test expects
/// `d/da = 1` and `d/db = -trunc(-7 / 3) = 2`.
#[test]
fn test_fmod_backward_negative_dividend() -> FerrotorchResult<()> {
    let a = leaf_scalar(-7.0, true);
    let b = leaf_scalar(3.0, true);
    let c = fmod(&a, &b)?;
    let forward_err = (c.item()? - (-1.0)).abs();
    assert!(
        forward_err < 1e-6,
        "forward fmod(-7,3) = -1, got {}",
        c.item()?,
    );
    c.backward()?;

    let Some(ga) = a.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "a.grad missing".into(),
        });
    };
    let Some(gb) = b.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "b.grad missing".into(),
        });
    };

    let expected_da: f32 = 1.0;
    let expected_db: f32 = -(-7.0_f32 / 3.0_f32).trunc();
    // Guard against a typo in the reference formula itself.
    let formula_err = (expected_db - 2.0).abs();
    assert!(
        formula_err < 1e-7,
        "expected formula sanity: -trunc(-7/3) = 2"
    );
    assert_scalar_approx(&ga, expected_da, 1e-6);
    assert_scalar_approx(&gb, expected_db, 1e-6);
    Ok(())
}
/// `floor_divide(7, 3)` with both operands positive must be 2.
#[test]
fn test_floor_divide_sign_pos_pos() -> FerrotorchResult<()> {
    let expected: f32 = 2.0;
    let numerator = leaf_scalar(7.0, false);
    let denominator = leaf_scalar(3.0, false);
    let c = floor_divide(&numerator, &denominator)?;
    let abs_err = (c.item()? - expected).abs();
    assert!(
        abs_err < 1e-6,
        "floor_divide(7, 3) expected {expected}, got {}",
        c.item()?,
    );
    Ok(())
}
/// `floor_divide(-7, 3)` must round toward negative infinity (-3), not
/// toward zero (-2).
#[test]
fn test_floor_divide_sign_neg_pos() -> FerrotorchResult<()> {
    let expected: f32 = -3.0;
    // Only used in the failure message, to show what truncation would give.
    let trunc_would_be: f32 = -2.0;
    let numerator = leaf_scalar(-7.0, false);
    let denominator = leaf_scalar(3.0, false);
    let c = floor_divide(&numerator, &denominator)?;
    let abs_err = (c.item()? - expected).abs();
    assert!(
        abs_err < 1e-6,
        "floor_divide(-7, 3) expected {expected} (true floor), got {} \
         (trunc would give {trunc_would_be})",
        c.item()?,
    );
    Ok(())
}
/// `floor_divide(7, -3)` must be -3.
#[test]
fn test_floor_divide_sign_pos_neg() -> FerrotorchResult<()> {
    let expected: f32 = -3.0;
    let numerator = leaf_scalar(7.0, false);
    let denominator = leaf_scalar(-3.0, false);
    let c = floor_divide(&numerator, &denominator)?;
    let abs_err = (c.item()? - expected).abs();
    assert!(
        abs_err < 1e-6,
        "floor_divide(7, -3) expected {expected}, got {}",
        c.item()?,
    );
    Ok(())
}
/// `floor_divide(-7, -3)` with both operands negative must be 2.
#[test]
fn test_floor_divide_sign_neg_neg() -> FerrotorchResult<()> {
    let expected: f32 = 2.0;
    let numerator = leaf_scalar(-7.0, false);
    let denominator = leaf_scalar(-3.0, false);
    let c = floor_divide(&numerator, &denominator)?;
    let abs_err = (c.item()? - expected).abs();
    assert!(
        abs_err < 1e-6,
        "floor_divide(-7, -3) expected {expected}, got {}",
        c.item()?,
    );
    Ok(())
}
/// A positive numerator floor-divided by zero must give +Inf.
#[test]
fn test_floor_divide_div_by_zero_pos() -> FerrotorchResult<()> {
    let numerator = leaf_scalar(5.0, false);
    let zero = leaf_scalar(0.0, false);
    let v = floor_divide(&numerator, &zero)?.item()?;
    // Equality with +Inf is the same predicate as "infinite and positive".
    assert!(
        v == f32::INFINITY,
        "floor_divide(5, 0) expected +Inf, got {v}"
    );
    Ok(())
}
/// A negative numerator floor-divided by zero must give -Inf.
#[test]
fn test_floor_divide_div_by_zero_neg() -> FerrotorchResult<()> {
    let numerator = leaf_scalar(-5.0, false);
    let zero = leaf_scalar(0.0, false);
    let v = floor_divide(&numerator, &zero)?.item()?;
    // Equality with -Inf is the same predicate as "infinite and negative".
    assert!(
        v == f32::NEG_INFINITY,
        "floor_divide(-5, 0) expected -Inf, got {v}"
    );
    Ok(())
}
/// `floor_divide(0, 0)` must give NaN.
#[test]
fn test_floor_divide_zero_by_zero() -> FerrotorchResult<()> {
    let numerator = leaf_scalar(0.0, false);
    let denominator = leaf_scalar(0.0, false);
    let v = floor_divide(&numerator, &denominator)?.item()?;
    assert!(v.is_nan(), "floor_divide(0, 0) expected NaN, got {v}");
    Ok(())
}
/// A NaN in either operand of `floor_divide` must come out as NaN.
#[test]
fn test_floor_divide_nan_propagation() -> FerrotorchResult<()> {
    // NaN as the numerator.
    let nan_numerator = leaf_scalar(f32::NAN, false);
    let three = leaf_scalar(3.0, false);
    let from_nan_numerator = floor_divide(&nan_numerator, &three)?.item()?;
    assert!(from_nan_numerator.is_nan(), "floor_divide(NaN, 3) -> NaN");

    // NaN as the denominator.
    let five = leaf_scalar(5.0, false);
    let nan_denominator = leaf_scalar(f32::NAN, false);
    let from_nan_denominator = floor_divide(&five, &nan_denominator)?.item()?;
    assert!(from_nan_denominator.is_nan(), "floor_divide(5, NaN) -> NaN");
    Ok(())
}
/// On `(-7, 3)` the three ops must give three distinct values
/// (`floor_divide` = -3, `remainder` = 2, `fmod` = -1), and
/// `floor_divide(a, b) * b + remainder(a, b)` must reconstruct `a`.
#[test]
fn test_floor_divide_three_way_sign_contrast() -> FerrotorchResult<()> {
    let a = leaf_scalar(-7.0, false);
    let b = leaf_scalar(3.0, false);
    let fd = floor_divide(&a, &b)?.item()?;
    let rem = remainder(&a, &b)?.item()?;
    let fm = fmod(&a, &b)?.item()?;

    let fd_err = (fd - (-3.0)).abs();
    assert!(fd_err < 1e-6, "floor_divide(-7, 3) = -3, got {fd}");
    let rem_err = (rem - 2.0).abs();
    assert!(rem_err < 1e-6, "remainder(-7, 3) = 2, got {rem}");
    let fm_err = (fm - (-1.0)).abs();
    assert!(fm_err < 1e-6, "fmod(-7, 3) = -1, got {fm}");

    // Every pair must be clearly separated.
    let pairwise_gaps = [(fd - rem).abs(), (fd - fm).abs(), (rem - fm).abs()];
    assert!(
        pairwise_gaps.iter().all(|gap| *gap > 1e-3),
        "3-way contrast (-7, 3) collapsed: fd={fd}, rem={rem}, fm={fm}",
    );

    // Division identity: quotient * divisor + remainder == dividend.
    let recovered = fd * 3.0_f32 + rem;
    let identity_err = (recovered - (-7.0)).abs();
    assert!(
        identity_err < 1e-6,
        "identity broken: floor_divide(a,b)*b + remainder(a,b) = {recovered}, expected -7",
    );
    Ok(())
}
/// With `requires_grad = false` on both inputs, the `floor_divide` result
/// must carry no `grad_fn`.
#[test]
fn test_floor_divide_no_grad_fn_when_inputs_detached() -> FerrotorchResult<()> {
    let numerator = leaf_scalar(7.0, false);
    let denominator = leaf_scalar(3.0, false);
    let quotient = floor_divide(&numerator, &denominator)?;
    let has_grad_fn = quotient.grad_fn().is_some();
    assert!(
        !has_grad_fn,
        "floor_divide on requires_grad=false inputs should not attach grad_fn"
    );
    Ok(())
}
/// With `requires_grad = true` inputs, `floor_divide` must attach a
/// `grad_fn`, and calling `backward` on the result must fail with an
/// `InvalidArgument` error whose message mentions `floor_divide`.
#[test]
fn test_floor_divide_backward_errors() -> FerrotorchResult<()> {
    let a = leaf_scalar(7.0, true);
    let b = leaf_scalar(3.0, true);
    let c = floor_divide(&a, &b)?;
    let has_grad_fn = c.grad_fn().is_some();
    assert!(
        has_grad_fn,
        "floor_divide on requires_grad=true inputs MUST attach grad_fn \
         (upstream attaches <NotImplemented object>)"
    );

    let err = c.backward().expect_err(
        "floor_divide backward must fail (upstream raises 'derivative for \
         aten::floor_divide is not implemented')",
    );
    // Both the variant and the op name in the message are part of the contract.
    let is_invalid_arg_with_op_name = match &err {
        FerrotorchError::InvalidArgument { message } => message.contains("floor_divide"),
        _ => false,
    };
    assert!(
        is_invalid_arg_with_op_name,
        "expected InvalidArgument mentioning 'floor_divide', got {err:?}",
    );
    Ok(())
}
/// `floor_divide` of a two-element vector by a one-element vector: the
/// single divisor is expected to apply to both numerators.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_floor_divide_broadcast() -> FerrotorchResult<()> {
    let a = leaf_vec(&[7.0, -7.0], false);
    let b = leaf_vec(&[3.0], false);
    let c = floor_divide(&a, &b)?;
    let d = c.data()?.to_vec();
    let expected: [f32; 2] = [2.0, -3.0];
    assert_eq!(
        d.len(),
        expected.len(),
        "broadcast floor_divide output length"
    );
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "broadcast floor_divide[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Forward check for `addcmul` with `value = 1.0`; the expected values are
/// `input + tensor1 * tensor2` worked out by hand.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_addcmul_forward_default_value() -> FerrotorchResult<()> {
    let input = leaf_vec(&[1.0, 2.0, 3.0], false);
    let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
    let t2 = leaf_vec(&[7.0, 8.0, 9.0], false);
    let c = addcmul(&input, &t1, &t2, 1.0)?;
    let d = c.data()?.to_vec();
    // 1 + 4*7, 2 + 5*8, 3 + 6*9
    let expected: [f32; 3] = [29.0, 42.0, 57.0];
    assert_eq!(d.len(), expected.len(), "addcmul output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "addcmul[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Forward check for `addcmul` with `value = 0.5`; the expected values are
/// `input + 0.5 * tensor1 * tensor2` worked out by hand.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_addcmul_forward_value_half() -> FerrotorchResult<()> {
    let input = leaf_vec(&[1.0, 2.0, 3.0], false);
    let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
    let t2 = leaf_vec(&[7.0, 8.0, 9.0], false);
    let c = addcmul(&input, &t1, &t2, 0.5)?;
    let d = c.data()?.to_vec();
    // 1 + 0.5*28, 2 + 0.5*40, 3 + 0.5*54
    let expected: [f32; 3] = [15.0, 22.0, 30.0];
    assert_eq!(d.len(), expected.len(), "addcmul output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "addcmul[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Forward check for `addcmul` with `value = -1.0`; the expected values are
/// `input - tensor1 * tensor2` worked out by hand.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_addcmul_forward_value_negative_one() -> FerrotorchResult<()> {
    let input = leaf_vec(&[10.0, 20.0, 30.0], false);
    let t1 = leaf_vec(&[2.0, 3.0, 4.0], false);
    let t2 = leaf_vec(&[3.0, 4.0, 5.0], false);
    let c = addcmul(&input, &t1, &t2, -1.0)?;
    let d = c.data()?.to_vec();
    // 10 - 2*3, 20 - 3*4, 30 - 4*5
    let expected: [f32; 3] = [4.0, 8.0, 10.0];
    assert_eq!(d.len(), expected.len(), "addcmul output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "addcmul[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// `addcmul` with a shape-`[3]` input and two shape-`[2, 3]` tensors: the
/// output must have shape `[2, 3]` and match the hand-computed values.
#[test]
fn test_addcmul_broadcast_3way() -> FerrotorchResult<()> {
    let input_storage = TensorStorage::cpu(vec![1.0_f32, 2.0, 3.0]);
    let input = Tensor::from_storage(input_storage, vec![3], false)?;

    let t1_storage = TensorStorage::cpu(vec![10.0_f32, 20.0, 30.0, 40.0, 50.0, 60.0]);
    let t1 = Tensor::from_storage(t1_storage, vec![2, 3], false)?;

    let t2_storage = TensorStorage::cpu(vec![1.0_f32, 1.0, 1.0, 2.0, 2.0, 2.0]);
    let t2 = Tensor::from_storage(t2_storage, vec![2, 3], false)?;

    let c = addcmul(&input, &t1, &t2, 1.0)?;
    assert_eq!(c.shape(), &[2, 3], "addcmul broadcast output shape");

    let d = c.data()?.to_vec();
    // Row 0: [1,2,3] + [10,20,30]*1; row 1: [1,2,3] + [40,50,60]*2.
    let expected: [f32; 6] = [11.0, 22.0, 33.0, 81.0, 102.0, 123.0];
    for (i, (got, want)) in d.iter().zip(&expected).enumerate() {
        let abs_err = (got - want).abs();
        assert!(
            abs_err < 1e-6,
            "addcmul broadcast[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Places one NaN in each of the three `addcmul` operands (at positions 0,
/// 1 and 2 respectively) and checks that every output position is NaN.
#[test]
fn test_addcmul_forward_nan_propagation() -> FerrotorchResult<()> {
    let input = leaf_vec(&[f32::NAN, 2.0, 3.0], false);
    let t1 = leaf_vec(&[1.0, f32::NAN, 1.0], false);
    let t2 = leaf_vec(&[1.0, 1.0, f32::NAN], false);
    let d = addcmul(&input, &t1, &t2, 1.0)?.data()?.to_vec();

    let from_input = d[0];
    assert!(
        from_input.is_nan(),
        "NaN in input must propagate (got {})",
        from_input
    );
    let from_t1 = d[1];
    assert!(
        from_t1.is_nan(),
        "NaN in tensor1 must propagate (got {})",
        from_t1
    );
    let from_t2 = d[2];
    assert!(
        from_t2.is_nan(),
        "NaN in tensor2 must propagate (got {})",
        from_t2
    );
    Ok(())
}
/// Backward pass of `addcmul(3, 5, 7, value = 2)`: forward is 73, and the
/// test expects gradients 1 (input), 14 (tensor1) and 10 (tensor2).
#[test]
fn test_addcmul_backward_value_two() -> FerrotorchResult<()> {
    let input = leaf_scalar(3.0, true);
    let t1 = leaf_scalar(5.0, true);
    let t2 = leaf_scalar(7.0, true);
    let c = addcmul(&input, &t1, &t2, 2.0)?;
    assert!(c.grad_fn().is_some(), "addcmul must attach grad_fn");

    let fwd = c.data()?.to_vec();
    // 3 + 2 * 5 * 7
    let forward_err = (fwd[0] - 73.0).abs();
    assert!(
        forward_err < 1e-6,
        "addcmul forward = {}, expected 73",
        fwd[0]
    );
    c.backward()?;

    let Some(g_input) = input.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcmul backward: input gradient missing".into(),
        });
    };
    assert_scalar_approx(&g_input, 1.0, 1e-6);

    let Some(g_t1) = t1.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcmul backward: tensor1 gradient missing".into(),
        });
    };
    // value * tensor2 = 2 * 7
    assert_scalar_approx(&g_t1, 14.0, 1e-6);

    let Some(g_t2) = t2.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcmul backward: tensor2 gradient missing".into(),
        });
    };
    // value * tensor1 = 2 * 5
    assert_scalar_approx(&g_t2, 10.0, 1e-6);
    Ok(())
}
/// Forward check for `addcdiv` with `value = 1.0`; the expected values are
/// `input + tensor1 / tensor2` worked out by hand.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_addcdiv_forward_default_value() -> FerrotorchResult<()> {
    let input = leaf_vec(&[1.0, 2.0, 3.0], false);
    let t1 = leaf_vec(&[4.0, 5.0, 6.0], false);
    let t2 = leaf_vec(&[2.0, 2.0, 2.0], false);
    let c = addcdiv(&input, &t1, &t2, 1.0)?;
    let d = c.data()?.to_vec();
    // 1 + 4/2, 2 + 5/2, 3 + 6/2
    let expected: [f32; 3] = [3.0, 4.5, 6.0];
    assert_eq!(d.len(), expected.len(), "addcdiv output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "addcdiv[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Forward check for `addcdiv` with `value = 2.0`; the expected values are
/// `input + 2 * tensor1 / tensor2` worked out by hand.
///
/// The output length is asserted before the element comparison because
/// `zip` stops at the shorter side; without it an empty or truncated
/// result would pass without checking anything.
#[test]
fn test_addcdiv_forward_value_two() -> FerrotorchResult<()> {
    let input = leaf_vec(&[10.0, 20.0, 30.0], false);
    let t1 = leaf_vec(&[2.0, 4.0, 6.0], false);
    let t2 = leaf_vec(&[4.0, 4.0, 4.0], false);
    let c = addcdiv(&input, &t1, &t2, 2.0)?;
    let d = c.data()?.to_vec();
    // 10 + 2*2/4, 20 + 2*4/4, 30 + 2*6/4
    let expected: [f32; 3] = [11.0, 22.0, 33.0];
    assert_eq!(d.len(), expected.len(), "addcdiv output length");
    for (i, (got, want)) in d.iter().zip(expected.iter()).enumerate() {
        assert!(
            (got - want).abs() < 1e-6,
            "addcdiv[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// `addcdiv` with an all-zero divisor: a positive numerator must give +Inf,
/// a negative one -Inf, and `0 / 0` must give NaN.
#[test]
fn test_addcdiv_forward_div_by_zero() -> FerrotorchResult<()> {
    let input = leaf_vec(&[1.0, 1.0, 1.0], false);
    let t1 = leaf_vec(&[1.0, -1.0, 0.0], false);
    let t2 = leaf_vec(&[0.0, 0.0, 0.0], false);
    let d = addcdiv(&input, &t1, &t2, 1.0)?.data()?.to_vec();

    // Equality with an infinity is the same predicate as "infinite with that sign".
    let pos_case = d[0];
    assert!(
        pos_case == f32::INFINITY,
        "addcdiv(1, 1, 0) expected +Inf, got {}",
        pos_case
    );
    let neg_case = d[1];
    assert!(
        neg_case == f32::NEG_INFINITY,
        "addcdiv(1, -1, 0) expected -Inf, got {}",
        neg_case
    );
    let indeterminate_case = d[2];
    assert!(
        indeterminate_case.is_nan(),
        "addcdiv(1, 0, 0) expected NaN (1 + 0/0), got {}",
        indeterminate_case
    );
    Ok(())
}
/// Places one NaN in each of the three `addcdiv` operands (at positions 0,
/// 1 and 2 respectively) and checks that every output position is NaN.
#[test]
fn test_addcdiv_forward_nan_propagation() -> FerrotorchResult<()> {
    let input = leaf_vec(&[f32::NAN, 2.0, 3.0], false);
    let t1 = leaf_vec(&[1.0, f32::NAN, 1.0], false);
    let t2 = leaf_vec(&[1.0, 1.0, f32::NAN], false);
    let d = addcdiv(&input, &t1, &t2, 1.0)?.data()?.to_vec();

    let from_input = d[0];
    assert!(
        from_input.is_nan(),
        "NaN in input must propagate (got {})",
        from_input
    );
    let from_t1 = d[1];
    assert!(
        from_t1.is_nan(),
        "NaN in tensor1 must propagate (got {})",
        from_t1
    );
    let from_t2 = d[2];
    assert!(
        from_t2.is_nan(),
        "NaN in tensor2 must propagate (got {})",
        from_t2
    );
    Ok(())
}
/// `addcdiv` with a shape-`[3]` input and two shape-`[2, 3]` tensors: the
/// output must have shape `[2, 3]` and match the hand-computed values.
#[test]
fn test_addcdiv_broadcast_3way() -> FerrotorchResult<()> {
    let input_storage = TensorStorage::cpu(vec![1.0_f32, 2.0, 3.0]);
    let input = Tensor::from_storage(input_storage, vec![3], false)?;

    let t1_storage = TensorStorage::cpu(vec![10.0_f32, 20.0, 30.0, 40.0, 50.0, 60.0]);
    let t1 = Tensor::from_storage(t1_storage, vec![2, 3], false)?;

    let t2_storage = TensorStorage::cpu(vec![2.0_f32, 4.0, 5.0, 8.0, 10.0, 12.0]);
    let t2 = Tensor::from_storage(t2_storage, vec![2, 3], false)?;

    let c = addcdiv(&input, &t1, &t2, 1.0)?;
    assert_eq!(c.shape(), &[2, 3], "addcdiv broadcast output shape");

    let d = c.data()?.to_vec();
    // Quotients t1/t2 are [5,5,6, 5,5,5]; add [1,2,3] to each row.
    let expected: [f32; 6] = [6.0, 7.0, 9.0, 6.0, 7.0, 8.0];
    for (i, (got, want)) in d.iter().zip(&expected).enumerate() {
        let abs_err = (got - want).abs();
        assert!(
            abs_err < 1e-6,
            "addcdiv broadcast[{i}] = {got}, expected {want}"
        );
    }
    Ok(())
}
/// Backward pass of `addcdiv(3, 8, 4, value = 2)`: forward is 7, and the
/// test expects gradients 1 (input), 0.5 (tensor1) and -1 (tensor2).
#[test]
fn test_addcdiv_backward_value_two() -> FerrotorchResult<()> {
    let input = leaf_scalar(3.0, true);
    let t1 = leaf_scalar(8.0, true);
    let t2 = leaf_scalar(4.0, true);
    let c = addcdiv(&input, &t1, &t2, 2.0)?;
    assert!(c.grad_fn().is_some(), "addcdiv must attach grad_fn");

    let fwd = c.data()?.to_vec();
    // 3 + 2 * 8 / 4
    let forward_err = (fwd[0] - 7.0).abs();
    assert!(
        forward_err < 1e-6,
        "addcdiv forward = {}, expected 7",
        fwd[0]
    );
    c.backward()?;

    let Some(g_input) = input.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcdiv backward: input gradient missing".into(),
        });
    };
    assert_scalar_approx(&g_input, 1.0, 1e-6);

    let Some(g_t1) = t1.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcdiv backward: tensor1 gradient missing".into(),
        });
    };
    // value / tensor2 = 2 / 4
    assert_scalar_approx(&g_t1, 0.5, 1e-6);

    let Some(g_t2) = t2.grad()? else {
        return Err(FerrotorchError::InvalidArgument {
            message: "addcdiv backward: tensor2 gradient missing".into(),
        });
    };
    // -value * tensor1 / tensor2^2 = -2 * 8 / 16
    assert_scalar_approx(&g_t2, -1.0, 1e-6);
    Ok(())
}
/// The gradient of `abs` at a positive input (3) is expected to be +1.
#[test]
fn test_abs_backward_positive() {
    let x = leaf_scalar(3.0, true);
    let y = abs(&x).unwrap();
    y.backward().unwrap();
    let grad_x = x.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_x, 1.0, 1e-6);
}
/// The gradient of `abs` at a negative input (-3) is expected to be -1.
#[test]
fn test_abs_backward_negative() {
    let x = leaf_scalar(-3.0, true);
    let y = abs(&x).unwrap();
    y.backward().unwrap();
    let grad_x = x.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_x, -1.0, 1e-6);
}
/// Adding two tensors that both have `requires_grad = false` must not
/// attach a `grad_fn` to the result.
#[test]
fn test_add_no_grad_fn_when_inputs_detached() {
    let lhs = leaf_scalar(2.0, false);
    let rhs = leaf_scalar(3.0, false);
    let sum = add(&lhs, &rhs);
    let c = sum.unwrap();
    assert!(c.grad_fn().is_none());
}
/// `mul` with only the left operand requiring grad: the result still gets a
/// `grad_fn`, the left operand receives gradient 5 (the right operand's
/// value), and the right operand receives no gradient.
#[test]
fn test_mul_partial_requires_grad() {
    let a = leaf_scalar(3.0, true);
    let b = leaf_scalar(5.0, false);
    let product = mul(&a, &b);
    let c = product.unwrap();
    assert!(c.grad_fn().is_some());
    c.backward().unwrap();
    let grad_a = a.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_a, 5.0, 1e-6);
    assert!(b.grad().unwrap().is_none());
}
/// An `add` executed inside `no_grad` must not attach a `grad_fn`, even
/// though both inputs have `requires_grad = true`.
#[test]
fn test_no_grad_context_skips_backward() {
    use crate::autograd::no_grad::no_grad;
    let lhs = leaf_scalar(2.0, true);
    let rhs = leaf_scalar(3.0, true);
    let result = no_grad(|| add(&lhs, &rhs));
    let c = result.unwrap();
    assert!(c.grad_fn().is_none());
}
/// Chain `d = a * b + b` at `a = 2, b = 3`: expects `dd/da = b = 3` and
/// `dd/db = a + 1 = 3`.
#[test]
fn test_chain_mul_add() {
    let a = leaf_scalar(2.0, true);
    let b = leaf_scalar(3.0, true);
    let product = mul(&a, &b).unwrap();
    let total = add(&product, &b).unwrap();
    total.backward().unwrap();
    let grad_a = a.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_a, 3.0, 1e-6);
    let grad_b = b.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_b, 3.0, 1e-6);
}
/// Chain `e = a / b - a` at `a = 3, b = 2`: expects `de/da = 1/b - 1 = -0.5`
/// and `de/db = -a / b^2 = -0.75`.
#[test]
fn test_chain_div_sub() {
    let a = leaf_scalar(3.0, true);
    let b = leaf_scalar(2.0, true);
    let quotient = div(&a, &b).unwrap();
    let diff = sub(&quotient, &a).unwrap();
    diff.backward().unwrap();
    let grad_a = a.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_a, -0.5, 1e-5);
    let grad_b = b.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_b, -0.75, 1e-5);
}
/// Chain `c = sqrt(a)^2` at `a = 9`: the composition is the identity on
/// `a`, so the expected gradient is 1.
#[test]
fn test_chain_sqrt_pow() {
    let a = leaf_scalar(9.0, true);
    let root = sqrt(&a).unwrap();
    let squared = pow(&root, 2.0).unwrap();
    squared.backward().unwrap();
    let grad_a = a.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_a, 1.0, 1e-5);
}
/// Negating twice is the identity, so the gradient w.r.t. the input is
/// expected to be 1.
#[test]
fn test_neg_double() {
    let a = leaf_scalar(5.0, true);
    let once = neg(&a).unwrap();
    let twice = neg(&once).unwrap();
    twice.backward().unwrap();
    let grad_a = a.grad().unwrap().unwrap();
    assert_scalar_approx(&grad_a, 1.0, 1e-6);
}
/// Element-wise `mul` of two three-element vectors, reduced to a scalar
/// loss through the local `SumBackward` helper. Each operand's gradient is
/// expected to equal the other operand's values.
#[test]
fn test_mul_vector_backward() {
    let a = leaf_vec(&[1.0, 2.0, 3.0], true);
    let b = leaf_vec(&[4.0, 5.0, 6.0], true);
    let c = mul(&a, &b).unwrap();

    // Build the scalar loss by hand: the forward value is the plain sum of
    // the products, and `SumBackward` links it back to `c` in the graph.
    let total: f32 = c.data().unwrap().iter().sum();
    let loss = Tensor::from_operation(
        TensorStorage::cpu(vec![total]),
        Vec::new(),
        Arc::new(SumBackward { input: c.clone() }),
    )
    .unwrap();
    loss.backward().unwrap();

    // d(sum(a * b)) / da = b
    let grad_a = a.grad().unwrap().unwrap();
    let a_g = grad_a.data().unwrap();
    assert!((a_g[0] - 4.0).abs() < 1e-6);
    assert!((a_g[1] - 5.0).abs() < 1e-6);
    assert!((a_g[2] - 6.0).abs() < 1e-6);

    // d(sum(a * b)) / db = a
    let grad_b = b.grad().unwrap().unwrap();
    let b_g = grad_b.data().unwrap();
    assert!((b_g[0] - 1.0).abs() < 1e-6);
    assert!((b_g[1] - 2.0).abs() < 1e-6);
    assert!((b_g[2] - 3.0).abs() < 1e-6);
}
/// Test-only gradient node standing in for a sum reduction over `input`.
#[derive(Debug)]
struct SumBackward<T: Float> {
    /// The tensor that was summed to produce the scalar loss.
    input: Tensor<T>,
}

impl<T: Float> GradFn<T> for SumBackward<T> {
    /// Returns a single gradient: a CPU tensor of ones with `input`'s shape.
    ///
    /// The incoming `_grad_output` is ignored, so the result is not scaled
    /// by it. NOTE(review): this presumably relies on callers seeding
    /// `backward` with a gradient of one — confirm before reusing elsewhere.
    fn backward(&self, _grad_output: &Tensor<T>) -> FerrotorchResult<Vec<Option<Tensor<T>>>> {
        let element_count = self.input.numel();
        let unit = <T as num_traits::One>::one();
        let storage = TensorStorage::cpu(vec![unit; element_count]);
        let shape = self.input.shape().to_vec();
        Tensor::from_storage(storage, shape, false).map(|ones| vec![Some(ones)])
    }

    /// The only graph input is the summed tensor.
    fn inputs(&self) -> Vec<&Tensor<T>> {
        vec![&self.input]
    }

    /// Name reported for this node.
    fn name(&self) -> &'static str {
        "SumBackward"
    }
}
}