use core::fmt;
use core::ops::{Add, Div, Mul, Neg, Sub};
use axonml_core::Device;
use axonml_core::backends::CpuBackend;
#[cfg(feature = "cuda")]
use axonml_core::backends::CudaBackend;
use axonml_core::dtype::{Float, Numeric, Scalar};
use axonml_core::error::{Error, Result};
use axonml_core::storage::Storage;
use num_traits::NumCast;
#[cfg(feature = "cuda")]
mod cuda_accel {
    use super::*;
    use axonml_core::backends::cuda::get_cuda_backend;

    /// Returns the process-wide CUDA backend handle, if one is available.
    pub fn get_cuda() -> Option<&'static CudaBackend> {
        get_cuda_backend()
    }

    /// Single-precision matrix multiply on the GPU: `C = A (m×k) · B (k×n)`.
    ///
    /// Returns `None` when no CUDA backend exists or when any transfer,
    /// allocation, or kernel launch fails.
    pub fn cuda_matmul(a: &[f32], b: &[f32], m: usize, n: usize, k: usize) -> Option<Vec<f32>> {
        let backend = get_cuda()?;
        let dev_a = backend.htod_copy(a).ok()?;
        let dev_b = backend.htod_copy(b).ok()?;
        let mut dev_c = backend.alloc::<f32>(m * n).ok()?;
        // Operands are passed as (B, A) with dims (n, m, k): the usual trick
        // for computing a row-major product through a column-major GEMM.
        backend
            .gemm_f32(
                false, false, n, m, k, 1.0, &dev_b, n, &dev_a, k, 0.0, &mut dev_c, n,
            )
            .ok()?;
        backend.dtoh_copy(&dev_c).ok()
    }
}
use crate::shape::{
Shape, Strides, broadcast_shape, broadcast_strides, contiguous_strides, is_contiguous,
linear_index, normalize_dim, numel, reshape, squeeze, transpose_shape, transpose_strides,
unsqueeze,
};
/// Reinterprets a `&Tensor<T>` as a `&Tensor<f32>` so it can be handed to the
/// f32-only CUDA kernels.
///
/// # Safety
/// Sound only when `T` is actually `f32`; the runtime assert enforces this, so
/// the pointer cast is a no-op reinterpretation of the same type.
#[cfg(feature = "cuda")]
unsafe fn gpu_ref<T: Scalar>(t: &Tensor<T>) -> &Tensor<f32> {
    assert!(
        is_f32::<T>(),
        "gpu_ref: only Tensor<f32> can be used for GPU operations, got {:?}",
        T::DTYPE
    );
    // SAFETY: the assert above guarantees T == f32, so Tensor<T> and
    // Tensor<f32> are the same type and the cast cannot change layout.
    unsafe { &*(t as *const Tensor<T> as *const Tensor<f32>) }
}
/// Converts an owned `Tensor<f32>` produced by a CUDA kernel into a
/// `Tensor<T>` by value, without running `Drop` twice.
///
/// # Safety
/// Sound only when `T` is actually `f32`; the runtime assert enforces this.
#[cfg(feature = "cuda")]
unsafe fn gpu_into<T: Scalar>(t: Tensor<f32>) -> Tensor<T> {
    assert!(
        is_f32::<T>(),
        "gpu_into: only Tensor<f32> can be produced from GPU operations, got {:?}",
        T::DTYPE
    );
    unsafe {
        // SAFETY: T == f32 (asserted above), so reading the bytes of `t` as a
        // Tensor<T> is a plain move; `mem::forget` prevents `t`'s destructor
        // from running so the moved-out value is not dropped twice.
        let out = std::ptr::read(&t as *const Tensor<f32> as *const Tensor<T>);
        std::mem::forget(t);
        out
    }
}
/// Compile-time-generic check for "is `T` exactly `f32`?", used to guard the
/// f32-only GPU code paths.
#[cfg(feature = "cuda")]
fn is_f32<T: 'static>() -> bool {
    use std::any::TypeId;
    TypeId::of::<T>() == TypeId::of::<f32>()
}
/// A dense n-dimensional array over scalar type `T`.
///
/// View operations (transpose, squeeze, unsqueeze, permute) clone `storage`
/// and change only `shape`/`strides`/`offset`, so several tensors may describe
/// different windows onto the same buffer.
#[derive(Clone)]
pub struct Tensor<T: Scalar> {
    // Backing buffer; may live on the CPU or (behind the "cuda" feature) a GPU.
    pub(crate) storage: Storage<T>,
    // Size of each dimension; empty for a 0-dimensional scalar tensor.
    pub(crate) shape: Shape,
    // Per-dimension step in *elements* (not bytes); signed, and not
    // necessarily contiguous for view tensors.
    pub(crate) strides: Strides,
    // Index of the first logical element within `storage`.
    pub(crate) offset: usize,
}
impl<T: Scalar> Tensor<T> {
    /// Wraps an existing storage buffer as a contiguous tensor of `shape`.
    ///
    /// # Errors
    /// Fails when `numel(shape)` does not equal the storage length.
    pub fn from_storage(storage: Storage<T>, shape: &[usize]) -> Result<Self> {
        let total = numel(shape);
        if total != storage.len() {
            return Err(Error::shape_mismatch(&[storage.len()], shape));
        }
        let shape = Shape::from_slice(shape);
        let strides = contiguous_strides(&shape);
        Ok(Self {
            storage,
            shape,
            strides,
            offset: 0,
        })
    }

    /// Builds a CPU tensor that takes ownership of `data`.
    pub fn from_vec(data: Vec<T>, shape: &[usize]) -> Result<Self> {
        let storage = Storage::from_vec(data, Device::Cpu);
        Self::from_storage(storage, shape)
    }

    /// Builds a CPU tensor by copying `data`.
    pub fn from_slice(data: &[T], shape: &[usize]) -> Result<Self> {
        let storage = Storage::from_slice(data, Device::Cpu);
        Self::from_storage(storage, shape)
    }

    /// Creates a 0-dimensional (scalar) tensor holding `value` on the CPU.
    pub fn scalar(value: T) -> Self {
        Self {
            storage: Storage::from_vec(vec![value], Device::Cpu),
            shape: Shape::new(),
            strides: Strides::new(),
            offset: 0,
        }
    }

    /// Tensor of zeros with the given shape (delegates to `crate::creation`).
    #[must_use]
    pub fn zeros(shape: &[usize]) -> Self {
        crate::creation::zeros(shape)
    }

    /// Tensor of ones with the given shape.
    #[must_use]
    pub fn ones(shape: &[usize]) -> Self
    where
        T: Numeric,
    {
        crate::creation::ones(shape)
    }

    /// Tensor where every element equals `value`.
    #[must_use]
    pub fn full(shape: &[usize], value: T) -> Self {
        crate::creation::full(shape, value)
    }

    /// Tensor sampled from the standard normal distribution.
    #[must_use]
    pub fn randn(shape: &[usize]) -> Self
    where
        T: Float,
        rand_distr::StandardNormal: rand::distributions::Distribution<T>,
    {
        crate::creation::randn(shape)
    }

    /// Tensor sampled uniformly from the `Standard` distribution.
    #[must_use]
    pub fn rand(shape: &[usize]) -> Self
    where
        T: Float,
        rand::distributions::Standard: rand::distributions::Distribution<T>,
    {
        crate::creation::rand(shape)
    }

    /// The size of each dimension.
    #[must_use]
    pub fn shape(&self) -> &[usize] {
        &self.shape
    }

    /// The per-dimension element strides.
    #[must_use]
    pub fn strides(&self) -> &[isize] {
        &self.strides
    }

    /// Number of dimensions (0 for a scalar tensor).
    #[must_use]
    pub fn ndim(&self) -> usize {
        self.shape.len()
    }

    /// Total number of logical elements.
    #[must_use]
    pub fn numel(&self) -> usize {
        numel(&self.shape)
    }

    /// True when the tensor holds no elements (some dimension is 0).
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.numel() == 0
    }

    /// Size of dimension `dim`; negative values index from the end.
    ///
    /// # Errors
    /// Fails when `dim` is out of range.
    pub fn size(&self, dim: i64) -> Result<usize> {
        let idx = normalize_dim(dim, self.ndim())?;
        Ok(self.shape[idx])
    }

    /// Device the underlying storage lives on.
    #[must_use]
    pub fn device(&self) -> Device {
        self.storage.device()
    }

    /// True when the strides describe a dense row-major layout.
    #[must_use]
    pub fn is_contiguous(&self) -> bool {
        is_contiguous(&self.shape, &self.strides)
    }

    /// True for 0-dimensional tensors.
    #[must_use]
    pub fn is_scalar(&self) -> bool {
        self.shape.is_empty()
    }

    /// Reads one element at the given multi-dimensional `indices`.
    ///
    /// # Errors
    /// Fails on a wrong number of indices or an out-of-bounds index.
    pub fn get(&self, indices: &[usize]) -> Result<T> {
        if indices.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} indices, got {}",
                self.ndim(),
                indices.len()
            )));
        }
        for (&idx, &dim) in indices.iter().zip(self.shape.iter()) {
            if idx >= dim {
                return Err(Error::IndexOutOfBounds {
                    index: idx,
                    size: dim,
                });
            }
        }
        let offset = self.offset + linear_index(indices, &self.strides);
        Ok(self.storage.as_slice()[offset])
    }

    /// Writes one element at the given multi-dimensional `indices`.
    ///
    /// Takes `&self` — mutation goes through `Storage`'s interior mutability,
    /// so the write is visible to every view sharing this storage.
    /// NOTE(review): assumes CPU-resident storage (`as_slice_mut`); behavior on
    /// GPU storage is not handled here — confirm against `Storage`.
    ///
    /// # Errors
    /// Fails on a wrong number of indices or an out-of-bounds index.
    pub fn set(&self, indices: &[usize], value: T) -> Result<()> {
        if indices.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} indices, got {}",
                self.ndim(),
                indices.len()
            )));
        }
        for (&idx, &dim) in indices.iter().zip(self.shape.iter()) {
            if idx >= dim {
                return Err(Error::IndexOutOfBounds {
                    index: idx,
                    size: dim,
                });
            }
        }
        let offset = self.offset + linear_index(indices, &self.strides);
        self.storage.as_slice_mut()[offset] = value;
        Ok(())
    }

    /// Extracts the single value of a one-element tensor.
    ///
    /// # Errors
    /// Fails when the tensor does not contain exactly one element.
    pub fn item(&self) -> Result<T> {
        if self.numel() != 1 {
            return Err(Error::invalid_operation(
                "item() only works on single-element tensors",
            ));
        }
        let data = self.to_vec();
        // Defensive re-check; to_vec of a 1-element tensor should never be empty.
        if data.is_empty() {
            Err(Error::invalid_operation("item() on empty tensor"))
        } else {
            Ok(data[0])
        }
    }

    /// Copies the logical contents into a row-major `Vec`, resolving strides,
    /// offset, and (with the "cuda" feature) device placement.
    #[must_use]
    pub fn to_vec(&self) -> Vec<T> {
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let f32_vec = self_f32.to_vec_gpu();
            unsafe {
                // SAFETY: the assert guarantees T == f32, so the Vec<f32>
                // buffer can be reinterpreted as Vec<T> with identical layout;
                // ManuallyDrop hands ownership to the rebuilt Vec exactly once.
                let mut v = std::mem::ManuallyDrop::new(f32_vec);
                return Vec::from_raw_parts(v.as_mut_ptr() as *mut T, v.len(), v.capacity());
            }
        }
        if self.is_contiguous() {
            // Contiguous: a single slice copy starting at `offset`.
            let storage = self.storage.as_slice();
            storage[self.offset..self.offset + self.numel()].to_vec()
        } else {
            // Strided view: gather element by element in logical order.
            let mut result = Vec::with_capacity(self.numel());
            self.copy_data_to(&mut result);
            result
        }
    }

    /// Gathers the elements of a (possibly strided) tensor into `dst` in
    /// row-major logical order. O(numel) with an unravel per element.
    fn copy_data_to(&self, dst: &mut Vec<T>) {
        dst.clear();
        let storage = self.storage.as_slice();
        let total = self.numel();
        for i in 0..total {
            let indices = crate::shape::unravel_index(i, &self.shape);
            let offset = self.offset + linear_index(&indices, &self.strides);
            dst.push(storage[offset]);
        }
    }

    /// Returns a tensor with `new_shape` (one dim may be -1 and is inferred).
    ///
    /// Zero-copy when `self` is contiguous; otherwise the data is materialized
    /// first via `contiguous()`.
    ///
    /// # Errors
    /// Fails when the shapes are incompatible.
    pub fn reshape(&self, new_shape: &[isize]) -> Result<Self> {
        let shape = reshape(&self.shape, new_shape)?;
        if self.is_contiguous() {
            Ok(Self {
                storage: self.storage.clone(),
                strides: contiguous_strides(&shape),
                shape,
                offset: self.offset,
            })
        } else {
            let contig = self.contiguous();
            Ok(Self {
                storage: contig.storage,
                strides: contiguous_strides(&shape),
                shape,
                offset: 0,
            })
        }
    }

    /// Flattens to one dimension.
    #[must_use]
    pub fn flatten(&self) -> Self {
        self.reshape(&[-1]).expect("Flatten should never fail")
    }

    /// Removes size-1 dimensions: the given `dim` only (if it is size 1), or
    /// all of them when `dim` is `None`. Returns a view sharing storage.
    ///
    /// # Errors
    /// Fails when `dim` is out of range.
    pub fn squeeze(&self, dim: Option<i64>) -> Result<Self> {
        let dim = match dim {
            Some(d) => Some(normalize_dim(d, self.ndim())?),
            None => None,
        };
        let new_shape = squeeze(&self.shape, dim);
        let new_strides: Strides = match dim {
            Some(d) => {
                let mut s = self.strides.clone();
                // Only drop the stride when the dim really was size 1, mirroring
                // how `squeeze` leaves the shape unchanged otherwise.
                if d < self.shape.len() && self.shape[d] == 1 {
                    s.remove(d);
                }
                s
            }
            None => self
                .shape
                .iter()
                .zip(self.strides.iter())
                .filter(|(dim, _)| **dim != 1)
                .map(|(_, stride)| *stride)
                .collect(),
        };
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Inserts a size-1 dimension at `dim` (negative counts from the end, so
    /// -1 appends). Returns a view sharing storage.
    ///
    /// # Errors
    /// Fails when the insertion position is out of range.
    pub fn unsqueeze(&self, dim: i64) -> Result<Self> {
        let normalized = if dim < 0 {
            // +1 because the new axis may be placed *after* the last one.
            (dim + self.ndim() as i64 + 1) as usize
        } else {
            dim as usize
        };
        let new_shape = unsqueeze(&self.shape, normalized)?;
        let mut new_strides = Strides::with_capacity(new_shape.len());
        for (i, _) in new_shape.iter().enumerate() {
            if i < normalized {
                new_strides.push(self.strides.get(i).copied().unwrap_or(1));
            } else if i == normalized {
                // Any stride works for a size-1 axis: its index is always 0,
                // so it never contributes to the linear offset.
                new_strides.push(1);
            } else {
                new_strides.push(self.strides[i - 1]);
            }
        }
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Swaps two dimensions; a zero-copy view (strides swapped, data shared).
    ///
    /// # Errors
    /// Fails when either dimension is out of range.
    pub fn transpose(&self, dim0: i64, dim1: i64) -> Result<Self> {
        let d0 = normalize_dim(dim0, self.ndim())?;
        let d1 = normalize_dim(dim1, self.ndim())?;
        let new_shape = transpose_shape(&self.shape, d0, d1)?;
        let new_strides = transpose_strides(&self.strides, d0, d1);
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Matrix transpose; only valid for 2-D tensors.
    ///
    /// # Errors
    /// Fails when the tensor is not 2-D.
    pub fn t(&self) -> Result<Self> {
        if self.ndim() != 2 {
            return Err(Error::invalid_operation("t() only works on 2D tensors"));
        }
        self.transpose(0, 1)
    }

    /// Reorders all dimensions by `dims`; a zero-copy view.
    ///
    /// # Errors
    /// Fails when `dims` is not a permutation of `0..ndim`.
    pub fn permute(&self, dims: &[usize]) -> Result<Self> {
        if dims.len() != self.ndim() {
            return Err(Error::invalid_operation(format!(
                "Expected {} dimensions, got {}",
                self.ndim(),
                dims.len()
            )));
        }
        // Validate that `dims` is a permutation (in range, no duplicates).
        let mut seen = vec![false; self.ndim()];
        for &d in dims {
            if d >= self.ndim() {
                return Err(Error::InvalidDimension {
                    index: d as i64,
                    ndim: self.ndim(),
                });
            }
            if seen[d] {
                return Err(Error::invalid_operation("Duplicate dimension in permute"));
            }
            seen[d] = true;
        }
        let new_shape: Shape = dims.iter().map(|&d| self.shape[d]).collect();
        let new_strides: Strides = dims.iter().map(|&d| self.strides[d]).collect();
        Ok(Self {
            storage: self.storage.clone(),
            shape: new_shape,
            strides: new_strides,
            offset: self.offset,
        })
    }

    /// Returns a tensor with dense row-major layout and zero offset; cheap
    /// clone when the tensor is already in that form.
    #[must_use]
    pub fn contiguous(&self) -> Self {
        if self.is_contiguous() && self.offset == 0 {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = self_f32.contiguous_gpu();
            return unsafe { gpu_into(result) };
        }
        // CPU fallback: gather into logical order, then rebuild densely.
        let data = self.to_vec();
        Self::from_vec(data, &self.shape).expect("Contiguous should never fail")
    }

    /// Element-wise map producing a new CPU tensor with the same shape.
    #[must_use]
    pub fn map<F: Fn(T) -> T>(&self, f: F) -> Self {
        let data = self.to_vec();
        let result: Vec<T> = data.into_iter().map(f).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise binary map over `self` and `other` (no broadcasting; the
    /// result takes `self`'s shape). Length equality is only checked in debug
    /// builds.
    #[must_use]
    pub fn zip_map<F: Fn(T, T) -> T>(&self, other: &Self, f: F) -> Self {
        let a = self.to_vec();
        let b = other.to_vec();
        debug_assert_eq!(
            a.len(),
            b.len(),
            "zip_map requires same number of elements: {} vs {}",
            a.len(),
            b.len()
        );
        let result: Vec<T> = a.into_iter().zip(b).map(|(x, y)| f(x, y)).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise ternary map; same conventions as `zip_map`.
    #[must_use]
    pub fn zip_map3<F: Fn(T, T, T) -> T>(&self, b: &Self, c: &Self, f: F) -> Self {
        let a_data = self.to_vec();
        let b_data = b.to_vec();
        let c_data = c.to_vec();
        debug_assert_eq!(a_data.len(), b_data.len());
        debug_assert_eq!(a_data.len(), c_data.len());
        let result: Vec<T> = a_data
            .into_iter()
            .zip(b_data)
            .zip(c_data)
            .map(|((a, b), c)| f(a, b, c))
            .collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Moves the tensor to `device`; cheap clone when already there. The data
    /// is made contiguous before a CPU-side transfer.
    ///
    /// # Errors
    /// Fails when the transfer itself fails.
    pub fn to_device(&self, device: Device) -> Result<Self> {
        if self.device() == device {
            return Ok(self.clone());
        }
        #[cfg(feature = "cuda")]
        if self.storage.is_gpu() || device.is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = self_f32.to_device_f32(device)?;
            return Ok(unsafe { gpu_into(result) });
        }
        let contig = self.contiguous();
        let new_storage = contig.storage.to_device(device)?;
        Ok(Self {
            storage: new_storage,
            shape: self.shape.clone(),
            strides: self.strides.clone(),
            offset: 0,
        })
    }

    /// Convenience wrapper for `to_device(Device::Cpu)`.
    pub fn cpu(&self) -> Result<Self> {
        self.to_device(Device::Cpu)
    }

    /// Deep copy with freshly owned storage, preserving the original device
    /// (GPU tensors round-trip through the CPU).
    #[must_use]
    pub fn clone_deep(&self) -> Self {
        let data = self.to_vec();
        let cpu = Self::from_vec(data, &self.shape).expect("Deep clone should never fail");
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return cpu.to_device(self.device()).unwrap();
        }
        cpu
    }
}
impl<T: Numeric> Tensor<T> {
    /// In-place fill through `Storage`'s interior mutability; every view
    /// sharing this storage sees the change.
    ///
    /// # Panics
    /// Panics on GPU-resident tensors.
    pub fn fill_(&self, value: T) {
        assert!(
            self.storage.is_cpu(),
            "fill_() not supported on GPU tensors — create a new tensor and transfer instead"
        );
        let mut data = self.storage.as_slice_mut();
        CpuBackend::fill(&mut data, value);
    }

    /// In-place zeroing (see `fill_` for the GPU restriction).
    pub fn zero_(&self) {
        self.fill_(T::zero());
    }

    /// Sum of all elements, returned as a single-element tensor on the same
    /// device.
    #[must_use]
    pub fn sum(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let mut t = self_f32.clone();
            // Collapse one axis at a time until a single value remains.
            while t.ndim() > 1 {
                t = t.sum_dim_cuda(0);
            }
            if t.numel() > 1 {
                t = t.sum_dim_cuda(0);
            }
            return unsafe { gpu_into(t) };
        }
        let data = self.to_vec();
        let result = CpuBackend::sum(&data);
        Self::scalar(result)
    }

    /// Product of all elements. Computed on the CPU (GPU tensors round-trip
    /// through `to_vec`), then transferred back to the source device.
    #[must_use]
    pub fn prod(&self) -> Self {
        let data = self.to_vec();
        let result = CpuBackend::prod(&data);
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return s
                .to_device(self.device())
                .expect("prod: device transfer failed");
        }
        s
    }

    /// Maximum element; computed on the CPU and moved back to the source
    /// device.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn max(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        let result = CpuBackend::max(&data).expect("max on non-empty tensor");
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return Ok(s
                .to_device(self.device())
                .expect("max: device transfer failed"));
        }
        Ok(s)
    }

    /// Minimum element; computed on the CPU and moved back to the source
    /// device.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn min(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        let result = CpuBackend::min(&data).expect("min on non-empty tensor");
        let s = Self::scalar(result);
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return Ok(s
                .to_device(self.device())
                .expect("min: device transfer failed"));
        }
        Ok(s)
    }

    /// Flat (row-major) index of the maximum element.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn argmax(&self) -> Result<usize> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        Ok(CpuBackend::argmax(&data).unwrap())
    }

    /// Flat (row-major) index of the minimum element.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn argmin(&self) -> Result<usize> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        let data = self.to_vec();
        Ok(CpuBackend::argmin(&data).unwrap())
    }

    /// Concatenates tensors along `dim`. All inputs must share ndim and match
    /// on every non-`dim` dimension. The copy happens on the CPU; the result
    /// is moved to the first tensor's device.
    ///
    /// # Errors
    /// Fails on an empty input list, an out-of-range `dim`, or mismatched
    /// shapes.
    pub fn cat(tensors: &[&Self], dim: usize) -> Result<Self> {
        if tensors.is_empty() {
            return Err(Error::invalid_operation("cat requires at least one tensor"));
        }
        let ndim = tensors[0].ndim();
        if dim >= ndim {
            return Err(Error::invalid_operation("cat dimension out of range"));
        }
        for t in &tensors[1..] {
            if t.ndim() != ndim {
                return Err(Error::invalid_operation(
                    "cat: all tensors must have same ndim",
                ));
            }
            for d in 0..ndim {
                if d != dim && t.shape[d] != tensors[0].shape[d] {
                    return Err(Error::invalid_operation(
                        "cat: shapes must match on non-cat dims",
                    ));
                }
            }
        }
        let total_dim_size: usize = tensors.iter().map(|t| t.shape[dim]).sum();
        let mut out_shape: Vec<usize> = tensors[0].shape.to_vec();
        out_shape[dim] = total_dim_size;
        // View the output as (outer, cat-dim, inner) and copy one inner run
        // (a contiguous slab) at a time.
        let outer_size: usize = out_shape[..dim].iter().product();
        let inner_size: usize = out_shape[dim + 1..].iter().product();
        let total_numel: usize = out_shape.iter().product();
        let mut result = vec![T::zero(); total_numel];
        let mut dim_offset = 0;
        for t in tensors {
            let t_data = t.contiguous().to_vec();
            let t_dim_size = t.shape[dim];
            for outer in 0..outer_size {
                for d in 0..t_dim_size {
                    let src_base = outer * t_dim_size * inner_size + d * inner_size;
                    let dst_base =
                        outer * total_dim_size * inner_size + (dim_offset + d) * inner_size;
                    result[dst_base..dst_base + inner_size]
                        .copy_from_slice(&t_data[src_base..src_base + inner_size]);
                }
            }
            dim_offset += t_dim_size;
        }
        let out = Self::from_vec(result, &out_shape)?;
        #[cfg(feature = "cuda")]
        if tensors[0].device().is_gpu() {
            return Ok(out.to_device(tensors[0].device()).unwrap());
        }
        Ok(out)
    }
}
impl<T: Float> Tensor<T> {
    /// Mean of all elements as a single-element tensor.
    ///
    /// NOTE(review): on the GPU path `numel` is cast to f32 before the f64
    /// divide — this loses precision once numel exceeds 2^24; casting the
    /// count to f64 directly would be exact. Confirm intent.
    ///
    /// # Errors
    /// Fails on an empty tensor.
    pub fn mean(&self) -> Result<Self> {
        if self.is_empty() {
            return Err(Error::EmptyTensor);
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            let s = self.sum();
            let n = self.numel() as f32;
            return Ok(s.mul_scalar(T::from(1.0 / n as f64).unwrap_or(T::zero())));
        }
        let data = self.to_vec();
        let result = CpuBackend::mean(&data).expect("mean on non-empty tensor");
        Ok(Self::scalar(result))
    }

    /// Element-wise `max(x, 0)`.
    #[must_use]
    pub fn relu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).relu_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::relu(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise logistic sigmoid.
    #[must_use]
    pub fn sigmoid(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).sigmoid_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::sigmoid(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise hyperbolic tangent.
    #[must_use]
    pub fn tanh(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).tanh_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::tanh(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise exponential.
    #[must_use]
    pub fn exp(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).exp_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::exp(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise natural logarithm.
    #[must_use]
    pub fn ln(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).ln_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::ln(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise square root.
    #[must_use]
    pub fn sqrt(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).sqrt_cuda()) };
        }
        let data = self.to_vec();
        let mut result = vec![T::zero(); data.len()];
        CpuBackend::sqrt(&mut result, &data);
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise power with a scalar exponent.
    #[must_use]
    pub fn pow(&self, exp: T) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            // SAFETY: the assert guarantees T == f32, so reading the scalar
            // through an f32 pointer is sound.
            let exp_f32: f32 = unsafe { *(&exp as *const T as *const f32) };
            return unsafe { gpu_into(gpu_ref(self).pow_cuda(exp_f32)) };
        }
        let data = self.to_vec();
        let result: Vec<T> = data.iter().map(|&x| x.pow_value(exp)).collect();
        Self::from_vec(result, &self.shape).unwrap()
    }

    /// Element-wise GELU activation (CPU path delegates to `crate::ops`).
    #[must_use]
    pub fn gelu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).gelu_cuda()) };
        }
        crate::ops::gelu(self)
    }

    /// Element-wise SiLU activation (CPU path delegates to `crate::ops`).
    #[must_use]
    pub fn silu(&self) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            return unsafe { gpu_into(gpu_ref(self).silu_cuda()) };
        }
        crate::ops::silu(self)
    }

    /// Softmax along `dim`.
    ///
    /// NOTE(review): the CPU fallback swallows any error from
    /// `crate::ops::softmax` and returns `self` unchanged — callers get no
    /// signal that the dim was invalid. The GPU path panics instead.
    #[must_use]
    pub fn softmax(&self, dim: i32) -> Self {
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            return unsafe { gpu_into(self_f32.softmax_cuda(dim).expect("CUDA softmax failed")) };
        }
        crate::ops::softmax(self, dim as i64).unwrap_or_else(|_| self.clone())
    }

    /// `ln(softmax(x))` — computed literally, not via the numerically stabler
    /// fused log-softmax form.
    #[must_use]
    pub fn log_softmax(&self, dim: i32) -> Self {
        let softmax_result = self.softmax(dim);
        softmax_result.ln()
    }

    /// Mean along `dim` (negative counts from the end). An out-of-range `dim`
    /// silently returns a clone of `self`.
    #[must_use]
    pub fn mean_dim(&self, dim: i32, keepdim: bool) -> Self {
        let ndim = self.ndim();
        let dim = if dim < 0 {
            (ndim as i32 + dim) as usize
        } else {
            dim as usize
        };
        if dim >= ndim {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let summed = if keepdim {
                self_f32.sum_dim_keepdim_cuda(dim)
            } else {
                self_f32.sum_dim_cuda(dim)
            };
            let dim_size = self.shape[dim];
            // mean = sum / dim_size, applied as a scalar multiply.
            let result = summed.mul_scalar_cuda(1.0 / dim_size as f32);
            return unsafe { gpu_into(result) };
        }
        let dim_size = self.shape[dim];
        let data = self.to_vec();
        let mut new_shape = self.shape.clone();
        if keepdim {
            new_shape[dim] = 1;
        } else {
            new_shape.remove(dim);
        }
        // Reducing the last remaining dim yields a 1-element 1-D tensor.
        if new_shape.is_empty() {
            new_shape = smallvec::smallvec![1];
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result = vec![T::zero(); new_numel];
        // View the input as (outer, dim, inner) and reduce the middle axis.
        let outer_size: usize = self.shape[..dim].iter().product();
        let inner_size: usize = self.shape[dim + 1..].iter().product();
        for outer in 0..outer_size {
            for inner in 0..inner_size {
                let mut sum = T::zero();
                for d in 0..dim_size {
                    let idx = outer * dim_size * inner_size + d * inner_size + inner;
                    sum = sum + data[idx];
                }
                let mean = sum / NumCast::from(dim_size).unwrap();
                let result_idx = outer * inner_size + inner;
                result[result_idx] = mean;
            }
        }
        Self::from_vec(result, &new_shape).unwrap()
    }

    /// Sum along `dim` (negative counts from the end). An out-of-range `dim`
    /// silently returns a clone of `self`.
    #[must_use]
    pub fn sum_dim(&self, dim: i32, keepdim: bool) -> Self {
        let ndim = self.ndim();
        let dim = if dim < 0 {
            (ndim as i32 + dim) as usize
        } else {
            dim as usize
        };
        if dim >= ndim {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            let result = if keepdim {
                self_f32.sum_dim_keepdim_cuda(dim)
            } else {
                self_f32.sum_dim_cuda(dim)
            };
            return unsafe { gpu_into(result) };
        }
        let dim_size = self.shape[dim];
        let data = self.to_vec();
        let mut new_shape = self.shape.clone();
        if keepdim {
            new_shape[dim] = 1;
        } else {
            new_shape.remove(dim);
        }
        if new_shape.is_empty() {
            new_shape = smallvec::smallvec![1];
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result = vec![T::zero(); new_numel];
        // Same (outer, dim, inner) decomposition as mean_dim.
        let outer_size: usize = self.shape[..dim].iter().product();
        let inner_size: usize = self.shape[dim + 1..].iter().product();
        for outer in 0..outer_size {
            for inner in 0..inner_size {
                let mut sum = T::zero();
                for d in 0..dim_size {
                    let idx = outer * dim_size * inner_size + d * inner_size + inner;
                    sum = sum + data[idx];
                }
                let result_idx = outer * inner_size + inner;
                result[result_idx] = sum;
            }
        }
        Self::from_vec(result, &new_shape).unwrap()
    }

    /// Variance along `dim` via `E[x²] − E[x]²`.
    ///
    /// NOTE(review): this formula suffers catastrophic cancellation when the
    /// mean is large relative to the spread; Welford/two-pass would be
    /// stabler. Also, the keepdim=true mean computed first is discarded when
    /// `keepdim` is false and recomputed — redundant work. Errors from the
    /// intermediate `mul`/`sub` are swallowed by falling back to clones.
    #[must_use]
    pub fn var_dim(&self, dim: i32, keepdim: bool) -> Self {
        let mean = self.mean_dim(dim, true);
        let sq = self.mul(self).unwrap_or_else(|_| self.clone());
        let mean_sq = sq.mean_dim(dim, keepdim);
        let mean_keepdim = if keepdim {
            mean.clone()
        } else {
            self.mean_dim(dim, keepdim)
        };
        let mean_squared = mean_keepdim
            .mul(&mean_keepdim)
            .unwrap_or_else(|_| mean_keepdim.clone());
        mean_sq
            .sub(&mean_squared)
            .unwrap_or_else(|_| mean_sq.clone())
    }

    /// Materializes a broadcast of `self` to `shape` (data is copied, not a
    /// stride-0 view).
    ///
    /// NOTE(review): an incompatible shape is silently ignored on the CPU
    /// path (`unwrap_or_else(|_| shape.into())`) and can then panic on an
    /// out-of-range index; the GPU path panics explicitly.
    #[must_use]
    pub fn broadcast_to(&self, shape: &[usize]) -> Self {
        if self.shape.as_slice() == shape {
            return self.clone();
        }
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
            let self_f32 = unsafe { gpu_ref(self) };
            return unsafe {
                gpu_into(
                    self_f32
                        .broadcast_to_cuda(shape)
                        .expect("CUDA broadcast_to failed"),
                )
            };
        }
        let result_shape = broadcast_shape(&self.shape, shape).unwrap_or_else(|_| shape.into());
        let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
        let total = numel(&result_shape);
        let mut result_data = vec![T::zero(); total];
        let self_data = self.storage.as_slice();
        // Gather: broadcast dims get stride 0, so repeated elements re-read
        // the same source location.
        for i in 0..total {
            let indices = crate::shape::unravel_index(i, &result_shape);
            let self_idx = self.offset + linear_index(&indices, &self_strides);
            result_data[i] = self_data[self_idx];
        }
        Self::from_vec(result_data, &result_shape).unwrap()
    }

    /// Copies out the sub-tensor selected by `ranges` (one half-open range per
    /// leading dimension; omitted trailing dims are taken in full, extra
    /// ranges beyond ndim are ignored).
    ///
    /// NOTE(review): ranges are not validated — `end < start` or
    /// `end > shape[i]` underflows/panics rather than returning an error.
    #[must_use]
    pub fn slice(&self, ranges: &[std::ops::Range<usize>]) -> Self {
        let mut new_shape = Vec::with_capacity(self.ndim());
        for (i, range) in ranges.iter().enumerate() {
            if i < self.ndim() {
                new_shape.push(range.end - range.start);
            }
        }
        for i in ranges.len()..self.ndim() {
            new_shape.push(self.shape[i]);
        }
        let new_numel: usize = new_shape.iter().product();
        let mut result_data = vec![T::zero(); new_numel];
        let self_data = self.to_vec();
        let mut result_idx = 0;
        Self::slice_recursive(
            &self_data,
            &self.shape,
            ranges,
            0,
            0,
            &mut result_data,
            &mut result_idx,
        );
        let out = Self::from_vec(result_data, &new_shape).unwrap();
        #[cfg(feature = "cuda")]
        if self.device().is_gpu() {
            return out.to_device(self.device()).unwrap();
        }
        out
    }

    /// Depth-first walk over `ranges`, appending selected elements of the
    /// row-major `data` to `result` in order. `offset` is the flat index of
    /// the current prefix; `result_idx` is the write cursor.
    fn slice_recursive(
        data: &[T],
        shape: &[usize],
        ranges: &[std::ops::Range<usize>],
        dim: usize,
        offset: usize,
        result: &mut [T],
        result_idx: &mut usize,
    ) {
        if dim == shape.len() {
            result[*result_idx] = data[offset];
            *result_idx += 1;
            return;
        }
        // Flat distance between consecutive indices along `dim`.
        let stride: usize = shape[dim + 1..].iter().product();
        let (start, end) = if dim < ranges.len() {
            (ranges[dim].start, ranges[dim].end)
        } else {
            (0, shape[dim])
        };
        for i in start..end {
            Self::slice_recursive(
                data,
                shape,
                ranges,
                dim + 1,
                offset + i * stride,
                result,
                result_idx,
            );
        }
    }
}
impl<T: Numeric> Tensor<T> {
pub fn add(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.add_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_add_cuda(o)?) });
}
}
let target_device = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target_device)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target_device)?
};
return a_gpu.add(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let ao = self.offset;
let bo = other.offset;
let n = numel(&self.shape);
let mut result_data = vec![T::zero(); n];
for i in 0..n {
result_data[i] = a[ao + i] + b[bo + i];
}
return Self::from_vec(result_data, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] + other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn sub(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.sub_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_sub_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.sub(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] - b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] - other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn mul(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.mul_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_mul_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.mul(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] * b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] * other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
pub fn div(&self, other: &Self) -> Result<Self> {
#[cfg(feature = "cuda")]
{
let self_gpu = self.device().is_gpu();
let other_gpu = other.device().is_gpu();
if self_gpu || other_gpu {
assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
if self_gpu && other_gpu {
let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
if self.shape == other.shape {
return Ok(unsafe { gpu_into(s.div_cuda(o)?) });
} else {
return Ok(unsafe { gpu_into(s.broadcast_div_cuda(o)?) });
}
}
let target = if self_gpu {
self.device()
} else {
other.device()
};
let a_gpu = if self_gpu {
self.clone()
} else {
self.to_device(target)?
};
let b_gpu = if other_gpu {
other.clone()
} else {
other.to_device(target)?
};
return a_gpu.div(&b_gpu);
}
}
if self.shape == other.shape && self.is_contiguous() && other.is_contiguous() {
let a = self.storage.as_slice();
let b = other.storage.as_slice();
let (ao, bo) = (self.offset, other.offset);
let n = numel(&self.shape);
let mut r = vec![T::zero(); n];
for i in 0..n {
r[i] = a[ao + i] / b[bo + i];
}
return Self::from_vec(r, &self.shape);
}
let result_shape = broadcast_shape(&self.shape, &other.shape)?;
let self_strides = broadcast_strides(&self.shape, &self.strides, &result_shape);
let other_strides = broadcast_strides(&other.shape, &other.strides, &result_shape);
let total = numel(&result_shape);
let mut result_data = vec![T::zero(); total];
let self_data = self.storage.as_slice();
let other_data = other.storage.as_slice();
for i in 0..total {
let indices = crate::shape::unravel_index(i, &result_shape);
let self_idx = self.offset + linear_index(&indices, &self_strides);
let other_idx = other.offset + linear_index(&indices, &other_strides);
result_data[i] = self_data[self_idx] / other_data[other_idx];
}
Self::from_vec(result_data, &result_shape)
}
/// Adds `scalar` to every element, returning a new tensor on the same device.
#[must_use]
pub fn add_scalar(&self, scalar: T) -> Self {
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        let as_f32 = unsafe { gpu_ref(self) };
        // SAFETY: the assert guarantees T == f32, so reading the scalar
        // through an f32 pointer is sound.
        let s: f32 = unsafe { *(&scalar as *const T as *const f32) };
        return unsafe { gpu_into(as_f32.add_scalar_cuda(s)) };
    }
    let input = self.to_vec();
    let mut out = vec![T::zero(); input.len()];
    CpuBackend::add_scalar(&mut out, &input, scalar);
    Self::from_vec(out, &self.shape).unwrap()
}
#[must_use]
pub fn mul_scalar(&self, scalar: T) -> Self {
    // GPU fast path: CUDA kernels are only implemented for f32 tensors.
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so both pointer
        // reinterpretations below are sound.
        let this = unsafe { gpu_ref(self) };
        let s: f32 = unsafe { *(&scalar as *const T as *const f32) };
        return unsafe { gpu_into(this.mul_scalar_cuda(s)) };
    }
    // CPU path: materialize the elements, then scale them via the backend.
    let src = self.to_vec();
    let mut out = vec![T::zero(); src.len()];
    CpuBackend::mul_scalar(&mut out, &src, scalar);
    Self::from_vec(out, &self.shape).unwrap()
}
#[must_use]
pub fn neg(&self) -> Self {
    // GPU fast path: CUDA kernels are only implemented for f32 tensors.
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so the
        // reinterpretations below are sound.
        let this = unsafe { gpu_ref(self) };
        return unsafe { gpu_into(this.neg_cuda()) };
    }
    // CPU path: negate every element via the backend.
    let src = self.to_vec();
    let mut out = vec![T::zero(); src.len()];
    CpuBackend::neg(&mut out, &src);
    Self::from_vec(out, &self.shape).unwrap()
}
/// Matrix multiplication with support for batched and broadcast inputs.
///
/// For 2-D inputs this is a plain `[m, k] x [k, n] -> [m, n]` product. For
/// higher-rank inputs the trailing two dimensions are multiplied and the
/// leading (batch) dimensions are broadcast against each other.
///
/// # Errors
/// - either input has fewer than 2 dimensions
/// - the inner (`k`) dimensions do not match
/// - the batch dimensions are not broadcastable
pub fn matmul(&self, other: &Self) -> Result<Self> {
    #[cfg(feature = "cuda")]
    if self.device().is_gpu() {
        // NOTE(review): only `self`'s device is checked here (unlike the
        // element-wise ops, which migrate mixed-device operands). Presumably
        // callers guarantee `other` lives on the same GPU — confirm.
        assert!(is_f32::<T>(), "GPU tensors are only supported for f32");
        // SAFETY: the assert above guarantees T == f32, so reinterpreting
        // both tensors as Tensor<f32> is sound.
        let (s, o) = unsafe { (gpu_ref(self), gpu_ref(other)) };
        return Ok(unsafe { gpu_into(s.matmul_cuda(o)?) });
    }
    if self.ndim() < 2 || other.ndim() < 2 {
        return Err(Error::invalid_operation(
            "matmul requires at least 2D tensors",
        ));
    }
    // Trailing two dims are the matrix dims: self is [.., m, k1], other is [.., k2, n].
    let m = self.shape[self.ndim() - 2];
    let k1 = self.shape[self.ndim() - 1];
    let k2 = other.shape[other.ndim() - 2];
    let n = other.shape[other.ndim() - 1];
    if k1 != k2 {
        return Err(Error::invalid_operation(format!(
            "matmul inner dimensions must match: {k1} vs {k2}"
        )));
    }
    // Fast path: plain 2-D product.
    if self.ndim() == 2 && other.ndim() == 2 {
        let a_data = self.contiguous().to_vec();
        let b_data = other.contiguous().to_vec();
        #[cfg(feature = "cuda")]
        {
            // Offload large f32 products to the CUDA helper when available;
            // any failure falls through to the CPU backend below.
            let flops = m * n * k1;
            if is_f32::<T>() && flops >= 4_000_000 {
                debug_assert!(std::mem::size_of::<T>() == std::mem::size_of::<f32>());
                // SAFETY: T == f32 (checked above), so &[T] and &[f32] have
                // identical layout.
                let a_f32: &[f32] = unsafe { std::mem::transmute(a_data.as_slice()) };
                let b_f32: &[f32] = unsafe { std::mem::transmute(b_data.as_slice()) };
                if let Some(c_f32) = cuda_accel::cuda_matmul(a_f32, b_f32, m, n, k1) {
                    // SAFETY: reinterpret Vec<f32> as Vec<T>; T == f32, same
                    // layout, and ManuallyDrop prevents a double free.
                    let c_t: Vec<T> = unsafe {
                        let mut v = std::mem::ManuallyDrop::new(c_f32);
                        Vec::from_raw_parts(v.as_mut_ptr() as *mut T, v.len(), v.capacity())
                    };
                    return Self::from_vec(c_t, &[m, n]);
                }
            }
        }
        let mut c_data = vec![T::zero(); m * n];
        CpuBackend::matmul(&mut c_data, &a_data, &b_data, m, n, k1);
        return Self::from_vec(c_data, &[m, n]);
    }
    // Batched case: broadcast the leading (batch) dims of both operands.
    let batch_dims_self: Vec<usize> = self.shape[..self.ndim() - 2].to_vec();
    let batch_dims_other: Vec<usize> = other.shape[..other.ndim() - 2].to_vec();
    // `None` means the batch dims already agree; `Some((a, b, out))` carries
    // the 1-padded per-operand dims plus the broadcast output dims.
    let broadcast_batch = if batch_dims_self == batch_dims_other {
        None
    } else {
        let max_len = batch_dims_self.len().max(batch_dims_other.len());
        let pad_a = vec![1usize; max_len - batch_dims_self.len()];
        let pad_b = vec![1usize; max_len - batch_dims_other.len()];
        let a_dims: Vec<usize> = pad_a
            .iter()
            .chain(batch_dims_self.iter())
            .copied()
            .collect();
        let b_dims: Vec<usize> = pad_b
            .iter()
            .chain(batch_dims_other.iter())
            .copied()
            .collect();
        let mut out_dims = Vec::with_capacity(max_len);
        for i in 0..max_len {
            if a_dims[i] == b_dims[i] {
                out_dims.push(a_dims[i]);
            } else if a_dims[i] == 1 {
                out_dims.push(b_dims[i]);
            } else if b_dims[i] == 1 {
                out_dims.push(a_dims[i]);
            } else {
                return Err(Error::invalid_operation(format!(
                    "matmul batch dimensions not broadcastable: {:?} vs {:?}",
                    batch_dims_self, batch_dims_other
                )));
            }
        }
        Some((a_dims, b_dims, out_dims))
    };
    // For each flat output-batch index, precompute the (possibly broadcast)
    // source batch index into each operand.
    let (batch_size, a_batch_idx, b_batch_idx) =
        if let Some((a_dims, b_dims, out_dims)) = &broadcast_batch {
            let bs: usize = out_dims.iter().product();
            let mut a_idx = Vec::with_capacity(bs);
            let mut b_idx = Vec::with_capacity(bs);
            for flat in 0..bs {
                let mut remaining = flat;
                let mut ai = 0usize;
                let mut bi = 0usize;
                let mut a_stride_acc = 1usize;
                let mut b_stride_acc = 1usize;
                // Walk dims last-to-first accumulating row-major offsets;
                // `idx % dim` maps a broadcast (size-1) dim back to index 0.
                for d in (0..out_dims.len()).rev() {
                    let out_d = out_dims[d];
                    let idx = remaining % out_d;
                    remaining /= out_d;
                    let a_d = a_dims[d];
                    let b_d = b_dims[d];
                    ai += (idx % a_d) * a_stride_acc;
                    bi += (idx % b_d) * b_stride_acc;
                    a_stride_acc *= a_d;
                    b_stride_acc *= b_d;
                }
                a_idx.push(ai);
                b_idx.push(bi);
            }
            (bs, a_idx, b_idx)
        } else {
            // Identical batch dims: identity mapping for both operands.
            let bs: usize = batch_dims_self.iter().product();
            let idx: Vec<usize> = (0..bs).collect();
            (bs, idx.clone(), idx)
        };
    let a_stride = m * k1;
    let b_stride = k1 * n;
    let c_stride = m * n;
    let a_data = self.contiguous().to_vec();
    let b_data = other.contiguous().to_vec();
    let mut c_data = vec![T::zero(); batch_size * m * n];
    #[cfg(feature = "cuda")]
    {
        let flops = m * n * k1;
        if is_f32::<T>() && flops >= 4_000_000 {
            // SAFETY: T == f32 (checked above); &[T] and &[f32] share layout.
            let a_f32: &[f32] = unsafe { std::mem::transmute(a_data.as_slice()) };
            let b_f32: &[f32] = unsafe { std::mem::transmute(b_data.as_slice()) };
            let mut gpu_ok = true;
            for batch in 0..batch_size {
                let ai = a_batch_idx[batch];
                let bi = b_batch_idx[batch];
                let a_slice = &a_f32[ai * a_stride..(ai + 1) * a_stride];
                let b_slice = &b_f32[bi * b_stride..(bi + 1) * b_stride];
                if let Some(c_batch) = cuda_accel::cuda_matmul(a_slice, b_slice, m, n, k1) {
                    // SAFETY: &[f32] -> &[T] with T == f32.
                    c_data[batch * c_stride..(batch + 1) * c_stride]
                        .copy_from_slice(unsafe { std::mem::transmute(c_batch.as_slice()) });
                } else {
                    gpu_ok = false;
                    break;
                }
            }
            if gpu_ok {
                // BUG FIX: this path previously built the output shape from
                // `batch_dims_self` even when the batch dims had been
                // broadcast, so `from_vec` received a shape whose numel did
                // not match `c_data` whenever the operands' batch dims
                // differed. Use the broadcast output dims, exactly like the
                // CPU path below.
                let mut output_shape = if let Some((_, _, ref out_dims)) = broadcast_batch {
                    out_dims.clone()
                } else {
                    batch_dims_self.clone()
                };
                output_shape.push(m);
                output_shape.push(n);
                return Self::from_vec(c_data, &output_shape);
            }
            // GPU failed part-way: discard partial results and redo on CPU.
            c_data = vec![T::zero(); batch_size * m * n];
        }
    }
    for batch in 0..batch_size {
        let ai = a_batch_idx[batch];
        let bi = b_batch_idx[batch];
        let a_slice = &a_data[ai * a_stride..(ai + 1) * a_stride];
        let b_slice = &b_data[bi * b_stride..(bi + 1) * b_stride];
        let c_slice = &mut c_data[batch * c_stride..(batch + 1) * c_stride];
        CpuBackend::matmul(c_slice, a_slice, b_slice, m, n, k1);
    }
    // Result shape is the broadcast batch dims followed by [m, n].
    let mut output_shape = if let Some((_, _, ref out_dims)) = broadcast_batch {
        out_dims.clone()
    } else {
        batch_dims_self
    };
    output_shape.push(m);
    output_shape.push(n);
    Self::from_vec(c_data, &output_shape)
}
/// Dot product of two 1-D tensors of equal length, returned as a scalar
/// tensor.
///
/// # Errors
/// Returns an error if either operand is not 1-D or the lengths differ.
pub fn dot(&self, other: &Self) -> Result<Self> {
    if self.ndim() != 1 || other.ndim() != 1 {
        return Err(Error::invalid_operation("dot requires 1D tensors"));
    }
    if self.shape[0] != other.shape[0] {
        return Err(Error::shape_mismatch(&self.shape, &other.shape));
    }
    let lhs = self.to_vec();
    let rhs = other.to_vec();
    Ok(Self::scalar(CpuBackend::dot(&lhs, &rhs)))
}
}
impl<T: Numeric> Add for &Tensor<T> {
type Output = Tensor<T>;
fn add(self, other: Self) -> Self::Output {
self.add(other).expect("Addition failed")
}
}
impl<T: Numeric> Sub for &Tensor<T> {
type Output = Tensor<T>;
fn sub(self, other: Self) -> Self::Output {
self.sub(other).expect("Subtraction failed")
}
}
impl<T: Numeric> Mul for &Tensor<T> {
type Output = Tensor<T>;
fn mul(self, other: Self) -> Self::Output {
self.mul(other).expect("Multiplication failed")
}
}
impl<T: Numeric> Div for &Tensor<T> {
type Output = Tensor<T>;
fn div(self, other: Self) -> Self::Output {
self.div(other).expect("Division failed")
}
}
impl<T: Numeric> Neg for &Tensor<T> {
type Output = Tensor<T>;
fn neg(self) -> Self::Output {
self.neg()
}
}
impl<T: Numeric> Add<T> for &Tensor<T> {
type Output = Tensor<T>;
fn add(self, scalar: T) -> Self::Output {
self.add_scalar(scalar)
}
}
impl<T: Numeric> Mul<T> for &Tensor<T> {
type Output = Tensor<T>;
fn mul(self, scalar: T) -> Self::Output {
self.mul_scalar(scalar)
}
}
impl<T: Scalar + fmt::Display> fmt::Debug for Tensor<T> {
    /// Compact debug form: shape and device always; the data is included
    /// only for small tensors (at most 10 elements).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Tensor(shape={:?}, device={}", self.shape(), self.device())?;
        let small = self.numel() <= 10;
        if small {
            write!(f, ", data={:?}", self.to_vec())?;
        }
        f.write_str(")")
    }
}
impl<T: Scalar + fmt::Display> fmt::Display for Tensor<T> {
    /// Human-readable form: bare value for scalars, `[a, b, c]` for 1-D
    /// tensors, and a shape summary for anything higher-dimensional.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.is_scalar() {
            return write!(f, "{}", self.item().unwrap());
        }
        if self.ndim() != 1 {
            return write!(f, "Tensor(shape={:?})", self.shape());
        }
        // 1-D: render each element and join with ", " — same bytes as
        // writing the elements one at a time with separators.
        let rendered: Vec<String> = self.to_vec().iter().map(ToString::to_string).collect();
        write!(f, "[{}]", rendered.join(", "))
    }
}
impl Tensor<f32> {
    /// Simulates f16 storage: round-trips every element through
    /// `half::f16`, returning an f32 tensor carrying f16 precision.
    #[must_use]
    pub fn to_f16_precision(&self) -> Self {
        let rounded: Vec<f32> = self
            .to_vec()
            .into_iter()
            .map(|v| half::f16::from_f32(v).to_f32())
            .collect();
        Self::from_vec(rounded, self.shape()).unwrap()
    }

    /// Identity at f32 precision (returns a clone of `self`).
    #[must_use]
    pub fn to_f32_precision(&self) -> Self {
        self.clone()
    }

    /// True when any element changes by more than `f32::EPSILON` after an
    /// f16 round-trip.
    #[must_use]
    pub fn has_f16_rounding_error(&self) -> bool {
        self.to_vec()
            .iter()
            .any(|&v| (half::f16::from_f32(v).to_f32() - v).abs() > f32::EPSILON)
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Construction from a flat vec: shape and element count are preserved.
#[test]
fn test_from_vec() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
assert_eq!(t.shape(), &[2, 3]);
assert_eq!(t.numel(), 6);
}
// Indexed get/set round-trips in row-major order.
// NOTE(review): `set` succeeds through a non-mut binding, so it presumably
// mutates via interior mutability in Storage — confirm intended semantics.
#[test]
fn test_get_set() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[2, 2]).unwrap();
assert_eq!(t.get(&[0, 0]).unwrap(), 1.0);
assert_eq!(t.get(&[0, 1]).unwrap(), 2.0);
assert_eq!(t.get(&[1, 0]).unwrap(), 3.0);
assert_eq!(t.get(&[1, 1]).unwrap(), 4.0);
t.set(&[0, 0], 99.0).unwrap();
assert_eq!(t.get(&[0, 0]).unwrap(), 99.0);
}
// Reshape to an explicit shape and via a -1 (inferred) dimension.
#[test]
fn test_reshape() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
let r = t.reshape(&[3, 2]).expect("reshape failed");
assert_eq!(r.shape(), &[3, 2]);
let r = t.reshape(&[-1]).expect("reshape failed");
assert_eq!(r.shape(), &[6]);
}
// 2-D transpose swaps the axes and the element access pattern.
#[test]
fn test_transpose() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
let r = t.t().unwrap();
assert_eq!(r.shape(), &[3, 2]);
assert_eq!(r.get(&[0, 0]).unwrap(), 1.0);
assert_eq!(r.get(&[0, 1]).unwrap(), 4.0);
assert_eq!(r.get(&[1, 0]).unwrap(), 2.0);
}
// Operator overloads: element-wise + and * on same-shape tensors.
#[test]
fn test_arithmetic() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0], &[3]).unwrap();
let b = Tensor::<f32>::from_vec(vec![4.0, 5.0, 6.0], &[3]).unwrap();
let c = &a + &b;
assert_eq!(c.to_vec(), vec![5.0, 7.0, 9.0]);
let d = &a * &b;
assert_eq!(d.to_vec(), vec![4.0, 10.0, 18.0]);
}
// A [1]-shaped tensor broadcasts across a [3]-shaped one.
#[test]
fn test_broadcasting() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0], &[3]).unwrap();
let b = Tensor::<f32>::from_vec(vec![10.0], &[1]).unwrap();
let c = &a + &b;
assert_eq!(c.to_vec(), vec![11.0, 12.0, 13.0]);
}
// Full reduction: sum over all elements yields a single-value tensor.
#[test]
fn test_sum() {
let t = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[4]).unwrap();
let s = t.sum();
assert_eq!(s.item().unwrap(), 10.0);
}
// 2x2 matrix product against a hand-computed result.
#[test]
fn test_matmul() {
let a = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], &[2, 2]).unwrap();
let b = Tensor::<f32>::from_vec(vec![5.0, 6.0, 7.0, 8.0], &[2, 2]).unwrap();
let c = a.matmul(&b).unwrap();
assert_eq!(c.shape(), &[2, 2]);
assert_eq!(c.to_vec(), vec![19.0, 22.0, 43.0, 50.0]);
}
// ReLU clamps negatives to zero and passes non-negatives through.
#[test]
fn test_relu() {
let t = Tensor::<f32>::from_vec(vec![-1.0, 0.0, 1.0, 2.0], &[4]).unwrap();
let r = t.relu();
assert_eq!(r.to_vec(), vec![0.0, 0.0, 1.0, 2.0]);
}
// Scalar tensors: one element, and `item` returns that value.
#[test]
fn test_scalar() {
let s = Tensor::<f32>::scalar(42.0);
assert!(s.is_scalar());
assert_eq!(s.numel(), 1);
assert_eq!(s.item().unwrap(), 42.0);
}
}