use std::sync::Arc;
use torsh_core::{
dtype::TensorElement,
error::{Result, TorshError},
};
#[cfg(feature = "simd")]
mod simd_imports {
pub use scirs2_core::ndarray::Array1;
}
#[cfg(feature = "simd")]
use simd_imports::*;
#[cfg(feature = "parallel")]
#[cfg(feature = "parallel")]
use scirs2_core::chunking::{
CacheAwareness, ChunkConfig, ChunkStrategy, ComputeIntensity, GpuChunkSettings, MemoryPattern,
NumaStrategy,
};
use crate::core_ops::{Operation, Tensor};
#[cfg(feature = "simd")]
pub(crate) mod adaptive_simd {
use super::*;
use scirs2_core::ndarray::ArrayView1;
pub fn adaptive_simd_relu_f32(input: &ArrayView1<f32>) -> Array1<f32> {
scirs2_core::simd::activation::simd_relu_f32(input)
}
pub fn adaptive_simd_sigmoid_f32(input: &ArrayView1<f32>) -> Array1<f32> {
scirs2_core::simd::transcendental::simd_sigmoid_f32(input)
}
pub fn adaptive_simd_gelu_f32(input: &ArrayView1<f32>) -> Array1<f32> {
scirs2_core::simd::transcendental::simd_gelu_f32(input)
}
}
#[cfg(feature = "simd")]
#[cfg(feature = "parallel")]
mod intelligent_chunking {
use super::*;
#[derive(Debug, Clone, Copy)]
pub enum TensorOpType {
ElementWise,
Activation,
}
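/// Chooses a `ChunkConfig` from a small heuristic: element-wise ops over
/// 100_000 elements are treated as memory-bound and chunked for memory
/// throughput, smaller ones for cache locality; activations are treated as
/// compute-intensive with smaller, L1-friendly chunks. GPU settings are
/// attached only when a GPU-capable device is in use.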
pub fn create_optimal_chunk_config(
tensor_size: usize,
op_type: TensorOpType,
_device: torsh_core::device::DeviceType,
is_gpu_available: bool,
) -> ChunkConfig {
match op_type {
TensorOpType::ElementWise => ChunkConfig {
strategy: if tensor_size > 100_000 {
ChunkStrategy::MemoryOptimized
} else {
ChunkStrategy::CacheOptimized
},
min_chunk_size: 64,
max_chunk_size: 8192,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::MemoryBound,
enable_monitoring: false,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::L2,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: if is_gpu_available {
Some(GpuChunkSettings::default())
} else {
None
},
},
TensorOpType::Activation => ChunkConfig {
strategy: ChunkStrategy::CacheOptimized,
min_chunk_size: 64,
max_chunk_size: 4096,
prefer_work_stealing: true,
memory_pattern: MemoryPattern::Sequential,
compute_intensity: ComputeIntensity::ComputeIntensive,
enable_monitoring: false,
load_balance_factor: 0.1,
cache_awareness: CacheAwareness::L1,
numa_strategy: NumaStrategy::LocalPreferred,
gpu_settings: if is_gpu_available {
Some(GpuChunkSettings {
gpu_memory_ratio: 0.7,
gpu_min_chunk: 2048,
overlap_compute: true,
gpu_bandwidth: None,
transfer_bandwidth: None,
})
} else {
None
},
},
}
}
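/// Applies `operation` to every element of `data` on the work-stealing
/// thread pool. GPU availability is inferred from the device type (CUDA,
/// Metal, or WGPU) and currently only feeds the chunking heuristic.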
pub fn intelligent_parallel_process<T, F, R>(
data: Vec<T>,
op_type: TensorOpType,
device: torsh_core::device::DeviceType,
operation: F,
) -> Vec<R>
where
T: Send + Sync,
R: Send + Sync,
F: Fn(T) -> R + Send + Sync,
{
let is_gpu_available = matches!(
device,
torsh_core::device::DeviceType::Cuda(_)
| torsh_core::device::DeviceType::Metal(_)
| torsh_core::device::DeviceType::Wgpu(_)
);
// The chunk configuration is computed for future integration with a
// chunked executor; for now the whole buffer goes straight to the
// work-stealing parallel iterator.
let _chunk_config =
create_optimal_chunk_config(data.len(), op_type, device, is_gpu_available);
use scirs2_core::parallel_ops::*;
data.into_par_iter().map(operation).collect()
}
}
#[cfg(feature = "parallel")]
use intelligent_chunking::*;
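/// Checks NumPy-style broadcast compatibility: shapes are aligned from the
/// trailing dimension, and each dimension pair must be equal or contain a 1.
/// For example, `[8, 1, 3]` and `[4, 3]` are compatible (result `[8, 4, 3]`),
/// while `[2, 3]` and `[4]` are not.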
fn can_broadcast(shape1: &[usize], shape2: &[usize]) -> bool {
let max_dims = shape1.len().max(shape2.len());
for i in 0..max_dims {
let dim1 = if i < shape1.len() {
shape1[shape1.len() - 1 - i]
} else {
1
};
let dim2 = if i < shape2.len() {
shape2[shape2.len() - 1 - i]
} else {
1
};
if dim1 != dim2 && dim1 != 1 && dim2 != 1 {
return false;
}
}
true
}
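/// Computes the broadcast result shape, or `ShapeMismatch` if the shapes are
/// incompatible. Dimensions are compared back-to-front and the result is
/// reversed at the end, mirroring `can_broadcast`.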
fn compute_broadcast_shape(shape1: &[usize], shape2: &[usize]) -> Result<Vec<usize>> {
let max_dims = shape1.len().max(shape2.len());
let mut result = Vec::with_capacity(max_dims);
for i in 0..max_dims {
let dim1 = if i < shape1.len() {
shape1[shape1.len() - 1 - i]
} else {
1
};
let dim2 = if i < shape2.len() {
shape2[shape2.len() - 1 - i]
} else {
1
};
if dim1 == dim2 {
result.push(dim1);
} else if dim1 == 1 {
result.push(dim2);
} else if dim2 == 1 {
result.push(dim1);
} else {
return Err(TorshError::ShapeMismatch {
expected: shape1.to_vec(),
got: shape2.to_vec(),
});
}
}
result.reverse();
Ok(result)
}
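/// Maps a flat index in the broadcast result back to a flat index in one
/// operand. Size-1 dimensions contribute coordinate 0 and missing leading
/// dimensions are skipped. Worked example: broadcasting `[3]` with `[2, 3]`
/// yields shape `[2, 3]`; flat index 4 has coordinates (1, 1), which maps to
/// index 4 in the `[2, 3]` operand and index 1 in the `[3]` operand.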
fn compute_broadcast_index(
flat_idx: usize,
broadcast_shape: &[usize],
original_shape: &[usize],
) -> usize {
let mut result = 0;
let mut remaining = flat_idx;
let dims_diff = broadcast_shape.len() - original_shape.len();
for (i, &broadcast_dim) in broadcast_shape.iter().enumerate() {
let stride = broadcast_shape[i + 1..].iter().product::<usize>().max(1);
let coord = remaining / stride;
remaining %= stride;
debug_assert!(
coord < broadcast_dim,
"Coordinate {} out of bounds for dimension {} of size {}",
coord,
i,
broadcast_dim
);
if i >= dims_diff {
let original_dim = original_shape[i - dims_diff];
let adjusted_coord = if original_dim == 1 { 0 } else { coord };
result = result * original_dim + adjusted_coord;
}
}
result
}
impl<T: TensorElement + Copy> Tensor<T> {
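/// In-place scalar addition. `make_unique` is called first so that storage
/// shared with other tensor handles is detached before mutation.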
pub fn add_scalar_(&mut self, scalar: T) -> Result<()>
where
T: Copy + std::ops::Add<Output = T>,
{
self.make_unique()?;
self.apply_(|x| x + scalar)
}
pub fn add_scalar(&self, scalar: T) -> Result<Self>
where
T: Copy + std::ops::Add<Output = T>,
{
self.map(|x| x + scalar)
}
pub fn sub_scalar_(&mut self, scalar: T) -> Result<()>
where
T: Copy + std::ops::Sub<Output = T>,
{
self.make_unique()?;
self.apply_(|x| x - scalar)
}
pub fn sub_scalar(&self, scalar: T) -> Result<Self>
where
T: Copy + std::ops::Sub<Output = T>,
{
self.map(|x| x - scalar)
}
pub fn mul_scalar_(&mut self, scalar: T) -> Result<()>
where
T: Copy + std::ops::Mul<Output = T>,
{
self.make_unique()?;
self.apply_(|x| x * scalar)
}
pub fn mul_scalar(&self, scalar: T) -> Result<Self>
where
T: Copy + std::ops::Mul<Output = T>,
{
self.map(|x| x * scalar)
}
pub fn div_scalar_(&mut self, scalar: T) -> Result<()>
where
T: Copy + std::ops::Div<Output = T>,
{
self.make_unique()?;
self.apply_(|x| x / scalar)
}
pub fn div_scalar(&self, scalar: T) -> Result<Self>
where
T: Copy + std::ops::Div<Output = T>,
{
self.map(|x| x / scalar)
}
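/// Element-wise addition with broadcasting and autograd tracking. Shapes
/// that differ go through `broadcast_add`; note that among the binary ops
/// only `add` (and `pow`) currently record an `Operation` for the backward
/// pass. Illustrative usage, assuming a CPU tensor as in the tests below:
///
/// ```ignore
/// let a = Tensor::from_data(vec![1.0f32, 2.0], vec![2], DeviceType::Cpu)?;
/// let b = Tensor::from_data(vec![10.0f32, 20.0], vec![2], DeviceType::Cpu)?;
/// assert_eq!(a.add(&b)?.data()?, vec![11.0, 22.0]);
/// ```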
pub fn add(&self, other: &Self) -> Result<Self>
where
T: std::ops::Add<Output = T>,
{
if self.shape() != other.shape() {
return self.broadcast_add(other);
}
let mut result = self.elementwise_operation(other, |a, b| a + b)?;
if self.requires_grad || other.requires_grad {
result.requires_grad = true;
result.operation = Operation::Add {
lhs: Arc::new(self.clone()),
rhs: Arc::new(other.clone()),
};
}
Ok(result)
}
fn broadcast_add(&self, other: &Self) -> Result<Self>
where
T: std::ops::Add<Output = T>,
{
let self_shape_binding = self.shape();
let other_shape_binding = other.shape();
let self_shape = self_shape_binding.dims();
let other_shape = other_shape_binding.dims();
if !can_broadcast(self_shape, other_shape) {
return Err(TorshError::ShapeMismatch {
expected: self_shape.to_vec(),
got: other_shape.to_vec(),
});
}
let broadcast_shape = compute_broadcast_shape(self_shape, other_shape)?;
let self_data = self.data()?;
let other_data = other.data()?;
let total_elements: usize = broadcast_shape.iter().product();
let mut result_data = Vec::with_capacity(total_elements);
for i in 0..total_elements {
let self_idx = compute_broadcast_index(i, &broadcast_shape, self_shape);
let other_idx = compute_broadcast_index(i, &broadcast_shape, other_shape);
let self_val = *self_data
.get(self_idx)
.ok_or_else(|| TorshError::IndexError {
index: self_idx,
size: self_data.len(),
})?;
let other_val = *other_data
.get(other_idx)
.ok_or_else(|| TorshError::IndexError {
index: other_idx,
size: other_data.len(),
})?;
result_data.push(self_val + other_val);
}
let mut result = Self::from_data(result_data, broadcast_shape, self.device)?;
if self.requires_grad || other.requires_grad {
result.requires_grad = true;
result.operation = Operation::Add {
lhs: Arc::new(self.clone()),
rhs: Arc::new(other.clone()),
};
}
Ok(result)
}
pub fn sub(&self, other: &Self) -> Result<Self>
where
T: std::ops::Sub<Output = T>,
{
self.elementwise_operation(other, |a, b| a - b)
}
pub fn mul(&self, other: &Self) -> Result<Self>
where
T: std::ops::Mul<Output = T>,
{
self.elementwise_operation(other, |a, b| a * b)
}
pub fn div(&self, other: &Self) -> Result<Self>
where
T: std::ops::Div<Output = T>,
{
self.elementwise_operation(other, |a, b| a / b)
}
fn broadcast_binary_op<F>(&self, other: &Self, op: F) -> Result<Self>
where
F: Fn(T, T) -> T + Send + Sync,
{
use crate::broadcast::BroadcastOps;
let self_shape_binding = self.shape();
let self_shape = self_shape_binding.dims();
let other_shape_binding = other.shape();
let other_shape = other_shape_binding.dims();
let broadcast_shape = BroadcastOps::compute_broadcast_shape(self_shape, other_shape)?;
let self_data = self.data()?;
let other_data = other.data()?;
let total_elements = broadcast_shape.iter().product::<usize>();
let mut result_data = Vec::with_capacity(total_elements);
let mut indices = vec![0; broadcast_shape.len()];
for _ in 0..total_elements {
let self_idx = self.compute_broadcast_index(&indices, self_shape, &broadcast_shape)?;
let other_idx =
other.compute_broadcast_index(&indices, other_shape, &broadcast_shape)?;
let result = op(self_data[self_idx], other_data[other_idx]);
result_data.push(result);
Self::increment_indices(&mut indices, &broadcast_shape);
}
Self::from_data(result_data, broadcast_shape, self.device)
}
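/// Advances a multi-dimensional index odometer-style: the last coordinate
/// increments first and carries into the preceding one on overflow, walking
/// `shape` in row-major order.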
fn increment_indices(indices: &mut [usize], shape: &[usize]) {
for i in (0..indices.len()).rev() {
indices[i] += 1;
if indices[i] < shape[i] {
break;
}
indices[i] = 0;
}
}
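/// Method form of the broadcast-index mapping used by `broadcast_binary_op`:
/// folds multi-dimensional `broadcast_indices` into a flat index into
/// `original_shape`, treating size-1 dimensions as index 0.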
fn compute_broadcast_index(
&self,
broadcast_indices: &[usize],
original_shape: &[usize],
broadcast_shape: &[usize],
) -> Result<usize> {
let ndim_diff = broadcast_shape.len() - original_shape.len();
let mut flat_index = 0;
let mut stride = 1;
for i in (0..original_shape.len()).rev() {
let broadcast_idx = broadcast_indices[ndim_diff + i];
let original_size = original_shape[i];
let actual_idx = if original_size == 1 { 0 } else { broadcast_idx };
flat_index += actual_idx * stride;
stride *= original_size;
}
Ok(flat_index)
}
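/// Shared driver for binary element-wise ops: broadcasts on shape mismatch,
/// then dispatches to the SIMD helper above 1000 elements, the parallel path
/// above 100 elements, or a sequential zip/map. The thresholds are untuned
/// heuristics.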
fn elementwise_operation<F>(&self, other: &Self, op: F) -> Result<Self>
where
F: Fn(T, T) -> T + Send + Sync,
{
if self.shape() != other.shape() {
return self.broadcast_binary_op(other, op);
}
let self_data = self.data()?;
let other_data = other.data()?;
#[cfg(feature = "simd")]
{
if self_data.len() > 1000 {
let result_data = self.simd_elementwise_operation(&self_data, &other_data, op)?;
return Self::from_data(result_data, self.shape().dims().to_vec(), self.device);
}
}
#[cfg(feature = "parallel")]
{
if self_data.len() > 100 {
let paired_data: Vec<(T, T)> = self_data
.iter()
.zip(other_data.iter())
.map(|(&a, &b)| (a, b))
.collect();
let result_data = intelligent_parallel_process(
paired_data,
TensorOpType::ElementWise,
self.device.clone(),
|(a, b)| op(a, b),
);
return Self::from_data(result_data, self.shape().dims().to_vec(), self.device);
}
}
let result_data: Vec<T> = self_data
.iter()
.zip(other_data.iter())
.map(|(&a, &b)| op(a, b))
.collect();
Self::from_data(result_data, self.shape().dims().to_vec(), self.device)
}
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_elementwise_operation<F>(&self, data_a: &[T], data_b: &[T], op: F) -> Result<Vec<T>>
where
F: Fn(T, T) -> T + Send + Sync,
T: TensorElement,
{
#[cfg(feature = "simd")]
{
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
let _a_f32 = unsafe { std::mem::transmute::<&[T], &[f32]>(data_a) };
let _b_f32 = unsafe { std::mem::transmute::<&[T], &[f32]>(data_b) };
#[cfg(feature = "parallel")]
{
use scirs2_core::parallel_ops::*;
return Ok(data_a
.par_iter()
.zip(data_b.par_iter())
.map(|(&a, &b)| op(a, b))
.collect());
}
#[cfg(not(feature = "parallel"))]
{
return Ok(data_a
.iter()
.zip(data_b.iter())
.map(|(&a, &b)| op(a, b))
.collect());
}
}
}
Ok(data_a
.iter()
.zip(data_b.iter())
.map(|(&a, &b)| op(a, b))
.collect())
}
}
impl<T: TensorElement + Copy> Tensor<T>
where
T: scirs2_core::numeric::Float + torsh_core::dtype::FloatElement,
{
pub fn sqrt(&self) -> Result<Self> {
self.map(|x| x.sqrt())
}
pub fn square(&self) -> Result<Self> {
self.map(|x| x * x)
}
pub fn rsqrt(&self) -> Result<Self> {
self.map(|x| T::from(1.0).expect("numeric conversion should succeed") / x.sqrt())
}
pub fn reciprocal(&self) -> Result<Self> {
self.map(|x| T::from(1.0).expect("numeric conversion should succeed") / x)
}
pub fn exp(&self) -> Result<Self> {
self.map(|x| x.exp())
}
pub fn ln(&self) -> Result<Self> {
self.map(|x| x.ln())
}
pub fn log10(&self) -> Result<Self> {
self.map(|x| x.log10())
}
pub fn log2(&self) -> Result<Self> {
self.map(|x| x.log2())
}
pub fn log(&self) -> Result<Self> {
self.map(|x| x.ln())
}
pub fn sin(&self) -> Result<Self> {
self.map(|x| x.sin())
}
pub fn cos(&self) -> Result<Self> {
self.map(|x| x.cos())
}
pub fn tan(&self) -> Result<Self> {
self.map(|x| x.tan())
}
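/// GELU activation with tiered dispatch: an (optional) GPU path above
/// 50_000 elements, a SIMD kernel for `f32` above 1000 elements, a parallel
/// map above 100 elements, and a sequential fallback. The scalar paths use
/// the tanh approximation in `compute_gelu_scalar`; the SIMD path delegates
/// to `scirs2_core`'s `simd_gelu_f32`.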
pub fn gelu(&self) -> Result<Self> {
#[cfg(feature = "gpu")]
{
if self.numel() > 50000 {
if let Ok(result) = self.gpu_gelu() {
return Ok(result);
}
}
}
#[cfg(feature = "simd")]
{
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() && self.numel() > 1000 {
return self.simd_gelu_f32();
}
}
#[cfg(feature = "parallel")]
{
if self.numel() > 100 {
return self.parallel_map(|x| self.compute_gelu_scalar(x));
}
}
self.map(|x| self.compute_gelu_scalar(x))
}
#[cfg(feature = "gpu")]
#[allow(dead_code)]
fn gpu_gelu(&self) -> Result<Self>
where
T: torsh_core::dtype::FloatElement,
{
#[cfg(feature = "profiling")]
Err(TorshError::InvalidArgument(
"GPU GELU temporarily unavailable".to_string(),
))
}
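/// Scalar GELU via the tanh approximation:
/// `gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`.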
fn compute_gelu_scalar(&self, x: T) -> T {
let pi = T::from(std::f64::consts::PI).expect("numeric conversion should succeed");
let two = T::from(2.0).expect("numeric conversion should succeed");
let sqrt_2_over_pi = (two / pi).sqrt();
let point_044715 = T::from(0.044715).expect("numeric conversion should succeed");
let one = <T as scirs2_core::numeric::One>::one();
let half = T::from(0.5).expect("numeric conversion should succeed");
let x_cubed = x * x * x;
let tanh_input = sqrt_2_over_pi * (x + point_044715 * x_cubed);
half * x * (one + tanh_input.tanh())
}
#[cfg(feature = "simd")]
fn simd_gelu_f32(&self) -> Result<Self> {
use scirs2_core::ndarray::ArrayView1;
let data = self.data()?;
let data_f32: &[f32] =
unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, data.len()) };
let data_view = ArrayView1::from(data_f32);
let result_array = adaptive_simd::adaptive_simd_gelu_f32(&data_view);
let result_vec: Vec<T> = result_array
.to_vec()
.into_iter()
.map(|f| unsafe { std::mem::transmute_copy::<f32, T>(&f) })
.collect();
Self::from_data(
result_vec,
self.shape().dims().to_vec(),
self.device.clone(),
)
}
pub fn leaky_relu(&self, negative_slope: T) -> Result<Self> {
self.map(|x| {
if x > scirs2_core::numeric::Zero::zero() {
x
} else {
negative_slope * x
}
})
}
pub fn asin(&self) -> Result<Self> {
self.map(|x| x.asin())
}
pub fn acos(&self) -> Result<Self> {
self.map(|x| x.acos())
}
pub fn atan(&self) -> Result<Self> {
self.map(|x| x.atan())
}
pub fn sinh(&self) -> Result<Self> {
self.map(|x| x.sinh())
}
pub fn cosh(&self) -> Result<Self> {
self.map(|x| x.cosh())
}
pub fn tanh(&self) -> Result<Self> {
self.map(|x| x.tanh())
}
pub fn pow(&self, exponent: T) -> Result<Self>
where
T: TensorElement + Into<f32>,
{
let exponent_f32: f32 = exponent.into();
let mut result = self.map(|x| x.powf(exponent))?;
if self.requires_grad {
result.requires_grad = true;
result.operation = Operation::Power {
input: Arc::new(self.clone()),
exponent: exponent_f32,
};
}
Ok(result)
}
pub fn pow_scalar(&self, exponent: T) -> Result<Self>
where
T: TensorElement + Into<f32>,
{
self.pow(exponent)
}
pub fn pow_tensor(&self, exponent: &Self) -> Result<Self> {
self.elementwise_operation(exponent, |base, exp| base.powf(exp))
}
pub fn floor(&self) -> Result<Self> {
self.map(|x| x.floor())
}
pub fn ceil(&self) -> Result<Self> {
self.map(|x| x.ceil())
}
pub fn round(&self) -> Result<Self> {
self.map(|x| x.round())
}
pub fn trunc(&self) -> Result<Self> {
self.map(|x| x.trunc())
}
pub fn fract(&self) -> Result<Self> {
self.map(|x| x.fract())
}
pub fn neg(&self) -> Result<Self>
where
T: std::ops::Neg<Output = T>,
{
self.map(|x| -x)
}
pub fn sign(&self) -> Result<Self> {
self.map(|x| {
if x > <T as scirs2_core::numeric::Zero>::zero() {
<T as scirs2_core::numeric::One>::one()
} else if x < <T as scirs2_core::numeric::Zero>::zero() {
-<T as scirs2_core::numeric::One>::one()
} else {
<T as scirs2_core::numeric::Zero>::zero()
}
})
}
}
impl<T: TensorElement + Copy> Tensor<T> {
pub fn add_op(&self, other: &Self) -> Result<Self>
where
T: std::ops::Add<Output = T>,
{
self.add(other)
}
pub fn mul_op(&self, other: &Self) -> Result<Self>
where
T: std::ops::Mul<Output = T>,
{
self.mul(other)
}
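/// Logistic sigmoid `1 / (1 + exp(-x))`, with SIMD and parallel fast paths
/// mirroring `relu` and `gelu`.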
pub fn sigmoid(&self) -> Result<Self>
where
T: torsh_core::dtype::FloatElement,
{
#[cfg(feature = "simd")]
{
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() && self.numel() > 1000 {
return self.simd_sigmoid_f32();
}
}
#[cfg(feature = "parallel")]
{
if self.numel() > 100 {
let one = <T as scirs2_core::numeric::One>::one();
return self.parallel_map(|x| {
one / (one + (-x).exp())
});
}
}
let one = <T as scirs2_core::numeric::One>::one();
self.map(|x| one / (one + (-x).exp()))
}
#[cfg(feature = "simd")]
fn simd_sigmoid_f32(&self) -> Result<Self> {
use scirs2_core::ndarray::ArrayView1;
let data = self.data()?;
let data_f32: &[f32] =
unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, data.len()) };
let data_view = ArrayView1::from(data_f32);
let result_array = adaptive_simd::adaptive_simd_sigmoid_f32(&data_view);
let result_vec: Vec<T> = result_array
.to_vec()
.into_iter()
.map(|f| unsafe { std::mem::transmute_copy::<f32, T>(&f) })
.collect();
Self::from_data(
result_vec,
self.shape().dims().to_vec(),
self.device.clone(),
)
}
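/// ReLU activation `max(x, 0)`: SIMD kernel for `f32` above 1000 elements,
/// parallel map above 100 elements, scalar map otherwise.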
pub fn relu(&self) -> Result<Self>
where
T: std::cmp::PartialOrd + scirs2_core::numeric::Zero,
{
let zero = <T as scirs2_core::numeric::Zero>::zero();
#[cfg(feature = "simd")]
{
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() && self.numel() > 1000 {
return self.simd_relu_f32();
}
}
#[cfg(feature = "parallel")]
{
if self.numel() > 100 {
return self.parallel_map(|x| if x > zero { x } else { zero });
}
}
self.map(|x| if x > zero { x } else { zero })
}
#[cfg(feature = "simd")]
fn simd_relu_f32(&self) -> Result<Self> {
use scirs2_core::ndarray::ArrayView1;
let data = self.data()?;
let data_f32: &[f32] =
unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, data.len()) };
let data_view = ArrayView1::from(data_f32);
let result_array = adaptive_simd::adaptive_simd_relu_f32(&data_view);
let result_vec: Vec<T> = result_array
.to_vec()
.into_iter()
.map(|f| unsafe { std::mem::transmute_copy::<f32, T>(&f) })
.collect();
Self::from_data(
result_vec,
self.shape().dims().to_vec(),
self.device.clone(),
)
}
#[cfg(feature = "parallel")]
fn parallel_map<F>(&self, op: F) -> Result<Self>
where
F: Fn(T) -> T + Send + Sync,
{
let data = self.data()?;
let result_data = intelligent_parallel_process(
data.iter().copied().collect::<Vec<_>>(),
TensorOpType::Activation,
self.device.clone(),
op,
);
Self::from_data(result_data, self.shape().dims().to_vec(), self.device)
}
pub fn minimum(&self, other: &Self) -> Result<Self>
where
T: std::cmp::PartialOrd,
{
self.elementwise_operation(other, |a, b| if a < b { a } else { b })
}
pub fn maximum(&self, other: &Self) -> Result<Self>
where
T: std::cmp::PartialOrd,
{
self.elementwise_operation(other, |a, b| if a > b { a } else { b })
}
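/// Clamps every element into `[min, max]`. Assumes `min <= max`; no ordering
/// check is performed.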
pub fn clamp(&self, min: T, max: T) -> Result<Self>
where
T: std::cmp::PartialOrd + Copy,
{
let data = self.to_vec()?;
let clamped_data: Vec<T> = data
.iter()
.map(|&x| {
if x < min {
min
} else if x > max {
max
} else {
x
}
})
.collect();
Self::from_data(
clamped_data,
self.shape().dims().to_vec(),
self.device.clone(),
)
}
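/// Dot product as element-wise multiply followed by `sum`. For matching 1-D
/// tensors this is the conventional inner product, returned as the tensor
/// produced by `sum`.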
pub fn dot(&self, other: &Self) -> Result<Self>
where
T: std::ops::Mul<Output = T> + std::ops::Add<Output = T> + scirs2_core::numeric::Zero,
{
let elementwise = self.mul(other)?;
elementwise.sum()
}
}
impl<T: TensorElement + Copy + scirs2_core::numeric::FromPrimitive> Tensor<T> {
pub fn add_scirs2(&self, other: &Self) -> Result<Self>
where
T: std::ops::Add<Output = T> + scirs2_core::numeric::Float,
{
self.add(other)
}
pub fn mul_scirs2(&self, other: &Self) -> Result<Self>
where
T: std::ops::Mul<Output = T> + scirs2_core::numeric::Float,
{
self.mul(other)
}
pub fn sub_scirs2(&self, other: &Self) -> Result<Self>
where
T: std::ops::Sub<Output = T> + scirs2_core::numeric::Float,
{
self.sub(other)
}
pub fn div_scirs2(&self, other: &Self) -> Result<Self>
where
T: std::ops::Div<Output = T> + scirs2_core::numeric::Float,
{
self.div(other)
}
}
impl<T: TensorElement + Copy> std::ops::Add for &Tensor<T>
where
T: std::ops::Add<Output = T>,
{
type Output = Tensor<T>;
fn add(self, rhs: Self) -> Self::Output {
self.add(rhs).expect("tensor addition should succeed")
}
}
impl<T: TensorElement + Copy> std::ops::Sub for &Tensor<T>
where
T: std::ops::Sub<Output = T>,
{
type Output = Tensor<T>;
fn sub(self, rhs: Self) -> Self::Output {
self.sub(rhs).expect("tensor subtraction should succeed")
}
}
impl<T: TensorElement + Copy> std::ops::Mul for &Tensor<T>
where
T: std::ops::Mul<Output = T>,
{
type Output = Tensor<T>;
fn mul(self, rhs: Self) -> Self::Output {
self.mul(rhs).expect("tensor multiplication should succeed")
}
}
impl<T: TensorElement + Copy> std::ops::Div for &Tensor<T>
where
T: std::ops::Div<Output = T>,
{
type Output = Tensor<T>;
fn div(self, rhs: Self) -> Self::Output {
self.div(rhs).expect("tensor division should succeed")
}
}
impl<T: TensorElement + Copy> std::ops::Neg for &Tensor<T>
where
T: std::ops::Neg<Output = T>,
{
type Output = Tensor<T>;
fn neg(self) -> Self::Output {
self.map(|x| -x).expect("negation map should succeed")
}
}
impl<T: TensorElement + Copy + scirs2_core::numeric::Float> Tensor<T> {
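/// SIMD-accelerated element-wise addition via `scirs2_core`'s unified SIMD
/// trait. Like `mul_simd` and `dot_simd` below, this requires exactly
/// matching shapes (no broadcasting) and operates on the flattened data.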
#[cfg(feature = "simd")]
pub fn add_simd(&self, other: &Self) -> Result<Self>
where
T: scirs2_core::simd_ops::SimdUnifiedOps,
{
use scirs2_core::ndarray::Array1;
if self.shape().dims() != other.shape().dims() {
return Err(TorshError::ShapeMismatch {
expected: self.shape().dims().to_vec(),
got: other.shape().dims().to_vec(),
});
}
let data_a = self.to_vec()?;
let data_b = other.to_vec()?;
let arr_a = Array1::from_vec(data_a);
let arr_b = Array1::from_vec(data_b);
let result_arr = T::simd_add(&arr_a.view(), &arr_b.view());
Tensor::from_vec(result_arr.to_vec(), self.shape().dims())
}
#[cfg(feature = "simd")]
pub fn mul_simd(&self, other: &Self) -> Result<Self>
where
T: scirs2_core::simd_ops::SimdUnifiedOps,
{
use scirs2_core::ndarray::Array1;
if self.shape().dims() != other.shape().dims() {
return Err(TorshError::ShapeMismatch {
expected: self.shape().dims().to_vec(),
got: other.shape().dims().to_vec(),
});
}
let data_a = self.to_vec()?;
let data_b = other.to_vec()?;
let arr_a = Array1::from_vec(data_a);
let arr_b = Array1::from_vec(data_b);
let result_arr = T::simd_mul(&arr_a.view(), &arr_b.view());
Tensor::from_vec(result_arr.to_vec(), self.shape().dims())
}
#[cfg(feature = "simd")]
pub fn dot_simd(&self, other: &Self) -> Result<T>
where
T: scirs2_core::simd_ops::SimdUnifiedOps,
{
use scirs2_core::ndarray::Array1;
if self.shape().dims() != other.shape().dims() {
return Err(TorshError::ShapeMismatch {
expected: self.shape().dims().to_vec(),
got: other.shape().dims().to_vec(),
});
}
let data_a = self.to_vec()?;
let data_b = other.to_vec()?;
let arr_a = Array1::from_vec(data_a);
let arr_b = Array1::from_vec(data_b);
Ok(T::simd_dot(&arr_a.view(), &arr_b.view()))
}
pub fn reduce_memory_efficient<F>(&self, func: F) -> Result<T>
where
F: Fn(T, T) -> T + Send + Sync,
{
#[cfg(feature = "profiling")]
{
}
let data = self.to_vec()?;
Ok(data
.into_iter()
.reduce(func)
.unwrap_or_else(|| <T as scirs2_core::numeric::Zero>::zero()))
}
}
impl<T: TensorElement + Copy + std::ops::Mul<Output = T>> Tensor<T> {
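// In-place variants. Each refuses to run when `requires_grad` is set, since
// mutating a tensor in place would invalidate its autograd history.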
pub fn relu_(&mut self) -> Result<&mut Self>
where
T: std::cmp::PartialOrd + scirs2_core::numeric::Zero,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let zero = <T as scirs2_core::numeric::Zero>::zero();
let len = self.storage.len();
for i in 0..len {
let current = self.storage.get(i)?;
if current < zero {
self.storage.set(i, zero)?;
}
}
Ok(self)
}
pub fn sigmoid_(&mut self) -> Result<&mut Self>
where
T: torsh_core::dtype::FloatElement,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let one = <T as scirs2_core::numeric::One>::one();
let len = self.storage.len();
for i in 0..len {
let x = self.storage.get(i)?;
let sigmoid_val = one / (one + (-x).exp());
self.storage.set(i, sigmoid_val)?;
}
Ok(self)
}
pub fn tanh_(&mut self) -> Result<&mut Self>
where
T: torsh_core::dtype::FloatElement,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let len = self.storage.len();
for i in 0..len {
let x = self.storage.get(i)?;
self.storage.set(i, x.tanh())?;
}
Ok(self)
}
pub fn gelu_(&mut self) -> Result<&mut Self>
where
T: torsh_core::dtype::FloatElement,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let len = self.storage.len();
let pi = T::from(std::f64::consts::PI).expect("numeric conversion should succeed");
let two = T::from(2.0).expect("numeric conversion should succeed");
let sqrt_2_over_pi = (two / pi).sqrt();
let point_044715 = T::from(0.044715).expect("numeric conversion should succeed");
let one = <T as scirs2_core::numeric::One>::one();
let half = T::from(0.5).expect("numeric conversion should succeed");
for i in 0..len {
let x = self.storage.get(i)?;
let x_cubed = x * x * x;
let tanh_input = sqrt_2_over_pi * (x + point_044715 * x_cubed);
let gelu_val = half * x * (one + tanh_input.tanh());
self.storage.set(i, gelu_val)?;
}
Ok(self)
}
pub fn leaky_relu_(&mut self, negative_slope: T) -> Result<&mut Self>
where
T: std::cmp::PartialOrd + scirs2_core::numeric::Zero,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let zero = <T as scirs2_core::numeric::Zero>::zero();
let len = self.storage.len();
for i in 0..len {
let x = self.storage.get(i)?;
if x < zero {
self.storage.set(i, negative_slope * x)?;
}
}
Ok(self)
}
pub fn clamp_(&mut self, min: T, max: T) -> Result<&mut Self>
where
T: std::cmp::PartialOrd,
{
if self.requires_grad {
return Err(TorshError::InvalidArgument(
"In-place operation on tensor that requires grad is not allowed".to_string(),
));
}
let len = self.storage.len();
for i in 0..len {
let x = self.storage.get(i)?;
let clamped = if x < min {
min
} else if x > max {
max
} else {
x
};
self.storage.set(i, clamped)?;
}
Ok(self)
}
}
#[cfg(test)]
mod tests {
use super::*;
use torsh_core::device::DeviceType;
#[test]
fn test_scalar_operations() {
let data = vec![1.0f32, 2.0, 3.0, 4.0];
let tensor = Tensor::from_data(data, vec![4], DeviceType::Cpu)
.expect("failed to create tensor for scalar ops");
let result = tensor.add_scalar(5.0).expect("add_scalar should succeed");
assert_eq!(
result.data().expect("failed to get add_scalar result data"),
vec![6.0, 7.0, 8.0, 9.0]
);
let result = tensor.mul_scalar(2.0).expect("mul_scalar should succeed");
assert_eq!(
result.data().expect("failed to get mul_scalar result data"),
vec![2.0, 4.0, 6.0, 8.0]
);
let result = tensor.sub_scalar(1.0).expect("sub_scalar should succeed");
assert_eq!(
result.data().expect("failed to get sub_scalar result data"),
vec![0.0, 1.0, 2.0, 3.0]
);
let result = tensor.div_scalar(2.0).expect("div_scalar should succeed");
assert_eq!(
result.data().expect("failed to get div_scalar result data"),
vec![0.5, 1.0, 1.5, 2.0]
);
}
#[test]
fn test_elementwise_operations() {
let a = Tensor::from_data(vec![1.0f32, 2.0, 3.0], vec![3], DeviceType::Cpu)
.expect("failed to create tensor a");
let b = Tensor::from_data(vec![4.0f32, 5.0, 6.0], vec![3], DeviceType::Cpu)
.expect("failed to create tensor b");
let result = a.add(&b).expect("elementwise add should succeed");
assert_eq!(
result.data().expect("failed to get add result data"),
vec![5.0, 7.0, 9.0]
);
let result = a.sub(&b).expect("elementwise sub should succeed");
assert_eq!(
result.data().expect("failed to get sub result data"),
vec![-3.0, -3.0, -3.0]
);
let result = a.mul(&b).expect("elementwise mul should succeed");
assert_eq!(
result.data().expect("failed to get mul result data"),
vec![4.0, 10.0, 18.0]
);
let result = b.div(&a).expect("elementwise div should succeed");
assert_eq!(
result.data().expect("failed to get div result data"),
vec![4.0, 2.5, 2.0]
);
}
#[test]
fn test_mathematical_functions() {
let data = vec![1.0f32, 4.0, 9.0, 16.0];
let tensor = Tensor::from_data(data, vec![4], DeviceType::Cpu)
.expect("failed to create tensor for math functions");
let sqrt_result = tensor.sqrt().expect("sqrt should succeed");
assert_eq!(
sqrt_result.data().expect("failed to get sqrt result data"),
vec![1.0, 2.0, 3.0, 4.0]
);
let data2 = vec![0.0f32, 1.0, 2.0];
let tensor2 = Tensor::from_data(data2, vec![3], DeviceType::Cpu)
.expect("failed to create tensor2 for exp");
let exp_result = tensor2.exp().expect("exp should succeed");
let expected_exp = vec![1.0, std::f32::consts::E, std::f32::consts::E.powi(2)];
for (got, &expected) in exp_result
.data()
.expect("failed to get exp result data")
.iter()
.zip(&expected_exp)
{
assert!((got - expected).abs() < 1e-6);
}
}
#[test]
fn test_trigonometric_functions() {
let data = vec![0.0f32, std::f32::consts::PI / 2.0, std::f32::consts::PI];
let tensor = Tensor::from_data(data, vec![3], DeviceType::Cpu)
.expect("failed to create tensor for trig functions");
let sin_result = tensor.sin().expect("sin should succeed");
let sin_data = sin_result.data().expect("failed to get sin result data");
assert!((sin_data[0] - 0.0).abs() < 1e-6);
assert!((sin_data[1] - 1.0).abs() < 1e-6);
assert!((sin_data[2] - 0.0).abs() < 1e-6);
let cos_result = tensor.cos().expect("cos should succeed");
let cos_data = cos_result.data().expect("failed to get cos result data");
assert!((cos_data[0] - 1.0).abs() < 1e-6);
assert!((cos_data[1] - 0.0).abs() < 1e-6);
assert!((cos_data[2] - (-1.0)).abs() < 1e-6);
}
#[test]
fn test_operator_overloads() {
let a = Tensor::from_data(vec![1.0f32, 2.0, 3.0], vec![3], DeviceType::Cpu)
.expect("failed to create tensor a for operator overloads");
let b = Tensor::from_data(vec![4.0f32, 5.0, 6.0], vec![3], DeviceType::Cpu)
.expect("failed to create tensor b for operator overloads");
let result = &a + &b;
assert_eq!(
result.data().expect("failed to get add operator result"),
vec![5.0, 7.0, 9.0]
);
let result = &b - &a;
assert_eq!(
result.data().expect("failed to get sub operator result"),
vec![3.0, 3.0, 3.0]
);
let result = &a * &b;
assert_eq!(
result.data().expect("failed to get mul operator result"),
vec![4.0, 10.0, 18.0]
);
let result = &b / &a;
assert_eq!(
result.data().expect("failed to get div operator result"),
vec![4.0, 2.5, 2.0]
);
let neg_result = -&a;
assert_eq!(
neg_result
.data()
.expect("failed to get neg operator result"),
vec![-1.0, -2.0, -3.0]
);
}
#[test]
fn test_power_operations() {
let data = vec![2.0f32, 3.0, 4.0];
let tensor = Tensor::from_data(data, vec![3], DeviceType::Cpu)
.expect("failed to create tensor for power ops");
let pow_result = tensor.pow(2.0).expect("pow should succeed");
assert_eq!(
pow_result.data().expect("failed to get pow result data"),
vec![4.0, 9.0, 16.0]
);
let exponents = Tensor::from_data(vec![1.0f32, 2.0, 3.0], vec![3], DeviceType::Cpu)
.expect("failed to create exponents tensor");
let pow_tensor_result = tensor
.pow_tensor(&exponents)
.expect("pow_tensor should succeed");
assert_eq!(
pow_tensor_result
.data()
.expect("failed to get pow_tensor result data"),
vec![2.0, 9.0, 64.0]
);
}
#[test]
fn test_rounding_functions() {
let data = vec![1.2f32, 2.7, -1.5, -2.3];
let tensor = Tensor::from_data(data, vec![4], DeviceType::Cpu)
.expect("failed to create tensor for rounding");
let floor_result = tensor.floor().expect("floor should succeed");
assert_eq!(
floor_result
.data()
.expect("failed to get floor result data"),
vec![1.0, 2.0, -2.0, -3.0]
);
let ceil_result = tensor.ceil().expect("ceil should succeed");
assert_eq!(
ceil_result.data().expect("failed to get ceil result data"),
vec![2.0, 3.0, -1.0, -2.0]
);
let round_result = tensor.round().expect("round should succeed");
assert_eq!(
round_result
.data()
.expect("failed to get round result data"),
vec![1.0, 3.0, -2.0, -2.0]
);
}
#[test]
fn test_sign_function() {
let data = vec![-3.0f32, 0.0, 5.0, -1.0];
let tensor = Tensor::from_data(data, vec![4], DeviceType::Cpu)
.expect("failed to create tensor for sign");
let sign_result = tensor.sign().expect("sign should succeed");
assert_eq!(
sign_result.data().expect("failed to get sign result data"),
vec![-1.0, 0.0, 1.0, -1.0]
);
}
#[test]
fn test_shape_mismatch_error() {
let a = Tensor::from_data(vec![1.0f32, 2.0], vec![2], DeviceType::Cpu)
.expect("failed to create tensor a for shape mismatch test");
let b = Tensor::from_data(vec![1.0f32, 2.0, 3.0], vec![3], DeviceType::Cpu)
.expect("failed to create tensor b for shape mismatch test");
assert!(a.add(&b).is_err());
assert!(a.mul(&b).is_err());
}
#[test]
fn test_relu_inplace() {
let mut tensor =
Tensor::from_data(vec![-2.0f32, -1.0, 0.0, 1.0, 2.0], vec![5], DeviceType::Cpu)
.expect("failed to create tensor for relu inplace");
tensor.relu_().expect("relu_ should succeed");
let result = tensor.data().expect("failed to get relu_ result data");
assert_eq!(result, vec![0.0, 0.0, 0.0, 1.0, 2.0]);
}
#[test]
fn test_sigmoid_inplace() {
let mut tensor = Tensor::from_data(vec![0.0f32], vec![1], DeviceType::Cpu)
.expect("failed to create tensor for sigmoid inplace");
tensor.sigmoid_().expect("sigmoid_ should succeed");
let result = tensor.data().expect("failed to get sigmoid_ result data");
assert!((result[0] - 0.5).abs() < 1e-6);
}
#[test]
fn test_tanh_inplace() {
let mut tensor = Tensor::from_data(vec![0.0f32], vec![1], DeviceType::Cpu)
.expect("failed to create tensor for tanh inplace");
tensor.tanh_().expect("tanh_ should succeed");
let result = tensor.data().expect("failed to get tanh_ result data");
assert!(result[0].abs() < 1e-6);
}
#[test]
fn test_clamp_inplace() {
let mut tensor =
Tensor::from_data(vec![-2.0f32, -1.0, 0.0, 1.0, 2.0], vec![5], DeviceType::Cpu)
.expect("failed to create tensor for clamp inplace");
tensor.clamp_(-1.0, 1.0).expect("clamp_ should succeed");
let result = tensor.data().expect("failed to get clamp_ result data");
assert_eq!(result, vec![-1.0, -1.0, 0.0, 1.0, 1.0]);
}
#[test]
fn test_inplace_with_requires_grad_error() {
let mut tensor = Tensor::from_data(vec![1.0f32, 2.0], vec![2], DeviceType::Cpu)
.expect("failed to create tensor for requires_grad test");
tensor.requires_grad = true;
assert!(tensor.relu_().is_err());
assert!(tensor.sigmoid_().is_err());
assert!(tensor.tanh_().is_err());
}
}