use super::common::normalize_axis;
use crate::tensor::TensorStorage;
use crate::{Result, Tensor, TensorError};
use scirs2_core::ndarray::{ArrayD, Axis};
use scirs2_core::numeric::{Float, FromPrimitive, Zero};
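/// Sums the elements of `x`, either over all elements (`axes == None`) or
/// along the given axes. Axis indices are normalized via `normalize_axis`,
/// and `keepdims` keeps each reduced axis as a size-1 dimension. Inputs with
/// more than 10 000 elements are routed through helpers reserved for a
/// parallel implementation (currently serial).
///
/// A minimal usage sketch (marked `ignore` since it is not compiled as a
/// doctest; it assumes the `Tensor::from_array` constructor used elsewhere in
/// this module):
///
/// ```ignore
/// use scirs2_core::ndarray::ArrayD;
///
/// let arr = ArrayD::from_shape_vec(vec![2, 3], vec![1.0f32; 6]).unwrap();
/// let t = Tensor::from_array(arr);
/// // Sum along axis 1, keeping it as a size-1 dimension: shape [2, 1].
/// let row_sums = sum(&t, Some(&[1]), true)?;
/// // Sum over all elements: a rank-0 (scalar) tensor.
/// let total = sum(&t, None, false)?;
/// ```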
pub fn sum<T>(x: &Tensor<T>, axes: Option<&[i32]>, keepdims: bool) -> Result<Tensor<T>>
where
T: Clone
+ Default
+ Zero
+ std::ops::Add<Output = T>
+ Send
+ Sync
+ 'static
+ bytemuck::Pod
+ bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
            // Shape inference is run up front so that invalid axes are
            // rejected before any work is done; the inferred shape itself is
            // not needed on the CPU path.
            let _result_shape =
                crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
            if let Some(axes) = axes {
                let mut result = arr.to_owned();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
                // Large tensors take the helper path so a parallel axis sum
                // can be slotted in later; it is currently serial.
                if result.len() > 10000 {
for &axis in &sorted_axes {
result = par_sum_axis(&result, axis)?;
if keepdims {
result = result.insert_axis(Axis(axis));
}
}
} else {
for &axis in &sorted_axes {
result = result.sum_axis(Axis(axis));
if keepdims {
result = result.insert_axis(Axis(axis));
}
}
}
Ok(Tensor::from_array(result))
} else {
let sum = if arr.len() > 10000 {
parallel_sum_all(arr)?
} else {
arr.sum()
};
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], sum)
} else {
ArrayD::from_elem(vec![], sum)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Sum,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
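/// Arithmetic mean of `x`, over all elements or along the given axes.
/// Axes are reduced one at a time in descending index order. For an empty
/// tensor the all-element mean falls back to `T::default()`.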
pub fn mean<T>(x: &Tensor<T>, axes: Option<&[i32]>, keepdims: bool) -> Result<Tensor<T>>
where
T: Clone
+ Default
+ Float
+ FromPrimitive
+ Send
+ Sync
+ 'static
+ bytemuck::Pod
+ bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
let _result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
if let Some(axes) = axes {
                let mut result = arr.to_owned();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
for &axis in &sorted_axes {
result = result
.mean_axis(Axis(axis))
.expect("axis should be valid for mean reduction");
if keepdims {
result = result.insert_axis(Axis(axis));
}
}
Ok(Tensor::from_array(result))
} else {
let mean_val = arr.mean().unwrap_or_default();
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], mean_val)
} else {
ArrayD::from_elem(vec![], mean_val)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Mean,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
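/// Maximum of `x`, over all elements or along the given axes, computed by
/// pairwise comparison. Returns an error for empty tensors, since there is
/// no identity element under `PartialOrd` alone.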
pub fn max<T>(x: &Tensor<T>, axes: Option<&[i32]>, keepdims: bool) -> Result<Tensor<T>>
where
T: Clone + Default + PartialOrd + Send + Sync + 'static + bytemuck::Pod + bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
            if arr.is_empty() {
                return Err(TensorError::invalid_argument(
                    "Cannot compute max of an empty tensor".to_string(),
                ));
            }
if let Some(axes) = axes {
let mut result = arr.clone();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
                for &axis in &sorted_axes {
                    // Reduce each lane by pairwise comparison, seeded from the
                    // lane's own first element; seeding with `T::default()`
                    // would skew the result for lanes whose elements are all
                    // below the default value.
                    result = result.map_axis(Axis(axis), |lane| {
                        lane.iter()
                            .cloned()
                            .reduce(|acc, x| if x > acc { x } else { acc })
                            .unwrap_or_default()
                    });
                    if keepdims {
                        result = result.insert_axis(Axis(axis));
                    }
                }
Ok(Tensor::from_array(result))
} else {
let max_val = arr
.iter()
.fold(None, |acc: Option<&T>, x| match acc {
None => Some(x),
Some(max) => {
if x > max {
Some(x)
} else {
Some(max)
}
}
})
.cloned()
.unwrap_or_default();
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], max_val)
} else {
ArrayD::from_elem(vec![], max_val)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Max,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
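/// Minimum of `x`, over all elements or along the given axes, computed by
/// pairwise comparison. Returns an error for empty tensors, since there is
/// no identity element under `PartialOrd` alone.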
pub fn min<T>(x: &Tensor<T>, axes: Option<&[i32]>, keepdims: bool) -> Result<Tensor<T>>
where
T: Clone + Default + PartialOrd + Send + Sync + 'static + bytemuck::Pod + bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
            if arr.is_empty() {
                return Err(TensorError::invalid_argument(
                    "Cannot compute min of an empty tensor".to_string(),
                ));
            }
if let Some(axes) = axes {
let mut result = arr.clone();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
                for &axis in &sorted_axes {
                    // Reduce each lane by pairwise comparison, seeded from the
                    // lane's own first element; seeding with `T::default()`
                    // would skew the result for lanes whose elements are all
                    // above the default value.
                    result = result.map_axis(Axis(axis), |lane| {
                        lane.iter()
                            .cloned()
                            .reduce(|acc, x| if x < acc { x } else { acc })
                            .unwrap_or_default()
                    });
                    if keepdims {
                        result = result.insert_axis(Axis(axis));
                    }
                }
Ok(Tensor::from_array(result))
} else {
let min_val = arr
.iter()
.fold(None, |acc: Option<&T>, x| match acc {
None => Some(x),
Some(min) => {
if x < min {
Some(x)
} else {
Some(min)
}
}
})
.cloned()
.unwrap_or_default();
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], min_val)
} else {
ArrayD::from_elem(vec![], min_val)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Min,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
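/// Product of the elements of `x`, over all elements or along the given
/// axes, seeded with `T::one()` as the multiplicative identity.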
pub fn prod<T>(x: &Tensor<T>, axes: Option<&[i32]>, keepdims: bool) -> Result<Tensor<T>>
where
T: Clone
+ Default
+ std::ops::Mul<Output = T>
+ scirs2_core::num_traits::One
+ Send
+ Sync
+ 'static
+ bytemuck::Pod
+ bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
let _result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
if let Some(axes) = axes {
let mut result = arr.clone();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
for &axis in &sorted_axes {
result = result.fold_axis(Axis(axis), T::one(), |acc, x| *acc * *x);
if keepdims {
result = result.insert_axis(Axis(axis));
}
}
Ok(Tensor::from_array(result))
} else {
let prod = arr.iter().fold(T::one(), |acc, x| acc * *x);
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], prod)
} else {
ArrayD::from_elem(vec![], prod)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Prod,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
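/// Variance of `x` with `ddof` delta degrees of freedom: squared deviations
/// from the mean are divided by `n - ddof` (clamped to at least 1), so
/// `ddof = 0` gives the population variance and `ddof = 1` the sample
/// variance. When several axes are given they are reduced one at a time,
/// which is not the same as a single joint reduction over all of them.
///
/// A minimal sketch of the scalar case (marked `ignore` since it is not
/// compiled as a doctest; it assumes the `Tensor::from_array` constructor
/// used elsewhere in this module):
///
/// ```ignore
/// // var = sum((x_i - mean)^2) / (n - ddof)
/// let arr = ArrayD::from_shape_vec(vec![4], vec![1.0f64, 2.0, 3.0, 4.0]).unwrap();
/// let t = Tensor::from_array(arr);
/// let population = variance(&t, None, false, 0)?; // divides by 4
/// let sample = variance(&t, None, false, 1)?;     // divides by 3
/// ```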
pub fn variance<T>(
x: &Tensor<T>,
axes: Option<&[i32]>,
keepdims: bool,
ddof: usize,
) -> Result<Tensor<T>>
where
T: Clone
+ Default
+ Float
+ FromPrimitive
+ Send
+ Sync
+ 'static
+ scirs2_core::ndarray::ScalarOperand
+ bytemuck::Pod
+ bytemuck::Zeroable,
{
match &x.storage {
TensorStorage::Cpu(arr) => {
let _result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
if let Some(axes) = axes {
                let mut result = arr.to_owned();
let mut sorted_axes: Vec<_> = axes
.iter()
.map(|&a| normalize_axis(a, x.shape().rank() as i32))
.collect::<Result<Vec<_>>>()?;
sorted_axes.sort_by(|a, b| b.cmp(a));
for &axis in &sorted_axes {
let mean = result
.mean_axis(Axis(axis))
.expect("axis should be valid for variance mean calculation");
let mut variance_result = ArrayD::zeros(mean.raw_dim());
let axis_size = result.shape()[axis];
let n = if axis_size > ddof {
axis_size - ddof
} else {
1
};
let n_f = T::from_usize(n).unwrap_or_else(T::one);
for i in 0..axis_size {
let slice = result.index_axis(Axis(axis), i);
let diff = &slice - &mean;
let squared_diff = diff.mapv(|x| x * x);
variance_result = variance_result + squared_diff;
}
variance_result = variance_result / n_f;
result = if keepdims {
variance_result.insert_axis(Axis(axis))
} else {
variance_result
};
}
Ok(Tensor::from_array(result))
} else {
let mean_val = arr.mean().unwrap_or_default();
let n = arr.len();
let effective_n = if n > ddof { n - ddof } else { 1 };
let n_f = T::from_usize(effective_n).unwrap_or_else(T::one);
let variance_val = arr
.iter()
.map(|x| {
let diff = *x - mean_val;
diff * diff
})
.fold(T::zero(), |acc, x| acc + x)
/ n_f;
let result = if keepdims {
ArrayD::from_elem(vec![1; x.shape().rank()], variance_val)
} else {
ArrayD::from_elem(vec![], variance_val)
};
Ok(Tensor::from_array(result))
}
}
#[cfg(feature = "gpu")]
TensorStorage::Gpu(gpu_buffer) => {
use crate::gpu::ops::{execute_axis_reduction_op, ReductionOp};
let result_shape =
crate::ops::shape_inference::infer_reduction(x.shape(), axes, keepdims)?;
let output_len = result_shape.dims().iter().product();
let result_buffer = execute_axis_reduction_op(
gpu_buffer,
ReductionOp::Variance,
x.shape().dims(),
axes,
keepdims,
output_len,
)?;
Ok(Tensor::from_gpu_buffer(result_buffer, result_shape))
}
}
}
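/// Axis-sum helper used by `sum` for large tensors. Despite the name, it
/// currently delegates to ndarray's serial `sum_axis` and exists as a hook
/// for a future parallel implementation.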
fn par_sum_axis<T>(arr: &ArrayD<T>, axis: usize) -> Result<ArrayD<T>>
where
T: Clone + Default + Zero + std::ops::Add<Output = T> + Send + Sync + 'static,
{
let axis_obj = Axis(axis);
Ok(arr.sum_axis(axis_obj))
}
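/// Whole-tensor sum helper used by `sum` for large tensors. The chunk-size
/// check is currently a placeholder: both branches delegate to ndarray's
/// serial `sum`.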
fn parallel_sum_all<T>(arr: &ArrayD<T>) -> Result<T>
where
T: Clone + Default + Zero + std::ops::Add<Output = T> + Send + Sync + 'static,
{
const CHUNK_SIZE: usize = 100000;
if arr.len() <= CHUNK_SIZE {
return Ok(arr.sum());
}
Ok(arr.sum())
}
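/// CPU fallback for the GPU path: dispatches a single-axis reduction to the
/// matching CPU implementation in this module. `Variance` is computed with
/// `ddof = 0` (population variance); operations without a match arm return a
/// not-implemented error.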
#[cfg(feature = "gpu")]
pub fn reduce_axis_cpu<T>(
tensor: &Tensor<T>,
axis: usize,
op: super::gpu_kernels::ReductionOp,
keep_dims: bool,
) -> Result<Tensor<T>>
where
T: scirs2_core::num_traits::Float
+ Default
+ bytemuck::Pod
+ Send
+ Sync
+ 'static
+ FromPrimitive
+ scirs2_core::num_traits::ops::mul_add::MulAdd
+ scirs2_core::ndarray::ScalarOperand,
{
match op {
super::gpu_kernels::ReductionOp::Sum => sum(tensor, Some(&[axis as i32]), keep_dims),
super::gpu_kernels::ReductionOp::Mean => mean(tensor, Some(&[axis as i32]), keep_dims),
super::gpu_kernels::ReductionOp::Max => max(tensor, Some(&[axis as i32]), keep_dims),
super::gpu_kernels::ReductionOp::Min => min(tensor, Some(&[axis as i32]), keep_dims),
super::gpu_kernels::ReductionOp::Prod => prod(tensor, Some(&[axis as i32]), keep_dims),
super::gpu_kernels::ReductionOp::Variance => {
variance(tensor, Some(&[axis as i32]), keep_dims, 0)
}
_ => Err(TensorError::not_implemented_simple(format!(
"Reduction operation {:?} not implemented for CPU",
op
))),
}
}
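/// CPU fallback that reduces the whole tensor to a scalar (`keepdims = false`)
/// and returns the single remaining element, or `T::default()` if the result
/// is empty. Operations without a match arm (including `Variance`) return a
/// not-implemented error.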
#[cfg(feature = "gpu")]
pub fn reduce_all_cpu<T>(tensor: &Tensor<T>, op: super::gpu_kernels::ReductionOp) -> Result<T>
where
T: scirs2_core::num_traits::Float
+ Default
+ bytemuck::Pod
+ Send
+ Sync
+ 'static
+ FromPrimitive
+ scirs2_core::num_traits::ops::mul_add::MulAdd
+ scirs2_core::ndarray::ScalarOperand,
{
let result = match op {
super::gpu_kernels::ReductionOp::Sum => sum(tensor, None, false)?,
super::gpu_kernels::ReductionOp::Mean => mean(tensor, None, false)?,
super::gpu_kernels::ReductionOp::Max => max(tensor, None, false)?,
super::gpu_kernels::ReductionOp::Min => min(tensor, None, false)?,
super::gpu_kernels::ReductionOp::Prod => prod(tensor, None, false)?,
_ => {
return Err(TensorError::not_implemented_simple(format!(
"Reduction operation {:?} not implemented for CPU",
op
)))
}
};
let data = result.data();
if data.is_empty() {
Ok(T::default())
} else {
Ok(data[0])
}
}