use super::Tensor;
use crate::error::{RusTorchError, RusTorchResult};
type ParallelResult<T> = RusTorchResult<T>;
use super::parallel_traits::{
BatchParallelOp, MatrixParallelOp, ParallelConfig, ParallelOp, ReductionParallelOp,
};
use crate::gpu::{get_device_manager, DeviceType};
use num_traits::Float;
/// Execution strategy variants that carry their own tuning parameters.
///
/// NOTE(review): nothing in the visible part of this file reads this enum; the
/// dispatch code matches on `GpuParallelStrategy` instead. Confirm it is still
/// needed by other modules.
#[derive(Debug, Clone, Copy)]
pub enum GpuExecutionStrategy {
/// Presumably: run on CPU worker threads only — TODO confirm against callers.
CpuParallel,
/// Presumably: prefer the GPU, with `fallback_threshold` as the size cut-off
/// for falling back to the CPU — TODO confirm the unit and direction.
GpuPreferred {
fallback_threshold: usize,
},
/// Presumably: split work between GPU and CPU; `gpu_threshold` and
/// `cpu_threads` are not interpreted anywhere in this file — TODO confirm.
Hybrid {
gpu_threshold: usize,
cpu_threads: usize,
},
/// Presumably: let the runtime pick a strategy — TODO confirm.
Auto,
}
/// Picks a device for a tensor of the given `shape`.
///
/// Tensors with at most 100000 elements always get [`DeviceType::Cpu`]. Larger
/// tensors get the first device that reports itself available among CUDA 0,
/// Metal 0 and OpenCL 0 (probed in that order), or the CPU when none does.
pub fn select_optimal_device(shape: &[usize]) -> DeviceType {
    let element_count = shape.iter().product::<usize>();
    if element_count <= 100000 {
        return DeviceType::Cpu;
    }

    // Probe lazily in priority order; `find` stops at the first available backend.
    [
        DeviceType::Cuda(0),
        DeviceType::Metal(0),
        DeviceType::OpenCL(0),
    ]
    .iter()
    .copied()
    .find(|candidate| candidate.is_available())
    .unwrap_or(DeviceType::Cpu)
}
/// Returns the highest-priority device that reports itself available.
///
/// Probes CUDA 0, then Metal 0, then OpenCL 0, and falls back to
/// [`DeviceType::Cpu`] when none of them is available.
pub fn current_device() -> DeviceType {
    let priority_order = [
        DeviceType::Cuda(0),
        DeviceType::Metal(0),
        DeviceType::OpenCL(0),
    ];

    priority_order
        .iter()
        .copied()
        .find(|candidate| candidate.is_available())
        .unwrap_or(DeviceType::Cpu)
}
/// Tensor operations that may be dispatched to a GPU backend.
///
/// Every method returns a `ParallelResult`, so implementations report failures
/// as errors rather than panicking. Implementors must also implement
/// `ParallelOp<T>`.
pub trait GpuParallelOp<T: Float + Send + Sync + Clone + 'static>: ParallelOp<T> {
/// Combines `self` and `other` with the binary function `op` and returns the
/// resulting tensor.
fn gpu_elementwise_op<F>(&self, other: &Tensor<T>, op: F) -> ParallelResult<Tensor<T>>
where
F: Fn(T, T) -> T + Send + Sync + Clone + 'static;
/// Multiplies `self` by `other` as matrices.
fn gpu_matmul(&self, other: &Tensor<T>) -> ParallelResult<Tensor<T>>;
/// Reduces `self` using the accumulator function `op`, starting from `init`.
///
/// `dim` presumably selects the axis being reduced — TODO confirm against
/// `ReductionParallelOp::parallel_reduce`.
fn gpu_reduce<F, R>(&self, dim: usize, init: R, op: F) -> ParallelResult<Tensor<T>>
where
F: Fn(R, T) -> R + Send + Sync + Clone,
R: Send + Sync + Clone + Into<T>;
/// Produces a tensor associated with `device`.
fn to_device(&self, device: DeviceType) -> ParallelResult<Tensor<T>>;
/// Produces a CPU-resident tensor.
fn to_cpu(&self) -> ParallelResult<Tensor<T>>;
}
/// Dispatch strategy used by the GPU-aware tensor operations in this file.
#[derive(Debug, Clone, Copy)]
pub enum GpuParallelStrategy {
/// Run the CPU parallel implementation.
CpuParallel,
/// Try the GPU implementation first; the dispatchers in this file fall back
/// to the CPU implementation when the GPU attempt returns an error.
GpuPreferred,
/// Mixed CPU/GPU execution; how it is handled differs per operation in this
/// file (some treat it as CPU-only, `gpu_matmul` tries the GPU first).
Hybrid,
/// Let `GpuParallelContext::determine_strategy` choose between
/// `CpuParallel` and `GpuPreferred`.
Auto,
}
/// Configuration for GPU-aware parallel execution.
#[derive(Debug, Clone)]
pub struct GpuParallelConfig {
/// Configuration of the underlying CPU parallel layer.
pub base_config: ParallelConfig,
/// Requested strategy; `Auto` is resolved by
/// `GpuParallelContext::determine_strategy`.
pub gpu_strategy: GpuParallelStrategy,
/// Size below which `determine_strategy` (in `Auto` mode) always selects the
/// CPU. It is compared against whatever `size` value the caller passes in.
pub transfer_threshold: usize,
/// Optional device preference.
///
/// NOTE(review): no code in the visible part of this file reads this field.
pub preferred_device: Option<DeviceType>,
}
impl Default for GpuParallelConfig {
fn default() -> Self {
Self {
base_config: ParallelConfig::default(),
gpu_strategy: GpuParallelStrategy::Auto,
transfer_threshold: 10000,
preferred_device: None,
}
}
}
/// Runtime context pairing a `GpuParallelConfig` with the device that
/// operations should target.
pub struct GpuParallelContext {
/// Configuration consulted when choosing an execution strategy.
pub config: GpuParallelConfig,
// Device captured at construction time or set later through `set_device`.
current_device: DeviceType,
}
impl GpuParallelContext {
    /// Builds a context from `config`, recording the device reported by the
    /// module-level [`current_device`] probe at construction time.
    pub fn new(config: GpuParallelConfig) -> Self {
        Self {
            current_device: current_device(),
            config,
        }
    }

    /// Builds a context from [`GpuParallelConfig::default`].
    pub fn default() -> Self {
        let default_config = GpuParallelConfig::default();
        Self::new(default_config)
    }

    /// Resolves the strategy to use for a workload of `size`.
    ///
    /// Any configured strategy other than `Auto` is returned unchanged. `Auto`
    /// resolves to `GpuPreferred` only when `size` reaches the transfer
    /// threshold and a non-CPU device is active; otherwise to `CpuParallel`.
    pub fn determine_strategy(&self, size: usize) -> GpuParallelStrategy {
        let configured = self.config.gpu_strategy;
        if !matches!(configured, GpuParallelStrategy::Auto) {
            return configured;
        }

        let large_enough = size >= self.config.transfer_threshold;
        if large_enough && self.is_gpu_available() {
            GpuParallelStrategy::GpuPreferred
        } else {
            GpuParallelStrategy::CpuParallel
        }
    }

    /// Returns `true` when the context's current device is anything but the CPU.
    pub fn is_gpu_available(&self) -> bool {
        match self.current_device {
            DeviceType::Cpu => false,
            _ => true,
        }
    }

    /// Switches the active device via `crate::gpu::set_device`.
    ///
    /// The context's own record is updated only after that call succeeds; on
    /// error it is propagated and the context keeps its previous device.
    pub fn set_device(&mut self, device: DeviceType) -> RusTorchResult<()> {
        crate::gpu::set_device(device)?;
        self.current_device = device;
        Ok(())
    }

    /// Returns the device this context currently targets.
    pub fn current_device(&self) -> DeviceType {
        self.current_device
    }
}
#[cfg(feature = "cuda")]
impl<T> GpuParallelOp<T> for Tensor<T>
where
    T: Float
        + Send
        + Sync
        + Clone
        + 'static
        + std::fmt::Debug
        + num_traits::FromPrimitive
        + ndarray::ScalarOperand
        + cudarc::driver::DeviceRepr
        + cudarc::driver::ValidAsZeroBits,
{
    /// Applies `op` element-wise to `self` and `other`.
    ///
    /// # Errors
    /// Returns a shape-mismatch error when the two shapes differ, or whatever
    /// error the CPU implementation reports.
    fn gpu_elementwise_op<F>(&self, other: &Tensor<T>, op: F) -> ParallelResult<Tensor<T>>
    where
        F: Fn(T, T) -> T + Send + Sync + Clone + 'static,
    {
        if self.shape() != other.shape() {
            return Err(RusTorchError::shape_mismatch(self.shape(), other.shape()));
        }

        let ctx = GpuParallelContext::default();
        match ctx.determine_strategy(self.data.len()) {
            // A failed GPU attempt is not fatal: retry on the CPU.
            GpuParallelStrategy::GpuPreferred => self
                .try_gpu_elementwise_op(other, op.clone())
                .or_else(|_| self.batch_elementwise_op(other, op)),
            GpuParallelStrategy::CpuParallel
            | GpuParallelStrategy::Hybrid
            | GpuParallelStrategy::Auto => self.batch_elementwise_op(other, op),
        }
    }

    /// Multiplies `self` by `other`, trying the GPU for every strategy except
    /// `CpuParallel` and falling back to `batch_matmul` when the GPU fails.
    fn gpu_matmul(&self, other: &Tensor<T>) -> ParallelResult<Tensor<T>> {
        use crate::gpu::matrix_ops::GpuLinearAlgebra;

        let ctx = GpuParallelContext::default();
        // Saturate instead of overflowing: the product is only a size heuristic.
        let workload = self.data.len().saturating_mul(other.data.len());
        match ctx.determine_strategy(workload) {
            GpuParallelStrategy::CpuParallel => self.batch_matmul(other),
            GpuParallelStrategy::GpuPreferred
            | GpuParallelStrategy::Hybrid
            | GpuParallelStrategy::Auto => {
                GpuLinearAlgebra::gpu_matmul(self, other).or_else(|_| self.batch_matmul(other))
            }
        }
    }

    /// Reduces `self` along `dim` with the caller-supplied accumulator.
    ///
    /// This always runs the CPU `parallel_reduce`. The previous GPU branch
    /// called `GpuReduction::gpu_sum` and silently ignored `init` and `op`, so
    /// any reduction other than a plain sum produced a different result
    /// depending on whether a GPU was present. Callers that specifically want
    /// a GPU sum can use `Tensor::gpu_sum`.
    fn gpu_reduce<F, R>(&self, dim: usize, init: R, op: F) -> ParallelResult<Tensor<T>>
    where
        F: Fn(R, T) -> R + Send + Sync + Clone,
        R: Send + Sync + Clone + Into<T>,
    {
        self.parallel_reduce(dim, init, op)
    }

    /// Returns a tensor for `device`.
    ///
    /// For the CPU this is a clone. For any other device the data is uploaded
    /// with `GpuMemoryManager::to_device` and immediately read back with
    /// `GpuMemoryManager::to_cpu`, so the returned tensor is built from that
    /// round trip.
    fn to_device(&self, device: DeviceType) -> ParallelResult<Tensor<T>> {
        use crate::gpu::memory_ops::manager::GpuMemoryManager;

        if let DeviceType::Cpu = device {
            return Ok(self.clone());
        }
        let gpu_buffer = GpuMemoryManager::to_device(self, &device)?;
        GpuMemoryManager::to_cpu(&gpu_buffer, self.shape())
    }

    /// Returns a clone of `self`.
    fn to_cpu(&self) -> ParallelResult<Tensor<T>> {
        Ok(self.clone())
    }
}
/// Batch-level operations that may be dispatched to a GPU backend.
///
/// Implementors must also implement `GpuParallelOp<T>`.
pub trait GpuBatchParallelOp<T: Float + Send + Sync + Clone + 'static>: GpuParallelOp<T> {
/// Normalizes the tensor; `epsilon` is forwarded to the underlying
/// normalization routine (presumably for numerical stability — TODO confirm).
fn gpu_batch_normalize(&self, epsilon: T) -> ParallelResult<Tensor<T>>;
/// 2-D convolution of `self` with `kernel`; the same `stride` and `padding`
/// value is applied to both spatial axes by the implementations in this file.
fn gpu_batch_conv2d(
&self,
kernel: &Tensor<T>,
stride: usize,
padding: usize,
) -> ParallelResult<Tensor<T>>;
/// Attention with `self` as the query tensor and the given `key` and `value`.
fn gpu_batch_attention(&self, key: &Tensor<T>, value: &Tensor<T>) -> ParallelResult<Tensor<T>>;
}
#[cfg(feature = "cuda")]
impl<T> GpuBatchParallelOp<T> for Tensor<T>
where
    T: Float
        + Send
        + Sync
        + Clone
        + 'static
        + std::fmt::Debug
        + num_traits::FromPrimitive
        + ndarray::ScalarOperand
        + cudarc::driver::DeviceRepr
        + cudarc::driver::ValidAsZeroBits,
{
    /// Normalizes the tensor, trying the GPU when the strategy is
    /// `GpuPreferred` and falling back to the CPU `batch_normalize` otherwise
    /// or when the GPU attempt fails.
    fn gpu_batch_normalize(&self, epsilon: T) -> ParallelResult<Tensor<T>> {
        let ctx = GpuParallelContext::default();
        let normalized = match ctx.determine_strategy(self.data.len()) {
            GpuParallelStrategy::GpuPreferred => self
                .try_gpu_batch_normalize(epsilon)
                .unwrap_or_else(|_| self.batch_normalize(epsilon)),
            _ => self.batch_normalize(epsilon),
        };
        Ok(normalized)
    }

    /// 2-D convolution of `self` with `kernel`.
    ///
    /// The kernel's spatial size is read from the last two dimensions of a
    /// rank-4 kernel shape, matching the non-CUDA implementation; it used to
    /// be hard-coded to 3x3. Kernels of any other rank skip the GPU and go
    /// straight to the CPU `batch_conv2d`, which reports its own error.
    fn gpu_batch_conv2d(
        &self,
        kernel: &Tensor<T>,
        stride: usize,
        padding: usize,
    ) -> ParallelResult<Tensor<T>> {
        use crate::backends::ConvolutionParams;
        use crate::gpu::conv_ops::GpuConvolution;

        let kernel_size = match &kernel.shape()[..] {
            &[_, _, kernel_h, kernel_w] => vec![kernel_h, kernel_w],
            _ => return self.batch_conv2d(kernel, stride, padding),
        };

        let ctx = GpuParallelContext::default();
        // Saturate instead of overflowing: the product is only a size heuristic.
        let workload = self.data.len().saturating_mul(kernel.data.len());
        let strategy = ctx.determine_strategy(workload);

        let params = ConvolutionParams {
            kernel_size,
            stride: vec![stride, stride],
            padding: vec![padding, padding],
            dilation: vec![1, 1],
            groups: 1,
        };

        match strategy {
            GpuParallelStrategy::GpuPreferred | GpuParallelStrategy::Auto => {
                GpuConvolution::gpu_batch_conv2d(self, kernel, &params)
                    .or_else(|_| self.batch_conv2d(kernel, stride, padding))
            }
            _ => self.batch_conv2d(kernel, stride, padding),
        }
    }

    /// Attention with `self` as the query.
    ///
    /// Tries the fused GPU path first. If that fails, computes
    /// `softmax(self x key) x value` step by step through `gpu_matmul`, which
    /// has its own CPU fallback.
    fn gpu_batch_attention(&self, key: &Tensor<T>, value: &Tensor<T>) -> ParallelResult<Tensor<T>> {
        if let Ok(result) = self.try_gpu_batch_attention(key, value) {
            return Ok(result);
        }

        let scores = self.gpu_matmul(key)?;
        let attention_weights = self.apply_softmax(&scores)?;
        attention_weights.gpu_matmul(value)
    }
}
#[cfg(not(feature = "cuda"))]
impl<T> GpuParallelOp<T> for Tensor<T>
where
T: Float
+ Send
+ Sync
+ Clone
+ 'static
+ std::fmt::Debug
+ num_traits::FromPrimitive
+ ndarray::ScalarOperand,
{
fn gpu_elementwise_op<F>(&self, other: &Tensor<T>, op: F) -> ParallelResult<Tensor<T>>
where
F: Fn(T, T) -> T + Send + Sync + Clone + 'static,
{
self.batch_elementwise_op(other, op)
}
fn gpu_matmul(&self, other: &Tensor<T>) -> ParallelResult<Tensor<T>> {
#[cfg(feature = "mac-hybrid")]
{
return self.matmul_hybrid(other).map_err(|e| e.into());
}
#[cfg(all(feature = "metal", not(feature = "mac-hybrid")))]
{
return self.matmul_metal(other, 0).map_err(|e| e.into());
}
#[cfg(all(
feature = "coreml",
not(any(feature = "metal", feature = "mac-hybrid"))
))]
{
return self.matmul_coreml(other, 0).map_err(|e| e.into());
}
#[cfg(not(any(feature = "metal", feature = "coreml", feature = "mac-hybrid")))]
{
self.matmul(other).map_err(|e| e.into())
}
}
fn gpu_reduce<F, R>(&self, dim: usize, init: R, op: F) -> ParallelResult<Tensor<T>>
where
F: Fn(R, T) -> R + Send + Sync + Clone,
R: Send + Sync + Clone + Into<T>,
{
Err(crate::error::RusTorchError::tensor_op(
"GPU reduce not available without CUDA".to_string(),
)
.into())
}
fn to_device(&self, _device: DeviceType) -> ParallelResult<Tensor<T>> {
Ok(self.clone())
}
fn to_cpu(&self) -> ParallelResult<Tensor<T>> {
Ok(self.clone())
}
}
#[cfg(not(feature = "cuda"))]
impl<T> GpuBatchParallelOp<T> for Tensor<T>
where
T: Float
+ Send
+ Sync
+ Clone
+ 'static
+ std::fmt::Debug
+ num_traits::FromPrimitive
+ ndarray::ScalarOperand,
{
fn gpu_batch_normalize(&self, epsilon: T) -> ParallelResult<Tensor<T>> {
Ok(self.batch_normalize(epsilon))
}
fn gpu_batch_conv2d(
&self,
kernel: &Tensor<T>,
stride: usize,
padding: usize,
) -> ParallelResult<Tensor<T>> {
use crate::backends::ConvolutionParams;
let params = ConvolutionParams {
kernel_size: vec![kernel.shape()[2], kernel.shape()[3]],
stride: vec![stride, stride],
padding: vec![padding, padding],
dilation: vec![1, 1],
groups: 1,
};
#[cfg(feature = "mac-hybrid")]
{
use crate::gpu::{DeviceType, GpuConvolution, OpType};
return self.gpu_conv2d(kernel, ¶ms).map_err(|e| e.into());
}
#[cfg(all(feature = "metal", not(feature = "mac-hybrid")))]
{
use crate::gpu::GpuConvolution;
return self.gpu_conv2d(kernel, ¶ms).map_err(|e| e.into());
}
#[cfg(all(
feature = "coreml",
not(any(feature = "metal", feature = "mac-hybrid"))
))]
{
use crate::gpu::GpuConvolution;
return self.gpu_conv2d(kernel, ¶ms).map_err(|e| e.into());
}
#[cfg(not(any(feature = "metal", feature = "coreml", feature = "mac-hybrid")))]
{
Err(crate::error::RusTorchError::tensor_op(
"No GPU acceleration available (enable metal, coreml, or mac-hybrid features)"
.to_string(),
)
.into())
}
}
fn gpu_batch_attention(&self, key: &Tensor<T>, value: &Tensor<T>) -> ParallelResult<Tensor<T>> {
#[cfg(feature = "mac-hybrid")]
{
let scores = self.matmul_hybrid(key)?;
let attention_weights = self.apply_softmax(&scores)?;
return attention_weights.matmul_hybrid(value);
}
#[cfg(all(feature = "metal", not(feature = "mac-hybrid")))]
{
let scores = self.matmul_metal(key, 0)?;
let attention_weights = self.apply_softmax(&scores)?;
return attention_weights.matmul_metal(value, 0);
}
#[cfg(all(
feature = "coreml",
not(any(feature = "metal", feature = "mac-hybrid"))
))]
{
let scores = self.matmul_coreml(key, 0)?;
let attention_weights = self.apply_softmax(&scores)?;
return attention_weights.matmul_coreml(value, 0);
}
#[cfg(not(any(feature = "metal", feature = "coreml", feature = "mac-hybrid")))]
{
let scores = self.matmul(key)?;
let attention_weights = self.apply_softmax(&scores)?;
attention_weights.matmul(value)
}
}
}
#[cfg(feature = "cuda")]
impl<T> Tensor<T>
where
    T: Float
        + Send
        + Sync
        + Clone
        + 'static
        + std::fmt::Debug
        + num_traits::FromPrimitive
        + ndarray::ScalarOperand
        + cudarc::driver::DeviceRepr
        + cudarc::driver::ValidAsZeroBits,
{
    /// Runs an element-wise binary operation through the GPU memory manager.
    ///
    /// Both operands are flattened, uploaded, combined with
    /// `execute_elementwise`, downloaded and reshaped to `self`'s shape.
    ///
    /// # Errors
    /// Fails when no non-CPU device is listed, when the shapes differ, or when
    /// any transfer, execution or reshape step fails. When the first listed
    /// device is the CPU but other devices exist, the CPU batch implementation
    /// is used and its error (if any) is wrapped as a GPU error.
    fn try_gpu_elementwise_op<F>(
        &self,
        other: &Tensor<T>,
        op: F,
    ) -> Result<Tensor<T>, crate::error::RusTorchError>
    where
        F: Fn(T, T) -> T + Send + Sync + Clone + 'static,
    {
        use crate::gpu::memory_ops::manager::GpuMemoryManager;
        use ndarray::ArrayD;

        let manager = get_device_manager();
        let devices = manager.available_devices();
        let device = match devices.first() {
            // CPU listed first alongside other devices: run the CPU path.
            Some(DeviceType::Cpu) if devices.len() > 1 => {
                return self.batch_elementwise_op(other, op).map_err(|e| {
                    crate::error::RusTorchError::gpu(format!("CPU fallback failed: {}", e))
                });
            }
            // Nothing listed, or only the CPU.
            None | Some(DeviceType::Cpu) => {
                return Err(crate::error::RusTorchError::gpu(
                    "No GPU devices available, falling back to CPU",
                ));
            }
            Some(device) => device,
        };

        if self.shape() != other.shape() {
            return Err(crate::error::RusTorchError::shape_mismatch(
                self.shape(),
                other.shape(),
            ));
        }

        let original_shape = self.shape().to_vec();
        let data_len = self.data.len();

        // Flatten in logical (row-major) order. Taking the raw backing vector
        // instead yields memory order, which pairs up the wrong elements when
        // an operand is not stored in standard layout.
        let self_data: Vec<T> = self.data.iter().copied().collect();
        let other_data: Vec<T> = other.data.iter().copied().collect();
        let flat_self = Tensor::from_vec(self_data, vec![data_len]);
        let flat_other = Tensor::from_vec(other_data, vec![data_len]);

        let gpu_manager = GpuMemoryManager::new();
        let gpu_self = GpuMemoryManager::to_device(&flat_self, device)?;
        let gpu_other = GpuMemoryManager::to_device(&flat_other, device)?;
        let gpu_result = gpu_manager.execute_elementwise(&gpu_self, &gpu_other, op)?;

        let flat_result = GpuMemoryManager::to_cpu(&gpu_result, &[data_len]).map_err(|e| {
            crate::error::RusTorchError::gpu(format!(
                "Failed to transfer result from device: {}",
                e
            ))
        })?;
        let result_data = flat_result.data.into_raw_vec_and_offset().0;

        let total_elements: usize = original_shape.iter().product();
        if result_data.len() != total_elements {
            return Err(crate::error::RusTorchError::gpu(format!(
                "Mismatched element count: expected {} but got {}",
                total_elements,
                result_data.len()
            )));
        }

        let array = ArrayD::from_shape_vec(original_shape, result_data).map_err(|e| {
            crate::error::RusTorchError::gpu(format!("Failed to reshape result: {}", e))
        })?;
        Ok(Tensor::from_ndarray(array))
    }

    /// Runs batch normalization on the first device reported by a fresh
    /// `DeviceManager` and copies the result back with `self`'s shape.
    ///
    /// # Errors
    /// Fails with "GPU unavailable" when no device is listed, or with the
    /// error of any transfer or execution step.
    fn try_gpu_batch_normalize(
        &self,
        epsilon: T,
    ) -> Result<Tensor<T>, crate::error::RusTorchError> {
        use crate::gpu::{memory_ops::manager::GpuMemoryManager, DeviceManager};

        let manager = DeviceManager::new();
        let devices = manager.available_devices();
        let device = match devices.first() {
            Some(device) => device,
            None => return Err(crate::error::RusTorchError::gpu("GPU unavailable")),
        };

        let gpu_manager = GpuMemoryManager::new();
        let gpu_tensor = GpuMemoryManager::to_device(self, device)?;
        let gpu_result = gpu_manager.execute_batch_normalize(&gpu_tensor, epsilon)?;
        GpuMemoryManager::to_cpu(&gpu_result, self.data.shape())
    }

    /// Runs attention (`self` as query) on the first device reported by a
    /// fresh `DeviceManager`.
    ///
    /// NOTE(review): the result is read back using the query's shape, which
    /// assumes the attention output has the same shape as `self` — confirm
    /// for value tensors whose feature size differs from the query's.
    ///
    /// # Errors
    /// Fails with "GPU unavailable" when no device is listed, or with the
    /// error of any transfer or execution step.
    fn try_gpu_batch_attention(
        &self,
        key: &Tensor<T>,
        value: &Tensor<T>,
    ) -> Result<Tensor<T>, crate::error::RusTorchError> {
        use crate::gpu::{memory_ops::manager::GpuMemoryManager, DeviceManager};

        let manager = DeviceManager::new();
        let devices = manager.available_devices();
        let device = match devices.first() {
            Some(device) => device,
            None => return Err(crate::error::RusTorchError::gpu("GPU unavailable")),
        };

        let gpu_manager = GpuMemoryManager::new();
        let gpu_query = GpuMemoryManager::to_device(self, device)?;
        let gpu_key = GpuMemoryManager::to_device(key, device)?;
        let gpu_value = GpuMemoryManager::to_device(value, device)?;
        let gpu_result = gpu_manager.execute_attention(&gpu_query, &gpu_key, &gpu_value)?;
        GpuMemoryManager::to_cpu(&gpu_result, self.data.shape())
    }

    /// Numerically stable softmax along the last axis of `tensor`.
    ///
    /// Each run of `last_dim` consecutive elements is normalized on its own,
    /// so every row of an attention score matrix sums to one. The earlier
    /// version normalized over the whole tensor at once, which scaled
    /// attention weights down by the number of rows. The row maximum is
    /// subtracted before exponentiating to avoid overflow.
    ///
    /// # Errors
    /// Fails when `tensor` is not contiguous in standard layout or when the
    /// result cannot be reshaped.
    fn apply_softmax(&self, tensor: &Tensor<T>) -> Result<Tensor<T>, crate::error::RusTorchError> {
        let data = tensor.data.as_slice().ok_or_else(|| {
            crate::error::RusTorchError::tensor_op(
                "Non-contiguous tensor not supported for softmax",
            )
        })?;

        // A rank-0 tensor is treated as a single row holding one element.
        let row_len = tensor.shape().last().copied().unwrap_or(1);
        let mut softmax_data: Vec<T> = Vec::with_capacity(data.len());
        // `chunks` panics on a zero chunk size; a zero-length last axis means
        // there is no data to normalize anyway.
        if row_len > 0 {
            for row in data.chunks(row_len) {
                let max_val = row
                    .iter()
                    .fold(T::neg_infinity(), |max, &x| if x > max { x } else { max });
                let exp_row: Vec<T> = row.iter().map(|&x| (x - max_val).exp()).collect();
                let sum = exp_row.iter().fold(T::zero(), |acc, &x| acc + x);
                softmax_data.extend(exp_row.iter().map(|&x| x / sum));
            }
        }

        let array = ndarray::ArrayD::from_shape_vec(ndarray::IxDyn(tensor.shape()), softmax_data)
            .map_err(|e| {
                crate::error::RusTorchError::tensor_op(format!("Shape error: {}", e))
            })?;
        Ok(Tensor::from_ndarray(array))
    }
}
#[cfg(not(feature = "cuda"))]
impl<T> Tensor<T>
where
    T: Float
        + Send
        + Sync
        + Clone
        + 'static
        + std::fmt::Debug
        + num_traits::FromPrimitive
        + ndarray::ScalarOperand,
{
    /// Non-CUDA stand-in: runs the CPU batch element-wise implementation.
    fn try_gpu_elementwise_op<F>(
        &self,
        other: &Tensor<T>,
        op: F,
    ) -> Result<Tensor<T>, crate::error::RusTorchError>
    where
        F: Fn(T, T) -> T + Send + Sync + Clone + 'static,
    {
        self.batch_elementwise_op(other, op)
    }

    /// Non-CUDA stand-in: runs the CPU `batch_normalize`.
    fn try_gpu_batch_normalize(
        &self,
        epsilon: T,
    ) -> Result<Tensor<T>, crate::error::RusTorchError> {
        Ok(self.batch_normalize(epsilon))
    }

    /// Non-CUDA stand-in: computes `softmax(self x key) x value` with the
    /// plain `matmul`.
    fn try_gpu_batch_attention(
        &self,
        key: &Tensor<T>,
        value: &Tensor<T>,
    ) -> Result<Tensor<T>, crate::error::RusTorchError> {
        let scores = self.matmul(key)?;
        let attention_weights = self.apply_softmax(&scores)?;
        attention_weights.matmul(value)
    }

    /// Numerically stable softmax along the last axis of `tensor`.
    ///
    /// Each run of `last_dim` consecutive elements is normalized on its own,
    /// so every row of an attention score matrix sums to one. The earlier
    /// version normalized over the whole tensor at once, which scaled
    /// attention weights down by the number of rows. The row maximum is
    /// subtracted before exponentiating to avoid overflow.
    ///
    /// # Errors
    /// Fails when `tensor` is not contiguous in standard layout or when the
    /// result cannot be reshaped.
    fn apply_softmax(&self, tensor: &Tensor<T>) -> Result<Tensor<T>, crate::error::RusTorchError> {
        let data = tensor.data.as_slice().ok_or_else(|| {
            crate::error::RusTorchError::tensor_op(
                "Non-contiguous tensor not supported for softmax",
            )
        })?;

        // A rank-0 tensor is treated as a single row holding one element.
        let row_len = tensor.shape().last().copied().unwrap_or(1);
        let mut softmax_values: Vec<T> = Vec::with_capacity(data.len());
        // `chunks` panics on a zero chunk size; a zero-length last axis means
        // there is no data to normalize anyway.
        if row_len > 0 {
            for row in data.chunks(row_len) {
                let max_val = row
                    .iter()
                    .fold(T::neg_infinity(), |max, &x| if x > max { x } else { max });
                let exp_values: Vec<T> = row.iter().map(|&x| (x - max_val).exp()).collect();
                let sum_exp = exp_values.iter().fold(T::zero(), |acc, &x| acc + x);
                softmax_values.extend(exp_values.iter().map(|&x| x / sum_exp));
            }
        }

        let array = ndarray::ArrayD::from_shape_vec(tensor.data.raw_dim(), softmax_values)
            .map_err(|e| {
                crate::error::RusTorchError::tensor_op(format!("Softmax shape error: {}", e))
            })?;
        Ok(Tensor::from_ndarray(array))
    }
}
/// Helper heuristics for choosing devices and batch sizes.
pub mod gpu_parallel_utils {
    use super::*;

    /// Picks a device for a workload of `size` elements.
    ///
    /// Workloads of more than 100000 elements get the first non-CPU device
    /// listed by the global device manager; everything else, or the case
    /// where no such device is listed, gets the CPU.
    pub fn select_optimal_device(size: usize) -> DeviceType {
        let manager = get_device_manager();
        let candidates = manager.available_devices();
        if size <= 100000 {
            return DeviceType::Cpu;
        }

        candidates
            .into_iter()
            .find(|candidate| !matches!(candidate, DeviceType::Cpu))
            .unwrap_or(DeviceType::Cpu)
    }

    /// Scores how worthwhile a GPU offload is, never going below zero.
    ///
    /// The score is `operation_complexity * tensor_size * 0.01` (compute
    /// benefit) minus `tensor_size * 0.001` (transfer cost).
    pub fn evaluate_gpu_efficiency(tensor_size: usize, operation_complexity: f32) -> f32 {
        let elements = tensor_size as f32;
        let transfer_cost = elements * 0.001;
        let compute_benefit = operation_complexity * elements * 0.01;
        let net_gain = compute_benefit - transfer_cost;
        net_gain.max(0.0)
    }

    /// Caps `total_size` at a per-device maximum batch size.
    pub fn optimize_batch_size(total_size: usize, device: DeviceType) -> usize {
        let device_cap: usize = match device {
            DeviceType::Cpu | DeviceType::OpenCL(_) => 1024,
            DeviceType::Cuda(_) => 4096,
            DeviceType::Metal(_) => 2048,
            #[cfg(any(
                feature = "coreml",
                feature = "coreml-hybrid",
                feature = "coreml-fallback"
            ))]
            DeviceType::CoreML(_) => 3072,
            #[cfg(feature = "mac-hybrid")]
            DeviceType::MacHybrid => 4096,
            DeviceType::Auto => 3072,
        };
        total_size.min(device_cap)
    }
}
impl<T> Tensor<T>
where
    T: Float + Send + Sync + Clone + 'static + ndarray::ScalarOperand + num_traits::FromPrimitive,
{
    /// Sum, delegated to `GpuReduction::gpu_sum`; `dim` is forwarded unchanged.
    pub fn gpu_sum(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>> {
        crate::gpu::reduction_ops::GpuReduction::gpu_sum(self, dim)
    }

    /// Mean, delegated to `GpuReduction::gpu_mean`; `dim` is forwarded unchanged.
    pub fn gpu_mean(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>>
    where
        T: num_traits::FromPrimitive,
    {
        crate::gpu::reduction_ops::GpuReduction::gpu_mean(self, dim)
    }

    /// Maximum, delegated to `GpuReduction::gpu_max`; `dim` is forwarded unchanged.
    pub fn gpu_max(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>> {
        crate::gpu::reduction_ops::GpuReduction::gpu_max(self, dim)
    }

    /// Minimum, delegated to `GpuReduction::gpu_min`; `dim` is forwarded unchanged.
    pub fn gpu_min(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>> {
        crate::gpu::reduction_ops::GpuReduction::gpu_min(self, dim)
    }

    /// Standard deviation, delegated to `GpuReduction::gpu_std`; `dim` is
    /// forwarded unchanged.
    pub fn gpu_std(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>> {
        crate::gpu::reduction_ops::GpuReduction::gpu_std(self, dim)
    }

    /// Variance, delegated to `GpuReduction::gpu_var`; `dim` is forwarded unchanged.
    pub fn gpu_var(&self, dim: Option<usize>) -> ParallelResult<Tensor<T>> {
        crate::gpu::reduction_ops::GpuReduction::gpu_var(self, dim)
    }

    /// Batched matrix multiplication; currently forwards to the CPU
    /// `batch_matmul`.
    pub fn gpu_batch_matmul(&self, other: &Tensor<T>) -> ParallelResult<Tensor<T>> {
        self.batch_matmul(other)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration uses the `Auto` strategy with a
    /// 10000-element transfer threshold.
    #[test]
    fn test_gpu_parallel_config() {
        let GpuParallelConfig {
            gpu_strategy,
            transfer_threshold,
            ..
        } = GpuParallelConfig::default();

        assert_eq!(transfer_threshold, 10000);
        assert!(matches!(gpu_strategy, GpuParallelStrategy::Auto));
    }

    /// Strategy resolution depends on the workload size and on whether the
    /// detected device is a GPU.
    #[test]
    fn test_gpu_parallel_context() {
        let ctx = GpuParallelContext::default();
        let actual_device = ctx.current_device();
        println!("実際のデバイス: {:?}", actual_device);
        assert!(matches!(
            actual_device,
            DeviceType::Cpu | DeviceType::Metal(_) | DeviceType::Cuda(_) | DeviceType::OpenCL(_)
        ));

        // Below the transfer threshold the CPU is always chosen.
        assert!(matches!(
            ctx.determine_strategy(5000),
            GpuParallelStrategy::CpuParallel
        ));

        // Above the threshold the outcome depends on the detected device.
        let large_strategy = ctx.determine_strategy(50000);
        if actual_device == DeviceType::Cpu {
            assert!(matches!(large_strategy, GpuParallelStrategy::CpuParallel));
        } else {
            assert!(matches!(large_strategy, GpuParallelStrategy::GpuPreferred));
        }
    }

    /// Element-wise addition of two all-ones tensors yields twos.
    #[test]
    fn test_gpu_elementwise_op() {
        let lhs = Tensor::<f32>::ones(&[2, 2]);
        let rhs = Tensor::<f32>::ones(&[2, 2]);

        let sum = lhs.gpu_elementwise_op(&rhs, |a, b| a + b);
        assert!(sum.is_ok());
        let sum = sum.unwrap();
        assert_eq!(sum.as_array()[[0, 0]], 2.0);
    }

    /// Checks output shapes of the plain `matmul` and of `gpu_batch_matmul`.
    #[test]
    fn test_gpu_matmul() {
        let lhs = Tensor::<f32>::ones(&[2, 3]);
        let rhs = Tensor::<f32>::ones(&[3, 2]);
        let product = lhs.matmul(&rhs);
        assert_eq!(product.unwrap().shape(), &[2, 2]);

        let batch_lhs = Tensor::<f32>::ones(&[1, 2, 3]);
        let batch_rhs = Tensor::<f32>::ones(&[1, 3, 2]);
        let batch_product = batch_lhs.gpu_batch_matmul(&batch_rhs);
        assert!(batch_product.is_ok());
        let batch_product = batch_product.unwrap();
        assert_eq!(batch_product.shape(), &[1, 2, 2]);
    }
}