use crate::backends::ConvolutionParams;
use crate::error::{RusTorchError, RusTorchResult};
#[cfg(any(
feature = "coreml",
feature = "coreml-hybrid",
feature = "coreml-fallback"
))]
use crate::gpu::hybrid_executor::HybridExecution;
#[cfg(any(
feature = "coreml",
feature = "coreml-hybrid",
feature = "coreml-fallback"
))]
use crate::gpu::{DeviceType, OpType};
use crate::tensor::Tensor;
use ndarray::ScalarOperand;
use num_traits::{Float, FromPrimitive};
/// Convolution entry points that may be served by an accelerator backend.
///
/// Every method takes the receiver as the input tensor, a `kernel` of the
/// same tensor type, and the shared [`ConvolutionParams`], and yields a new
/// tensor or a [`RusTorchError`] wrapped in [`RusTorchResult`].
pub trait GpuConvolution<T>
where
    T: Float + FromPrimitive + ScalarOperand + Send + Sync + 'static,
{
    /// 2D convolution of `self` with `kernel`.
    fn gpu_conv2d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>>;

    /// 2D transposed convolution of `self` with `kernel`.
    fn gpu_conv_transpose2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>>;

    /// 2D depthwise convolution of `self` with `kernel`.
    fn gpu_depthwise_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>>;

    /// 2D grouped convolution of `self` with `kernel`, split into `groups`.
    fn gpu_grouped_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
        groups: usize,
    ) -> RusTorchResult<Tensor<T>>;

    /// 3D convolution of `self` with `kernel`.
    fn gpu_conv3d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>>;
}
/// Hybrid-dispatch implementation, compiled only when one of the CoreML
/// features is enabled.
///
/// Each method hands a per-device closure to `hybrid_operation` (from the
/// `HybridExecution` trait imported at the top of this file) and maps the
/// device it receives onto a concrete routine.
///
/// The whole `impl` is already gated on the CoreML features, so the methods do
/// not repeat that `cfg` test: the former `#[cfg(not(...))]` branches inside
/// them could never be compiled in and have been removed.
#[cfg(any(
    feature = "coreml",
    feature = "coreml-hybrid",
    feature = "coreml-fallback"
))]
impl<T: Float + FromPrimitive + ScalarOperand + Send + Sync + 'static> GpuConvolution<T>
    for Tensor<T>
{
    /// 2D convolution routed per device: CoreML, CUDA and Metal use their own
    /// routines, the CPU uses `conv2d_fallback`, and OpenCL (or any other
    /// device) is reported as unsupported.
    fn gpu_conv2d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>> {
        use super::DeviceType;
        // Brings `coreml_conv2d` into scope for the CoreML arm.
        use crate::gpu::coreml::CoreMLConvolution;

        self.hybrid_operation(OpType::Convolution, |device| match device {
            DeviceType::CoreML(_) => {
                // The CoreML entry point takes stride/padding as two-element arrays.
                let stride = &[params.stride[0], params.stride[1]];
                let padding = &[params.padding[0], params.padding[1]];
                self.coreml_conv2d(kernel, stride, padding)
            }
            DeviceType::Cuda(_) => self.conv2d_cuda(kernel, params),
            DeviceType::Metal(_) => self.conv2d_metal(kernel, params),
            DeviceType::OpenCL(_) => Err(RusTorchError::UnsupportedOperation(
                "OpenCL convolution not yet implemented".to_string(),
            )),
            DeviceType::Cpu => self.conv2d_fallback(kernel, params),
            _ => Err(RusTorchError::UnsupportedDevice(
                "Unsupported device for convolution".to_string(),
            )),
        })
    }

    /// 2D transposed convolution: every device except Metal is served by
    /// `conv_transpose2d_fallback`; Metal reports the operation as unsupported.
    fn gpu_conv_transpose2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        use super::DeviceType;

        self.hybrid_operation(OpType::Convolution, |device| match device {
            DeviceType::CoreML(_)
            | DeviceType::Cuda(_)
            | DeviceType::OpenCL(_)
            | DeviceType::Cpu => self.conv_transpose2d_fallback(kernel, params),
            DeviceType::Metal(_) => Err(RusTorchError::UnsupportedOperation(
                "Metal transpose convolution not yet implemented".to_string(),
            )),
            _ => Err(RusTorchError::UnsupportedDevice(
                "Unsupported device for transpose convolution".to_string(),
            )),
        })
    }

    /// 2D depthwise convolution: every device except Metal is served by
    /// `depthwise_conv2d_fallback`; Metal reports the operation as unsupported.
    fn gpu_depthwise_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        use super::DeviceType;

        self.hybrid_operation(OpType::Convolution, |device| match device {
            DeviceType::CoreML(_)
            | DeviceType::Cuda(_)
            | DeviceType::OpenCL(_)
            | DeviceType::Cpu => self.depthwise_conv2d_fallback(kernel, params),
            DeviceType::Metal(_) => Err(RusTorchError::UnsupportedOperation(
                "Metal depthwise convolution not yet implemented".to_string(),
            )),
            _ => Err(RusTorchError::UnsupportedDevice(
                "Unsupported device for depthwise convolution".to_string(),
            )),
        })
    }

    /// 2D grouped convolution: every device except Metal is served by
    /// `grouped_conv2d_fallback`; Metal reports the operation as unsupported.
    fn gpu_grouped_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
        groups: usize,
    ) -> RusTorchResult<Tensor<T>> {
        use super::DeviceType;

        self.hybrid_operation(OpType::Convolution, |device| match device {
            DeviceType::CoreML(_)
            | DeviceType::Cuda(_)
            | DeviceType::OpenCL(_)
            | DeviceType::Cpu => self.grouped_conv2d_fallback(kernel, params, groups),
            DeviceType::Metal(_) => Err(RusTorchError::UnsupportedOperation(
                "Metal grouped convolution not yet implemented".to_string(),
            )),
            _ => Err(RusTorchError::UnsupportedDevice(
                "Unsupported device for grouped convolution".to_string(),
            )),
        })
    }

    /// 3D convolution: every device except Metal is served by
    /// `conv3d_fallback`; Metal reports the operation as unsupported.
    fn gpu_conv3d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>> {
        use super::DeviceType;

        self.hybrid_operation(OpType::Convolution, |device| match device {
            DeviceType::CoreML(_)
            | DeviceType::Cuda(_)
            | DeviceType::OpenCL(_)
            | DeviceType::Cpu => self.conv3d_fallback(kernel, params),
            DeviceType::Metal(_) => Err(RusTorchError::UnsupportedOperation(
                "Metal 3D convolution not yet implemented".to_string(),
            )),
            _ => Err(RusTorchError::UnsupportedDevice(
                "Unsupported device for 3D convolution".to_string(),
            )),
        })
    }
}
/// Implementation used when none of the CoreML features is enabled: every
/// trait method forwards straight to the matching `*_fallback` routine on
/// `Tensor<T>`.
#[cfg(not(any(
    feature = "coreml",
    feature = "coreml-hybrid",
    feature = "coreml-fallback"
)))]
impl<T> GpuConvolution<T> for Tensor<T>
where
    T: Float + FromPrimitive + ScalarOperand + Send + Sync + 'static,
{
    /// Forwards to `conv2d_fallback`.
    fn gpu_conv2d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>> {
        Self::conv2d_fallback(self, kernel, params)
    }

    /// Forwards to `conv_transpose2d_fallback`.
    fn gpu_conv_transpose2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        Self::conv_transpose2d_fallback(self, kernel, params)
    }

    /// Forwards to `depthwise_conv2d_fallback`.
    fn gpu_depthwise_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        Self::depthwise_conv2d_fallback(self, kernel, params)
    }

    /// Forwards to `grouped_conv2d_fallback`, passing `groups` through.
    fn gpu_grouped_conv2d(
        &self,
        kernel: &Self,
        params: &ConvolutionParams,
        groups: usize,
    ) -> RusTorchResult<Tensor<T>> {
        Self::grouped_conv2d_fallback(self, kernel, params, groups)
    }

    /// Forwards to `conv3d_fallback`.
    fn gpu_conv3d(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Tensor<T>> {
        Self::conv3d_fallback(self, kernel, params)
    }
}
/// Validated dimensions of a 2D convolution over `[N, C, H, W]` input and
/// `[O, C, KH, KW]` kernel, including the derived output extent.
#[derive(Debug)]
struct Conv2dGeometry {
    batch: usize,
    in_channels: usize,
    in_h: usize,
    in_w: usize,
    out_channels: usize,
    kernel_h: usize,
    kernel_w: usize,
    stride_h: usize,
    stride_w: usize,
    pad_h: usize,
    pad_w: usize,
    out_h: usize,
    out_w: usize,
}

/// Checks the shapes and parameters of a 2D convolution and derives its
/// output size as `(in + 2 * pad - kernel) / stride + 1` per spatial axis.
///
/// `operation` is only used to label the returned error.
///
/// # Errors
///
/// Returns `RusTorchError::InvalidOperation` when either shape is not 4D,
/// when `stride`/`padding` have fewer than two entries, when a stride is zero,
/// when the kernel's channel count differs from the input's, or when the
/// kernel is larger than the padded input (which would otherwise underflow
/// the unsigned size arithmetic).
fn conv2d_geometry(
    operation: &str,
    input_shape: &[usize],
    kernel_shape: &[usize],
    params: &ConvolutionParams,
) -> RusTorchResult<Conv2dGeometry> {
    let invalid = |message: String| RusTorchError::InvalidOperation {
        operation: operation.to_string(),
        message,
    };

    if input_shape.len() != 4 || kernel_shape.len() != 4 {
        return Err(invalid(
            "Input and kernel must be 4D tensors [N, C, H, W]".to_string(),
        ));
    }
    if params.stride.len() < 2 || params.padding.len() < 2 {
        return Err(invalid(
            "stride and padding must each provide [height, width]".to_string(),
        ));
    }

    let (stride_h, stride_w) = (params.stride[0], params.stride[1]);
    let (pad_h, pad_w) = (params.padding[0], params.padding[1]);
    if stride_h == 0 || stride_w == 0 {
        return Err(invalid("stride must be non-zero".to_string()));
    }

    let (batch, in_channels, in_h, in_w) = (
        input_shape[0],
        input_shape[1],
        input_shape[2],
        input_shape[3],
    );
    let (out_channels, kernel_h, kernel_w) = (kernel_shape[0], kernel_shape[2], kernel_shape[3]);
    if kernel_shape[1] != in_channels {
        return Err(invalid(format!(
            "Kernel expects {} input channels but input has {}",
            kernel_shape[1], in_channels
        )));
    }

    // Checked arithmetic: `None` means overflow or a kernel that does not fit.
    let out_extent = |input: usize, pad: usize, kernel: usize, stride: usize| -> Option<usize> {
        let padded = pad.checked_mul(2)?.checked_add(input)?;
        Some(padded.checked_sub(kernel)? / stride + 1)
    };
    let (out_h, out_w) = match (
        out_extent(in_h, pad_h, kernel_h, stride_h),
        out_extent(in_w, pad_w, kernel_w, stride_w),
    ) {
        (Some(h), Some(w)) => (h, w),
        _ => {
            return Err(invalid(format!(
                "Kernel {}x{} does not fit input {}x{} with padding {}x{}",
                kernel_h, kernel_w, in_h, in_w, pad_h, pad_w
            )))
        }
    };

    Ok(Conv2dGeometry {
        batch,
        in_channels,
        in_h,
        in_w,
        out_channels,
        kernel_h,
        kernel_w,
        stride_h,
        stride_w,
        pad_h,
        pad_w,
        out_h,
        out_w,
    })
}

impl<T: Float + FromPrimitive + ScalarOperand + Send + Sync + 'static> Tensor<T> {
    /// 2D convolution used when no accelerator routine is selected.
    ///
    /// With the `metal` feature enabled this forwards to `conv2d_metal`.
    /// Otherwise it computes a direct (non-flipped, i.e. cross-correlation)
    /// convolution on the CPU, honouring `params.stride` and `params.padding`
    /// with implicit zero padding. `params.dilation` and `params.groups` are
    /// not consulted, matching the Metal and CUDA routines in this file.
    ///
    /// # Errors
    ///
    /// Returns `RusTorchError::InvalidOperation` for malformed shapes or
    /// parameters; see `conv2d_geometry`.
    fn conv2d_fallback(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Self> {
        #[cfg(feature = "metal")]
        {
            return self.conv2d_metal(kernel, params);
        }
        #[cfg(not(feature = "metal"))]
        {
            let geo = conv2d_geometry(
                "conv2d_fallback",
                self.data.shape(),
                kernel.data.shape(),
                params,
            )?;

            // Flatten both operands in logical (row-major) order so the index
            // arithmetic below does not depend on the arrays' memory layout.
            let input: Vec<T> = self.data.iter().copied().collect();
            let weights: Vec<T> = kernel.data.iter().copied().collect();

            let in_plane = geo.in_h * geo.in_w;
            let in_image = geo.in_channels * in_plane;
            let k_plane = geo.kernel_h * geo.kernel_w;
            let k_filter = geo.in_channels * k_plane;

            let mut output =
                Vec::with_capacity(geo.batch * geo.out_channels * geo.out_h * geo.out_w);
            for n in 0..geo.batch {
                for oc in 0..geo.out_channels {
                    for oh in 0..geo.out_h {
                        for ow in 0..geo.out_w {
                            let mut acc = T::zero();
                            for ic in 0..geo.in_channels {
                                for kh in 0..geo.kernel_h {
                                    // Taps that land in the zero padding contribute nothing.
                                    let ih = match (oh * geo.stride_h + kh).checked_sub(geo.pad_h) {
                                        Some(ih) if ih < geo.in_h => ih,
                                        _ => continue,
                                    };
                                    for kw in 0..geo.kernel_w {
                                        let iw =
                                            match (ow * geo.stride_w + kw).checked_sub(geo.pad_w) {
                                                Some(iw) if iw < geo.in_w => iw,
                                                _ => continue,
                                            };
                                        let x = input
                                            [n * in_image + ic * in_plane + ih * geo.in_w + iw];
                                        let w = weights[oc * k_filter
                                            + ic * k_plane
                                            + kh * geo.kernel_w
                                            + kw];
                                        acc = acc + x * w;
                                    }
                                }
                            }
                            // Pushed in (n, oc, oh, ow) order == row-major NCHW.
                            output.push(acc);
                        }
                    }
                }
            }

            let output_shape = vec![geo.batch, geo.out_channels, geo.out_h, geo.out_w];
            Ok(Tensor::from_vec(output, output_shape))
        }
    }

    /// 2D convolution through `metal_conv2d_f32`, invoked once per batch item.
    ///
    /// Data is converted to `f32` for the kernel call and back to `T` after.
    ///
    /// # Errors
    ///
    /// Returns `RusTorchError::InvalidOperation` for malformed shapes or
    /// parameters, or when the Metal kernel reports a failure.
    #[cfg(feature = "metal")]
    fn conv2d_metal(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Self> {
        use crate::gpu::metal_kernels::metal_conv2d_f32;

        // Validate first so malformed input is rejected before any conversion.
        let geo = conv2d_geometry(
            "conv2d_metal",
            self.data.shape(),
            kernel.data.shape(),
            params,
        )?;

        // NOTE(review): these unwraps panic if a value is not representable as f32.
        let input_data = self
            .data
            .iter()
            .map(|&x| x.to_f32().unwrap())
            .collect::<Vec<f32>>();
        let kernel_data = kernel
            .data
            .iter()
            .map(|&x| x.to_f32().unwrap())
            .collect::<Vec<f32>>();

        let input_image = geo.in_channels * geo.in_h * geo.in_w;
        let output_image = geo.out_channels * geo.out_h * geo.out_w;
        let mut output_data = vec![0.0f32; geo.batch * output_image];

        for batch in 0..geo.batch {
            let input_batch = &input_data[batch * input_image..(batch + 1) * input_image];
            let output_batch = &mut output_data[batch * output_image..(batch + 1) * output_image];
            metal_conv2d_f32(
                input_batch,
                &kernel_data,
                output_batch,
                geo.in_h,
                geo.in_w,
                geo.in_channels,
                geo.out_channels,
                geo.kernel_h,
                geo.kernel_w,
                geo.stride_h,
                geo.stride_w,
                geo.pad_h,
                geo.pad_w,
            )
            .map_err(|e| RusTorchError::InvalidOperation {
                operation: "conv2d_metal".to_string(),
                message: format!("Metal convolution failed: {}", e),
            })?;
        }

        let result_data: Vec<T> = output_data
            .into_iter()
            .map(|x| T::from_f32(x).unwrap())
            .collect();
        let output_shape = vec![geo.batch, geo.out_channels, geo.out_h, geo.out_w];
        Ok(Tensor::from_vec(result_data, output_shape))
    }

    /// Stub used when the `metal` feature is disabled; always reports Metal as
    /// unavailable.
    #[cfg(not(feature = "metal"))]
    fn conv2d_metal(&self, _kernel: &Self, _params: &ConvolutionParams) -> RusTorchResult<Self> {
        Err(RusTorchError::UnsupportedDevice(
            "Metal not available".to_string(),
        ))
    }

    /// 2D convolution through `cuda_conv2d_f32`, invoked once per batch item.
    ///
    /// Data is converted to `f32` for the kernel call and back to `T` after.
    ///
    /// # Errors
    ///
    /// Returns `RusTorchError::InvalidOperation` for malformed shapes or
    /// parameters, or when the CUDA kernel reports a failure.
    #[cfg(feature = "cuda")]
    fn conv2d_cuda(&self, kernel: &Self, params: &ConvolutionParams) -> RusTorchResult<Self> {
        use crate::gpu::cuda_kernels::cuda_conv2d_f32;

        // Validate first so malformed input is rejected before any conversion.
        let geo = conv2d_geometry(
            "conv2d_cuda",
            self.data.shape(),
            kernel.data.shape(),
            params,
        )?;

        // NOTE(review): these unwraps panic if a value is not representable as f32.
        let input_data = self
            .data
            .iter()
            .map(|&x| x.to_f32().unwrap())
            .collect::<Vec<f32>>();
        let kernel_data = kernel
            .data
            .iter()
            .map(|&x| x.to_f32().unwrap())
            .collect::<Vec<f32>>();

        let input_image = geo.in_channels * geo.in_h * geo.in_w;
        let output_image = geo.out_channels * geo.out_h * geo.out_w;
        let mut output_data = vec![0.0f32; geo.batch * output_image];

        for b in 0..geo.batch {
            let batch_input = &input_data[b * input_image..(b + 1) * input_image];
            let batch_output = &mut output_data[b * output_image..(b + 1) * output_image];
            cuda_conv2d_f32(
                batch_input,
                &kernel_data,
                batch_output,
                geo.in_h,
                geo.in_w,
                geo.in_channels,
                geo.out_channels,
                geo.kernel_h,
                geo.kernel_w,
                geo.stride_h,
                geo.stride_w,
                geo.pad_h,
                geo.pad_w,
            )
            .map_err(|e| RusTorchError::InvalidOperation {
                operation: "conv2d_cuda".to_string(),
                message: format!("CUDA convolution failed: {}", e),
            })?;
        }

        let result_data: Vec<T> = output_data
            .into_iter()
            .map(|x| T::from_f32(x).unwrap())
            .collect();
        let output_shape = vec![geo.batch, geo.out_channels, geo.out_h, geo.out_w];
        Ok(Tensor::from_vec(result_data, output_shape))
    }

    /// Stub used when the `cuda` feature is disabled; always reports CUDA as
    /// unavailable.
    #[cfg(not(feature = "cuda"))]
    fn conv2d_cuda(&self, _kernel: &Self, _params: &ConvolutionParams) -> RusTorchResult<Self> {
        Err(RusTorchError::UnsupportedDevice(
            "CUDA not available".to_string(),
        ))
    }

    /// Placeholder for the transposed-convolution fallback.
    ///
    /// # Errors
    ///
    /// Always returns `RusTorchError::TensorOp`; the operation is not
    /// implemented yet and the arguments are ignored.
    pub fn conv_transpose2d_fallback(
        &self,
        _kernel: &Self,
        _params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        Err(RusTorchError::TensorOp {
            message: "Transpose convolution fallback not yet implemented".to_string(),
            source: None,
        })
    }

    /// Placeholder for the depthwise-convolution fallback.
    ///
    /// # Errors
    ///
    /// Always returns `RusTorchError::TensorOp`; the operation is not
    /// implemented yet and the arguments are ignored.
    pub fn depthwise_conv2d_fallback(
        &self,
        _kernel: &Self,
        _params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        Err(RusTorchError::TensorOp {
            message: "Depthwise convolution fallback not yet implemented".to_string(),
            source: None,
        })
    }

    /// Placeholder for the grouped-convolution fallback.
    ///
    /// # Errors
    ///
    /// Always returns `RusTorchError::TensorOp`; the operation is not
    /// implemented yet and the arguments are ignored.
    pub fn grouped_conv2d_fallback(
        &self,
        _kernel: &Self,
        _params: &ConvolutionParams,
        _groups: usize,
    ) -> RusTorchResult<Tensor<T>> {
        Err(RusTorchError::TensorOp {
            message: "Grouped convolution fallback not yet implemented".to_string(),
            source: None,
        })
    }

    /// Placeholder for the 3D-convolution fallback.
    ///
    /// # Errors
    ///
    /// Always returns `RusTorchError::TensorOp`; the operation is not
    /// implemented yet and the arguments are ignored.
    pub fn conv3d_fallback(
        &self,
        _kernel: &Self,
        _params: &ConvolutionParams,
    ) -> RusTorchResult<Tensor<T>> {
        Err(RusTorchError::TensorOp {
            message: "3D convolution fallback not yet implemented".to_string(),
            source: None,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::backends::ConvolutionParams;

    /// Shared fixture: a 1x1x2x2 input `[1, 2, 3, 4]`, a 1x1x2x2 kernel
    /// `[1, 0, 0, 1]`, and unit-stride, zero-padding parameters. The single
    /// output element is `1 * 1 + 4 * 1 = 5`.
    fn diagonal_fixture() -> (Tensor<f32>, Tensor<f32>, ConvolutionParams) {
        let input = Tensor::<f32>::from_vec(vec![1.0, 2.0, 3.0, 4.0], vec![1, 1, 2, 2]);
        let kernel = Tensor::<f32>::from_vec(vec![1.0, 0.0, 0.0, 1.0], vec![1, 1, 2, 2]);
        let params = ConvolutionParams {
            kernel_size: vec![2, 2],
            stride: vec![1, 1],
            padding: vec![0, 0],
            dilation: vec![1, 1],
            groups: 1,
        };
        (input, kernel, params)
    }

    /// The inherent fallback yields the expected shape and value.
    #[test]
    fn test_conv2d_fallback_basic() {
        let (input, kernel, params) = diagonal_fixture();
        let result = input.conv2d_fallback(&kernel, &params).unwrap();
        let result_shape = result.shape();
        assert_eq!(result_shape, &[1, 1, 1, 1]);
        assert_eq!(result.as_slice().unwrap()[0], 5.0);
    }

    /// The trait entry point yields the same shape and value.
    #[test]
    fn test_gpu_conv2d_fallback() {
        let (input, kernel, params) = diagonal_fixture();
        let result = input.gpu_conv2d(&kernel, &params).unwrap();
        let result_shape = result.shape();
        assert_eq!(result_shape, &[1, 1, 1, 1]);
        assert_eq!(result.as_slice().unwrap()[0], 5.0);
    }
}