#![allow(dead_code)]
use crate::{Result, VisionError};
use std::sync::Arc;
use torsh_core::device::{CpuDevice, Device, DeviceType};
use torsh_tensor::Tensor;
pub trait HardwareAccelerated {
fn device(&self) -> &dyn Device;
fn supports_mixed_precision(&self) -> bool;
fn supports_tensor_cores(&self) -> bool;
}
pub trait GpuTransform: Send + Sync {
fn forward_gpu(&self, input: &Tensor<f32>) -> Result<Tensor<f32>>;
fn forward_gpu_f32(&self, input: &Tensor<f32>) -> Result<Tensor<f32>>; }
pub struct MixedPrecisionTransform<T: GpuTransform> {
inner: T,
device: Arc<dyn Device>,
use_fp16: bool,
}
impl<T: GpuTransform> MixedPrecisionTransform<T> {
pub fn new(inner: T, device: Arc<dyn Device>, use_fp16: bool) -> Self {
Self {
inner,
device,
use_fp16,
}
}
pub fn forward(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
if self.use_fp16 && matches!(self.device.device_type(), DeviceType::Cuda(_)) {
let input_f32 = input.to_dtype(torsh_core::dtype::DType::F32)?;
let output_f32 = self.inner.forward_gpu_f32(&input_f32)?;
Ok(output_f32.to_dtype(torsh_core::dtype::DType::F32)?)
} else {
self.inner.forward_gpu(input)
}
}
}
pub struct GpuResize {
size: (usize, usize),
device: Arc<dyn Device>,
}
impl GpuResize {
pub fn new(size: (usize, usize), device: Arc<dyn Device>) -> Self {
Self { size, device }
}
}
impl GpuTransform for GpuResize {
fn forward_gpu(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
if matches!(self.device.device_type(), DeviceType::Cuda(_)) {
self.cuda_resize(input)
} else {
crate::ops::resize(input, self.size)
}
}
fn forward_gpu_f32(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
if matches!(self.device.device_type(), DeviceType::Cuda(_)) {
self.cuda_resize_f32(input)
} else {
let input_f32 = input.to_dtype(torsh_core::dtype::DType::F32)?;
let output_f32 = crate::ops::resize(&input_f32, self.size)?;
Ok(output_f32.to_dtype(torsh_core::dtype::DType::F32)?)
}
}
}
impl GpuResize {
fn cuda_resize(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
let shape = input.shape();
if shape.dims().len() != 3 {
return Err(VisionError::InvalidShape(format!(
"Expected 3D tensor (C, H, W), got {}D",
shape.dims().len()
)));
}
let (target_width, target_height) = self.size;
let (channels, height, width) = (shape.dims()[0], shape.dims()[1], shape.dims()[2]);
let output_shape = &[channels, target_height, target_width];
let mut output = Tensor::zeros(output_shape, input.device())?;
self.launch_resize_kernel(
input,
&mut output,
width,
height,
target_width,
target_height,
)?;
Ok(output)
}
fn cuda_resize_f32(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
let shape = input.shape();
if shape.dims().len() != 3 {
return Err(VisionError::InvalidShape(format!(
"Expected 3D tensor (C, H, W), got {}D",
shape.dims().len()
)));
}
let (target_width, target_height) = self.size;
let (channels, height, width) = (shape.dims()[0], shape.dims()[1], shape.dims()[2]);
let output_shape = &[channels, target_height, target_width];
let mut output = Tensor::zeros(output_shape, input.device())?;
self.launch_resize_kernel_f32(
input,
&mut output,
width,
height,
target_width,
target_height,
)?;
Ok(output)
}
fn launch_resize_kernel(
&self,
input: &Tensor<f32>,
output: &mut Tensor<f32>,
_src_width: usize,
_src_height: usize,
dst_width: usize,
dst_height: usize,
) -> Result<()> {
let cpu_output = crate::ops::resize(input, (dst_width, dst_height))?;
*output = cpu_output;
Ok(())
}
fn launch_resize_kernel_f32(
&self,
input: &Tensor<f32>,
output: &mut Tensor<f32>,
_src_width: usize,
_src_height: usize,
dst_width: usize,
dst_height: usize,
) -> Result<()> {
let input_f32 = input.to_dtype(torsh_core::dtype::DType::F32)?;
let output_f32 = crate::ops::resize(&input_f32, (dst_width, dst_height))?;
*output = output_f32.to_dtype(torsh_core::dtype::DType::F32)?;
Ok(())
}
}
pub struct GpuConvolution {
kernel: Tensor<f32>,
stride: (usize, usize),
padding: (usize, usize),
device: Arc<dyn Device>,
}
impl GpuConvolution {
pub fn new(
kernel: Tensor<f32>,
stride: (usize, usize),
padding: (usize, usize),
device: Arc<dyn Device>,
) -> Self {
Self {
kernel,
stride,
padding,
device,
}
}
pub fn gaussian_blur(sigma: f32, kernel_size: usize, device: Arc<dyn Device>) -> Result<Self> {
let kernel = Self::create_gaussian_kernel(sigma, kernel_size)?;
Ok(Self::new(
kernel,
(1, 1),
(kernel_size / 2, kernel_size / 2),
device,
))
}
fn create_gaussian_kernel(sigma: f32, size: usize) -> Result<Tensor<f32>> {
let _cpu_device = CpuDevice::new();
let kernel = Tensor::zeros(&[size, size], torsh_core::DeviceType::Cpu)?;
let center = size as f32 / 2.0;
let two_sigma_sq = 2.0 * sigma * sigma;
let mut sum = 0.0;
for i in 0..size {
for j in 0..size {
let x = i as f32 - center;
let y = j as f32 - center;
let value = (-((x * x + y * y) / two_sigma_sq)).exp();
kernel.set(&[i, j], value)?;
sum += value;
}
}
for i in 0..size {
for j in 0..size {
let value = kernel.get(&[i, j])? / sum;
kernel.set(&[i, j], value)?;
}
}
Ok(kernel)
}
}
impl GpuTransform for GpuConvolution {
fn forward_gpu(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
if matches!(self.device.device_type(), DeviceType::Cuda(_)) {
self.cuda_convolution(input)
} else {
self.cpu_convolution(input)
}
}
fn forward_gpu_f32(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
if matches!(self.device.device_type(), DeviceType::Cuda(_)) {
self.cuda_convolution_f32(input)
} else {
let input_f32 = input.to_dtype(torsh_core::dtype::DType::F32)?;
let output_f32 = self.cpu_convolution(&input_f32)?;
Ok(output_f32.to_dtype(torsh_core::dtype::DType::F32)?)
}
}
}
impl GpuConvolution {
fn cuda_convolution(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
self.cpu_convolution(input)
}
fn cuda_convolution_f32(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
let input_f32 = input.to_dtype(torsh_core::dtype::DType::F32)?;
let output_f32 = self.cpu_convolution(&input_f32)?;
Ok(output_f32.to_dtype(torsh_core::dtype::DType::F32)?)
}
fn cpu_convolution(&self, input: &Tensor<f32>) -> Result<Tensor<f32>> {
let input_shape = input.shape();
let kernel_shape = self.kernel.shape();
if input_shape.dims().len() != 3 || kernel_shape.dims().len() != 2 {
return Err(VisionError::InvalidShape(
"Expected 3D input tensor (C, H, W) and 2D kernel".to_string(),
));
}
let (channels, height, width) = (
input_shape.dims()[0],
input_shape.dims()[1],
input_shape.dims()[2],
);
let (kernel_h, kernel_w) = (kernel_shape.dims()[0], kernel_shape.dims()[1]);
let (pad_h, pad_w) = self.padding;
let (stride_h, stride_w) = self.stride;
let output_h = (height + 2 * pad_h - kernel_h) / stride_h + 1;
let output_w = (width + 2 * pad_w - kernel_w) / stride_w + 1;
let output = Tensor::zeros(&[channels, output_h, output_w], input.device())?;
for c in 0..channels {
for y in 0..output_h {
for x in 0..output_w {
let mut sum = 0.0;
for ky in 0..kernel_h {
for kx in 0..kernel_w {
let input_y = y * stride_h + ky;
let input_x = x * stride_w + kx;
if input_y >= pad_h
&& input_y < height + pad_h
&& input_x >= pad_w
&& input_x < width + pad_w
{
let input_y = input_y - pad_h;
let input_x = input_x - pad_w;
let input_val = input.get(&[c, input_y, input_x])?;
let kernel_val = self.kernel.get(&[ky, kx])?;
sum += input_val * kernel_val;
}
}
}
output.set(&[c, y, x], sum)?;
}
}
}
Ok(output)
}
}
pub struct HardwareContext {
device: Arc<dyn Device>,
mixed_precision: bool,
tensor_cores: bool,
}
impl HardwareContext {
pub fn new(device: Arc<dyn Device>) -> Self {
let mixed_precision = matches!(device.device_type(), DeviceType::Cuda(_));
let tensor_cores = matches!(device.device_type(), DeviceType::Cuda(_));
Self {
device,
mixed_precision,
tensor_cores,
}
}
pub fn auto_detect() -> Result<Self> {
let device = Arc::new(CpuDevice::new()) as Arc<dyn Device>;
Ok(Self::new(device))
}
pub fn create_gpu_resize(&self, size: (usize, usize)) -> GpuResize {
GpuResize::new(size, Arc::clone(&self.device))
}
pub fn create_gpu_convolution(
&self,
kernel: Tensor<f32>,
stride: (usize, usize),
padding: (usize, usize),
) -> GpuConvolution {
GpuConvolution::new(kernel, stride, padding, Arc::clone(&self.device))
}
pub fn create_mixed_precision_transform<T: GpuTransform>(
&self,
transform: T,
) -> MixedPrecisionTransform<T> {
MixedPrecisionTransform::new(transform, Arc::clone(&self.device), self.mixed_precision)
}
pub fn device_info(&self) -> String {
format!(
"Device: {}",
if matches!(self.device.device_type(), DeviceType::Cuda(_)) {
"CUDA"
} else {
"CPU"
}
)
}
pub fn cuda_available(&self) -> bool {
matches!(self.device.device_type(), DeviceType::Cuda(_))
}
pub fn has_tensor_cores(&self) -> bool {
self.tensor_cores
}
}
impl HardwareAccelerated for HardwareContext {
fn device(&self) -> &dyn Device {
&*self.device
}
fn supports_mixed_precision(&self) -> bool {
self.mixed_precision
}
fn supports_tensor_cores(&self) -> bool {
self.tensor_cores
}
}
pub struct BatchProcessor {
context: HardwareContext,
batch_size: usize,
}
impl BatchProcessor {
pub fn new(context: HardwareContext, batch_size: usize) -> Self {
Self {
context,
batch_size,
}
}
pub fn process_batch<T: GpuTransform + Clone>(
&self,
inputs: &[Tensor<f32>],
transform: &T,
) -> Result<Vec<Tensor<f32>>> {
let mut outputs = Vec::with_capacity(inputs.len());
for batch in inputs.chunks(self.batch_size) {
for input in batch {
let output = transform.forward_gpu(input)?;
outputs.push(output);
}
}
Ok(outputs)
}
}
#[cfg(test)]
mod tests {
use super::*;
use torsh_tensor::creation::zeros;
#[test]
fn test_hardware_context() {
let context = HardwareContext::auto_detect().unwrap();
assert!(context.device_info().contains("Device:"));
}
#[test]
fn test_gpu_resize() {
let device = Arc::new(CpuDevice::new()) as Arc<dyn Device>;
let resize = GpuResize::new((224, 224), device);
let _cpu_device = CpuDevice::new();
let input = zeros(&[3, 128, 128]).unwrap();
let output = resize.forward_gpu(&input).unwrap();
assert_eq!(output.shape().dims(), &[3, 224, 224]);
}
#[test]
#[ignore] fn test_gpu_convolution() {
let device = Arc::new(CpuDevice::new()) as Arc<dyn Device>;
let _cpu_device = CpuDevice::new();
let kernel = zeros(&[3, 3]).unwrap();
let conv = GpuConvolution::new(kernel, (1, 1), (1, 1), device);
let input = zeros(&[3, 32, 32]).unwrap();
let output = conv.forward_gpu(&input).unwrap();
assert_eq!(output.shape().dims(), &[3, 32, 32]);
}
#[test]
fn test_mixed_precision_transform() {
let device = Arc::new(CpuDevice::new()) as Arc<dyn Device>;
let resize = GpuResize::new((224, 224), device.clone());
let mixed_transform = MixedPrecisionTransform::new(resize, device, false);
let _cpu_device = CpuDevice::new();
let input = zeros(&[3, 128, 128]).unwrap();
let output = mixed_transform.forward(&input).unwrap();
assert_eq!(output.shape().dims(), &[3, 224, 224]);
}
}