use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;
pub mod async_execution;
pub mod async_transfer;
pub mod auto_tuning;
pub mod backends;
pub mod benchmarks;
mod cpu_ops;
pub mod heterogeneous;
pub mod kernels;
pub mod memory_management;
pub mod stream_allocator;
pub mod tensor_cores;
pub use async_transfer::{
AsyncTransferError, AsyncTransferPipeline, TransferDirection, TransferHandle,
};
pub use memory_management::unified_memory::{SyncState, UnifiedAllocator, UnifiedBuffer};
pub use stream_allocator::{StreamAllocator, StreamId};
/// Identifies a GPU compute backend, or the always-available CPU fallback.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuBackend {
    /// NVIDIA CUDA (gated on the `cuda` cargo feature).
    Cuda,
    /// AMD ROCm (gated on the `rocm` cargo feature).
    Rocm,
    /// WebGPU via wgpu (gated on the `wgpu_backend` cargo feature).
    Wgpu,
    /// Apple Metal (gated on the `metal` feature and macOS target).
    Metal,
    /// OpenCL (gated on the `opencl` cargo feature).
    OpenCL,
    /// CPU fallback; always available.
    Cpu,
}
impl Default for GpuBackend {
fn default() -> Self {
Self::preferred()
}
}
impl GpuBackend {
    /// Selects the backend to use by default.
    ///
    /// Asks `backends::initialize_optimal_backend()` for the best backend; any
    /// detection error falls back to `Cpu`.
    ///
    /// NOTE(review): when a non-CPU backend is detected, non-test builds still
    /// return `Cpu` (only `cfg(test)` builds return the detected backend).
    /// This looks intentional (hardware backends forced off in production),
    /// but the inverted-looking cfg split is worth confirming with the author.
    pub fn preferred() -> Self {
        match backends::initialize_optimal_backend() {
            Ok(backend) => {
                if backend != GpuBackend::Cpu {
                    #[cfg(not(test))]
                    {
                        // Production builds currently force the CPU fallback
                        // even when a GPU backend initialized successfully.
                        return GpuBackend::Cpu;
                    }
                    #[cfg(test)]
                    {
                        return backend;
                    }
                }
                // Detection itself chose CPU.
                backend
            }
            Err(_) => {
                // Detection failed entirely; CPU is the universal fallback.
                GpuBackend::Cpu
            }
        }
    }
    /// Returns `true` if this backend can be used in the current build and
    /// environment.
    ///
    /// For CUDA/WebGPU/OpenCL this performs a runtime probe via the backend's
    /// context type; for ROCm and Metal it is a compile-time feature/target
    /// check only; `Cpu` is always available.
    pub fn is_available(&self) -> bool {
        match self {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                {
                    use crate::gpu::backends::cuda::CudaContext;
                    CudaContext::is_available()
                }
                #[cfg(not(feature = "cuda"))]
                {
                    false
                }
            }
            // ROCm availability is a feature-flag check only (no runtime probe).
            GpuBackend::Rocm => cfg!(feature = "rocm"),
            GpuBackend::Wgpu => {
                #[cfg(feature = "wgpu_backend")]
                {
                    use crate::gpu::backends::wgpu::WebGPUContext;
                    WebGPUContext::is_available()
                }
                #[cfg(not(feature = "wgpu_backend"))]
                {
                    false
                }
            }
            GpuBackend::Metal => {
                // Metal is assumed present whenever the feature is enabled on
                // macOS; no runtime probe is performed.
                #[cfg(all(feature = "metal", target_os = "macos"))]
                {
                    true
                }
                #[cfg(not(all(feature = "metal", target_os = "macos")))]
                {
                    false
                }
            }
            GpuBackend::OpenCL => {
                #[cfg(feature = "opencl")]
                {
                    use crate::gpu::backends::opencl::OpenCLContext;
                    OpenCLContext::is_available()
                }
                #[cfg(not(feature = "opencl"))]
                {
                    false
                }
            }
            GpuBackend::Cpu => true,
        }
    }
}
impl fmt::Display for GpuBackend {
    /// Writes the backend's canonical human-readable name (e.g. "CUDA", "CPU").
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            GpuBackend::Cuda => "CUDA",
            GpuBackend::Rocm => "ROCm",
            GpuBackend::Wgpu => "WebGPU",
            GpuBackend::Metal => "Metal",
            GpuBackend::OpenCL => "OpenCL",
            GpuBackend::Cpu => "CPU",
        };
        f.write_str(label)
    }
}
use crate::error::{CoreError, ErrorContext, ErrorLocation};
/// Errors produced by the GPU abstraction layer.
///
/// `Display` messages come from the `thiserror` `#[error]` attributes;
/// conversion into the crate-wide `CoreError` is provided below.
#[derive(Debug, thiserror::Error)]
pub enum GpuError {
    /// The backend's feature is compiled in but no usable device/driver exists.
    #[error("GPU backend {0} is not available")]
    BackendNotAvailable(String),
    /// The backend's cargo feature was not compiled in.
    #[error("GPU backend {0} is not supported")]
    UnsupportedBackend(GpuBackend),
    /// The backend exists but a specific kernel has no source for it.
    #[error("GPU backend {0:?} is not supported for this kernel")]
    BackendNotSupported(GpuBackend),
    #[error("GPU backend {0} is not implemented yet")]
    BackendNotImplemented(GpuBackend),
    #[error("GPU out of memory: {0}")]
    OutOfMemory(String),
    #[error("Kernel compilation error: {0}")]
    KernelCompilationError(String),
    #[error("Kernel execution error: {0}")]
    KernelExecutionError(String),
    #[error("Invalid parameter: {0}")]
    InvalidParameter(String),
    #[error("Kernel not found: {0}")]
    KernelNotFound(String),
    #[error("Kernel specialization not supported")]
    SpecializationNotSupported,
    #[error("Unsupported data type: {0:?}")]
    UnsupportedDataType(kernels::DataType),
    /// Catch-all for errors that do not fit the variants above.
    #[error("{0}")]
    Other(String),
}
/// A lightweight handle identifying one device (`device_id`) on a backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GpuDevice {
    // Which backend the device belongs to.
    backend: GpuBackend,
    // Index of the device within that backend.
    device_id: usize,
}
impl GpuDevice {
pub fn new(backend: GpuBackend, device_id: usize) -> Self {
Self { backend, device_id }
}
pub fn backend(&self) -> GpuBackend {
self.backend
}
pub fn device_id(&self) -> usize {
self.device_id
}
pub fn compile_kernel(&self, _source: &str, entrypoint: &str) -> Result<GpuKernel, GpuError> {
Ok(GpuKernel {
backend: self.backend,
entry_point: entrypoint.to_string(),
})
}
}
/// A compiled-kernel handle tying an entry-point name to a backend.
pub struct GpuKernel {
    backend: GpuBackend,
    // Name of the kernel function to invoke.
    entry_point: String,
}
impl GpuKernel {
    /// Name of the kernel's entry-point function.
    pub fn entry_point(&self) -> &str {
        self.entry_point.as_str()
    }
    /// Backend this kernel was compiled for.
    pub fn backend(&self) -> GpuBackend {
        self.backend
    }
}
/// Maps GPU-layer errors onto the crate-wide [`CoreError`] taxonomy.
///
/// Each arm captures `file!()`/`line!()` of this conversion site (not of the
/// original failure) for diagnostic location info.
impl From<GpuError> for CoreError {
    fn from(err: GpuError) -> Self {
        match err {
            GpuError::BackendNotAvailable(backend) => CoreError::ComputationError(
                ErrorContext::new(format!("GPU backend {backend} is not available"))
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::UnsupportedBackend(backend) => CoreError::NotImplementedError(
                ErrorContext::new(format!("GPU backend {backend} is not supported"))
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::BackendNotSupported(backend) => CoreError::NotImplementedError(
                ErrorContext::new(format!(
                    "GPU backend {backend:?} is not supported for this kernel"
                ))
                .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::BackendNotImplemented(backend) => CoreError::NotImplementedError(
                ErrorContext::new(format!("GPU backend {backend} is not implemented yet"))
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            // Out-of-memory maps to the dedicated memory-error category.
            GpuError::OutOfMemory(details) => CoreError::MemoryError(
                ErrorContext::new(details.to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::KernelCompilationError(msg) => CoreError::ComputationError(
                ErrorContext::new(msg.to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::KernelExecutionError(msg) => CoreError::ComputationError(
                ErrorContext::new(msg.to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::InvalidParameter(msg) => CoreError::InvalidArgument(
                ErrorContext::new(msg.to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::KernelNotFound(name) => CoreError::ComputationError(
                ErrorContext::new(name.to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::SpecializationNotSupported => CoreError::NotImplementedError(
                ErrorContext::new("Kernel specialization not supported".to_string())
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::UnsupportedDataType(dtype) => CoreError::TypeError(
                ErrorContext::new(format!("{dtype:?}"))
                    .with_location(ErrorLocation::new(file!(), line!())),
            ),
            GpuError::Other(msg) => CoreError::ComputationError(
                ErrorContext::new(msg).with_location(ErrorLocation::new(file!(), line!())),
            ),
        }
    }
}
pub trait GpuDataType: Copy + Send + Sync + 'static {}
/// A typed device-pointer wrapper (raw address + element count).
///
/// `PhantomData<T>` records the element type without storing any `T`.
#[derive(Debug)]
pub struct GpuPtr<T: GpuDataType> {
    // Raw device address.
    ptr: u64,
    // Number of elements (not bytes).
    size: usize,
    phantom: PhantomData<T>,
}
impl<T: GpuDataType> GpuPtr<T> {
    /// Allocates device memory for `size` elements.
    ///
    /// Stub implementation: always succeeds and returns a fixed fake address.
    pub fn allocate(size: usize) -> Result<Self, GpuError> {
        let ptr = 0x1000_0000;
        Ok(Self {
            ptr,
            size,
            phantom: PhantomData,
        })
    }
    /// The raw device address.
    pub fn as_ptr(&self) -> u64 {
        self.ptr
    }
    /// Number of elements this pointer spans.
    pub fn len(&self) -> usize {
        self.size
    }
    /// `true` when the allocation holds zero elements.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
/// A typed kernel argument: either a borrowed device buffer or a scalar value.
#[derive(Debug, Clone)]
pub enum KernelArg<'a, T: GpuDataType> {
    Buffer(&'a GpuPtr<T>),
    Scalar(T),
}
/// A type-erased kernel argument for dynamically-dispatched launches.
#[derive(Debug, Clone)]
pub enum DynamicKernelArg {
    /// A raw device address.
    Buffer(u64),
    F32(f32),
    F64(f64),
    I32(i32),
    U32(u32),
    Usize(usize),
}
/// Describes a peer-to-peer link between two devices.
///
/// Currently only data-carrying (all fields unused); kept for future
/// multi-GPU transfer scheduling.
pub struct GpuChannel {
    #[allow(dead_code)]
    source_device: usize,
    #[allow(dead_code)]
    target_device: usize,
    // Link bandwidth; units are not established anywhere in this file —
    // presumably GB/s, TODO confirm when the field is first used.
    #[allow(dead_code)]
    bandwidth: f64,
}
// Blanket marker impls: every primitive numeric type may be stored in GPU
// buffers. All of these are `Copy + Send + Sync + 'static` by definition.
impl GpuDataType for f32 {}
impl GpuDataType for f64 {}
impl GpuDataType for i32 {}
impl GpuDataType for u32 {}
impl GpuDataType for u8 {}
impl GpuDataType for i8 {}
impl GpuDataType for u16 {}
impl GpuDataType for i16 {}
impl GpuDataType for u64 {}
impl GpuDataType for i64 {}
impl GpuDataType for usize {}
impl GpuDataType for isize {}
/// A typed, reference-counted device buffer backed by a backend-specific
/// implementation ([`GpuBufferImpl`]).
pub struct GpuBuffer<T: GpuDataType> {
    // Backend-specific storage; shared so clones alias the same memory.
    inner: Arc<dyn GpuBufferImpl>,
    // Element count (not bytes).
    size: usize,
    phantom: PhantomData<T>,
}
impl<T: GpuDataType> GpuBuffer<T> {
    /// Wraps a backend buffer holding `size` elements of `T`.
    pub(crate) fn new(inner: Arc<dyn GpuBufferImpl>, size: usize) -> Self {
        Self {
            inner,
            size,
            phantom: PhantomData,
        }
    }
    /// Number of elements in the buffer.
    pub fn len(&self) -> usize {
        self.size
    }
    /// `true` when the buffer holds zero elements.
    pub fn is_empty(&self) -> bool {
        self.size == 0
    }
    /// Copies `data` from host memory into the buffer.
    ///
    /// # Errors
    /// Returns `InvalidParameter` when `data` has more elements than the
    /// buffer was created with.
    pub fn copy_from_host(&self, data: &[T]) -> Result<(), GpuError> {
        if data.len() > self.size {
            return Err(GpuError::InvalidParameter(
                "Data size exceeds buffer size".to_string(),
            ));
        }
        // SAFETY: the length check above guarantees the byte count
        // (size_of_val) does not exceed the buffer's allocated capacity.
        unsafe {
            self.inner
                .copy_from_host(data.as_ptr() as *const u8, std::mem::size_of_val(data));
        }
        Ok(())
    }
    /// Copies buffer contents into the host slice `data`.
    ///
    /// # Errors
    /// Returns `InvalidParameter` when `data` has more elements than the
    /// buffer holds (reading a prefix into a shorter slice is allowed).
    pub fn copy_to_host(&self, data: &mut [T]) -> Result<(), GpuError> {
        if data.len() > self.size {
            return Err(GpuError::InvalidParameter(
                "Data size exceeds buffer size".to_string(),
            ));
        }
        // SAFETY: `data` is valid for writes of size_of_val bytes, which the
        // check above bounds by the buffer size.
        unsafe {
            self.inner
                .copy_to_host(data.as_mut_ptr() as *mut u8, std::mem::size_of_val(data));
        }
        Ok(())
    }
    /// Reads the whole buffer into a freshly-allocated `Vec<T>`.
    ///
    /// NOTE(review): initializes the Vec with `mem::zeroed()`. That is fine
    /// for the primitive numeric impls in this file, but `GpuDataType` is
    /// public — a downstream impl for a type with no valid all-zero bit
    /// pattern would make this UB. Consider a `Default`/`Zeroable` bound.
    /// The `copy_to_host` error is ignored; it cannot fire here because the
    /// destination length equals `self.size`.
    pub fn to_vec(&self) -> Vec<T> {
        let mut result = vec![unsafe { std::mem::zeroed() }; self.size];
        let _ = self.copy_to_host(&mut result);
        result
    }
}
impl<T: GpuDataType> fmt::Debug for GpuBuffer<T> {
    /// Shows only the element count; the device contents are not fetched.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut dbg = f.debug_struct("GpuBuffer");
        dbg.field("size", &self.size);
        dbg.finish()
    }
}
impl<T: GpuDataType> Clone for GpuBuffer<T> {
fn clone(&self) -> Self {
Self {
inner: Arc::clone(&self.inner),
size: self.size,
phantom: PhantomData,
}
}
}
/// A cloneable handle to a compiled kernel; parameters are set by name and
/// the kernel is launched via `dispatch`.
#[derive(Clone)]
pub struct GpuKernelHandle {
    inner: Arc<dyn GpuKernelImpl>,
}
impl GpuKernelHandle {
pub(crate) fn new(inner: Arc<dyn GpuKernelImpl>) -> Self {
Self { inner }
}
pub fn set_buffer<T: GpuDataType>(&self, name: &str, buffer: &GpuBuffer<T>) {
self.inner.set_buffer(name, &buffer.inner);
}
pub fn set_u32(&self, name: &str, value: u32) {
self.inner.set_u32(name, value);
}
pub fn set_i32(&self, name: &str, value: i32) {
self.inner.set_i32(name, value);
}
pub fn set_f32(&self, name: &str, value: f32) {
self.inner.set_f32(name, value);
}
pub fn set_f64(&self, name: &str, value: f64) {
self.inner.set_f64(name, value);
}
pub fn dispatch(&self, workgroups: [u32; 3]) {
if !self.inner.try_batch_dispatch(workgroups) {
self.inner.dispatch(workgroups);
}
}
pub fn dispatch_no_wait(&self, workgroups: [u32; 3]) {
self.inner.dispatch_no_wait(workgroups);
}
}
/// Facade over a backend-specific kernel compiler ([`GpuCompilerImpl`]).
pub struct GpuCompiler {
    inner: Arc<dyn GpuCompilerImpl>,
}
impl GpuCompiler {
pub(crate) fn new(inner: Arc<dyn GpuCompilerImpl>) -> Self {
Self { inner }
}
pub fn compile(&self, source: &str) -> Result<GpuKernelHandle, GpuError> {
let kernel = self.inner.compile(source)?;
Ok(GpuKernelHandle::new(kernel))
}
pub fn compile_kernel<I: GpuDataType, O: GpuDataType>(&self, name: &str) -> GpuKernelHandle {
let kernel = self.inner.compile_typed(
name,
std::any::TypeId::of::<I>(),
std::any::TypeId::of::<O>(),
);
GpuKernelHandle::new(kernel)
}
}
/// Top-level handle to a GPU (or CPU-fallback) execution context.
///
/// Owns the backend implementation plus the kernel registry used by
/// `get_kernel`/`get_specialized_kernel`.
pub struct GpuContext {
    inner: Arc<dyn GpuContextImpl>,
    backend: GpuBackend,
    kernel_registry: kernels::KernelRegistry,
}
impl GpuContext {
    /// Creates a context for `backend`.
    ///
    /// # Errors
    /// * `BackendNotAvailable` — feature compiled in but availability probe or
    ///   runtime device detection found nothing usable.
    /// * `UnsupportedBackend` — the cargo feature is not compiled in.
    /// * `BackendNotImplemented` — ROCm outside of test builds.
    pub fn new(backend: GpuBackend) -> Result<Self, GpuError> {
        if !backend.is_available() {
            return Err(GpuError::BackendNotAvailable(backend.to_string()));
        }
        // For hardware backends, require at least one matching device to be
        // detected at runtime before constructing a context.
        if backend != GpuBackend::Cpu {
            let detection_result = backends::detect_gpu_backends();
            let backend_available = detection_result
                .devices
                .iter()
                .any(|d| d.backend == backend && d.backend != GpuBackend::Cpu);
            if !backend_available {
                return Err(GpuError::BackendNotAvailable(format!(
                    "{backend} (no devices detected at runtime)"
                )));
            }
        }
        let inner = match backend {
            GpuBackend::Cuda => {
                #[cfg(feature = "cuda")]
                {
                    use crate::gpu::backends::cuda::CudaContext;
                    match CudaContext::new() {
                        Ok(ctx) => Arc::new(ctx) as Arc<dyn GpuContextImpl>,
                        Err(e) => return Err(e),
                    }
                }
                #[cfg(not(feature = "cuda"))]
                {
                    return Err(GpuError::UnsupportedBackend(backend));
                }
            }
            GpuBackend::Rocm => {
                #[cfg(feature = "rocm")]
                {
                    // ROCm has no real context yet: test builds substitute the
                    // CPU context so tests can exercise the code path.
                    #[cfg(test)]
                    {
                        Arc::new(CpuContext::new()) as Arc<dyn GpuContextImpl>
                    }
                    #[cfg(not(test))]
                    {
                        return Err(GpuError::BackendNotImplemented(backend));
                    }
                }
                #[cfg(not(feature = "rocm"))]
                {
                    return Err(GpuError::UnsupportedBackend(backend));
                }
            }
            GpuBackend::Wgpu => {
                #[cfg(feature = "wgpu_backend")]
                {
                    use crate::gpu::backends::wgpu::WebGPUContext;
                    match WebGPUContext::new() {
                        Ok(ctx) => Arc::new(ctx) as Arc<dyn GpuContextImpl>,
                        Err(e) => return Err(e),
                    }
                }
                #[cfg(not(feature = "wgpu_backend"))]
                {
                    return Err(GpuError::UnsupportedBackend(backend));
                }
            }
            GpuBackend::Metal => {
                #[cfg(all(feature = "metal", target_os = "macos"))]
                {
                    use crate::gpu::backends::metal::MetalContext;
                    match MetalContext::new() {
                        Ok(ctx) => Arc::new(ctx) as Arc<dyn GpuContextImpl>,
                        Err(e) => return Err(e),
                    }
                }
                #[cfg(not(all(feature = "metal", target_os = "macos")))]
                {
                    return Err(GpuError::UnsupportedBackend(backend));
                }
            }
            GpuBackend::OpenCL => {
                #[cfg(feature = "opencl")]
                {
                    use crate::gpu::backends::opencl::OpenCLContext;
                    match OpenCLContext::new() {
                        Ok(ctx) => Arc::new(ctx) as Arc<dyn GpuContextImpl>,
                        Err(e) => return Err(e),
                    }
                }
                #[cfg(not(feature = "opencl"))]
                {
                    return Err(GpuError::UnsupportedBackend(backend));
                }
            }
            GpuBackend::Cpu => Arc::new(CpuContext::new()) as Arc<dyn GpuContextImpl>,
        };
        Ok(Self {
            inner,
            backend,
            kernel_registry: kernels::KernelRegistry::with_default_kernels(),
        })
    }
    /// The backend this context runs on.
    pub fn backend(&self) -> GpuBackend {
        self.backend
    }
    /// Human-readable backend name (same strings as `Display`).
    pub fn backend_name(&self) -> &str {
        match self.backend {
            GpuBackend::Cuda => "CUDA",
            GpuBackend::Rocm => "ROCm",
            GpuBackend::Wgpu => "WebGPU",
            GpuBackend::Metal => "Metal",
            GpuBackend::OpenCL => "OpenCL",
            GpuBackend::Cpu => "CPU",
        }
    }
    /// Blocks until all queued device work has completed.
    pub fn gpu_sync(&self) -> Result<(), GpuError> {
        self.inner.gpu_sync()
    }
    /// Begins a batched-dispatch region (backend-dependent; no-op on CPU).
    pub fn begin_batch(&self) -> Result<(), GpuError> {
        self.inner.begin_batch()
    }
    /// Ends a batched-dispatch region and flushes pending work.
    pub fn end_batch(&self) -> Result<(), GpuError> {
        self.inner.end_batch()
    }
    /// Allocates an uninitialized device buffer for `size` elements of `T`.
    pub fn create_buffer<T: GpuDataType>(&self, size: usize) -> GpuBuffer<T> {
        // saturating_mul avoids overflow panic on absurd sizes; allocation
        // failure is then the backend's problem.
        let byte_size = size.saturating_mul(std::mem::size_of::<T>());
        let inner = self.inner.create_buffer(byte_size);
        GpuBuffer::new(inner, size)
    }
    /// Allocates a buffer and uploads `data` into it.
    ///
    /// The upload result is ignored; it cannot fail because the buffer is
    /// sized exactly to `data.len()`.
    pub fn create_buffer_from_slice<T: GpuDataType>(&self, data: &[T]) -> GpuBuffer<T> {
        let buffer = self.create_buffer::<T>(data.len());
        let _ = buffer.copy_from_host(data);
        buffer
    }
    /// Runs `f` with a compiler for this context's backend.
    pub fn execute<F, R>(&self, f: F) -> R
    where
        F: FnOnce(&GpuCompiler) -> R,
    {
        let compiler = GpuCompiler::new(self.inner.create_compiler());
        f(&compiler)
    }
    /// Looks up a registered kernel by name and compiles it for this backend.
    ///
    /// # Errors
    /// `KernelNotFound` when `name` is not in the registry; otherwise any
    /// source-selection or compilation error.
    pub fn get_kernel(&self, name: &str) -> Result<GpuKernelHandle, GpuError> {
        let kernel = self
            .kernel_registry
            .get(name)
            .ok_or_else(|| GpuError::KernelNotFound(name.to_string()))?;
        let kernel_source = kernel.source_for_backend(self.backend)?;
        let metadata = kernel.metadata();
        let handle = self.compile_kernel_with_metadata(&kernel_source, &metadata)?;
        Ok(handle)
    }
    /// Like [`Self::get_kernel`] but requests a parameter-specialized variant
    /// from the registry first.
    pub fn get_specialized_kernel(
        &self,
        name: &str,
        params: &kernels::KernelParams,
    ) -> Result<GpuKernelHandle, GpuError> {
        let specialized = self.kernel_registry.get_specialized(name, params)?;
        let kernel_source = specialized.source_for_backend(self.backend)?;
        let metadata = specialized.metadata();
        let handle = self.compile_kernel_with_metadata(&kernel_source, &metadata)?;
        Ok(handle)
    }
    // Compiles kernel source; metadata is currently unused.
    fn compile_kernel_with_metadata(
        &self,
        source: &str,
        _metadata: &kernels::KernelMetadata,
    ) -> Result<GpuKernelHandle, GpuError> {
        self.execute(|compiler| compiler.compile(source))
    }
    /// Available device memory in bytes.
    ///
    /// Placeholder: always reports 1 GiB regardless of backend.
    pub fn get_available_memory(&self) -> Option<usize> {
        Some(1024 * 1024 * 1024)
    }
    /// Total device memory in bytes.
    ///
    /// Placeholder: 512 MiB on wasm32, 4 GiB everywhere else.
    pub fn get_total_memory(&self) -> Option<usize> {
        #[cfg(target_arch = "wasm32")]
        return Some(512 * 1024 * 1024);
        #[cfg(not(target_arch = "wasm32"))]
        Some((4u64 * 1024 * 1024 * 1024) as usize)
    }
    /// Launches a named kernel. Stub: arguments are ignored and `Ok(())` is
    /// returned unconditionally.
    pub fn launch_kernel(
        &self,
        kernel_name: &str,
        grid_size: (usize, usize, usize),
        block_size: (usize, usize, usize),
        args: &[DynamicKernelArg],
    ) -> Result<(), GpuError> {
        let _ = (kernel_name, grid_size, block_size, args);
        Ok(())
    }
    /// Asynchronous host→device transfer. Stub: no-op.
    pub fn transfer_async_host_to_device<T: GpuDataType>(
        &self,
        ptr: &GpuPtr<T>,
        data: &[T],
    ) -> Result<(), GpuError> {
        let _ = (ptr, data);
        Ok(())
    }
    /// Synchronous host→device transfer. Stub: no-op.
    pub fn transfer_host_to_device<T: GpuDataType>(
        &self,
        ptr: &GpuPtr<T>,
        data: &[T],
    ) -> Result<(), GpuError> {
        let _ = (ptr, data);
        Ok(())
    }
    /// Asynchronous device→host transfer. Stub: no-op.
    pub fn transfer_async_device_to_host<T: GpuDataType>(
        &self,
        ptr: &GpuPtr<T>,
        data: &mut [T],
    ) -> Result<(), GpuError> {
        let _ = (ptr, data);
        Ok(())
    }
    /// Synchronous device→host transfer. Stub: no-op.
    pub fn transfer_device_to_host<T: GpuDataType>(
        &self,
        ptr: &GpuPtr<T>,
        data: &mut [T],
    ) -> Result<(), GpuError> {
        let _ = (ptr, data);
        Ok(())
    }
    /// Executes raw kernel source over f32 buffers.
    ///
    /// Stub: only logs the request to stderr and succeeds.
    pub fn execute_kernel(
        &self,
        source: &str,
        buffers: &[GpuBuffer<f32>],
        work_groups: (u32, u32, u32),
        int_params: &[u32],
        float_params: &[f32],
    ) -> Result<(), GpuError> {
        eprintln!(
            "GPU kernel execution (source length: {}, buffers: {}, workgroups: {:?})",
            source.len(),
            buffers.len(),
            work_groups
        );
        eprintln!("Int params: {int_params:?}");
        eprintln!("Float params: {float_params:?}");
        Ok(())
    }
    /// Reads the whole buffer back to the host.
    pub fn read_buffer<T: GpuDataType>(&self, buffer: &GpuBuffer<T>) -> Result<Vec<T>, GpuError> {
        Ok(buffer.to_vec())
    }
    // The reduction / GEMM / activation helpers below all delegate to
    // `*_cpu_fallback` methods defined elsewhere in this crate (see the
    // private `cpu_ops` module); GPU-accelerated paths are not wired up yet.

    /// Sum of all elements (CPU fallback).
    pub fn sum_all<T: GpuDataType>(&self, buffer: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.sum_all_cpu_fallback(buffer)
    }
    /// Mean of all elements (CPU fallback).
    pub fn mean_all<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.mean_all_cpu_fallback(buffer)
    }
    /// Maximum element (CPU fallback).
    pub fn max_all<T: GpuDataType>(&self, buffer: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.max_all_cpu_fallback(buffer)
    }
    /// Minimum element (CPU fallback).
    pub fn min_all<T: GpuDataType>(&self, buffer: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.min_all_cpu_fallback(buffer)
    }
    /// Sum along `axis` of a tensor with the given `shape` (CPU fallback).
    pub fn sum_axis<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        shape: &[usize],
        axis: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.sum_axis_cpu_fallback(buffer, shape, axis)
    }
    /// Mean along `axis` (CPU fallback).
    pub fn mean_axis<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        shape: &[usize],
        axis: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.mean_axis_cpu_fallback(buffer, shape, axis)
    }
    /// Maximum along `axis` (CPU fallback).
    pub fn max_axis<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        shape: &[usize],
        axis: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.max_axis_cpu_fallback(buffer, shape, axis)
    }
    /// Minimum along `axis` (CPU fallback).
    pub fn min_axis<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        shape: &[usize],
        axis: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.min_axis_cpu_fallback(buffer, shape, axis)
    }
    /// Broadcasts a tensor from `from_shape` to `to_shape` (CPU fallback).
    pub fn broadcast<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        from_shape: &[usize],
        to_shape: &[usize],
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.broadcast_cpu_fallback(buffer, from_shape, to_shape)
    }
    /// Multiplies every element by `scalar` (CPU fallback).
    pub fn scale<T: GpuDataType>(
        &self,
        buffer: &GpuBuffer<T>,
        scalar: T,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.scale_cpu_fallback(buffer, scalar)
    }
    /// General matrix multiply: (m×k) · (k×n) (CPU fallback).
    pub fn gemm<T: GpuDataType>(
        &self,
        a: &GpuBuffer<T>,
        b: &GpuBuffer<T>,
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.gemm_cpu_fallback(a, b, m, k, n)
    }
    /// GEMM with `b` transposed (CPU fallback).
    pub fn gemm_transpose_b<T: GpuDataType>(
        &self,
        a: &GpuBuffer<T>,
        b: &GpuBuffer<T>,
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.gemm_transpose_b_cpu_fallback(a, b, m, k, n)
    }
    /// GEMM with `a` transposed (CPU fallback).
    pub fn gemm_transpose_a<T: GpuDataType>(
        &self,
        a: &GpuBuffer<T>,
        b: &GpuBuffer<T>,
        m: usize,
        k: usize,
        n: usize,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.gemm_transpose_a_cpu_fallback(a, b, m, k, n)
    }
    /// ReLU activation (CPU fallback).
    pub fn relu<T: GpuDataType>(&self, input: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.relu_cpu_fallback(input)
    }
    /// ReLU backward pass (CPU fallback).
    pub fn relu_backward<T: GpuDataType>(
        &self,
        grad_output: &GpuBuffer<T>,
        input: &GpuBuffer<T>,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.relu_backward_cpu_fallback(grad_output, input)
    }
    /// Sigmoid activation (CPU fallback).
    pub fn sigmoid<T: GpuDataType>(&self, input: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.sigmoid_cpu_fallback(input)
    }
    /// Sigmoid backward pass (CPU fallback).
    pub fn sigmoid_backward<T: GpuDataType>(
        &self,
        grad_output: &GpuBuffer<T>,
        input: &GpuBuffer<T>,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.sigmoid_backward_cpu_fallback(grad_output, input)
    }
    /// Tanh activation (CPU fallback).
    pub fn tanh<T: GpuDataType>(&self, input: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.tanh_cpu_fallback(input)
    }
    /// Tanh backward pass (CPU fallback).
    pub fn tanh_backward<T: GpuDataType>(
        &self,
        grad_output: &GpuBuffer<T>,
        input: &GpuBuffer<T>,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.tanh_backward_cpu_fallback(grad_output, input)
    }
    /// GELU activation (CPU fallback).
    pub fn gelu<T: GpuDataType>(&self, input: &GpuBuffer<T>) -> Result<GpuBuffer<T>, GpuError> {
        self.gelu_cpu_fallback(input)
    }
    /// GELU backward pass (CPU fallback).
    pub fn gelu_backward<T: GpuDataType>(
        &self,
        grad_output: &GpuBuffer<T>,
        input: &GpuBuffer<T>,
    ) -> Result<GpuBuffer<T>, GpuError> {
        self.gelu_backward_cpu_fallback(grad_output, input)
    }
}
impl fmt::Debug for GpuContext {
    /// Shows only the backend; the inner context and kernel registry are
    /// intentionally omitted.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut dbg = f.debug_struct("GpuContext");
        dbg.field("backend", &self.backend);
        dbg.finish()
    }
}
/// Backend-specific device buffer.
///
/// # Safety
/// Callers of the copy methods must guarantee `data` is valid for `size`
/// bytes; implementations are trusted not to read/write out of bounds.
pub(crate) trait GpuBufferImpl: Send + Sync {
    /// Copies `size` bytes from host pointer `data` into the buffer.
    unsafe fn copy_from_host(&self, data: *const u8, size: usize);
    /// Copies `size` bytes from the buffer out to host pointer `data`.
    unsafe fn copy_to_host(&self, data: *mut u8, size: usize);
    #[allow(dead_code)]
    fn as_any(&self) -> &dyn std::any::Any;
    /// Buffer size in bytes; default reports 0 for backends that don't track it.
    #[allow(dead_code)]
    fn size(&self) -> usize {
        0
    }
    /// Raw device address; default reports 0 for backends without one.
    #[allow(dead_code)]
    fn device_ptr(&self) -> u64 {
        0
    }
}
/// Backend-specific compiled kernel: parameter binding plus dispatch.
pub(crate) trait GpuKernelImpl: Send + Sync {
    fn set_buffer(&self, name: &str, buffer: &Arc<dyn GpuBufferImpl>);
    fn set_u32(&self, name: &str, value: u32);
    fn set_i32(&self, name: &str, value: i32);
    fn set_f32(&self, name: &str, value: f32);
    fn set_f64(&self, name: &str, value: f64);
    /// Launches the kernel and (by convention) waits for completion.
    fn dispatch(&self, workgroups: [u32; 3]);
    /// Fire-and-forget launch; defaults to a blocking dispatch.
    fn dispatch_no_wait(&self, workgroups: [u32; 3]) {
        self.dispatch(workgroups);
    }
    /// Offers the dispatch for batching; returning `false` (the default)
    /// makes the caller dispatch immediately instead.
    fn try_batch_dispatch(&self, _workgroups: [u32; 3]) -> bool {
        false
    }
}
/// Backend-specific kernel compiler.
pub(crate) trait GpuCompilerImpl: Send + Sync {
    /// Compiles raw kernel source into a launchable kernel.
    fn compile(&self, source: &str) -> Result<Arc<dyn GpuKernelImpl>, GpuError>;
    /// Produces a named kernel specialized for the given input/output TypeIds.
    /// Infallible by signature; backends resolve failures internally.
    fn compile_typed(
        &self,
        name: &str,
        input_type: std::any::TypeId,
        output_type: std::any::TypeId,
    ) -> Arc<dyn GpuKernelImpl>;
}
/// Backend-specific execution context: buffer allocation, compiler creation,
/// and optional sync/batching hooks (no-ops by default).
pub(crate) trait GpuContextImpl: Send + Sync {
    /// Allocates a device buffer of `size` bytes.
    fn create_buffer(&self, size: usize) -> Arc<dyn GpuBufferImpl>;
    /// Creates a compiler for this context's backend.
    fn create_compiler(&self) -> Arc<dyn GpuCompilerImpl>;
    /// Waits for all queued work; default is a no-op.
    fn gpu_sync(&self) -> Result<(), GpuError> {
        Ok(())
    }
    /// Starts a batched-dispatch region; default is a no-op.
    fn begin_batch(&self) -> Result<(), GpuError> {
        Ok(())
    }
    /// Ends a batched-dispatch region; default is a no-op.
    fn end_batch(&self) -> Result<(), GpuError> {
        Ok(())
    }
    /// Downcast hook for backends that need concrete-type access.
    fn as_any(&self) -> &dyn std::any::Any
    where
        Self: 'static + Sized,
    {
        self
    }
}
/// Zero-sized CPU fallback context: host memory stands in for device memory.
struct CpuContext;

impl CpuContext {
    /// Creates the (stateless) CPU context.
    fn new() -> CpuContext {
        CpuContext
    }
}
impl GpuContextImpl for CpuContext {
    // "Device" buffers are plain host Vec<u8>s.
    fn create_buffer(&self, size: usize) -> Arc<dyn GpuBufferImpl> {
        Arc::new(CpuBuffer::new(size))
    }
    // The CPU compiler only hands out no-op kernels.
    fn create_compiler(&self) -> Arc<dyn GpuCompilerImpl> {
        Arc::new(CpuCompiler)
    }
}
/// Host-memory stand-in for a device buffer (CPU backend).
struct CpuBuffer {
    // Raw byte storage; zero-initialized at creation.
    data: Vec<u8>,
}
impl CpuBuffer {
    /// Allocates `size` bytes, all zeroed.
    fn new(size: usize) -> Self {
        Self {
            data: vec![0; size],
        }
    }
}
impl GpuBufferImpl for CpuBuffer {
    // NOTE(review): this casts `&self` to `*mut Self` to write through a
    // shared reference. Under Rust's aliasing rules that is undefined
    // behavior — sound interior mutability needs `UnsafeCell`/`Mutex` around
    // `data`. Changing the field type also affects the direct `buffer.data`
    // access in `test_cpu_buffer_implementation`, so the fix must touch both.
    // Also note neither copy clamps `size` to `self.data.len()`; callers
    // (GpuBuffer::copy_*_host) do the bounds check today — TODO harden here.
    unsafe fn copy_from_host(&self, data: *const u8, size: usize) {
        let mut_self = self as *const Self as *mut Self;
        let data_ptr = (*mut_self).data.as_mut_ptr();
        std::ptr::copy_nonoverlapping(data, data_ptr, size);
    }
    // Reads `size` bytes from the buffer into host pointer `data`.
    unsafe fn copy_to_host(&self, data: *mut u8, size: usize) {
        let data_ptr = self.data.as_ptr();
        std::ptr::copy_nonoverlapping(data_ptr, data, size);
    }
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
    // Size in bytes.
    fn size(&self) -> usize {
        self.data.len()
    }
    // The host allocation's address doubles as the "device" pointer.
    fn device_ptr(&self) -> u64 {
        self.data.as_ptr() as u64
    }
}
/// No-op compiler for the CPU backend: every request yields [`CpuKernel`].
struct CpuCompiler;

impl GpuCompilerImpl for CpuCompiler {
    /// "Compiles" kernel source. The source is ignored (prefixed with `_` to
    /// silence the unused-variable warning, matching `compile_typed` below);
    /// the CPU backend has no real compiler and always succeeds with a no-op
    /// kernel.
    fn compile(&self, _source: &str) -> Result<Arc<dyn GpuKernelImpl>, GpuError> {
        Ok(Arc::new(CpuKernel))
    }
    /// Typed lookup variant: likewise ignores its arguments and returns the
    /// no-op kernel.
    fn compile_typed(
        &self,
        _name: &str,
        _input_type: std::any::TypeId,
        _output_type: std::any::TypeId,
    ) -> Arc<dyn GpuKernelImpl> {
        Arc::new(CpuKernel)
    }
}
/// No-op kernel for the CPU backend: parameter setters and dispatch do
/// nothing. Real CPU computation happens through the `*_cpu_fallback` paths,
/// not through kernel dispatch.
struct CpuKernel;

impl GpuKernelImpl for CpuKernel {
    // All parameters are deliberately ignored; they are `_`-prefixed so the
    // no-op bodies compile without unused-variable warnings (the original
    // names triggered them).
    fn set_buffer(&self, _name: &str, _buffer: &Arc<dyn GpuBufferImpl>) {}
    fn set_u32(&self, _name: &str, _value: u32) {}
    fn set_i32(&self, _name: &str, _value: i32) {}
    fn set_f32(&self, _name: &str, _value: f32) {}
    fn set_f64(&self, _name: &str, _value: f64) {}
    fn dispatch(&self, _workgroups: [u32; 3]) {}
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- Backend selection and identity ---

    #[test]
    fn test_gpu_backend_preferred() {
        let backend = GpuBackend::preferred();
        // Exhaustive match just proves a valid variant is returned.
        match backend {
            GpuBackend::Cuda
            | GpuBackend::Rocm
            | GpuBackend::Wgpu
            | GpuBackend::Metal
            | GpuBackend::OpenCL
            | GpuBackend::Cpu => {}
        }
    }
    #[test]
    fn test_gpu_backend_default() {
        // Default must agree with preferred().
        let backend = GpuBackend::default();
        assert_eq!(backend, GpuBackend::preferred());
    }
    #[test]
    fn test_gpu_backend_is_available() {
        // CPU is unconditionally available; hardware backends depend on
        // features, so with the feature enabled we only check the call runs.
        let backend = GpuBackend::Cpu;
        assert!(backend.is_available());
        #[cfg(feature = "cuda")]
        {
            let _ = GpuBackend::Cuda.is_available();
        }
        #[cfg(not(feature = "cuda"))]
        assert!(!GpuBackend::Cuda.is_available());
        #[cfg(feature = "rocm")]
        {
            let _ = GpuBackend::Rocm.is_available();
        }
        #[cfg(not(feature = "rocm"))]
        assert!(!GpuBackend::Rocm.is_available());
        #[cfg(all(feature = "metal", target_os = "macos"))]
        assert!(GpuBackend::Metal.is_available());
        #[cfg(not(all(feature = "metal", target_os = "macos")))]
        assert!(!GpuBackend::Metal.is_available());
    }
    #[test]
    fn test_gpu_backend_display() {
        assert_eq!(GpuBackend::Cuda.to_string(), "CUDA");
        assert_eq!(GpuBackend::Rocm.to_string(), "ROCm");
        assert_eq!(GpuBackend::Wgpu.to_string(), "WebGPU");
        assert_eq!(GpuBackend::Metal.to_string(), "Metal");
        assert_eq!(GpuBackend::OpenCL.to_string(), "OpenCL");
        assert_eq!(GpuBackend::Cpu.to_string(), "CPU");
    }

    // --- Error conversion and display ---

    #[test]
    fn test_gpuerror_from_conversion() {
        // Each GpuError variant must map to the documented CoreError category.
        let gpuerror = GpuError::BackendNotAvailable("CUDA".to_string());
        let coreerror: CoreError = gpuerror.into();
        match coreerror {
            CoreError::ComputationError(_) => {}
            _ => panic!("Expected ComputationError"),
        }
        let gpuerror = GpuError::OutOfMemory("8GB required".to_string());
        let coreerror: CoreError = gpuerror.into();
        match coreerror {
            CoreError::MemoryError(_) => {}
            _ => panic!("Expected MemoryError"),
        }
        let gpuerror = GpuError::InvalidParameter("batch_size must be > 0".to_string());
        let coreerror: CoreError = gpuerror.into();
        match coreerror {
            CoreError::InvalidArgument(_) => {}
            _ => panic!("Expected InvalidArgument"),
        }
        let gpuerror = GpuError::UnsupportedDataType(kernels::DataType::Float16);
        let coreerror: CoreError = gpuerror.into();
        match coreerror {
            CoreError::TypeError(_) => {}
            _ => panic!("Expected TypeError"),
        }
    }
    #[test]
    fn test_gpu_datatype_trait() {
        // Compile-time check that the primitive impls exist.
        fn assert_gpu_datatype<T: GpuDataType>() {}
        assert_gpu_datatype::<f32>();
        assert_gpu_datatype::<f64>();
        assert_gpu_datatype::<i32>();
        assert_gpu_datatype::<u32>();
        assert_gpu_datatype::<u8>();
        assert_gpu_datatype::<i8>();
        assert_gpu_datatype::<u16>();
        assert_gpu_datatype::<i16>();
        assert_gpu_datatype::<u64>();
        assert_gpu_datatype::<i64>();
    }

    // --- GpuBuffer behavior over the CPU backend ---

    #[test]
    fn test_gpu_buffer_creation() {
        // 100 bytes backing 25 f32 elements.
        let inner = Arc::new(CpuBuffer::new(100));
        let buffer = GpuBuffer::<f32>::new(inner, 25);
        assert_eq!(buffer.len(), 25);
        assert!(!buffer.is_empty());
    }
    #[test]
    fn test_gpu_buffer_empty() {
        let inner = Arc::new(CpuBuffer::new(0));
        let buffer = GpuBuffer::<f32>::new(inner, 0);
        assert_eq!(buffer.len(), 0);
        assert!(buffer.is_empty());
    }
    #[test]
    fn test_gpu_buffer_copy_operations() {
        // Round-trip: host -> "device" -> host.
        let inner = Arc::new(CpuBuffer::new(16));
        let buffer = GpuBuffer::<f32>::new(inner, 4);
        let data = vec![1.0f32, 2.0, 3.0, 4.0];
        let _ = buffer.copy_from_host(&data);
        let mut result = vec![0.0f32; 4];
        let _ = buffer.copy_to_host(&mut result);
        assert_eq!(result, data);
    }
    #[test]
    fn test_gpu_buffer_to_vec() {
        let inner = Arc::new(CpuBuffer::new(12));
        let buffer = GpuBuffer::<f32>::new(inner, 3);
        let data = vec![5.0f32, 6.0, 7.0];
        let _ = buffer.copy_from_host(&data);
        let result = buffer.to_vec();
        assert_eq!(result, data);
    }
    #[test]
    #[should_panic(expected = "Data size exceeds buffer size")]
    fn test_gpu_buffer_copy_from_host_overflow() {
        // 3 elements into a 2-element buffer must error (and expect() panics).
        let inner = Arc::new(CpuBuffer::new(8));
        let buffer = GpuBuffer::<f32>::new(inner, 2);
        let data = vec![1.0f32, 2.0, 3.0];
        buffer.copy_from_host(&data).expect("Operation failed");
    }
    #[test]
    #[should_panic(expected = "Data size exceeds buffer size")]
    fn test_gpu_buffer_copy_to_host_overflow() {
        let inner = Arc::new(CpuBuffer::new(8));
        let buffer = GpuBuffer::<f32>::new(inner, 2);
        let mut data = vec![0.0f32; 3];
        buffer.copy_to_host(&mut data).expect("Operation failed");
    }

    // --- Kernel handle / compiler / context smoke tests ---

    #[test]
    fn test_gpu_kernel_handle() {
        // CpuKernel ignores everything; this just exercises the wrapper API.
        let kernel = Arc::new(CpuKernel);
        let handle = GpuKernelHandle::new(kernel);
        let buffer = GpuBuffer::<f32>::new(Arc::new(CpuBuffer::new(16)), 4);
        handle.set_buffer("input", &buffer);
        handle.set_u32("size", 100);
        handle.set_i32("offset", -5);
        handle.set_f32("scale", 2.5);
        handle.set_f64("precision", 0.0001);
        handle.dispatch([16, 8, 1]);
    }
    #[test]
    fn test_gpu_context_cpu_backend() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Operation failed");
        assert_eq!(context.backend(), GpuBackend::Cpu);
        assert_eq!(context.backend_name(), "CPU");
        // Placeholder memory figures (see get_available_memory/get_total_memory).
        assert_eq!(context.get_available_memory(), Some(1024 * 1024 * 1024));
        assert_eq!(context.get_total_memory(), Some(4 * 1024 * 1024 * 1024));
    }
    #[test]
    fn test_gpu_context_buffer_creation() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Operation failed");
        let buffer = context.create_buffer::<f32>(100);
        assert_eq!(buffer.len(), 100);
        let data = vec![1.0f32; 50];
        let buffer_from_slice = context.create_buffer_from_slice(&data);
        assert_eq!(buffer_from_slice.len(), 50);
        let result = buffer_from_slice.to_vec();
        assert_eq!(result, data);
    }
    #[test]
    fn test_gpu_context_unsupported_backend() {
        #[cfg(not(feature = "cuda"))]
        {
            let result = GpuContext::new(GpuBackend::Cuda);
            assert!(result.is_err());
            // Either error is acceptable: feature missing vs. no device found.
            match result {
                Err(GpuError::UnsupportedBackend(_)) => {}
                Err(GpuError::BackendNotAvailable(_)) => {}
                Err(e) => panic!(
                    "Expected UnsupportedBackend or BackendNotAvailable error, got: {:?}",
                    e
                ),
                Ok(_) => panic!("Expected error, got Ok"),
            }
        }
    }
    #[test]
    fn test_gpu_compiler() {
        let compiler_impl = Arc::new(CpuCompiler);
        let compiler = GpuCompiler::new(compiler_impl);
        let kernel = compiler
            .compile("dummy kernel source")
            .expect("Operation failed");
        kernel.dispatch([1, 1, 1]);
        let typed_kernel = compiler.compile_kernel::<f32, f32>("vector_add");
        typed_kernel.dispatch([32, 1, 1]);
    }
    #[test]
    fn test_gpu_context_execute() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Operation failed");
        let result = context.execute(|compiler| compiler.compile("test kernel").is_ok());
        assert!(result);
    }
    #[test]
    fn test_gpu_context_kernel_registry() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Operation failed");
        let result = context.get_kernel("non_existent_kernel");
        assert!(result.is_err());
        match result {
            Err(GpuError::KernelNotFound(_)) => {}
            _ => panic!("Expected KernelNotFound error"),
        }
    }
    #[test]
    fn test_cpu_buffer_implementation() {
        // CpuBuffer::new must zero-initialize its storage.
        let buffer = CpuBuffer::new(256);
        assert_eq!(buffer.data.len(), 256);
        assert!(buffer.data.iter().all(|&b| b == 0));
    }
    #[test]
    fn test_gpuerror_display() {
        let error = GpuError::BackendNotAvailable("CUDA".to_string());
        assert_eq!(error.to_string(), "GPU backend CUDA is not available");
        let error = GpuError::OutOfMemory("allocation failed".to_string());
        assert_eq!(error.to_string(), "GPU out of memory: allocation failed");
        let error = GpuError::KernelCompilationError("syntax error".to_string());
        assert_eq!(error.to_string(), "Kernel compilation error: syntax error");
        let error = GpuError::KernelNotFound("gemm".to_string());
        assert_eq!(error.to_string(), "Kernel not found: gemm");
    }
    #[test]
    fn test_backend_equality() {
        assert_eq!(GpuBackend::Cuda, GpuBackend::Cuda);
        assert_ne!(GpuBackend::Cuda, GpuBackend::Rocm);
        // GpuBackend is Copy, so both bindings are plain copies.
        let backend = GpuBackend::Metal;
        let cloned = backend;
        let copied = backend;
        assert_eq!(backend, cloned);
        assert_eq!(backend, copied);
    }
    #[test]
    fn test_backend_hash() {
        use std::collections::HashSet;
        let mut set = HashSet::new();
        set.insert(GpuBackend::Cuda);
        set.insert(GpuBackend::Rocm);
        // Duplicate insert must not grow the set.
        set.insert(GpuBackend::Cuda);
        assert_eq!(set.len(), 2);
        assert!(set.contains(&GpuBackend::Cuda));
        assert!(set.contains(&GpuBackend::Rocm));
    }
    #[test]
    fn test_gpu_buffer_debug_clone() {
        let inner = Arc::new(CpuBuffer::new(16));
        let buffer = GpuBuffer::<f32>::new(inner, 4);
        let debug_str = format!("{:?}", buffer);
        assert!(debug_str.contains("GpuBuffer"));
        assert!(debug_str.contains("size"));
        let cloned = buffer.clone();
        assert_eq!(cloned.len(), buffer.len());
        assert_eq!(cloned.len(), 4);
        // Clones alias the same storage: a write through one is visible
        // through the other.
        let data = vec![1.0f32, 2.0, 3.0, 4.0];
        let _ = buffer.copy_from_host(&data);
        let mut result = vec![0.0f32; 4];
        let _ = cloned.copy_to_host(&mut result);
        assert_eq!(result, data);
    }
    #[test]
    fn test_gpu_context_debug() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Failed to create context");
        let debug_str = format!("{:?}", context);
        assert!(debug_str.contains("GpuContext"));
        assert!(debug_str.contains("backend"));
        assert!(debug_str.contains("Cpu"));
    }
    #[test]
    fn test_gpu_context_batch_dispatch() {
        // begin/end batch are default no-ops on the CPU backend.
        let context = GpuContext::new(GpuBackend::Cpu).expect("Failed to create CPU context");
        let begin_result = context.begin_batch();
        assert!(
            begin_result.is_ok(),
            "begin_batch should succeed on CPU backend"
        );
        let dispatch_result = context.execute(|compiler| {
            compiler.compile("dummy kernel source").map(|kernel| {
                kernel.dispatch([4, 1, 1]);
            })
        });
        assert!(
            dispatch_result.is_ok(),
            "kernel dispatch inside batch should succeed"
        );
        let end_result = context.end_batch();
        assert!(
            end_result.is_ok(),
            "end_batch should succeed on CPU backend"
        );
    }
    #[test]
    fn test_gpu_context_gpu_sync() {
        let context = GpuContext::new(GpuBackend::Cpu).expect("Failed to create CPU context");
        let result = context.gpu_sync();
        assert!(result.is_ok(), "gpu_sync should return Ok on CPU backend");
    }
    #[test]
    fn test_gpu_kernel_dispatch_no_wait() {
        let kernel = Arc::new(CpuKernel);
        let handle = GpuKernelHandle::new(kernel);
        let buffer = GpuBuffer::<f32>::new(Arc::new(CpuBuffer::new(16)), 4);
        handle.set_buffer("input", &buffer);
        handle.set_u32("size", 4);
        handle.dispatch_no_wait([4, 1, 1]);
    }
}