use super::super::{AutoGpuSelector, GpuBuffer, GpuContext, GpuDeviceInfo, GpuLinalgOps};
use crate::error::{LinalgError, LinalgResult};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, NumAssign, Zero};
use std::fmt::Debug;
/// Default element-count cutoff used by the `auto_*` dispatch methods:
/// problems with more elements than this prefer the GPU path.
pub const DEFAULT_GPU_THRESHOLD: usize = 50_000;
/// Dispatches linear-algebra operations to GPU kernels or CPU fallbacks.
///
/// The `auto_*` methods compare the problem's element count against
/// `gpu_threshold` to decide whether to use a supplied GPU context or run
/// on the CPU.
pub struct GpuOperationDispatcher<T>
where
    T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
    // Element-count cutoff for choosing GPU over CPU in the `auto_*` methods.
    gpu_threshold: usize,
    // Ties the dispatcher to its element type `T` without storing any data.
    _phantom: std::marker::PhantomData<T>,
}
impl<T> GpuOperationDispatcher<T>
where
    T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
    /// Creates a dispatcher using [`DEFAULT_GPU_THRESHOLD`].
    pub fn new() -> Self {
        Self::with_threshold(DEFAULT_GPU_THRESHOLD)
    }

    /// Creates a dispatcher with a caller-chosen GPU element-count threshold.
    pub fn with_threshold(threshold: usize) -> Self {
        Self {
            gpu_threshold: threshold,
            _phantom: std::marker::PhantomData,
        }
    }

    /// Replaces the GPU dispatch threshold.
    pub fn set_threshold(&mut self, threshold: usize) {
        self.gpu_threshold = threshold;
    }

    /// Returns the current GPU dispatch threshold.
    pub fn threshold(&self) -> usize {
        self.gpu_threshold
    }
}
impl<T> Default for GpuOperationDispatcher<T>
where
T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
fn default() -> Self {
Self::new()
}
}
impl<T> GpuLinalgOps<T> for GpuOperationDispatcher<T>
where
T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
fn gpu_matvec(
&self,
ctx: &dyn GpuContext,
a: &ArrayView2<T>,
x: &ArrayView1<T>,
) -> LinalgResult<Array1<T>> {
let (m, n) = a.dim();
if n != x.len() {
return Err(LinalgError::ShapeError(format!(
"Matrix columns ({}) must match vector length ({})",
n,
x.len()
)));
}
let required_memory = (m * n + n + m) * std::mem::size_of::<T>();
let available_memory = ctx.available_memory()?;
if required_memory > available_memory {
return self.cpu_matvec(a, x);
}
let mut a_buffer = self.allocate_buffer_from_context::<T>(ctx, m * n)?;
let mut x_buffer = self.allocate_buffer_from_context::<T>(ctx, n)?;
let mut y_buffer = self.allocate_buffer_from_context::<T>(ctx, m)?;
let a_flat: Vec<T> = a.iter().cloned().collect();
let x_flat: Vec<T> = x.iter().cloned().collect();
a_buffer.copy_from_host(&a_flat)?;
x_buffer.copy_from_host(&x_flat)?;
self.execute_matvec_kernel(
ctx,
a_buffer.as_ref(),
x_buffer.as_ref(),
y_buffer.as_mut(),
m,
n,
)?;
let mut result_data = vec![T::zero(); m];
y_buffer.copy_to_host(&mut result_data)?;
Ok(Array1::from_vec(result_data))
}
fn gpu_matmul(
&self,
ctx: &dyn GpuContext,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
) -> LinalgResult<Array2<T>> {
let (m, k1) = a.dim();
let (k2, n) = b.dim();
if k1 != k2 {
return Err(LinalgError::ShapeError(format!(
"Matrix dimensions mismatch: {}x{} * {}x{}",
m, k1, k2, n
)));
}
let k = k1;
let required_memory = (m * k + k * n + m * n) * std::mem::size_of::<T>();
let available_memory = ctx.available_memory()?;
if required_memory > available_memory {
return self.cpu_matmul(a, b);
}
let mut a_buffer = self.allocate_buffer_from_context::<T>(ctx, m * k)?;
let mut b_buffer = self.allocate_buffer_from_context::<T>(ctx, k * n)?;
let mut c_buffer = self.allocate_buffer_from_context::<T>(ctx, m * n)?;
let a_flat: Vec<T> = a.iter().cloned().collect();
let b_flat: Vec<T> = b.iter().cloned().collect();
a_buffer.copy_from_host(&a_flat)?;
b_buffer.copy_from_host(&b_flat)?;
self.execute_matmul_kernel(
ctx,
a_buffer.as_ref(),
b_buffer.as_ref(),
c_buffer.as_mut(),
m,
n,
k,
)?;
let mut result_data = vec![T::zero(); m * n];
c_buffer.copy_to_host(&mut result_data)?;
let result_array = Array2::from_shape_vec((m, n), result_data)
.map_err(|e| LinalgError::ComputationError(format!("Shape error: {}", e)))?;
Ok(result_array)
}
fn gpu_dot(
&self,
ctx: &dyn GpuContext,
x: &ArrayView1<T>,
y: &ArrayView1<T>,
) -> LinalgResult<T> {
if x.len() != y.len() {
return Err(LinalgError::ShapeError(format!(
"Vector lengths must match: {} != {}",
x.len(),
y.len()
)));
}
Ok(Self::cpu_dot_static(x, y))
}
fn gpu_norm(&self, ctx: &dyn GpuContext, x: &ArrayView1<T>) -> LinalgResult<T> {
Ok(Self::cpu_norm_static(x))
}
fn gpu_elementwise_add(
&self,
ctx: &dyn GpuContext,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
) -> LinalgResult<Array2<T>> {
if a.shape() != b.shape() {
return Err(LinalgError::ShapeError(format!(
"Matrix shapes must match: {:?} != {:?}",
a.shape(),
b.shape()
)));
}
Self::cpu_elementwise_add_static(a, b)
}
fn gpu_elementwise_mul(
&self,
ctx: &dyn GpuContext,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
) -> LinalgResult<Array2<T>> {
if a.shape() != b.shape() {
return Err(LinalgError::ShapeError(format!(
"Matrix shapes must match: {:?} != {:?}",
a.shape(),
b.shape()
)));
}
Self::cpu_elementwise_mul_static(a, b)
}
}
impl<T> GpuOperationDispatcher<T>
where
T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
fn execute_matvec_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
match ctx.device_info().device_type {
crate::gpu::GpuDeviceType::Cuda => {
self.execute_cuda_matvec_kernel(ctx, a_buffer, x_buffer, y_buffer, m, n)
}
crate::gpu::GpuDeviceType::OpenCl => {
self.execute_opencl_matvec_kernel(ctx, a_buffer, x_buffer, y_buffer, m, n)
}
crate::gpu::GpuDeviceType::Rocm => {
self.execute_rocm_matvec_kernel(ctx, a_buffer, x_buffer, y_buffer, m, n)
}
crate::gpu::GpuDeviceType::Metal => {
self.execute_metal_matvec_kernel(ctx, a_buffer, x_buffer, y_buffer, m, n)
}
_ => {
self.simulate_gpu_matvec(a_buffer, x_buffer, y_buffer, m, n)
}
}
}
fn execute_matmul_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
match ctx.device_info().device_type {
crate::gpu::GpuDeviceType::Cuda => {
self.execute_cuda_matmul_kernel(ctx, a_buffer, b_buffer, c_buffer, m, n, k)
}
crate::gpu::GpuDeviceType::OpenCl => {
self.execute_opencl_matmul_kernel(ctx, a_buffer, b_buffer, c_buffer, m, n, k)
}
crate::gpu::GpuDeviceType::Rocm => {
self.execute_rocm_matmul_kernel(ctx, a_buffer, b_buffer, c_buffer, m, n, k)
}
crate::gpu::GpuDeviceType::Metal => {
self.execute_metal_matmul_kernel(ctx, a_buffer, b_buffer, c_buffer, m, n, k)
}
_ => {
self.simulate_gpu_matmul(a_buffer, b_buffer, c_buffer, m, n, k)
}
}
}
fn simulate_gpu_matvec(
&self,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
let mut a_data = vec![T::zero(); m * n];
let mut x_data = vec![T::zero(); n];
let mut y_data = vec![T::zero(); m];
a_buffer.copy_to_host(&mut a_data)?;
x_buffer.copy_to_host(&mut x_data)?;
for i in 0..m {
let mut sum = T::zero();
for j in 0..n {
sum += a_data[i * n + j] * x_data[j];
}
y_data[i] = sum;
}
y_buffer.copy_from_host(&y_data)?;
Ok(())
}
fn simulate_gpu_matmul(
&self,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
let mut a_data = vec![T::zero(); m * k];
let mut b_data = vec![T::zero(); k * n];
let mut c_data = vec![T::zero(); m * n];
a_buffer.copy_to_host(&mut a_data)?;
b_buffer.copy_to_host(&mut b_data)?;
for i in 0..m {
for j in 0..n {
let mut sum = T::zero();
for l in 0..k {
sum += a_data[i * k + l] * b_data[l * n + j];
}
c_data[i * n + j] = sum;
}
}
c_buffer.copy_from_host(&c_data)?;
Ok(())
}
fn execute_cuda_matvec_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_cuda_matvec_f32(
a_buffer.device_ptr() as *const f32,
x_buffer.device_ptr() as *const f32,
y_buffer.device_ptr() as *mut f32,
m,
n,
)
} else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f64>() {
self.launch_cuda_matvec_f64(
a_buffer.device_ptr() as *const f64,
x_buffer.device_ptr() as *const f64,
y_buffer.device_ptr() as *mut f64,
m,
n,
)
} else {
return Err(LinalgError::ComputationError(
"Unsupported data type for CUDA kernel".to_string(),
));
}
}
fn execute_opencl_matvec_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_opencl_matvec_f32(
ctx,
a_buffer.device_ptr(),
x_buffer.device_ptr(),
y_buffer.device_ptr(),
m,
n,
)
} else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f64>() {
self.launch_opencl_matvec_f64(
ctx,
a_buffer.device_ptr(),
x_buffer.device_ptr(),
y_buffer.device_ptr(),
m,
n,
)
} else {
return Err(LinalgError::ComputationError(
"Unsupported data type for OpenCL kernel".to_string(),
));
}
}
fn execute_rocm_matvec_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_rocm_matvec_f32(
ctx,
a_buffer.device_ptr(),
x_buffer.device_ptr(),
y_buffer.device_ptr(),
m,
n,
)
} else {
self.simulate_gpu_matvec(a_buffer, x_buffer, y_buffer, m, n)
}
}
fn execute_metal_matvec_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
x_buffer: &dyn GpuBuffer<T>,
y_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_metal_matvec_f32(
ctx,
a_buffer.device_ptr(),
x_buffer.device_ptr(),
y_buffer.device_ptr(),
m,
n,
)
} else {
self.simulate_gpu_matvec(a_buffer, x_buffer, y_buffer, m, n)
}
}
fn execute_cuda_matmul_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
let device_info = ctx.device_info();
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
let kernel_variant = self.select_cuda_matmul_variant(m, n, k, device_info);
match kernel_variant {
CudaKernelVariant::Basic => self.launch_cuda_matmul_f32_basic(
a_buffer.device_ptr() as *const f32,
b_buffer.device_ptr() as *const f32,
c_buffer.device_ptr() as *mut f32,
m,
n,
k,
),
CudaKernelVariant::Tiled => self.launch_cuda_matmul_f32_tiled(
a_buffer.device_ptr() as *const f32,
b_buffer.device_ptr() as *const f32,
c_buffer.device_ptr() as *mut f32,
m,
n,
k,
),
CudaKernelVariant::TensorCore => {
if device_info.supports_tensor_cores {
self.launch_cuda_matmul_f32_tensor_core(
a_buffer.device_ptr() as *const f32,
b_buffer.device_ptr() as *const f32,
c_buffer.device_ptr() as *mut f32,
m,
n,
k,
)
} else {
self.launch_cuda_matmul_f32_tiled(
a_buffer.device_ptr() as *const f32,
b_buffer.device_ptr() as *const f32,
c_buffer.device_ptr() as *mut f32,
m,
n,
k,
)
}
}
CudaKernelVariant::WarpShuffle => self.launch_cuda_matmul_f32_warp_shuffle(
a_buffer.device_ptr() as *const f32,
b_buffer.device_ptr() as *const f32,
c_buffer.device_ptr() as *mut f32,
m,
n,
k,
),
}
} else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f64>() {
self.launch_cuda_matmul_f64(
a_buffer.device_ptr() as *const f64,
b_buffer.device_ptr() as *const f64,
c_buffer.device_ptr() as *mut f64,
m,
n,
k,
)
} else {
return Err(LinalgError::ComputationError(
"Unsupported data type for CUDA kernel".to_string(),
));
}
}
fn execute_opencl_matmul_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
let device_info = ctx.device_info();
let kernel_variant = self.select_opencl_matmul_variant(m, n, k, device_info);
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
match kernel_variant {
OpenClKernelVariant::Basic => self.launch_opencl_matmul_f32_basic(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
),
OpenClKernelVariant::Optimized => self.launch_opencl_matmul_f32_optimized(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
),
OpenClKernelVariant::Vectorized => self.launch_opencl_matmul_f32_vectorized(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
),
}
} else if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f64>() {
self.launch_opencl_matmul_f64(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
)
} else {
return Err(LinalgError::ComputationError(
"Unsupported data type for OpenCL kernel".to_string(),
));
}
}
fn execute_rocm_matmul_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_rocm_matmul_f32(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
)
} else {
self.simulate_gpu_matmul(a_buffer, b_buffer, c_buffer, m, n, k)
}
}
fn execute_metal_matmul_kernel(
&self,
ctx: &dyn GpuContext,
a_buffer: &dyn GpuBuffer<T>,
b_buffer: &dyn GpuBuffer<T>,
c_buffer: &mut dyn GpuBuffer<T>,
m: usize,
n: usize,
k: usize,
) -> LinalgResult<()> {
if std::any::TypeId::of::<T>() == std::any::TypeId::of::<f32>() {
self.launch_metal_matmul_f32(
ctx,
a_buffer.device_ptr(),
b_buffer.device_ptr(),
c_buffer.device_ptr(),
m,
n,
k,
)
} else {
self.simulate_gpu_matmul(a_buffer, b_buffer, c_buffer, m, n, k)
}
}
pub fn cpu_matvec(&self, a: &ArrayView2<T>, x: &ArrayView1<T>) -> LinalgResult<Array1<T>> {
let (m, n) = a.dim();
let mut result = Array1::zeros(m);
for i in 0..m {
let mut sum = T::zero();
for j in 0..n {
sum += a[[i, j]] * x[j];
}
result[i] = sum;
}
Ok(result)
}
pub fn cpu_matmul(&self, a: &ArrayView2<T>, b: &ArrayView2<T>) -> LinalgResult<Array2<T>> {
let (m, k) = a.dim();
let (_, n) = b.dim();
let mut result = Array2::zeros((m, n));
for i in 0..m {
for j in 0..n {
let mut sum = T::zero();
for l in 0..k {
sum += a[[i, l]] * b[[l, j]];
}
result[[i, j]] = sum;
}
}
Ok(result)
}
fn cpu_dot(&self, x: &ArrayView1<T>, y: &ArrayView1<T>) -> T {
let mut result = T::zero();
for (a, b) in x.iter().zip(y.iter()) {
result += *a * *b;
}
result
}
fn cpu_dot_static(x: &ArrayView1<T>, y: &ArrayView1<T>) -> T {
let mut result = T::zero();
for (a, b) in x.iter().zip(y.iter()) {
result += *a * *b;
}
result
}
fn cpu_norm(&self, x: &ArrayView1<T>) -> T {
let mut sum_sq = T::zero();
for &val in x.iter() {
sum_sq += val * val;
}
sum_sq.sqrt()
}
fn cpu_norm_static(x: &ArrayView1<T>) -> T {
let mut sum_sq = T::zero();
for &val in x.iter() {
sum_sq += val * val;
}
sum_sq.sqrt()
}
fn cpu_elementwise_add(&self, a: &ArrayView2<T>, b: &ArrayView2<T>) -> LinalgResult<Array2<T>> {
let mut result = Array2::zeros(a.dim());
for ((i, j), &val_a) in a.indexed_iter() {
result[[i, j]] = val_a + b[[i, j]];
}
Ok(result)
}
fn cpu_elementwise_add_static(a: &ArrayView2<T>, b: &ArrayView2<T>) -> LinalgResult<Array2<T>> {
let mut result = Array2::zeros(a.dim());
for ((i, j), &val_a) in a.indexed_iter() {
result[[i, j]] = val_a + b[[i, j]];
}
Ok(result)
}
fn cpu_elementwise_mul(&self, a: &ArrayView2<T>, b: &ArrayView2<T>) -> LinalgResult<Array2<T>> {
let mut result = Array2::zeros(a.dim());
for ((i, j), &val_a) in a.indexed_iter() {
result[[i, j]] = val_a * b[[i, j]];
}
Ok(result)
}
fn cpu_elementwise_mul_static(a: &ArrayView2<T>, b: &ArrayView2<T>) -> LinalgResult<Array2<T>> {
let mut result = Array2::zeros(a.dim());
for ((i, j), &val_a) in a.indexed_iter() {
result[[i, j]] = val_a * b[[i, j]];
}
Ok(result)
}
fn allocate_buffer_from_context<U: Clone + Send + Sync + Copy + std::fmt::Debug + 'static>(
&self,
ctx: &dyn GpuContext,
size: usize,
) -> LinalgResult<Box<dyn GpuBuffer<U>>> {
use crate::gpu::acceleration::MockGpuBuffer;
Ok(Box::new(MockGpuBuffer::new(size)))
}
}
impl<T> AutoGpuSelector<T> for GpuOperationDispatcher<T>
where
T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
fn auto_matvec(
&self,
a: &ArrayView2<T>,
x: &ArrayView1<T>,
gpu_context: Option<&dyn GpuContext>,
) -> LinalgResult<Array1<T>> {
let elements = a.len();
if let Some(ctx) = gpu_context {
if elements > self.gpu_threshold {
return self.gpu_matvec(ctx, a, x);
}
}
self.cpu_matvec(a, x)
}
fn auto_matmul(
&self,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
gpu_context: Option<&dyn GpuContext>,
) -> LinalgResult<Array2<T>> {
let elements = a.len() + b.len();
if let Some(ctx) = gpu_context {
if elements > self.gpu_threshold {
return self.gpu_matmul(ctx, a, b);
}
}
self.cpu_matmul(a, b)
}
}
/// CUDA matmul kernel variants, chosen by `select_cuda_matmul_variant`
/// from the problem size and device capabilities.
#[derive(Debug, Clone, Copy)]
enum CudaKernelVariant {
    // Straightforward kernel for mid-sized problems.
    Basic,
    // Shared-memory tiled kernel for large problems (> 100k elements).
    Tiled,
    // Tensor-core kernel for very large problems (> 1M elements) on
    // devices that support it.
    TensorCore,
    // Warp-shuffle kernel for skinny matrices (m <= 32 or n <= 32).
    WarpShuffle,
}
/// OpenCL matmul kernel variants, chosen by `select_opencl_matmul_variant`
/// from the problem size and device compute-unit count.
#[derive(Debug, Clone, Copy)]
enum OpenClKernelVariant {
    // Default kernel for small problems.
    Basic,
    // Tuned kernel for mid-sized problems (> 50k elements).
    Optimized,
    // Vectorized kernel for large problems on wide devices
    // (> 500k elements and > 16 compute units).
    Vectorized,
}
impl<T> GpuOperationDispatcher<T>
where
T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
fn select_cuda_matmul_variant(
&self,
m: usize,
n: usize,
k: usize,
device_info: &crate::gpu::GpuDeviceInfo,
) -> CudaKernelVariant {
let total_elements = m * n * k;
if device_info.supports_tensor_cores && total_elements > 1_000_000 {
CudaKernelVariant::TensorCore
}
else if total_elements > 100_000 {
CudaKernelVariant::Tiled
}
else if m <= 32 || n <= 32 {
CudaKernelVariant::WarpShuffle
}
else {
CudaKernelVariant::Basic
}
}
fn select_opencl_matmul_variant(
&self,
m: usize,
n: usize,
k: usize,
device_info: &crate::gpu::GpuDeviceInfo,
) -> OpenClKernelVariant {
let total_elements = m * n * k;
if total_elements > 500_000 && device_info.compute_units > 16 {
OpenClKernelVariant::Vectorized
}
else if total_elements > 50_000 {
OpenClKernelVariant::Optimized
}
else {
OpenClKernelVariant::Basic
}
}
    // ------------------------------------------------------------------
    // Backend launch stubs.
    //
    // NOTE(review): every function below is a placeholder — it prints a
    // trace line and returns Ok(()) without touching the device pointers.
    // Real kernel launches still need to be wired in; the `ctx` parameters
    // in the OpenCL/ROCm/Metal stubs are currently unused.
    // ------------------------------------------------------------------

    /// Placeholder: CUDA f32 matvec launch (prints and returns Ok).
    fn launch_cuda_matvec_f32(
        &self,
        _a_ptr: *const f32,
        _x_ptr: *const f32,
        _y_ptr: *mut f32,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f32 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: CUDA f64 matvec launch (prints and returns Ok).
    fn launch_cuda_matvec_f64(
        &self,
        _a_ptr: *const f64,
        _x_ptr: *const f64,
        _y_ptr: *mut f64,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f64 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: CUDA f32 basic matmul launch (prints and returns Ok).
    fn launch_cuda_matmul_f32_basic(
        &self,
        _a_ptr: *const f32,
        _b_ptr: *const f32,
        _c_ptr: *mut f32,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f32 basic matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: CUDA f32 tiled matmul launch (prints and returns Ok).
    fn launch_cuda_matmul_f32_tiled(
        &self,
        _a_ptr: *const f32,
        _b_ptr: *const f32,
        _c_ptr: *mut f32,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f32 tiled matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: CUDA f32 tensor-core matmul launch (prints and returns Ok).
    fn launch_cuda_matmul_f32_tensor_core(
        &self,
        _a_ptr: *const f32,
        _b_ptr: *const f32,
        _c_ptr: *mut f32,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f32 tensor core matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: CUDA f32 warp-shuffle matmul launch (prints and returns Ok).
    fn launch_cuda_matmul_f32_warp_shuffle(
        &self,
        _a_ptr: *const f32,
        _b_ptr: *const f32,
        _c_ptr: *mut f32,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f32 warp shuffle matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: CUDA f64 matmul launch (prints and returns Ok).
    fn launch_cuda_matmul_f64(
        &self,
        _a_ptr: *const f64,
        _b_ptr: *const f64,
        _c_ptr: *mut f64,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("CUDA f64 matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: OpenCL f32 matvec launch (prints and returns Ok).
    fn launch_opencl_matvec_f32(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _x_ptr: *mut std::ffi::c_void,
        _y_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f32 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: OpenCL f64 matvec launch (prints and returns Ok).
    fn launch_opencl_matvec_f64(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _x_ptr: *mut std::ffi::c_void,
        _y_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f64 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: OpenCL f32 basic matmul launch (prints and returns Ok).
    fn launch_opencl_matmul_f32_basic(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f32 basic matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: OpenCL f32 optimized matmul launch (prints and returns Ok).
    fn launch_opencl_matmul_f32_optimized(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f32 optimized matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: OpenCL f32 vectorized matmul launch (prints and returns Ok).
    fn launch_opencl_matmul_f32_vectorized(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f32 vectorized matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: OpenCL f64 matmul launch (prints and returns Ok).
    fn launch_opencl_matmul_f64(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("OpenCL f64 matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: ROCm f32 matvec launch (prints and returns Ok).
    fn launch_rocm_matvec_f32(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _x_ptr: *mut std::ffi::c_void,
        _y_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("ROCm f32 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: ROCm f32 matmul launch (prints and returns Ok).
    fn launch_rocm_matmul_f32(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("ROCm f32 matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
    /// Placeholder: Metal f32 matvec launch (prints and returns Ok).
    fn launch_metal_matvec_f32(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _x_ptr: *mut std::ffi::c_void,
        _y_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
    ) -> LinalgResult<()> {
        println!("Metal f32 matvec kernel: {}x{} matrix", m, n);
        Ok(())
    }
    /// Placeholder: Metal f32 matmul launch (prints and returns Ok).
    fn launch_metal_matmul_f32(
        &self,
        ctx: &dyn GpuContext,
        _a_ptr: *mut std::ffi::c_void,
        _b_ptr: *mut std::ffi::c_void,
        _c_ptr: *mut std::ffi::c_void,
        m: usize,
        n: usize,
        k: usize,
    ) -> LinalgResult<()> {
        println!("Metal f32 matmul kernel: {}x{}x{}", m, n, k);
        Ok(())
    }
}