use std::sync::Arc;
use crate::error::Result;
use crate::prelude::SklearsError;
use scirs2_core::ndarray::Array2;
#[cfg(feature = "gpu_support")]
use oxicuda_backend::CpuBackend;
#[cfg(feature = "gpu_support")]
use oxicuda_backend::{BackendRegistry, BackendTranspose, BinaryOp, ComputeBackend, DeviceInfo};
/// Handle to a compute backend together with the device index the caller requested.
///
/// Clones share the same backend instance through the `Arc`.
#[cfg(feature = "gpu_support")]
#[derive(Debug, Clone)]
pub struct GpuContext {
/// Shared, type-erased backend used for allocation, copies and compute calls.
backend: Arc<dyn ComputeBackend>,
/// Device index supplied at construction; returned unchanged by `device_id()`.
device_id: usize,
}
/// Placeholder context compiled when the `gpu_support` feature is disabled.
#[cfg(not(feature = "gpu_support"))]
#[derive(Debug, Clone)]
pub struct GpuContext;
#[cfg(feature = "gpu_support")]
impl GpuContext {
    /// Creates a context for device `0`.
    ///
    /// # Errors
    /// Propagates any failure from [`GpuContext::with_device_id`].
    pub fn new() -> Result<Self> {
        Self::with_device_id(0)
    }

    /// Creates a context that records `device_id`.
    ///
    /// The backend built here is always a `CpuBackend`; `device_id` is stored
    /// as given and is neither validated nor used to choose a backend.
    ///
    /// # Errors
    /// Returns `SklearsError::NumericalError` when backend initialisation fails.
    pub fn with_device_id(device_id: usize) -> Result<Self> {
        let mut backend: Box<dyn ComputeBackend> = Box::new(CpuBackend::new());
        backend
            .init()
            .map_err(|e| SklearsError::NumericalError(e.to_string()))?;
        Ok(Self {
            backend: Arc::from(backend),
            device_id,
        })
    }

    /// Returns the device index this context was created with.
    pub fn device_id(&self) -> usize {
        self.device_id
    }

    /// Returns a stream identifier; this implementation always yields `0`.
    pub fn get_stream(&self) -> Result<usize> {
        Ok(0)
    }

    /// Accepts a stream identifier back; this implementation does nothing with it.
    pub fn return_stream(&self, _stream_id: usize) {}

    /// Reports memory figures for this context's device.
    ///
    /// The device list is indexed by `device_id` (the same convention
    /// `GpuUtils::device_properties` uses); when that index is absent the first
    /// listed device is used instead. If the chosen device reports a non-zero
    /// `total_memory_bytes`, that value is returned as both `total` and `free`
    /// with `used == 0` (no usage figure is queried). Otherwise the figures come
    /// from `crate::system_info::system_memory()`.
    ///
    /// # Errors
    /// Returns `SklearsError::NumericalError` when the backend cannot list its
    /// devices, or whatever error `system_memory()` produces on the fallback path.
    pub fn memory_info(&self) -> Result<GpuMemoryInfo> {
        let devices = self
            .backend
            .available_devices()
            .map_err(|e| SklearsError::NumericalError(e.to_string()))?;

        // Prefer the entry matching this context; fall back to the first entry so
        // contexts whose id is out of range keep their previous behaviour.
        let device = devices.get(self.device_id).or_else(|| devices.first());
        if let Some(dev) = device {
            let total = dev.total_memory_bytes as usize;
            if total > 0 {
                return Ok(GpuMemoryInfo {
                    total,
                    free: total,
                    used: 0,
                });
            }
        }

        let mem = crate::system_info::system_memory()?;
        Ok(GpuMemoryInfo {
            total: mem.total as usize,
            free: mem.available as usize,
            used: mem.used as usize,
        })
    }

    /// Asks the backend to synchronize.
    ///
    /// # Errors
    /// Returns `SklearsError::NumericalError` when the backend reports a failure.
    pub fn synchronize(&self) -> Result<()> {
        self.backend
            .synchronize()
            .map_err(|e| SklearsError::NumericalError(e.to_string()))
    }

    /// Returns `true` when the backend's name is anything other than `"cpu"`.
    pub fn is_gpu(&self) -> bool {
        self.backend.name() != "cpu"
    }

    /// Hands out another shared reference to the underlying backend.
    pub(crate) fn backend(&self) -> Arc<dyn ComputeBackend> {
        Arc::clone(&self.backend)
    }
}
#[cfg(not(feature = "gpu_support"))]
impl GpuContext {
pub fn new() -> Result<Self> {
Ok(Self)
}
pub fn with_device_id(_device_id: usize) -> Result<Self> {
Ok(Self)
}
pub fn device_id(&self) -> usize {
0
}
pub fn get_stream(&self) -> Result<usize> {
Err(SklearsError::NotImplemented(
"GPU support not enabled".into(),
))
}
pub fn return_stream(&self, _stream_id: usize) {}
pub fn memory_info(&self) -> Result<GpuMemoryInfo> {
Err(SklearsError::NotImplemented(
"GPU support not enabled".into(),
))
}
pub fn synchronize(&self) -> Result<()> {
Err(SklearsError::NotImplemented(
"GPU support not enabled".into(),
))
}
pub fn is_gpu(&self) -> bool {
false
}
}
/// Snapshot of memory figures as produced by `GpuContext::memory_info`.
#[derive(Debug, Clone, Copy)]
pub struct GpuMemoryInfo {
    /// Amount reported as available.
    pub free: usize,
    /// Total amount reported.
    pub total: usize,
    /// Amount reported as in use.
    pub used: usize,
}

impl GpuMemoryInfo {
    /// Returns `used / total` expressed as a percentage.
    ///
    /// Yields `0.0` when `total` is zero so the division is never attempted
    /// with a zero denominator.
    pub fn utilization(&self) -> f32 {
        match self.total {
            0 => 0.0,
            total => (self.used as f32 / total as f32) * 100.0,
        }
    }
}
/// Owning handle to a buffer of `T` elements allocated through a compute backend.
///
/// The allocation is released through the same backend when the handle is dropped.
#[cfg(feature = "gpu_support")]
pub struct GpuArray<T: bytemuck::Pod> {
/// Opaque allocation handle returned by the backend's `alloc`.
ptr: u64,
/// Logical dimensions of the array.
shape: Vec<usize>,
/// Number of `T` elements stored in the buffer.
size: usize,
/// Backend that owns the allocation and performs copies and compute on it.
backend: Arc<dyn ComputeBackend>,
/// Ties the handle to its element type without storing a `T`.
_marker: std::marker::PhantomData<T>,
}
/// Placeholder array type compiled when the `gpu_support` feature is disabled.
#[cfg(not(feature = "gpu_support"))]
pub struct GpuArray<T> {
/// Keeps the type parameter in use; no data is stored.
_phantom: std::marker::PhantomData<T>,
}
#[cfg(feature = "gpu_support")]
impl<T: bytemuck::Pod> std::fmt::Debug for GpuArray<T> {
    /// Prints the allocation handle in hexadecimal followed by the shape and
    /// element count; the backend and marker fields are left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("GpuArray");
        out.field("ptr", &format_args!("{:#x}", self.ptr));
        out.field("shape", &self.shape);
        out.field("size", &self.size);
        out.finish()
    }
}
/// Releases the backend allocation when the array handle is dropped.
#[cfg(feature = "gpu_support")]
impl<T: bytemuck::Pod> Drop for GpuArray<T> {
fn drop(&mut self) {
// `drop` cannot report failure, so the result of `free` is deliberately discarded.
let _ = self.backend.free(self.ptr);
}
}
#[cfg(feature = "gpu_support")]
impl<T: bytemuck::Pod> GpuArray<T> {
    /// Allocates a backend buffer sized for `data` and copies `data` into it.
    ///
    /// `shape` is stored as given; it is not checked against `data.len()`, so
    /// callers are responsible for passing a shape whose product equals the
    /// number of elements.
    ///
    /// # Errors
    /// Returns `SklearsError::NumericalError` when allocation or the upload
    /// fails. If the upload fails the fresh allocation is freed first.
    pub(crate) fn alloc_upload(
        backend: Arc<dyn ComputeBackend>,
        data: &[T],
        shape: Vec<usize>,
    ) -> Result<Self> {
        let bytes: &[u8] = bytemuck::cast_slice(data);
        let ptr = backend
            .alloc(bytes.len())
            .map_err(|e| SklearsError::NumericalError(e.to_string()))?;
        if let Err(e) = backend.copy_htod(ptr, bytes) {
            // No `GpuArray` owns `ptr` yet, so release it by hand (best effort).
            let _ = backend.free(ptr);
            return Err(SklearsError::NumericalError(e.to_string()));
        }
        Ok(Self {
            ptr,
            size: data.len(),
            shape,
            backend,
            _marker: std::marker::PhantomData,
        })
    }

    /// Allocates a buffer of `shape.iter().product()` zeroed elements.
    ///
    /// # Errors
    /// Returns `SklearsError::InvalidInput` when the element count implied by
    /// `shape` does not fit in `usize`, and otherwise whatever
    /// [`GpuArray::alloc_upload`] reports.
    pub(crate) fn alloc_zeros(backend: Arc<dyn ComputeBackend>, shape: Vec<usize>) -> Result<Self> {
        // Checked multiplication: a wrapped product would yield a buffer far
        // smaller than the shape claims.
        let size = shape
            .iter()
            .try_fold(1usize, |acc, &dim| acc.checked_mul(dim))
            .ok_or_else(|| {
                SklearsError::InvalidInput(format!("shape {shape:?} overflows usize"))
            })?;
        let zeros: Vec<T> = vec![T::zeroed(); size];
        Self::alloc_upload(backend, &zeros, shape)
    }

    /// Uploads `data` as a one-dimensional array of length `data.len()`.
    pub fn from_slice(context: &GpuContext, data: &[T]) -> Result<Self> {
        Self::alloc_upload(context.backend(), data, vec![data.len()])
    }

    /// Uploads a two-dimensional array with shape `[nrows, ncols]`.
    ///
    /// Elements are uploaded in the array's logical iteration order. A
    /// contiguous standard-layout array is uploaded directly from its backing
    /// slice; any other layout is first gathered into a temporary vector.
    pub fn from_array2(context: &GpuContext, array: &Array2<T>) -> Result<Self>
    where
        T: Clone,
    {
        let shape = vec![array.nrows(), array.ncols()];
        match array.as_slice() {
            // `as_slice` is `Some` exactly for contiguous standard-layout data,
            // so no intermediate copy and no empty-slice fallback is needed.
            Some(contiguous) => Self::alloc_upload(context.backend(), contiguous, shape),
            None => {
                let gathered: Vec<T> = array.iter().copied().collect();
                Self::alloc_upload(context.backend(), &gathered, shape)
            }
        }
    }

    /// Allocates a zero-filled array with the given shape.
    pub fn zeros(context: &GpuContext, shape: &[usize]) -> Result<Self> {
        Self::alloc_zeros(context.backend(), shape.to_vec())
    }

    /// Downloads the buffer into a freshly allocated `Vec<T>`.
    ///
    /// # Errors
    /// Returns `SklearsError::NumericalError` when the backend copy fails.
    pub fn to_cpu(&self) -> Result<Vec<T>> {
        // Download straight into correctly aligned `T` storage. Going through a
        // `Vec<u8>` and casting it to `&[T]` can panic in bytemuck when the byte
        // buffer is not aligned for `T`; viewing `[T]` as bytes cannot fail.
        let mut host: Vec<T> = vec![T::zeroed(); self.size];
        let raw: &mut [u8] = bytemuck::cast_slice_mut(host.as_mut_slice());
        self.backend
            .copy_dtoh(raw, self.ptr)
            .map_err(|e| SklearsError::NumericalError(e.to_string()))?;
        Ok(host)
    }

    /// Downloads the buffer and reshapes it into an `Array2`.
    ///
    /// # Errors
    /// Returns `SklearsError::InvalidOperation` when the array is not 2-D or the
    /// downloaded data does not fit the stored shape, and propagates download
    /// failures from [`GpuArray::to_cpu`].
    pub fn to_array2(&self) -> Result<Array2<T>>
    where
        T: Clone,
    {
        if self.shape.len() != 2 {
            return Err(SklearsError::InvalidOperation(
                "to_array2 requires a 2-D GpuArray".into(),
            ));
        }
        let data = self.to_cpu()?;
        Array2::from_shape_vec((self.shape[0], self.shape[1]), data)
            .map_err(|e| SklearsError::InvalidOperation(format!("from_shape_vec: {e}")))
    }

    /// Returns the stored dimensions.
    pub fn shape(&self) -> &[usize] {
        &self.shape
    }

    /// Returns the number of elements.
    pub fn len(&self) -> usize {
        self.size
    }

    /// Returns `true` when the array holds no elements.
    pub fn is_empty(&self) -> bool {
        self.size == 0
    }

    /// Returns `true` when the owning backend's name is anything other than `"cpu"`.
    pub fn is_gpu_resident(&self) -> bool {
        self.backend.name() != "cpu"
    }

    /// Currently a no-op that always succeeds.
    pub fn to_cpu_only(&mut self) -> Result<()> {
        Ok(())
    }
}
#[cfg(not(feature = "gpu_support"))]
impl<T> GpuArray<T> {
    /// Builds the failure shared by every constructor in builds without `gpu_support`.
    fn feature_disabled() -> Result<Self> {
        Err(SklearsError::NotImplemented(
            "GPU support not enabled".into(),
        ))
    }

    /// Always fails with `SklearsError::NotImplemented`; the arguments are ignored.
    pub fn from_slice(_context: &GpuContext, _data: &[T]) -> Result<Self> {
        Self::feature_disabled()
    }

    /// Always fails with `SklearsError::NotImplemented`; the arguments are ignored.
    pub fn zeros(_context: &GpuContext, _shape: &[usize]) -> Result<Self> {
        Self::feature_disabled()
    }
}
/// Matrix-style operations that return a new value of the implementing type.
///
/// Every method is bounded by `Self: Sized` and reports failure through `Result`.
pub trait GpuMatrixOps {
/// Matrix product of `self` and `other`.
fn matmul(&self, other: &Self) -> Result<Self>
where
Self: Sized;
/// Element-wise sum of `self` and `other`.
fn add(&self, other: &Self) -> Result<Self>
where
Self: Sized;
/// Element-wise product of `self` and `other`.
fn mul(&self, other: &Self) -> Result<Self>
where
Self: Sized;
/// Multiplies every element by `scalar`; the factor is `f32` for all implementors.
fn scale(&self, scalar: f32) -> Result<Self>
where
Self: Sized;
/// Transposed copy of `self`.
fn transpose(&self) -> Result<Self>
where
Self: Sized;
}
/// Intended to compute `C = A * B` for row-major `f64` buffers, where `A` is
/// `m` x `k`, `B` is `k` x `n` and `C` is `m` x `n`.
///
/// The backend call is given `B` before `A` and `n` before `m`, i.e. it is asked
/// for `C^T = B^T * A^T`.
/// NOTE(review): this presumes `ComputeBackend::gemm` follows the column-major
/// BLAS argument convention (m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) —
/// confirm against the backend documentation.
///
/// # Errors
/// Maps any backend failure to `SklearsError::NumericalError`.
#[cfg(feature = "gpu_support")]
fn gemm_row_major_f64(
backend: &Arc<dyn ComputeBackend>,
m: usize,
k: usize,
n: usize,
a_ptr: u64,
b_ptr: u64,
c_ptr: u64,
) -> Result<()> {
backend
.gemm(
BackendTranspose::NoTrans,
BackendTranspose::NoTrans,
// Swapped dimensions: `n` is passed ahead of `m`.
n,
m,
k,
1.0,
// Swapped operands: `B` (leading dimension `n`) is passed ahead of `A` (leading dimension `k`).
b_ptr,
n,
a_ptr,
k,
0.0,
// Output `C` with leading dimension `n`.
c_ptr,
n,
)
.map_err(|e| SklearsError::NumericalError(e.to_string()))
}
#[cfg(feature = "gpu_support")]
impl GpuMatrixOps for GpuArray<f64> {
    /// Multiplies two 2-D arrays through `gemm_row_major_f64`.
    ///
    /// # Errors
    /// `InvalidOperation` when either operand is not 2-D, `ShapeMismatch` when
    /// the inner dimensions differ, and any allocation or backend failure.
    fn matmul(&self, other: &Self) -> Result<Self> {
        let (m, k, k2, n) = match (self.shape.as_slice(), other.shape.as_slice()) {
            (&[m, k], &[k2, n]) => (m, k, k2, n),
            _ => {
                return Err(SklearsError::InvalidOperation(
                    "matmul requires 2-D arrays".into(),
                ))
            }
        };
        if k != k2 {
            return Err(SklearsError::ShapeMismatch {
                expected: format!("k={k}"),
                actual: format!("k2={k2}"),
            });
        }
        let backend = &self.backend;
        let product = GpuArray::<f64>::alloc_zeros(Arc::clone(backend), vec![m, n])?;
        gemm_row_major_f64(backend, m, k, n, self.ptr, other.ptr, product.ptr)?;
        Ok(product)
    }

    /// Element-wise sum; both operands must have identical shapes.
    fn add(&self, other: &Self) -> Result<Self> {
        elementwise_f64(self, other, BinaryOp::Add)
    }

    /// Element-wise product; both operands must have identical shapes.
    fn mul(&self, other: &Self) -> Result<Self> {
        elementwise_f64(self, other, BinaryOp::Mul)
    }

    /// Scales on the host: downloads the data, multiplies each element by
    /// `scalar` widened to `f64`, and uploads the result as a new array.
    fn scale(&self, scalar: f32) -> Result<Self> {
        let factor = f64::from(scalar);
        let scaled: Vec<f64> = self.to_cpu()?.into_iter().map(|v| v * factor).collect();
        Self::alloc_upload(Arc::clone(&self.backend), &scaled, self.shape.clone())
    }

    /// Transposes on the host: downloads the data, rearranges it, and uploads
    /// the result with the two dimensions swapped.
    ///
    /// # Errors
    /// `InvalidOperation` when the array is not 2-D, plus any transfer failure.
    fn transpose(&self) -> Result<Self> {
        let (m, n) = match self.shape.as_slice() {
            &[m, n] => (m, n),
            _ => {
                return Err(SklearsError::InvalidOperation(
                    "transpose requires 2-D array".into(),
                ))
            }
        };
        let host = self.to_cpu()?;
        let src = host.as_slice();
        // Walk the destination in order: output element (j, i) is source element (i, j).
        let dst: Vec<f64> = (0..n)
            .flat_map(|j| (0..m).map(move |i| src[i * n + j]))
            .collect();
        Self::alloc_upload(Arc::clone(&self.backend), &dst, vec![n, m])
    }
}

/// Shared body of `add`/`mul` for `f64` arrays: checks that the shapes match,
/// allocates a zeroed output on `lhs`'s backend and runs the backend's binary
/// kernel over `lhs.size` elements.
#[cfg(feature = "gpu_support")]
fn elementwise_f64(
    lhs: &GpuArray<f64>,
    rhs: &GpuArray<f64>,
    op: BinaryOp,
) -> Result<GpuArray<f64>> {
    if lhs.shape != rhs.shape {
        return Err(SklearsError::ShapeMismatch {
            expected: format!("{:?}", lhs.shape),
            actual: format!("{:?}", rhs.shape),
        });
    }
    let out = GpuArray::<f64>::alloc_zeros(Arc::clone(&lhs.backend), lhs.shape.clone())?;
    match lhs.backend.binary(op, lhs.ptr, rhs.ptr, out.ptr, lhs.size) {
        Ok(_) => Ok(out),
        Err(e) => Err(SklearsError::NumericalError(e.to_string())),
    }
}
#[cfg(feature = "gpu_support")]
impl GpuMatrixOps for GpuArray<f32> {
    /// Multiplies two 2-D arrays by widening both operands to `f64`, running
    /// `gemm_row_major_f64`, and narrowing the result back to `f32`.
    ///
    /// # Errors
    /// `InvalidOperation` when either operand is not 2-D, `ShapeMismatch` when
    /// the inner dimensions differ, and any transfer or backend failure.
    fn matmul(&self, other: &Self) -> Result<Self> {
        let (m, k, k2, n) = match (self.shape.as_slice(), other.shape.as_slice()) {
            (&[m, k], &[k2, n]) => (m, k, k2, n),
            _ => {
                return Err(SklearsError::InvalidOperation(
                    "matmul requires 2-D arrays".into(),
                ))
            }
        };
        if k != k2 {
            return Err(SklearsError::ShapeMismatch {
                expected: format!("k={k}"),
                actual: format!("k2={k2}"),
            });
        }
        let backend = &self.backend;

        // Widen on the host, then stage temporary f64 operands on the backend.
        let lhs_wide: Vec<f64> = self.to_cpu()?.into_iter().map(f64::from).collect();
        let rhs_wide: Vec<f64> = other.to_cpu()?.into_iter().map(f64::from).collect();
        let lhs = GpuArray::<f64>::alloc_upload(Arc::clone(backend), &lhs_wide, self.shape.clone())?;
        let rhs =
            GpuArray::<f64>::alloc_upload(Arc::clone(backend), &rhs_wide, other.shape.clone())?;
        let acc = GpuArray::<f64>::alloc_zeros(Arc::clone(backend), vec![m, n])?;
        gemm_row_major_f64(backend, m, k, n, lhs.ptr, rhs.ptr, acc.ptr)?;

        // Narrow the f64 product back to f32 and upload it as the result.
        let narrowed: Vec<f32> = acc.to_cpu()?.into_iter().map(|v| v as f32).collect();
        Self::alloc_upload(Arc::clone(backend), &narrowed, vec![m, n])
    }

    /// Element-wise sum; both operands must have identical shapes.
    fn add(&self, other: &Self) -> Result<Self> {
        elementwise_f32(self, other, BinaryOp::Add)
    }

    /// Element-wise product; both operands must have identical shapes.
    fn mul(&self, other: &Self) -> Result<Self> {
        elementwise_f32(self, other, BinaryOp::Mul)
    }

    /// Scales on the host: downloads the data, multiplies each element by
    /// `scalar`, and uploads the result as a new array.
    fn scale(&self, scalar: f32) -> Result<Self> {
        let scaled: Vec<f32> = self.to_cpu()?.into_iter().map(|v| v * scalar).collect();
        Self::alloc_upload(Arc::clone(&self.backend), &scaled, self.shape.clone())
    }

    /// Transposes on the host: downloads the data, rearranges it, and uploads
    /// the result with the two dimensions swapped.
    ///
    /// # Errors
    /// `InvalidOperation` when the array is not 2-D, plus any transfer failure.
    fn transpose(&self) -> Result<Self> {
        let (m, n) = match self.shape.as_slice() {
            &[m, n] => (m, n),
            _ => {
                return Err(SklearsError::InvalidOperation(
                    "transpose requires 2-D array".into(),
                ))
            }
        };
        let host = self.to_cpu()?;
        let src = host.as_slice();
        // Walk the destination in order: output element (j, i) is source element (i, j).
        let dst: Vec<f32> = (0..n)
            .flat_map(|j| (0..m).map(move |i| src[i * n + j]))
            .collect();
        Self::alloc_upload(Arc::clone(&self.backend), &dst, vec![n, m])
    }
}

/// Shared body of `add`/`mul` for `f32` arrays: checks that the shapes match,
/// allocates a zeroed output on `lhs`'s backend and runs the backend's binary
/// kernel over `lhs.size` elements.
///
/// NOTE(review): `matmul` above widens to `f64` before calling the backend,
/// whereas this path hands the `f32` buffers to `binary` directly with only an
/// element count — confirm that the backend's `binary` interprets the buffers
/// as `f32`.
#[cfg(feature = "gpu_support")]
fn elementwise_f32(
    lhs: &GpuArray<f32>,
    rhs: &GpuArray<f32>,
    op: BinaryOp,
) -> Result<GpuArray<f32>> {
    if lhs.shape != rhs.shape {
        return Err(SklearsError::ShapeMismatch {
            expected: format!("{:?}", lhs.shape),
            actual: format!("{:?}", rhs.shape),
        });
    }
    let out = GpuArray::<f32>::alloc_zeros(Arc::clone(&lhs.backend), lhs.shape.clone())?;
    match lhs.backend.binary(op, lhs.ptr, rhs.ptr, out.ptr, lhs.size) {
        Ok(_) => Ok(out),
        Err(e) => Err(SklearsError::NumericalError(e.to_string())),
    }
}
#[cfg(not(feature = "gpu_support"))]
impl<T> GpuMatrixOps for GpuArray<T> {
fn matmul(&self, _other: &Self) -> Result<Self> {
Err(SklearsError::from("GPU support not enabled"))
}
fn add(&self, _other: &Self) -> Result<Self> {
Err(SklearsError::from("GPU support not enabled"))
}
fn mul(&self, _other: &Self) -> Result<Self> {
Err(SklearsError::from("GPU support not enabled"))
}
fn scale(&self, _scalar: f32) -> Result<Self> {
Err(SklearsError::from("GPU support not enabled"))
}
fn transpose(&self) -> Result<Self> {
Err(SklearsError::from("GPU support not enabled"))
}
}
/// Namespace for device discovery helpers.
pub struct GpuUtils;

impl GpuUtils {
    /// Returns `true` when the default backend registry lists at least one
    /// backend kind that identifies itself as a GPU. Always `false` without
    /// the `gpu_support` feature.
    pub fn is_gpu_available() -> bool {
        #[cfg(feature = "gpu_support")]
        {
            let defaults = BackendRegistry::with_defaults();
            let has_gpu_kind = defaults.available_kinds().iter().any(|kind| kind.is_gpu());
            has_gpu_kind
        }
        #[cfg(not(feature = "gpu_support"))]
        {
            false
        }
    }

    /// Returns how many devices the backend of a default `GpuContext` lists.
    ///
    /// Yields `0` when the context cannot be created, when the device query
    /// fails, or when the crate is built without `gpu_support`.
    pub fn device_count() -> usize {
        #[cfg(feature = "gpu_support")]
        {
            match GpuContext::new() {
                Ok(ctx) => ctx
                    .backend()
                    .available_devices()
                    .map(|devices| devices.len())
                    .unwrap_or(0),
                Err(_) => 0,
            }
        }
        #[cfg(not(feature = "gpu_support"))]
        {
            0
        }
    }

    /// Looks up the device at position `device_id` in the default context's
    /// device list.
    ///
    /// `free_memory` is filled with the device's total memory and
    /// `compute_capability` is always `(0, 0)`.
    ///
    /// # Errors
    /// `NumericalError` when the device query fails, `InvalidInput` when no
    /// device exists at `device_id`, plus any context-creation failure.
    #[cfg(feature = "gpu_support")]
    pub fn device_properties(device_id: usize) -> Result<GpuDeviceProperties> {
        let ctx = GpuContext::new()?;
        let listed = ctx.backend().available_devices();
        let devices = match listed {
            Ok(found) => found,
            Err(e) => return Err(SklearsError::NumericalError(e.to_string())),
        };
        let info: DeviceInfo = match devices.into_iter().nth(device_id) {
            Some(entry) => entry,
            None => {
                return Err(SklearsError::InvalidInput(format!(
                    "no device at index {device_id}"
                )))
            }
        };
        let total_memory = info.total_memory_bytes as usize;
        Ok(GpuDeviceProperties {
            device_id,
            name: info.name,
            total_memory,
            free_memory: total_memory,
            compute_capability: (0, 0),
        })
    }

    /// Always fails: GPU support is not compiled in.
    #[cfg(not(feature = "gpu_support"))]
    pub fn device_properties(_device_id: usize) -> Result<GpuDeviceProperties> {
        Err(SklearsError::from("GPU support not enabled"))
    }
}
/// Description of one device as returned by `GpuUtils::device_properties`.
#[derive(Debug, Clone)]
pub struct GpuDeviceProperties {
/// Index of the device in the backend's device list.
pub device_id: usize,
/// Device name as reported by the backend.
pub name: String,
/// Total device memory, taken from the backend's `total_memory_bytes`.
pub total_memory: usize,
/// Free device memory; `GpuUtils::device_properties` fills this with the total.
pub free_memory: usize,
/// Compute capability pair; `GpuUtils::device_properties` always sets `(0, 0)`.
pub compute_capability: (i32, i32),
}
/// Namespace for host/device transfer heuristics.
pub struct MemoryTransferOpts;

impl MemoryTransferOpts {
    /// Picks a transfer strategy from the payload size alone.
    ///
    /// Payloads under 1 MiB are synchronous, payloads under 100 MiB are
    /// asynchronous, and anything larger is chunked into 10 MiB pieces.
    pub fn optimal_transfer_strategy(size_bytes: usize) -> TransferStrategy {
        const MIB: usize = 1024 * 1024;
        match size_bytes {
            small if small < MIB => TransferStrategy::Synchronous,
            medium if medium < 100 * MIB => TransferStrategy::Asynchronous,
            _ => TransferStrategy::Chunked {
                chunk_size: 10 * MIB,
            },
        }
    }

    /// Estimates the transfer duration in seconds.
    ///
    /// `pcie_bandwidth_gbps` is treated as gigabits per second: it is scaled
    /// by `1e9` and divided by 8 to obtain bytes per second.
    pub fn estimate_transfer_time(size_bytes: usize, pcie_bandwidth_gbps: f32) -> f32 {
        let bits_per_sec = pcie_bandwidth_gbps * 1e9;
        let bytes_per_sec = bits_per_sec / 8.0;
        size_bytes as f32 / bytes_per_sec
    }
}

/// How a host/device copy should be carried out.
#[derive(Debug, Clone, Copy)]
pub enum TransferStrategy {
    /// Single blocking copy.
    Synchronous,
    /// Single non-blocking copy.
    Asynchronous,
    /// Copy split into pieces of `chunk_size` bytes.
    Chunked { chunk_size: usize },
}
// Unit tests for this module. Tests that need a backend are gated on `gpu_support`.
// NOTE(review): every test name here is snake_case, so this `allow` looks unnecessary — confirm.
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
use super::*;
// Smoke test: only checks that the discovery helpers can be called; nothing is asserted.
#[test]
fn test_gpu_availability() {
let _gpu_available = GpuUtils::is_gpu_available();
let _device_count = GpuUtils::device_count();
}
// One representative size for each of the three strategy bands.
#[test]
fn test_memory_transfer_strategy() {
assert!(matches!(
MemoryTransferOpts::optimal_transfer_strategy(1000),
TransferStrategy::Synchronous
));
assert!(matches!(
MemoryTransferOpts::optimal_transfer_strategy(10_000_000),
TransferStrategy::Asynchronous
));
assert!(matches!(
MemoryTransferOpts::optimal_transfer_strategy(200_000_000),
TransferStrategy::Chunked { chunk_size: _ }
));
}
// Sanity bounds only: the estimate must be positive and below one second.
#[test]
fn test_transfer_time_estimation() {
let time = MemoryTransferOpts::estimate_transfer_time(1_000_000, 16.0);
assert!(time > 0.0);
assert!(time < 1.0);
}
// A default context must be constructible and report a non-zero memory total.
#[cfg(feature = "gpu_support")]
#[test]
fn test_gpu_context_creation() {
let ctx = GpuContext::new().expect("CpuBackend always succeeds");
let info = ctx.memory_info().expect("memory_info should succeed");
assert!(info.total > 0);
}
// Upload followed by download must return the original values unchanged.
#[cfg(feature = "gpu_support")]
#[test]
fn test_gpu_array_roundtrip_f64() {
let ctx = GpuContext::new().unwrap();
let data = vec![1.0f64, 2.0, 3.0, 4.0];
let arr = GpuArray::<f64>::from_slice(&ctx, &data).unwrap();
assert_eq!(arr.len(), 4);
let back = arr.to_cpu().unwrap();
assert_eq!(back, data);
}
// [[1, 2], [3, 4]] * [[5, 6], [7, 8]] = [[19, 22], [43, 50]], read back in row-major order.
#[cfg(feature = "gpu_support")]
#[test]
fn test_gpu_matmul_2x2_f64() {
use crate::gpu::GpuMatrixOps;
let ctx = GpuContext::new().unwrap();
let a = GpuArray::<f64>::alloc_upload(ctx.backend(), &[1.0, 2.0, 3.0, 4.0], vec![2, 2])
.unwrap();
let b = GpuArray::<f64>::alloc_upload(ctx.backend(), &[5.0, 6.0, 7.0, 8.0], vec![2, 2])
.unwrap();
let c = a.matmul(&b).unwrap();
let r = c.to_cpu().unwrap();
assert!((r[0] - 19.0).abs() < 1e-10, "C[0,0]={}", r[0]);
assert!((r[1] - 22.0).abs() < 1e-10, "C[0,1]={}", r[1]);
assert!((r[2] - 43.0).abs() < 1e-10, "C[1,0]={}", r[2]);
assert!((r[3] - 50.0).abs() < 1e-10, "C[1,1]={}", r[3]);
}
// Scaling by 2 must double every element.
#[cfg(feature = "gpu_support")]
#[test]
fn test_gpu_array_scale_f64() {
use crate::gpu::GpuMatrixOps;
let ctx = GpuContext::new().unwrap();
let arr = GpuArray::<f64>::from_slice(&ctx, &[1.0, 2.0, 3.0, 4.0]).unwrap();
let scaled = arr.scale(2.0).unwrap();
let back = scaled.to_cpu().unwrap();
assert_eq!(back, vec![2.0, 4.0, 6.0, 8.0]);
}
}