use super::common::*;
use std::ptr;
use std::sync::Arc;
#[cfg(feature = "opencl")]
pub mod opencl_impl {
use super::*;
/// Signed 32-bit integer; every mock OpenCL call in this module reports its status as a `ClInt`.
type ClInt = i32;
/// Unsigned 32-bit integer used for info-query parameter codes and numeric device properties.
type ClUInt = u32;
/// Unsigned 64-bit integer used for the bit-flag constants (device type, memory flags) and byte sizes.
type ClULong = u64;
/// Boolean encoded as an unsigned 32-bit integer; the code in this module passes and stores `0` or `1`.
type ClBool = u32;
/// Copyable wrapper around a raw, untyped OpenCL handle pointer.
///
/// The wrapper only stores the address; it offers no way to dereference it.
/// It exists so that handles can be embedded in types that must be
/// `Send`/`Sync`.
#[derive(Debug, Clone, Copy)]
pub struct SafeClPtr(pub *mut std::ffi::c_void);

// SAFETY: the wrapper itself only stores and copies an address and never
// dereferences it, so moving or sharing the value across threads cannot cause
// a data race on its own.
// NOTE(review): whether the OpenCL object behind a real handle may be used
// from several threads depends on the OpenCL implementation — confirm before
// replacing the mock shims with real FFI calls.
unsafe impl Send for SafeClPtr {}
unsafe impl Sync for SafeClPtr {}

impl SafeClPtr {
    /// Wraps `ptr` as-is; no validation is performed and null is accepted.
    fn new(ptr: *mut std::ffi::c_void) -> Self {
        SafeClPtr(ptr)
    }

    /// Returns the wrapped raw pointer unchanged.
    fn as_ptr(self) -> *mut std::ffi::c_void {
        let SafeClPtr(raw) = self;
        raw
    }
}
// Handle aliases. Every kind of OpenCL object handle shares the `SafeClPtr`
// representation, so these names document intent only: the compiler will not
// reject passing one handle kind where another is expected.
/// Handle identifying a platform.
type ClPlatformId = SafeClPtr;
/// Handle identifying a device.
type ClDeviceId = SafeClPtr;
/// Handle to a context.
type ClContext = SafeClPtr;
/// Handle to a command queue.
type ClCommandQueue = SafeClPtr;
/// Handle to a program (used as the value type of `OpenClContextData::program_cache`).
type ClProgram = SafeClPtr;
/// Handle to a kernel (used as the value type of `OpenClContextData::kernel_cache`).
type ClKernel = SafeClPtr;
/// Handle to a memory object (buffer).
type ClMem = SafeClPtr;
/// Handle to an event.
type ClEvent = SafeClPtr;
/// Status value that the code in this module treats as "call succeeded".
const CL_SUCCESS: ClInt = 0;
/// Status value reported when no matching device exists.
const CL_DEVICE_NOT_FOUND: ClInt = -1;
/// Device-type bit selecting GPU devices.
const CL_DEVICE_TYPE_GPU: ClULong = 1 << 2;
/// Device-type bit selecting CPU devices.
const CL_DEVICE_TYPE_CPU: ClULong = 1 << 1;
/// Device-type mask selecting every device kind.
const CL_DEVICE_TYPE_ALL: ClULong = 0xFFFFFFFF;
/// Memory flag: the buffer may be both read and written by kernels.
const CL_MEM_READ_WRITE: ClULong = 1 << 0;
/// Memory flag: initialise the buffer by copying from a host pointer.
///
/// The OpenCL headers define this flag as bit 5. Bit 2 (the previous value
/// here) is `CL_MEM_READ_ONLY`, so passing the old constant to a real driver
/// would have requested a read-only buffer instead of a host copy.
const CL_MEM_COPY_HOST_PTR: ClULong = 1 << 5;
/// Mock platform enumeration.
///
/// Always reports `CL_SUCCESS` together with an empty platform list; no
/// OpenCL runtime is contacted.
fn cl_get_platform_ids() -> (ClInt, Vec<ClPlatformId>) {
    let platform_ids: Vec<ClPlatformId> = Vec::new();
    (CL_SUCCESS, platform_ids)
}
/// Mock device enumeration for one platform.
///
/// Both arguments are ignored; the result is always `CL_SUCCESS` with an
/// empty device list.
fn cl_get_device_ids(
    _platform: ClPlatformId,
    _device_type: ClULong,
) -> (ClInt, Vec<ClDeviceId>) {
    let device_ids: Vec<ClDeviceId> = Vec::new();
    (CL_SUCCESS, device_ids)
}
/// Mock device-info query.
///
/// Ignores both arguments and returns `CL_SUCCESS` with a 256-byte,
/// all-zero payload.
fn cl_get_device_info(_device: ClDeviceId, _param_name: ClUInt) -> (ClInt, Vec<u8>) {
    let payload = vec![0u8; 256];
    (CL_SUCCESS, payload)
}
/// Mock platform-info query.
///
/// Ignores both arguments; every query succeeds and yields the same
/// placeholder string.
fn cl_get_platform_info(_platform: ClPlatformId, _param_name: ClUInt) -> (ClInt, String) {
    let text = String::from("Mock Platform");
    (CL_SUCCESS, text)
}
fn cl_create_context(_devices: &[ClDeviceId]) -> (ClInt, ClContext) {
(CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
}
fn cl_create_command_queue(
_context: ClContext,
_device: ClDeviceId,
) -> (ClInt, ClCommandQueue) {
(CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
}
fn cl_create_buffer(_context: ClContext, _flags: ClULong, _size: usize) -> (ClInt, ClMem) {
(CL_SUCCESS, SafeClPtr::new(ptr::null_mut()))
}
/// Mock host-to-device transfer.
///
/// Every argument is ignored and nothing is read from `_ptr`; the call
/// always reports `CL_SUCCESS`.
fn cl_enqueue_write_buffer(
    _queue: ClCommandQueue,
    _buffer: ClMem,
    _blocking: ClBool,
    _offset: usize,
    _size: usize,
    _ptr: *const std::ffi::c_void,
) -> ClInt {
    CL_SUCCESS
}
/// Mock device-to-host transfer.
///
/// Every argument is ignored and nothing is written through `_ptr`, so the
/// destination memory keeps its previous contents; the call always reports
/// `CL_SUCCESS`.
fn cl_enqueue_read_buffer(
    _queue: ClCommandQueue,
    _buffer: ClMem,
    _blocking: ClBool,
    _offset: usize,
    _size: usize,
    _ptr: *mut std::ffi::c_void,
) -> ClInt {
    CL_SUCCESS
}
/// Mock queue synchronisation.
///
/// Ignores the queue handle and reports `CL_SUCCESS` immediately.
fn cl_finish(_queue: ClCommandQueue) -> ClInt {
    CL_SUCCESS
}
/// Mock release of a memory object.
///
/// Ignores the handle and reports `CL_SUCCESS`; nothing is freed.
fn cl_release_mem_object(_memobj: ClMem) -> ClInt {
    CL_SUCCESS
}
/// OpenCL backend: the platforms and devices discovered by `OpenClBackend::new`,
/// plus backend-wide version and extension strings.
pub struct OpenClBackend {
/// Platforms for which every info and device query in `new` succeeded.
platforms: Vec<OpenClPlatform>,
/// Flat list of all devices across all platforms; `OpenClPlatform::devices`
/// stores indices into this list.
devices: Vec<OpenClDeviceInfo>,
/// Context cache keyed by `usize`. `new` creates it empty and no code visible
/// in this module inserts into it.
/// NOTE(review): presumably keyed by device index — confirm before using it.
context_cache: HashMap<usize, Arc<OpenClContextData>>,
/// Version string returned by `opencl_version()`; `new` sets a fixed literal.
opencl_version: String,
/// Extension names consulted by the `supports_*` helpers; `new` fills in a
/// fixed list.
extensions: Vec<String>,
}
/// Description of one platform as recorded by `OpenClBackend::new`.
#[derive(Debug, Clone)]
pub struct OpenClPlatform {
/// Handle the platform was enumerated with.
platform_id: ClPlatformId,
/// Platform name as returned by the name query.
name: String,
/// Platform vendor as returned by the vendor query.
vendor: String,
/// Platform version as returned by the version query.
version: String,
/// Profile string; `OpenClBackend::new` sets a fixed literal rather than querying it.
profile: String,
/// Extension names; `OpenClBackend::new` sets a fixed list rather than querying it.
extensions: Vec<String>,
/// Indices into the backend's flat device list for the devices of this platform.
devices: Vec<usize>, }
/// Full property record for one device.
///
/// In this module every value except `device_id`, `platform_index` and
/// `platform_id` is a fixed mock constant filled in by
/// `OpenClBackend::get_device_info`.
/// NOTE(review): the field names appear to mirror the OpenCL device-info
/// queries of the same names; units are not stated anywhere in this file —
/// confirm against the OpenCL specification before relying on them.
#[derive(Debug, Clone)]
struct OpenClDeviceInfo {
// --- Identity -------------------------------------------------------------
/// Handle the device was enumerated with.
device_id: ClDeviceId,
/// Position of the owning platform in the enumeration performed by `OpenClBackend::new`.
platform_index: usize,
/// Device-type bit (compared against the `CL_DEVICE_TYPE_*` constants).
device_type: ClULong,
name: String,
vendor: String,
driver_version: String,
device_version: String,
opencl_c_version: String,
// --- Compute capabilities -------------------------------------------------
max_compute_units: ClUInt,
max_work_groupsize: usize,
max_work_item_dimensions: ClUInt,
/// One entry per work-item dimension.
max_work_itemsizes: Vec<usize>,
preferred_vector_width_char: ClUInt,
preferred_vector_width_short: ClUInt,
preferred_vector_width_int: ClUInt,
preferred_vector_width_long: ClUInt,
/// Also reported as `warpsize` in the generic `GpuDeviceInfo`.
preferred_vector_width_float: ClUInt,
preferred_vector_width_double: ClUInt,
/// Also feeds the memory-bandwidth estimate in the generic `GpuDeviceInfo`.
max_clock_frequency: ClUInt,
address_bits: ClUInt,
max_mem_allocsize: ClULong,
// --- Image support --------------------------------------------------------
image_support: ClBool,
max_read_image_args: ClUInt,
max_write_image_args: ClUInt,
image2d_max_width: usize,
image2d_max_height: usize,
image3d_max_width: usize,
image3d_max_height: usize,
image3d_max_depth: usize,
max_samplers: ClUInt,
// --- Memory hierarchy -----------------------------------------------------
max_parametersize: usize,
mem_base_addr_align: ClUInt,
min_data_type_alignsize: ClUInt,
single_fp_config: ClULong,
global_mem_cache_type: ClUInt,
global_mem_cachelinesize: ClUInt,
/// Reported as `l2_cachesize` in the generic `GpuDeviceInfo`.
global_mem_cachesize: ClULong,
/// Reported as `total_memory` in the generic `GpuDeviceInfo`.
global_memsize: ClULong,
max_constant_buffersize: ClULong,
max_constant_args: ClUInt,
local_mem_type: ClUInt,
/// Reported as `shared_memory_per_block` in the generic `GpuDeviceInfo`.
local_memsize: ClULong,
// --- Miscellaneous flags --------------------------------------------------
error_correction_support: ClBool,
profiling_timer_resolution: usize,
endian_little: ClBool,
available: ClBool,
compiler_available: ClBool,
execution_capabilities: ClULong,
queue_properties: ClULong,
/// Handle of the owning platform.
platform_id: ClPlatformId,
}
/// Handles and metadata shared (via `Arc`) by everything that belongs to one
/// created context.
#[derive(Debug)]
struct OpenClContextData {
/// Context handle returned by `cl_create_context`.
context: ClContext,
/// Handle of the device the context was created for.
device_id: ClDeviceId,
/// Command queue created for `context` and `device_id`.
command_queue: ClCommandQueue,
/// Copy of the device's property record taken when the context was created.
device_info: OpenClDeviceInfo,
/// Kernel handles keyed by `String`; created empty and not filled by any
/// code visible in this module.
kernel_cache: std::collections::HashMap<String, ClKernel>,
/// Program handles keyed by `String`; created empty and not filled by any
/// code visible in this module.
program_cache: std::collections::HashMap<String, ClProgram>,
}
impl OpenClBackend {
pub fn new() -> LinalgResult<Self> {
let (result, platform_ids) = cl_get_platform_ids();
if result != CL_SUCCESS {
if result == CL_DEVICE_NOT_FOUND {
return Err(LinalgError::ComputationError(
"No OpenCL platforms found".to_string(),
));
}
return Err(LinalgError::ComputationError(format!(
"Failed to get OpenCL platforms: error code {}",
result
)));
}
let mut platforms = Vec::new();
let mut all_devices = Vec::new();
for (platform_idx, &platform_id) in platform_ids.iter().enumerate() {
let (result, platform_name) = cl_get_platform_info(platform_id, 0x0902); if result != CL_SUCCESS {
continue;
}
let (result, platform_vendor) = cl_get_platform_info(platform_id, 0x0903); if result != CL_SUCCESS {
continue;
}
let (result, platform_version) = cl_get_platform_info(platform_id, 0x0901); if result != CL_SUCCESS {
continue;
}
let (result, device_ids) = cl_get_device_ids(platform_id, CL_DEVICE_TYPE_ALL);
if result != CL_SUCCESS {
continue;
}
let mut platform_device_indices = Vec::new();
for device_id in device_ids {
let device_info = Self::get_device_info(device_id, platform_idx, platform_id)?;
platform_device_indices.push(all_devices.len());
all_devices.push(device_info);
}
platforms.push(OpenClPlatform {
platform_id,
name: platform_name,
vendor: platform_vendor,
version: platform_version,
profile: "FULL_PROFILE".to_string(), extensions: vec!["cl_khr_fp64".to_string(), "cl_khr_fp16".to_string()], devices: platform_device_indices,
});
}
Ok(Self {
platforms,
devices: all_devices,
context_cache: HashMap::new(),
opencl_version: "OpenCL 2.1".to_string(), extensions: vec![
"cl_khr_fp64".to_string(),
"cl_khr_fp16".to_string(),
"cl_khr_global_int32_base_atomics".to_string(),
"cl_khr_global_int32_extended_atomics".to_string(),
],
})
}
fn get_device_info(
device_id: ClDeviceId,
platform_index: usize,
platform_id: ClPlatformId,
) -> LinalgResult<OpenClDeviceInfo> {
Ok(OpenClDeviceInfo {
device_id,
platform_index,
device_type: CL_DEVICE_TYPE_GPU,
name: "Mock OpenCL GPU Device".to_string(),
vendor: "Mock Vendor".to_string(),
driver_version: "1.0.0".to_string(),
device_version: "OpenCL 2.1".to_string(),
opencl_c_version: "OpenCL C 2.0".to_string(),
max_compute_units: 32,
max_work_groupsize: 1024,
max_work_item_dimensions: 3,
max_work_itemsizes: vec![1024, 1024, 64],
preferred_vector_width_char: 16,
preferred_vector_width_short: 8,
preferred_vector_width_int: 4,
preferred_vector_width_long: 2,
preferred_vector_width_float: 4,
preferred_vector_width_double: 2,
max_clock_frequency: 1500,
address_bits: 64,
max_mem_allocsize: {
#[cfg(target_pointer_width = "32")]
{
512u64 * 1024 * 1024
} #[cfg(target_pointer_width = "64")]
{
2u64 * 1024 * 1024 * 1024
} },
image_support: 1,
max_read_image_args: 128,
max_write_image_args: 64,
image2d_max_width: 16384,
image2d_max_height: 16384,
image3d_max_width: 2048,
image3d_max_height: 2048,
image3d_max_depth: 2048,
max_samplers: 16,
max_parametersize: 1024,
mem_base_addr_align: 1024,
min_data_type_alignsize: 128,
single_fp_config: 0x3F, global_mem_cache_type: 2, global_mem_cachelinesize: 64,
global_mem_cachesize: 2u64 * 1024 * 1024, global_memsize: {
#[cfg(target_pointer_width = "32")]
{
512u64 * 1024 * 1024
} #[cfg(target_pointer_width = "64")]
{
8u64 * 1024 * 1024 * 1024
} },
max_constant_buffersize: 64u64 * 1024, max_constant_args: 8,
local_mem_type: 1, local_memsize: 48u64 * 1024, error_correction_support: 0,
profiling_timer_resolution: 1,
endian_little: 1,
available: 1,
compiler_available: 1,
execution_capabilities: 1, queue_properties: 2, platform_id,
})
}
pub fn platforms(&self) -> &[OpenClPlatform] {
&self.platforms
}
pub fn platform(&self, index: usize) -> Option<&OpenClPlatform> {
self.platforms.get(index)
}
pub fn supports_double_precision(&self) -> bool {
self.extensions.contains(&"cl_khr_fp64".to_string())
}
pub fn supports_half_precision(&self) -> bool {
self.extensions.contains(&"cl_khr_fp16".to_string())
}
pub fn opencl_version(&self) -> &str {
&self.opencl_version
}
}
impl GpuBackend for OpenClBackend {
    /// Fixed backend name.
    fn name(&self) -> &str {
        "OpenCL"
    }

    /// The backend is usable only when at least one platform and at least one
    /// device were recorded.
    fn is_available(&self) -> bool {
        let nothing_found = self.platforms.is_empty() || self.devices.is_empty();
        !nothing_found
    }

    /// Translates every recorded device into the backend-independent
    /// `GpuDeviceInfo` form.
    ///
    /// Precision support comes from the backend-wide extension list, so it is
    /// the same for every device. This implementation never returns `Err`.
    fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
        let fp64 = self.supports_double_precision();
        let fp16 = self.supports_half_precision();

        let mut listed = Vec::with_capacity(self.devices.len());
        for device in &self.devices {
            // Rough estimate derived from the clock frequency alone.
            let memory_bandwidth = (device.max_clock_frequency as f64 * 256.0) / 1000.0;
            listed.push(GpuDeviceInfo {
                // GPU, CPU and any other OpenCL device kind are all reported
                // under the single `OpenCl` device type.
                device_type: GpuDeviceType::OpenCl,
                name: format!("{} ({})", device.name, device.vendor),
                total_memory: device.global_memsize as usize,
                compute_units: device.max_compute_units,
                clock_frequency: device.max_clock_frequency,
                supports_fp64: fp64,
                supports_fp16: fp16,
                max_work_groupsize: device.max_work_groupsize,
                memory_bandwidth,
                l2_cachesize: device.global_mem_cachesize as usize,
                shared_memory_per_block: device.local_memsize as usize,
                registers_per_block: 0,
                warpsize: device.preferred_vector_width_float,
                max_threads_per_mp: device.max_work_groupsize as u32,
                multiprocessor_count: device.max_compute_units,
                supports_tensor_cores: false,
                supports_mixed_precision: fp16,
                vendor: device.vendor.clone(),
            });
        }
        Ok(listed)
    }

    /// Creates a context and a command queue for the device at `device_id`
    /// (an index into the backend's device list).
    ///
    /// # Errors
    ///
    /// Returns `LinalgError::ComputationError` when `device_id` is out of
    /// range, or when context or command-queue creation reports a status
    /// other than `CL_SUCCESS`.
    fn create_context(&self, device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
        let Some(device) = self.devices.get(device_id) else {
            return Err(LinalgError::ComputationError(format!(
                "Invalid device ID: {} (available devices: {})",
                device_id,
                self.devices.len()
            )));
        };
        let device_handle = device.device_id;

        let (status, context) = cl_create_context(&[device_handle]);
        if status != CL_SUCCESS {
            return Err(LinalgError::ComputationError(format!(
                "Failed to create OpenCL context: error code {}",
                status
            )));
        }

        let (status, command_queue) = cl_create_command_queue(context, device_handle);
        if status != CL_SUCCESS {
            return Err(LinalgError::ComputationError(format!(
                "Failed to create OpenCL command queue: error code {}",
                status
            )));
        }

        let shared = OpenClContextData {
            context,
            device_id: device_handle,
            command_queue,
            device_info: device.clone(),
            kernel_cache: HashMap::new(),
            program_cache: HashMap::new(),
        };
        Ok(Box::new(OpenClContext::new(Arc::new(shared), device_id)))
    }
}
#[derive(Debug)]
pub struct OpenClContext {
context_data: Arc<OpenClContextData>,
device_index: usize,
memory_pool: OpenClMemoryPool,
performance_stats: OpenClPerformanceStats,
kernel_compilation_cache: HashMap<String, String>, }
impl OpenClContext {
fn new(context_data: Arc<OpenClContextData>, device_index: usize) -> Self {
let memory_pool = OpenClMemoryPool::new(context_data.context);
let performance_stats = OpenClPerformanceStats::new();
Self {
context_data,
device_index,
memory_pool,
performance_stats,
kernel_compilation_cache: HashMap::new(),
}
}
pub fn cl_context(&self) -> ClContext {
self.context_data.context
}
pub fn command_queue(&self) -> ClCommandQueue {
self.context_data.command_queue
}
pub fn compile_kernel(
&mut self,
_kernel_name: &str,
_source: &str,
) -> LinalgResult<ClKernel> {
Ok(SafeClPtr(ptr::null_mut()))
}
pub fn performance_stats(&self) -> &OpenClPerformanceStats {
&self.performance_stats
}
}
impl GpuContext for OpenClContext {
#[allow(static_mut_refs)]
fn device_info(&self) -> &GpuDeviceInfo {
static mut CACHED_INFO: Option<GpuDeviceInfo> = None;
unsafe {
if CACHED_INFO.is_none() {
let opencl_device = &self.context_data.device_info;
let memory_bandwidth =
(opencl_device.max_clock_frequency as f64 * 256.0) / 1000.0;
CACHED_INFO = Some(GpuDeviceInfo {
device_type: GpuDeviceType::OpenCl,
name: format!("{} ({})", opencl_device.name, opencl_device.vendor),
total_memory: opencl_device.global_memsize as usize,
compute_units: opencl_device.max_compute_units,
clock_frequency: opencl_device.max_clock_frequency,
supports_fp64: true, supports_fp16: true, max_work_groupsize: opencl_device.max_work_groupsize,
memory_bandwidth,
l2_cachesize: opencl_device.global_mem_cachesize as usize,
shared_memory_per_block: opencl_device.local_memsize as usize,
registers_per_block: 0,
warpsize: opencl_device.preferred_vector_width_float,
max_threads_per_mp: opencl_device.max_work_groupsize as u32,
multiprocessor_count: opencl_device.max_compute_units,
supports_tensor_cores: false,
supports_mixed_precision: true,
vendor: opencl_device.vendor.clone(),
});
}
CACHED_INFO.as_ref().expect("Operation failed")
}
}
fn synchronize(&self) -> LinalgResult<()> {
let result = cl_finish(self.context_data.command_queue);
if result != CL_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"OpenCL synchronization failed: error code {}",
result
)));
}
Ok(())
}
fn available_memory(&self) -> LinalgResult<usize> {
Ok(self.context_data.device_info.global_memsize as usize / 2)
}
}
impl GpuContextAlloc for OpenClContext {
    /// Allocates a device buffer holding `size` elements of `T` on this
    /// context's command queue and returns it as a boxed `GpuBuffer`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `OpenClBuffer::new` unchanged.
    fn allocate_buffer<T: Clone + Send + Sync + Copy + 'static + std::fmt::Debug>(
        &self,
        size: usize,
    ) -> LinalgResult<Box<dyn GpuBuffer<T>>> {
        let shared = &self.context_data;
        OpenClBuffer::<T>::new(size, shared.context, shared.command_queue)
            .map(|buffer| Box::new(buffer) as Box<dyn GpuBuffer<T>>)
    }
}
/// Size-bucketed pool of device buffers bound to one context.
///
/// Buffers handed back through `deallocate` are kept per exact byte size and
/// reused by later `allocate` calls for the same size.
#[derive(Debug)]
struct OpenClMemoryPool {
    /// Context in which new buffers are created.
    context: ClContext,
    /// Bytes currently handed out to callers (pooled buffers are not counted).
    total_allocated: usize,
    /// Largest value `total_allocated` has reached.
    peak_usage: usize,
    /// Returned buffers awaiting reuse, keyed by their exact size in bytes.
    free_buffers: HashMap<usize, Vec<ClMem>>,
}

impl OpenClMemoryPool {
    /// Creates an empty pool for `context`.
    fn new(context: ClContext) -> Self {
        Self {
            context,
            total_allocated: 0,
            peak_usage: 0,
            free_buffers: HashMap::new(),
        }
    }

    /// Hands out a buffer of exactly `size` bytes, reusing a pooled buffer of
    /// that size when one is available and creating a new one otherwise.
    ///
    /// # Errors
    ///
    /// Returns `LinalgError::ComputationError` when a new buffer has to be
    /// created and `cl_create_buffer` reports a status other than
    /// `CL_SUCCESS`. The usage counters are left untouched in that case.
    #[allow(dead_code)]
    fn allocate(&mut self, size: usize) -> LinalgResult<ClMem> {
        let recycled = self
            .free_buffers
            .get_mut(&size)
            .and_then(|bucket| bucket.pop());

        let buffer = match recycled {
            Some(buffer) => buffer,
            None => {
                let (status, buffer) = cl_create_buffer(self.context, CL_MEM_READ_WRITE, size);
                if status != CL_SUCCESS {
                    return Err(LinalgError::ComputationError(format!(
                        "OpenCL buffer allocation failed: error code {}",
                        status
                    )));
                }
                buffer
            }
        };

        // Count the buffer as in use on BOTH paths. `deallocate` subtracted
        // its size when it went into the pool, so skipping the recycled path
        // (as the code previously did) made the in-use and peak figures drift
        // below the real values after every reuse.
        self.total_allocated += size;
        self.peak_usage = self.peak_usage.max(self.total_allocated);
        Ok(buffer)
    }

    /// Returns `buffer` (of `size` bytes) to the pool for later reuse and
    /// removes it from the in-use total. The buffer is not released.
    #[allow(dead_code)]
    fn deallocate(&mut self, buffer: ClMem, size: usize) {
        self.free_buffers.entry(size).or_default().push(buffer);
        self.total_allocated = self.total_allocated.saturating_sub(size);
    }
}
/// Typed device buffer holding `size` elements of `T`.
#[derive(Debug)]
struct OpenClBuffer<T> {
    /// Handle of the underlying memory object.
    buffer: ClMem,
    /// Capacity in elements of `T` (not bytes).
    size: usize,
    /// Context the buffer was created in.
    context: ClContext,
    /// Queue used for transfers to and from this buffer.
    command_queue: ClCommandQueue,
    /// Ties the buffer to its element type without storing a `T`.
    _phantom: std::marker::PhantomData<T>,
}

impl<T: Clone + Send + Sync + Copy> OpenClBuffer<T> {
    /// Creates a read/write device buffer for `size` elements of `T`.
    ///
    /// # Errors
    ///
    /// Returns `LinalgError::ComputationError` when the byte size
    /// `size * size_of::<T>()` does not fit in `usize`, or when
    /// `cl_create_buffer` reports a status other than `CL_SUCCESS`.
    fn new(
        size: usize,
        context: ClContext,
        command_queue: ClCommandQueue,
    ) -> LinalgResult<Self> {
        let element_size = std::mem::size_of::<T>();
        // A plain multiplication would panic in debug builds and silently
        // wrap in release builds, creating a buffer smaller than `size`
        // elements while still advertising `size` through `len()`.
        let bytesize = size.checked_mul(element_size).ok_or_else(|| {
            LinalgError::ComputationError(format!(
                "OpenCL buffer size overflow: {} elements of {} bytes each",
                size, element_size
            ))
        })?;

        let (result, buffer) = cl_create_buffer(context, CL_MEM_READ_WRITE, bytesize);
        if result != CL_SUCCESS {
            return Err(LinalgError::ComputationError(format!(
                "Failed to create OpenCL buffer: error code {}",
                result
            )));
        }

        Ok(Self {
            buffer,
            size,
            context,
            command_queue,
            _phantom: std::marker::PhantomData,
        })
    }

    /// The raw handle of the underlying memory object.
    pub fn cl_mem(&self) -> ClMem {
        self.buffer
    }
}
impl<T: Clone + Send + Sync + Copy + std::fmt::Debug> GpuBuffer<T> for OpenClBuffer<T> {
    /// Capacity of the buffer in elements of `T`.
    fn len(&self) -> usize {
        self.size
    }

    /// Copies `data` from host memory into the device buffer with a blocking
    /// write starting at offset 0.
    ///
    /// # Errors
    ///
    /// * `LinalgError::ShapeError` when `data.len()` differs from the buffer
    ///   capacity.
    /// * `LinalgError::ComputationError` when the write reports a status
    ///   other than `CL_SUCCESS`.
    fn copy_from_host(&mut self, data: &[T]) -> LinalgResult<()> {
        let (expected, actual) = (self.size, data.len());
        if actual != expected {
            return Err(LinalgError::ShapeError(format!(
                "Buffer size mismatch: expected {}, got {}",
                expected, actual
            )));
        }

        // Size of the whole slice in bytes (element count times element size).
        let byte_count = std::mem::size_of_val(data);
        let host_ptr = data.as_ptr().cast::<std::ffi::c_void>();
        // Arguments 3 and 4: blocking = 1, offset = 0.
        match cl_enqueue_write_buffer(self.command_queue, self.buffer, 1, 0, byte_count, host_ptr)
        {
            CL_SUCCESS => Ok(()),
            code => Err(LinalgError::ComputationError(format!(
                "OpenCL host-to-device copy failed: error code {}",
                code
            ))),
        }
    }

    /// Copies the device buffer into `data` with a blocking read starting at
    /// offset 0.
    ///
    /// # Errors
    ///
    /// * `LinalgError::ShapeError` when `data.len()` differs from the buffer
    ///   capacity.
    /// * `LinalgError::ComputationError` when the read reports a status other
    ///   than `CL_SUCCESS`.
    fn copy_to_host(&self, data: &mut [T]) -> LinalgResult<()> {
        let (expected, actual) = (self.size, data.len());
        if actual != expected {
            return Err(LinalgError::ShapeError(format!(
                "Buffer size mismatch: expected {}, got {}",
                expected, actual
            )));
        }

        let byte_count = std::mem::size_of_val(data);
        let host_ptr = data.as_mut_ptr().cast::<std::ffi::c_void>();
        // Arguments 3 and 4: blocking = 1, offset = 0.
        match cl_enqueue_read_buffer(self.command_queue, self.buffer, 1, 0, byte_count, host_ptr)
        {
            CL_SUCCESS => Ok(()),
            code => Err(LinalgError::ComputationError(format!(
                "OpenCL device-to-host copy failed: error code {}",
                code
            ))),
        }
    }

    /// Raw pointer value of the underlying memory-object handle.
    fn device_ptr(&self) -> *mut std::ffi::c_void {
        self.buffer.as_ptr()
    }
}
impl<T> Drop for OpenClBuffer<T> {
    /// Releases the underlying memory object, unless the handle is null.
    fn drop(&mut self) {
        if self.buffer.0.is_null() {
            return;
        }
        // A destructor cannot report failure, so the release status is
        // deliberately discarded.
        let _ = cl_release_mem_object(self.buffer);
    }
}
/// Counters and accumulated timings for one context.
#[derive(Debug, Clone)]
pub struct OpenClPerformanceStats {
    /// Number of kernel executions.
    pub kernel_executions: usize,
    /// Number of buffer operations.
    pub buffer_operations: usize,
    /// Accumulated kernel time, in milliseconds.
    pub total_kernel_time_ms: f64,
    /// Accumulated transfer time, in milliseconds.
    pub total_transfer_time_ms: f64,
    /// Accumulated compilation time, in milliseconds.
    pub compilation_time_ms: f64,
    /// Number of cache hits.
    pub cache_hits: usize,
    /// Number of cache misses.
    pub cache_misses: usize,
}

impl OpenClPerformanceStats {
    /// Creates a record with every counter and timing at zero.
    fn new() -> Self {
        OpenClPerformanceStats {
            kernel_executions: 0,
            buffer_operations: 0,
            total_kernel_time_ms: 0.0,
            total_transfer_time_ms: 0.0,
            compilation_time_ms: 0.0,
            cache_hits: 0,
            cache_misses: 0,
        }
    }

    /// Fraction of the combined kernel + transfer time that was spent in
    /// kernels; `0.0` when the combined time is zero.
    pub fn kernel_efficiency(&self) -> f64 {
        let combined = self.total_kernel_time_ms + self.total_transfer_time_ms;
        if combined == 0.0 {
            0.0
        } else {
            self.total_kernel_time_ms / combined
        }
    }

    /// Fraction of cache accesses that were hits; `0.0` when there were no
    /// accesses at all.
    pub fn cache_hit_rate(&self) -> f64 {
        match self.cache_hits + self.cache_misses {
            0 => 0.0,
            accesses => self.cache_hits as f64 / accesses as f64,
        }
    }
}
}
#[cfg(feature = "opencl")]
pub use opencl_impl::*;
/// Placeholder backend compiled when the `opencl` feature is disabled.
#[cfg(not(feature = "opencl"))]
pub struct OpenClBackend;

#[cfg(not(feature = "opencl"))]
impl OpenClBackend {
    /// Always fails: OpenCL support is not part of this build.
    ///
    /// # Errors
    ///
    /// Always returns `LinalgError::ComputationError`.
    pub fn new() -> LinalgResult<Self> {
        let reason = String::from("OpenCL support not compiled in");
        Err(LinalgError::ComputationError(reason))
    }
}
/// `GpuBackend` implementation for the placeholder backend: it reports itself
/// as unavailable, lists no devices and refuses to create contexts.
#[cfg(not(feature = "opencl"))]
impl GpuBackend for OpenClBackend {
    /// Fixed name marking the backend as unavailable.
    fn name(&self) -> &str {
        "OpenCL (not available)"
    }

    /// Never available in this build.
    fn is_available(&self) -> bool {
        false
    }

    /// Always succeeds with an empty device list.
    fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
        Ok(Vec::new())
    }

    /// Always fails with `LinalgError::ComputationError`; `_device_id` is
    /// ignored.
    fn create_context(&self, _device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
        let reason = String::from("OpenCL support not compiled in");
        Err(LinalgError::ComputationError(reason))
    }
}