pub mod activation_ops;
pub mod benchmark;
pub mod conv_ops;
pub mod cuda_enhanced;
pub mod cuda_kernels;
pub mod custom_kernels;
pub mod device;
pub mod device_cache;
#[cfg(test)]
pub mod integration_tests;
pub mod kernels;
pub mod matrix_ops;
pub mod memory;
pub mod memory_ops;
pub mod memory_transfer;
pub mod metal_kernels;
pub mod opencl_kernels;
pub mod opencl_optimized;
pub mod performance_benchmark;
pub mod performance_optimizer;
pub mod reduction_ops;
pub mod simple_metal_test;
pub mod unified_kernel_simple;
pub mod validation;
pub mod smart_device_selector;
pub mod verification_tests;
pub mod multi_gpu;
pub mod sync_primitives;
pub mod distributed_training;
pub mod multi_gpu_profiler;
#[cfg(any(
feature = "coreml",
feature = "coreml-hybrid",
feature = "coreml-fallback"
))]
pub mod hybrid_executor;
#[cfg(any(
feature = "coreml",
feature = "coreml-hybrid",
feature = "coreml-fallback"
))]
pub mod coreml;
pub use activation_ops::GpuActivation;
pub use conv_ops::GpuConvolution;
pub use matrix_ops::GpuLinearAlgebra;
use std::fmt;
/// Identifies the compute device a tensor operation should run on.
///
/// Variants that carry a `usize` hold a device id; it is printed after the
/// colon by the `Display` implementation (for example `cuda:0`).
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum DeviceType {
    /// Host CPU.
    Cpu,
    /// CUDA device with the given id.
    Cuda(usize),
    /// Metal device with the given id.
    Metal(usize),
    /// OpenCL device with the given id.
    OpenCL(usize),
    /// CoreML device with the given id (only with the `coreml` feature).
    #[cfg(feature = "coreml")]
    CoreML(usize),
    /// CoreML device paired with an optional GPU to fall back to
    /// (only with the `coreml-hybrid` feature).
    #[cfg(feature = "coreml-hybrid")]
    CoreMLHybrid {
        /// Id of the CoreML device.
        coreml_id: usize,
        /// GPU to fall back to, if any.
        fallback_gpu: Option<GpuDevice>,
    },
    /// Placeholder meaning "let the library pick a device".
    Auto,
    /// Combined Mac backend (only with the `mac-hybrid` feature).
    #[cfg(feature = "mac-hybrid")]
    MacHybrid,
}
/// A GPU backend plus device id, without the CPU / CoreML / auto options
/// that `DeviceType` has.
///
/// Used as the fallback target of `DeviceType::CoreMLHybrid`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum GpuDevice {
    /// CUDA device with the given id.
    Cuda(usize),
    /// Metal device with the given id.
    Metal(usize),
    /// OpenCL device with the given id.
    OpenCL(usize),
}
/// Broad category of an operation.
///
/// Stored in `DeviceCapability::supported_operations` to describe what a
/// device can run.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum OpType {
    /// Linear-algebra operations.
    LinearAlgebra,
    /// Convolution operations.
    Convolution,
    /// Activation functions.
    Activation,
    /// Reduction operations.
    Reduction,
    /// Normalization operations.
    Normalization,
    /// Complex-number math.
    ComplexMath,
    /// Distribution operations.
    Distribution,
    /// User-supplied custom kernels.
    CustomKernel,
    /// Distributed operations.
    DistributedOps,
}
/// Describes what a particular device can do.
#[derive(Debug)]
pub struct DeviceCapability {
    /// Device this description applies to.
    pub device_type: DeviceType,
    /// Whether `f16` data is supported.
    pub supports_f16: bool,
    /// Whether `f32` data is supported.
    pub supports_f32: bool,
    /// Whether `f64` data is supported.
    pub supports_f64: bool,
    /// Whether complex-number data is supported.
    pub supports_complex: bool,
    /// Whether distributed execution is supported.
    pub supports_distributed: bool,
    /// Maximum memory, in gigabytes (unit taken from the field name).
    pub max_memory_gb: f32,
    /// Operation categories the device can execute.
    pub supported_operations: std::collections::HashSet<OpType>,
}
impl DeviceCapability {
    /// Returns `true` when `op_type` is listed in `supported_operations`.
    pub fn supports_operation(&self, op_type: &OpType) -> bool {
        self.supported_operations.contains(op_type)
    }

    /// Builds the capability description used for CoreML device 0.
    ///
    /// The description advertises `f16` and `f32` only, no complex or
    /// distributed support, `8.0` for `max_memory_gb`, and five operation
    /// categories: linear algebra, convolution, activation, reduction and
    /// normalization.
    #[cfg(feature = "coreml")]
    pub fn coreml_capability() -> Self {
        let supported_operations: std::collections::HashSet<OpType> = [
            OpType::LinearAlgebra,
            OpType::Convolution,
            OpType::Activation,
            OpType::Reduction,
            OpType::Normalization,
        ]
        .into_iter()
        .collect();

        Self {
            device_type: DeviceType::CoreML(0),
            supports_f16: true,
            supports_f32: true,
            supports_f64: false,
            supports_complex: false,
            supports_distributed: false,
            max_memory_gb: 8.0,
            supported_operations,
        }
    }
}
#[cfg(feature = "mac-hybrid")]
impl DeviceType {
    /// Chooses between CoreML device 0 and Metal device 0 for an operation.
    ///
    /// If only one of the two backends is available, that backend is
    /// returned regardless of the operation. If both are available:
    ///
    /// * `LinearAlgebra` goes to Metal when `tensor_size` exceeds 10000,
    ///   otherwise to CoreML;
    /// * `ComplexMath`, `Distribution`, `CustomKernel` and `DistributedOps`
    ///   always go to Metal;
    /// * every other category goes to CoreML.
    ///
    /// A missing `tensor_size` is treated as `0`.
    ///
    /// # Panics
    ///
    /// Panics when neither CoreML nor Metal is available.
    pub fn select_best_for_operation(op_type: &OpType, tensor_size: Option<usize>) -> DeviceType {
        use crate::backends::DeviceManager;
        use crate::gpu::metal_kernels::MetalKernelExecutor;

        // Both probes always run, CoreML first, before any decision is made.
        let coreml_available = DeviceManager::is_coreml_available();
        let metal_available = MetalKernelExecutor::new().is_ok();

        match (coreml_available, metal_available) {
            (true, false) => return DeviceType::CoreML(0),
            (false, true) => return DeviceType::Metal(0),
            (false, false) => {
                panic!("mac-hybrid feature enabled but neither CoreML nor Metal available. Check system configuration.")
            }
            (true, true) => {}
        }

        let size = tensor_size.unwrap_or(0);
        let prefer_metal = match op_type {
            OpType::LinearAlgebra => size > 10000,
            OpType::ComplexMath
            | OpType::Distribution
            | OpType::CustomKernel
            | OpType::DistributedOps => true,
            // Convolution and activation end up on CoreML whatever their
            // size, exactly like reduction and normalization.
            OpType::Convolution
            | OpType::Activation
            | OpType::Reduction
            | OpType::Normalization => false,
        };

        if prefer_metal {
            DeviceType::Metal(0)
        } else {
            DeviceType::CoreML(0)
        }
    }
}
impl Default for DeviceType {
fn default() -> Self {
#[cfg(feature = "coreml")]
if crate::backends::DeviceManager::is_coreml_available() {
return DeviceType::CoreML(0);
}
#[cfg(feature = "cuda")]
if crate::backends::DeviceManager::is_cuda_available() {
return DeviceType::Cuda(0);
}
#[cfg(feature = "metal")]
{
use crate::gpu::metal_kernels::MetalKernelExecutor;
if MetalKernelExecutor::new().is_ok() {
return DeviceType::Metal(0);
}
}
DeviceType::Cpu
}
}
impl fmt::Display for DeviceType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
DeviceType::Cpu => write!(f, "cpu"),
DeviceType::Cuda(id) => write!(f, "cuda:{}", id),
DeviceType::Metal(id) => write!(f, "metal:{}", id),
DeviceType::OpenCL(id) => write!(f, "opencl:{}", id),
#[cfg(feature = "coreml")]
DeviceType::CoreML(id) => write!(f, "coreml:{}", id),
#[cfg(feature = "coreml-hybrid")]
DeviceType::CoreMLHybrid {
coreml_id,
fallback_gpu,
} => {
write!(f, "coreml_hybrid:{}:{:?}", coreml_id, fallback_gpu)
}
#[cfg(feature = "mac-hybrid")]
DeviceType::MacHybrid => write!(f, "mac_hybrid"),
DeviceType::Auto => write!(f, "auto"),
}
}
}
impl DeviceType {
    /// Reports whether this device can be used in the current build and on
    /// the current machine.
    ///
    /// * `Cpu` and `Auto` are always available.
    /// * `Cuda(id)` and `OpenCL(id)` are available when the matching feature
    ///   is compiled in and a kernel executor can be constructed for *that*
    ///   device id. Earlier revisions always probed device 0, so a request
    ///   for a non-existent device id could be reported as available.
    /// * `Metal(_)` is available when the `metal` feature is compiled in and
    ///   a Metal executor can be constructed; the executor constructor takes
    ///   no device id, so the id is not consulted.
    /// * `CoreML(_)` is reported available whenever the crate is built for
    ///   macOS; no runtime probe is performed here.
    /// * `CoreMLHybrid` is available on macOS builds, or when its fallback
    ///   GPU is available.
    /// * `MacHybrid` is available on macOS builds when Metal device 0 or
    ///   CoreML device 0 is available.
    pub fn is_available(&self) -> bool {
        match self {
            DeviceType::Cpu => true,
            DeviceType::Cuda(id) => {
                #[cfg(feature = "cuda")]
                {
                    use crate::gpu::cuda_kernels::CudaKernelExecutor;
                    // Probe the requested device, not unconditionally device 0.
                    CudaKernelExecutor::new(*id).is_ok()
                }
                #[cfg(not(feature = "cuda"))]
                {
                    let _ = id;
                    false
                }
            }
            DeviceType::Metal(_) => {
                #[cfg(feature = "metal")]
                {
                    use crate::gpu::metal_kernels::MetalKernelExecutor;
                    MetalKernelExecutor::new().is_ok()
                }
                #[cfg(not(feature = "metal"))]
                {
                    false
                }
            }
            DeviceType::OpenCL(id) => {
                #[cfg(feature = "opencl")]
                {
                    use crate::gpu::opencl_kernels::OpenClKernelExecutor;
                    // Probe the requested device, not unconditionally device 0.
                    OpenClKernelExecutor::new(*id).is_ok()
                }
                #[cfg(not(feature = "opencl"))]
                {
                    let _ = id;
                    false
                }
            }
            #[cfg(feature = "coreml")]
            DeviceType::CoreML(_) => cfg!(target_os = "macos"),
            #[cfg(feature = "coreml-hybrid")]
            DeviceType::CoreMLHybrid { fallback_gpu, .. } => {
                cfg!(target_os = "macos")
                    || fallback_gpu.map_or(false, |gpu| match gpu {
                        GpuDevice::Cuda(id) => DeviceType::Cuda(id).is_available(),
                        GpuDevice::Metal(id) => DeviceType::Metal(id).is_available(),
                        GpuDevice::OpenCL(id) => DeviceType::OpenCL(id).is_available(),
                    })
            }
            #[cfg(feature = "mac-hybrid")]
            DeviceType::MacHybrid => {
                cfg!(target_os = "macos")
                    && (DeviceType::Metal(0).is_available()
                        || DeviceType::CoreML(0).is_available())
            }
            DeviceType::Auto => true,
        }
    }
}
/// Storage order of a two-dimensional buffer.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum MemoryLayout {
    /// Row-major order.
    RowMajor,
    /// Column-major order.
    ColMajor,
}
/// Per-device settings created by `GpuContext::new`.
pub struct GpuContext {
    /// Device this context belongs to.
    device: DeviceType,
    /// Size of the memory pool (presumably in bytes; the unit is not stated
    /// in this file). Zero for the CPU.
    memory_pool_size: usize,
    /// Number of streams configured for the device.
    stream_count: usize,
}
impl GpuContext {
pub fn new(device: DeviceType) -> crate::error::RusTorchResult<Self> {
match device {
DeviceType::Cpu => Ok(GpuContext {
device,
memory_pool_size: 0,
stream_count: 1,
}),
DeviceType::Cuda(_) => {
#[cfg(feature = "cuda")]
{
Ok(GpuContext {
device,
memory_pool_size: 1024 * 1024 * 1024, stream_count: 4,
})
}
#[cfg(not(feature = "cuda"))]
{
Err(crate::error::RusTorchError::gpu(
"CUDA support not compiled",
))
}
}
DeviceType::Metal(_) => {
#[cfg(feature = "metal")]
{
Ok(GpuContext {
device,
memory_pool_size: 512 * 1024 * 1024, stream_count: 2,
})
}
#[cfg(not(feature = "metal"))]
{
Err(crate::error::RusTorchError::gpu(
"Metal support not compiled",
))
}
}
DeviceType::OpenCL(_) => {
#[cfg(feature = "opencl")]
{
Ok(GpuContext {
device,
memory_pool_size: 256 * 1024 * 1024, stream_count: 2,
})
}
#[cfg(not(feature = "opencl"))]
{
Err(crate::error::RusTorchError::gpu(
"OpenCL support not compiled",
))
}
}
#[cfg(feature = "coreml")]
DeviceType::CoreML(_) => {
Ok(GpuContext {
device,
memory_pool_size: 1024 * 1024 * 1024, stream_count: 1,
})
}
#[cfg(feature = "coreml-hybrid")]
DeviceType::CoreMLHybrid { .. } => {
Ok(GpuContext {
device,
memory_pool_size: 1024 * 1024 * 1024, stream_count: 2,
})
}
#[cfg(feature = "mac-hybrid")]
DeviceType::MacHybrid => {
Ok(GpuContext {
device,
memory_pool_size: 1024 * 1024 * 1024, stream_count: 4, })
}
DeviceType::Auto => {
let best_device = DeviceType::default();
Self::new(best_device)
}
}
}
pub fn device(&self) -> DeviceType {
self.device
}
pub fn is_gpu_available(&self) -> bool {
!matches!(self.device, DeviceType::Cpu)
}
pub fn memory_pool_size(&self) -> usize {
self.memory_pool_size
}
pub fn stream_count(&self) -> usize {
self.stream_count
}
}
pub use unified_kernel_simple::{
KernelMetrics, KernelOp, KernelParams, KernelSelector, UnifiedKernelExecutor,
};
/// Keeps one `GpuContext` per discovered device and tracks which one is
/// currently selected.
pub struct DeviceManager {
    /// Contexts in discovery order; the CPU context is pushed first.
    contexts: Vec<GpuContext>,
    /// Index into `contexts` of the selected device.
    current_device: usize,
}
impl DeviceManager {
pub fn new() -> Self {
let mut contexts = Vec::new();
if let Ok(cpu_context) = GpuContext::new(DeviceType::Cpu) {
contexts.push(cpu_context);
}
#[cfg(feature = "cuda")]
{
for device_id in 0..Self::get_cuda_device_count() {
if let Ok(cuda_context) = GpuContext::new(DeviceType::Cuda(device_id)) {
contexts.push(cuda_context);
}
}
}
#[cfg(feature = "metal")]
{
if let Ok(metal_context) = GpuContext::new(DeviceType::Metal(0)) {
contexts.push(metal_context);
}
}
DeviceManager {
contexts,
current_device: 0,
}
}
pub fn available_devices(&self) -> Vec<DeviceType> {
self.contexts.iter().map(|ctx| ctx.device()).collect()
}
pub fn set_device(&mut self, device: DeviceType) -> crate::error::RusTorchResult<()> {
if let Some(index) = self.contexts.iter().position(|ctx| ctx.device() == device) {
self.current_device = index;
Ok(())
} else {
Err(crate::error::RusTorchError::device_not_available(
device.to_string(),
))
}
}
pub fn current_device(&self) -> DeviceType {
self.contexts[self.current_device].device()
}
pub fn current_context(&self) -> &GpuContext {
&self.contexts[self.current_device]
}
pub fn is_cuda_available() -> bool {
#[cfg(feature = "cuda")]
{
Self::get_cuda_device_count() > 0
}
#[cfg(not(feature = "cuda"))]
{
false
}
}
pub fn is_metal_available() -> bool {
#[cfg(feature = "metal")]
{
use crate::gpu::metal_kernels::MetalKernelExecutor;
cfg!(target_os = "macos") && MetalKernelExecutor::new().is_ok()
}
#[cfg(not(feature = "metal"))]
{
false
}
}
#[cfg(feature = "cuda")]
fn get_cuda_device_count() -> usize {
use crate::gpu::cuda_kernels::CudaKernelExecutor;
(0..8)
.filter(|&i| CudaKernelExecutor::new(i).is_ok())
.count()
}
}
impl Default for DeviceManager {
fn default() -> Self {
Self::new()
}
}
/// Storage for the process-wide device manager; filled on first use.
static mut DEVICE_MANAGER: Option<DeviceManager> = None;
/// Guards the one-time initialization of `DEVICE_MANAGER`.
static DEVICE_MANAGER_INIT: std::sync::Once = std::sync::Once::new();

/// Returns the process-wide `DeviceManager`, creating it on the first call.
///
/// Every call hands out a mutable reference to the same static value, and
/// nothing here prevents two such references from being alive at the same
/// time. Callers must not keep overlapping references or use the manager
/// from several threads at once.
pub fn get_device_manager() -> &'static mut DeviceManager {
    // SAFETY: the `Once` ensures the slot is written exactly once, before any
    // reference to its contents is produced. Exclusive use of the returned
    // reference is NOT enforced here and is left to the callers (see above).
    unsafe {
        DEVICE_MANAGER_INIT.call_once(|| {
            *std::ptr::addr_of_mut!(DEVICE_MANAGER) = Some(DeviceManager::new());
        });

        let slot: &'static mut Option<DeviceManager> =
            &mut *std::ptr::addr_of_mut!(DEVICE_MANAGER);
        slot.as_mut().unwrap()
    }
}
/// Selects `device` on the process-wide device manager.
///
/// # Errors
///
/// Forwards the error from `DeviceManager::set_device` when the device is
/// not registered with the manager.
pub fn set_device(device: DeviceType) -> crate::error::RusTorchResult<()> {
    let manager = get_device_manager();
    manager.set_device(device)
}
/// Returns the device currently selected on the process-wide device manager.
pub fn current_device() -> DeviceType {
    let manager = get_device_manager();
    manager.current_device()
}
/// Returns `true` when CUDA or Metal is available.
///
/// CUDA is checked first; Metal is only probed when CUDA is unavailable.
/// OpenCL and CoreML are not consulted by this function.
pub fn is_gpu_available() -> bool {
    if DeviceManager::is_cuda_available() {
        return true;
    }
    DeviceManager::is_metal_available()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each device renders as its lowercase name plus `:<id>` where present.
    #[test]
    fn test_device_type_display() {
        let cases = [
            (DeviceType::Cpu, "cpu"),
            (DeviceType::Cuda(0), "cuda:0"),
            (DeviceType::Metal(1), "metal:1"),
            (DeviceType::OpenCL(2), "opencl:2"),
        ];
        for (device, expected) in cases {
            assert_eq!(device.to_string(), expected);
        }
    }

    /// A fresh manager knows at least one device and starts on the CPU.
    #[test]
    fn test_device_manager_creation() {
        let manager = DeviceManager::new();
        let devices = manager.available_devices();
        assert!(!devices.is_empty());
        assert_eq!(manager.current_device(), DeviceType::Cpu);
    }

    /// The CPU context is not a GPU and has a single stream.
    #[test]
    fn test_gpu_context_cpu() {
        let context = GpuContext::new(DeviceType::Cpu).unwrap();
        assert_eq!(context.device(), DeviceType::Cpu);
        assert!(!context.is_gpu_available());
        assert_eq!(context.stream_count(), 1);
    }

    /// The global manager defaults to the CPU and lists at least one device.
    #[test]
    fn test_global_device_manager() {
        let selected = current_device();
        assert_eq!(selected, DeviceType::Cpu);

        let manager = get_device_manager();
        assert!(!manager.available_devices().is_empty());
    }

    /// Smoke test: the availability probes must run without panicking.
    #[test]
    fn test_gpu_availability() {
        let _ = DeviceManager::is_cuda_available();
        let _ = DeviceManager::is_metal_available();
        let _ = is_gpu_available();
    }
}