use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::{Result, anyhow};
use candle_core::{Device, Tensor as CandleTensor};
use ronn_core::tensor::Tensor;
use ronn_core::{
CompiledKernel, DataType, ExecutionProvider, KernelStats, MemoryType, MemoryUsage,
OperatorSpec, PerformanceProfile, ProviderCapability, ProviderConfig, ProviderId,
ResourceRequirements, SubGraph, TensorAllocator, TensorLayout,
};
use tracing::{debug, info, warn};
use super::allocator::create_gpu_allocator;
use super::cuda_kernels::{CudaCompileOptions, CudaKernelManager};
use super::memory_manager::{MultiGpuMemoryConfig, MultiGpuMemoryManager};
use super::topology::{GpuTopologyManager, TopologyConfig};
/// Execution provider that runs subgraphs on one or more GPU devices.
///
/// `devices`, `allocators` and `cuda_kernel_managers` are filled in lockstep by
/// `with_config`, one entry per id in `config.device_ids`, so the position of an
/// id in `config.device_ids` is the index into all three vectors.
pub struct GpuExecutionProvider {
/// Device handles, in the same order as `config.device_ids`.
devices: Vec<Device>,
/// One tensor allocator per device.
allocators: Vec<Arc<dyn TensorAllocator>>,
/// Operator type names this provider reports as supported.
supported_ops: HashSet<String>,
/// Provider configuration (a separate clone is held by `device_manager`).
config: GpuProviderConfig,
/// Shared device-selection state used when compiling subgraphs.
device_manager: Arc<std::sync::Mutex<MultiGpuManager>>,
/// Per-device custom CUDA kernel manager; `None` when custom kernels are
/// disabled, the device is not CUDA, or manager creation failed.
cuda_kernel_managers: Vec<Option<CudaKernelManager>>,
/// Present only when multi-GPU is enabled with more than one device and the
/// manager was created successfully.
memory_manager: Option<Arc<MultiGpuMemoryManager>>,
/// Present only when multi-GPU is enabled with more than one device and the
/// manager was created successfully.
topology_manager: Option<Arc<GpuTopologyManager>>,
}
/// Configuration for [`GpuExecutionProvider`].
#[derive(Debug, Clone)]
pub struct GpuProviderConfig {
/// Ids of the devices to create; one device is created per entry.
pub device_ids: Vec<usize>,
/// Device returned by device selection when multi-GPU is disabled.
pub primary_device_id: usize,
/// Optional memory limit in bytes.
pub memory_limit: Option<usize>,
/// Mixed-precision toggle (can be changed through `configure`).
pub enable_mixed_precision: bool,
/// Tensor-core toggle; consulted by `has_tensor_cores`.
pub enable_tensor_cores: bool,
/// Number of streams; a kernel's stream id is the device id modulo this value.
pub stream_count: usize,
/// Enables load balancing across devices and the multi-GPU managers.
pub enable_multi_gpu: bool,
/// Peer-to-peer transfer toggle.
/// NOTE(review): not read anywhere in the visible part of this file.
pub enable_p2p_transfer: bool,
/// Strategy used to pick a device for a subgraph.
pub load_balancing: LoadBalancingStrategy,
/// When true, a `CudaKernelManager` is created for every CUDA device.
pub enable_custom_kernels: bool,
/// Options passed to `CudaKernelManager::with_options`.
pub cuda_compile_options: CudaCompileOptions,
/// Configuration passed to `MultiGpuMemoryManager::new`.
pub memory_config: MultiGpuMemoryConfig,
/// Configuration passed to `GpuTopologyManager::new`.
pub topology_config: TopologyConfig,
}
/// Strategy used by the multi-GPU manager to choose a device for a subgraph.
#[derive(Debug, Clone, Copy)]
pub enum LoadBalancingStrategy {
/// Cycle through the configured devices in order.
RoundRobin,
/// Pick the device with the lowest recorded memory usage.
MemoryBased,
/// Pick the device with the lowest recorded utilization.
UtilizationBased,
/// Pick a device based on the operator type of the subgraph's first node.
OperationBased,
/// Pick the device with the lowest combined memory/utilization/time score.
CostModel,
}
impl Default for LoadBalancingStrategy {
fn default() -> Self {
LoadBalancingStrategy::RoundRobin
}
}
impl Default for GpuProviderConfig {
fn default() -> Self {
Self {
device_ids: vec![0],
primary_device_id: 0,
memory_limit: None,
enable_mixed_precision: true,
enable_tensor_cores: true,
stream_count: 1,
enable_multi_gpu: false,
enable_p2p_transfer: true,
load_balancing: LoadBalancingStrategy::default(),
enable_custom_kernels: true,
cuda_compile_options: CudaCompileOptions::default(),
memory_config: MultiGpuMemoryConfig::default(),
topology_config: TopologyConfig::default(),
}
}
}
/// A subgraph compiled for execution on a single GPU device.
#[derive(Debug)]
pub struct GpuKernel {
/// Nodes executed in order by `execute`.
subgraph: SubGraph,
/// Device on which input tensors are materialised.
device: Device,
/// Timing statistics, updated after every execution.
stats: std::sync::Mutex<GpuKernelStats>,
/// Stream id assigned at compile time (only used for logging in this file).
stream_id: usize,
/// Per-kernel cache of operation signatures.
kernel_cache: std::sync::Mutex<KernelCache>,
}
/// Cache of previously executed operations, keyed by operation signature.
#[derive(Debug, Default)]
struct KernelCache {
/// Cached entries keyed by the string from `generate_operation_signature`.
cached_ops: HashMap<String, CachedOperation>,
/// Reported by `get_cache_stats`.
/// NOTE(review): initialised to 0 and never updated in the visible code.
cache_size: usize,
/// Initialised to 64 * 1024 * 1024 by the `GpuKernel` constructors.
/// NOTE(review): presumably a byte budget; not enforced in the visible code.
max_cache_size: usize,
}
/// Bookkeeping record for one cached operation signature.
#[derive(Debug, Clone)]
struct CachedOperation {
/// Signature string this entry is stored under.
signature: String,
/// Execution path recorded for the operation.
execution_path: OptimizedPath,
/// Usage counter, summed by `get_cache_stats`.
hit_count: u64,
/// Time of the most recent use; entries idle for over 300 s are evicted.
last_accessed: std::time::Instant,
}
/// Execution path recorded for a cached operation.
///
/// NOTE(review): only `Single` is constructed in the visible part of this file;
/// the meaning of the other variants is inferred from their names.
#[derive(Debug, Clone)]
enum OptimizedPath {
/// A single operator, identified by its type name.
Single(String),
/// A list of operator names (presumably executed as one fused unit).
Fused(Vec<String>),
/// Operator names split by the precision they run at.
MixedPrecision {
fp16_ops: Vec<String>,
fp32_ops: Vec<String>,
},
}
/// Execution statistics accumulated by a `GpuKernel`.
#[derive(Debug, Default)]
struct GpuKernelStats {
/// Number of completed executions.
execution_count: u64,
/// Sum of all execution times, in microseconds.
total_time_us: u64,
/// Shortest execution time seen, in microseconds.
min_time_us: u64,
/// Longest execution time seen, in microseconds.
max_time_us: u64,
/// Reported as `peak_bytes` by `get_memory_usage`.
/// NOTE(review): never written in the visible code, so it stays 0.
memory_peak: usize,
}
/// Device-selection state shared by all subgraph compilations of a provider.
#[derive(Debug)]
struct MultiGpuManager {
/// The manager's own copy of the provider configuration.
config: GpuProviderConfig,
/// Per-device statistics, keyed by device id.
device_stats: HashMap<usize, DeviceStats>,
/// Counter used to cycle through `config.device_ids`.
round_robin_counter: usize,
/// Last reported memory usage per device id.
memory_usage: HashMap<usize, usize>,
}
/// Per-device statistics tracked by the multi-GPU manager.
///
/// This type is returned by the public `GpuExecutionProvider::get_multi_gpu_stats`,
/// so it (and its fields) must be public: a private type in a public signature
/// is rejected or linted by the compiler and would be unusable by callers.
#[derive(Debug, Default, Clone)]
pub struct DeviceStats {
    /// Number of stat updates recorded for the device.
    pub operation_count: u64,
    /// Memory usage reported by the most recent update.
    pub current_memory: usize,
    /// Largest memory usage reported so far.
    pub peak_memory: usize,
    /// Exponential moving average of execution time, in microseconds.
    pub avg_execution_time: f64,
    /// Utilization figure used by the utilization and cost-model strategies.
    pub utilization: f32,
}
impl MultiGpuManager {
    /// Creates a manager with zeroed statistics and memory usage for every
    /// device id listed in `config.device_ids`.
    fn new(config: GpuProviderConfig) -> Self {
        let device_stats = config
            .device_ids
            .iter()
            .map(|&device_id| (device_id, DeviceStats::default()))
            .collect();
        let memory_usage = config
            .device_ids
            .iter()
            .map(|&device_id| (device_id, 0))
            .collect();
        Self {
            config,
            device_stats,
            round_robin_counter: 0,
            memory_usage,
        }
    }

    /// Chooses the device id on which an operation should run.
    ///
    /// * With no configured devices the primary device id is returned
    ///   (previously this could panic with a modulo by zero).
    /// * With exactly one configured device that device is returned.
    /// * With multi-GPU disabled the primary device id is returned.
    /// * Otherwise the configured load-balancing strategy decides.
    fn select_device(&mut self, op_type: &str, memory_requirement: usize) -> usize {
        match self.config.device_ids.as_slice() {
            [] => return self.config.primary_device_id,
            [only] => return *only,
            _ => {}
        }
        if !self.config.enable_multi_gpu {
            return self.config.primary_device_id;
        }
        match self.config.load_balancing {
            LoadBalancingStrategy::RoundRobin => self.next_round_robin_device(),
            LoadBalancingStrategy::MemoryBased => self.select_device_by_memory(memory_requirement),
            LoadBalancingStrategy::UtilizationBased => self.select_device_by_utilization(),
            LoadBalancingStrategy::OperationBased => self.select_device_by_operation_type(op_type),
            LoadBalancingStrategy::CostModel => {
                self.select_device_by_cost_model(op_type, memory_requirement)
            }
        }
    }

    /// Returns the next device in round-robin order and advances the counter.
    /// Falls back to the primary device id when no devices are configured.
    fn next_round_robin_device(&mut self) -> usize {
        let device_ids = &self.config.device_ids;
        if device_ids.is_empty() {
            return self.config.primary_device_id;
        }
        let device_id = device_ids[self.round_robin_counter % device_ids.len()];
        // Wrapping keeps a very long-lived manager from overflowing the counter.
        self.round_robin_counter = self.round_robin_counter.wrapping_add(1);
        device_id
    }

    /// Returns the device whose recorded memory usage plus `memory_requirement`
    /// is smallest; ties go to the device listed first.
    fn select_device_by_memory(&self, memory_requirement: usize) -> usize {
        self.config
            .device_ids
            .iter()
            .copied()
            .min_by_key(|device_id| {
                self.memory_usage
                    .get(device_id)
                    .copied()
                    .unwrap_or(0)
                    .saturating_add(memory_requirement)
            })
            .unwrap_or(self.config.primary_device_id)
    }

    /// Returns the device with the lowest recorded utilization (unknown
    /// devices count as 0.0; incomparable values are treated as equal).
    fn select_device_by_utilization(&self) -> usize {
        let utilization_of = |device_id: &usize| {
            self.device_stats
                .get(device_id)
                .map(|stats| stats.utilization)
                .unwrap_or(0.0)
        };
        self.config
            .device_ids
            .iter()
            .copied()
            .min_by(|a, b| {
                utilization_of(a)
                    .partial_cmp(&utilization_of(b))
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .unwrap_or(self.config.primary_device_id)
    }

    /// Chooses a device from the operator type:
    /// `MatMul`/`Conv`/`ConvTranspose` go to the lowest device id,
    /// `Concat`/`Split`/`Reshape` to the device with the least recorded memory,
    /// everything else is distributed round-robin.
    ///
    /// The round-robin fallback advances the counter; before this fix the
    /// counter was only read, so every "other" operation landed on one device.
    fn select_device_by_operation_type(&mut self, op_type: &str) -> usize {
        match op_type {
            "MatMul" | "Conv" | "ConvTranspose" => self
                .config
                .device_ids
                .iter()
                .min()
                .copied()
                .unwrap_or(self.config.primary_device_id),
            "Concat" | "Split" | "Reshape" => self.select_device_by_memory(0),
            _ => self.next_round_robin_device(),
        }
    }

    /// Scores every device and returns the one with the lowest score.
    ///
    /// score = projected memory in GiB + 2 * utilization + avg_execution_time / 1000.
    /// Ties go to the device listed first; the operator type is currently unused.
    fn select_device_by_cost_model(&self, _op_type: &str, memory_requirement: usize) -> usize {
        const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;
        let fallback_stats = DeviceStats::default();
        let mut best_device = self.config.primary_device_id;
        let mut best_score = f64::INFINITY;
        for &device_id in &self.config.device_ids {
            let stats = self.device_stats.get(&device_id).unwrap_or(&fallback_stats);
            let memory_used = self.memory_usage.get(&device_id).copied().unwrap_or(0);
            let memory_pressure =
                memory_used.saturating_add(memory_requirement) as f64 / BYTES_PER_GIB;
            let utilization_penalty = f64::from(stats.utilization) * 2.0;
            let execution_time_penalty = stats.avg_execution_time / 1000.0;
            let total_score = memory_pressure + utilization_penalty + execution_time_penalty;
            if total_score < best_score {
                best_score = total_score;
                best_device = device_id;
            }
        }
        best_device
    }

    /// Records the outcome of one operation on `device_id`.
    ///
    /// Statistics are only updated for devices known at construction time; the
    /// memory-usage map is updated unconditionally. The execution time feeds an
    /// exponential moving average with smoothing factor 0.1.
    fn update_device_stats(
        &mut self,
        device_id: usize,
        execution_time_us: u64,
        memory_used: usize,
    ) {
        const ALPHA: f64 = 0.1;
        if let Some(stats) = self.device_stats.get_mut(&device_id) {
            stats.operation_count += 1;
            stats.current_memory = memory_used;
            stats.peak_memory = stats.peak_memory.max(memory_used);
            let sample = execution_time_us as f64;
            // The first sample seeds the average instead of being blended with 0.
            stats.avg_execution_time = if stats.avg_execution_time == 0.0 {
                sample
            } else {
                ALPHA * sample + (1.0 - ALPHA) * stats.avg_execution_time
            };
        }
        self.memory_usage.insert(device_id, memory_used);
    }

    /// Returns the per-device statistics keyed by device id.
    fn get_device_stats(&self) -> &HashMap<usize, DeviceStats> {
        &self.device_stats
    }
}
impl GpuExecutionProvider {
/// Creates a provider using [`GpuProviderConfig::default`].
///
/// # Errors
/// Propagates any error from [`Self::with_config`].
#[cfg(feature = "gpu")]
pub fn new() -> Result<Self> {
    let default_config = GpuProviderConfig::default();
    Self::with_config(default_config)
}
/// Creates a provider for every device listed in `config.device_ids`.
///
/// For each device id this creates the device, optionally a custom CUDA
/// kernel manager (best effort: a failure is logged and leaves `None`), and
/// an allocator. The multi-GPU memory and topology managers are only created
/// when multi-GPU is enabled and more than one device is configured; their
/// failures are logged and tolerated as well.
///
/// NOTE(review): the advertised operator set includes `Gemm`, `Sum`, `Mean`,
/// `Max`, `Min`, `Transpose`, `Concat` and `Split`, for which
/// `GpuKernel::execute_gpu_operation` has no match arm — confirm whether they
/// should be advertised.
///
/// # Errors
/// Fails when a device or its allocator cannot be created, or when
/// `config.device_ids` is empty.
#[cfg(feature = "gpu")]
pub fn with_config(config: GpuProviderConfig) -> Result<Self> {
    // Operator types reported as supported by this provider.
    const SUPPORTED_OPS: &[&str] = &[
        "Add",
        "Sub",
        "Mul",
        "Div",
        "MatMul",
        "Gemm",
        "Conv",
        "ConvTranspose",
        "MaxPool",
        "AveragePool",
        "GlobalAveragePool",
        "ReLU",
        "Sigmoid",
        "Tanh",
        "Softmax",
        "GELU",
        "BatchNormalization",
        "LayerNormalization",
        "Sum",
        "Mean",
        "Max",
        "Min",
        "Reshape",
        "Transpose",
        "Concat",
        "Split",
    ];

    let mut devices = Vec::new();
    let mut allocators = Vec::new();
    let mut cuda_kernel_managers = Vec::new();

    for &device_id in &config.device_ids {
        let device = Self::create_gpu_device(device_id)?;
        info!("Created GPU device {}: {:?}", device_id, device);

        // Custom kernels are only attempted on CUDA devices.
        let wants_custom_kernels =
            config.enable_custom_kernels && matches!(device, Device::Cuda(_));
        let mut cuda_manager = None;
        if wants_custom_kernels {
            let created = CudaKernelManager::with_options(
                device.clone(),
                config.cuda_compile_options.clone(),
            );
            match created {
                Ok(manager) => {
                    info!("Created CUDA kernel manager for device {}", device_id);
                    cuda_manager = Some(manager);
                }
                Err(e) => {
                    warn!(
                        "Failed to create CUDA kernel manager for device {}: {}",
                        device_id, e
                    );
                }
            }
        }
        devices.push(device);
        cuda_kernel_managers.push(cuda_manager);

        let allocator = match create_gpu_allocator() {
            Ok(allocator) => allocator,
            Err(e) => {
                return Err(anyhow!(
                    "Failed to create GPU allocator for device {}: {}",
                    device_id,
                    e
                ));
            }
        };
        allocators.push(allocator);
    }

    if devices.is_empty() {
        return Err(anyhow!("No GPU devices configured"));
    }
    info!("Created GPU provider with {} devices", devices.len());

    let device_manager = Arc::new(std::sync::Mutex::new(MultiGpuManager::new(config.clone())));

    // The multi-GPU helpers only make sense with more than one device.
    let multi_gpu_active = config.enable_multi_gpu && config.device_ids.len() > 1;

    let mut memory_manager = None;
    if multi_gpu_active {
        match MultiGpuMemoryManager::new(config.device_ids.clone(), config.memory_config.clone())
        {
            Ok(manager) => {
                info!("Created multi-GPU memory manager");
                memory_manager = Some(Arc::new(manager));
            }
            Err(e) => warn!("Failed to create multi-GPU memory manager: {}", e),
        }
    }

    let mut topology_manager = None;
    if multi_gpu_active {
        match GpuTopologyManager::new(config.topology_config.clone()) {
            Ok(mut manager) => {
                // A failed discovery is tolerated; the manager is kept regardless.
                match manager.discover_topology() {
                    Ok(_) => info!("GPU topology discovered successfully"),
                    Err(e) => warn!("Failed to discover GPU topology: {}", e),
                }
                topology_manager = Some(Arc::new(manager));
            }
            Err(e) => warn!("Failed to create topology manager: {}", e),
        }
    }

    let supported_ops: HashSet<String> =
        SUPPORTED_OPS.iter().map(|op| op.to_string()).collect();
    info!(
        "GPU provider supports {} operation types",
        supported_ops.len()
    );

    Ok(Self {
        devices,
        allocators,
        supported_ops,
        config,
        device_manager,
        cuda_kernel_managers,
        memory_manager,
        topology_manager,
    })
}
/// Build without the `gpu` feature: always fails, because no GPU backend is
/// compiled in.
#[cfg(not(feature = "gpu"))]
pub fn new() -> Result<Self> {
Err(anyhow!("GPU support not compiled in"))
}
/// Build without the `gpu` feature: the configuration is ignored and the call
/// always fails, because no GPU backend is compiled in.
#[cfg(not(feature = "gpu"))]
pub fn with_config(_config: GpuProviderConfig) -> Result<Self> {
Err(anyhow!("GPU support not compiled in"))
}
/// Opens GPU device `device_id`, trying CUDA first and, on macOS only, Metal.
///
/// # Errors
/// Returns "No GPU devices available" when no backend yields a device; the
/// backend-specific error is discarded.
#[cfg(feature = "gpu")]
fn create_gpu_device(device_id: usize) -> Result<Device> {
    match Device::new_cuda(device_id) {
        Ok(cuda_device) => {
            info!("Using CUDA device {}", device_id);
            return Ok(cuda_device);
        }
        Err(_) => {}
    }

    #[cfg(target_os = "macos")]
    {
        match Device::new_metal(device_id) {
            Ok(metal_device) => {
                info!("Using Metal device {}", device_id);
                return Ok(metal_device);
            }
            Err(_) => {}
        }
    }

    Err(anyhow!("No GPU devices available"))
}
/// Returns the first configured device.
///
/// # Panics
/// Panics if the device list is empty; `with_config` rejects an empty list.
pub fn device(&self) -> &Device {
&self.devices[0]
}
/// Returns the provider's current configuration.
pub fn get_config(&self) -> &GpuProviderConfig {
&self.config
}
/// Returns `true` when `op_type` is in the provider's supported-operator set
/// (exact, case-sensitive match).
pub fn supports_operation(&self, op_type: &str) -> bool {
self.supported_ops.contains(op_type)
}
/// Returns a fixed relative cost for the operator described by `op_spec`.
///
/// Costs come from a static table keyed on the operator type; operators not
/// in the table cost `1.0`.
pub fn estimate_cost(&self, op_spec: &OperatorSpec) -> f64 {
    let op_name = op_spec.op_type.as_str();
    match op_name {
        "Add" | "Sub" | "Mul" | "Div" => 0.1,
        "ReLU" | "Sigmoid" | "Tanh" => 0.2,
        "MatMul" | "Gemm" => 0.5,
        "Conv" => 0.8,
        "ConvTranspose" => 1.2,
        "BatchNormalization" => 0.3,
        "Softmax" => 0.4,
        "MaxPool" | "AveragePool" => 0.3,
        _ => 1.0,
    }
}
/// Returns `true` when tensor cores are enabled in the configuration and the
/// first configured device is a CUDA device.
///
/// The struct has no `device` field (only `devices`), so the previous
/// `self.device` access did not compile; the first device is inspected here.
#[cfg(feature = "gpu")]
pub fn has_tensor_cores(&self) -> bool {
    self.config.enable_tensor_cores && matches!(self.devices.first(), Some(Device::Cuda(_)))
}
/// Build without the `gpu` feature: tensor cores are never available.
#[cfg(not(feature = "gpu"))]
pub fn has_tensor_cores(&self) -> bool {
false
}
/// Returns a `(first, second)` pair of byte counts for the first device.
///
/// The values are hard-coded: CUDA and Metal devices both report
/// `(8 GiB, 0)`. `get_capability` treats the first element as the total.
///
/// # Errors
/// Fails with "Not a GPU device" for any other device kind.
#[cfg(feature = "gpu")]
pub fn get_gpu_memory_info(&self) -> Result<(usize, usize)> {
    const PLACEHOLDER_BYTES: usize = 8 * 1024 * 1024 * 1024;
    let first_device = &self.devices[0];
    match first_device {
        Device::Cuda(_) | Device::Metal(_) => Ok((PLACEHOLDER_BYTES, 0)),
        _ => Err(anyhow!("Not a GPU device")),
    }
}
/// Build without the `gpu` feature: always fails, there is no GPU to query.
#[cfg(not(feature = "gpu"))]
pub fn get_gpu_memory_info(&self) -> Result<(usize, usize)> {
Err(anyhow!("GPU support not available"))
}
}
impl Default for GpuExecutionProvider {
/// Creates a provider via `GpuExecutionProvider::new`.
///
/// # Panics
/// Panics when `new` fails, which is always the case in builds without the
/// `gpu` feature. Prefer calling `new` directly when failure must be handled.
fn default() -> Self {
Self::new().expect("Failed to create default GPU provider")
}
}
impl ExecutionProvider for GpuExecutionProvider {
/// Identifies this provider as the GPU provider.
fn provider_id(&self) -> ProviderId {
ProviderId::GPU
}
/// Describes what this provider can execute.
///
/// `F16` is listed first when tensor cores are available and second
/// otherwise. Each data type appears exactly once; previously `F16` was
/// inserted a second time when tensor cores were enabled.
/// The GPU memory figure is the first element of `get_gpu_memory_info`, or 0
/// when that query fails.
fn get_capability(&self) -> ProviderCapability {
    let data_types = if self.has_tensor_cores() {
        vec![
            DataType::F16,
            DataType::F32,
            DataType::F64,
            DataType::U8,
            DataType::U32,
        ]
    } else {
        vec![
            DataType::F32,
            DataType::F16,
            DataType::F64,
            DataType::U8,
            DataType::U32,
        ]
    };
    let gpu_memory = self
        .get_gpu_memory_info()
        .map(|(total, _)| total)
        .unwrap_or(0);
    ProviderCapability {
        supported_ops: self.supported_ops.clone(),
        data_types,
        memory_types: vec![MemoryType::DeviceMemory, MemoryType::SharedMemory],
        performance_profile: PerformanceProfile::GPU,
        resource_requirements: ResourceRequirements {
            // 512 MiB minimum.
            min_memory_bytes: Some(512 * 1024 * 1024),
            cpu_features: vec![],
            gpu_memory_bytes: Some(gpu_memory),
        },
    }
}
/// Returns, for each operator in `operators` (same order), whether its type
/// is in the supported-operator set.
fn can_handle(&self, operators: &[OperatorSpec]) -> Vec<bool> {
    let mut verdicts = Vec::with_capacity(operators.len());
    for spec in operators {
        verdicts.push(self.supports_operation(&spec.op_type));
    }
    verdicts
}
fn compile_subgraph(&self, subgraph: SubGraph) -> Result<Box<dyn CompiledKernel>> {
debug!(
"Compiling subgraph with {} nodes for GPU",
subgraph.nodes.len()
);
for node in &subgraph.nodes {
if !self.supports_operation(&node.op_type) {
return Err(anyhow!(
"Unsupported GPU operation '{}' in subgraph",
node.op_type
));
}
}
let mut device_manager = self.device_manager.lock().unwrap();
let primary_op = subgraph
.nodes
.first()
.map(|n| n.op_type.as_str())
.unwrap_or("Unknown");
let estimated_memory = subgraph.nodes.len() * 1024 * 1024;
let selected_device_id = device_manager.select_device(primary_op, estimated_memory);
let device_index = self
.config
.device_ids
.iter()
.position(|&id| id == selected_device_id)
.unwrap_or(0);
let selected_device = self.devices[device_index].clone();
debug!(
"Selected GPU device {} for subgraph compilation",
selected_device_id
);
drop(device_manager);
let stream_id = selected_device_id % self.config.stream_count;
let kernel = GpuKernel::with_stream(subgraph, selected_device, stream_id)?;
debug!(
"Successfully compiled GPU kernel on device {}",
selected_device_id
);
Ok(Box::new(kernel))
}
/// Returns a shared handle to the first device's allocator.
///
/// # Panics
/// Panics if no allocator exists; `with_config` creates one per device and
/// rejects an empty device list.
fn get_allocator(&self) -> Arc<dyn TensorAllocator> {
self.allocators[0].clone()
}
/// Applies runtime configuration to the provider.
///
/// Recognised custom options: `enable_mixed_precision` and
/// `enable_tensor_cores` (booleans) and `stream_count` (positive integer).
/// Unknown keys and invalid values are logged with a warning and skipped; a
/// `stream_count` of 0 is rejected because the stream id is computed with a
/// modulo by this value.
///
/// Always returns `Ok(())`.
fn configure(&mut self, config: ProviderConfig) -> Result<()> {
    if let Some(memory_limit) = config.memory_limit {
        self.config.memory_limit = Some(memory_limit);
        info!("Updated GPU memory limit to {} bytes", memory_limit);
    }
    for (key, value) in &config.custom_options {
        match key.as_str() {
            "enable_mixed_precision" => match value.parse::<bool>() {
                Ok(enable) => {
                    self.config.enable_mixed_precision = enable;
                    info!("Updated mixed precision to {}", enable);
                }
                Err(_) => warn!(
                    "Ignoring invalid value '{}' for GPU option '{}'",
                    value, key
                ),
            },
            "enable_tensor_cores" => match value.parse::<bool>() {
                Ok(enable) => {
                    self.config.enable_tensor_cores = enable;
                    info!("Updated tensor cores to {}", enable);
                }
                Err(_) => warn!(
                    "Ignoring invalid value '{}' for GPU option '{}'",
                    value, key
                ),
            },
            "stream_count" => match value.parse::<usize>() {
                Ok(0) => warn!("Ignoring stream_count of 0; at least one stream is required"),
                Ok(count) => {
                    self.config.stream_count = count;
                    info!("Updated stream count to {}", count);
                }
                Err(_) => warn!(
                    "Ignoring invalid value '{}' for GPU option '{}'",
                    value, key
                ),
            },
            _ => {
                warn!("Unknown GPU configuration option: {}", key);
            }
        }
    }
    Ok(())
}
/// Logs the shutdown; no resources are released here and the call always
/// succeeds.
fn shutdown(&self) -> Result<()> {
info!("Shutting down GPU execution provider");
debug!("GPU provider shutdown complete");
Ok(())
}
}
impl GpuExecutionProvider {
/// Returns the allocator belonging to `device_id`, or `None` when the id is
/// not configured.
pub fn get_device_allocator(&self, device_id: usize) -> Option<Arc<dyn TensorAllocator>> {
    match self
        .config
        .device_ids
        .iter()
        .position(|&configured| configured == device_id)
    {
        Some(slot) => self.allocators.get(slot).cloned(),
        None => None,
    }
}
/// Returns a snapshot (clone) of the per-device statistics held by the
/// device manager.
///
/// # Panics
/// Panics if the device-manager lock is poisoned.
pub fn get_multi_gpu_stats(&self) -> HashMap<usize, DeviceStats> {
    let guard = self.device_manager.lock().unwrap();
    let snapshot = guard.get_device_stats().clone();
    snapshot
}
/// Enables or disables multi-GPU device selection.
///
/// The device manager holds its own copy of the configuration, so the flag is
/// written to both copies; previously only the provider's copy changed and
/// device selection kept using the old value.
///
/// The multi-GPU memory and topology managers are created only in
/// `with_config`; this call does not create or drop them.
pub fn set_multi_gpu_enabled(&mut self, enabled: bool) {
    self.config.enable_multi_gpu = enabled;
    {
        // A poisoned lock still guards valid configuration data; recover it.
        let mut manager = self
            .device_manager
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        manager.config.enable_multi_gpu = enabled;
    }
    info!(
        "Multi-GPU support {}",
        if enabled { "enabled" } else { "disabled" }
    );
}
/// Changes the load-balancing strategy used for device selection.
///
/// The device manager holds its own copy of the configuration, so the
/// strategy is written to both copies; previously only the provider's copy
/// changed and device selection kept using the old strategy.
pub fn set_load_balancing_strategy(&mut self, strategy: LoadBalancingStrategy) {
    info!("Updated load balancing strategy to {:?}", strategy);
    self.config.load_balancing = strategy;
    // A poisoned lock still guards valid configuration data; recover it.
    let mut manager = self
        .device_manager
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    manager.config.load_balancing = strategy;
}
/// Returns the number of devices this provider created.
pub fn device_count(&self) -> usize {
self.devices.len()
}
/// Returns `true` when `device_id` is one of the configured device ids.
pub fn has_device(&self, device_id: usize) -> bool {
    self.config
        .device_ids
        .iter()
        .any(|&configured| configured == device_id)
}
/// Returns `true` when `device_id` is configured and a custom CUDA kernel
/// manager was successfully created for it.
pub fn has_custom_kernels(&self, device_id: usize) -> bool {
    self.config
        .device_ids
        .iter()
        .position(|&configured| configured == device_id)
        .and_then(|slot| self.cuda_kernel_managers.get(slot))
        .map_or(false, |manager| manager.is_some())
}
/// Lists the names of the custom kernel operations for `device_id`.
///
/// The list is fixed; it is empty when the device has no custom kernel
/// manager.
pub fn get_custom_kernel_ops(&self, device_id: usize) -> Vec<String> {
    const CUSTOM_KERNEL_OPS: [&str; 6] = [
        "FusedMatMulBias",
        "OptimizedSoftmax",
        "FusedConvBnRelu",
        "WarpReduceSum",
        "TensorCoreGemm",
        "FastGelu",
    ];
    if !self.has_custom_kernels(device_id) {
        return Vec::new();
    }
    CUSTOM_KERNEL_OPS.iter().map(|op| op.to_string()).collect()
}
/// Tries to run `op_type` with a custom CUDA kernel on `device_id`.
///
/// Returns `Ok(None)` when the device has no kernel manager or the manager
/// has no kernel for the operation (the lookup error is discarded), so the
/// caller can fall back to the regular path. The output buffer handed to the
/// kernel starts as a clone of the inputs.
///
/// # Errors
/// Fails when `device_id` is not configured or when kernel execution fails.
pub fn try_execute_with_custom_kernel(
    &self,
    device_id: usize,
    op_type: &str,
    inputs: &[CandleTensor],
) -> Result<Option<Vec<CandleTensor>>> {
    let slot = self
        .config
        .device_ids
        .iter()
        .position(|&configured| configured == device_id)
        .ok_or_else(|| anyhow!("Device {} not found", device_id))?;

    let kernel_manager = match self.cuda_kernel_managers.get(slot) {
        Some(Some(manager)) => manager,
        _ => return Ok(None),
    };

    // The first input's shape selects the kernel; `[1]` when there are no inputs.
    let lookup_shape = match inputs.first() {
        Some(first) => first.shape().dims().to_vec(),
        None => vec![1],
    };

    let mut kernel = match kernel_manager.get_optimized_kernel(op_type, &lookup_shape) {
        Ok(found) => found,
        Err(_) => return Ok(None),
    };

    info!("Using custom CUDA kernel for operation: {}", op_type);
    let mut outputs: Vec<CandleTensor> = inputs.to_vec();
    kernel_manager.execute_kernel(&mut kernel, inputs, &mut outputs)?;
    Ok(Some(outputs))
}
/// Clears the cache of every existing CUDA kernel manager.
pub fn clear_kernel_caches(&self) {
    for slot in &self.cuda_kernel_managers {
        if let Some(manager) = slot {
            manager.clear_cache();
        }
    }
    info!("Cleared all CUDA kernel caches");
}
/// Collects cache statistics from every existing CUDA kernel manager, in
/// device order; devices without a manager contribute nothing.
pub fn get_kernel_cache_stats(&self) -> Vec<super::cuda_kernels::CacheStats> {
    let mut collected = Vec::new();
    for manager in self.cuda_kernel_managers.iter().flatten() {
        collected.push(manager.get_cache_stats());
    }
    collected
}
pub fn transfer_tensor_between_devices(
&self,
tensor: &CandleTensor,
target_device_id: usize,
) -> Result<CandleTensor> {
if let Some(ref _memory_manager) = self.memory_manager {
info!(
"Using multi-GPU memory manager for tensor transfer to device {}",
target_device_id
);
let target_device = &self.devices[self
.config
.device_ids
.iter()
.position(|&id| id == target_device_id)
.unwrap_or(0)];
Ok(tensor.to_device(target_device)?)
} else {
let target_device = &self.devices[self
.config
.device_ids
.iter()
.position(|&id| id == target_device_id)
.unwrap_or(0)];
Ok(tensor.to_device(target_device)?)
}
}
pub fn synchronize_memory(&self) -> Result<()> {
if let Some(ref memory_manager) = self.memory_manager {
memory_manager.synchronize_all()
} else {
Ok(())
}
}
pub fn get_memory_statistics(&self) -> HashMap<usize, super::memory_manager::MemoryPoolStats> {
if let Some(ref memory_manager) = self.memory_manager {
memory_manager.get_memory_stats()
} else {
HashMap::new()
}
}
/// Returns global memory statistics, or `None` when no multi-GPU memory
/// manager exists.
pub fn get_global_memory_stats(&self) -> Option<super::memory_manager::GlobalMemoryStats> {
    match self.memory_manager.as_ref() {
        Some(manager) => Some(manager.get_global_stats()),
        None => None,
    }
}
pub fn get_p2p_connectivity(
&self,
) -> HashMap<(usize, usize), super::memory_manager::P2PCapability> {
if let Some(ref memory_manager) = self.memory_manager {
memory_manager.get_p2p_info()
} else {
HashMap::new()
}
}
pub fn is_p2p_available(&self, src_device: usize, dst_device: usize) -> bool {
if let Some(ref memory_manager) = self.memory_manager {
let p2p_info = memory_manager.get_p2p_info();
p2p_info
.get(&(src_device, dst_device))
.map(|cap| cap.supported)
.unwrap_or(false)
} else {
false
}
}
/// Asks the multi-GPU memory manager for a memory layout suited to
/// `access_pattern`.
///
/// # Errors
/// Fails when no memory manager exists or when the manager reports an error.
pub fn optimize_memory_layout(
    &self,
    access_pattern: &super::memory_manager::AccessPattern,
) -> Result<super::memory_manager::MemoryLayout> {
    match self.memory_manager.as_ref() {
        None => Err(anyhow!("Multi-GPU memory manager not available")),
        Some(manager) => manager.optimize_memory_layout(access_pattern),
    }
}
/// Returns the GPU topology, or `None` when no topology manager exists.
pub fn get_topology(&self) -> Option<super::topology::GpuTopology> {
    match self.topology_manager.as_ref() {
        Some(manager) => Some(manager.get_topology()),
        None => None,
    }
}
/// Asks the topology manager for a placement plan for `workload` using the
/// named `strategy`.
///
/// # Errors
/// Fails when no topology manager exists or when the manager reports an error.
pub fn optimize_workload_placement(
    &self,
    workload: &super::topology::Workload,
    strategy: &str,
) -> Result<super::topology::PlacementPlan> {
    match self.topology_manager.as_ref() {
        None => Err(anyhow!("GPU topology manager not available")),
        Some(manager) => manager.optimize_placement(workload, strategy),
    }
}
/// Asks the topology manager to evaluate `workload` under each of the named
/// `strategies`, returning strategy-name / plan pairs.
///
/// # Errors
/// Fails when no topology manager exists or when the manager reports an error.
pub fn compare_placement_strategies(
    &self,
    workload: &super::topology::Workload,
    strategies: &[String],
) -> Result<Vec<(String, super::topology::PlacementPlan)>> {
    match self.topology_manager.as_ref() {
        None => Err(anyhow!("GPU topology manager not available")),
        Some(manager) => manager.compare_strategies(workload, strategies),
    }
}
pub fn get_available_placement_strategies(&self) -> Vec<String> {
if let Some(ref topology_manager) = self.topology_manager {
topology_manager.get_available_strategies()
} else {
vec![]
}
}
/// Returns `true` when a topology manager was created for this provider.
pub fn has_topology_support(&self) -> bool {
self.topology_manager.is_some()
}
pub fn get_detailed_device_info(&self) -> HashMap<usize, super::topology::GpuDeviceInfo> {
if let Some(ref topology_manager) = self.topology_manager {
topology_manager.get_topology().devices
} else {
HashMap::new()
}
}
pub fn get_interconnect_info(
&self,
) -> HashMap<(usize, usize), super::topology::InterconnectLink> {
if let Some(ref topology_manager) = self.topology_manager {
topology_manager.get_topology().links
} else {
HashMap::new()
}
}
/// Returns the device ids assigned by a `"locality_aware"` placement plan for
/// `workload`.
///
/// The result contains one entry per assignment in the plan, so a device id
/// can appear more than once.
///
/// # Errors
/// Propagates any error from `optimize_workload_placement`.
pub fn auto_select_devices(&self, workload: &super::topology::Workload) -> Result<Vec<usize>> {
    let placement = self.optimize_workload_placement(workload, "locality_aware")?;
    let mut assigned = Vec::new();
    for device_id in placement.device_assignments.values() {
        assigned.push(*device_id);
    }
    Ok(assigned)
}
}
impl GpuKernel {
/// Creates a kernel for `subgraph` on `device` using stream 0, with empty
/// statistics and an empty operation cache (`max_cache_size` = 64 * 1024 * 1024).
pub fn new(subgraph: SubGraph, device: Device) -> Result<Self> {
    let empty_cache = KernelCache {
        cached_ops: HashMap::new(),
        cache_size: 0,
        max_cache_size: 64 * 1024 * 1024,
    };
    let kernel = Self {
        subgraph,
        device,
        stats: std::sync::Mutex::new(GpuKernelStats::default()),
        stream_id: 0,
        kernel_cache: std::sync::Mutex::new(empty_cache),
    };
    Ok(kernel)
}
/// Creates a kernel for `subgraph` on `device` bound to `stream_id`, with
/// empty statistics and an empty operation cache
/// (`max_cache_size` = 64 * 1024 * 1024).
pub fn with_stream(subgraph: SubGraph, device: Device, stream_id: usize) -> Result<Self> {
    let stats = std::sync::Mutex::new(GpuKernelStats::default());
    let kernel_cache = std::sync::Mutex::new(KernelCache {
        cached_ops: HashMap::new(),
        cache_size: 0,
        max_cache_size: 64 * 1024 * 1024,
    });
    Ok(Self {
        subgraph,
        device,
        stats,
        stream_id,
        kernel_cache,
    })
}
/// Executes a single operator on already-converted tensors and returns its
/// output as a one-element vector.
///
/// Behaviour notes:
/// * `Conv` and `ConvTranspose` use fixed padding 1, stride 1, dilation 1;
///   node attributes are not consulted. `ConvTranspose` now calls
///   `conv_transpose2d` (it previously ran a forward `conv2d`).
/// * `BatchNormalization` and `LayerNormalization` use broadcasting
///   arithmetic: the per-channel / per-row statistics do not have the input's
///   shape, and candle's plain `+ - * /` operators require equal shapes.
/// * `MaxPool`, `AveragePool` and `Reshape` validate their input and return it
///   unchanged; they are pass-through placeholders.
///
/// # Errors
/// Fails on a wrong number of inputs, an input of insufficient rank, an
/// unknown operator type, or any tensor-library error.
fn execute_gpu_operation(
    &self,
    op_type: &str,
    inputs: &[CandleTensor],
) -> Result<Vec<CandleTensor>> {
    // Arity guard shared by all fixed-arity operators.
    let require_exact = |expected: usize| -> Result<()> {
        if inputs.len() == expected {
            Ok(())
        } else {
            Err(anyhow!(
                "{} requires exactly {} input{}",
                op_type,
                expected,
                if expected == 1 { "" } else { "s" }
            ))
        }
    };

    let output = match op_type {
        "Add" => {
            require_exact(2)?;
            (&inputs[0] + &inputs[1])?
        }
        "Sub" => {
            require_exact(2)?;
            (&inputs[0] - &inputs[1])?
        }
        "Mul" => {
            require_exact(2)?;
            (&inputs[0] * &inputs[1])?
        }
        "Div" => {
            require_exact(2)?;
            (&inputs[0] / &inputs[1])?
        }
        "MatMul" => {
            require_exact(2)?;
            inputs[0].matmul(&inputs[1])?
        }
        "ReLU" => {
            require_exact(1)?;
            let zero = inputs[0].zeros_like()?;
            inputs[0].maximum(&zero)?
        }
        "Softmax" => {
            require_exact(1)?;
            candle_nn::ops::softmax_last_dim(&inputs[0])?
        }
        "Sigmoid" => {
            require_exact(1)?;
            // sigmoid(x) = 1 / (1 + exp(-x))
            let exp_neg = inputs[0].neg()?.exp()?;
            let one = inputs[0].ones_like()?;
            let denominator = (&one + &exp_neg)?;
            one.div(&denominator)?
        }
        "Tanh" => {
            require_exact(1)?;
            inputs[0].tanh()?
        }
        "GELU" => {
            require_exact(1)?;
            // tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
            let x = &inputs[0];
            let cubic_term = x.powf(3.0)?.affine(0.044715, 0.0)?;
            let sqrt_2_over_pi = (2.0 / std::f64::consts::PI).sqrt();
            let inner = (x + &cubic_term)?.affine(sqrt_2_over_pi, 0.0)?;
            let one_plus_tanh = (&x.ones_like()? + &inner.tanh()?)?;
            let half_x = x.affine(0.5, 0.0)?;
            (&half_x * &one_plus_tanh)?
        }
        "MaxPool" | "AveragePool" => {
            require_exact(1)?;
            if inputs[0].dims().len() < 3 {
                return Err(anyhow!("{} requires at least 3D input (CHW)", op_type));
            }
            // Placeholder: the input is returned unchanged.
            inputs[0].clone()
        }
        "Conv" => {
            if inputs.len() < 2 {
                return Err(anyhow!(
                    "Conv requires at least 2 inputs (input and weights)"
                ));
            }
            // padding = 1, stride = 1, dilation = 1, groups = 1
            inputs[0].conv2d(&inputs[1], 1, 1, 1, 1)?
        }
        "ConvTranspose" => {
            if inputs.len() < 2 {
                return Err(anyhow!("ConvTranspose requires at least 2 inputs"));
            }
            // padding = 1, output_padding = 0, stride = 1, dilation = 1
            inputs[0].conv_transpose2d(&inputs[1], 1, 0, 1, 1)?
        }
        "BatchNormalization" => {
            if inputs.len() < 5 {
                return Err(anyhow!(
                    "BatchNormalization requires 5 inputs: input, scale, bias, mean, var"
                ));
            }
            let input = &inputs[0];
            let rank = input.dims().len();
            if rank < 2 {
                return Err(anyhow!(
                    "BatchNormalization requires an input with at least 2 dimensions"
                ));
            }
            let channels = input.dims()[1];
            // 1-D parameters become [1, C, 1, ...] so they broadcast over the
            // input; for 4-D input this is the usual [1, C, 1, 1].
            let mut stat_shape = vec![1usize; rank];
            stat_shape[1] = channels;
            let per_channel = |param: &CandleTensor| -> Result<CandleTensor> {
                if param.dims().len() == 1 {
                    Ok(param.reshape(stat_shape.as_slice())?)
                } else {
                    Ok(param.clone())
                }
            };
            let scale = per_channel(&inputs[1])?;
            let bias = per_channel(&inputs[2])?;
            let mean = per_channel(&inputs[3])?;
            let var = per_channel(&inputs[4])?;
            let eps = 1e-5;
            let centered = input.broadcast_sub(&mean)?;
            let std_dev = (&var + eps)?.sqrt()?;
            centered
                .broadcast_div(&std_dev)?
                .broadcast_mul(&scale)?
                .broadcast_add(&bias)?
        }
        "LayerNormalization" => {
            if inputs.len() < 3 {
                return Err(anyhow!(
                    "LayerNormalization requires 3 inputs: input, scale, bias"
                ));
            }
            let input = &inputs[0];
            let scale = &inputs[1];
            let bias = &inputs[2];
            let rank = input.dims().len();
            if rank == 0 {
                return Err(anyhow!(
                    "LayerNormalization requires an input with at least 1 dimension"
                ));
            }
            let last_dim = rank - 1;
            let eps = 1e-5;
            // Statistics keep the reduced dimension (size 1), hence broadcasting.
            let mean = input.mean_keepdim(last_dim)?;
            let centered = input.broadcast_sub(&mean)?;
            let variance = (&centered * &centered)?.mean_keepdim(last_dim)?;
            let std_dev = (&variance + eps)?.sqrt()?;
            centered
                .broadcast_div(&std_dev)?
                .broadcast_mul(scale)?
                .broadcast_add(bias)?
        }
        "GlobalAveragePool" => {
            require_exact(1)?;
            if inputs[0].dims().len() != 4 {
                return Err(anyhow!("GlobalAveragePool expects 4D input (NCHW)"));
            }
            inputs[0].mean_keepdim(2)?.mean_keepdim(3)?
        }
        "Reshape" => {
            require_exact(1)?;
            // Placeholder: the input is returned unchanged.
            inputs[0].clone()
        }
        _ => return Err(anyhow!("Unsupported GPU operation: {}", op_type)),
    };
    Ok(vec![output])
}
/// Converts a ronn tensor into a candle tensor on this kernel's device.
///
/// The data is uploaded via `to_vec`/`from_vec` and then cast to the candle
/// dtype matching the source dtype; dtypes without an explicit mapping are
/// cast to `F32`.
fn ronn_to_candle(&self, tensor: &ronn_core::tensor::Tensor) -> Result<CandleTensor> {
    let values = tensor.to_vec()?;
    let dims = tensor.shape();
    let target_dtype = match tensor.dtype() {
        DataType::F32 => candle_core::DType::F32,
        DataType::F16 => candle_core::DType::F16,
        DataType::F64 => candle_core::DType::F64,
        DataType::U8 => candle_core::DType::U8,
        DataType::U32 => candle_core::DType::U32,
        // Fallback for every other source dtype.
        _ => candle_core::DType::F32,
    };
    let uploaded = CandleTensor::from_vec(values, dims.as_slice(), &self.device)?;
    let converted = uploaded.to_dtype(target_dtype)?;
    Ok(converted)
}
/// Builds a cache key of the form `<op_type>_<hex hash>`.
///
/// The hash covers the operator type and, for every input in order, its
/// dimensions and the debug representation of its dtype.
fn generate_operation_signature(&self, op_type: &str, inputs: &[CandleTensor]) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    op_type.hash(&mut state);
    inputs.iter().for_each(|tensor| {
        tensor.dims().hash(&mut state);
        let dtype_label = format!("{:?}", tensor.dtype());
        dtype_label.hash(&mut state);
    });
    let digest = state.finish();
    format!("{}_{:x}", op_type, digest)
}
/// Returns `true` when `op_type` is on the list of operators eligible for
/// FP16 input conversion.
fn can_use_mixed_precision(&self, op_type: &str) -> bool {
    const MIXED_PRECISION_OPS: [&str; 11] = [
        "Add",
        "Sub",
        "Mul",
        "MatMul",
        "Conv",
        "ReLU",
        "Sigmoid",
        "Tanh",
        "GELU",
        "BatchNormalization",
        "LayerNormalization",
    ];
    MIXED_PRECISION_OPS.contains(&op_type)
}
/// Optionally converts the inputs of an eligible operator to FP16.
///
/// Conversion is all-or-nothing across the F32 inputs: if any F32 input has
/// more than 1024 elements, every F32 input is converted. Previously each
/// tensor was decided on independently, which could hand one operator a mix
/// of F16 and F32 operands (for example a large activation with a small
/// scale vector) and make the operation fail on the dtype mismatch.
///
/// Inputs are returned unchanged for ineligible operators or when no F32
/// input exceeds the size threshold.
///
/// # Errors
/// Fails when a dtype conversion fails.
fn apply_mixed_precision(
    &self,
    inputs: &[CandleTensor],
    op_type: &str,
) -> Result<Vec<CandleTensor>> {
    const MIN_ELEMENTS_FOR_FP16: usize = 1024;

    if !self.can_use_mixed_precision(op_type) {
        return Ok(inputs.to_vec());
    }
    let is_f32 = |tensor: &CandleTensor| tensor.dtype() == candle_core::DType::F32;
    let should_downcast = inputs.iter().any(|tensor| {
        is_f32(tensor) && tensor.dims().iter().product::<usize>() > MIN_ELEMENTS_FOR_FP16
    });
    if !should_downcast {
        return Ok(inputs.to_vec());
    }

    let mut converted = Vec::with_capacity(inputs.len());
    for input in inputs {
        if is_f32(input) {
            converted.push(input.to_dtype(candle_core::DType::F16)?);
            debug!("Converted tensor to FP16 for operation: {}", op_type);
        } else {
            converted.push(input.clone());
        }
    }
    Ok(converted)
}
fn execute_optimized_operation(
&self,
op_type: &str,
inputs: &[CandleTensor],
) -> Result<Vec<CandleTensor>> {
let signature = self.generate_operation_signature(op_type, inputs);
{
let mut cache = self.kernel_cache.lock().unwrap();
if let Some(cached_op) = cache.cached_ops.get_mut(&signature) {
cached_op.hit_count += 1;
cached_op.last_accessed = std::time::Instant::now();
debug!(
"Cache hit for operation: {} (signature: {})",
op_type, signature
);
}
}
let optimized_inputs = self.apply_mixed_precision(inputs, op_type)?;
let result = self.execute_gpu_operation(op_type, &optimized_inputs)?;
{
let mut cache = self.kernel_cache.lock().unwrap();
let cached_op = CachedOperation {
signature: signature.clone(),
execution_path: OptimizedPath::Single(op_type.to_string()),
hit_count: 1,
last_accessed: std::time::Instant::now(),
};
cache.cached_ops.insert(signature, cached_op);
if cache.cached_ops.len() > 1000 {
self.evict_cache_entries(&mut cache);
}
}
Ok(result)
}
/// Removes every cache entry that has not been accessed for more than
/// 300 seconds and logs how many entries were removed.
///
/// The log previously printed the number of entries remaining in the cache
/// rather than the number evicted.
fn evict_cache_entries(&self, cache: &mut KernelCache) {
    const MAX_IDLE_SECS: u64 = 300;

    let now = std::time::Instant::now();
    let entries_before = cache.cached_ops.len();
    cache
        .cached_ops
        .retain(|_, cached_op| now.duration_since(cached_op.last_accessed).as_secs() <= MAX_IDLE_SECS);
    let evicted = entries_before - cache.cached_ops.len();
    debug!("Evicted {} cache entries", evicted);
}
/// Returns `(entry_count, cache_size, hits_per_entry)` for the operation cache.
///
/// The third value is the sum of all `hit_count`s divided by the number of
/// entries (0.0 for an empty cache), i.e. an average per entry rather than a
/// ratio between 0 and 1.
///
/// # Panics
/// Panics if the cache lock is poisoned.
pub fn get_cache_stats(&self) -> (usize, usize, f64) {
    let guard = self.kernel_cache.lock().unwrap();
    let entry_count = guard.cached_ops.len();
    let mut hit_total: u64 = 0;
    for entry in guard.cached_ops.values() {
        hit_total += entry.hit_count;
    }
    let hits_per_entry = match entry_count {
        0 => 0.0,
        count => hit_total as f64 / count as f64,
    };
    (entry_count, guard.cache_size, hits_per_entry)
}
fn candle_to_ronn(&self, tensor: &CandleTensor) -> Result<ronn_core::tensor::Tensor> {
let shape = tensor.dims().to_vec();
let data: Vec<f32> = tensor.to_vec1()?;
let ronn_tensor = Tensor::from_data(
data,
shape,
DataType::F32, TensorLayout::RowMajor,
)?;
Ok(ronn_tensor)
}
}
impl CompiledKernel for GpuKernel {
    /// Runs every node of the kernel's subgraph in order and returns the final
    /// tensors.
    ///
    /// The inputs are converted to Candle tensors, each node's outputs become
    /// the next node's inputs, and the last node's outputs are converted back
    /// to RONN tensors. Timing statistics (count, total, min, max) are updated
    /// after a successful run.
    ///
    /// # Errors
    ///
    /// Returns the first error raised by a tensor conversion or by an
    /// operation; statistics are not updated in that case.
    ///
    /// # Panics
    ///
    /// Panics if the statistics mutex is poisoned.
    fn execute(
        &self,
        inputs: &[ronn_core::tensor::Tensor],
    ) -> Result<Vec<ronn_core::tensor::Tensor>> {
        let start_time = std::time::Instant::now();

        let mut current_tensors = Vec::with_capacity(inputs.len());
        for input in inputs {
            current_tensors.push(self.ronn_to_candle(input)?);
        }

        // Chain the nodes: each node consumes the previous node's outputs.
        for node in &self.subgraph.nodes {
            debug!(
                "Executing GPU operation: {} on stream {}",
                node.op_type, self.stream_id
            );
            current_tensors = self.execute_optimized_operation(&node.op_type, &current_tensors)?;
        }

        let results = current_tensors
            .iter()
            .map(|candle_tensor| self.candle_to_ronn(candle_tensor))
            .collect::<Result<Vec<_>>>()?;

        let execution_time = start_time.elapsed();
        let elapsed_us = execution_time.as_micros() as u64;
        {
            let mut stats = self.stats.lock().unwrap();
            stats.execution_count += 1;
            stats.total_time_us += elapsed_us;
            if stats.execution_count == 1 {
                // First run seeds both extremes.
                stats.min_time_us = elapsed_us;
                stats.max_time_us = elapsed_us;
            } else {
                stats.min_time_us = stats.min_time_us.min(elapsed_us);
                stats.max_time_us = stats.max_time_us.max(elapsed_us);
            }
        }

        debug!("GPU kernel executed in {:?}", execution_time);
        Ok(results)
    }

    /// Reports memory usage derived from the recorded statistics.
    ///
    /// `peak_bytes` comes from the stats' `memory_peak`; `current_bytes` is
    /// always reported as `0`, and `allocation_count` mirrors the number of
    /// executions.
    ///
    /// # Panics
    ///
    /// Panics if the statistics mutex is poisoned.
    fn get_memory_usage(&self) -> MemoryUsage {
        let stats = self.stats.lock().unwrap();
        MemoryUsage {
            peak_bytes: stats.memory_peak,
            current_bytes: 0,
            allocation_count: stats.execution_count as usize,
        }
    }

    /// Returns execution count plus average, minimum and maximum execution
    /// time in microseconds. The average is `0.0` before the first execution.
    ///
    /// # Panics
    ///
    /// Panics if the statistics mutex is poisoned.
    fn get_performance_stats(&self) -> KernelStats {
        let stats = self.stats.lock().unwrap();
        let average_time_us = if stats.execution_count > 0 {
            stats.total_time_us as f64 / stats.execution_count as f64
        } else {
            0.0
        };
        KernelStats {
            execution_count: stats.execution_count,
            average_time_us,
            min_time_us: stats.min_time_us as f64,
            max_time_us: stats.max_time_us as f64,
        }
    }
}
/// Builds a [`GpuExecutionProvider`] via `GpuExecutionProvider::new` and
/// returns it as a shared `ExecutionProvider` trait object.
///
/// # Errors
///
/// Propagates any error returned by `GpuExecutionProvider::new`.
pub fn create_gpu_provider() -> Result<Arc<dyn ExecutionProvider>> {
    let provider: Arc<dyn ExecutionProvider> = Arc::new(GpuExecutionProvider::new()?);
    Ok(provider)
}
/// Builds a [`GpuExecutionProvider`] from `config` via
/// `GpuExecutionProvider::with_config` and returns it as a shared
/// `ExecutionProvider` trait object.
///
/// # Errors
///
/// Propagates any error returned by `GpuExecutionProvider::with_config`.
pub fn create_gpu_provider_with_config(
    config: GpuProviderConfig,
) -> Result<Arc<dyn ExecutionProvider>> {
    let provider: Arc<dyn ExecutionProvider> =
        Arc::new(GpuExecutionProvider::with_config(config)?);
    Ok(provider)
}
#[cfg(test)]
mod tests {
use super::*;
use ronn_core::{AttributeValue, GraphNode};
/// Builds a one-node subgraph: an `Add` node named `gpu_add` that reads
/// `input1` and `input2` and writes `output1`.
fn create_test_subgraph() -> SubGraph {
    let input_names = vec![String::from("input1"), String::from("input2")];
    let output_names = vec![String::from("output1")];

    let add_node = GraphNode {
        id: 0,
        op_type: String::from("Add"),
        attributes: HashMap::new(),
        inputs: input_names.clone(),
        outputs: output_names.clone(),
        name: Some(String::from("gpu_add")),
    };

    SubGraph {
        nodes: vec![add_node],
        edges: Vec::new(),
        inputs: input_names,
        outputs: output_names,
    }
}
/// Checks the identity and advertised capability of a default provider.
/// When no provider can be created the test only prints a notice.
#[test]
fn test_gpu_provider_creation() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(e) => {
            println!("GPU not available: {}", e);
            return;
        }
    };

    assert_eq!(provider.provider_id(), ProviderId::GPU);

    let capability = provider.get_capability();
    assert_eq!(capability.performance_profile, PerformanceProfile::GPU);
    assert!(!capability.supported_ops.is_empty());
    assert!(capability.data_types.contains(&DataType::F32));
}
/// Verifies that disabling mixed precision and tensor cores in the config is
/// reflected by the provider. Provider creation failure is silently accepted.
#[test]
fn test_gpu_provider_config() {
    let config = GpuProviderConfig {
        device_ids: vec![0],
        enable_mixed_precision: false,
        enable_tensor_cores: false,
        ..GpuProviderConfig::default()
    };

    if let Ok(provider) = GpuExecutionProvider::with_config(config) {
        assert!(!provider.get_config().enable_mixed_precision);
        assert!(!provider.get_config().enable_tensor_cores);
    }
}
/// Checks operation support lookups and the relative cost estimates of `Add`
/// and `Conv`. Provider creation failure is silently accepted.
#[test]
fn test_operation_support() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(_) => return,
    };

    assert!(provider.supports_operation("Add"));
    assert!(provider.supports_operation("MatMul"));
    assert!(provider.supports_operation("Conv"));
    assert!(provider.supports_operation("ReLU"));
    assert!(!provider.supports_operation("NonexistentOp"));

    // Both specs differ only in the operator name.
    let f32_spec = |op_type: &str| OperatorSpec {
        op_type: op_type.to_string(),
        input_types: vec![DataType::F32],
        output_types: vec![DataType::F32],
        attributes: HashMap::new(),
    };
    let add_op = f32_spec("Add");
    let conv_op = f32_spec("Conv");

    let add_cost = provider.estimate_cost(&add_op);
    let conv_cost = provider.estimate_cost(&conv_op);
    assert!(conv_cost > add_cost);
    assert!(add_cost < 1.0);
}
/// Compiles the two-input `Add` test subgraph and checks that a fresh kernel
/// reports zero executions. Compilation failure is only printed; provider
/// creation failure is silently accepted.
#[test]
fn test_subgraph_compilation() {
    if let Ok(provider) = GpuExecutionProvider::new() {
        let compiled = provider.compile_subgraph(create_test_subgraph());
        match compiled {
            Err(e) => println!("Compilation failed: {}", e),
            Ok(kernel) => {
                let stats = kernel.get_performance_stats();
                assert_eq!(stats.execution_count, 0);
            }
        }
    }
}
/// Exercises both factory functions and checks the provider id of whatever
/// they manage to create. Creation failures are silently accepted.
#[test]
fn test_factory_functions() {
    if let Ok(provider) = create_gpu_provider() {
        assert_eq!(provider.provider_id(), ProviderId::GPU);
    }

    let default_config = GpuProviderConfig::default();
    if let Ok(provider) = create_gpu_provider_with_config(default_config) {
        assert_eq!(provider.provider_id(), ProviderId::GPU);
    }
}
/// Checks that the provider advertises a fixed set of complex operators.
/// When no provider can be created the test only prints a notice.
#[test]
fn test_complex_gpu_operations() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(e) => {
            println!("GPU not available: {}", e);
            return;
        }
    };

    let capability = provider.get_capability();
    let ops = &capability.supported_ops;
    assert!(ops.contains("Conv"));
    assert!(ops.contains("BatchNormalization"));
    assert!(ops.contains("LayerNormalization"));
    assert!(ops.contains("GlobalAveragePool"));
    assert!(ops.contains("Sigmoid"));
    assert!(ops.contains("Tanh"));
    assert!(ops.contains("GELU"));

    println!(
        "✅ GPU provider supports {} complex operations",
        ops.len()
    );
}
/// Runs every benchmark helper in sequence against a default provider.
/// When no provider can be created the test only prints a notice.
#[test]
fn test_gpu_benchmarks() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(e) => {
            println!("GPU not available for benchmarks: {}", e);
            return;
        }
    };

    println!("🚀 Running GPU performance benchmarks...");
    benchmark_basic_operations(&provider);
    benchmark_complex_operations(&provider);
    benchmark_mixed_precision(&provider);
    benchmark_cache_performance(&provider);
    benchmark_memory_throughput(&provider);
    println!("✅ GPU benchmarks completed!");
}
/// Times ten executions of each basic operator on a 1024x1024 tensor of ones
/// and prints the average. Operators that fail to compile are skipped, and
/// execution results (including errors) are discarded.
fn benchmark_basic_operations(provider: &GpuExecutionProvider) {
    use std::time::Instant;

    const RUNS: u32 = 10;

    println!(" 📊 Basic Operations Benchmark:");

    for &op in &["Add", "Mul", "MatMul", "ReLU", "Sigmoid", "Tanh"] {
        let kernel = match provider.compile_subgraph(create_single_op_subgraph(op)) {
            Ok(kernel) => kernel,
            Err(_) => continue,
        };

        let test_input = ronn_core::tensor::Tensor::ones(
            vec![1024, 1024],
            DataType::F32,
            TensorLayout::RowMajor,
        )
        .unwrap();

        let timer = Instant::now();
        (0..RUNS).for_each(|_| {
            let _ = kernel.execute(&[test_input.clone()]);
        });
        let avg_time = timer.elapsed() / RUNS;

        println!(" {} avg: {:?}", op, avg_time);
    }
}
/// Times five executions of each complex operator and prints the average.
/// `Conv` runs on a `[1, 64, 224, 224]` tensor of ones; every other operator
/// runs on `[32, 512]`. Operators that fail to compile are skipped, and
/// execution results (including errors) are discarded.
fn benchmark_complex_operations(provider: &GpuExecutionProvider) {
    use std::time::Instant;

    const RUNS: u32 = 5;

    println!(" 🧠Complex Operations Benchmark:");

    let complex_ops = [
        "Conv",
        "BatchNormalization",
        "LayerNormalization",
        "GlobalAveragePool",
    ];

    for &op in &complex_ops {
        let kernel = match provider.compile_subgraph(create_single_op_subgraph(op)) {
            Ok(kernel) => kernel,
            Err(_) => continue,
        };

        // Only Conv gets a 4-D input.
        let input_shape = if op == "Conv" {
            vec![1, 64, 224, 224]
        } else {
            vec![32, 512]
        };
        let test_input =
            ronn_core::tensor::Tensor::ones(input_shape, DataType::F32, TensorLayout::RowMajor)
                .unwrap();

        let timer = Instant::now();
        (0..RUNS).for_each(|_| {
            let _ = kernel.execute(&[test_input.clone()]);
        });
        let avg_time = timer.elapsed() / RUNS;

        println!(" {} avg: {:?}", op, avg_time);
    }
}
/// Prints whether tensor cores are available and lists the matrix sizes
/// 512, 1024 and 2048. No kernels are executed here.
fn benchmark_mixed_precision(provider: &GpuExecutionProvider) {
    println!(" 🎯 Mixed Precision Benchmark:");

    let tensor_core_status = if provider.has_tensor_cores() {
        " Tensor cores available - mixed precision enabled"
    } else {
        " Tensor cores not available - mixed precision simulation"
    };
    println!("{}", tensor_core_status);

    for &size in &[512, 1024, 2048] {
        println!(" Matrix size: {}x{}", size, size);
    }
}
/// Runs an `Add` kernel five times untimed, then times twenty further runs on
/// a 512x512 tensor of ones and prints the average. Does nothing beyond the
/// header line when compilation fails; execution results are discarded.
fn benchmark_cache_performance(provider: &GpuExecutionProvider) {
    use std::time::Instant;

    const WARMUP_RUNS: u32 = 5;
    const TIMED_RUNS: u32 = 20;

    println!(" 💾 Cache Performance Benchmark:");

    let kernel = match provider.compile_subgraph(create_single_op_subgraph("Add")) {
        Ok(kernel) => kernel,
        Err(_) => return,
    };

    let test_input = ronn_core::tensor::Tensor::ones(
        vec![512, 512],
        DataType::F32,
        TensorLayout::RowMajor,
    )
    .unwrap();

    // Untimed runs ahead of the measurement.
    (0..WARMUP_RUNS).for_each(|_| {
        let _ = kernel.execute(&[test_input.clone()]);
    });

    let timer = Instant::now();
    (0..TIMED_RUNS).for_each(|_| {
        let _ = kernel.execute(&[test_input.clone()]);
    });
    let cached_time = timer.elapsed() / TIMED_RUNS;

    println!(" Cached execution avg: {:?}", cached_time);
}
/// Prints total GPU memory (when the query succeeds), then times up to 100
/// allocations of a `[1024]` F32 buffer and the deallocation of every buffer
/// that was obtained. Failed allocations and deallocation errors are ignored.
fn benchmark_memory_throughput(provider: &GpuExecutionProvider) {
    use std::time::Instant;

    const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;

    println!(" 🚀 Memory Throughput Benchmark:");

    if let Ok((total_memory, _used_memory)) = provider.get_gpu_memory_info() {
        println!(
            " GPU Memory: {:.2} GB total",
            total_memory as f64 / BYTES_PER_GIB
        );
    }

    let allocator = provider.get_allocator();

    let alloc_timer = Instant::now();
    let buffers: Vec<_> = (0..100)
        .filter_map(|_| allocator.allocate(&[1024], DataType::F32).ok())
        .collect();
    let alloc_time = alloc_timer.elapsed();

    let dealloc_timer = Instant::now();
    buffers.into_iter().for_each(|buffer| {
        let _ = allocator.deallocate(buffer);
    });
    let dealloc_time = dealloc_timer.elapsed();

    println!(" 100 allocations: {:?}", alloc_time);
    println!(" 100 deallocations: {:?}", dealloc_time);
}
/// Builds a one-node subgraph for `op_type` that reads `input1` and writes
/// `output1`. The node is named `test_<op_type>`.
fn create_single_op_subgraph(op_type: &str) -> SubGraph {
    let input_names = vec![String::from("input1")];
    let output_names = vec![String::from("output1")];

    let single_node = GraphNode {
        id: 0,
        op_type: String::from(op_type),
        attributes: HashMap::new(),
        inputs: input_names.clone(),
        outputs: output_names.clone(),
        name: Some(format!("test_{}", op_type)),
    };

    SubGraph {
        nodes: vec![single_node],
        edges: Vec::new(),
        inputs: input_names,
        outputs: output_names,
    }
}
/// When the provider is configured with more than one stream, creates an
/// `Add` kernel on stream 0 and a `Mul` kernel on stream 1, runs both back to
/// back on a 256x256 tensor of ones and prints the elapsed time. Otherwise
/// only prints a notice. Execution results are not inspected.
#[test]
fn test_stream_execution() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(_) => {
            println!("GPU not available for stream testing");
            return;
        }
    };

    if provider.get_config().stream_count <= 1 {
        println!("🌊 Single stream execution (stream_count = 1)");
        return;
    }

    println!(
        "🌊 Testing stream-based execution with {} streams",
        provider.get_config().stream_count
    );

    let subgraph1 = create_single_op_subgraph("Add");
    let subgraph2 = create_single_op_subgraph("Mul");
    let first = GpuKernel::with_stream(subgraph1, provider.device().clone(), 0);
    let second = GpuKernel::with_stream(subgraph2, provider.device().clone(), 1);

    if let (Ok(kernel1), Ok(kernel2)) = (first, second) {
        println!(" ✅ Successfully created kernels on different streams");

        let test_input = ronn_core::tensor::Tensor::ones(
            vec![256, 256],
            DataType::F32,
            TensorLayout::RowMajor,
        )
        .unwrap();

        let timer = std::time::Instant::now();
        let _result1 = kernel1.execute(&[test_input.clone()]);
        let _result2 = kernel2.execute(&[test_input.clone()]);
        let concurrent_time = timer.elapsed();

        println!(" Concurrent execution time: {:?}", concurrent_time);
    }
}
/// Executes a compiled `MatMul` kernel ten times on a 128x128 tensor of ones,
/// printing a marker after the first and tenth runs. Execution results are
/// discarded; when no provider can be created only a notice is printed.
#[test]
fn test_kernel_cache_operations() {
    let provider = match GpuExecutionProvider::new() {
        Ok(provider) => provider,
        Err(_) => {
            println!("GPU not available for cache testing");
            return;
        }
    };

    println!("💾 Testing kernel cache operations...");

    let kernel = match provider.compile_subgraph(create_single_op_subgraph("MatMul")) {
        Ok(kernel) => kernel,
        Err(_) => return,
    };

    let test_input = ronn_core::tensor::Tensor::ones(
        vec![128, 128],
        DataType::F32,
        TensorLayout::RowMajor,
    )
    .unwrap();

    for run in 0..10 {
        let _ = kernel.execute(&[test_input.clone()]);
        match run {
            0 => println!(" First execution (cold cache)"),
            9 => println!(" Tenth execution (warm cache)"),
            _ => {}
        }
    }

    println!(" ✅ Cache operations test completed");
}
}