// --- Module declarations -------------------------------------------------
// `cuda_available` is a custom cfg flag (presumably emitted by the crate's
// build script after probing for a CUDA toolkit — confirm in build.rs).
// Every GPU-backed module below is compiled only when that flag is set.
#[cfg(cuda_available)]
pub mod backend;
#[cfg(cuda_available)]
pub mod buffer;
#[cfg(cuda_available)]
pub mod device;
#[cfg(cuda_available)]
pub mod error;
#[cfg(cuda_available)]
pub mod event_coordination;
#[cfg(cuda_available)]
pub mod types_compat;
// Public facade over the private `graph_stub` module so callers can refer to
// `graph::*` without depending on the stub's module name.
#[cfg(cuda_available)]
pub mod graph {
    pub use super::graph_stub::*;
}
#[cfg(cuda_available)]
mod graph_stub;
#[cfg(cuda_available)]
pub mod kernels;
#[cfg(cuda_available)]
pub mod memory;
#[cfg(cuda_available)]
pub mod stream;
#[cfg(cuda_available)]
pub mod stream_advanced;
#[cfg(cuda_available)]
pub mod unified_buffer;
// cuDNN bindings need both the `cudnn` cargo feature and a CUDA toolkit.
// The cfg_attr silences the `unexpected_cfgs` lint when the feature is off
// (NOTE(review): verify this attaches as intended — it precedes a `#[cfg]`
// that may strip the item entirely when the feature is disabled).
#[cfg_attr(not(feature = "cudnn"), allow(unexpected_cfgs))]
#[cfg(all(feature = "cudnn", cuda_available))]
pub mod cudnn;
#[cfg(cuda_available)]
pub mod mixed_precision;
#[cfg(cuda_available)]
pub mod multi_gpu;
#[cfg(cuda_available)]
pub mod neural_ops_enhanced;
#[cfg(cuda_available)]
pub mod tensor_cores;
#[cfg(cuda_available)]
pub mod cooperative_groups;
#[cfg(cuda_available)]
pub mod graph_execution;
#[cfg(cuda_available)]
pub mod intelligent_scheduler;
#[cfg(cuda_available)]
pub mod multi_stream_orchestrator;
#[cfg(cuda_available)]
pub mod multi_stream_usage_examples;
#[cfg(cuda_available)]
pub mod occupancy;
// CPU-only stand-ins compiled when no CUDA toolkit was detected.
#[cfg(not(cuda_available))]
pub mod fallback;
// --- Flat re-exports ------------------------------------------------------
// The commonly used types are lifted to this module's root so downstream
// code can write `cuda::CudaBackend` instead of `cuda::backend::CudaBackend`.
// Each re-export mirrors the cfg gate of the module it comes from.
#[cfg(cuda_available)]
pub use backend::{CudaBackend, CudaBackendConfig};
#[cfg(cuda_available)]
pub use buffer::CudaBuffer;
#[cfg(cuda_available)]
pub use device::CudaDevice;
#[cfg(cuda_available)]
pub use error::{CudaError, CudaResult};
#[cfg(cuda_available)]
pub use event_coordination::{
    AsyncEventWaiter, CoordinationMetrics, CrossStreamBarrier, EventMetadata, EventPool,
    EventPriority, OperationCoordinator, OperationType,
};
#[cfg(cuda_available)]
pub use memory::{CudaMemoryManager, MemoryAdvice, UnifiedAllocation};
#[cfg(cuda_available)]
pub use stream::{CudaEvent, CudaStream, StreamMetrics, StreamPool, StreamPriority};
#[cfg(cuda_available)]
pub use stream_advanced::{
    AdvancedStreamPool, AllocationStrategy, MultiStreamCoordinator, PoolMetrics, ProfilingReport,
    StreamOrderedAllocator, StreamProfiler, StreamReport, WorkloadType,
};
// Compatibility shims exposing raw-runtime style names (`cudaStream_t`, …)
// alongside higher-level wrappers.
#[cfg(cuda_available)]
pub use types_compat::{
    cudaError_t, cudaStream_t, cudaSuccess, CUstream, DevicePointer, Event, EventFlags, Stream,
    StreamFlags,
};
#[cfg(cuda_available)]
pub use types_compat::cuda_sys_compat;
#[cfg(cuda_available)]
pub use types_compat::cust_compat;
#[cfg(cuda_available)]
pub use unified_buffer::UnifiedBuffer;
// cuDNN wrappers: gated on the `cudnn` feature in addition to CUDA presence.
#[cfg(all(feature = "cudnn", cuda_available))]
pub use cudnn::{
    ActivationDescriptor, ConvolutionDescriptor, CudnnHandle, CudnnOps, FilterDescriptor,
    TensorDescriptor,
};
#[cfg(cuda_available)]
pub use cooperative_groups::{
    CooperationPattern, CooperativeGroupDescriptor, CooperativeGroupType,
    CooperativeGroupsCapabilities, CooperativeGroupsContext, CooperativeGroupsStats,
    CooperativeKernelConfig, CooperativeKernelConfigBuilder, CooperativeWorkload,
    KernelPerformanceMetrics, MemoryScope, SyncFrequency, SynchronizationType,
};
#[cfg(cuda_available)]
pub use mixed_precision::{AmpContext, GradientScaler, MixedPrecisionTrainer};
#[cfg(cuda_available)]
pub use multi_gpu::{DataParallel, MultiGpuContext, ReduceOp};
#[cfg(cuda_available)]
pub use neural_ops_enhanced::EnhancedNeuralOps;
#[cfg(cuda_available)]
pub use tensor_cores::{
    TensorCoreCapability, TensorCoreContext, TensorCoreDType, TensorCoreGemmConfig, TensorCoreOp,
    TensorCoreStats,
};
#[cfg(cuda_available)]
pub use intelligent_scheduler::{
    IntelligentStreamScheduler, MemoryAccessPattern, MultiOperationCoordinator, SchedulerMetrics,
    SchedulingDecision, SchedulingStrategy, SynchronizationRequirements, WorkloadCharacteristics,
};
#[cfg(cuda_available)]
pub use multi_stream_orchestrator::{
    ExecutionResult, MultiStreamOrchestrator, OptimizationResult, OrchestratorConfig,
    OrchestratorMetrics, RepeatingWorkloadResult,
};
#[cfg(cuda_available)]
pub use occupancy::{
    CudaDeviceOccupancy, CudaOccupancyAnalyzer, DeviceProperties, LimitingFactor, OccupancyResult,
    OptimizationHeuristics, OptimizedLaunchConfig, PerformanceMetrics, ResourceUsage,
};
// Without CUDA, the fallback module supplies stand-in items under the same
// root so downstream code compiles unchanged.
#[cfg(not(cuda_available))]
pub use fallback::*;
/// Convenience prelude: glob-import this module to bring the commonly used
/// CUDA types into scope together with the crate-wide prelude.
pub mod prelude {
    // GPU types are only re-exported when CUDA support was compiled in.
    #[cfg(cuda_available)]
    pub use super::{
        AmpContext,
        CooperationPattern,
        CooperativeGroupDescriptor,
        CooperativeGroupType,
        CooperativeGroupsCapabilities,
        CooperativeGroupsContext,
        CooperativeKernelConfig,
        CooperativeKernelConfigBuilder,
        CooperativeWorkload,
        CrossStreamBarrier,
        CudaBackend,
        CudaBuffer,
        CudaDevice,
        CudaError,
        CudaMemoryManager,
        CudaOccupancyAnalyzer,
        CudaStream,
        EnhancedNeuralOps,
        EventPool,
        EventPriority,
        GradientScaler,
        IntelligentStreamScheduler,
        KernelPerformanceMetrics,
        LimitingFactor,
        MemoryAdvice,
        MixedPrecisionTrainer,
        MultiOperationCoordinator,
        MultiStreamOrchestrator,
        OccupancyResult,
        OperationCoordinator,
        OperationType,
        OptimizationHeuristics,
        OptimizedLaunchConfig,
        OrchestratorConfig,
        OrchestratorMetrics,
        PerformanceMetrics,
        RepeatingWorkloadResult,
        ResourceUsage,
        SchedulingDecision,
        SchedulingStrategy,
        SynchronizationType,
        TensorCoreCapability,
        TensorCoreContext,
        TensorCoreDType,
        TensorCoreGemmConfig,
        TensorCoreOp,
        TensorCoreStats,
        UnifiedAllocation,
        UnifiedBuffer,
        WorkloadCharacteristics,
    };
    // cuDNN wrappers additionally require the `cudnn` cargo feature.
    #[cfg(all(feature = "cudnn", cuda_available))]
    pub use super::{
        ActivationDescriptor, ConvolutionDescriptor, CudnnHandle, CudnnOps, FilterDescriptor,
        TensorDescriptor,
    };
    // Always pull in the crate-wide prelude, CUDA or not.
    pub use crate::prelude::*;
}
/// Real implementation backed by the CUDA driver (`cust`) and runtime
/// (`cuda_sys::cudart`) APIs. Compiled only when a toolkit was detected.
#[cfg(cuda_available)]
mod cuda_impl {
    use super::*;
    use crate::cuda::error::CustResultExt;

    /// Initializes the CUDA driver via `cust`.
    ///
    /// # Errors
    /// Returns a [`CudaError`] when driver initialization fails.
    pub fn init() -> Result<(), CudaError> {
        cust::init(cust::CudaFlags::empty()).cuda_result()?;
        Ok(())
    }

    /// Reports whether a usable CUDA runtime with at least one device exists.
    ///
    /// Wrapped in `catch_unwind` because a broken driver installation can
    /// panic inside the FFI layer; any panic is treated as "not available".
    pub fn is_available() -> bool {
        use std::panic::{catch_unwind, AssertUnwindSafe};
        let result = catch_unwind(AssertUnwindSafe(|| {
            match cust::init(cust::CudaFlags::empty()) {
                Ok(_) => {
                    let mut count: i32 = 0;
                    // SAFETY: cudaGetDeviceCount only writes through the
                    // provided pointer, which is valid for the whole call.
                    unsafe {
                        let result = cuda_sys::cudart::cudaGetDeviceCount(&mut count);
                        result == cudaSuccess && count > 0
                    }
                }
                Err(_) => false,
            }
        }));
        result.unwrap_or(false)
    }

    /// Returns the number of CUDA devices visible to the runtime.
    ///
    /// # Errors
    /// Returns [`CudaError::Context`] when the runtime query fails.
    pub fn device_count() -> Result<u32, CudaError> {
        let mut count: i32 = 0;
        // SAFETY: out-pointer refers to a valid, live i32.
        unsafe {
            let result = cuda_sys::cudart::cudaGetDeviceCount(&mut count);
            if result != cudaSuccess {
                return Err(CudaError::Context {
                    message: "Failed to get device count".to_string(),
                });
            }
        }
        Ok(count as u32)
    }

    /// Returns a handle for the device currently bound to the calling thread.
    ///
    /// # Errors
    /// Returns [`CudaError::Context`] when the runtime query fails.
    pub fn current_device() -> Result<CudaDevice, CudaError> {
        let mut device_id: i32 = 0;
        // SAFETY: out-pointer refers to a valid, live i32.
        unsafe {
            let result = cuda_sys::cudart::cudaGetDevice(&mut device_id);
            if result != cudaSuccess {
                return Err(CudaError::Context {
                    message: "Failed to get current device".to_string(),
                });
            }
        }
        CudaDevice::new(device_id as usize)
    }

    /// Makes `device_id` the current device for the calling thread.
    ///
    /// BUGFIX: the previous implementation only fetched a `cust` device
    /// handle and discarded it — it never actually switched devices. The
    /// runtime API requires `cudaSetDevice` to bind the calling thread.
    ///
    /// # Errors
    /// Returns a [`CudaError`] for out-of-range ids or runtime failure.
    pub fn set_device(device_id: usize) -> Result<(), CudaError> {
        // Validate the id first so out-of-range ids surface as typed errors.
        let _device = cust::device::Device::get_device(device_id as u32).cuda_result()?;
        // SAFETY: plain FFI call taking an integer validated just above.
        unsafe {
            let result = cuda_sys::cudart::cudaSetDevice(device_id as i32);
            if result != cudaSuccess {
                return Err(CudaError::Context {
                    message: format!("Failed to set device {}", device_id),
                });
            }
        }
        Ok(())
    }

    /// Blocks until all previously issued work on the current device finishes.
    ///
    /// BUGFIX: the previous implementation created a brand-new stream and
    /// synchronized it — a fresh stream has no queued work, so this never
    /// waited for kernels in flight on other streams. `cudaDeviceSynchronize`
    /// waits for the whole device.
    ///
    /// # Errors
    /// Returns [`CudaError::Context`] when synchronization fails.
    pub fn synchronize() -> Result<(), CudaError> {
        // SAFETY: argument-free runtime call.
        unsafe {
            let result = cuda_sys::cudart::cudaDeviceSynchronize();
            if result != cudaSuccess {
                return Err(CudaError::Context {
                    message: "Failed to synchronize device".to_string(),
                });
            }
        }
        Ok(())
    }
}
/// Stub implementation used when no CUDA toolkit was detected at build time.
/// Mirrors the real `cuda_impl` API so callers compile unchanged.
#[cfg(not(cuda_available))]
mod cuda_impl {
    use super::*;

    /// Builds the uniform "no CUDA" error returned by every stubbed operation.
    fn cuda_unavailable<T>() -> Result<T, CudaError> {
        Err(CudaError::RuntimeError(
            "CUDA not available on this system".to_string(),
        ))
    }

    /// Stub: initialization always fails without a CUDA runtime.
    pub fn init() -> Result<(), CudaError> {
        cuda_unavailable()
    }

    /// Stub: CUDA is never available in this build.
    pub fn is_available() -> bool {
        false
    }

    /// Stub: zero devices are visible.
    pub fn device_count() -> Result<u32, CudaError> {
        Ok(0)
    }

    /// Stub: there is no current device to report.
    pub fn current_device() -> Result<CudaDevice, CudaError> {
        cuda_unavailable()
    }

    /// Stub: switching devices is impossible without a runtime.
    pub fn set_device(_device_id: usize) -> Result<(), CudaError> {
        cuda_unavailable()
    }

    /// Stub: nothing is in flight, so synchronization is a successful no-op.
    pub fn synchronize() -> Result<(), CudaError> {
        Ok(())
    }
}
pub use cuda_impl::*;
#[cfg(test)]
mod tests {
    use super::*;

    /// When a CUDA runtime is detected, the device count must be positive.
    #[test]
    fn test_cuda_availability() {
        if !is_available() {
            return;
        }
        let count = device_count().expect("device count should succeed");
        assert!(count > 0);
    }

    /// When a CUDA runtime is detected, driver initialization must succeed.
    #[test]
    fn test_cuda_init() {
        if !is_available() {
            return;
        }
        assert!(init().is_ok());
    }
}