#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
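/// CUDA architecture generations supported by this module, oldest to newest.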
pub enum CudaComputeCapability {
Maxwell,
Pascal,
Volta,
Turing,
Ampere,
Ada,
}
impl CudaComputeCapability {
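    /// Maps a `(major, minor)` compute-capability pair to an architecture
    /// generation, or `None` if the version is not recognized.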
pub fn from_version(major: u32, minor: u32) -> Option<Self> {
match (major, minor) {
            (5, _) => Some(Self::Maxwell),
            (6, _) => Some(Self::Pascal),
            (7, 0) | (7, 2) => Some(Self::Volta),
            (7, 5) => Some(Self::Turing),
            (8, 0) | (8, 6) | (8, 7) => Some(Self::Ampere),
            // Ada Lovelace is compute capability 8.9; 9.0 (Hopper) has no variant here.
            (8, 9) => Some(Self::Ada),
            _ => None,
}
}
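    /// Tensor cores were introduced with Volta (compute capability 7.0).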
pub fn has_tensor_cores(&self) -> bool {
matches!(
self,
Self::Volta | Self::Turing | Self::Ampere | Self::Ada
)
}
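    /// Heuristic default threads-per-block for this architecture.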
pub fn optimal_block_size(&self) -> usize {
match self {
            Self::Maxwell | Self::Pascal => 256,
            Self::Volta | Self::Turing => 512,
            Self::Ampere | Self::Ada => 1024,
}
}
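    /// Approximate shared memory capacity in bytes for this architecture.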
pub fn shared_memory(&self) -> usize {
match self {
            Self::Maxwell | Self::Pascal => 49_152, // 48 KiB
            Self::Volta | Self::Turing => 98_304,   // 96 KiB
            Self::Ampere | Self::Ada => 163_840,    // 160 KiB
        }
}
    /// Maximum number of registers available per thread.
    pub fn max_registers(&self) -> usize {
        // The per-thread register limit is 255 on every architecture above.
        255
    }
}
#[derive(Debug, Clone)]
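/// Launch-time tuning parameters derived from a device's compute capability.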
pub struct CudaKernelConfig {
pub compute_capability: CudaComputeCapability,
pub block_size: usize,
pub use_shared_memory: bool,
pub use_warp_shuffles: bool,
pub use_tensor_cores: bool,
pub optimize_registers: bool,
}
impl Default for CudaKernelConfig {
fn default() -> Self {
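        // Ampere (e.g. A100, RTX 30-series) is used as a common baseline.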
let cap = CudaComputeCapability::Ampere;
Self {
            compute_capability: cap,
            block_size: cap.optimal_block_size(),
use_shared_memory: true,
use_warp_shuffles: true,
use_tensor_cores: cap.has_tensor_cores(),
optimize_registers: true,
}
}
}
#[derive(Debug)]
#[allow(dead_code)]
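/// Per-device launch configuration for the GPU alignment kernel.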
pub struct CudaAlignmentKernel {
config: CudaKernelConfig,
device_id: i32,
}
impl CudaAlignmentKernel {
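    /// Builds a kernel configuration tuned for the given device and its
    /// compute capability.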
pub fn new(device_id: i32, compute_capability: CudaComputeCapability) -> Self {
let config = CudaKernelConfig {
compute_capability,
block_size: compute_capability.optimal_block_size(),
use_shared_memory: true,
use_warp_shuffles: true,
use_tensor_cores: compute_capability.has_tensor_cores(),
optimize_registers: true,
};
Self { config, device_id }
}
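    /// Read-only access to the derived launch configuration.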
pub fn config(&self) -> &CudaKernelConfig {
&self.config
}
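    /// Computes the (x, y) launch grid for an m x n problem; the `_m`
    /// dimension is currently unused and the grid is one row tall.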
pub fn calculate_grid_size(&self, _m: usize, n: usize) -> (u32, u32) {
const SIMD_WIDTH: usize = 8;
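        // Ceiling division: one grid column per SIMD_WIDTH-wide slice of n.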
let grid_x = ((n + SIMD_WIDTH - 1) / SIMD_WIDTH) as u32;
        let grid_y = 1;
        (grid_x, grid_y)
}
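    /// Bytes of shared memory requested per thread block.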
pub fn shared_memory_size(&self) -> usize {
        // Fixed 24 x 32 matrix of 4-byte entries staged in shared memory.
        let matrix_size = 24 * 32 * 4;
        // Two working buffers with one 4-byte slot per thread in the block.
        let working_size = 2 * self.config.block_size * 4;
        matrix_size + working_size
}
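    /// Rough runtime estimate in milliseconds for an m x n workload, based on
    /// an architecture-specific operations-per-millisecond throughput.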
pub fn estimate_time(&self, m: usize, n: usize) -> f32 {
let ops = (m * n) as f32;
let ops_per_ms = match self.config.compute_capability {
CudaComputeCapability::Maxwell => 50_000.0,
CudaComputeCapability::Pascal => 100_000.0,
CudaComputeCapability::Volta => 200_000.0,
CudaComputeCapability::Turing => 300_000.0,
CudaComputeCapability::Ampere => 500_000.0,
CudaComputeCapability::Ada => 800_000.0,
};
        // Throughput estimate plus a fixed 1.5 ms overhead term.
        (ops / ops_per_ms) + 1.5
    }
}
#[derive(Debug)]
#[allow(dead_code)]
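/// Round-robin dispatcher over a fixed pool of CUDA devices.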
pub struct CudaMultiGpuBatch {
devices: Vec<i32>,
kernels: Vec<CudaAlignmentKernel>,
current_batch: usize,
}
impl CudaMultiGpuBatch {
pub fn new(device_ids: Vec<i32>) -> Self {
let kernels = device_ids
.iter()
.map(|&id| {
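                // Assumes a homogeneous Ampere pool; a heterogeneous setup
                // would query each device's actual compute capability.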
CudaAlignmentKernel::new(id, CudaComputeCapability::Ampere)
})
.collect();
Self {
devices: device_ids,
kernels,
current_batch: 0,
}
}
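    /// Returns the kernel for the next device in round-robin order.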
pub fn next_device(&mut self) -> &CudaAlignmentKernel {
let kernel = &self.kernels[self.current_batch % self.kernels.len()];
self.current_batch += 1;
kernel
}
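    /// Restarts the round-robin rotation from the first device.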
pub fn reset(&mut self) {
self.current_batch = 0;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cuda_compute_capability() {
let cap = CudaComputeCapability::from_version(8, 0);
assert_eq!(cap, Some(CudaComputeCapability::Ampere));
assert!(cap.unwrap().has_tensor_cores());
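        // Ada Lovelace is compute capability 8.9; 9.0 (Hopper) is unmapped here.
        assert_eq!(
            CudaComputeCapability::from_version(8, 9),
            Some(CudaComputeCapability::Ada)
        );
        assert!(CudaComputeCapability::from_version(9, 0).is_none());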
}
#[test]
fn test_kernel_config() {
let config = CudaKernelConfig::default();
assert_eq!(config.compute_capability, CudaComputeCapability::Ampere);
assert_eq!(config.block_size, 1024);
assert!(config.use_shared_memory);
}
#[test]
fn test_grid_calculation() {
let kernel = CudaAlignmentKernel::new(0, CudaComputeCapability::Ampere);
let (grid_x, grid_y) = kernel.calculate_grid_size(500, 500);
assert!(grid_x > 0);
assert_eq!(grid_y, 1);
}
#[test]
fn test_time_estimation() {
let kernel = CudaAlignmentKernel::new(0, CudaComputeCapability::Ampere);
let time = kernel.estimate_time(500, 500);
assert!(time > 1.0 && time < 10.0);
}
#[test]
fn test_multi_gpu_batch() {
let mut batch = CudaMultiGpuBatch::new(vec![0, 1, 2]);
let dev1_id = batch.next_device().device_id;
let dev2_id = batch.next_device().device_id;
let dev3_id = batch.next_device().device_id;
let dev1_again_id = batch.next_device().device_id;
assert_eq!(dev1_id, 0);
assert_eq!(dev2_id, 1);
assert_eq!(dev3_id, 2);
assert_eq!(dev1_again_id, 0);
}
#[test]
fn test_shared_memory_size() {
let kernel = CudaAlignmentKernel::new(0, CudaComputeCapability::Ampere);
let mem_size = kernel.shared_memory_size();
assert!(mem_size > 0);
        assert!(mem_size <= 160 * 1024);
    }
}