1pub mod acceleration;
97pub mod core;
98pub mod distance;
99pub mod kernels;
100pub mod memory;
101
102pub use core::{BackendContext, DeviceSelection, GpuBackend, GpuConfig, GpuContext, GpuDevice};
104
105pub use distance::{DistanceMetric, GpuArray, GpuDistanceMatrix};
106
107pub use memory::{
108 BandwidthMonitor, GpuMemoryBlock, GpuMemoryManager, MemoryStats, MemoryStrategy, MemoryTransfer,
109};
110
111pub use acceleration::{
113 detect_tensor_core_capabilities, AdvancedDeviceSelection, AdvancedGpuMemoryManager,
114 AdvancedMemoryStrategy, AllocationRecord, DeviceBenchmark, DeviceSelector,
115 GpuAccelerationConfig, GpuKMeans, GpuKMeansResult, KMeansMetrics, KernelOptimizations,
116 MemoryUsageStats, PrecisionMode, ProfilingRecord, TensorCoreCapabilities, TensorCoreConfig,
117};
118
119pub use kernels::{
121 calculate_kernel_config, generate_cuda_batch_distance_kernel,
122 generate_cuda_distance_matrix_kernel, generate_cuda_kmeans_assign_kernel,
123 generate_metal_distance_kernel, generate_opencl_distance_matrix_kernel,
124 generate_rocm_distance_kernel, get_kernel_source, get_kmeans_kernel_source, DistanceKernelType,
125 KernelConfig, KernelDataType,
126};
127
128pub fn auto_config() -> GpuConfig {
132 let preferred_backends = [
134 GpuBackend::Cuda,
135 GpuBackend::OpenCl,
136 GpuBackend::Rocm,
137 GpuBackend::Metal,
138 GpuBackend::OneApi,
139 ];
140
141 for &backend in &preferred_backends {
142 if is_backend_available(backend) {
143 return GpuConfig::new(backend);
144 }
145 }
146
147 GpuConfig::default()
149}
150
151pub fn is_backend_available(backend: GpuBackend) -> bool {
153 match backend {
154 GpuBackend::CpuFallback => true,
155 _ => {
156 false
164 }
165 }
166}
167
168pub fn list_devices() -> Vec<GpuDevice> {
170 vec![GpuDevice::new(
173 0,
174 "Integrated GPU".to_string(),
175 4_000_000_000, 3_500_000_000, "1.0".to_string(),
178 512,
179 GpuBackend::CpuFallback,
180 false,
181 )]
182}
183
184pub fn get_best_device() -> Option<GpuDevice> {
186 let devices = list_devices();
187 devices
188 .into_iter()
189 .filter(|d| d.backend != GpuBackend::CpuFallback)
190 .max_by(|a, b| {
191 a.get_device_score()
192 .partial_cmp(&b.get_device_score())
193 .unwrap_or(std::cmp::Ordering::Equal)
194 })
195}
196
197pub fn benchmark_gpu_vs_cpu(
199 data_size: usize,
200 n_features: usize,
201 metric: DistanceMetric,
202) -> Result<BenchmarkResult, crate::error::ClusteringError> {
203 use scirs2_core::ndarray::Array2;
204 use std::time::Instant;
205
206 let data = Array2::from_shape_fn((data_size, n_features), |(i, j)| {
208 (i * n_features + j) as f64 / 1000.0
209 });
210
211 let cpu_start = Instant::now();
213 let cpu_config = GpuConfig::new(GpuBackend::CpuFallback);
214 let cpu_matrix = GpuDistanceMatrix::new(cpu_config, metric, None)?;
215 let _cpu_result = cpu_matrix.compute_distance_matrix_cpu(data.view())?;
216 let cpu_duration = cpu_start.elapsed();
217
218 let gpu_start = Instant::now();
220 let gpu_config = auto_config();
221 let mut gpu_matrix = GpuDistanceMatrix::new(gpu_config, metric, None)?;
222 let _gpu_result = gpu_matrix.compute_distance_matrix(data.view())?;
223 let gpu_duration = gpu_start.elapsed();
224
225 Ok(BenchmarkResult {
226 cpu_duration_ms: cpu_duration.as_millis() as f64,
227 gpu_duration_ms: gpu_duration.as_millis() as f64,
228 speedup: cpu_duration.as_secs_f64() / gpu_duration.as_secs_f64(),
229 data_size,
230 n_features,
231 metric,
232 })
233}
234
235#[derive(Debug, Clone)]
237pub struct BenchmarkResult {
238 pub cpu_duration_ms: f64,
240 pub gpu_duration_ms: f64,
242 pub speedup: f64,
244 pub data_size: usize,
246 pub n_features: usize,
248 pub metric: DistanceMetric,
250}
251
252impl BenchmarkResult {
253 pub fn summary(&self) -> String {
255 format!(
256 "GPU vs CPU Benchmark Results:\n\
257 Data size: {} samples x {} features\n\
258 Distance metric: {}\n\
259 CPU time: {:.2} ms\n\
260 GPU time: {:.2} ms\n\
261 Speedup: {:.2}x",
262 self.data_size,
263 self.n_features,
264 self.metric,
265 self.cpu_duration_ms,
266 self.gpu_duration_ms,
267 self.speedup
268 )
269 }
270
271 pub fn gpu_is_faster(&self) -> bool {
273 self.speedup > 1.0
274 }
275
276 pub fn efficiency_rating(&self) -> &'static str {
278 match self.speedup {
279 x if x >= 10.0 => "Excellent",
280 x if x >= 5.0 => "Very Good",
281 x if x >= 2.0 => "Good",
282 x if x >= 1.1 => "Marginal",
283 _ => "No Benefit",
284 }
285 }
286}
287
288pub struct GpuCapabilities {
290 pub available_backends: Vec<GpuBackend>,
292 pub best_devices: std::collections::HashMap<GpuBackend, GpuDevice>,
294 pub total_gpu_memory: usize,
296 pub supports_unified_memory: bool,
298 pub supports_double_precision: bool,
300}
301
302impl GpuCapabilities {
303 pub fn detect() -> Self {
305 let available_backends: Vec<GpuBackend> = [
306 GpuBackend::Cuda,
307 GpuBackend::OpenCl,
308 GpuBackend::Rocm,
309 GpuBackend::Metal,
310 GpuBackend::OneApi,
311 ]
312 .iter()
313 .cloned()
314 .filter(|&backend| is_backend_available(backend))
315 .collect();
316
317 let mut best_devices = std::collections::HashMap::new();
318 let mut total_memory = 0;
319 let mut supports_unified = false;
320 let mut supports_double = false;
321
322 for backend in available_backends.iter() {
324 if let Some(device) = Self::get_best_device_for_backend(*backend) {
325 total_memory += device.total_memory;
326 supports_unified |= *backend == GpuBackend::Cuda; supports_double |= device.supports_double_precision;
328 best_devices.insert(*backend, device);
329 }
330 }
331
332 Self {
333 available_backends,
334 best_devices,
335 total_gpu_memory: total_memory,
336 supports_unified_memory: supports_unified,
337 supports_double_precision: supports_double,
338 }
339 }
340
341 pub fn summary(&self) -> String {
343 let mut summary = String::new();
344 summary.push_str("GPU Capabilities Summary:\n");
345 summary.push_str(&format!(
346 "Available backends: {:?}\n",
347 self.available_backends
348 ));
349 summary.push_str(&format!(
350 "Total GPU memory: {:.2} GB\n",
351 self.total_gpu_memory as f64 / (1024.0 * 1024.0 * 1024.0)
352 ));
353 summary.push_str(&format!(
354 "Unified memory support: {}\n",
355 self.supports_unified_memory
356 ));
357 summary.push_str(&format!(
358 "Double precision support: {}\n",
359 self.supports_double_precision
360 ));
361
362 for (backend, device) in &self.best_devices {
363 summary.push_str(&format!(
364 "Best {} device: {} ({:.2} GB)\n",
365 backend,
366 device.name,
367 device.total_memory as f64 / (1024.0 * 1024.0 * 1024.0)
368 ));
369 }
370
371 summary
372 }
373
374 fn get_best_device_for_backend(backend: GpuBackend) -> Option<GpuDevice> {
375 match backend {
377 GpuBackend::CpuFallback => None,
378 _ => Some(GpuDevice::new(
379 0,
380 format!("{} Device", backend),
381 8_000_000_000,
382 7_000_000_000,
383 "1.0".to_string(),
384 1024,
385 backend,
386 true,
387 )),
388 }
389 }
390}
391
/// Heuristic: is the problem large enough to benefit from GPU acceleration?
///
/// Returns `true` when the total element count (`n_samples * n_features`)
/// exceeds 10,000 **and** there are more than 100 samples.
///
/// The element count is computed with saturating multiplication. Extremely
/// large inputs are therefore treated as "very large" instead of overflowing,
/// which would panic in debug builds and wrap to a small value in release
/// builds.
pub fn is_gpu_recommended(n_samples: usize, n_features: usize) -> bool {
    const MIN_PROBLEM_SIZE: usize = 10_000;
    const MIN_SAMPLES: usize = 100;

    let problem_size = n_samples.saturating_mul(n_features);
    problem_size > MIN_PROBLEM_SIZE && n_samples > MIN_SAMPLES
}
398
399pub fn get_recommended_tile_size(device: &GpuDevice, element_size: usize) -> usize {
401 let memory_per_tile = device.available_memory / 16; let elements_per_tile = memory_per_tile / element_size;
404 let sqrt_elements = (elements_per_tile as f64).sqrt() as usize;
405
406 let base_tile_size = sqrt_elements.max(32).min(1024);
408 let compute_aligned = ((base_tile_size + device.compute_units as usize - 1)
409 / device.compute_units as usize)
410 * device.compute_units as usize;
411
412 compute_aligned.min(1024)
413}
414
#[cfg(test)]
mod tests {
    use super::*;

    /// The automatically chosen configuration keeps automatic fallback enabled.
    #[test]
    fn test_auto_config() {
        assert!(auto_config().auto_fallback);
    }

    /// The CPU fallback is reported as available; CUDA is not.
    #[test]
    fn test_backend_availability() {
        assert!(is_backend_available(GpuBackend::CpuFallback));
        assert!(!is_backend_available(GpuBackend::Cuda));
    }

    /// The device list is never empty.
    #[test]
    fn test_list_devices() {
        assert!(!list_devices().is_empty());
    }

    /// Small problems are not recommended for the GPU, large ones are.
    #[test]
    fn test_gpu_recommendation() {
        assert!(!is_gpu_recommended(10, 10));
        assert!(is_gpu_recommended(1000, 100));
    }

    /// Detection succeeds and yields a non-empty report.
    #[test]
    fn test_capabilities_detection() {
        let report = GpuCapabilities::detect().summary();
        assert!(!report.is_empty());
    }

    /// The recommended tile size stays within the 32..=1024 range.
    #[test]
    fn test_recommended_tile_size() {
        let device = GpuDevice::new(
            0,
            String::from("Test"),
            8_000_000_000,
            6_000_000_000,
            String::from("1.0"),
            1024,
            GpuBackend::Cuda,
            true,
        );

        let tile_size = get_recommended_tile_size(&device, 8);
        assert!((32..=1024).contains(&tile_size));
    }

    /// A 5x speedup counts as faster, is rated "Very Good", and has a report.
    #[test]
    fn test_benchmark_result() {
        let outcome = BenchmarkResult {
            cpu_duration_ms: 100.0,
            gpu_duration_ms: 20.0,
            speedup: 5.0,
            data_size: 1000,
            n_features: 10,
            metric: DistanceMetric::Euclidean,
        };

        assert!(outcome.gpu_is_faster());
        assert_eq!(outcome.efficiency_rating(), "Very Good");
        assert!(!outcome.summary().is_empty());
    }
}