gpu_acceleration.rs

//! GPU acceleration demonstration
//!
//! This example demonstrates GPU-accelerated data generation and processing,
//! comparing performance between CPU and GPU implementations.
//!
//! Usage:
//!   cargo run --example gpu_acceleration --release

use scirs2_datasets::{
    get_optimal_gpu_config, is_cuda_available, is_opencl_available, list_gpu_devices,
    make_blobs_auto_gpu, make_classification, make_classification_auto_gpu,
    make_regression_auto_gpu, GpuBackend, GpuBenchmark, GpuConfig, GpuContext, GpuMemoryConfig,
};
use std::collections::HashMap;
use std::time::Instant;

#[allow(dead_code)]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("🚀 GPU Acceleration Demonstration");
    println!("=================================\n");

    // Check GPU availability
    demonstrate_gpu_detection();

    // Show available devices
    demonstrate_device_listing()?;

    // Compare different GPU backends
    demonstrate_backend_comparison()?;

    // Performance benchmarking
    demonstrate_performance_benchmarks()?;

    // Memory management
    demonstrate_memory_management()?;

    // Real-world use cases
    demonstrate_real_world_scenarios()?;

    println!("\n🎉 GPU acceleration demonstration completed!");
    Ok(())
}

#[allow(dead_code)]
fn demonstrate_gpu_detection() {
    println!("🔍 GPU DETECTION AND AVAILABILITY");
    println!("{}", "-".repeat(40));

    println!("CUDA Support:");
    if is_cuda_available() {
        println!("  ✅ CUDA is available");
        println!("  🎯 NVIDIA GPU acceleration supported");
    } else {
        println!("  ❌ CUDA not available");
        println!("  💡 Install CUDA toolkit for NVIDIA GPU support");
    }

    println!("\nOpenCL Support:");
    if is_opencl_available() {
        println!("  ✅ OpenCL is available");
        println!("  🎯 Multi-vendor GPU acceleration supported");
    } else {
        println!("  ❌ OpenCL not available");
        println!("  💡 Install OpenCL runtime for GPU support");
    }

    // Get optimal configuration
    let optimal_config = get_optimal_gpu_config();
    println!("\nOptimal Configuration:");
    match optimal_config.backend {
        GpuBackend::Cuda { device_id } => {
            println!("  🚀 CUDA backend (device {device_id})");
        }
        GpuBackend::OpenCl {
            platform_id,
            device_id,
        } => {
            println!("  🚀 OpenCL backend (platform {platform_id}, device {device_id})");
        }
        GpuBackend::Cpu => {
            println!("  💻 CPU fallback (no GPU available)");
        }
    }
    println!(
        "  🧵 Threads per block: {}",
        optimal_config.threads_per_block
    );
    println!(
        "  🔢 Double precision: {}",
        optimal_config.enable_double_precision
    );

    println!();
}
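
// A minimal gating sketch: callers often want a single boolean for "did we
// actually get a GPU?". `has_gpu_backend` is an illustrative helper written
// for this example, not part of the scirs2_datasets API.
// Usage: `if has_gpu_backend(&get_optimal_gpu_config()) { /* GPU path */ }`
#[allow(dead_code)]
fn has_gpu_backend(config: &GpuConfig) -> bool {
    // Anything other than the CPU fallback counts as GPU-accelerated.
    !matches!(config.backend, GpuBackend::Cpu)
}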

#[allow(dead_code)]
fn demonstrate_device_listing() -> Result<(), Box<dyn std::error::Error>> {
    println!("📋 AVAILABLE GPU DEVICES");
    println!("{}", "-".repeat(40));

    let devices = list_gpu_devices()?;

    if devices.is_empty() {
        println!("No GPU devices found. Using CPU fallback.");
    } else {
        println!("Found {} device(s):", devices.len());

        for (i, device) in devices.iter().enumerate() {
            println!("\nDevice {i}:");
            println!("  Name: {}", device.name);
            println!("  Total Memory: {} MB", device.total_memory_mb);
            println!("  Available Memory: {} MB", device.available_memory_mb);
            println!("  Compute Units: {}", device.compute_units);
            println!("  Max Work Group: {}", device.max_work_group_size);
            println!("  Compute Capability: {}", device.compute_capability);
            println!(
                "  Double Precision: {}",
                if device.supports_double_precision {
                    "✅"
                } else {
                    "❌"
                }
            );

            // Calculate utilization: fraction of total memory currently in use
            let utilization = (device.total_memory_mb - device.available_memory_mb) as f64
                / device.total_memory_mb as f64
                * 100.0;
            println!("  Memory Utilization: {utilization:.1}%");
        }
    }

    println!();
    Ok(())
}
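
// A hedged sketch of simple device selection: pick the device reporting the
// most available memory from `list_gpu_devices()`. It relies only on the
// `name` and `available_memory_mb` fields printed above; `best_device_name`
// itself is illustrative, not a library API.
#[allow(dead_code)]
fn best_device_name() -> Result<Option<String>, Box<dyn std::error::Error>> {
    let devices = list_gpu_devices()?;
    // Returns None when no GPU devices are present (CPU fallback).
    Ok(devices
        .iter()
        .max_by_key(|d| d.available_memory_mb)
        .map(|d| d.name.to_string()))
}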

#[allow(dead_code)]
fn demonstrate_backend_comparison() -> Result<(), Box<dyn std::error::Error>> {
    println!("⚡ GPU BACKEND COMPARISON");
    println!("{}", "-".repeat(40));

    let test_size = 50_000;
    let features = 20;

    println!("Comparing backends for {test_size} samples with {features} features:");

    // Test different backends
    let backends = vec![
        ("CPU Fallback", GpuBackend::Cpu),
        ("CUDA", GpuBackend::Cuda { device_id: 0 }),
        (
            "OpenCL",
            GpuBackend::OpenCl {
                platform_id: 0,
                device_id: 0,
            },
        ),
    ];

    let mut results: HashMap<String, std::time::Duration> = HashMap::new();

    for (name, backend) in backends {
        println!("\nTesting {name}:");

        let config = GpuConfig {
            backend: backend.clone(),
            threads_per_block: 256,
            enable_double_precision: true,
            ..Default::default()
        };

        match GpuContext::new(config) {
            Ok(context) => {
                if context.is_available() {
                    // Time classification generation on this backend
                    let start = Instant::now();
                    let dataset =
                        context.make_classification_gpu(test_size, features, 5, 2, 15, Some(42))?;
                    let duration = start.elapsed();

                    results.insert(name.to_string(), duration);

                    println!(
                        "  ✅ Classification: {} samples in {:.2}ms",
                        dataset.n_samples(),
                        duration.as_secs_f64() * 1000.0
                    );
                    println!(
                        "  📊 Throughput: {:.1} samples/s",
                        dataset.n_samples() as f64 / duration.as_secs_f64()
                    );
                } else {
                    println!("  ❌ Backend not available");
                }
            }
            Err(e) => {
                println!("  ❌ Error: {e}");
            }
        }
    }

    // Calculate speedups relative to the CPU baseline
    if let Some(cpu_time) = results.get("CPU Fallback") {
        println!("\nSpeedup Analysis:");
        for (backend, gpu_time) in &results {
            if backend != "CPU Fallback" {
                let speedup = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();
                println!("  {backend}: {speedup:.1}x faster than CPU");
            }
        }
    }

    println!();
    Ok(())
}
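
// A small, self-contained timing sketch capturing the start/elapsed pattern
// repeated throughout this example: run any closure once and return its result
// together with the elapsed wall-clock time. Pure std, no library assumptions.
#[allow(dead_code)]
fn time_it<T>(f: impl FnOnce() -> T) -> (T, std::time::Duration) {
    let start = Instant::now();
    let value = f();
    (value, start.elapsed())
}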

#[allow(dead_code)]
fn demonstrate_performance_benchmarks() -> Result<(), Box<dyn std::error::Error>> {
    println!("📊 PERFORMANCE BENCHMARKS");
    println!("{}", "-".repeat(40));

    let config = get_optimal_gpu_config();
    let benchmark = GpuBenchmark::new(config)?;

    println!("Running data generation benchmarks...");
    let data_results = benchmark.benchmark_data_generation()?;
    data_results.print_results();

    println!("\nRunning matrix operation benchmarks...");
    let matrix_results = benchmark.benchmark_matrix_operations()?;
    matrix_results.print_results();

    // Compare with CPU baseline
    println!("\nCPU vs GPU Comparison:");
    demonstrate_cpu_gpu_comparison()?;

    println!();
    Ok(())
}

#[allow(dead_code)]
fn demonstrate_cpu_gpu_comparison() -> Result<(), Box<dyn std::error::Error>> {
    let dataset_sizes = vec![10_000, 50_000, 100_000];

    println!(
        "{:<12} {:<15} {:<15} {:<10}",
        "Size", "CPU Time", "GPU Time", "Speedup"
    );
    println!("{}", "-".repeat(55));

    for &size in &dataset_sizes {
        // CPU benchmark
        let cpu_start = Instant::now();
        let _cpu_dataset = make_classification(size, 20, 5, 2, 15, Some(42))?;
        let cpu_time = cpu_start.elapsed();

        // GPU benchmark (the auto_gpu variant selects the optimal backend)
        let gpu_start = Instant::now();
        let _gpu_dataset = make_classification_auto_gpu(size, 20, 5, 2, 15, Some(42))?;
        let gpu_time = gpu_start.elapsed();

        let speedup = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();

        println!(
            "{:<12} {:<15} {:<15} {:<10.1}x",
            size,
            format!("{:.1}ms", cpu_time.as_secs_f64() * 1000.0),
            format!("{:.1}ms", gpu_time.as_secs_f64() * 1000.0),
            speedup
        );
    }

    Ok(())
}
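
// Speedup as reported in the table above: the ratio of CPU to GPU wall time.
// Values below 1.0 mean the GPU path was slower (e.g. transfer overhead
// dominating on small inputs). Illustrative helper only.
#[allow(dead_code)]
fn speedup(cpu: std::time::Duration, gpu: std::time::Duration) -> f64 {
    cpu.as_secs_f64() / gpu.as_secs_f64()
}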

#[allow(dead_code)]
fn demonstrate_memory_management() -> Result<(), Box<dyn std::error::Error>> {
    println!("💾 GPU MEMORY MANAGEMENT");
    println!("{}", "-".repeat(40));

    // Configure a memory-constrained GPU context
    let memory_config = GpuMemoryConfig {
        max_memory_mb: Some(512),  // Limit to 512MB
        pool_size_mb: 256,         // 256MB pool
        enable_coalescing: true,   // Enable memory coalescing
        use_unified_memory: false, // Don't use unified memory
    };

    let gpu_config = GpuConfig {
        backend: get_optimal_gpu_config().backend,
        memory: memory_config,
        threads_per_block: 256,
        ..Default::default()
    };

    println!("Memory Configuration:");
    println!(
        "  Max Memory: {} MB",
        gpu_config.memory.max_memory_mb.unwrap_or(0)
    );
    println!("  Pool Size: {} MB", gpu_config.memory.pool_size_mb);
    println!("  Coalescing: {}", gpu_config.memory.enable_coalescing);
    println!("  Unified Memory: {}", gpu_config.memory.use_unified_memory);

    let context = GpuContext::new(gpu_config)?;
    let device_info = context.device_info();

    println!("\nDevice Memory Info:");
    println!("  Total: {} MB", device_info.total_memory_mb);
    println!("  Available: {} MB", device_info.available_memory_mb);
    println!(
        "  Utilization: {:.1}%",
        (device_info.total_memory_mb - device_info.available_memory_mb) as f64
            / device_info.total_memory_mb as f64
            * 100.0
    );

    // Test memory-efficient generation
    println!("\nTesting memory-efficient dataset generation...");

    let sizes = vec![10_000, 25_000, 50_000];
    for &size in &sizes {
        let start = Instant::now();

        match context.make_regression_gpu(size, 50, 30, 0.1, Some(42)) {
            Ok(dataset) => {
                let duration = start.elapsed();
                let memory_estimate = dataset.n_samples() * dataset.n_features() * 8; // 8 bytes per f64

                println!(
                    "  {} samples: {:.1}ms (~{:.1} MB)",
                    size,
                    duration.as_secs_f64() * 1000.0,
                    memory_estimate as f64 / (1024.0 * 1024.0)
                );
            }
            Err(e) => {
                println!("  {size} samples: Failed - {e}");
            }
        }
    }

    println!();
    Ok(())
}
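
// The inline footprint estimates above assume a dense f64 matrix at 8 bytes
// per element. A tiny helper capturing that arithmetic (illustrative only):
#[allow(dead_code)]
fn estimate_f64_matrix_mb(samples: usize, features: usize) -> f64 {
    (samples * features * 8) as f64 / (1024.0 * 1024.0)
}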

#[allow(dead_code)]
fn demonstrate_real_world_scenarios() -> Result<(), Box<dyn std::error::Error>> {
    println!("🌍 REAL-WORLD GPU SCENARIOS");
    println!("{}", "-".repeat(40));

    // Scenario 1: Large-scale data augmentation
    println!("Scenario 1: Large-scale synthetic data generation");
    demonstrate_large_scale_generation()?;

    // Scenario 2: Rapid prototyping with GPU
    println!("\nScenario 2: Rapid prototyping workflow");
    demonstrate_rapid_prototyping()?;

    // Scenario 3: Batch processing
    println!("\nScenario 3: Batch dataset processing");
    demonstrate_batch_processing()?;

    Ok(())
}

#[allow(dead_code)]
fn demonstrate_large_scale_generation() -> Result<(), Box<dyn std::error::Error>> {
    println!("  🎯 Goal: Generate 1M samples across multiple datasets");
    println!("  📊 Using GPU acceleration for maximum throughput");

    let total_samples = 1_000_000;
    let features = 100;

    // Track generation times
    let mut generation_times = Vec::new();
    let start_total = Instant::now();

    // Classification dataset
    let start = Instant::now();
    let classification =
        make_classification_auto_gpu(total_samples, features, 10, 2, 50, Some(42))?;
    let class_time = start.elapsed();
    generation_times.push(("Classification", class_time, classification.n_samples()));

    // Regression dataset
    let start = Instant::now();
    let regression = make_regression_auto_gpu(total_samples, features, 60, 0.1, Some(43))?;
    let reg_time = start.elapsed();
    generation_times.push(("Regression", reg_time, regression.n_samples()));

    // Clustering dataset
    let start = Instant::now();
    let clustering = make_blobs_auto_gpu(total_samples, 50, 20, 1.5, Some(44))?;
    let cluster_time = start.elapsed();
    generation_times.push(("Clustering", cluster_time, clustering.n_samples()));

    let total_time = start_total.elapsed();

    println!("  ✅ Generation Results:");
    for (name, time, samples) in generation_times {
        let throughput = samples as f64 / time.as_secs_f64();
        println!(
            "    {}: {:.1}s ({:.1}K samples/s)",
            name,
            time.as_secs_f64(),
            throughput / 1000.0
        );
    }

    let total_samples_generated =
        classification.n_samples() + regression.n_samples() + clustering.n_samples();
    let overall_throughput = total_samples_generated as f64 / total_time.as_secs_f64();

    println!(
        "  📈 Overall: {} samples in {:.1}s ({:.1}K samples/s)",
        total_samples_generated,
        total_time.as_secs_f64(),
        overall_throughput / 1000.0
    );

    Ok(())
}

#[allow(dead_code)]
fn demonstrate_rapid_prototyping() -> Result<(), Box<dyn std::error::Error>> {
    println!("  🎯 Goal: Quickly test different dataset configurations");
    println!("  ⚡ Using GPU for instant feedback");

    let configurations = vec![
        ("Small Dense", 1_000, 20, 5),
        ("Medium Sparse", 10_000, 100, 20),
        ("Large High-Dim", 100_000, 500, 100),
    ];

    for (name, samples, features, informative) in configurations {
        let start = Instant::now();

        let dataset = make_classification_auto_gpu(samples, features, 5, 2, informative, Some(42))?;
        let duration = start.elapsed();

        // Quick analysis
        let memory_usage = dataset.n_samples() * dataset.n_features() * 8; // bytes, 8 per f64
        let density = informative as f64 / features as f64; // informative-feature ratio

        println!(
            "    {}: {} in {:.1}ms",
            name,
            format_number(dataset.n_samples()),
            duration.as_secs_f64() * 1000.0
        );
        println!(
            "      Features: {} (density: {:.1}%)",
            features,
            density * 100.0
        );
        println!(
            "      Memory: {:.1} MB",
            memory_usage as f64 / (1024.0 * 1024.0)
        );
    }

    Ok(())
}

#[allow(dead_code)]
fn demonstrate_batch_processing() -> Result<(), Box<dyn std::error::Error>> {
    println!("  🎯 Goal: Process a queue of dataset requests");
    println!("  🔄 Simulating production workload");

    // NOTE: requests run back-to-back below; see the threaded sketch after
    // this function for one way to dispatch them in parallel.

    // Simulated batch requests
    let requests = vec![
        ("User A - Classification", 5_000, 30, "classification"),
        ("User B - Regression", 8_000, 25, "regression"),
        ("User C - Clustering", 3_000, 15, "clustering"),
        ("User D - Classification", 12_000, 40, "classification"),
        ("User E - Regression", 6_000, 35, "regression"),
    ];

    let n_requests = requests.len();
    let batch_start = Instant::now();
    let mut total_samples = 0;

    for (request_name, samples, features, dataset_type) in requests {
        let start = Instant::now();

        let dataset = match dataset_type {
            "classification" => {
                make_classification_auto_gpu(samples, features, 5, 2, features / 2, Some(42))?
            }
            "regression" => {
                make_regression_auto_gpu(samples, features, features / 2, 0.1, Some(42))?
            }
            "clustering" => make_blobs_auto_gpu(samples, features, 8, 1.0, Some(42))?,
            _ => unreachable!(),
        };

        let duration = start.elapsed();
        total_samples += dataset.n_samples();

        println!(
            "    {}: {} samples in {:.1}ms",
            request_name,
            dataset.n_samples(),
            duration.as_secs_f64() * 1000.0
        );
    }

    let batch_duration = batch_start.elapsed();
    let batch_throughput = total_samples as f64 / batch_duration.as_secs_f64();

    println!("  📊 Batch Summary:");
    println!("    Total Requests: {n_requests}");
    println!("    Total Samples: {}", format_number(total_samples));
    println!("    Batch Time: {:.2}s", batch_duration.as_secs_f64());
    println!(
        "    Throughput: {:.1}K samples/s",
        batch_throughput / 1000.0
    );

    Ok(())
}
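
// A hedged sketch of parallel dispatch for the batch above, using
// std::thread::scope. This assumes the *_auto_gpu generators are safe to call
// from multiple threads, which this example does not verify; treat it as a
// starting point, not a confirmed property of the library.
#[allow(dead_code)]
fn demonstrate_parallel_batch() -> Result<(), Box<dyn std::error::Error>> {
    let sizes = [5_000usize, 8_000, 12_000];

    std::thread::scope(|s| {
        // Spawn one worker per request; map results to Send-friendly types so
        // only sample counts and error strings cross the thread boundary.
        let handles: Vec<_> = sizes
            .iter()
            .map(|&n| {
                s.spawn(move || {
                    make_classification_auto_gpu(n, 30, 5, 2, 15, Some(42))
                        .map(|d| d.n_samples())
                        .map_err(|e| e.to_string())
                })
            })
            .collect();

        for h in handles {
            match h.join().expect("worker thread panicked") {
                Ok(n) => println!("    generated {n} samples"),
                Err(e) => println!("    request failed: {e}"),
            }
        }
    });

    Ok(())
}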

/// Helper function to format large numbers
#[allow(dead_code)]
fn format_number(n: usize) -> String {
    if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1_000_000.0)
    } else if n >= 1_000 {
        format!("{:.1}K", n as f64 / 1_000.0)
    } else {
        n.to_string()
    }
}