iro_cuda_ffi_profile/lib.rs
1//! GPU profiling and benchmarking utilities for iro-cuda-ffi.
2//!
3//! This crate provides tools for measuring GPU kernel performance with
4//! minimal overhead and comprehensive statistical analysis.
5//!
6//! # Quick Start
7//!
8//! ```ignore
9//! use iro_cuda_ffi::prelude::*;
10//! use iro_cuda_ffi_profile::prelude::*;
11//!
12//! // One-shot timing
13//! let ms = stream.timed_ms(|| {
14//! my_kernel(&stream, ...)?;
15//! Ok(())
16//! })?;
17//!
18//! // Reusable timer for hot loops
19//! let timer = GpuTimer::new()?;
20//! for _ in 0..100 {
21//! timer.start(&stream)?;
22//! my_kernel(&stream, ...)?;
23//! let ms = timer.stop_sync(&stream)?;
24//! }
25//!
26//! // Full benchmark with statistics
27//! let result = Benchmark::new("my_kernel", &stream)
28//! .warmup(10)
29//! .iterations(100)
30//! .memory(MemoryAccess::f32(n, 3))
31//! .run(|s| my_kernel(s, ...))?;
32//!
33//! println!("{}", result);
34//! ```
35//!
36//! # Features
37//!
38//! - **`GpuTimer`**: Reusable event pair for low-overhead timing in loops
39//! - **`StreamTimingExt`**: Convenience extension for one-shot timing
40//! - **`Benchmark`**: Full benchmark harness with warmup and iterations
41//! - **`Stats`**: Comprehensive statistics including percentiles and outlier detection
42//! - **`Report`**: Formatted output for benchmark results
43//!
44//! # When to Use What
45//!
46//! | Scenario | Tool |
47//! |----------|------|
48//! | Quick one-off timing | `stream.timed_ms()` |
49//! | Timing in a hot loop | `GpuTimer` |
50//! | Full benchmark with stats | `Benchmark::new().run()` |
51//! | Comparing two implementations | `Comparison` |
52//!
53//! # Statistical Analysis
54//!
55//! The `Stats` type provides:
56//! - Basic statistics: min, max, mean, median, standard deviation
57//! - Percentiles: P1, P5, P25, P50, P75, P95, P99
58//! - Outlier detection using the IQR method
59//! - Coefficient of variation for comparing variability
60//!
61//! # Throughput Calculation
62//!
63//! For memory-bound kernels:
64//! ```ignore
65//! let result = Benchmark::new("vector_add", &stream)
66//! .memory(MemoryAccess::f32(n, 3)) // read a, read b, write c
67//! .run(|s| vector_add(s, &a, &b, &mut c))?;
68//!
69//! println!("Throughput: {:.2} GB/s", result.throughput_gbs().unwrap());
70//! ```
71//!
72//! For compute-bound kernels:
73//! ```ignore
74//! let result = Benchmark::new("fma_chain", &stream)
75//! .compute(ComputeIntensity::fma(n, iters))
76//! .run(|s| fma_chain(s, ...))?;
77//!
78//! println!("Compute: {:.2} GFLOP/s", result.throughput_gflops().unwrap());
79//! ```
80
81#![warn(missing_docs)]
82#![warn(clippy::all)]
83
84pub mod bench;
85pub mod report;
86pub mod stats;
87pub mod timer;
88
89// Re-export primary types at crate root
90pub use bench::{
91 bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess,
92};
93pub use report::{format_bytes, format_count, format_gbs, format_gflops, format_ms, Comparison, Report, print_stats};
94pub use stats::Stats;
95pub use timer::{GpuTimer, StreamTimingExt, TimingSamples};
96
97/// Prelude module for convenient imports.
98///
99/// ```ignore
100/// use iro_cuda_ffi_profile::prelude::*;
101/// ```
102pub mod prelude {
103 pub use crate::bench::{
104 bench, bench_memory, BenchConfig, BenchResult, Benchmark, ComputeIntensity, MemoryAccess,
105 };
106 pub use crate::report::{Comparison, Report};
107 pub use crate::stats::Stats;
108 pub use crate::timer::{GpuTimer, StreamTimingExt, TimingSamples};
109}
110
#[cfg(test)]
mod tests {
    // Import every item by name *through the prelude* (not via the crate
    // root), so that a missing or renamed prelude re-export fails compilation
    // of this test module.
    #[allow(unused_imports)] // several names are imported only to prove they resolve
    use super::prelude::{
        bench, bench_memory, BenchConfig, BenchResult, Benchmark, Comparison, ComputeIntensity,
        GpuTimer, MemoryAccess, Report, Stats, StreamTimingExt, TimingSamples,
    };

    /// Compile-time check that the prelude exposes its items and that a few
    /// representative constructors keep their expected signatures.
    ///
    /// Only function pointers are formed here; nothing is invoked at runtime.
    #[test]
    fn test_prelude_imports() {
        let _: fn() -> Stats = || Stats::from_samples(&[1.0]);
        let _: fn() -> BenchConfig = BenchConfig::default;
        let _: fn() -> Report = Report::new;
    }
}
124}