iro_cuda_ffi_profile/
timer.rs

//! GPU timing utilities with reusable events.
//!
//! This module provides low-overhead timing primitives that avoid repeated
//! event allocation in hot loops.
5
6use iro_cuda_ffi::event::{Event, EventKind};
7use iro_cuda_ffi::prelude::{Result, Stream};
8
9/// Reusable GPU timer for measuring kernel execution time.
10///
11/// Unlike creating new events for each timing, `GpuTimer` reuses its internal
12/// events to avoid allocation overhead in hot loops.
13///
14/// # Example
15///
16/// ```ignore
17/// use iro_cuda_ffi_profile::GpuTimer;
18///
19/// let timer = GpuTimer::new()?;
20/// let mut times = Vec::new();
21///
22/// for _ in 0..100 {
23///     timer.start(&stream)?;
24///     my_kernel(&stream, ...)?;
25///     times.push(timer.stop_sync(&stream)?);
26/// }
27/// ```
28#[derive(Debug)]
29pub struct GpuTimer {
30    start: Event,
31    end: Event,
32}
33
34impl GpuTimer {
35    /// Creates a new GPU timer with reusable timed events.
36    pub fn new() -> Result<Self> {
37        Ok(Self {
38            start: Event::new(EventKind::Timed)?,
39            end: Event::new(EventKind::Timed)?,
40        })
41    }
42
43    /// Records the start timestamp in the given stream.
44    ///
45    /// Call this before the operation(s) you want to time.
46    #[inline]
47    pub fn start(&self, stream: &Stream) -> Result<()> {
48        stream.record_event(&self.start)
49    }
50
51    /// Records the end timestamp in the given stream.
52    ///
53    /// Call this after the operation(s) you want to time.
54    /// Does not synchronize - use `elapsed()` after manual sync,
55    /// or use `stop_sync()` for convenience.
56    #[inline]
57    pub fn stop(&self, stream: &Stream) -> Result<()> {
58        stream.record_event(&self.end)
59    }
60
61    /// Records end timestamp, synchronizes the end event, and returns elapsed milliseconds.
62    ///
63    /// This is the most common usage pattern for timing a single operation.
64    #[inline]
65    pub fn stop_sync(&self, stream: &Stream) -> Result<f32> {
66        stream.record_event(&self.end)?;
67        self.end.synchronize()?;
68        self.end.elapsed_since(&self.start)
69    }
70
71    /// Returns elapsed milliseconds between start and end events.
72    ///
73    /// Both events must have been recorded and completed before calling this.
74    /// If you haven't synchronized, call `stream.synchronize()` or
75    /// `self.end.synchronize()` first.
76    #[inline]
77    pub fn elapsed(&self) -> Result<f32> {
78        self.end.elapsed_since(&self.start)
79    }
80
81    /// Times a closure, returning its result and elapsed milliseconds.
82    ///
83    /// Equivalent to:
84    /// ```ignore
85    /// timer.start(&stream)?;
86    /// let result = f()?;
87    /// let ms = timer.stop_sync(&stream)?;
88    /// ```
89    #[inline]
90    pub fn time<T, F>(&self, stream: &Stream, f: F) -> Result<(T, f32)>
91    where
92        F: FnOnce() -> Result<T>,
93    {
94        self.start(stream)?;
95        let result = f()?;
96        let ms = self.stop_sync(stream)?;
97        Ok((result, ms))
98    }
99
100    /// Times a closure that doesn't return a Result.
101    ///
102    /// Useful for timing infallible operations.
103    #[inline]
104    pub fn time_infallible<T, F>(&self, stream: &Stream, f: F) -> Result<(T, f32)>
105    where
106        F: FnOnce() -> T,
107    {
108        self.start(stream)?;
109        let result = f();
110        let ms = self.stop_sync(stream)?;
111        Ok((result, ms))
112    }
113}
114
115/// Extension trait for convenient one-shot timing on streams.
116///
117/// For repeated timing in loops, prefer `GpuTimer` to avoid event allocation overhead.
118pub trait StreamTimingExt {
119    /// Times a closure, returning its result and elapsed milliseconds.
120    ///
121    /// Creates temporary events for timing. For hot loops, use `GpuTimer` instead.
122    ///
123    /// # Example
124    ///
125    /// ```ignore
126    /// use iro_cuda_ffi_profile::StreamTimingExt;
127    ///
128    /// let (_, ms) = stream.timed(|| {
129    ///     my_kernel(&stream, ...)?;
130    ///     Ok(())
131    /// })?;
132    /// println!("Kernel took {ms:.3} ms");
133    /// ```
134    fn timed<T, F>(&self, f: F) -> Result<(T, f32)>
135    where
136        F: FnOnce() -> Result<T>;
137
138    /// Times a closure, discarding the result and returning only elapsed milliseconds.
139    fn timed_ms<F>(&self, f: F) -> Result<f32>
140    where
141        F: FnOnce() -> Result<()>;
142}
143
144impl StreamTimingExt for Stream {
145    fn timed<T, F>(&self, f: F) -> Result<(T, f32)>
146    where
147        F: FnOnce() -> Result<T>,
148    {
149        let start = self.record_timed_event()?;
150        let result = f()?;
151        let end = self.record_timed_event()?;
152        // Synchronize only the end event, not the entire stream.
153        // This avoids waiting for unrelated work that may have been
154        // queued after the end event was recorded.
155        end.synchronize()?;
156        let ms = end.elapsed_since(&start)?;
157        Ok((result, ms))
158    }
159
160    fn timed_ms<F>(&self, f: F) -> Result<f32>
161    where
162        F: FnOnce() -> Result<()>,
163    {
164        let ((), ms) = self.timed(f)?;
165        Ok(ms)
166    }
167}
168
/// Accumulator for collecting timing samples with minimal overhead.
///
/// Samples are stored as `f64` milliseconds in a growable buffer. Use
/// `TimingSamples::with_capacity` to pre-allocate room for a known number of
/// samples, or `TimingSamples::default()` for an empty accumulator with no
/// pre-allocated storage.
#[derive(Debug, Clone, Default)]
pub struct TimingSamples {
    /// Collected samples in milliseconds, in insertion order.
    samples: Vec<f64>,
}
176
177impl TimingSamples {
178    /// Creates a new accumulator with pre-allocated capacity.
179    pub fn with_capacity(capacity: usize) -> Self {
180        Self {
181            samples: Vec::with_capacity(capacity),
182        }
183    }
184
185    /// Adds a timing sample in milliseconds.
186    #[inline]
187    pub fn push(&mut self, ms: f32) {
188        self.samples.push(ms as f64);
189    }
190
191    /// Adds a timing sample in milliseconds (f64).
192    #[inline]
193    pub fn push_f64(&mut self, ms: f64) {
194        self.samples.push(ms);
195    }
196
197    /// Returns the collected samples.
198    #[inline]
199    pub fn as_slice(&self) -> &[f64] {
200        &self.samples
201    }
202
203    /// Returns the number of samples collected.
204    #[inline]
205    pub fn len(&self) -> usize {
206        self.samples.len()
207    }
208
209    /// Returns true if no samples have been collected.
210    #[inline]
211    pub fn is_empty(&self) -> bool {
212        self.samples.is_empty()
213    }
214
215    /// Clears all samples, keeping allocated capacity.
216    #[inline]
217    pub fn clear(&mut self) {
218        self.samples.clear();
219    }
220
221    /// Computes statistics from the collected samples.
222    ///
223    /// # Panics
224    ///
225    /// Panics if no samples have been collected.
226    pub fn stats(&self) -> crate::Stats {
227        crate::Stats::from_samples(&self.samples)
228    }
229}
230
231impl From<TimingSamples> for Vec<f64> {
232    fn from(samples: TimingSamples) -> Self {
233        samples.samples
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushing samples updates the length, the slice view, and the statistics.
    #[test]
    fn test_timing_samples() {
        let mut acc = TimingSamples::with_capacity(10);
        assert!(acc.is_empty());

        for ms in [1.0_f32, 2.0, 3.0] {
            acc.push(ms);
        }

        assert_eq!(acc.len(), 3);
        assert_eq!(acc.as_slice(), &[1.0, 2.0, 3.0]);

        let summary = acc.stats();
        assert_eq!(summary.count, 3);
        assert!((summary.mean - 2.0).abs() < 1e-10);
    }
}
iro_cuda_ffi_profile/timer.rs

iro_cuda_ffi_profile/
timer.rs