iro_cuda_ffi_profile/timer.rs
1//! GPU timing utilities with reusable events.
2//!
3//! This module provides low-overhead timing primitives that avoid repeated
4//! event allocation in hot loops.
5
6use iro_cuda_ffi::event::{Event, EventKind};
7use iro_cuda_ffi::prelude::{Result, Stream};
8
9/// Reusable GPU timer for measuring kernel execution time.
10///
11/// Unlike creating new events for each timing, `GpuTimer` reuses its internal
12/// events to avoid allocation overhead in hot loops.
13///
14/// # Example
15///
16/// ```ignore
17/// use iro_cuda_ffi_profile::GpuTimer;
18///
19/// let timer = GpuTimer::new()?;
20/// let mut times = Vec::new();
21///
22/// for _ in 0..100 {
23/// timer.start(&stream)?;
24/// my_kernel(&stream, ...)?;
25/// times.push(timer.stop_sync(&stream)?);
26/// }
27/// ```
28#[derive(Debug)]
29pub struct GpuTimer {
30 start: Event,
31 end: Event,
32}
33
34impl GpuTimer {
35 /// Creates a new GPU timer with reusable timed events.
36 pub fn new() -> Result<Self> {
37 Ok(Self {
38 start: Event::new(EventKind::Timed)?,
39 end: Event::new(EventKind::Timed)?,
40 })
41 }
42
43 /// Records the start timestamp in the given stream.
44 ///
45 /// Call this before the operation(s) you want to time.
46 #[inline]
47 pub fn start(&self, stream: &Stream) -> Result<()> {
48 stream.record_event(&self.start)
49 }
50
51 /// Records the end timestamp in the given stream.
52 ///
53 /// Call this after the operation(s) you want to time.
54 /// Does not synchronize - use `elapsed()` after manual sync,
55 /// or use `stop_sync()` for convenience.
56 #[inline]
57 pub fn stop(&self, stream: &Stream) -> Result<()> {
58 stream.record_event(&self.end)
59 }
60
61 /// Records end timestamp, synchronizes the end event, and returns elapsed milliseconds.
62 ///
63 /// This is the most common usage pattern for timing a single operation.
64 #[inline]
65 pub fn stop_sync(&self, stream: &Stream) -> Result<f32> {
66 stream.record_event(&self.end)?;
67 self.end.synchronize()?;
68 self.end.elapsed_since(&self.start)
69 }
70
71 /// Returns elapsed milliseconds between start and end events.
72 ///
73 /// Both events must have been recorded and completed before calling this.
74 /// If you haven't synchronized, call `stream.synchronize()` or
75 /// `self.end.synchronize()` first.
76 #[inline]
77 pub fn elapsed(&self) -> Result<f32> {
78 self.end.elapsed_since(&self.start)
79 }
80
81 /// Times a closure, returning its result and elapsed milliseconds.
82 ///
83 /// Equivalent to:
84 /// ```ignore
85 /// timer.start(&stream)?;
86 /// let result = f()?;
87 /// let ms = timer.stop_sync(&stream)?;
88 /// ```
89 #[inline]
90 pub fn time<T, F>(&self, stream: &Stream, f: F) -> Result<(T, f32)>
91 where
92 F: FnOnce() -> Result<T>,
93 {
94 self.start(stream)?;
95 let result = f()?;
96 let ms = self.stop_sync(stream)?;
97 Ok((result, ms))
98 }
99
100 /// Times a closure that doesn't return a Result.
101 ///
102 /// Useful for timing infallible operations.
103 #[inline]
104 pub fn time_infallible<T, F>(&self, stream: &Stream, f: F) -> Result<(T, f32)>
105 where
106 F: FnOnce() -> T,
107 {
108 self.start(stream)?;
109 let result = f();
110 let ms = self.stop_sync(stream)?;
111 Ok((result, ms))
112 }
113}
114
115/// Extension trait for convenient one-shot timing on streams.
116///
117/// For repeated timing in loops, prefer `GpuTimer` to avoid event allocation overhead.
118pub trait StreamTimingExt {
119 /// Times a closure, returning its result and elapsed milliseconds.
120 ///
121 /// Creates temporary events for timing. For hot loops, use `GpuTimer` instead.
122 ///
123 /// # Example
124 ///
125 /// ```ignore
126 /// use iro_cuda_ffi_profile::StreamTimingExt;
127 ///
128 /// let (_, ms) = stream.timed(|| {
129 /// my_kernel(&stream, ...)?;
130 /// Ok(())
131 /// })?;
132 /// println!("Kernel took {ms:.3} ms");
133 /// ```
134 fn timed<T, F>(&self, f: F) -> Result<(T, f32)>
135 where
136 F: FnOnce() -> Result<T>;
137
138 /// Times a closure, discarding the result and returning only elapsed milliseconds.
139 fn timed_ms<F>(&self, f: F) -> Result<f32>
140 where
141 F: FnOnce() -> Result<()>;
142}
143
144impl StreamTimingExt for Stream {
145 fn timed<T, F>(&self, f: F) -> Result<(T, f32)>
146 where
147 F: FnOnce() -> Result<T>,
148 {
149 let start = self.record_timed_event()?;
150 let result = f()?;
151 let end = self.record_timed_event()?;
152 // Synchronize only the end event, not the entire stream.
153 // This avoids waiting for unrelated work that may have been
154 // queued after the end event was recorded.
155 end.synchronize()?;
156 let ms = end.elapsed_since(&start)?;
157 Ok((result, ms))
158 }
159
160 fn timed_ms<F>(&self, f: F) -> Result<f32>
161 where
162 F: FnOnce() -> Result<()>,
163 {
164 let ((), ms) = self.timed(f)?;
165 Ok(ms)
166 }
167}
168
169/// Accumulator for collecting timing samples with minimal overhead.
170///
171/// Pre-allocates capacity and provides efficient sample collection.
172#[derive(Debug)]
173pub struct TimingSamples {
174 samples: Vec<f64>,
175}
176
177impl TimingSamples {
178 /// Creates a new accumulator with pre-allocated capacity.
179 pub fn with_capacity(capacity: usize) -> Self {
180 Self {
181 samples: Vec::with_capacity(capacity),
182 }
183 }
184
185 /// Adds a timing sample in milliseconds.
186 #[inline]
187 pub fn push(&mut self, ms: f32) {
188 self.samples.push(ms as f64);
189 }
190
191 /// Adds a timing sample in milliseconds (f64).
192 #[inline]
193 pub fn push_f64(&mut self, ms: f64) {
194 self.samples.push(ms);
195 }
196
197 /// Returns the collected samples.
198 #[inline]
199 pub fn as_slice(&self) -> &[f64] {
200 &self.samples
201 }
202
203 /// Returns the number of samples collected.
204 #[inline]
205 pub fn len(&self) -> usize {
206 self.samples.len()
207 }
208
209 /// Returns true if no samples have been collected.
210 #[inline]
211 pub fn is_empty(&self) -> bool {
212 self.samples.is_empty()
213 }
214
215 /// Clears all samples, keeping allocated capacity.
216 #[inline]
217 pub fn clear(&mut self) {
218 self.samples.clear();
219 }
220
221 /// Computes statistics from the collected samples.
222 ///
223 /// # Panics
224 ///
225 /// Panics if no samples have been collected.
226 pub fn stats(&self) -> crate::Stats {
227 crate::Stats::from_samples(&self.samples)
228 }
229}
230
231impl From<TimingSamples> for Vec<f64> {
232 fn from(samples: TimingSamples) -> Self {
233 samples.samples
234 }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Pushes three samples and checks the length, contents, and summary stats.
    #[test]
    fn test_timing_samples() {
        let mut acc = TimingSamples::with_capacity(10);
        assert!(acc.is_empty());

        for ms in [1.0_f32, 2.0, 3.0] {
            acc.push(ms);
        }

        assert_eq!(acc.len(), 3);
        assert_eq!(acc.as_slice(), &[1.0, 2.0, 3.0]);

        let summary = acc.stats();
        assert_eq!(summary.count, 3);
        assert!((summary.mean - 2.0).abs() < 1e-10);
    }
}
257}