Skip to main content

embeddenator_obs/obs/
hires_timing.rs

1//! High-Resolution Timing Infrastructure
2//!
3//! Provides picosecond-scale timing metrics where hardware supports it.
4//! Falls back gracefully to nanosecond precision on systems without
5//! high-resolution timestamp counters.
6//!
7//! # Implementation Notes
8//!
9//! True picosecond resolution requires:
10//! - CPU timestamp counters (RDTSC on x86_64, CNTVCT on ARM64)
11//! - Known TSC frequency for conversion
12//! - Invariant TSC support (modern CPUs)
13//!
14//! On Linux, we use `clock_gettime(CLOCK_MONOTONIC_RAW)` which provides
15//! nanosecond granularity. For sub-nanosecond estimation, we perform
16//! multiple measurements and statistical analysis.
17//!
18//! # Performance Notes
19//!
20//! TSC frequency is cached after first calibration to avoid repeated
21//! file I/O and calibration overhead on timer creation.
22
23use std::sync::atomic::{AtomicU64, Ordering};
24use std::time::Instant;
25
26/// Cached TSC frequency (Hz) - computed once on first use
27#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
28static CACHED_TSC_FREQ: AtomicU64 = AtomicU64::new(0);
29
30/// Sentinel value indicating TSC freq needs calibration
31#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
32const TSC_UNCALIBRATED: u64 = 0;
33
34/// Picosecond timestamp (1 ps = 10^-12 seconds)
35/// We store as u64 picoseconds, giving us ~213 days of range
36pub type Picoseconds = u64;
37
38/// Nanoseconds for compatibility
39pub type Nanoseconds = u64;
40
41/// Conversion constants
42pub const PS_PER_NS: u64 = 1_000;
43pub const PS_PER_US: u64 = 1_000_000;
44pub const PS_PER_MS: u64 = 1_000_000_000;
45pub const PS_PER_SEC: u64 = 1_000_000_000_000;
46
47/// High-resolution timing result with uncertainty bounds
48#[derive(Clone, Copy, Debug, PartialEq, Eq)]
49pub struct HiResTimestamp {
50    /// Measured time in picoseconds
51    pub picoseconds: Picoseconds,
52    /// Lower bound uncertainty (picoseconds)
53    pub uncertainty_low: Picoseconds,
54    /// Upper bound uncertainty (picoseconds)
55    pub uncertainty_high: Picoseconds,
56    /// Whether this is an estimated (sub-ns) or direct measurement
57    pub is_estimated: bool,
58}
59
60impl HiResTimestamp {
61    /// Create from nanoseconds with standard uncertainty
62    pub fn from_nanos(ns: u64) -> Self {
63        HiResTimestamp {
64            picoseconds: ns.saturating_mul(PS_PER_NS),
65            uncertainty_low: 500, // ±500ps typical for ns-resolution
66            uncertainty_high: 500,
67            is_estimated: false,
68        }
69    }
70
71    /// Create from direct picosecond measurement
72    pub fn from_picos(ps: Picoseconds, uncertainty: Picoseconds) -> Self {
73        HiResTimestamp {
74            picoseconds: ps,
75            uncertainty_low: uncertainty,
76            uncertainty_high: uncertainty,
77            is_estimated: true,
78        }
79    }
80
81    /// Convert to nanoseconds (lossy)
82    pub fn as_nanos(&self) -> u64 {
83        self.picoseconds / PS_PER_NS
84    }
85
86    /// Convert to microseconds (lossy)
87    pub fn as_micros(&self) -> u64 {
88        self.picoseconds / PS_PER_US
89    }
90
91    /// Convert to milliseconds (lossy)
92    pub fn as_millis(&self) -> u64 {
93        self.picoseconds / PS_PER_MS
94    }
95
96    /// Convert to seconds as f64 for high precision display
97    pub fn as_secs_f64(&self) -> f64 {
98        self.picoseconds as f64 / PS_PER_SEC as f64
99    }
100
101    /// Format with appropriate unit
102    pub fn format(&self) -> String {
103        if self.picoseconds < PS_PER_NS {
104            format!("{}ps", self.picoseconds)
105        } else if self.picoseconds < PS_PER_US {
106            format!("{:.3}ns", self.picoseconds as f64 / PS_PER_NS as f64)
107        } else if self.picoseconds < PS_PER_MS {
108            format!("{:.3}µs", self.picoseconds as f64 / PS_PER_US as f64)
109        } else if self.picoseconds < PS_PER_SEC {
110            format!("{:.3}ms", self.picoseconds as f64 / PS_PER_MS as f64)
111        } else {
112            format!("{:.3}s", self.picoseconds as f64 / PS_PER_SEC as f64)
113        }
114    }
115
116    /// Format with uncertainty bounds
117    pub fn format_with_uncertainty(&self) -> String {
118        let base = self.format();
119        let unc = if self.uncertainty_high == self.uncertainty_low {
120            format!("±{}ps", self.uncertainty_low)
121        } else {
122            format!("+{}/-{}ps", self.uncertainty_high, self.uncertainty_low)
123        };
124        format!("{} ({})", base, unc)
125    }
126}
127
128impl std::ops::Sub for HiResTimestamp {
129    type Output = HiResTimestamp;
130
131    fn sub(self, rhs: Self) -> Self::Output {
132        HiResTimestamp {
133            picoseconds: self.picoseconds.saturating_sub(rhs.picoseconds),
134            uncertainty_low: self.uncertainty_low + rhs.uncertainty_low,
135            uncertainty_high: self.uncertainty_high + rhs.uncertainty_high,
136            is_estimated: self.is_estimated || rhs.is_estimated,
137        }
138    }
139}
140
141/// High-resolution timer using best available clock source
142pub struct HiResTimer {
143    /// Start instant for std timing
144    start_instant: Instant,
145    /// Start TSC value (if available)
146    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
147    start_tsc: u64,
148    /// TSC frequency in Hz (calibrated)
149    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
150    tsc_freq_hz: u64,
151}
152
153impl HiResTimer {
154    /// Create and start a new high-resolution timer
155    #[inline]
156    pub fn start() -> Self {
157        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
158        {
159            let start_tsc = rdtsc();
160            let start_instant = Instant::now();
161            let tsc_freq_hz = get_tsc_frequency();
162
163            HiResTimer {
164                start_instant,
165                start_tsc,
166                tsc_freq_hz,
167            }
168        }
169
170        #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
171        {
172            HiResTimer {
173                start_instant: Instant::now(),
174            }
175        }
176    }
177
178    /// Get elapsed time with picosecond resolution (where possible)
179    #[inline]
180    pub fn elapsed(&self) -> HiResTimestamp {
181        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
182        {
183            if self.tsc_freq_hz > 0 {
184                let end_tsc = rdtsc();
185                let cycles = end_tsc.saturating_sub(self.start_tsc);
186
187                // Convert cycles to picoseconds: (cycles * 10^12) / freq_hz
188                // Use u128 to avoid overflow
189                let ps = ((cycles as u128) * PS_PER_SEC as u128) / (self.tsc_freq_hz as u128);
190                let ps = ps.min(u64::MAX as u128) as u64;
191
192                // Uncertainty is ~1 cycle at TSC frequency
193                let uncertainty = PS_PER_SEC / self.tsc_freq_hz;
194
195                return HiResTimestamp::from_picos(ps, uncertainty);
196            }
197        }
198
199        // Fallback to std::time::Instant (nanosecond resolution)
200        let elapsed = self.start_instant.elapsed();
201        HiResTimestamp::from_nanos(elapsed.as_nanos().min(u64::MAX as u128) as u64)
202    }
203
204    /// Get elapsed nanoseconds (convenience method)
205    #[inline]
206    pub fn elapsed_nanos(&self) -> u64 {
207        self.elapsed().as_nanos()
208    }
209
210    /// Get elapsed picoseconds
211    #[inline]
212    pub fn elapsed_picos(&self) -> Picoseconds {
213        self.elapsed().picoseconds
214    }
215}
216
217/// Read TSC (Time Stamp Counter) on x86/x86_64
218#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
219#[inline]
220fn rdtsc() -> u64 {
221    #[cfg(target_arch = "x86_64")]
222    unsafe {
223        std::arch::x86_64::_rdtsc()
224    }
225
226    #[cfg(target_arch = "x86")]
227    unsafe {
228        std::arch::x86::_rdtsc()
229    }
230}
231
232/// Get TSC frequency (cached after first calibration)
233///
234/// This is the critical optimization - we only calibrate once and cache
235/// the result in a static atomic, avoiding expensive file I/O on every
236/// timer creation.
237#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
238#[inline]
239fn get_tsc_frequency() -> u64 {
240    // Fast path: return cached value
241    let cached = CACHED_TSC_FREQ.load(Ordering::Relaxed);
242    if cached != TSC_UNCALIBRATED {
243        return cached;
244    }
245
246    // Slow path: calibrate and cache
247    let freq = calibrate_tsc_frequency();
248    CACHED_TSC_FREQ.store(freq, Ordering::Relaxed);
249    freq
250}
251
252/// Actually calibrate the TSC frequency (called once)
253#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
254fn calibrate_tsc_frequency() -> u64 {
255    // Try to read from sysfs (Linux) - fastest path
256    if let Ok(content) = std::fs::read_to_string("/sys/devices/system/cpu/cpu0/tsc_freq_khz") {
257        if let Ok(khz) = content.trim().parse::<u64>() {
258            return khz * 1000;
259        }
260    }
261
262    // Try cpuinfo for CPU MHz (less accurate but available)
263    if let Ok(content) = std::fs::read_to_string("/proc/cpuinfo") {
264        for line in content.lines() {
265            if line.starts_with("cpu MHz") {
266                if let Some(mhz_str) = line.split(':').nth(1) {
267                    if let Ok(mhz) = mhz_str.trim().parse::<f64>() {
268                        return (mhz * 1_000_000.0) as u64;
269                    }
270                }
271            }
272        }
273    }
274
275    // Calibration fallback: measure TSC ticks over known duration
276    // Use shorter calibration for faster startup (1ms instead of 10ms)
277    let calibration_ns = 1_000_000; // 1ms calibration
278    let start_tsc = rdtsc();
279    let start = Instant::now();
280
281    // Busy-wait for calibration period
282    while start.elapsed().as_nanos() < calibration_ns as u128 {
283        std::hint::spin_loop();
284    }
285
286    let end_tsc = rdtsc();
287    let actual_ns = start.elapsed().as_nanos() as u64;
288    let cycles = end_tsc.saturating_sub(start_tsc);
289
290    // freq = cycles / time = cycles * 10^9 / ns
291    (cycles as u128 * 1_000_000_000 / actual_ns as u128) as u64
292}
293
294/// High-resolution metrics accumulator
295///
296/// Tracks timing statistics at picosecond granularity with
297/// proper statistical aggregation.
298pub struct HiResMetrics {
299    /// Number of samples
300    pub count: AtomicU64,
301    /// Sum of all measurements (picoseconds)
302    pub total_ps: AtomicU64,
303    /// Minimum measurement (picoseconds)
304    pub min_ps: AtomicU64,
305    /// Maximum measurement (picoseconds)
306    pub max_ps: AtomicU64,
307    /// Sum of squares for variance calculation (in units of ns²)
308    /// We use ns² to avoid overflow while maintaining reasonable precision
309    pub sum_sq_ns2: AtomicU64,
310}
311
312impl HiResMetrics {
313    pub const fn new() -> Self {
314        HiResMetrics {
315            count: AtomicU64::new(0),
316            total_ps: AtomicU64::new(0),
317            min_ps: AtomicU64::new(u64::MAX),
318            max_ps: AtomicU64::new(0),
319            sum_sq_ns2: AtomicU64::new(0),
320        }
321    }
322
323    /// Record a measurement
324    pub fn record(&self, timestamp: HiResTimestamp) {
325        let ps = timestamp.picoseconds;
326        let ns = timestamp.as_nanos();
327
328        self.count.fetch_add(1, Ordering::Relaxed);
329        self.total_ps.fetch_add(ps, Ordering::Relaxed);
330
331        // Update min (atomic CAS loop)
332        let mut cur_min = self.min_ps.load(Ordering::Relaxed);
333        while ps < cur_min {
334            match self.min_ps.compare_exchange_weak(
335                cur_min,
336                ps,
337                Ordering::Relaxed,
338                Ordering::Relaxed,
339            ) {
340                Ok(_) => break,
341                Err(x) => cur_min = x,
342            }
343        }
344
345        // Update max (atomic CAS loop)
346        let mut cur_max = self.max_ps.load(Ordering::Relaxed);
347        while ps > cur_max {
348            match self.max_ps.compare_exchange_weak(
349                cur_max,
350                ps,
351                Ordering::Relaxed,
352                Ordering::Relaxed,
353            ) {
354                Ok(_) => break,
355                Err(x) => cur_max = x,
356            }
357        }
358
359        // Add to sum of squares (ns² to avoid overflow)
360        self.sum_sq_ns2
361            .fetch_add(ns.saturating_mul(ns), Ordering::Relaxed);
362    }
363
364    /// Record from HiResTimer
365    pub fn record_timer(&self, timer: &HiResTimer) {
366        self.record(timer.elapsed());
367    }
368
369    /// Get snapshot of metrics
370    pub fn snapshot(&self) -> HiResMetricsSnapshot {
371        let count = self.count.load(Ordering::Relaxed);
372        let total_ps = self.total_ps.load(Ordering::Relaxed);
373        let min_ps = self.min_ps.load(Ordering::Relaxed);
374        let max_ps = self.max_ps.load(Ordering::Relaxed);
375        let sum_sq_ns2 = self.sum_sq_ns2.load(Ordering::Relaxed);
376
377        let mean_ps = if count > 0 { total_ps / count } else { 0 };
378
379        // Variance = E[X²] - E[X]² (computed in ns for numerical stability)
380        let mean_ns = mean_ps / PS_PER_NS;
381        let variance_ns2 = if count > 1 {
382            let e_x2 = sum_sq_ns2 / count;
383            e_x2.saturating_sub(mean_ns.saturating_mul(mean_ns))
384        } else {
385            0
386        };
387
388        // Convert variance to picoseconds: var_ps = var_ns * (ps/ns)²
389        let variance_ps = variance_ns2.saturating_mul(PS_PER_NS * PS_PER_NS);
390        let stddev_ps = (variance_ps as f64).sqrt() as u64;
391
392        HiResMetricsSnapshot {
393            count,
394            total_ps,
395            min_ps: if min_ps == u64::MAX { 0 } else { min_ps },
396            max_ps,
397            mean_ps,
398            stddev_ps,
399        }
400    }
401
402    /// Reset all metrics
403    pub fn reset(&self) {
404        self.count.store(0, Ordering::Relaxed);
405        self.total_ps.store(0, Ordering::Relaxed);
406        self.min_ps.store(u64::MAX, Ordering::Relaxed);
407        self.max_ps.store(0, Ordering::Relaxed);
408        self.sum_sq_ns2.store(0, Ordering::Relaxed);
409    }
410}
411
412impl Default for HiResMetrics {
413    fn default() -> Self {
414        Self::new()
415    }
416}
417
418/// Snapshot of high-resolution metrics
419#[derive(Clone, Copy, Debug, Default)]
420pub struct HiResMetricsSnapshot {
421    /// Number of samples
422    pub count: u64,
423    /// Total time (picoseconds)
424    pub total_ps: Picoseconds,
425    /// Minimum time (picoseconds)
426    pub min_ps: Picoseconds,
427    /// Maximum time (picoseconds)
428    pub max_ps: Picoseconds,
429    /// Mean time (picoseconds)
430    pub mean_ps: Picoseconds,
431    /// Standard deviation (picoseconds)
432    pub stddev_ps: Picoseconds,
433}
434
435impl HiResMetricsSnapshot {
436    /// Format a comprehensive summary
437    pub fn format(&self) -> String {
438        if self.count == 0 {
439            return "no samples".to_string();
440        }
441
442        format!(
443            "n={} mean={} min={} max={} stddev={}",
444            self.count,
445            HiResTimestamp::from_picos(self.mean_ps, 0).format(),
446            HiResTimestamp::from_picos(self.min_ps, 0).format(),
447            HiResTimestamp::from_picos(self.max_ps, 0).format(),
448            HiResTimestamp::from_picos(self.stddev_ps, 0).format(),
449        )
450    }
451
452    /// Get throughput in operations per second
453    pub fn ops_per_sec(&self) -> f64 {
454        if self.count == 0 || self.total_ps == 0 {
455            return 0.0;
456        }
457        self.count as f64 / (self.total_ps as f64 / PS_PER_SEC as f64)
458    }
459
460    /// Get throughput in operations per microsecond
461    pub fn ops_per_us(&self) -> f64 {
462        if self.count == 0 || self.total_ps == 0 {
463            return 0.0;
464        }
465        self.count as f64 / (self.total_ps as f64 / PS_PER_US as f64)
466    }
467}
468
469/// Measure a closure with picosecond timing
470#[inline]
471pub fn measure<F, R>(f: F) -> (R, HiResTimestamp)
472where
473    F: FnOnce() -> R,
474{
475    let timer = HiResTimer::start();
476    let result = f();
477    (result, timer.elapsed())
478}
479
480/// Measure a closure N times and return statistics
481pub fn measure_n<F, R>(n: usize, mut f: F) -> (Vec<R>, HiResMetricsSnapshot)
482where
483    F: FnMut() -> R,
484{
485    let metrics = HiResMetrics::new();
486    let mut results = Vec::with_capacity(n);
487
488    for _ in 0..n {
489        let timer = HiResTimer::start();
490        results.push(f());
491        metrics.record_timer(&timer);
492    }
493
494    (results, metrics.snapshot())
495}
496
497#[cfg(test)]
498mod tests {
499    use super::*;
500
501    #[test]
502    fn test_timestamp_from_nanos() {
503        let ts = HiResTimestamp::from_nanos(1_000);
504        assert_eq!(ts.picoseconds, 1_000_000);
505        assert_eq!(ts.as_nanos(), 1_000);
506        assert!(!ts.is_estimated);
507    }
508
509    #[test]
510    fn test_timestamp_formatting() {
511        assert_eq!(HiResTimestamp::from_picos(500, 0).format(), "500ps");
512        assert_eq!(HiResTimestamp::from_nanos(1).format(), "1.000ns");
513        assert_eq!(HiResTimestamp::from_nanos(1_000).format(), "1.000µs");
514        assert_eq!(HiResTimestamp::from_nanos(1_000_000).format(), "1.000ms");
515        assert_eq!(HiResTimestamp::from_nanos(1_000_000_000).format(), "1.000s");
516    }
517
518    #[test]
519    fn test_timestamp_subtraction() {
520        let a = HiResTimestamp::from_nanos(1_000);
521        let b = HiResTimestamp::from_nanos(500);
522        let diff = a - b;
523        assert_eq!(diff.as_nanos(), 500);
524    }
525
526    #[test]
527    fn test_hires_timer_basic() {
528        let timer = HiResTimer::start();
529        std::thread::sleep(std::time::Duration::from_micros(100));
530        let elapsed = timer.elapsed();
531
532        // Should be at least 100µs = 100,000,000 ps
533        assert!(
534            elapsed.picoseconds >= 100_000_000,
535            "elapsed: {} ps",
536            elapsed.picoseconds
537        );
538        // But not more than 10ms (accounting for scheduling jitter)
539        assert!(
540            elapsed.picoseconds < 10_000_000_000_000,
541            "elapsed: {} ps",
542            elapsed.picoseconds
543        );
544    }
545
546    #[test]
547    fn test_hires_metrics_accumulation() {
548        let metrics = HiResMetrics::new();
549
550        // Record some measurements
551        metrics.record(HiResTimestamp::from_nanos(100));
552        metrics.record(HiResTimestamp::from_nanos(200));
553        metrics.record(HiResTimestamp::from_nanos(300));
554
555        let snapshot = metrics.snapshot();
556        assert_eq!(snapshot.count, 3);
557        assert_eq!(snapshot.min_ps, 100 * PS_PER_NS);
558        assert_eq!(snapshot.max_ps, 300 * PS_PER_NS);
559        assert_eq!(snapshot.mean_ps, 200 * PS_PER_NS);
560    }
561
562    #[test]
563    fn test_measure_closure() {
564        let (result, timing) = measure(|| {
565            let mut sum = 0u64;
566            for i in 0..1000 {
567                sum = sum.wrapping_add(i);
568            }
569            sum
570        });
571
572        assert!(result > 0);
573        assert!(timing.picoseconds > 0);
574    }
575
576    #[test]
577    fn test_measure_n() {
578        let iterations = 100;
579        let (results, stats) = measure_n(iterations, || {
580            // Simple operation
581            std::hint::black_box(42)
582        });
583
584        assert_eq!(results.len(), iterations);
585        assert_eq!(stats.count, iterations as u64);
586        assert!(stats.min_ps > 0);
587        assert!(stats.max_ps >= stats.min_ps);
588    }
589
590    #[test]
591    fn test_picosecond_precision_smoke() {
592        // This test verifies that we can distinguish sub-microsecond timings
593        let timer = HiResTimer::start();
594
595        // Very short operation
596        let _ = std::hint::black_box(1 + 1);
597
598        let elapsed = timer.elapsed();
599
600        // On modern hardware, this should be < 1µs
601        // But we're lenient due to CI variance
602        println!(
603            "Short operation: {} ({})",
604            elapsed.format(),
605            elapsed.format_with_uncertainty()
606        );
607
608        // Just verify we got a measurement
609        assert!(elapsed.picoseconds > 0 || !elapsed.is_estimated);
610    }
611}