Skip to main content

net/
timestamp.rs

1//! High-precision timestamp generation with zero syscall overhead.
2//!
3//! This module provides monotonically increasing timestamps using the CPU's
4//! Time Stamp Counter (TSC) on x86_64, avoiding syscalls in the hot path.
5//!
6//! # Design
7//!
8//! - Uses `quanta` crate which calibrates against the system clock once at startup
9//! - Subsequent reads use RDTSC instruction directly (no syscall)
10//! - Each shard has its own `TimestampGenerator` to eliminate contention
11//! - Monotonicity is guaranteed via atomic CAS operations
12
13use std::sync::atomic::{AtomicU64, Ordering};
14
15/// High-precision timestamp generator using TSC.
16///
17/// This generator provides strictly monotonic timestamps with sub-nanosecond
18/// resolution and zero syscall overhead after initialization.
19///
20/// # Single-owner invariant
21///
22/// **Each producer should own a dedicated `TimestampGenerator`.** The
23/// type is `Send + Sync` and `next()` *is* safe to call concurrently —
24/// monotonicity is preserved by `compare_exchange_weak` — but the CAS
25/// loop degenerates into a spin under sustained contention. The whole
26/// design rests on the loop almost never iterating: that's only true
27/// when one writer at a time accesses the generator.
28///
29/// The codebase enforces this structurally rather than at runtime:
30///
31/// - `Shard` owns its `TimestampGenerator` by value (not behind `Arc`).
32/// - `TimestampGenerator` is **not** `Clone`, so duplicating one is a
33///   deliberate `mem::replace` / `Default::default()` away — visible
34///   in code review.
35/// - The shard's surrounding `Mutex<Shard>` serializes producers, so
36///   `next()` is invoked by exactly one caller at a time per shard.
37///
38/// If you find yourself reaching for `Arc<TimestampGenerator>`, stop
39/// — give each producer its own instance instead. Every additional
40/// concurrent caller is one more thread potentially CAS-spinning on
41/// `last`.
42pub struct TimestampGenerator {
43    /// quanta clock (TSC-based after calibration).
44    clock: quanta::Clock,
45    /// Raw TSC value sampled at construction. All `next()` calls
46    /// compute their nanosecond offset relative to this baseline,
47    /// so the returned values are "ns since this generator was
48    /// created" rather than the unspecified "ns since the
49    /// quanta::Clock's internal calibration".
50    ///
51    /// Pre-fix the next() call did
52    /// `clock.delta_as_nanos(0, raw)`. quanta's calibration is
53    /// per-Clock and the "0" baseline doesn't correspond to any
54    /// meaningful real-world time — the returned ns counts were
55    /// in the order of system uptime, not "since this generator".
56    /// Two generators created at different times produced
57    /// timestamps with different effective offsets even on the
58    /// same physical TSC, breaking any consumer reasoning about
59    /// "approximately when did this event happen relative to
60    /// generator-creation".
61    baseline_raw: u64,
62    /// Last generated timestamp (for monotonicity).
63    last: AtomicU64,
64}
65
66impl TimestampGenerator {
67    /// Create a new timestamp generator.
68    ///
69    /// This performs a one-time calibration against the system clock.
70    /// Subsequent timestamp reads use TSC directly.
71    pub fn new() -> Self {
72        let clock = quanta::Clock::new();
73        let baseline_raw = clock.raw();
74        Self {
75            clock,
76            baseline_raw,
77            last: AtomicU64::new(0),
78        }
79    }
80
81    /// Generate the next timestamp.
82    ///
83    /// Returns a strictly monotonically increasing value in
84    /// **nanoseconds since this generator was constructed**. This
85    /// operation is lock-free and does not invoke any syscalls.
86    ///
87    /// Previously returned the raw TSC tick count. The docstring
88    /// claimed nanoseconds, but on a 3.5 GHz core the value was ~3.5×
89    /// larger than ns-since-epoch, breaking any consumer that read
90    /// `insertion_ts` and tried to correlate it with wall-clock-derived
91    /// timestamps from elsewhere. Converting via `delta_as_nanos` here
92    /// costs ~1ns extra per call and gives consumers a unit they can
93    /// actually use.
94    ///
95    /// # Performance
96    ///
97    /// - Single-threaded: ~6-12ns per call
98    /// - Under contention: may loop due to CAS, but still lock-free
99    #[inline(always)]
100    pub fn next(&self) -> u64 {
101        // Ensure strict monotonicity via CAS loop. The TSC read
102        // happens INSIDE the loop so a CAS retry under
103        // contention re-samples real time. Pre-fix `raw` / `now`
104        // were captured once before the loop; if a contended
105        // retry took even a few microseconds (worst case under
106        // heavy contention) the returned timestamp was `last+1`
107        // — wall-clock-correct only as long as no thread won
108        // the CAS in the meantime. Under sustained contention
109        // the generator drifted arbitrarily far behind real
110        // time. Re-reading the TSC is cheap (~1ns, no syscall)
111        // and restores the wall-clock contract per call.
112        loop {
113            let raw = self.clock.raw();
114            let now = self.clock.delta_as_nanos(self.baseline_raw, raw);
115            let last = self.last.load(Ordering::Acquire);
116
117            // Guard against u64::MAX exhaustion: saturating_add(1) at MAX
118            // would return MAX again, breaking strict monotonicity.
119            //
120            // Pre-fix, at `last == u64::MAX - 1` we'd return
121            // `u64::MAX` (via `.max()` clamp) and the NEXT call
122            // would panic on `checked_add(1)`. That gap leaves the
123            // generator briefly stalled at MAX before failure —
124            // not a monotonicity violation, but the caller sees a
125            // "value progression" that's actually clamped. Panic
126            // preemptively when the result would be u64::MAX so
127            // the failure mode is one consistent panic, not
128            // "return MAX once then panic on retry."
129            let next = match last.checked_add(1) {
130                Some(inc) => inc,
131                None => panic!("TimestampGenerator: timestamp space exhausted (u64::MAX)"),
132            };
133            let ts = now.max(next);
134            if ts == u64::MAX {
135                panic!(
136                    "TimestampGenerator: timestamp space exhausted (would return u64::MAX); \
137                     last={last}, now={now}",
138                );
139            }
140
141            match self
142                .last
143                .compare_exchange_weak(last, ts, Ordering::Release, Ordering::Relaxed)
144            {
145                Ok(_) => return ts,
146                Err(_) => {
147                    // Another thread updated; retry
148                    std::hint::spin_loop();
149                }
150            }
151        }
152    }
153
154    /// Get the current raw timestamp without incrementing.
155    ///
156    /// This does NOT guarantee monotonicity and is only useful for
157    /// measuring elapsed time or debugging.
158    #[inline(always)]
159    pub fn now_raw(&self) -> u64 {
160        self.clock.raw()
161    }
162
163    /// Compute the nanosecond delta between two raw timestamps
164    /// (`end - start`) using this generator's calibrated clock.
165    /// Cheaper than `raw_to_nanos(end) - raw_to_nanos(start)`
166    /// because the underlying quanta `delta_as_nanos` does the
167    /// scaling once instead of twice; intended for in-process
168    /// elapsed-time measurement on hot paths that already hold a
169    /// `now_raw()` sample (e.g. the shard's dynamic-scaling
170    /// metrics sampling — PERF_AUDIT §1.3).
171    #[inline]
172    pub fn delta_ns(&self, start_raw: u64, end_raw: u64) -> u64 {
173        self.clock.delta_as_nanos(start_raw, end_raw)
174    }
175
176    /// Convert a raw timestamp to nanoseconds since this
177    /// generator was constructed (i.e. since `baseline_raw`).
178    /// Output units match `next()`: the value returned by
179    /// `raw_to_nanos(self.now_raw())` is comparable to
180    /// recently-`next()`-returned timestamps from the same
181    /// generator (modulo the monotonicity floor `next()` enforces).
182    ///
183    /// Note: NOT "nanoseconds since UNIX epoch". The reference
184    /// point is the per-generator construction moment, so two
185    /// generators created at different times produce values with
186    /// different offsets. For wall-clock-anchored debugging,
187    /// combine with `SystemTime::now()` at generator-construction
188    /// time (recorded externally).
189    ///
190    /// Pre-fix this called `delta_as_nanos(0, raw)`, where the
191    /// `0` baseline was an unspecified quanta-internal reference
192    /// (typically system boot under Windows QPC or the clock's
193    /// first-call moment elsewhere). The returned ns values
194    /// were in the order of system uptime — not comparable to
195    /// `next()` output, despite the function's previous "ns
196    /// since epoch" doc-claim. Aligning both to `baseline_raw`
197    /// makes the surface consistent.
198    #[inline]
199    pub fn raw_to_nanos(&self, raw: u64) -> u64 {
200        self.clock.delta_as_nanos(self.baseline_raw, raw)
201    }
202
203    /// Get the last generated timestamp.
204    #[inline]
205    pub fn last(&self) -> u64 {
206        self.last.load(Ordering::Acquire)
207    }
208}
209
210impl Default for TimestampGenerator {
211    fn default() -> Self {
212        Self::new()
213    }
214}
215
216#[cfg(test)]
217mod tests {
218    use super::*;
219    use std::thread;
220
221    #[test]
222    fn test_monotonicity() {
223        let ts_gen = TimestampGenerator::new();
224        let mut prev = 0u64;
225
226        for _ in 0..10_000 {
227            let ts = ts_gen.next();
228            assert!(ts > prev, "timestamps must be strictly increasing");
229            prev = ts;
230        }
231    }
232
233    /// Source pin: `TimestampGenerator::next` must re-read the
234    /// TSC inside the CAS loop. Pre-fix `let raw = self.clock
235    /// .raw()` and `let now = ...` were captured ONCE before
236    /// the loop; under sustained contention a CAS retry reused
237    /// the stale `now` and the returned timestamp was `last+1`
238    /// rather than wall-clock-now, drifting arbitrarily far
239    /// behind real time. The TSC read is ~1ns and is inside
240    /// the loop now.
241    ///
242    /// We can't reliably reproduce the drift in a unit test
243    /// (the worst case requires sustained heavy contention
244    /// over enough retries that real-time advances measurably
245    /// past the stale capture). The source pin catches a
246    /// "simplification" PR that hoists the read back outside.
247    #[test]
248    fn timestamp_next_reads_tsc_inside_cas_loop() {
249        let src = include_str!("timestamp.rs");
250
251        // Locate the body of `pub fn next(&self) -> u64`.
252        let header = "pub fn next(&self) -> u64 {";
253        let start = src
254            .find(header)
255            .expect("TimestampGenerator::next must exist");
256        // The body ends at the next `\n    fn ` or `\n    pub fn ` at
257        // the impl indent.
258        let after_header = start + header.len();
259        let next_fn = src[after_header..]
260            .find("\n    pub fn ")
261            .or_else(|| src[after_header..].find("\n    fn "))
262            .expect("a sibling fn must follow next()")
263            + after_header;
264        let body = &src[start..next_fn];
265
266        // Strip line comments so the doc-comment that *describes*
267        // the rejected pattern doesn't trip the negative
268        // assertions.
269        let body_no_comments: String = body
270            .lines()
271            .map(|l| match l.find("//") {
272                Some(idx) => &l[..idx],
273                None => l,
274            })
275            .collect::<Vec<_>>()
276            .join("\n");
277
278        // Find the `loop {` opening — the CAS loop body must
279        // contain BOTH the TSC read and the delta_as_nanos
280        // call.
281        let loop_idx = body_no_comments
282            .find("loop {")
283            .expect("CAS loop must exist in next()");
284        let loop_body = &body_no_comments[loop_idx..];
285
286        assert!(
287            loop_body.contains("self.clock.raw()"),
288            "regression: `self.clock.raw()` must be inside the CAS \
289             loop in TimestampGenerator::next. Hoisted outside, a \
290             retry under contention reuses the stale TSC and the \
291             returned timestamp drifts behind real time."
292        );
293        assert!(
294            loop_body.contains("delta_as_nanos"),
295            "regression: `delta_as_nanos` must be inside the CAS \
296             loop alongside the TSC read."
297        );
298    }
299
300    #[test]
301    fn test_monotonicity_concurrent() {
302        let ts_gen = std::sync::Arc::new(TimestampGenerator::new());
303        let mut handles = vec![];
304
305        for _ in 0..4 {
306            let ts_gen_clone = ts_gen.clone();
307            handles.push(thread::spawn(move || {
308                let mut timestamps = Vec::with_capacity(1000);
309                for _ in 0..1000 {
310                    timestamps.push(ts_gen_clone.next());
311                }
312                timestamps
313            }));
314        }
315
316        let mut all_timestamps: Vec<u64> = handles
317            .into_iter()
318            .flat_map(|h| h.join().unwrap())
319            .collect();
320
321        // All timestamps should be unique (strictly monotonic)
322        all_timestamps.sort();
323        let unique_count = all_timestamps.windows(2).filter(|w| w[0] != w[1]).count() + 1;
324        assert_eq!(
325            unique_count,
326            all_timestamps.len(),
327            "all timestamps must be unique"
328        );
329    }
330
331    #[test]
332    fn test_no_syscall_performance() {
333        let ts_gen = TimestampGenerator::new();
334
335        // Warm up
336        for _ in 0..1000 {
337            let _ = ts_gen.next();
338        }
339
340        // Measure
341        let start = std::time::Instant::now();
342        let iterations = 100_000;
343
344        for _ in 0..iterations {
345            let _ = ts_gen.next();
346        }
347
348        let elapsed = start.elapsed();
349        let per_call = elapsed.as_nanos() / iterations as u128;
350
351        // Typically 5-20ns on bare metal, but CI runners can be slower.
352        assert!(
353            per_call < 500,
354            "timestamp generation too slow: {}ns per call",
355            per_call
356        );
357    }
358
359    #[test]
360    fn test_timestamp_generator_new() {
361        let ts_gen = TimestampGenerator::new();
362        // Initial last should be 0
363        assert_eq!(ts_gen.last(), 0);
364    }
365
366    #[test]
367    fn test_timestamp_generator_default() {
368        let ts_gen = TimestampGenerator::default();
369        assert_eq!(ts_gen.last(), 0);
370    }
371
372    #[test]
373    fn test_now_raw() {
374        let ts_gen = TimestampGenerator::new();
375        let raw1 = ts_gen.now_raw();
376        let raw2 = ts_gen.now_raw();
377        // Raw timestamps should be increasing (or at least not decreasing significantly)
378        assert!(raw2 >= raw1 || raw1 - raw2 < 1000); // Allow for some jitter
379    }
380
381    #[test]
382    fn test_raw_to_nanos() {
383        let ts_gen = TimestampGenerator::new();
384        let raw = ts_gen.now_raw();
385        let nanos = ts_gen.raw_to_nanos(raw);
386        // Nanos should be a reasonable value (not zero for a non-zero raw)
387        assert!(nanos > 0);
388    }
389
390    #[test]
391    fn test_raw_to_nanos_zero() {
392        let ts_gen = TimestampGenerator::new();
393        let nanos = ts_gen.raw_to_nanos(0);
394        assert_eq!(nanos, 0);
395    }
396
397    #[test]
398    fn test_last_after_next() {
399        let ts_gen = TimestampGenerator::new();
400        let ts1 = ts_gen.next();
401        assert_eq!(ts_gen.last(), ts1);
402
403        let ts2 = ts_gen.next();
404        assert_eq!(ts_gen.last(), ts2);
405        assert!(ts2 > ts1);
406    }
407
408    #[test]
409    fn test_next_returns_increasing_values() {
410        let ts_gen = TimestampGenerator::new();
411        let mut prev = ts_gen.next();
412
413        for _ in 0..100 {
414            let current = ts_gen.next();
415            assert!(current > prev);
416            prev = current;
417        }
418    }
419
420    #[test]
421    fn test_multiple_generators_independent() {
422        let ts_gen1 = TimestampGenerator::new();
423        let ts_gen2 = TimestampGenerator::new();
424
425        let ts1 = ts_gen1.next();
426        let ts2 = ts_gen2.next();
427
428        // Both should have advanced
429        assert!(ts1 > 0);
430        assert!(ts2 > 0);
431
432        // They are independent, so last values are different
433        assert_eq!(ts_gen1.last(), ts1);
434        assert_eq!(ts_gen2.last(), ts2);
435    }
436
437    #[test]
438    fn test_now_raw_does_not_affect_last() {
439        let ts_gen = TimestampGenerator::new();
440        let initial_last = ts_gen.last();
441
442        // Call now_raw multiple times
443        let _ = ts_gen.now_raw();
444        let _ = ts_gen.now_raw();
445        let _ = ts_gen.now_raw();
446
447        // last should not have changed
448        assert_eq!(ts_gen.last(), initial_last);
449    }
450
451    #[test]
452    fn test_rapid_calls() {
453        let ts_gen = TimestampGenerator::new();
454        let mut timestamps = Vec::with_capacity(10000);
455
456        for _ in 0..10000 {
457            timestamps.push(ts_gen.next());
458        }
459
460        // All should be unique and strictly increasing
461        for window in timestamps.windows(2) {
462            assert!(window[1] > window[0]);
463        }
464    }
465
466    // Regression: saturating_add(1) at u64::MAX used to silently return
467    // the same timestamp twice, breaking strict monotonicity (BUGS_3 #6).
468    #[test]
469    #[should_panic(expected = "timestamp space exhausted")]
470    fn test_next_panics_at_u64_max() {
471        let ts_gen = TimestampGenerator::new();
472        // Force last to u64::MAX
473        ts_gen.last.store(u64::MAX, Ordering::Release);
474        let _ = ts_gen.next();
475    }
476
477    #[test]
478    fn test_send_sync() {
479        fn assert_send_sync<T: Send + Sync>() {}
480        assert_send_sync::<TimestampGenerator>();
481    }
482
483    /// Regression: BUG_REPORT.md #14 — `next()` previously returned
484    /// raw TSC ticks (~3.5× wall-clock-ns on a 3.5 GHz core) while
485    /// claiming nanoseconds. Pin the unit by sleeping for a known
486    /// amount of wall-clock time and asserting the delta is roughly
487    /// nanoseconds, not TSC ticks.
488    #[test]
489    fn next_returns_nanoseconds_not_raw_ticks() {
490        let ts_gen = TimestampGenerator::new();
491        let t0 = ts_gen.next();
492        std::thread::sleep(std::time::Duration::from_millis(50));
493        let t1 = ts_gen.next();
494
495        let delta = t1 - t0;
496        // 50ms == 50_000_000 ns. Allow ±50% slack for sleep
497        // imprecision, scheduler jitter, and CI runners.
498        let ns_lo = 25_000_000u64;
499        let ns_hi = 200_000_000u64;
500        assert!(
501            delta >= ns_lo && delta <= ns_hi,
502            "delta over a 50ms sleep was {delta} — outside the {ns_lo}..={ns_hi} \
503             ns window. Most likely the timestamp is in raw TSC ticks again \
504             (would be ~150_000_000 on a 3 GHz core)."
505        );
506    }
507
508    /// A fresh generator's first `next()` value must be
509    /// small (close to "ns since this generator was created"),
510    /// not "ns since system uptime started" or some other
511    /// arbitrary baseline. Pre-fix the baseline was `0` against
512    /// the quanta::Clock's internal calibration, so on a system
513    /// that had been up for hours the first `next()` returned
514    /// many trillion nanoseconds.
515    ///
516    /// We assert the first `next()` is below ~10ms in nanoseconds,
517    /// which is plenty of slack for construction overhead but
518    /// nowhere near "ns since boot."
519    #[test]
520    fn next_first_call_is_close_to_zero() {
521        let ts_gen = TimestampGenerator::new();
522        let first = ts_gen.next();
523        let ten_ms_ns = 10_000_000u64;
524        assert!(
525            first < ten_ms_ns,
526            "first next() returned {first} ns; expected < {ten_ms_ns} ns. \
527             Pre-fix this would be ~uptime in ns."
528        );
529    }
530}