Skip to main content

net/
timestamp.rs

1//! High-precision timestamp generation with zero syscall overhead.
2//!
3//! This module provides monotonically increasing timestamps using the CPU's
4//! Time Stamp Counter (TSC) on x86_64, avoiding syscalls in the hot path.
5//!
6//! # Design
7//!
8//! - Uses `quanta` crate which calibrates against the system clock once at startup
9//! - Subsequent reads use RDTSC instruction directly (no syscall)
10//! - Each shard has its own `TimestampGenerator` to eliminate contention
11//! - Monotonicity is guaranteed via atomic CAS operations
12
13use std::sync::atomic::{AtomicU64, Ordering};
14
15/// High-precision timestamp generator using TSC.
16///
17/// This generator provides strictly monotonic timestamps with sub-nanosecond
18/// resolution and zero syscall overhead after initialization.
19///
20/// # Single-owner invariant
21///
22/// **Each producer should own a dedicated `TimestampGenerator`.** The
23/// type is `Send + Sync` and `next()` *is* safe to call concurrently —
24/// monotonicity is preserved by `compare_exchange_weak` — but the CAS
25/// loop degenerates into a spin under sustained contention. The whole
26/// design rests on the loop almost never iterating: that's only true
27/// when one writer at a time accesses the generator.
28///
29/// The codebase enforces this structurally rather than at runtime:
30///
31/// - `Shard` owns its `TimestampGenerator` by value (not behind `Arc`).
32/// - `TimestampGenerator` is **not** `Clone`, so duplicating one is a
33///   deliberate `mem::replace` / `Default::default()` away — visible
34///   in code review.
35/// - The shard's surrounding `Mutex<Shard>` serializes producers, so
36///   `next()` is invoked by exactly one caller at a time per shard.
37///
38/// If you find yourself reaching for `Arc<TimestampGenerator>`, stop
39/// — give each producer its own instance instead. Every additional
40/// concurrent caller is one more thread potentially CAS-spinning on
41/// `last`.
42pub struct TimestampGenerator {
43    /// quanta clock (TSC-based after calibration).
44    clock: quanta::Clock,
45    /// Raw TSC value sampled at construction. All `next()` calls
46    /// compute their nanosecond offset relative to this baseline,
47    /// so the returned values are "ns since this generator was
48    /// created" rather than the unspecified "ns since the
49    /// quanta::Clock's internal calibration".
50    ///
51    /// Pre-fix the next() call did
52    /// `clock.delta_as_nanos(0, raw)`. quanta's calibration is
53    /// per-Clock and the "0" baseline doesn't correspond to any
54    /// meaningful real-world time — the returned ns counts were
55    /// in the order of system uptime, not "since this generator".
56    /// Two generators created at different times produced
57    /// timestamps with different effective offsets even on the
58    /// same physical TSC, breaking any consumer reasoning about
59    /// "approximately when did this event happen relative to
60    /// generator-creation".
61    baseline_raw: u64,
62    /// Last generated timestamp (for monotonicity).
63    last: AtomicU64,
64}
65
66impl TimestampGenerator {
67    /// Create a new timestamp generator.
68    ///
69    /// This performs a one-time calibration against the system clock.
70    /// Subsequent timestamp reads use TSC directly.
71    pub fn new() -> Self {
72        let clock = quanta::Clock::new();
73        let baseline_raw = clock.raw();
74        Self {
75            clock,
76            baseline_raw,
77            last: AtomicU64::new(0),
78        }
79    }
80
81    /// Generate the next timestamp.
82    ///
83    /// Returns a strictly monotonically increasing value in
84    /// **nanoseconds since this generator was constructed**. This
85    /// operation is lock-free and does not invoke any syscalls.
86    ///
87    /// Previously returned the raw TSC tick count. The docstring
88    /// claimed nanoseconds, but on a 3.5 GHz core the value was ~3.5×
89    /// larger than ns-since-epoch, breaking any consumer that read
90    /// `insertion_ts` and tried to correlate it with wall-clock-derived
91    /// timestamps from elsewhere. Converting via `delta_as_nanos` here
92    /// costs ~1ns extra per call and gives consumers a unit they can
93    /// actually use.
94    ///
95    /// # Performance
96    ///
97    /// - Single-threaded: ~6-12ns per call
98    /// - Under contention: may loop due to CAS, but still lock-free
99    #[inline(always)]
100    pub fn next(&self) -> u64 {
101        // Ensure strict monotonicity via CAS loop. The TSC read
102        // happens INSIDE the loop so a CAS retry under
103        // contention re-samples real time. Pre-fix `raw` / `now`
104        // were captured once before the loop; if a contended
105        // retry took even a few microseconds (worst case under
106        // heavy contention) the returned timestamp was `last+1`
107        // — wall-clock-correct only as long as no thread won
108        // the CAS in the meantime. Under sustained contention
109        // the generator drifted arbitrarily far behind real
110        // time. Re-reading the TSC is cheap (~1ns, no syscall)
111        // and restores the wall-clock contract per call.
112        loop {
113            let raw = self.clock.raw();
114            let now = self.clock.delta_as_nanos(self.baseline_raw, raw);
115            let last = self.last.load(Ordering::Acquire);
116
117            // Guard against u64::MAX exhaustion: saturating_add(1) at MAX
118            // would return MAX again, breaking strict monotonicity.
119            //
120            // Pre-fix, at `last == u64::MAX - 1` we'd return
121            // `u64::MAX` (via `.max()` clamp) and the NEXT call
122            // would panic on `checked_add(1)`. That gap leaves the
123            // generator briefly stalled at MAX before failure —
124            // not a monotonicity violation, but the caller sees a
125            // "value progression" that's actually clamped. Panic
126            // preemptively when the result would be u64::MAX so
127            // the failure mode is one consistent panic, not
128            // "return MAX once then panic on retry."
129            let next = match last.checked_add(1) {
130                Some(inc) => inc,
131                None => panic!("TimestampGenerator: timestamp space exhausted (u64::MAX)"),
132            };
133            let ts = now.max(next);
134            if ts == u64::MAX {
135                panic!(
136                    "TimestampGenerator: timestamp space exhausted (would return u64::MAX); \
137                     last={last}, now={now}",
138                );
139            }
140
141            match self
142                .last
143                .compare_exchange_weak(last, ts, Ordering::Release, Ordering::Relaxed)
144            {
145                Ok(_) => return ts,
146                Err(_) => {
147                    // Another thread updated; retry
148                    std::hint::spin_loop();
149                }
150            }
151        }
152    }
153
154    /// Get the current raw timestamp without incrementing.
155    ///
156    /// This does NOT guarantee monotonicity and is only useful for
157    /// measuring elapsed time or debugging.
158    #[inline(always)]
159    pub fn now_raw(&self) -> u64 {
160        self.clock.raw()
161    }
162
163    /// Convert a raw timestamp to nanoseconds since this
164    /// generator was constructed (i.e. since `baseline_raw`).
165    /// Output units match `next()`: the value returned by
166    /// `raw_to_nanos(self.now_raw())` is comparable to
167    /// recently-`next()`-returned timestamps from the same
168    /// generator (modulo the monotonicity floor `next()` enforces).
169    ///
170    /// Note: NOT "nanoseconds since UNIX epoch". The reference
171    /// point is the per-generator construction moment, so two
172    /// generators created at different times produce values with
173    /// different offsets. For wall-clock-anchored debugging,
174    /// combine with `SystemTime::now()` at generator-construction
175    /// time (recorded externally).
176    ///
177    /// Pre-fix this called `delta_as_nanos(0, raw)`, where the
178    /// `0` baseline was an unspecified quanta-internal reference
179    /// (typically system boot under Windows QPC or the clock's
180    /// first-call moment elsewhere). The returned ns values
181    /// were in the order of system uptime — not comparable to
182    /// `next()` output, despite the function's previous "ns
183    /// since epoch" doc-claim. Aligning both to `baseline_raw`
184    /// makes the surface consistent.
185    #[inline]
186    pub fn raw_to_nanos(&self, raw: u64) -> u64 {
187        self.clock.delta_as_nanos(self.baseline_raw, raw)
188    }
189
190    /// Get the last generated timestamp.
191    #[inline]
192    pub fn last(&self) -> u64 {
193        self.last.load(Ordering::Acquire)
194    }
195}
196
197impl Default for TimestampGenerator {
198    fn default() -> Self {
199        Self::new()
200    }
201}
202
203#[cfg(test)]
204mod tests {
205    use super::*;
206    use std::thread;
207
208    #[test]
209    fn test_monotonicity() {
210        let ts_gen = TimestampGenerator::new();
211        let mut prev = 0u64;
212
213        for _ in 0..10_000 {
214            let ts = ts_gen.next();
215            assert!(ts > prev, "timestamps must be strictly increasing");
216            prev = ts;
217        }
218    }
219
220    /// Source pin: `TimestampGenerator::next` must re-read the
221    /// TSC inside the CAS loop. Pre-fix `let raw = self.clock
222    /// .raw()` and `let now = ...` were captured ONCE before
223    /// the loop; under sustained contention a CAS retry reused
224    /// the stale `now` and the returned timestamp was `last+1`
225    /// rather than wall-clock-now, drifting arbitrarily far
226    /// behind real time. The TSC read is ~1ns and is inside
227    /// the loop now.
228    ///
229    /// We can't reliably reproduce the drift in a unit test
230    /// (the worst case requires sustained heavy contention
231    /// over enough retries that real-time advances measurably
232    /// past the stale capture). The source pin catches a
233    /// "simplification" PR that hoists the read back outside.
234    #[test]
235    fn timestamp_next_reads_tsc_inside_cas_loop() {
236        let src = include_str!("timestamp.rs");
237
238        // Locate the body of `pub fn next(&self) -> u64`.
239        let header = "pub fn next(&self) -> u64 {";
240        let start = src
241            .find(header)
242            .expect("TimestampGenerator::next must exist");
243        // The body ends at the next `\n    fn ` or `\n    pub fn ` at
244        // the impl indent.
245        let after_header = start + header.len();
246        let next_fn = src[after_header..]
247            .find("\n    pub fn ")
248            .or_else(|| src[after_header..].find("\n    fn "))
249            .expect("a sibling fn must follow next()")
250            + after_header;
251        let body = &src[start..next_fn];
252
253        // Strip line comments so the doc-comment that *describes*
254        // the rejected pattern doesn't trip the negative
255        // assertions.
256        let body_no_comments: String = body
257            .lines()
258            .map(|l| match l.find("//") {
259                Some(idx) => &l[..idx],
260                None => l,
261            })
262            .collect::<Vec<_>>()
263            .join("\n");
264
265        // Find the `loop {` opening — the CAS loop body must
266        // contain BOTH the TSC read and the delta_as_nanos
267        // call.
268        let loop_idx = body_no_comments
269            .find("loop {")
270            .expect("CAS loop must exist in next()");
271        let loop_body = &body_no_comments[loop_idx..];
272
273        assert!(
274            loop_body.contains("self.clock.raw()"),
275            "regression: `self.clock.raw()` must be inside the CAS \
276             loop in TimestampGenerator::next. Hoisted outside, a \
277             retry under contention reuses the stale TSC and the \
278             returned timestamp drifts behind real time."
279        );
280        assert!(
281            loop_body.contains("delta_as_nanos"),
282            "regression: `delta_as_nanos` must be inside the CAS \
283             loop alongside the TSC read."
284        );
285    }
286
287    #[test]
288    fn test_monotonicity_concurrent() {
289        let ts_gen = std::sync::Arc::new(TimestampGenerator::new());
290        let mut handles = vec![];
291
292        for _ in 0..4 {
293            let ts_gen_clone = ts_gen.clone();
294            handles.push(thread::spawn(move || {
295                let mut timestamps = Vec::with_capacity(1000);
296                for _ in 0..1000 {
297                    timestamps.push(ts_gen_clone.next());
298                }
299                timestamps
300            }));
301        }
302
303        let mut all_timestamps: Vec<u64> = handles
304            .into_iter()
305            .flat_map(|h| h.join().unwrap())
306            .collect();
307
308        // All timestamps should be unique (strictly monotonic)
309        all_timestamps.sort();
310        let unique_count = all_timestamps.windows(2).filter(|w| w[0] != w[1]).count() + 1;
311        assert_eq!(
312            unique_count,
313            all_timestamps.len(),
314            "all timestamps must be unique"
315        );
316    }
317
318    #[test]
319    fn test_no_syscall_performance() {
320        let ts_gen = TimestampGenerator::new();
321
322        // Warm up
323        for _ in 0..1000 {
324            let _ = ts_gen.next();
325        }
326
327        // Measure
328        let start = std::time::Instant::now();
329        let iterations = 100_000;
330
331        for _ in 0..iterations {
332            let _ = ts_gen.next();
333        }
334
335        let elapsed = start.elapsed();
336        let per_call = elapsed.as_nanos() / iterations as u128;
337
338        // Typically 5-20ns on bare metal, but CI runners can be slower.
339        assert!(
340            per_call < 500,
341            "timestamp generation too slow: {}ns per call",
342            per_call
343        );
344    }
345
346    #[test]
347    fn test_timestamp_generator_new() {
348        let ts_gen = TimestampGenerator::new();
349        // Initial last should be 0
350        assert_eq!(ts_gen.last(), 0);
351    }
352
353    #[test]
354    fn test_timestamp_generator_default() {
355        let ts_gen = TimestampGenerator::default();
356        assert_eq!(ts_gen.last(), 0);
357    }
358
359    #[test]
360    fn test_now_raw() {
361        let ts_gen = TimestampGenerator::new();
362        let raw1 = ts_gen.now_raw();
363        let raw2 = ts_gen.now_raw();
364        // Raw timestamps should be increasing (or at least not decreasing significantly)
365        assert!(raw2 >= raw1 || raw1 - raw2 < 1000); // Allow for some jitter
366    }
367
368    #[test]
369    fn test_raw_to_nanos() {
370        let ts_gen = TimestampGenerator::new();
371        let raw = ts_gen.now_raw();
372        let nanos = ts_gen.raw_to_nanos(raw);
373        // Nanos should be a reasonable value (not zero for a non-zero raw)
374        assert!(nanos > 0);
375    }
376
377    #[test]
378    fn test_raw_to_nanos_zero() {
379        let ts_gen = TimestampGenerator::new();
380        let nanos = ts_gen.raw_to_nanos(0);
381        assert_eq!(nanos, 0);
382    }
383
384    #[test]
385    fn test_last_after_next() {
386        let ts_gen = TimestampGenerator::new();
387        let ts1 = ts_gen.next();
388        assert_eq!(ts_gen.last(), ts1);
389
390        let ts2 = ts_gen.next();
391        assert_eq!(ts_gen.last(), ts2);
392        assert!(ts2 > ts1);
393    }
394
395    #[test]
396    fn test_next_returns_increasing_values() {
397        let ts_gen = TimestampGenerator::new();
398        let mut prev = ts_gen.next();
399
400        for _ in 0..100 {
401            let current = ts_gen.next();
402            assert!(current > prev);
403            prev = current;
404        }
405    }
406
407    #[test]
408    fn test_multiple_generators_independent() {
409        let ts_gen1 = TimestampGenerator::new();
410        let ts_gen2 = TimestampGenerator::new();
411
412        let ts1 = ts_gen1.next();
413        let ts2 = ts_gen2.next();
414
415        // Both should have advanced
416        assert!(ts1 > 0);
417        assert!(ts2 > 0);
418
419        // They are independent, so last values are different
420        assert_eq!(ts_gen1.last(), ts1);
421        assert_eq!(ts_gen2.last(), ts2);
422    }
423
424    #[test]
425    fn test_now_raw_does_not_affect_last() {
426        let ts_gen = TimestampGenerator::new();
427        let initial_last = ts_gen.last();
428
429        // Call now_raw multiple times
430        let _ = ts_gen.now_raw();
431        let _ = ts_gen.now_raw();
432        let _ = ts_gen.now_raw();
433
434        // last should not have changed
435        assert_eq!(ts_gen.last(), initial_last);
436    }
437
438    #[test]
439    fn test_rapid_calls() {
440        let ts_gen = TimestampGenerator::new();
441        let mut timestamps = Vec::with_capacity(10000);
442
443        for _ in 0..10000 {
444            timestamps.push(ts_gen.next());
445        }
446
447        // All should be unique and strictly increasing
448        for window in timestamps.windows(2) {
449            assert!(window[1] > window[0]);
450        }
451    }
452
453    // Regression: saturating_add(1) at u64::MAX used to silently return
454    // the same timestamp twice, breaking strict monotonicity (BUGS_3 #6).
455    #[test]
456    #[should_panic(expected = "timestamp space exhausted")]
457    fn test_next_panics_at_u64_max() {
458        let ts_gen = TimestampGenerator::new();
459        // Force last to u64::MAX
460        ts_gen.last.store(u64::MAX, Ordering::Release);
461        let _ = ts_gen.next();
462    }
463
464    #[test]
465    fn test_send_sync() {
466        fn assert_send_sync<T: Send + Sync>() {}
467        assert_send_sync::<TimestampGenerator>();
468    }
469
470    /// Regression: BUG_REPORT.md #14 — `next()` previously returned
471    /// raw TSC ticks (~3.5× wall-clock-ns on a 3.5 GHz core) while
472    /// claiming nanoseconds. Pin the unit by sleeping for a known
473    /// amount of wall-clock time and asserting the delta is roughly
474    /// nanoseconds, not TSC ticks.
475    #[test]
476    fn next_returns_nanoseconds_not_raw_ticks() {
477        let ts_gen = TimestampGenerator::new();
478        let t0 = ts_gen.next();
479        std::thread::sleep(std::time::Duration::from_millis(50));
480        let t1 = ts_gen.next();
481
482        let delta = t1 - t0;
483        // 50ms == 50_000_000 ns. Allow ±50% slack for sleep
484        // imprecision, scheduler jitter, and CI runners.
485        let ns_lo = 25_000_000u64;
486        let ns_hi = 200_000_000u64;
487        assert!(
488            delta >= ns_lo && delta <= ns_hi,
489            "delta over a 50ms sleep was {delta} — outside the {ns_lo}..={ns_hi} \
490             ns window. Most likely the timestamp is in raw TSC ticks again \
491             (would be ~150_000_000 on a 3 GHz core)."
492        );
493    }
494
495    /// A fresh generator's first `next()` value must be
496    /// small (close to "ns since this generator was created"),
497    /// not "ns since system uptime started" or some other
498    /// arbitrary baseline. Pre-fix the baseline was `0` against
499    /// the quanta::Clock's internal calibration, so on a system
500    /// that had been up for hours the first `next()` returned
501    /// many trillion nanoseconds.
502    ///
503    /// We assert the first `next()` is below ~10ms in nanoseconds,
504    /// which is plenty of slack for construction overhead but
505    /// nowhere near "ns since boot."
506    #[test]
507    fn next_first_call_is_close_to_zero() {
508        let ts_gen = TimestampGenerator::new();
509        let first = ts_gen.next();
510        let ten_ms_ns = 10_000_000u64;
511        assert!(
512            first < ten_ms_ns,
513            "first next() returned {first} ns; expected < {ten_ms_ns} ns. \
514             Pre-fix this would be ~uptime in ns."
515        );
516    }
517}