net/timestamp.rs
1//! High-precision timestamp generation with zero syscall overhead.
2//!
3//! This module provides monotonically increasing timestamps using the CPU's
4//! Time Stamp Counter (TSC) on x86_64, avoiding syscalls in the hot path.
5//!
6//! # Design
7//!
8//! - Uses `quanta` crate which calibrates against the system clock once at startup
9//! - Subsequent reads use RDTSC instruction directly (no syscall)
10//! - Each shard has its own `TimestampGenerator` to eliminate contention
11//! - Monotonicity is guaranteed via atomic CAS operations
12
13use std::sync::atomic::{AtomicU64, Ordering};
14
15/// High-precision timestamp generator using TSC.
16///
17/// This generator provides strictly monotonic timestamps with sub-nanosecond
18/// resolution and zero syscall overhead after initialization.
19///
20/// # Single-owner invariant
21///
22/// **Each producer should own a dedicated `TimestampGenerator`.** The
23/// type is `Send + Sync` and `next()` *is* safe to call concurrently —
24/// monotonicity is preserved by `compare_exchange_weak` — but the CAS
25/// loop degenerates into a spin under sustained contention. The whole
26/// design rests on the loop almost never iterating: that's only true
27/// when one writer at a time accesses the generator.
28///
29/// The codebase enforces this structurally rather than at runtime:
30///
31/// - `Shard` owns its `TimestampGenerator` by value (not behind `Arc`).
32/// - `TimestampGenerator` is **not** `Clone`, so duplicating one is a
33/// deliberate `mem::replace` / `Default::default()` away — visible
34/// in code review.
35/// - The shard's surrounding `Mutex<Shard>` serializes producers, so
36/// `next()` is invoked by exactly one caller at a time per shard.
37///
38/// If you find yourself reaching for `Arc<TimestampGenerator>`, stop
39/// — give each producer its own instance instead. Every additional
40/// concurrent caller is one more thread potentially CAS-spinning on
41/// `last`.
42pub struct TimestampGenerator {
43 /// quanta clock (TSC-based after calibration).
44 clock: quanta::Clock,
45 /// Raw TSC value sampled at construction. All `next()` calls
46 /// compute their nanosecond offset relative to this baseline,
47 /// so the returned values are "ns since this generator was
48 /// created" rather than the unspecified "ns since the
49 /// quanta::Clock's internal calibration".
50 ///
51 /// Pre-fix the next() call did
52 /// `clock.delta_as_nanos(0, raw)`. quanta's calibration is
53 /// per-Clock and the "0" baseline doesn't correspond to any
54 /// meaningful real-world time — the returned ns counts were
55 /// in the order of system uptime, not "since this generator".
56 /// Two generators created at different times produced
57 /// timestamps with different effective offsets even on the
58 /// same physical TSC, breaking any consumer reasoning about
59 /// "approximately when did this event happen relative to
60 /// generator-creation".
61 baseline_raw: u64,
62 /// Last generated timestamp (for monotonicity).
63 last: AtomicU64,
64}
65
66impl TimestampGenerator {
67 /// Create a new timestamp generator.
68 ///
69 /// This performs a one-time calibration against the system clock.
70 /// Subsequent timestamp reads use TSC directly.
71 pub fn new() -> Self {
72 let clock = quanta::Clock::new();
73 let baseline_raw = clock.raw();
74 Self {
75 clock,
76 baseline_raw,
77 last: AtomicU64::new(0),
78 }
79 }
80
81 /// Generate the next timestamp.
82 ///
83 /// Returns a strictly monotonically increasing value in
84 /// **nanoseconds since this generator was constructed**. This
85 /// operation is lock-free and does not invoke any syscalls.
86 ///
87 /// Previously returned the raw TSC tick count. The docstring
88 /// claimed nanoseconds, but on a 3.5 GHz core the value was ~3.5×
89 /// larger than ns-since-epoch, breaking any consumer that read
90 /// `insertion_ts` and tried to correlate it with wall-clock-derived
91 /// timestamps from elsewhere. Converting via `delta_as_nanos` here
92 /// costs ~1ns extra per call and gives consumers a unit they can
93 /// actually use.
94 ///
95 /// # Performance
96 ///
97 /// - Single-threaded: ~6-12ns per call
98 /// - Under contention: may loop due to CAS, but still lock-free
99 #[inline(always)]
100 pub fn next(&self) -> u64 {
101 // Ensure strict monotonicity via CAS loop. The TSC read
102 // happens INSIDE the loop so a CAS retry under
103 // contention re-samples real time. Pre-fix `raw` / `now`
104 // were captured once before the loop; if a contended
105 // retry took even a few microseconds (worst case under
106 // heavy contention) the returned timestamp was `last+1`
107 // — wall-clock-correct only as long as no thread won
108 // the CAS in the meantime. Under sustained contention
109 // the generator drifted arbitrarily far behind real
110 // time. Re-reading the TSC is cheap (~1ns, no syscall)
111 // and restores the wall-clock contract per call.
112 loop {
113 let raw = self.clock.raw();
114 let now = self.clock.delta_as_nanos(self.baseline_raw, raw);
115 let last = self.last.load(Ordering::Acquire);
116
117 // Guard against u64::MAX exhaustion: saturating_add(1) at MAX
118 // would return MAX again, breaking strict monotonicity.
119 //
120 // Pre-fix, at `last == u64::MAX - 1` we'd return
121 // `u64::MAX` (via `.max()` clamp) and the NEXT call
122 // would panic on `checked_add(1)`. That gap leaves the
123 // generator briefly stalled at MAX before failure —
124 // not a monotonicity violation, but the caller sees a
125 // "value progression" that's actually clamped. Panic
126 // preemptively when the result would be u64::MAX so
127 // the failure mode is one consistent panic, not
128 // "return MAX once then panic on retry."
129 let next = match last.checked_add(1) {
130 Some(inc) => inc,
131 None => panic!("TimestampGenerator: timestamp space exhausted (u64::MAX)"),
132 };
133 let ts = now.max(next);
134 if ts == u64::MAX {
135 panic!(
136 "TimestampGenerator: timestamp space exhausted (would return u64::MAX); \
137 last={last}, now={now}",
138 );
139 }
140
141 match self
142 .last
143 .compare_exchange_weak(last, ts, Ordering::Release, Ordering::Relaxed)
144 {
145 Ok(_) => return ts,
146 Err(_) => {
147 // Another thread updated; retry
148 std::hint::spin_loop();
149 }
150 }
151 }
152 }
153
154 /// Get the current raw timestamp without incrementing.
155 ///
156 /// This does NOT guarantee monotonicity and is only useful for
157 /// measuring elapsed time or debugging.
158 #[inline(always)]
159 pub fn now_raw(&self) -> u64 {
160 self.clock.raw()
161 }
162
163 /// Compute the nanosecond delta between two raw timestamps
164 /// (`end - start`) using this generator's calibrated clock.
165 /// Cheaper than `raw_to_nanos(end) - raw_to_nanos(start)`
166 /// because the underlying quanta `delta_as_nanos` does the
167 /// scaling once instead of twice; intended for in-process
168 /// elapsed-time measurement on hot paths that already hold a
169 /// `now_raw()` sample (e.g. the shard's dynamic-scaling
170 /// metrics sampling — PERF_AUDIT §1.3).
171 #[inline]
172 pub fn delta_ns(&self, start_raw: u64, end_raw: u64) -> u64 {
173 self.clock.delta_as_nanos(start_raw, end_raw)
174 }
175
176 /// Convert a raw timestamp to nanoseconds since this
177 /// generator was constructed (i.e. since `baseline_raw`).
178 /// Output units match `next()`: the value returned by
179 /// `raw_to_nanos(self.now_raw())` is comparable to
180 /// recently-`next()`-returned timestamps from the same
181 /// generator (modulo the monotonicity floor `next()` enforces).
182 ///
183 /// Note: NOT "nanoseconds since UNIX epoch". The reference
184 /// point is the per-generator construction moment, so two
185 /// generators created at different times produce values with
186 /// different offsets. For wall-clock-anchored debugging,
187 /// combine with `SystemTime::now()` at generator-construction
188 /// time (recorded externally).
189 ///
190 /// Pre-fix this called `delta_as_nanos(0, raw)`, where the
191 /// `0` baseline was an unspecified quanta-internal reference
192 /// (typically system boot under Windows QPC or the clock's
193 /// first-call moment elsewhere). The returned ns values
194 /// were in the order of system uptime — not comparable to
195 /// `next()` output, despite the function's previous "ns
196 /// since epoch" doc-claim. Aligning both to `baseline_raw`
197 /// makes the surface consistent.
198 #[inline]
199 pub fn raw_to_nanos(&self, raw: u64) -> u64 {
200 self.clock.delta_as_nanos(self.baseline_raw, raw)
201 }
202
203 /// Get the last generated timestamp.
204 #[inline]
205 pub fn last(&self) -> u64 {
206 self.last.load(Ordering::Acquire)
207 }
208}
209
210impl Default for TimestampGenerator {
211 fn default() -> Self {
212 Self::new()
213 }
214}
215
216#[cfg(test)]
217mod tests {
218 use super::*;
219 use std::thread;
220
221 #[test]
222 fn test_monotonicity() {
223 let ts_gen = TimestampGenerator::new();
224 let mut prev = 0u64;
225
226 for _ in 0..10_000 {
227 let ts = ts_gen.next();
228 assert!(ts > prev, "timestamps must be strictly increasing");
229 prev = ts;
230 }
231 }
232
233 /// Source pin: `TimestampGenerator::next` must re-read the
234 /// TSC inside the CAS loop. Pre-fix `let raw = self.clock
235 /// .raw()` and `let now = ...` were captured ONCE before
236 /// the loop; under sustained contention a CAS retry reused
237 /// the stale `now` and the returned timestamp was `last+1`
238 /// rather than wall-clock-now, drifting arbitrarily far
239 /// behind real time. The TSC read is ~1ns and is inside
240 /// the loop now.
241 ///
242 /// We can't reliably reproduce the drift in a unit test
243 /// (the worst case requires sustained heavy contention
244 /// over enough retries that real-time advances measurably
245 /// past the stale capture). The source pin catches a
246 /// "simplification" PR that hoists the read back outside.
247 #[test]
248 fn timestamp_next_reads_tsc_inside_cas_loop() {
249 let src = include_str!("timestamp.rs");
250
251 // Locate the body of `pub fn next(&self) -> u64`.
252 let header = "pub fn next(&self) -> u64 {";
253 let start = src
254 .find(header)
255 .expect("TimestampGenerator::next must exist");
256 // The body ends at the next `\n fn ` or `\n pub fn ` at
257 // the impl indent.
258 let after_header = start + header.len();
259 let next_fn = src[after_header..]
260 .find("\n pub fn ")
261 .or_else(|| src[after_header..].find("\n fn "))
262 .expect("a sibling fn must follow next()")
263 + after_header;
264 let body = &src[start..next_fn];
265
266 // Strip line comments so the doc-comment that *describes*
267 // the rejected pattern doesn't trip the negative
268 // assertions.
269 let body_no_comments: String = body
270 .lines()
271 .map(|l| match l.find("//") {
272 Some(idx) => &l[..idx],
273 None => l,
274 })
275 .collect::<Vec<_>>()
276 .join("\n");
277
278 // Find the `loop {` opening — the CAS loop body must
279 // contain BOTH the TSC read and the delta_as_nanos
280 // call.
281 let loop_idx = body_no_comments
282 .find("loop {")
283 .expect("CAS loop must exist in next()");
284 let loop_body = &body_no_comments[loop_idx..];
285
286 assert!(
287 loop_body.contains("self.clock.raw()"),
288 "regression: `self.clock.raw()` must be inside the CAS \
289 loop in TimestampGenerator::next. Hoisted outside, a \
290 retry under contention reuses the stale TSC and the \
291 returned timestamp drifts behind real time."
292 );
293 assert!(
294 loop_body.contains("delta_as_nanos"),
295 "regression: `delta_as_nanos` must be inside the CAS \
296 loop alongside the TSC read."
297 );
298 }
299
300 #[test]
301 fn test_monotonicity_concurrent() {
302 let ts_gen = std::sync::Arc::new(TimestampGenerator::new());
303 let mut handles = vec![];
304
305 for _ in 0..4 {
306 let ts_gen_clone = ts_gen.clone();
307 handles.push(thread::spawn(move || {
308 let mut timestamps = Vec::with_capacity(1000);
309 for _ in 0..1000 {
310 timestamps.push(ts_gen_clone.next());
311 }
312 timestamps
313 }));
314 }
315
316 let mut all_timestamps: Vec<u64> = handles
317 .into_iter()
318 .flat_map(|h| h.join().unwrap())
319 .collect();
320
321 // All timestamps should be unique (strictly monotonic)
322 all_timestamps.sort();
323 let unique_count = all_timestamps.windows(2).filter(|w| w[0] != w[1]).count() + 1;
324 assert_eq!(
325 unique_count,
326 all_timestamps.len(),
327 "all timestamps must be unique"
328 );
329 }
330
331 #[test]
332 fn test_no_syscall_performance() {
333 let ts_gen = TimestampGenerator::new();
334
335 // Warm up
336 for _ in 0..1000 {
337 let _ = ts_gen.next();
338 }
339
340 // Measure
341 let start = std::time::Instant::now();
342 let iterations = 100_000;
343
344 for _ in 0..iterations {
345 let _ = ts_gen.next();
346 }
347
348 let elapsed = start.elapsed();
349 let per_call = elapsed.as_nanos() / iterations as u128;
350
351 // Typically 5-20ns on bare metal, but CI runners can be slower.
352 assert!(
353 per_call < 500,
354 "timestamp generation too slow: {}ns per call",
355 per_call
356 );
357 }
358
359 #[test]
360 fn test_timestamp_generator_new() {
361 let ts_gen = TimestampGenerator::new();
362 // Initial last should be 0
363 assert_eq!(ts_gen.last(), 0);
364 }
365
366 #[test]
367 fn test_timestamp_generator_default() {
368 let ts_gen = TimestampGenerator::default();
369 assert_eq!(ts_gen.last(), 0);
370 }
371
372 #[test]
373 fn test_now_raw() {
374 let ts_gen = TimestampGenerator::new();
375 let raw1 = ts_gen.now_raw();
376 let raw2 = ts_gen.now_raw();
377 // Raw timestamps should be increasing (or at least not decreasing significantly)
378 assert!(raw2 >= raw1 || raw1 - raw2 < 1000); // Allow for some jitter
379 }
380
381 #[test]
382 fn test_raw_to_nanos() {
383 let ts_gen = TimestampGenerator::new();
384 let raw = ts_gen.now_raw();
385 let nanos = ts_gen.raw_to_nanos(raw);
386 // Nanos should be a reasonable value (not zero for a non-zero raw)
387 assert!(nanos > 0);
388 }
389
390 #[test]
391 fn test_raw_to_nanos_zero() {
392 let ts_gen = TimestampGenerator::new();
393 let nanos = ts_gen.raw_to_nanos(0);
394 assert_eq!(nanos, 0);
395 }
396
397 #[test]
398 fn test_last_after_next() {
399 let ts_gen = TimestampGenerator::new();
400 let ts1 = ts_gen.next();
401 assert_eq!(ts_gen.last(), ts1);
402
403 let ts2 = ts_gen.next();
404 assert_eq!(ts_gen.last(), ts2);
405 assert!(ts2 > ts1);
406 }
407
408 #[test]
409 fn test_next_returns_increasing_values() {
410 let ts_gen = TimestampGenerator::new();
411 let mut prev = ts_gen.next();
412
413 for _ in 0..100 {
414 let current = ts_gen.next();
415 assert!(current > prev);
416 prev = current;
417 }
418 }
419
420 #[test]
421 fn test_multiple_generators_independent() {
422 let ts_gen1 = TimestampGenerator::new();
423 let ts_gen2 = TimestampGenerator::new();
424
425 let ts1 = ts_gen1.next();
426 let ts2 = ts_gen2.next();
427
428 // Both should have advanced
429 assert!(ts1 > 0);
430 assert!(ts2 > 0);
431
432 // They are independent, so last values are different
433 assert_eq!(ts_gen1.last(), ts1);
434 assert_eq!(ts_gen2.last(), ts2);
435 }
436
437 #[test]
438 fn test_now_raw_does_not_affect_last() {
439 let ts_gen = TimestampGenerator::new();
440 let initial_last = ts_gen.last();
441
442 // Call now_raw multiple times
443 let _ = ts_gen.now_raw();
444 let _ = ts_gen.now_raw();
445 let _ = ts_gen.now_raw();
446
447 // last should not have changed
448 assert_eq!(ts_gen.last(), initial_last);
449 }
450
451 #[test]
452 fn test_rapid_calls() {
453 let ts_gen = TimestampGenerator::new();
454 let mut timestamps = Vec::with_capacity(10000);
455
456 for _ in 0..10000 {
457 timestamps.push(ts_gen.next());
458 }
459
460 // All should be unique and strictly increasing
461 for window in timestamps.windows(2) {
462 assert!(window[1] > window[0]);
463 }
464 }
465
466 // Regression: saturating_add(1) at u64::MAX used to silently return
467 // the same timestamp twice, breaking strict monotonicity (BUGS_3 #6).
468 #[test]
469 #[should_panic(expected = "timestamp space exhausted")]
470 fn test_next_panics_at_u64_max() {
471 let ts_gen = TimestampGenerator::new();
472 // Force last to u64::MAX
473 ts_gen.last.store(u64::MAX, Ordering::Release);
474 let _ = ts_gen.next();
475 }
476
477 #[test]
478 fn test_send_sync() {
479 fn assert_send_sync<T: Send + Sync>() {}
480 assert_send_sync::<TimestampGenerator>();
481 }
482
483 /// Regression: BUG_REPORT.md #14 — `next()` previously returned
484 /// raw TSC ticks (~3.5× wall-clock-ns on a 3.5 GHz core) while
485 /// claiming nanoseconds. Pin the unit by sleeping for a known
486 /// amount of wall-clock time and asserting the delta is roughly
487 /// nanoseconds, not TSC ticks.
488 #[test]
489 fn next_returns_nanoseconds_not_raw_ticks() {
490 let ts_gen = TimestampGenerator::new();
491 let t0 = ts_gen.next();
492 std::thread::sleep(std::time::Duration::from_millis(50));
493 let t1 = ts_gen.next();
494
495 let delta = t1 - t0;
496 // 50ms == 50_000_000 ns. Allow ±50% slack for sleep
497 // imprecision, scheduler jitter, and CI runners.
498 let ns_lo = 25_000_000u64;
499 let ns_hi = 200_000_000u64;
500 assert!(
501 delta >= ns_lo && delta <= ns_hi,
502 "delta over a 50ms sleep was {delta} — outside the {ns_lo}..={ns_hi} \
503 ns window. Most likely the timestamp is in raw TSC ticks again \
504 (would be ~150_000_000 on a 3 GHz core)."
505 );
506 }
507
508 /// A fresh generator's first `next()` value must be
509 /// small (close to "ns since this generator was created"),
510 /// not "ns since system uptime started" or some other
511 /// arbitrary baseline. Pre-fix the baseline was `0` against
512 /// the quanta::Clock's internal calibration, so on a system
513 /// that had been up for hours the first `next()` returned
514 /// many trillion nanoseconds.
515 ///
516 /// We assert the first `next()` is below ~10ms in nanoseconds,
517 /// which is plenty of slack for construction overhead but
518 /// nowhere near "ns since boot."
519 #[test]
520 fn next_first_call_is_close_to_zero() {
521 let ts_gen = TimestampGenerator::new();
522 let first = ts_gen.next();
523 let ten_ms_ns = 10_000_000u64;
524 assert!(
525 first < ten_ms_ns,
526 "first next() returned {first} ns; expected < {ten_ms_ns} ns. \
527 Pre-fix this would be ~uptime in ns."
528 );
529 }
530}