net/timestamp.rs
1//! High-precision timestamp generation with zero syscall overhead.
2//!
3//! This module provides monotonically increasing timestamps using the CPU's
4//! Time Stamp Counter (TSC) on x86_64, avoiding syscalls in the hot path.
5//!
6//! # Design
7//!
8//! - Uses `quanta` crate which calibrates against the system clock once at startup
9//! - Subsequent reads use RDTSC instruction directly (no syscall)
10//! - Each shard has its own `TimestampGenerator` to eliminate contention
11//! - Monotonicity is guaranteed via atomic CAS operations
12
13use std::sync::atomic::{AtomicU64, Ordering};
14
15/// High-precision timestamp generator using TSC.
16///
17/// This generator provides strictly monotonic timestamps with sub-nanosecond
18/// resolution and zero syscall overhead after initialization.
19///
20/// # Single-owner invariant
21///
22/// **Each producer should own a dedicated `TimestampGenerator`.** The
23/// type is `Send + Sync` and `next()` *is* safe to call concurrently —
24/// monotonicity is preserved by `compare_exchange_weak` — but the CAS
25/// loop degenerates into a spin under sustained contention. The whole
26/// design rests on the loop almost never iterating: that's only true
27/// when one writer at a time accesses the generator.
28///
29/// The codebase enforces this structurally rather than at runtime:
30///
31/// - `Shard` owns its `TimestampGenerator` by value (not behind `Arc`).
32/// - `TimestampGenerator` is **not** `Clone`, so duplicating one is a
33/// deliberate `mem::replace` / `Default::default()` away — visible
34/// in code review.
35/// - The shard's surrounding `Mutex<Shard>` serializes producers, so
36/// `next()` is invoked by exactly one caller at a time per shard.
37///
38/// If you find yourself reaching for `Arc<TimestampGenerator>`, stop
39/// — give each producer its own instance instead. Every additional
40/// concurrent caller is one more thread potentially CAS-spinning on
41/// `last`.
42pub struct TimestampGenerator {
43 /// quanta clock (TSC-based after calibration).
44 clock: quanta::Clock,
45 /// Raw TSC value sampled at construction. All `next()` calls
46 /// compute their nanosecond offset relative to this baseline,
47 /// so the returned values are "ns since this generator was
48 /// created" rather than the unspecified "ns since the
49 /// quanta::Clock's internal calibration".
50 ///
51 /// Pre-fix the next() call did
52 /// `clock.delta_as_nanos(0, raw)`. quanta's calibration is
53 /// per-Clock and the "0" baseline doesn't correspond to any
54 /// meaningful real-world time — the returned ns counts were
55 /// in the order of system uptime, not "since this generator".
56 /// Two generators created at different times produced
57 /// timestamps with different effective offsets even on the
58 /// same physical TSC, breaking any consumer reasoning about
59 /// "approximately when did this event happen relative to
60 /// generator-creation".
61 baseline_raw: u64,
62 /// Last generated timestamp (for monotonicity).
63 last: AtomicU64,
64}
65
66impl TimestampGenerator {
67 /// Create a new timestamp generator.
68 ///
69 /// This performs a one-time calibration against the system clock.
70 /// Subsequent timestamp reads use TSC directly.
71 pub fn new() -> Self {
72 let clock = quanta::Clock::new();
73 let baseline_raw = clock.raw();
74 Self {
75 clock,
76 baseline_raw,
77 last: AtomicU64::new(0),
78 }
79 }
80
81 /// Generate the next timestamp.
82 ///
83 /// Returns a strictly monotonically increasing value in
84 /// **nanoseconds since this generator was constructed**. This
85 /// operation is lock-free and does not invoke any syscalls.
86 ///
87 /// Previously returned the raw TSC tick count. The docstring
88 /// claimed nanoseconds, but on a 3.5 GHz core the value was ~3.5×
89 /// larger than ns-since-epoch, breaking any consumer that read
90 /// `insertion_ts` and tried to correlate it with wall-clock-derived
91 /// timestamps from elsewhere. Converting via `delta_as_nanos` here
92 /// costs ~1ns extra per call and gives consumers a unit they can
93 /// actually use.
94 ///
95 /// # Performance
96 ///
97 /// - Single-threaded: ~6-12ns per call
98 /// - Under contention: may loop due to CAS, but still lock-free
99 #[inline(always)]
100 pub fn next(&self) -> u64 {
101 // Ensure strict monotonicity via CAS loop. The TSC read
102 // happens INSIDE the loop so a CAS retry under
103 // contention re-samples real time. Pre-fix `raw` / `now`
104 // were captured once before the loop; if a contended
105 // retry took even a few microseconds (worst case under
106 // heavy contention) the returned timestamp was `last+1`
107 // — wall-clock-correct only as long as no thread won
108 // the CAS in the meantime. Under sustained contention
109 // the generator drifted arbitrarily far behind real
110 // time. Re-reading the TSC is cheap (~1ns, no syscall)
111 // and restores the wall-clock contract per call.
112 loop {
113 let raw = self.clock.raw();
114 let now = self.clock.delta_as_nanos(self.baseline_raw, raw);
115 let last = self.last.load(Ordering::Acquire);
116
117 // Guard against u64::MAX exhaustion: saturating_add(1) at MAX
118 // would return MAX again, breaking strict monotonicity.
119 //
120 // Pre-fix, at `last == u64::MAX - 1` we'd return
121 // `u64::MAX` (via `.max()` clamp) and the NEXT call
122 // would panic on `checked_add(1)`. That gap leaves the
123 // generator briefly stalled at MAX before failure —
124 // not a monotonicity violation, but the caller sees a
125 // "value progression" that's actually clamped. Panic
126 // preemptively when the result would be u64::MAX so
127 // the failure mode is one consistent panic, not
128 // "return MAX once then panic on retry."
129 let next = match last.checked_add(1) {
130 Some(inc) => inc,
131 None => panic!("TimestampGenerator: timestamp space exhausted (u64::MAX)"),
132 };
133 let ts = now.max(next);
134 if ts == u64::MAX {
135 panic!(
136 "TimestampGenerator: timestamp space exhausted (would return u64::MAX); \
137 last={last}, now={now}",
138 );
139 }
140
141 match self
142 .last
143 .compare_exchange_weak(last, ts, Ordering::Release, Ordering::Relaxed)
144 {
145 Ok(_) => return ts,
146 Err(_) => {
147 // Another thread updated; retry
148 std::hint::spin_loop();
149 }
150 }
151 }
152 }
153
154 /// Get the current raw timestamp without incrementing.
155 ///
156 /// This does NOT guarantee monotonicity and is only useful for
157 /// measuring elapsed time or debugging.
158 #[inline(always)]
159 pub fn now_raw(&self) -> u64 {
160 self.clock.raw()
161 }
162
163 /// Convert a raw timestamp to nanoseconds since this
164 /// generator was constructed (i.e. since `baseline_raw`).
165 /// Output units match `next()`: the value returned by
166 /// `raw_to_nanos(self.now_raw())` is comparable to
167 /// recently-`next()`-returned timestamps from the same
168 /// generator (modulo the monotonicity floor `next()` enforces).
169 ///
170 /// Note: NOT "nanoseconds since UNIX epoch". The reference
171 /// point is the per-generator construction moment, so two
172 /// generators created at different times produce values with
173 /// different offsets. For wall-clock-anchored debugging,
174 /// combine with `SystemTime::now()` at generator-construction
175 /// time (recorded externally).
176 ///
177 /// Pre-fix this called `delta_as_nanos(0, raw)`, where the
178 /// `0` baseline was an unspecified quanta-internal reference
179 /// (typically system boot under Windows QPC or the clock's
180 /// first-call moment elsewhere). The returned ns values
181 /// were in the order of system uptime — not comparable to
182 /// `next()` output, despite the function's previous "ns
183 /// since epoch" doc-claim. Aligning both to `baseline_raw`
184 /// makes the surface consistent.
185 #[inline]
186 pub fn raw_to_nanos(&self, raw: u64) -> u64 {
187 self.clock.delta_as_nanos(self.baseline_raw, raw)
188 }
189
190 /// Get the last generated timestamp.
191 #[inline]
192 pub fn last(&self) -> u64 {
193 self.last.load(Ordering::Acquire)
194 }
195}
196
197impl Default for TimestampGenerator {
198 fn default() -> Self {
199 Self::new()
200 }
201}
202
203#[cfg(test)]
204mod tests {
205 use super::*;
206 use std::thread;
207
208 #[test]
209 fn test_monotonicity() {
210 let ts_gen = TimestampGenerator::new();
211 let mut prev = 0u64;
212
213 for _ in 0..10_000 {
214 let ts = ts_gen.next();
215 assert!(ts > prev, "timestamps must be strictly increasing");
216 prev = ts;
217 }
218 }
219
220 /// Source pin: `TimestampGenerator::next` must re-read the
221 /// TSC inside the CAS loop. Pre-fix `let raw = self.clock
222 /// .raw()` and `let now = ...` were captured ONCE before
223 /// the loop; under sustained contention a CAS retry reused
224 /// the stale `now` and the returned timestamp was `last+1`
225 /// rather than wall-clock-now, drifting arbitrarily far
226 /// behind real time. The TSC read is ~1ns and is inside
227 /// the loop now.
228 ///
229 /// We can't reliably reproduce the drift in a unit test
230 /// (the worst case requires sustained heavy contention
231 /// over enough retries that real-time advances measurably
232 /// past the stale capture). The source pin catches a
233 /// "simplification" PR that hoists the read back outside.
234 #[test]
235 fn timestamp_next_reads_tsc_inside_cas_loop() {
236 let src = include_str!("timestamp.rs");
237
238 // Locate the body of `pub fn next(&self) -> u64`.
239 let header = "pub fn next(&self) -> u64 {";
240 let start = src
241 .find(header)
242 .expect("TimestampGenerator::next must exist");
243 // The body ends at the next `\n fn ` or `\n pub fn ` at
244 // the impl indent.
245 let after_header = start + header.len();
246 let next_fn = src[after_header..]
247 .find("\n pub fn ")
248 .or_else(|| src[after_header..].find("\n fn "))
249 .expect("a sibling fn must follow next()")
250 + after_header;
251 let body = &src[start..next_fn];
252
253 // Strip line comments so the doc-comment that *describes*
254 // the rejected pattern doesn't trip the negative
255 // assertions.
256 let body_no_comments: String = body
257 .lines()
258 .map(|l| match l.find("//") {
259 Some(idx) => &l[..idx],
260 None => l,
261 })
262 .collect::<Vec<_>>()
263 .join("\n");
264
265 // Find the `loop {` opening — the CAS loop body must
266 // contain BOTH the TSC read and the delta_as_nanos
267 // call.
268 let loop_idx = body_no_comments
269 .find("loop {")
270 .expect("CAS loop must exist in next()");
271 let loop_body = &body_no_comments[loop_idx..];
272
273 assert!(
274 loop_body.contains("self.clock.raw()"),
275 "regression: `self.clock.raw()` must be inside the CAS \
276 loop in TimestampGenerator::next. Hoisted outside, a \
277 retry under contention reuses the stale TSC and the \
278 returned timestamp drifts behind real time."
279 );
280 assert!(
281 loop_body.contains("delta_as_nanos"),
282 "regression: `delta_as_nanos` must be inside the CAS \
283 loop alongside the TSC read."
284 );
285 }
286
287 #[test]
288 fn test_monotonicity_concurrent() {
289 let ts_gen = std::sync::Arc::new(TimestampGenerator::new());
290 let mut handles = vec![];
291
292 for _ in 0..4 {
293 let ts_gen_clone = ts_gen.clone();
294 handles.push(thread::spawn(move || {
295 let mut timestamps = Vec::with_capacity(1000);
296 for _ in 0..1000 {
297 timestamps.push(ts_gen_clone.next());
298 }
299 timestamps
300 }));
301 }
302
303 let mut all_timestamps: Vec<u64> = handles
304 .into_iter()
305 .flat_map(|h| h.join().unwrap())
306 .collect();
307
308 // All timestamps should be unique (strictly monotonic)
309 all_timestamps.sort();
310 let unique_count = all_timestamps.windows(2).filter(|w| w[0] != w[1]).count() + 1;
311 assert_eq!(
312 unique_count,
313 all_timestamps.len(),
314 "all timestamps must be unique"
315 );
316 }
317
318 #[test]
319 fn test_no_syscall_performance() {
320 let ts_gen = TimestampGenerator::new();
321
322 // Warm up
323 for _ in 0..1000 {
324 let _ = ts_gen.next();
325 }
326
327 // Measure
328 let start = std::time::Instant::now();
329 let iterations = 100_000;
330
331 for _ in 0..iterations {
332 let _ = ts_gen.next();
333 }
334
335 let elapsed = start.elapsed();
336 let per_call = elapsed.as_nanos() / iterations as u128;
337
338 // Typically 5-20ns on bare metal, but CI runners can be slower.
339 assert!(
340 per_call < 500,
341 "timestamp generation too slow: {}ns per call",
342 per_call
343 );
344 }
345
346 #[test]
347 fn test_timestamp_generator_new() {
348 let ts_gen = TimestampGenerator::new();
349 // Initial last should be 0
350 assert_eq!(ts_gen.last(), 0);
351 }
352
353 #[test]
354 fn test_timestamp_generator_default() {
355 let ts_gen = TimestampGenerator::default();
356 assert_eq!(ts_gen.last(), 0);
357 }
358
359 #[test]
360 fn test_now_raw() {
361 let ts_gen = TimestampGenerator::new();
362 let raw1 = ts_gen.now_raw();
363 let raw2 = ts_gen.now_raw();
364 // Raw timestamps should be increasing (or at least not decreasing significantly)
365 assert!(raw2 >= raw1 || raw1 - raw2 < 1000); // Allow for some jitter
366 }
367
368 #[test]
369 fn test_raw_to_nanos() {
370 let ts_gen = TimestampGenerator::new();
371 let raw = ts_gen.now_raw();
372 let nanos = ts_gen.raw_to_nanos(raw);
373 // Nanos should be a reasonable value (not zero for a non-zero raw)
374 assert!(nanos > 0);
375 }
376
377 #[test]
378 fn test_raw_to_nanos_zero() {
379 let ts_gen = TimestampGenerator::new();
380 let nanos = ts_gen.raw_to_nanos(0);
381 assert_eq!(nanos, 0);
382 }
383
384 #[test]
385 fn test_last_after_next() {
386 let ts_gen = TimestampGenerator::new();
387 let ts1 = ts_gen.next();
388 assert_eq!(ts_gen.last(), ts1);
389
390 let ts2 = ts_gen.next();
391 assert_eq!(ts_gen.last(), ts2);
392 assert!(ts2 > ts1);
393 }
394
395 #[test]
396 fn test_next_returns_increasing_values() {
397 let ts_gen = TimestampGenerator::new();
398 let mut prev = ts_gen.next();
399
400 for _ in 0..100 {
401 let current = ts_gen.next();
402 assert!(current > prev);
403 prev = current;
404 }
405 }
406
407 #[test]
408 fn test_multiple_generators_independent() {
409 let ts_gen1 = TimestampGenerator::new();
410 let ts_gen2 = TimestampGenerator::new();
411
412 let ts1 = ts_gen1.next();
413 let ts2 = ts_gen2.next();
414
415 // Both should have advanced
416 assert!(ts1 > 0);
417 assert!(ts2 > 0);
418
419 // They are independent, so last values are different
420 assert_eq!(ts_gen1.last(), ts1);
421 assert_eq!(ts_gen2.last(), ts2);
422 }
423
424 #[test]
425 fn test_now_raw_does_not_affect_last() {
426 let ts_gen = TimestampGenerator::new();
427 let initial_last = ts_gen.last();
428
429 // Call now_raw multiple times
430 let _ = ts_gen.now_raw();
431 let _ = ts_gen.now_raw();
432 let _ = ts_gen.now_raw();
433
434 // last should not have changed
435 assert_eq!(ts_gen.last(), initial_last);
436 }
437
438 #[test]
439 fn test_rapid_calls() {
440 let ts_gen = TimestampGenerator::new();
441 let mut timestamps = Vec::with_capacity(10000);
442
443 for _ in 0..10000 {
444 timestamps.push(ts_gen.next());
445 }
446
447 // All should be unique and strictly increasing
448 for window in timestamps.windows(2) {
449 assert!(window[1] > window[0]);
450 }
451 }
452
453 // Regression: saturating_add(1) at u64::MAX used to silently return
454 // the same timestamp twice, breaking strict monotonicity (BUGS_3 #6).
455 #[test]
456 #[should_panic(expected = "timestamp space exhausted")]
457 fn test_next_panics_at_u64_max() {
458 let ts_gen = TimestampGenerator::new();
459 // Force last to u64::MAX
460 ts_gen.last.store(u64::MAX, Ordering::Release);
461 let _ = ts_gen.next();
462 }
463
464 #[test]
465 fn test_send_sync() {
466 fn assert_send_sync<T: Send + Sync>() {}
467 assert_send_sync::<TimestampGenerator>();
468 }
469
470 /// Regression: BUG_REPORT.md #14 — `next()` previously returned
471 /// raw TSC ticks (~3.5× wall-clock-ns on a 3.5 GHz core) while
472 /// claiming nanoseconds. Pin the unit by sleeping for a known
473 /// amount of wall-clock time and asserting the delta is roughly
474 /// nanoseconds, not TSC ticks.
475 #[test]
476 fn next_returns_nanoseconds_not_raw_ticks() {
477 let ts_gen = TimestampGenerator::new();
478 let t0 = ts_gen.next();
479 std::thread::sleep(std::time::Duration::from_millis(50));
480 let t1 = ts_gen.next();
481
482 let delta = t1 - t0;
483 // 50ms == 50_000_000 ns. Allow ±50% slack for sleep
484 // imprecision, scheduler jitter, and CI runners.
485 let ns_lo = 25_000_000u64;
486 let ns_hi = 200_000_000u64;
487 assert!(
488 delta >= ns_lo && delta <= ns_hi,
489 "delta over a 50ms sleep was {delta} — outside the {ns_lo}..={ns_hi} \
490 ns window. Most likely the timestamp is in raw TSC ticks again \
491 (would be ~150_000_000 on a 3 GHz core)."
492 );
493 }
494
495 /// A fresh generator's first `next()` value must be
496 /// small (close to "ns since this generator was created"),
497 /// not "ns since system uptime started" or some other
498 /// arbitrary baseline. Pre-fix the baseline was `0` against
499 /// the quanta::Clock's internal calibration, so on a system
500 /// that had been up for hours the first `next()` returned
501 /// many trillion nanoseconds.
502 ///
503 /// We assert the first `next()` is below ~10ms in nanoseconds,
504 /// which is plenty of slack for construction overhead but
505 /// nowhere near "ns since boot."
506 #[test]
507 fn next_first_call_is_close_to_zero() {
508 let ts_gen = TimestampGenerator::new();
509 let first = ts_gen.next();
510 let ten_ms_ns = 10_000_000u64;
511 assert!(
512 first < ten_ms_ns,
513 "first next() returned {first} ns; expected < {ten_ms_ns} ns. \
514 Pre-fix this would be ~uptime in ns."
515 );
516 }
517}