net-mesh 0.23.0

High-performance, schema-agnostic, backend-agnostic event bus
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
//! Substrate-level cancel-token registry for the nRPC call shapes.
//!
//! Promotes the cancel-token pattern that the napi binding owned
//! at `bindings/node/src/mesh_rpc.rs` to the SDK layer so all three
//! bindings (and any future direct-consumer of [`crate::adapter::net::MeshNode`])
//! get cancel semantics through a single primitive.
//!
//! # Model
//!
//! Each in-flight call optionally carries a `cancel_token: u64`
//! reserved via [`crate::adapter::net::MeshNode::reserve_cancel_token`].
//! A caller signals cancellation from any thread by calling
//! [`crate::adapter::net::MeshNode::cancel`] with that token. The
//! in-flight call's await point observes the cancel via a
//! [`tokio::sync::Notify`] permit and short-circuits to
//! [`crate::adapter::net::mesh_rpc::RpcError::Cancelled`].
//!
//! Drop-on-cancel emits CANCEL on the wire via the existing
//! [`crate::adapter::net::mesh_rpc`]-side guards (UnaryCallGuard,
//! ClientStreamCallRaw::Drop, DuplexCallRaw::Drop).
//!
//! # Race fixes
//!
//! Two races the napi binding's local registry already pinned:
//!
//! 1. **Cancel-before-register.** A call's `cancel(token)` can land
//!    BEFORE the call's `register_cancel_notify(token)` runs (the
//!    gap between caller-side token reservation and the in-flight
//!    call reaching its `select!`). The registry latches a
//!    `pre_cancelled = true` flag on the orphan entry; the
//!    subsequent register observes the flag and the returned
//!    [`Notify`] is pre-armed via [`Notify::notify_one`] so the
//!    first `notified().await` returns immediately. Matches the
//!    napi binding's CR-13 fix.
//!
//! 2. **Orphaned cancel-only entries.** A pathological caller that
//!    reserves a token, calls cancel, and then never issues the
//!    paired call would leak an entry in the registry forever. An
//!    opportunistic GC, attempted from both `cancel(token)` and
//!    `register_notify(token)` but run at most once per
//!    `GC_INTERVAL`, evicts orphan entries older than
//!    [`ORPHAN_TTL`] (120s, matching the Go FFI's Q18 fix). The
//!    sweep is a full scan of a single HashMap under the registry
//!    lock; the rate limit keeps that O(N) cost off the per-call
//!    path.

use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};

use parking_lot::Mutex;
use tokio::sync::Notify;

/// Lifetime of an orphaned registry entry — one created by a cancel
/// that arrived before any call registered for the token — before
/// the opportunistic GC is allowed to evict it.
///
/// Two minutes is generous enough that a legitimate `reserve → slow
/// dispatch → register` sequence still sees the latched cancel, yet
/// bounded so a caller that cancels tokens it never uses cannot
/// grow the registry without limit.
///
/// Kept equal to the Go FFI's Q18 fix value; see
/// `bindings/go/rpc-ffi/src/lib.rs::ORPHAN_TTL`.
pub const ORPHAN_TTL: Duration = Duration::from_secs(2 * 60);

/// Process-global counter for cancel tokens, advanced only by
/// [`next_token`]. Starts at 1 so `0` can be a "no token" sentinel
/// that [`crate::adapter::net::MeshNode::cancel`] ignores (the
/// registry's own `cancel`, `register_notify` and `release` all
/// short-circuit on `0` as well). Monotonically increasing, never
/// reused.
static NEXT_CANCEL_TOKEN: AtomicU64 = AtomicU64::new(1);

/// Process-wide [`Notify`] that is handed out by
/// [`CancelRegistry::register_notify`] whenever `token == 0`.
///
/// The instance is created lazily on first use and every later call
/// returns another clone of the same `Arc`, so the no-cancel path
/// (the overwhelmingly common one) costs a refcount bump instead of
/// a fresh `Notify` allocation per call.
///
/// Sharing is sound because nothing in this module ever calls
/// `notify_one` on it: its permit slot stays empty, so a
/// `notified().await` on it never completes and the call's
/// `tokio::select!` can only resolve through its other branch.
fn never_firing_notify() -> Arc<Notify> {
    static SHARED: OnceLock<Arc<Notify>> = OnceLock::new();
    let shared = SHARED.get_or_init(|| Arc::new(Notify::new()));
    Arc::clone(shared)
}

/// Minimum spacing between opportunistic GC sweeps of the registry.
///
/// A sweep is attempted from [`CancelRegistry::register_notify`]
/// (the hot path) and [`CancelRegistry::cancel`], and each sweep is
/// a full O(N) scan of the HashMap performed under the registry
/// lock. Allowing at most one sweep per second keeps orphan-TTL
/// eviction bounded without a burst of N calls paying O(N²) total
/// scan cost.
const GC_INTERVAL: Duration = Duration::from_millis(1_000);

/// Reserve a fresh cancel token. Process-global counter — every
/// [`crate::adapter::net::MeshNode`] in the process shares the same
/// sequence so a downstream consumer holding multiple meshes can
/// build a single cancel-routing layer without collision concerns.
///
/// Atomically post-increments [`NEXT_CANCEL_TOKEN`] and returns the
/// pre-increment value, so the first token handed out is `1` and
/// the `0` sentinel is never returned in practice.
pub(crate) fn next_token() -> u64 {
    // Relaxed is enough here: the counter is only used to hand out
    // distinct ids; the atomic read-modify-write already guarantees
    // no two callers observe the same value.
    NEXT_CANCEL_TOKEN.fetch_add(1, Ordering::Relaxed)
}

/// Per-token state: the cancel latch, the wake-up handle for the
/// in-flight call, and the timestamp consulted by orphan-TTL GC.
///
/// The derived [`Default`] is the blank entry (not cancelled, no
/// registered [`Notify`], no TTL stamp).
#[derive(Default)]
struct CancelEntry {
    /// Latched to `true` by every `cancel(token)`. `register_notify`
    /// reads it to decide whether the returned [`Notify`] must be
    /// pre-armed — the CR-13 cancel-before-register race.
    pre_cancelled: bool,
    /// Lazily created on first register. `None` means no call has
    /// registered for this token yet, i.e. the entry exists only
    /// because a cancel arrived first.
    notify: Option<Arc<Notify>>,
    /// Instant used for orphan-TTL GC. Stamped by `cancel` (first
    /// stamp wins) and reset to `None` by `register_notify`, since
    /// a registered entry has a live caller. GC only consults it
    /// for entries whose `notify` is `None`.
    marked_at: Option<Instant>,
}

impl CancelEntry {
    /// Blank entry, identical to [`CancelEntry::default`]. Kept as a
    /// named constructor so call sites can pass `CancelEntry::new`
    /// straight to `HashMap::entry(..).or_insert_with(..)`.
    fn new() -> Self {
        Self::default()
    }
}

/// Per-mesh cancel-token registry. Lives behind an Arc on
/// [`crate::adapter::net::MeshNode`].
///
/// All public surfaces ([`Self::reserve_token`], [`Self::cancel`],
/// [`Self::register_notify`], [`Self::release`]) are thread-safe
/// and cheap on the no-cancel path (no allocation when
/// `token == 0`).
pub struct CancelRegistry {
    /// Token map plus GC bookkeeping, guarded by a single
    /// `parking_lot` mutex. Note the naming: this field wraps a
    /// [`RegistryInner`], whose own `entries` field is the actual
    /// `HashMap` — hence the `self.entries.lock().entries` chains.
    entries: Mutex<RegistryInner>,
}

/// Everything [`CancelRegistry`] keeps behind its mutex: the token
/// map and the timestamp that rate-limits GC sweeps over it.
struct RegistryInner {
    /// Live and orphan entries keyed by cancel token. Tokens equal
    /// to `0` are never inserted — every registry method returns
    /// early on the sentinel before touching the map.
    entries: HashMap<u64, CancelEntry>,
    /// Last time [`CancelRegistry::gc`] swept the map. Rate-limits
    /// the sweep to once per [`GC_INTERVAL`] — the hot path callers
    /// (`register_notify`) check this before paying the O(N) scan.
    last_gc: Instant,
}

impl Default for CancelRegistry {
    fn default() -> Self {
        Self::new()
    }
}

impl CancelRegistry {
    /// Create an empty registry. The GC rate-limit clock starts at
    /// construction time, so the first opportunistic sweep happens
    /// no earlier than [`GC_INTERVAL`] after this call.
    pub fn new() -> Self {
        Self {
            entries: Mutex::new(RegistryInner {
                entries: HashMap::new(),
                last_gc: Instant::now(),
            }),
        }
    }

    /// Reserve a fresh cancel token. Wraps the process-global
    /// counter from [`next_token`] — exposed as a method so
    /// downstream code uses the `mesh.reserve_cancel_token()`
    /// surface uniformly. Does not touch the registry map; an entry
    /// only appears once [`Self::cancel`] or
    /// [`Self::register_notify`] is called with the token.
    pub fn reserve_token(&self) -> u64 {
        next_token()
    }

    /// Signal cancellation for `token`. Safe to call from any
    /// thread and safe to repeat. No-op when `token == 0` (the
    /// "no token" sentinel).
    ///
    /// If the matching call has registered its [`Notify`], this
    /// arms it via [`Notify::notify_one`] so the call's `select!`
    /// arm fires. If the matching call hasn't registered yet
    /// (cancel-before-register race), this latches
    /// `pre_cancelled = true` on an orphan entry; the eventual
    /// register observes the flag and arms the returned Notify
    /// immediately.
    ///
    /// Note that *any* non-zero token gets an entry — including one
    /// that was never reserved or whose call already released it.
    /// Such entries are orphans and are reclaimed by the
    /// [`ORPHAN_TTL`] GC rather than rejected up front.
    pub fn cancel(&self, token: u64) {
        if token == 0 {
            return;
        }
        let notify = {
            let mut inner = self.entries.lock();
            Self::maybe_gc(&mut inner);
            let entry = inner.entries.entry(token).or_insert_with(CancelEntry::new);
            entry.pre_cancelled = true;
            // Only orphans carry a TTL stamp. An entry with a
            // registered Notify has a live caller and is exempt
            // from GC, so stamping it would just contradict the
            // "cleared on register" invariant of `marked_at`.
            // First stamp wins so repeated cancels don't extend an
            // orphan's lifetime.
            if entry.notify.is_none() && entry.marked_at.is_none() {
                entry.marked_at = Some(Instant::now());
            }
            // Snapshot the Arc; notify_one runs outside the lock.
            entry.notify.clone()
        };
        if let Some(notify) = notify {
            notify.notify_one();
        }
    }

    /// Returns a [`Notify`] the in-flight call should `select!`
    /// against. If the token has already been cancelled (the
    /// CR-13 race), the returned Notify is pre-armed so the first
    /// `notified().await` returns immediately.
    ///
    /// Registering the same token more than once hands back clones
    /// of the same `Arc<Notify>`.
    ///
    /// `token == 0` short-circuits to a shared never-firing Notify
    /// (one Arc clone, no allocation) so call shapes can write
    /// `select! { _ = notify.notified() => ... }` unconditionally
    /// without branching on `Option<Notify>` and without paying an
    /// allocation per call on the no-cancel hot path.
    pub fn register_notify(&self, token: u64) -> Arc<Notify> {
        if token == 0 {
            return never_firing_notify();
        }
        // Snapshot under the lock; notify_one runs outside.
        let (notify, was_precancelled) = {
            let mut inner = self.entries.lock();
            Self::maybe_gc(&mut inner);
            let entry = inner.entries.entry(token).or_insert_with(CancelEntry::new);
            let notify = entry
                .notify
                .get_or_insert_with(|| Arc::new(Notify::new()))
                .clone();
            let was_precancelled = entry.pre_cancelled;
            // Live caller now exists; the orphan-TTL stamp no longer applies.
            entry.marked_at = None;
            (notify, was_precancelled)
        };
        if was_precancelled {
            // CR-13: cancel arrived first; pre-arm the permit.
            // Outside the lock — avoids holding the mutex across
            // tokio internals.
            notify.notify_one();
        }
        notify
    }

    /// Remove a token's entry from the registry. Called by the
    /// in-flight call shape once it has resolved (success, error,
    /// or terminal cancel) so the registry doesn't grow
    /// unboundedly across long-running consumers.
    ///
    /// Idempotent — repeated calls on the same token are no-ops.
    /// `token == 0` is also a no-op.
    pub fn release(&self, token: u64) {
        if token == 0 {
            return;
        }
        self.entries.lock().entries.remove(&token);
    }

    /// Rate-limited wrapper around [`Self::gc`]. Skips the O(N)
    /// scan if it ran within the last [`GC_INTERVAL`] — the hot
    /// path (`register_notify`) calls this on every register, so
    /// without rate-limiting a burst of N concurrent calls would
    /// pay O(N²) total scan cost.
    ///
    /// Must be called with the registry lock held (enforced by
    /// taking `&mut RegistryInner`).
    fn maybe_gc(inner: &mut RegistryInner) {
        let now = Instant::now();
        if now.duration_since(inner.last_gc) < GC_INTERVAL {
            return;
        }
        inner.last_gc = now;
        // Reuse the timestamp we already have instead of reading
        // the clock a second time while holding the lock.
        Self::gc(&mut inner.entries, now);
    }

    /// Opportunistic eviction of orphan entries (cancel-only, no
    /// live caller) whose TTL stamp is at least [`ORPHAN_TTL`]
    /// older than `now`. Entries with a registered [`Notify`] are
    /// kept regardless of age — they represent a live caller
    /// awaiting (or about to await) the cancel signal. Entries
    /// with no stamp at all are kept as well.
    fn gc(entries: &mut HashMap<u64, CancelEntry>, now: Instant) {
        entries.retain(|_, entry| {
            if entry.notify.is_some() {
                return true;
            }
            match entry.marked_at {
                Some(stamped) => now.duration_since(stamped) < ORPHAN_TTL,
                None => true,
            }
        });
    }

    /// Number of entries currently tracked. Diagnostic; not on
    /// any hot path. Includes both registered-and-live entries
    /// and orphan cancel-only entries that haven't aged out yet.
    pub fn len(&self) -> usize {
        self.entries.lock().entries.len()
    }

    /// `true` when no entries are tracked at all. Diagnostic
    /// companion to [`Self::len`]; takes the registry lock.
    pub fn is_empty(&self) -> bool {
        self.entries.lock().entries.is_empty()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `Instant::now()` moved `age` into the past, or `None` when
    /// the platform's monotonic clock cannot represent an instant
    /// that far back (e.g. a freshly booted machine on targets
    /// whose clock starts near zero). `Instant - Duration` panics
    /// in that situation, so tests that back-date entries go
    /// through this helper and skip instead of aborting.
    fn instant_ago(age: Duration) -> Option<Instant> {
        Instant::now().checked_sub(age)
    }

    #[test]
    fn cancel_zero_token_is_noop() {
        let reg = CancelRegistry::new();
        reg.cancel(0);
        assert_eq!(reg.len(), 0, "cancel(0) must not create an entry");
    }

    #[test]
    fn register_zero_token_returns_never_firing_notify() {
        let reg = CancelRegistry::new();
        let notify = reg.register_notify(0);
        // No permit pre-loaded, no entry inserted.
        assert_eq!(reg.len(), 0);
        // `notify` is the shared never-firing Notify; we don't poll
        // it (no permits) but its existence lets call shapes
        // select! against it unconditionally.
        let _ = notify;
    }

    #[test]
    fn release_zero_token_is_noop() {
        let reg = CancelRegistry::new();
        reg.release(0);
        assert_eq!(reg.len(), 0);
    }

    #[tokio::test]
    async fn cancel_then_register_pre_arms_notify() {
        // The CR-13 race: a cancel that arrives BEFORE the call's
        // register call. The returned Notify must be pre-armed so
        // the first notified().await fires immediately.
        let reg = CancelRegistry::new();
        let token = reg.reserve_token();
        reg.cancel(token);
        let notify = reg.register_notify(token);
        // notified() returns immediately because notify_one was
        // already called before we arrived.
        tokio::time::timeout(Duration::from_millis(100), notify.notified())
            .await
            .expect("pre-armed Notify must fire immediately");
    }

    #[tokio::test]
    async fn register_then_cancel_wakes_waiter() {
        // The forward-direction: register first, then cancel.
        let reg = Arc::new(CancelRegistry::new());
        let token = reg.reserve_token();
        let notify = reg.register_notify(token);
        let canceller = Arc::clone(&reg);
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(10)).await;
            canceller.cancel(token);
        });
        tokio::time::timeout(Duration::from_millis(500), notify.notified())
            .await
            .expect("register-then-cancel must wake the waiter");
    }

    #[test]
    fn release_removes_entry() {
        let reg = CancelRegistry::new();
        let token = reg.reserve_token();
        let _notify = reg.register_notify(token);
        assert_eq!(reg.len(), 1);
        reg.release(token);
        assert_eq!(reg.len(), 0);
        // Idempotent.
        reg.release(token);
        assert_eq!(reg.len(), 0);
    }

    #[test]
    fn cancel_after_release_is_safe() {
        // Race: cancel arrives after the call has already
        // resolved + released. Must not panic or double-count.
        let reg = CancelRegistry::new();
        let token = reg.reserve_token();
        let _notify = reg.register_notify(token);
        reg.release(token);
        reg.cancel(token);
        // A new orphan entry was created by the post-release
        // cancel. That's fine — the orphan-TTL GC will evict it.
        // The contract is that `cancel` is safe to call at any
        // time, not that it's a no-op on stale tokens.
        assert!(reg.len() <= 1);
    }

    #[test]
    fn next_token_is_monotonic_and_nonzero() {
        let a = next_token();
        let b = next_token();
        let c = next_token();
        assert!(a >= 1, "tokens start at 1, not 0");
        assert!(b > a);
        assert!(c > b);
    }

    /// N1: `register_notify(0)` returns the same process-wide
    /// `Arc<Notify>` on every call — no per-call allocation. The
    /// returned Arc's strong-count grows with each clone instead
    /// of starting fresh from 1. Pinned because the hot path's
    /// allocator pressure is the whole motivation for the cache.
    #[test]
    fn zero_token_returns_shared_never_firing_notify() {
        let reg = CancelRegistry::new();
        let a = reg.register_notify(0);
        let b = reg.register_notify(0);
        assert!(
            Arc::ptr_eq(&a, &b),
            "both no-cancel registrations must hand back the same Arc<Notify>"
        );
        // The shared Arc is also the static one, so a third clone
        // from never_firing_notify() matches.
        let c = never_firing_notify();
        assert!(Arc::ptr_eq(&a, &c));
        // Zero-token registrations don't create registry entries.
        assert_eq!(reg.len(), 0);
    }

    /// N2: `maybe_gc` skips the O(N) scan if it ran within
    /// [`GC_INTERVAL`]. A burst of register_notify calls touches
    /// `last_gc` exactly once, regardless of N. Pinned because
    /// the per-call quadratic burst was the original bug.
    #[test]
    fn gc_rate_limited_across_burst() {
        let Some(aged_out) = instant_ago(ORPHAN_TTL * 2) else {
            eprintln!("skipping: monotonic clock too young to back-date an orphan");
            return;
        };
        let reg = CancelRegistry::new();
        // Stamp `last_gc` to "just now" so any burst inside the
        // GC window must short-circuit.
        {
            let mut inner = reg.entries.lock();
            inner.last_gc = Instant::now();
        }
        // Manually insert an orphan entry that would normally be
        // collected by GC (ORPHAN_TTL = 120s; we stamp it as if it
        // had aged out). If GC fires on the next register call,
        // the entry vanishes.
        let stale = next_token();
        {
            let mut inner = reg.entries.lock();
            let entry = inner.entries.entry(stale).or_insert_with(CancelEntry::new);
            entry.pre_cancelled = true;
            entry.marked_at = Some(aged_out);
        }
        // Trigger register_notify on a fresh token — should NOT
        // evict the stale entry because GC is rate-limited.
        let _ = reg.register_notify(next_token());
        assert!(
            reg.entries.lock().entries.contains_key(&stale),
            "stale entry survives because gc is rate-limited"
        );
    }

    /// Counterpart to N2: once [`GC_INTERVAL`] has elapsed, the
    /// next `cancel` sweeps the map. Aged-out orphans are evicted;
    /// an entry with a registered Notify survives no matter how
    /// old its stamp is, because it has a live caller.
    #[test]
    fn gc_sweep_evicts_aged_orphans_but_keeps_registered() {
        let (Some(aged_out), Some(gc_due)) =
            (instant_ago(ORPHAN_TTL * 2), instant_ago(GC_INTERVAL * 2))
        else {
            eprintln!("skipping: monotonic clock too young to back-date entries");
            return;
        };
        let reg = CancelRegistry::new();
        let live = reg.reserve_token();
        let _notify = reg.register_notify(live);
        let orphan = reg.reserve_token();
        {
            let mut inner = reg.entries.lock();
            let entry = inner.entries.entry(orphan).or_insert_with(CancelEntry::new);
            entry.pre_cancelled = true;
            entry.marked_at = Some(aged_out);
            // Give the registered entry an equally old stamp to
            // prove it is the Notify, not the stamp, that protects it.
            inner
                .entries
                .get_mut(&live)
                .expect("entry was registered above")
                .marked_at = Some(aged_out);
            // Pretend the last sweep happened long enough ago that
            // the next maybe_gc is due.
            inner.last_gc = gc_due;
        }
        let fresh = reg.reserve_token();
        reg.cancel(fresh);
        let inner = reg.entries.lock();
        assert!(
            !inner.entries.contains_key(&orphan),
            "aged-out orphan must be evicted once the sweep is due"
        );
        assert!(
            inner.entries.contains_key(&live),
            "registered entry must survive GC regardless of age"
        );
        assert!(
            inner.entries.contains_key(&fresh),
            "the cancel that triggered the sweep still records its own orphan"
        );
    }

    /// N3: notify_one fires after the lock is released. Hard to
    /// observe directly without instrumenting parking_lot, but we
    /// can pin the contract by exercising the CR-13 pre-arm path
    /// (which goes through the same code) and asserting it still
    /// works.
    #[tokio::test]
    async fn pre_arm_works_with_lock_released_notify() {
        let reg = CancelRegistry::new();
        let token = reg.reserve_token();
        reg.cancel(token);
        let notify = reg.register_notify(token);
        tokio::time::timeout(Duration::from_millis(100), notify.notified())
            .await
            .expect("pre-armed Notify fires even with lock-narrowed register");
    }
}