net-mesh 0.21.0

High-performance, schema-agnostic, backend-agnostic event bus
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
//! The action executor — drains the
//! [`super::action::PendingAction`] queue the
//! [`super::event_loop::MeshOsLoop`] fills, runs each action
//! through the Phase G [`super::backpressure::BackpressureState::admit`]
//! gate, and dispatches to a pluggable
//! [`ActionDispatcher`].
//!
//! Locked decision #4 (action emission ≠ action execution): the
//! executor is a separate task, not inlined in reconcile.
//! Locked decision #10 (single backpressure layer): every
//! action passes through one admit; deferrals re-enter via a
//! per-executor `BinaryHeap` keyed by retry deadline; gates
//! drop with a structured failure record.
//!
//! Phase-A through G shipped the upstream side (event loop +
//! state + reconcile + gate); this module is the downstream
//! consumer. The dispatcher itself is pluggable — a
//! [`LoggingDispatcher`] ships for bootstrap / tests, and the
//! production path will wrap [`super::action::MeshOsAction`]
//! variants over the existing `DaemonRegistry`, migration
//! orchestrator, and `MeshNode::send_subprotocol` paths.

use std::cmp::Reverse;
use std::collections::{BinaryHeap, VecDeque};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

use futures::future::BoxFuture;
use futures::FutureExt;
use parking_lot::{Mutex, RwLock};
use tokio::sync::mpsc;
use tokio::time::sleep_until;

use super::action::{MeshOsAction, PendingAction};
use super::backpressure::{AdmissionResult, BackpressureState, ClusterBackpressureChange};
use super::chain::{
    append_dispatched, append_failed, append_gated, ActionChainAppender, AppendError,
    NoOpActionChainAppender,
};
use super::config::MeshOsConfig;
use super::snapshot::{FailureRecord, RECENT_FAILURES_CAPACITY};

/// Pluggable action sink. The executor calls `dispatch` once
/// per admitted action; the impl owns the substrate-side
/// wiring (daemon registry, migration orchestrator, MeshDB
/// admin commits, etc.).
///
/// Returns a [`BoxFuture`] so the trait stays dyn-compatible;
/// production dispatchers spawn substrate-side futures
/// themselves rather than blocking the executor task.
pub trait ActionDispatcher: Send + Sync + 'static {
    /// Dispatch an admitted action.
    ///
    /// # Errors
    ///
    /// Returns a [`DispatchError`] when the action could not be
    /// carried out. An error with `retry_after: None` is
    /// terminal: the executor counts it as failed and pushes it
    /// onto the recent-failures ring buffer. An error with
    /// `retry_after: Some(_)` parks the action on the executor's
    /// deferred heap and sends it back through admission once
    /// the hint elapses, subject to the `max_defer_count` budget.
    fn dispatch<'a>(&'a self, action: MeshOsAction) -> BoxFuture<'a, Result<(), DispatchError>>;

    /// Cluster-wide backpressure flag transitioned. The executor
    /// invokes this once per edge crossing — `Asserted` when the
    /// action-queue depth crosses the high-water mark, `Released`
    /// when it drops below the low-water mark. Production
    /// dispatchers fan `DaemonControl::BackpressureOn { level }` /
    /// `DaemonControl::BackpressureOff` out to supervised daemons
    /// so they can shed optional work. Default impl is a no-op —
    /// dispatchers that don't supervise daemons (e.g. the test
    /// logger) can ignore the hook.
    ///
    /// The executor calls this synchronously from its own task
    /// (it is not awaited), so implementations should return
    /// promptly.
    fn on_cluster_backpressure(&self, _change: ClusterBackpressureChange) {}
}

/// Failure reported by an [`ActionDispatcher`]. Pairs an
/// operator-readable reason with an optional retry hint; when a
/// hint is present the executor re-queues the action and sends
/// it back through `admit()` once the hint has elapsed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DispatchError {
    /// Operator-readable reason.
    pub reason: String,
    /// Retry hint. `Some(d)` asks the executor to re-enter the
    /// action through `admit()` after `d`; `None` drops the
    /// action, which is the usual case because admit is the
    /// retry surface.
    pub retry_after: Option<Duration>,
}

impl DispatchError {
    /// Build a terminal error: the action is dropped, not retried.
    pub fn drop(reason: impl Into<String>) -> Self {
        let reason: String = reason.into();
        Self {
            reason,
            retry_after: None,
        }
    }

    /// Build an error that asks for the action to be retried
    /// once `after` has elapsed.
    pub fn retry(reason: impl Into<String>, after: Duration) -> Self {
        let mut err = Self::drop(reason);
        err.retry_after = Some(after);
        err
    }
}

/// Logging-only dispatcher. Records every admitted action in an
/// internal `Mutex<Vec<MeshOsAction>>` and returns `Ok(())`,
/// unless a one-shot error has been injected via `fail_next`.
/// Useful for bootstrap (before real subsystem wiring lands) +
/// the executor's unit tests.
#[derive(Debug, Default)]
pub struct LoggingDispatcher {
    /// Actions whose dispatch succeeded, in call order.
    log: Mutex<Vec<MeshOsAction>>,
    /// One-shot injected error; taken (and thereby cleared) by
    /// the next `dispatch` call.
    fail_next: Mutex<Option<DispatchError>>,
    /// Every transition received through
    /// `on_cluster_backpressure`, in call order.
    backpressure_log: Mutex<Vec<ClusterBackpressureChange>>,
}

impl LoggingDispatcher {
    /// Construct an empty logger.
    pub fn new() -> Self {
        Self::default()
    }

    /// Snapshot of the actions dispatched so far.
    pub fn log(&self) -> Vec<MeshOsAction> {
        self.log.lock().clone()
    }

    /// Inject an error to surface on the next `dispatch` call.
    /// Used by tests to exercise the retry / drop paths.
    pub fn fail_next(&self, err: DispatchError) {
        *self.fail_next.lock() = Some(err);
    }

    /// Snapshot of the cluster-backpressure transitions the
    /// executor has surfaced through `on_cluster_backpressure`.
    pub fn backpressure_log(&self) -> Vec<ClusterBackpressureChange> {
        self.backpressure_log.lock().clone()
    }
}

impl ActionDispatcher for LoggingDispatcher {
    // Consumes an injected failure if one is armed; otherwise
    // appends the action to the log and reports success.
    fn dispatch<'a>(&'a self, action: MeshOsAction) -> BoxFuture<'a, Result<(), DispatchError>> {
        Box::pin(async move {
            // Take the injected error in its own statement so the
            // `fail_next` guard is released before the log is touched.
            let injected = self.fail_next.lock().take();
            match injected {
                Some(err) => Err(err),
                None => {
                    self.log.lock().push(action);
                    Ok(())
                }
            }
        })
    }

    // Remembers every transition so tests can assert on the
    // sequence of edges the executor surfaced.
    fn on_cluster_backpressure(&self, change: ClusterBackpressureChange) {
        let mut seen = self.backpressure_log.lock();
        seen.push(change);
    }
}

/// Diagnostic counters kept by the executor (also used for Deck
/// rendering). [`ActionExecutor::run`] hands them back when the
/// task exits; [`ExecutorHandle::stats`] samples them live.
#[derive(Debug, Default)]
pub struct ExecutorStats {
    /// Actions that were admitted and dispatched successfully.
    pub dispatched: AtomicU64,
    /// Actions that ended in a terminal failure: a dispatch error
    /// without a retry hint, or an action dropped because it
    /// exhausted the defer / dispatch-retry budget.
    pub failed: AtomicU64,
    /// Deferrals via `AdmissionResult::Defer`. Every re-admit
    /// that defers again ticks this counter, so a flapping
    /// action inflates it — watch the queue-depth gauge for
    /// health, not this number.
    pub deferred: AtomicU64,
    /// Actions hard-gated via `AdmissionResult::Gate`.
    pub gated: AtomicU64,
    /// Re-queues triggered by a dispatch error's `retry_after`
    /// hint.
    pub dispatch_retries: AtomicU64,
    /// Cluster-backpressure assert edges surfaced to the
    /// dispatcher.
    pub cluster_backpressure_asserts: AtomicU64,
    /// Cluster-backpressure release edges surfaced to the
    /// dispatcher.
    pub cluster_backpressure_releases: AtomicU64,
    /// Times an `append_dispatched` / `append_failed` /
    /// `append_gated` call returned an error. By the time this
    /// ticks, the action has already reached its outcome, so
    /// its effect is real and only the chain record is missing.
    /// A non-zero value means the chain appender is dropping
    /// records; the in-memory ring and dispatcher state remain
    /// consistent.
    pub chain_append_failures: AtomicU64,
}

impl ExecutorStats {
    /// Bump `counter` by one. Relaxed ordering: the counters are
    /// independent diagnostics and order nothing else.
    fn inc(counter: &AtomicU64) {
        let _previous = counter.fetch_add(1, Ordering::Relaxed);
    }
}

/// Internal heap entry for an action waiting to re-enter
/// admission. The `Ord` impl compares `Reverse<Instant>` so the
/// smallest retry deadline is popped first from the max-heap
/// `BinaryHeap`.
struct DeferredEntry {
    /// Earliest instant at which the action should be retried.
    retry_at: Instant,
    /// The parked action, handed back to the admission path
    /// when the deadline fires.
    action: PendingAction,
    /// Number of times this action has been deferred, counting
    /// both admit-side deferrals and dispatch-error retries.
    /// Capped by `max_defer_count` in the backpressure config;
    /// past the cap the executor drops the action with a
    /// structured failure record rather than keep it on the
    /// heap forever.
    defer_count: u32,
}

impl PartialEq for DeferredEntry {
    // Equality is defined through the ordering, so it considers
    // the retry deadline only — never the action or defer count.
    fn eq(&self, other: &Self) -> bool {
        matches!(Ord::cmp(self, other), std::cmp::Ordering::Equal)
    }
}
impl Eq for DeferredEntry {}
impl PartialOrd for DeferredEntry {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for DeferredEntry {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // `BinaryHeap` is a max-heap; reversing the deadlines
        // makes the earliest `retry_at` the greatest element.
        let mine = Reverse(self.retry_at);
        let theirs = Reverse(other.retry_at);
        mine.cmp(&theirs)
    }
}

/// The executor task body. Owns:
///
/// - the receiver side of the loop's action queue,
/// - a [`BackpressureState`] (Phase G),
/// - the dispatcher,
/// - the deferred-retry heap.
///
/// Construct via [`ActionExecutor::new`]; drive via
/// [`ActionExecutor::run`].
pub struct ActionExecutor<D: ActionDispatcher> {
    /// Receiving end of the action queue; `run` exits when it
    /// closes.
    actions_rx: mpsc::Receiver<PendingAction>,
    /// Shared configuration; the executor reads its
    /// `backpressure` section.
    config: Arc<MeshOsConfig>,
    /// Admission gate + cluster-backpressure hysteresis state.
    backpressure: BackpressureState,
    /// Sink that admitted actions are handed to.
    dispatcher: Arc<D>,
    /// Actions waiting for their retry deadline, earliest first.
    deferred: BinaryHeap<DeferredEntry>,
    /// Bounded ring of recent dispatch failures. Shared with the
    /// loop so the snapshot publish path can copy it into the
    /// `MeshOsSnapshot::recent_failures` field without going
    /// through the chain-fold path (which requires a real
    /// `ActionChainAppender` to be wired). Writer is the
    /// executor task; reader is the loop task on every publish.
    recent_failures: Arc<RwLock<VecDeque<FailureRecord>>>,
    /// Monotonic counter the executor stamps onto every
    /// `FailureRecord` it pushes. Same dedup primitive as the
    /// admin audit ring's seq — the Deck SDK's
    /// `subscribe_failures` stream uses it. Shared via
    /// `Arc<AtomicU64>` because the loop also records failures
    /// (e.g. migration-abort dispatch errors) and needs to
    /// stamp the same monotonic sequence — without the shared
    /// counter, loop-recorded failures would collide with
    /// executor-recorded ones at the SDK dedup gate.
    failure_seq: Arc<AtomicU64>,
    /// Failure chain appender. Production deployments wire a
    /// `TypedRedexFile<FailureRecord>` here so the failure
    /// ring's bounded history extends to cluster-lifetime
    /// replay. Default is the no-op appender returned by
    /// `failure_chain::no_op_arc()` — only the in-memory ring
    /// is observable when no chain is wired.
    failure_appender: Arc<dyn super::failure_chain::FailureChainAppender>,
    /// Diagnostic counters, shared with [`ExecutorHandle`] and
    /// returned by `run`.
    stats: Arc<ExecutorStats>,
    /// Optional action-chain appender. Each admit/dispatch
    /// outcome appends an [`super::chain::ActionChainRecord`].
    /// Defaults to [`NoOpActionChainAppender`] — a real
    /// appender wires only when a chain consumer is set up.
    chain_appender: Arc<dyn ActionChainAppender>,
}

impl<D: ActionDispatcher> ActionExecutor<D> {
    /// Build an executor. `actions_rx` is the loop's queue
    /// (returned by [`super::event_loop::MeshOsLoop::new`]).
    pub fn new(
        actions_rx: mpsc::Receiver<PendingAction>,
        config: Arc<MeshOsConfig>,
        dispatcher: Arc<D>,
    ) -> Self {
        Self {
            actions_rx,
            config,
            backpressure: BackpressureState::new(),
            dispatcher,
            deferred: BinaryHeap::new(),
            recent_failures: Arc::new(RwLock::new(VecDeque::with_capacity(
                RECENT_FAILURES_CAPACITY,
            ))),
            failure_seq: Arc::new(AtomicU64::new(0)),
            failure_appender: super::failure_chain::no_op_arc(),
            stats: Arc::new(ExecutorStats::default()),
            chain_appender: Arc::new(NoOpActionChainAppender),
        }
    }

    /// Builder: install a
    /// [`super::failure_chain::FailureChainAppender`]. Every
    /// failure the executor records is then written both to the
    /// in-memory ring (readable from snapshots) and to this
    /// appender (chain-backed history). If this is never
    /// called, the no-op default stays in place.
    pub fn with_failure_appender(
        self,
        appender: Arc<dyn super::failure_chain::FailureChainAppender>,
    ) -> Self {
        let mut executor = self;
        executor.failure_appender = appender;
        executor
    }

    /// Hand out another reference to the shared recent-failures
    /// ring. The runtime passes it to the loop so the snapshot
    /// publish path can copy the ring into
    /// [`super::snapshot::MeshOsSnapshot::recent_failures`]
    /// without depending on the chain-fold path.
    pub fn recent_failures_handle(&self) -> Arc<RwLock<VecDeque<FailureRecord>>> {
        let ring = &self.recent_failures;
        Arc::clone(ring)
    }

    /// Hand out another reference to the shared failure
    /// sequence counter. The loop records failures of its own
    /// (e.g. migration-abort dispatcher errors) and must stamp
    /// them from the same monotonic sequence, otherwise SDK
    /// consumers' dedup gate would see colliding `seq` values.
    pub fn failure_seq_handle(&self) -> Arc<AtomicU64> {
        let counter = &self.failure_seq;
        Arc::clone(counter)
    }

    /// Hand out another reference to the failure-chain
    /// appender. The loop dual-writes through it (via its
    /// internal `record_runtime_failure` helper) so the durable
    /// chain history also covers loop-side failures, not only
    /// executor-side dispatch failures.
    pub fn failure_appender_handle(&self) -> Arc<dyn super::failure_chain::FailureChainAppender> {
        let appender = &self.failure_appender;
        Arc::clone(appender)
    }

    /// Builder: replace the action-chain appender. The default
    /// `NoOpActionChainAppender` swallows every record; a real
    /// appender (for instance one writing to a RedEX chain
    /// consumed by `MeshOsSnapshotFold`) takes over the
    /// per-action recording.
    pub fn with_chain_appender(self, appender: Arc<dyn ActionChainAppender>) -> Self {
        let mut executor = self;
        executor.chain_appender = appender;
        executor
    }

    /// Handle for sampling the executor's live counters from
    /// outside the task. It carries only a clone of the stats
    /// `Arc`, so it is cheap to create and to clone. Useful for
    /// Phase F snapshot building.
    pub fn handle(&self) -> ExecutorHandle {
        let stats = Arc::clone(&self.stats);
        ExecutorHandle { stats }
    }

    /// Hand out another reference to the stats `Arc`. The
    /// [`super::runtime::MeshOsRuntime`] stitching layer keeps
    /// it so the counters stay reachable after `run()` has
    /// consumed `self`.
    pub fn stats_arc(&self) -> Arc<ExecutorStats> {
        let shared = &self.stats;
        Arc::clone(shared)
    }

    /// Drive the executor until the action receiver closes (the
    /// loop dropped its sender), then return the shared stats
    /// handle. Actions still parked on the deferred heap at that
    /// point are dropped together with the executor.
    ///
    /// Each iteration waits on three sources: a fresh action
    /// from the queue, the earliest deferred-retry deadline (if
    /// any entry is parked), and a 100 ms idle tick that
    /// re-evaluates cluster backpressure.
    ///
    /// A panic raised while polling a dispatch future is caught
    /// in the dispatch path and recorded as a failed action; it
    /// does not end this loop.
    #[expect(
        clippy::expect_used,
        reason = "tokio::select arm is gated on `next_deadline.is_some()` which means the prior `peek()` returned Some; pop() on the same heap must succeed"
    )]
    pub async fn run(mut self) -> Arc<ExecutorStats> {
        // Periodic idle tick. Apart from this tick, the
        // cluster-backpressure hysteresis is only re-evaluated
        // while an action is being handled, so a queue that
        // drains below the low-water mark while no fresh action
        // arrives would otherwise leave daemons throttled
        // forever in a quiet steady-state cluster. The tick polls
        // the live queue depth on every 100 ms boundary so the
        // release surfaces independently of incoming actions.
        let mut idle_tick = tokio::time::interval(Duration::from_millis(100));
        // Missed ticks follow tokio's `Delay` policy instead of
        // the default catch-up behavior.
        idle_tick.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        loop {
            // Only the earliest deadline is armed; it is
            // recomputed on every iteration because handling an
            // action may push or pop heap entries.
            let next_deadline = self.deferred.peek().map(|e| e.retry_at);
            tokio::select! {
                action = self.actions_rx.recv() => {
                    // `None` means the channel is closed: shut down.
                    let Some(action) = action else { break };
                    self.handle_one(action).await;
                }
                _ = sleep_until_opt(next_deadline), if next_deadline.is_some() => {
                    // Invariant: this arm is only enabled when the
                    // peek above returned Some, and nothing has
                    // touched the heap since, so pop() succeeds.
                    let due = self.deferred.pop().expect("deferred heap non-empty");
                    self.handle_one_retry(due.action, due.defer_count).await;
                }
                _ = idle_tick.tick() => {
                    self.poll_cluster_backpressure();
                }
            }
        }
        Arc::clone(&self.stats)
    }

    /// Feed the current live queue depth (channel backlog plus
    /// deferred heap) into the cluster-backpressure hysteresis
    /// without consuming an action, and surface any edge
    /// crossing to the dispatcher.
    ///
    /// Called from the idle tick and from the dispatch-failure
    /// path, so a `Released` edge can fire even when no new
    /// action arrives — otherwise a queue that drained below
    /// the low-water mark in a quiet cluster would leave daemons
    /// throttled indefinitely.
    fn poll_cluster_backpressure(&mut self) {
        let live_depth = self.actions_rx.len() + self.deferred.len();
        let edge = self
            .backpressure
            .update_cluster_backpressure(live_depth, &self.config.backpressure);
        // Pick the counter matching the edge; a steady state has
        // nothing to count or announce.
        let edge_counter = match edge {
            ClusterBackpressureChange::Asserted => &self.stats.cluster_backpressure_asserts,
            ClusterBackpressureChange::Released => &self.stats.cluster_backpressure_releases,
            ClusterBackpressureChange::Steady => return,
        };
        ExecutorStats::inc(edge_counter);
        self.dispatcher.on_cluster_backpressure(edge);
    }

    /// Handle an action freshly received from the queue: it
    /// enters admission with no deferrals on record.
    async fn handle_one(&mut self, action: PendingAction) {
        const NO_PRIOR_DEFERS: u32 = 0;
        self.handle_one_retry(action, NO_PRIOR_DEFERS).await;
    }

    /// Run one action through the admission gate and act on the
    /// verdict.
    ///
    /// `prior_defers` is how many times this action has already
    /// been parked on the deferred heap (0 for a fresh action).
    ///
    /// - `Admit`: dispatch immediately.
    /// - `Defer`: park the action until `now + retry_after`,
    ///   unless that would exceed `max_defer_count`, in which
    ///   case it is recorded as failed and dropped.
    /// - `Gate`: drop the action and record a structured failure
    ///   carrying the remaining cooldown.
    async fn handle_one_retry(&mut self, action: PendingAction, prior_defers: u32) {
        // Read the clock through tokio and convert to std so heap
        // deadlines agree with `tokio::time::sleep_until`. Under
        // `tokio::time::pause()` a plain `std::time::Instant::now()`
        // drifts away from the paused timer and tests could no
        // longer drive deferred retries. `sleep_until_opt` converts
        // back via `tokio::time::Instant::from_std()`.
        let now = tokio::time::Instant::now().into_std();
        self.backpressure.tick(now);

        // Live depth = channel backlog + deferred heap + the action
        // in hand. Edge crossings are surfaced so the dispatcher can
        // broadcast `DaemonControl::BackpressureOn`/`Off` to
        // supervised daemons.
        let live_depth = self.actions_rx.len() + self.deferred.len() + 1;
        let edge = self
            .backpressure
            .update_cluster_backpressure(live_depth, &self.config.backpressure);
        let edge_counter = match edge {
            ClusterBackpressureChange::Asserted => Some(&self.stats.cluster_backpressure_asserts),
            ClusterBackpressureChange::Released => Some(&self.stats.cluster_backpressure_releases),
            ClusterBackpressureChange::Steady => None,
        };
        if let Some(counter) = edge_counter {
            ExecutorStats::inc(counter);
            self.dispatcher.on_cluster_backpressure(edge);
        }

        let verdict =
            self.backpressure
                .admit(action.id, &action.action, now, &self.config.backpressure);
        match verdict {
            AdmissionResult::Admit => {
                self.dispatch_now_with_defer_count(action, now, prior_defers)
                    .await;
            }
            AdmissionResult::Defer { retry_after } => {
                let next_count = prior_defers.saturating_add(1);
                let budget = self.config.backpressure.max_defer_count;
                if next_count <= budget {
                    ExecutorStats::inc(&self.stats.deferred);
                    // An unrepresentable deadline falls back to `now`.
                    let retry_at = now.checked_add(retry_after).unwrap_or(now);
                    self.deferred.push(DeferredEntry {
                        retry_at,
                        action,
                        defer_count: next_count,
                    });
                } else {
                    // Budget exhausted: record the failure and drop
                    // the action instead of parking it again.
                    ExecutorStats::inc(&self.stats.failed);
                    let reason = format!(
                        "deferred {next_count} times — exceeds max_defer_count {}",
                        budget,
                    );
                    self.record_failure(format!("action-id:{}", action.id.0), reason.clone());
                    let outcome = append_failed(&self.chain_appender, &action, reason, None);
                    self.record_chain_append(action.id.0, "failed_defer_budget", outcome);
                }
            }
            AdmissionResult::Gate {
                cooldown_until,
                reason,
            } => {
                ExecutorStats::inc(&self.stats.gated);
                // Time left until the gate's cooldown expires.
                let remaining = cooldown_until.saturating_duration_since(now);
                let cooldown_ms = remaining.as_millis() as u64;
                let source = format!("action-id:{}", action.id.0);
                let detail = format!("gated ({reason}) for {cooldown_ms} ms");
                self.record_failure(source, detail);
                let outcome = append_gated(
                    &self.chain_appender,
                    &action,
                    reason.to_string(),
                    Some(cooldown_ms),
                );
                self.record_chain_append(action.id.0, "gated", outcome);
            }
        }
    }

    async fn dispatch_now_with_defer_count(
        &mut self,
        action: PendingAction,
        admit_anchor: Instant,
        prior_defers: u32,
    ) {
        // Wrap the dispatcher in `catch_unwind` so a panicking
        // future doesn't unwind the executor task. The trait is
        // pluggable + third-party-installed; trust-but-isolate.
        let dispatch_future = self.dispatcher.dispatch(action.action.clone());
        let result = match std::panic::AssertUnwindSafe(dispatch_future)
            .catch_unwind()
            .await
        {
            Ok(result) => result,
            Err(_) => {
                tracing::error!(
                    target: "meshos",
                    action_id = action.id.0,
                    "dispatcher panicked — recording as drop",
                );
                Err(DispatchError::drop("dispatcher panicked"))
            }
        };
        match result {
            Ok(()) => {
                ExecutorStats::inc(&self.stats.dispatched);
                let r = append_dispatched(&self.chain_appender, &action);
                self.record_chain_append(action.id.0, "dispatched", r);
            }
            Err(err) => {
                // Dispatch did not happen — roll back the
                // reservations admit installed against this
                // action's id so unrelated future actions aren't
                // gated by a side effect that never occurred.
                self.backpressure
                    .release_failed_admit(action.id, &action.action);
                // Refresh cluster-backpressure: the release just
                // dropped one in-flight action's reservation, so a
                // queue that was hovering near the release water
                // mark should surface that edge here rather than
                // wait for the next idle tick.
                self.poll_cluster_backpressure();
                let _ = admit_anchor;
                if let Some(after) = err.retry_after {
                    // Dispatch-error retries share the
                    // max_defer_count budget with admit-side
                    // defers — both occupy the same heap, both
                    // are "this action couldn't run, try later".
                    let next_count = prior_defers.saturating_add(1);
                    if next_count > self.config.backpressure.max_defer_count {
                        ExecutorStats::inc(&self.stats.failed);
                        let reason =
                            format!("dispatch retry budget exhausted after {next_count} attempts",);
                        self.record_failure(format!("action-id:{}", action.id.0), reason.clone());
                        let r = append_failed(&self.chain_appender, &action, reason, None);
                        self.record_chain_append(action.id.0, "failed_retry_budget", r);
                        return;
                    }
                    ExecutorStats::inc(&self.stats.dispatch_retries);
                    let retry_ms = after.as_millis() as u64;
                    let r = append_failed(
                        &self.chain_appender,
                        &action,
                        err.reason.clone(),
                        Some(retry_ms),
                    );
                    self.record_chain_append(action.id.0, "failed_retry", r);
                    // Same time-source rationale as handle_one_retry above.
                    let now = tokio::time::Instant::now().into_std();
                    self.deferred.push(DeferredEntry {
                        retry_at: now.checked_add(after).unwrap_or(now),
                        action,
                        defer_count: next_count,
                    });
                } else {
                    ExecutorStats::inc(&self.stats.failed);
                    let reason = err.reason.clone();
                    self.record_failure(format!("action-id:{}", action.id.0), err.reason);
                    let r = append_failed(&self.chain_appender, &action, reason, None);
                    self.record_chain_append(action.id.0, "failed", r);
                }
            }
        }
    }

    /// Record the outcome of a chain-append call. Bumps the
    /// `chain_append_failures` counter on `Err` and emits a
    /// warn log so operators can see the chain is dropping
    /// records — the dispatch / admit / gate side effect
    /// already happened either way.
    fn record_chain_append(
        &self,
        action_id: u64,
        kind: &'static str,
        result: Result<(), AppendError>,
    ) {
        if let Err(e) = result {
            self.stats
                .chain_append_failures
                .fetch_add(1, Ordering::Relaxed);
            tracing::warn!(
                target: "meshos",
                action_id,
                kind,
                error = %e,
                "executor chain append failed; in-memory state stayed consistent \
                 but the action's chain record is missing",
            );
        }
    }

    /// Stamp a new [`FailureRecord`] and write it to both failure
    /// surfaces: the failure-chain appender first, then the
    /// bounded in-memory ring.
    ///
    /// `source` identifies what failed (callers pass
    /// `action-id:<n>`); `reason` is the operator-readable
    /// explanation. The wall-clock timestamp is in milliseconds
    /// since the Unix epoch, or 0 if the system clock reads
    /// before the epoch.
    fn record_failure(&mut self, source: String, reason: String) {
        let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
        let recorded_at_ms = match since_epoch {
            Ok(elapsed) => elapsed.as_millis() as u64,
            Err(_) => 0,
        };
        // `fetch_add` returns the previous value; sequence
        // numbers therefore start at 1.
        let previous_seq = self.failure_seq.fetch_add(1, Ordering::SeqCst);
        let record = FailureRecord {
            seq: previous_seq + 1,
            source,
            reason,
            recorded_at_ms,
        };
        // The chain append is best-effort: a hiccup there must
        // never wedge the executor's dispatch loop, so it is
        // logged and the record still lands on the ring.
        if let Err(err) = self.failure_appender.append(&record) {
            tracing::warn!(
                target: "meshos",
                seq = record.seq,
                error = %err,
                "failure-chain append failed — record kept on in-memory ring only",
            );
        }
        let mut ring = self.recent_failures.write();
        // Evict the oldest record once the ring is at its bound.
        if ring.len() >= RECENT_FAILURES_CAPACITY {
            let _evicted = ring.pop_front();
        }
        ring.push_back(record);
    }
}

/// External handle for sampling executor live state.
///
/// The handle only wraps a shared `Arc<ExecutorStats>`, so
/// cloning it clones the `Arc` and every clone reads the same
/// counters.
#[derive(Clone)]
pub struct ExecutorHandle {
    /// Shared counters read by [`ExecutorHandle::stats`].
    stats: Arc<ExecutorStats>,
}

impl ExecutorHandle {
    /// Copy every executor counter into a plain-value snapshot.
    ///
    /// Each counter is read with its own `Relaxed` atomic load,
    /// so each field is a value its counter really held, but
    /// the fields are not captured at one common instant.
    pub fn stats(&self) -> ExecutorStatsSnapshot {
        let counters = &self.stats;
        let dispatched = counters.dispatched.load(Ordering::Relaxed);
        let failed = counters.failed.load(Ordering::Relaxed);
        let deferred = counters.deferred.load(Ordering::Relaxed);
        let gated = counters.gated.load(Ordering::Relaxed);
        let dispatch_retries = counters.dispatch_retries.load(Ordering::Relaxed);
        let cluster_backpressure_asserts = counters
            .cluster_backpressure_asserts
            .load(Ordering::Relaxed);
        let cluster_backpressure_releases = counters
            .cluster_backpressure_releases
            .load(Ordering::Relaxed);
        let chain_append_failures = counters.chain_append_failures.load(Ordering::Relaxed);
        ExecutorStatsSnapshot {
            dispatched,
            failed,
            deferred,
            gated,
            dispatch_retries,
            cluster_backpressure_asserts,
            cluster_backpressure_releases,
            chain_append_failures,
        }
    }
}

/// Plain-value stats snapshot (no atomics; safe to copy +
/// serialize).
///
/// Built by `ExecutorHandle::stats`, which fills each field
/// from the like-named executor counter with a separate atomic
/// load — the fields are therefore individually valid but not
/// taken at one common instant.
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
pub struct ExecutorStatsSnapshot {
    /// Total actions admitted + successfully dispatched.
    pub dispatched: u64,
    /// Total actions admitted but failed in dispatch.
    pub failed: u64,
    /// Total `AdmissionResult::Defer` re-queues.
    pub deferred: u64,
    /// Total `AdmissionResult::Gate` drops.
    pub gated: u64,
    /// Total dispatch errors retried via `retry_after`.
    pub dispatch_retries: u64,
    /// Number of cluster-backpressure assert edges surfaced.
    pub cluster_backpressure_asserts: u64,
    /// Number of cluster-backpressure release edges surfaced.
    pub cluster_backpressure_releases: u64,
    /// Times the chain appender returned `Err` for an action
    /// the executor was attempting to record (dispatched /
    /// failed / gated / deferred). Non-zero means the chain is
    /// missing records — the in-memory ring and dispatcher state
    /// remain consistent, but downstream chain consumers will
    /// not see those entries.
    pub chain_append_failures: u64,
}

/// Sleep until `deadline` if there is one; otherwise never
/// complete.
///
/// The std `Instant` is converted to a tokio instant before
/// sleeping.
async fn sleep_until_opt(deadline: Option<Instant>) {
    match deadline {
        Some(wake_at) => sleep_until(tokio::time::Instant::from_std(wake_at)).await,
        // No deferred work: this future stays pending forever.
        // NOTE(review): the caller's `select!` arm is said to be
        // guarded on `next_deadline.is_some()`, so this arm
        // should not normally be polled — confirm at the call
        // site.
        None => std::future::pending::<()>().await,
    }
}

#[cfg(test)]
mod tests {
    use std::time::Duration;

    use tokio::sync::mpsc;

    use super::super::action::{ActionId, MaintenanceTransition};
    use super::super::config::MeshOsConfig;
    use super::super::event::{ChainId, DaemonRef};
    use super::*;

    /// Wrap `action` in a `PendingAction` carrying `id`, with
    /// the emit time set to the moment of the call.
    fn pending(id: u64, action: MeshOsAction) -> PendingAction {
        let id = ActionId(id);
        let emitted_at = Instant::now();
        PendingAction {
            id,
            action,
            emitted_at,
        }
    }

    /// Build a `DaemonRef` from a borrowed name and a numeric id.
    fn dref(name: &str, id: u64) -> DaemonRef {
        let owned_name = name.into();
        DaemonRef {
            id,
            name: owned_name,
        }
    }

    /// Shared default `MeshOsConfig` used by the tests below.
    fn fast_cfg() -> Arc<MeshOsConfig> {
        let defaults = MeshOsConfig::default();
        Arc::new(defaults)
    }

    /// Test double whose `append` rejects every record.
    ///
    /// It lets a test observe how the executor reacts when the
    /// chain cannot be written: `chain_append_failures` should
    /// go up while dispatch itself still goes through.
    struct FailingChainAppender;

    impl super::super::chain::ActionChainAppender for FailingChainAppender {
        fn append(
            &self,
            _record: super::super::chain::ActionChainRecord,
        ) -> Result<(), AppendError> {
            let injected = AppendError {
                reason: "test-injected appender failure".into(),
            };
            Err(injected)
        }
    }

    /// With an always-failing chain appender installed, one
    /// action must still be dispatched and the append failure
    /// must be counted exactly once.
    #[tokio::test]
    async fn chain_append_failure_bumps_counter_but_dispatch_still_succeeds() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher))
            .with_chain_appender(Arc::new(FailingChainAppender));
        let runner = tokio::spawn(executor.run());

        let enter_maintenance = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Maintenance,
        };
        tx.send(pending(1, enter_maintenance)).await.unwrap();
        // Closing the channel lets the executor task finish.
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let append_failures = stats.chain_append_failures.load(Ordering::Relaxed);
        // The action went out even though its chain record was lost.
        assert_eq!(dispatched, 1);
        assert_eq!(dispatcher.log().len(), 1);
        // The lost chain record shows up in the counter.
        assert_eq!(append_failures, 1);
    }

    /// Two maintenance transitions sent back to back must both
    /// be dispatched and both appear in the dispatcher log.
    #[tokio::test]
    async fn admitted_actions_reach_the_dispatcher() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let enter_maintenance = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Maintenance,
        };
        tx.send(pending(1, enter_maintenance)).await.unwrap();

        let back_to_active = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(2, back_to_active)).await.unwrap();
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        assert_eq!(dispatched, 2);
        assert_eq!(dispatcher.log().len(), 2);
    }

    /// A `StartDaemon` for a daemon with a gate recorded 60 s
    /// into the future must be counted as gated and must never
    /// reach the dispatcher.
    #[tokio::test]
    async fn gated_actions_do_not_reach_the_dispatcher() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let mut executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));

        // Record the gate before the executor starts running so
        // the very first admit already sees it.
        let daemon = dref("telemetry", 1);
        let gate_until = Instant::now() + Duration::from_secs(60);
        executor
            .backpressure
            .record_daemon_gate(daemon.clone(), gate_until);
        let runner = tokio::spawn(executor.run());

        let start = MeshOsAction::StartDaemon { daemon };
        tx.send(pending(1, start)).await.unwrap();
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let gated = stats.gated.load(Ordering::Relaxed);
        assert_eq!(dispatched, 0);
        assert_eq!(gated, 1);
        assert_eq!(dispatcher.log().len(), 0);
    }

    /// Two `PullReplica` actions sent back to back: the second
    /// is expected to be deferred at least once and then
    /// dispatched.
    #[tokio::test]
    async fn deferred_actions_eventually_reach_the_dispatcher() {
        // This test runs on real wall-clock time. The helpers in
        // this file stamp actions with std `Instant::now()`, and
        // tokio's paused clock does not affect std time, so
        // `tokio::time::pause()` is not an option here.
        // NOTE(review): the deferral relies on the default pull
        // cooldown (stated as 250 ms by the original author) —
        // confirm against `MeshOsConfig`.
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let first_chain: ChainId = 1;
        let second_chain: ChainId = 2;

        let first_pull = MeshOsAction::PullReplica {
            chain: first_chain,
            source: 5,
        };
        tx.send(pending(1, first_pull)).await.unwrap();

        let second_pull = MeshOsAction::PullReplica {
            chain: second_chain,
            source: 5,
        };
        tx.send(pending(2, second_pull)).await.unwrap();

        // Wall-clock budget for: first dispatch, deferral of the
        // second pull, wake-up after the cooldown, second
        // dispatch.
        tokio::time::sleep(Duration::from_millis(500)).await;
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let deferred = stats.deferred.load(Ordering::Relaxed);
        assert_eq!(
            dispatched, 2,
            "both pulls should eventually reach the dispatcher",
        );
        assert!(
            deferred >= 1,
            "second pull should have been deferred at least once",
        );
        assert_eq!(dispatcher.log().len(), 2);
    }

    /// A dispatch error built with `DispatchError::drop` must
    /// count as a failure and not as a dispatch.
    #[tokio::test]
    async fn dispatch_errors_without_retry_record_failures() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        // Arm the dispatcher so its next call errors out.
        dispatcher.fail_next(DispatchError::drop("boom"));
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let activate = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(1, activate)).await.unwrap();
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let failed = stats.failed.load(Ordering::Relaxed);
        assert_eq!(dispatched, 0);
        assert_eq!(failed, 1);
    }

    /// One failed dispatch must produce exactly one failure
    /// record, and the buffering appender and the executor's
    /// in-memory ring must hold the same record.
    #[tokio::test]
    async fn failure_chain_appender_receives_every_recorded_failure() {
        use super::super::failure_chain::BufferingFailureChainAppender;

        let dispatcher = Arc::new(LoggingDispatcher::new());
        dispatcher.fail_next(DispatchError::drop("first boom"));
        let appender = Arc::new(BufferingFailureChainAppender::default());
        // Hand the executor a trait-object clone; the concrete
        // handle stays here so the test can read back what was
        // captured.
        let appender_dyn: Arc<dyn super::super::failure_chain::FailureChainAppender> =
            appender.clone();
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher))
            .with_failure_appender(appender_dyn);
        // The ring handle has to be taken now: `run()` consumes
        // the executor.
        let ring_handle = executor.recent_failures_handle();
        let runner = tokio::spawn(executor.run());

        let activate = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(1, activate)).await.unwrap();
        drop(tx);
        let _ = runner.await.expect("join");

        let captured = appender.captured();
        assert_eq!(captured.len(), 1, "appender should see one record");
        let from_chain = &captured[0];
        assert!(from_chain.reason.contains("first boom"));
        assert!(from_chain.seq > 0);

        // The ring entry must match what the appender received.
        let ring: Vec<FailureRecord> = ring_handle.read().iter().cloned().collect();
        assert_eq!(ring.len(), 1);
        let from_ring = &ring[0];
        assert_eq!(from_ring.seq, from_chain.seq);
        assert_eq!(from_ring.reason, from_chain.reason);
    }

    /// A dispatch error carrying a 50 ms retry hint must be
    /// retried once and then succeed.
    #[tokio::test]
    async fn dispatch_errors_with_retry_re_enqueue() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        // Only the next dispatch call is armed to fail; the
        // retried call is expected to go through.
        let transient = DispatchError::retry("transient", Duration::from_millis(50));
        dispatcher.fail_next(transient);
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let activate = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(1, activate)).await.unwrap();
        // Keep the channel open long enough for the retry to
        // fire and be processed.
        tokio::time::sleep(Duration::from_millis(200)).await;
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let retries = stats.dispatch_retries.load(Ordering::Relaxed);
        assert_eq!(dispatched, 1);
        assert_eq!(retries, 1);
    }

    /// Dropping the only sender must make `run()` return within
    /// two seconds, with nothing dispatched.
    #[tokio::test]
    async fn executor_exits_when_sender_drops() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());
        drop(tx);

        // The timeout turns a hung executor into a test failure
        // instead of a hung test run.
        let joined = tokio::time::timeout(Duration::from_secs(2), runner).await;
        let stats = joined
            .expect("executor did not exit after sender dropped")
            .expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        assert_eq!(dispatched, 0);
    }

    /// Regression for I7: a dispatcher that answers every call
    /// with a retry hint must not keep its action alive
    /// forever. With `max_defer_count = 3` the action has to end
    /// up as a single failure after roughly that many attempts.
    #[tokio::test]
    async fn dispatch_retry_drops_after_exceeding_max_defer_count() {
        /// Dispatcher that counts its calls and always asks for
        /// a retry 5 ms later.
        struct AlwaysRetry {
            attempts: parking_lot::Mutex<u32>,
        }
        impl ActionDispatcher for AlwaysRetry {
            fn dispatch<'a>(
                &'a self,
                _action: MeshOsAction,
            ) -> BoxFuture<'a, Result<(), DispatchError>> {
                Box::pin(async move {
                    {
                        let mut calls = self.attempts.lock();
                        *calls += 1;
                    }
                    let retry = DispatchError::retry("transient", Duration::from_millis(5));
                    Err(retry)
                })
            }
        }

        let cfg = {
            let mut tuned = MeshOsConfig::default();
            tuned.backpressure.max_defer_count = 3;
            Arc::new(tuned)
        };
        let (tx, rx) = mpsc::channel(8);
        let dispatcher = Arc::new(AlwaysRetry {
            attempts: parking_lot::Mutex::new(0),
        });
        let executor = ActionExecutor::new(rx, cfg, Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let activate = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(1, activate)).await.unwrap();
        // Wall-clock budget: a handful of attempts spaced a few
        // milliseconds apart.
        tokio::time::sleep(Duration::from_millis(200)).await;
        drop(tx);

        let stats = runner.await.expect("join");
        let attempts = *dispatcher.attempts.lock();
        let failed = stats.failed.load(Ordering::Relaxed);
        assert_eq!(
            failed, 1,
            "action must drop with a failure after exceeding max_defer_count",
        );
        assert!(
            matches!(attempts, 3..=5),
            "expected ~max_defer_count dispatch attempts, got {attempts}",
        );
    }

    /// Regression for I6: a panic inside the dispatcher future
    /// must not take the executor task down. The first action
    /// panics and is expected to be counted as failed; the
    /// second must still be dispatched.
    #[tokio::test]
    async fn dispatcher_panic_does_not_kill_executor() {
        /// Dispatcher that panics on its first call only and
        /// logs every later action.
        struct PanicOnce {
            armed: parking_lot::Mutex<bool>,
            log: Mutex<Vec<MeshOsAction>>,
        }
        impl ActionDispatcher for PanicOnce {
            fn dispatch<'a>(
                &'a self,
                action: MeshOsAction,
            ) -> BoxFuture<'a, Result<(), DispatchError>> {
                Box::pin(async move {
                    // Read and clear the flag in one step. The
                    // guard is a temporary of this statement, so
                    // the lock is already released when the
                    // panic below fires.
                    let was_armed = std::mem::replace(&mut *self.armed.lock(), false);
                    if was_armed {
                        panic!("boom");
                    }
                    self.log.lock().push(action);
                    Ok(())
                })
            }
        }

        let dispatcher = Arc::new(PanicOnce {
            armed: parking_lot::Mutex::new(true),
            log: Mutex::new(Vec::new()),
        });
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let panicking = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Maintenance,
        };
        tx.send(pending(1, panicking)).await.unwrap();

        let surviving = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(2, surviving)).await.unwrap();

        tokio::time::sleep(Duration::from_millis(50)).await;
        drop(tx);

        let stats = runner
            .await
            .expect("executor task should NOT have panicked despite dispatcher panic");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let failed = stats.failed.load(Ordering::Relaxed);
        assert_eq!(
            dispatched, 1,
            "second action should have dispatched after the first panicked",
        );
        assert_eq!(failed, 1);
        assert_eq!(dispatcher.log.lock().len(), 1);
    }

    /// With low thresholds (assert at 3, release at 1) and four
    /// actions queued before the executor starts, the backlog
    /// should cross the high-water mark on start-up and fall
    /// back below the low-water mark while draining. Both edges
    /// must be counted and must reach the dispatcher's
    /// backpressure log, assert first and release last.
    #[tokio::test]
    async fn cluster_backpressure_edges_surface_through_dispatcher_hook() {
        let cfg = {
            let mut tuned = MeshOsConfig::default();
            tuned.backpressure.cluster_backpressure_threshold = 3;
            tuned.backpressure.cluster_backpressure_release = 1;
            Arc::new(tuned)
        };
        let (tx, rx) = mpsc::channel(8);
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let executor = ActionExecutor::new(rx, cfg, Arc::clone(&dispatcher));

        // Fill the channel first. The executor is not spawned
        // until after this loop, so all four actions are already
        // queued when it takes its first look at the channel.
        for action_id in 1..=4u64 {
            let activate = MeshOsAction::CommitMaintenanceTransition {
                node: 1,
                target: MaintenanceTransition::Active,
            };
            tx.send(pending(action_id, activate)).await.unwrap();
        }
        let runner = tokio::spawn(executor.run());
        // Give the executor time to drain the backlog.
        tokio::time::sleep(Duration::from_millis(50)).await;
        drop(tx);

        let stats = runner.await.expect("join");
        let asserts = stats.cluster_backpressure_asserts.load(Ordering::Relaxed);
        let releases = stats.cluster_backpressure_releases.load(Ordering::Relaxed);
        assert!(
            asserts >= 1,
            "depth crossed the high-water mark at least once",
        );
        assert!(
            releases >= 1,
            "depth dropped below the low-water mark at least once",
        );

        let edges = dispatcher.backpressure_log();
        assert!(matches!(
            edges.first(),
            Some(ClusterBackpressureChange::Asserted)
        ));
        assert!(matches!(
            edges.last(),
            Some(ClusterBackpressureChange::Released)
        ));
    }

    /// Regression: a `PullReplica` whose dispatch fails must not
    /// leave the pull cooldown set. A second pull on another
    /// chain, sent 50 ms later, has to dispatch even though the
    /// first one is parked on a 60 s retry hint.
    #[tokio::test]
    async fn dispatch_failure_with_retry_releases_pull_cooldown() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        // Only the first dispatch is armed to fail. The long
        // retry hint keeps that action from being retried within
        // the lifetime of this test.
        let long_retry = DispatchError::retry("transient", Duration::from_secs(60));
        dispatcher.fail_next(long_retry);
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let runner = tokio::spawn(executor.run());

        let failing_pull = MeshOsAction::PullReplica {
            chain: 1,
            source: 5,
        };
        tx.send(pending(1, failing_pull)).await.unwrap();
        // Let the executor finish handling the first action
        // before the second one is sent.
        tokio::time::sleep(Duration::from_millis(50)).await;

        let unrelated_pull = MeshOsAction::PullReplica {
            chain: 2,
            source: 5,
        };
        tx.send(pending(2, unrelated_pull)).await.unwrap();
        tokio::time::sleep(Duration::from_millis(50)).await;
        drop(tx);

        let stats = runner.await.expect("join");
        let dispatched = stats.dispatched.load(Ordering::Relaxed);
        let retries = stats.dispatch_retries.load(Ordering::Relaxed);
        assert_eq!(
            dispatched, 1,
            "second pull should dispatch immediately after the first \
             released its leaked cooldown",
        );
        assert_eq!(retries, 1);
    }

    /// A handle taken before `run()` consumes the executor must
    /// still see the counters move while the executor is
    /// running.
    #[tokio::test]
    async fn handle_exposes_atomic_stats_to_outside_observers() {
        let dispatcher = Arc::new(LoggingDispatcher::new());
        let (tx, rx) = mpsc::channel(8);
        let executor = ActionExecutor::new(rx, fast_cfg(), Arc::clone(&dispatcher));
        let handle = executor.handle();
        let runner = tokio::spawn(executor.run());

        let activate = MeshOsAction::CommitMaintenanceTransition {
            node: 1,
            target: MaintenanceTransition::Active,
        };
        tx.send(pending(1, activate)).await.unwrap();
        tokio::time::sleep(Duration::from_millis(50)).await;

        // Sample while the sender is still alive, i.e. before
        // the executor is asked to shut down.
        let observed = handle.stats().dispatched;
        assert!(observed >= 1);
        drop(tx);
        let _ = runner.await;
    }
}