// vyre_driver/persistent_kernel_policy.rs

//! D1 substrate: persistent-kernel-mode decision policy.
//!
//! When a workload submits many small kernel launches with the same
//! pipeline, the launch overhead dominates execution time (~5 µs
//! per native launch, ~10–50 µs per portable queue submit). Replacing the
//! N launches with ONE persistent kernel that polls a device-side
//! work queue eliminates the per-launch cost entirely — a 100×
//! speedup on workloads where kernel duration < 50 µs.
//!
//! Persistent mode has a one-time setup cost (allocate the work queue,
//! launch the persistent kernel, signal shutdown at the end). This
//! amortises only when the batch is large enough. The decision policy
//! here owns the threshold: given the measured per-launch overhead and
//! per-item kernel duration, should the dispatcher run N standard
//! launches or one persistent kernel?
//!
//! Pure decision — no kernel launch, no Program walk. Caller passes
//! the measurements; the substrate produces a verdict.

/// Inputs to the persistent-kernel decision made by
/// [`decide_persistent_kernel`].
///
/// Every duration is expressed in nanoseconds.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PersistentKernelInputs {
    /// Number of small launches in the upcoming batch.
    pub batch_size: u32,
    /// Average per-launch host-side overhead in nanoseconds. Measured
    /// on the live backend at startup; native is typically ~5_000 ns,
    /// portable typically ~25_000 ns.
    pub per_launch_overhead_ns: u64,
    /// Average per-item kernel duration in nanoseconds. The
    /// dispatcher measures this on the warmup pass before the batch.
    ///
    /// This term appears in both the standard and the persistent cost
    /// model, so it cancels out and does not influence the verdict.
    pub per_item_kernel_ns: u64,
    /// Setup cost of bringing up persistent mode (work-queue alloc,
    /// initial launch, signal handshake) in nanoseconds. native: ~50_000
    /// for a fresh queue; portable: ~200_000.
    pub persistent_setup_overhead_ns: u64,
}
37
/// Verdict returned by [`decide_persistent_kernel`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PersistentKernelDecision {
    /// Use the standard launch path — N separate kernel launches.
    ///
    /// Chosen when the batch has at most one launch, when no per-launch
    /// overhead was measured, or when the summed per-launch overhead
    /// does not exceed the persistent setup cost (a tie stays here).
    /// Per-item kernel duration plays no part in the choice.
    StandardLaunches,
    /// Use persistent kernel mode — one launch + device-side queue
    /// polling for `batch_size` work items.
    PersistentKernel {
        /// Predicted total time saved (in nanoseconds) by using the
        /// persistent path vs N standard launches. Useful for
        /// telemetry and for the autotune store.
        ///
        /// Strictly positive when produced by
        /// [`decide_persistent_kernel`]. Held as `u128` because the
        /// product of a `u32` batch size and a `u64` overhead can
        /// exceed `u64::MAX`.
        savings_ns: u128,
    },
}
55
56/// Decide whether to use persistent kernel mode for this batch.
57///
58/// Standard launches cost: `batch_size * per_launch_overhead + batch_size * per_item_kernel`.
59/// Persistent cost: `persistent_setup + batch_size * per_item_kernel`.
60/// Persistent wins iff `batch_size * per_launch_overhead > persistent_setup`.
61///
62/// Returns `StandardLaunches` when batch_size is 0 or 1 (persistent
63/// mode never wins for a single launch  -  the setup cost dominates).
64#[must_use]
65pub fn decide_persistent_kernel(inputs: PersistentKernelInputs) -> PersistentKernelDecision {
66    if inputs.batch_size <= 1 {
67        return PersistentKernelDecision::StandardLaunches;
68    }
69    // Defensive: zero per-launch overhead means we have no model to
70    // amortise  -  keep the standard path.
71    if inputs.per_launch_overhead_ns == 0 {
72        return PersistentKernelDecision::StandardLaunches;
73    }
74    let standard_overhead =
75        u128::from(inputs.batch_size) * u128::from(inputs.per_launch_overhead_ns);
76    let persistent_setup_overhead_ns = u128::from(inputs.persistent_setup_overhead_ns);
77    if standard_overhead <= persistent_setup_overhead_ns {
78        return PersistentKernelDecision::StandardLaunches;
79    }
80    let savings_ns = standard_overhead - persistent_setup_overhead_ns;
81    PersistentKernelDecision::PersistentKernel { savings_ns }
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Terse constructor so each case reads as
    /// `(batch, launch overhead, item duration, setup cost)`.
    fn inp(batch: u32, launch_ovh: u64, item_ns: u64, setup: u64) -> PersistentKernelInputs {
        PersistentKernelInputs {
            batch_size: batch,
            per_launch_overhead_ns: launch_ovh,
            per_item_kernel_ns: item_ns,
            persistent_setup_overhead_ns: setup,
        }
    }

    #[test]
    fn single_launch_is_always_standard() {
        // No matter how cheap the persistent setup, a 1-launch batch
        // stays on the standard path.
        let dec = decide_persistent_kernel(inp(1, 5_000, 1_000, 1_000));
        assert_eq!(dec, PersistentKernelDecision::StandardLaunches);
    }

    #[test]
    fn zero_batch_is_standard() {
        let dec = decide_persistent_kernel(inp(0, 5_000, 1_000, 50_000));
        assert_eq!(dec, PersistentKernelDecision::StandardLaunches);
    }

    #[test]
    fn small_batch_below_amortisation_threshold_is_standard() {
        // 5 launches × 5 µs = 25 µs total; persistent setup = 50 µs →
        // standard is cheaper.
        let dec = decide_persistent_kernel(inp(5, 5_000, 1_000, 50_000));
        assert_eq!(dec, PersistentKernelDecision::StandardLaunches);
    }

    #[test]
    fn batch_at_amortisation_threshold_is_standard() {
        // Exactly equal — the policy uses strict `>` so equal cost
        // stays on the standard path (cheaper to keep launching).
        let dec = decide_persistent_kernel(inp(10, 5_000, 1_000, 50_000));
        assert_eq!(dec, PersistentKernelDecision::StandardLaunches);
    }

    #[test]
    fn large_batch_above_threshold_picks_persistent() {
        // 100 launches × 5 µs = 500 µs; persistent setup = 50 µs →
        // savings = 450 µs.
        let dec = decide_persistent_kernel(inp(100, 5_000, 1_000, 50_000));
        assert_eq!(
            dec,
            PersistentKernelDecision::PersistentKernel {
                savings_ns: 450_000
            }
        );
    }

    #[test]
    fn portable_typical_overheads_pick_persistent_at_modest_batch() {
        // portable submit overhead ~25 µs; persistent setup ~200 µs.
        // 10 launches × 25 µs = 250 µs > 200 µs setup → persistent.
        let dec = decide_persistent_kernel(inp(10, 25_000, 5_000, 200_000));
        assert_eq!(
            dec,
            PersistentKernelDecision::PersistentKernel { savings_ns: 50_000 }
        );
    }

    #[test]
    fn zero_per_launch_overhead_returns_standard() {
        // Defensive: a backend that reports zero launch overhead has
        // no model to amortise — keep the standard path.
        let dec = decide_persistent_kernel(inp(1000, 0, 100, 50_000));
        assert_eq!(dec, PersistentKernelDecision::StandardLaunches);
    }

    #[test]
    fn savings_is_strictly_positive_for_persistent_verdict() {
        let dec = decide_persistent_kernel(inp(1000, 5_000, 1_000, 50_000));
        match dec {
            PersistentKernelDecision::PersistentKernel { savings_ns } => {
                assert!(savings_ns > 0);
            }
            other => panic!("expected PersistentKernel; got {:?}", other),
        }
    }

    #[test]
    fn item_duration_does_not_affect_decision() {
        // The decision is purely about overhead vs setup; per-item
        // kernel duration appears on both sides of the inequality
        // and cancels.
        let small_kernel = decide_persistent_kernel(inp(100, 5_000, 100, 50_000));
        let large_kernel = decide_persistent_kernel(inp(100, 5_000, 1_000_000, 50_000));
        assert_eq!(small_kernel, large_kernel);
    }

    #[test]
    fn widened_arithmetic_preserves_extreme_savings() {
        // Adversarial: batch_size × per_launch_overhead near u64::MAX
        // must not panic or clamp the predicted savings.
        let dec = decide_persistent_kernel(inp(u32::MAX, u64::MAX / 2, 1, 50_000));
        match dec {
            PersistentKernelDecision::PersistentKernel { savings_ns } => {
                assert_eq!(
                    savings_ns,
                    u128::from(u32::MAX) * u128::from(u64::MAX / 2) - 50_000
                );
            }
            other => panic!("expected PersistentKernel; got {:?}", other),
        }
    }

    #[test]
    fn persistent_policy_source_uses_exact_widened_arithmetic() {
        let source = include_str!("persistent_kernel_policy.rs");
        // Only the text ahead of the test module's cfg attribute is
        // production code. The positive needles below also occur
        // verbatim inside this very test, so searching the whole file
        // would let that check pass no matter what the policy does.
        let production = source.split("#[cfg(test)]").next().unwrap_or(source);

        // The banned names are assembled with `concat!` so this test
        // does not trip over its own text; the whole file is scanned.
        assert!(
            !source.contains(concat!("saturating", "_mul"))
                && !source.contains(concat!("saturating", "_sub")),
            "Fix: persistent-kernel policy must use exact widened arithmetic, not saturating launch-cost math."
        );
        assert!(
            production.contains("u128::from(inputs.batch_size)")
                && production.contains("u128::from(inputs.per_launch_overhead_ns)")
                && production.contains("standard_overhead - persistent_setup_overhead_ns"),
            "Fix: persistent-kernel savings must stay widened through the verdict."
        );
    }
}
// vyre_driver/persistent_kernel_policy.rs