Skip to main content

vyre_driver/
megakernel_execution.rs

1//! Backend-neutral execution planning for persistent megakernel waves.
2//!
3//! Backends can feed telemetry and device budgets into this module to choose a
4//! sparse, dense, hybrid, or fused execution topology before allocating device
5//! scratch. The policy is deterministic, allocation-free, and validates byte
6//! pressure before a backend reaches an API-specific allocation path.
7
8const WARP_SPARSE_DENSITY: f64 = 0.03125;
9const SPARSE_DENSITY: f64 = 0.125;
10const DENSE_DENSITY: f64 = 0.70;
11const BLOCK_DENSE_DENSITY: f64 = 0.85;
12const FUSION_PRESSURE: f64 = 0.70;
13const FUSION_PRESSURE_HYSTERESIS: f64 = 0.10;
14const FRONTIER_HYSTERESIS: f64 = 0.025;
15const MEMORY_RED_ZONE_BPS: u32 = 9_000;
16const MEMORY_HYSTERESIS_BPS: u32 = 250;
17const LAUNCH_PRESSURE_BPS: u32 = 1_500;
18const LAUNCH_HYSTERESIS_BPS: u32 = 250;
19const FUSION_READBACK_BYTES: u64 = 4_096;
20const DENSE_AVERAGE_DEGREE_BPS: u64 = 20_000;
21const WARP_SPARSE_AVERAGE_DEGREE_BPS: u64 = 80_000;
22
23/// Per-candidate telemetry used to bias megakernel fusion.
24#[derive(Clone, Copy, Debug, PartialEq)]
25pub struct MegakernelExecutionSample {
26    /// Observed candidate dispatch cost in nanoseconds.
27    pub dispatch_cost_ns: f64,
28    /// Observed active-frontier density in `[0, 1]`.
29    pub frontier_density: f64,
30    /// Observed final readback byte volume.
31    pub readback_bytes: u64,
32}
33
34/// Device-side megakernel execution topology selected for a dataflow wave.
35#[derive(Clone, Copy, Debug, Eq, PartialEq)]
36pub enum MegakernelExecutionTopology {
37    /// Ultra-low-density frontier expansion where one warp owns sparse active
38    /// nodes and avoids block-wide work distribution overhead.
39    WarpSparseFrontier,
40    /// Low-density frontier expansion with queue-like work distribution.
41    SparseFrontier,
42    /// Very high-density propagation where a block owns coalesced bitset lanes
43    /// and amortizes shared-memory scans across many active facts.
44    BlockDenseFrontier,
45    /// Dense bitset-style propagation with coalesced scans.
46    DenseFrontier,
47    /// Mixed sparse/dense execution when density is in the transition band.
48    HybridFrontier,
49    /// Fused adjacent waves when launch/readback pressure dominates and memory
50    /// budget leaves room for the fused plan.
51    FusedWave,
52}
53
54/// Static graph shape used by topology selection.
55#[derive(Clone, Copy, Debug, Eq, PartialEq)]
56pub struct MegakernelGraphShape {
57    /// Logical graph node count.
58    pub node_count: u64,
59    /// Logical graph edge count.
60    pub edge_count: u64,
61}
62
63/// Device memory envelope for a candidate megakernel plan.
64#[derive(Clone, Copy, Debug, Eq, PartialEq)]
65pub struct MegakernelMemoryBudget {
66    /// Estimated resident plus transient bytes required by the candidate plan.
67    pub required_bytes: u64,
68    /// Caller-approved device-memory budget for the plan.
69    pub budget_bytes: u64,
70}
71
72/// Detailed megakernel memory plan.
73#[derive(Clone, Copy, Debug, Eq, PartialEq)]
74pub struct MegakernelMemoryPlan {
75    /// Graph-layout bytes retained on device.
76    pub graph_bytes: u64,
77    /// Frontier-state bytes retained on device.
78    pub frontier_bytes: u64,
79    /// Temporary scratch bytes required by the selected topology.
80    pub scratch_bytes: u64,
81    /// Final compact output/readback bytes.
82    pub output_bytes: u64,
83    /// Total peak bytes required by the plan.
84    pub required_bytes: u64,
85    /// Caller-approved byte budget.
86    pub budget_bytes: u64,
87    /// Required/budget pressure in basis points.
88    pub memory_pressure_bps: u32,
89}
90
91/// Complete megakernel execution plan selected from runtime telemetry.
92#[derive(Clone, Copy, Debug, Eq, PartialEq)]
93pub struct MegakernelExecutionPlan {
94    /// Final topology after memory-budget validation.
95    pub topology: MegakernelExecutionTopology,
96    /// Memory plan for the final topology.
97    pub memory: MegakernelMemoryPlan,
98    /// Whether the planner downgraded a denser/fused topology to sparse to fit
99    /// the explicit memory budget.
100    pub downgraded_to_sparse: bool,
101}
102
103/// Memory planning failure for megakernel execution.
104#[derive(Clone, Debug, Eq, PartialEq)]
105pub enum MegakernelMemoryError {
106    /// A byte-count multiplication or addition overflowed.
107    ByteCountOverflow {
108        /// Field being computed when overflow happened.
109        field: &'static str,
110    },
111    /// The candidate plan exceeds the caller-approved device-memory budget.
112    OverBudget {
113        /// Selected topology.
114        topology: MegakernelExecutionTopology,
115        /// Required peak bytes.
116        required_bytes: u64,
117        /// Caller-approved budget bytes.
118        budget_bytes: u64,
119        /// Graph node count.
120        node_count: u64,
121        /// Graph edge count.
122        edge_count: u64,
123    },
124}
125
126impl std::fmt::Display for MegakernelMemoryError {
127    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
128        match self {
129            Self::ByteCountOverflow { field } => write!(
130                f,
131                "megakernel memory planner overflowed while computing {field}. Fix: shard the graph or lower the candidate topology before planning device residency."
132            ),
133            Self::OverBudget {
134                topology,
135                required_bytes,
136                budget_bytes,
137                node_count,
138                edge_count,
139            } => write!(
140                f,
141                "megakernel {topology:?} plan requires {required_bytes} bytes but budget allows {budget_bytes} bytes for graph nodes={node_count} edges={edge_count}. Fix: choose a sparse topology, reduce fusion pressure, shard the graph, or raise the explicit device-memory budget."
142            ),
143        }
144    }
145}
146
147impl std::error::Error for MegakernelMemoryError {}
148
149/// Topology decision with the pressure metrics that caused it.
150#[derive(Clone, Copy, Debug, PartialEq)]
151pub struct MegakernelTopologyDecision {
152    /// Selected execution topology.
153    pub topology: MegakernelExecutionTopology,
154    /// Required/budget memory pressure in basis points.
155    pub memory_pressure_bps: u32,
156    /// Edge/node average degree proxy in basis points.
157    pub average_degree_bps: u64,
158    /// Launch overhead divided by observed dispatch cost in basis points.
159    pub launch_pressure_bps: u32,
160}
161
162impl MegakernelTopologyDecision {
163    /// Stable single-line explanation for release logs and scheduler debugging.
164    #[must_use]
165    pub fn stable_explanation(&self) -> String {
166        format!(
167            "megakernel-topology-v1|topology={:?}|memory_pressure_bps={}|average_degree_bps={}|launch_pressure_bps={}|reason={}",
168            self.topology,
169            self.memory_pressure_bps,
170            self.average_degree_bps,
171            self.launch_pressure_bps,
172            self.reason_code()
173        )
174    }
175
176    fn reason_code(&self) -> &'static str {
177        match self.topology {
178            MegakernelExecutionTopology::WarpSparseFrontier => "ultra_sparse_warp_specialized",
179            MegakernelExecutionTopology::SparseFrontier if self.memory_pressure_bps >= 9_000 => {
180                "memory_pressure_sparse_safety"
181            }
182            MegakernelExecutionTopology::SparseFrontier => "low_density_sparse_queue",
183            MegakernelExecutionTopology::BlockDenseFrontier => "high_density_block_specialized",
184            MegakernelExecutionTopology::DenseFrontier => "dense_coalesced_frontier",
185            MegakernelExecutionTopology::HybridFrontier => "transition_band_hybrid",
186            MegakernelExecutionTopology::FusedWave => "launch_and_readback_pressure_fused",
187        }
188    }
189}
190
191/// Select the megakernel execution topology for one candidate wave.
192#[must_use]
193pub fn select_megakernel_topology(
194    sample: MegakernelExecutionSample,
195    graph: MegakernelGraphShape,
196    memory: MegakernelMemoryBudget,
197    launch_overhead_ns: f64,
198    fusion_pressure: f64,
199) -> MegakernelTopologyDecision {
200    let memory_pressure_bps = pressure_bps(memory.required_bytes, memory.budget_bytes);
201    let average_degree_bps = pressure_bps_u64(graph.edge_count, graph.node_count);
202    let launch_pressure_bps =
203        if sample.dispatch_cost_ns <= 0.0 || !sample.dispatch_cost_ns.is_finite() {
204            0
205        } else {
206            finite_ratio_bps(
207                launch_overhead_ns.max(0.0),
208                sample.dispatch_cost_ns,
209                "launch overhead pressure",
210            )
211        };
212    let density = finite_unit(sample.frontier_density);
213    let fusion = finite_unit(fusion_pressure);
214    let topology = if memory_pressure_bps >= MEMORY_RED_ZONE_BPS {
215        MegakernelExecutionTopology::SparseFrontier
216    } else if fusion >= FUSION_PRESSURE
217        && launch_pressure_bps >= LAUNCH_PRESSURE_BPS
218        && sample.readback_bytes >= FUSION_READBACK_BYTES
219        && memory_pressure_bps
220            <= checked_bps_sub(MEMORY_RED_ZONE_BPS, 500, "fusion memory red-zone margin")
221    {
222        MegakernelExecutionTopology::FusedWave
223    } else if density <= WARP_SPARSE_DENSITY && average_degree_bps <= WARP_SPARSE_AVERAGE_DEGREE_BPS
224    {
225        MegakernelExecutionTopology::WarpSparseFrontier
226    } else if density <= SPARSE_DENSITY {
227        MegakernelExecutionTopology::SparseFrontier
228    } else if density >= BLOCK_DENSE_DENSITY && average_degree_bps >= DENSE_AVERAGE_DEGREE_BPS {
229        MegakernelExecutionTopology::BlockDenseFrontier
230    } else if density >= DENSE_DENSITY && average_degree_bps >= DENSE_AVERAGE_DEGREE_BPS {
231        MegakernelExecutionTopology::DenseFrontier
232    } else {
233        MegakernelExecutionTopology::HybridFrontier
234    };
235    MegakernelTopologyDecision {
236        topology,
237        memory_pressure_bps,
238        average_degree_bps,
239        launch_pressure_bps,
240    }
241}
242
243/// Select megakernel topology with previous-topology hysteresis.
244#[must_use]
245pub fn select_megakernel_topology_stable(
246    sample: MegakernelExecutionSample,
247    graph: MegakernelGraphShape,
248    memory: MegakernelMemoryBudget,
249    launch_overhead_ns: f64,
250    fusion_pressure: f64,
251    previous_topology: MegakernelExecutionTopology,
252) -> MegakernelTopologyDecision {
253    let mut decision =
254        select_megakernel_topology(sample, graph, memory, launch_overhead_ns, fusion_pressure);
255    decision.topology = stabilize_topology(decision, sample, fusion_pressure, previous_topology);
256    decision
257}
258
259fn stabilize_topology(
260    decision: MegakernelTopologyDecision,
261    sample: MegakernelExecutionSample,
262    fusion_pressure: f64,
263    previous_topology: MegakernelExecutionTopology,
264) -> MegakernelExecutionTopology {
265    if decision.memory_pressure_bps >= MEMORY_RED_ZONE_BPS {
266        return decision.topology;
267    }
268    let density = finite_unit(sample.frontier_density);
269    let fusion = finite_unit(fusion_pressure);
270    if matches!(
271        previous_topology,
272        MegakernelExecutionTopology::SparseFrontier
273            | MegakernelExecutionTopology::WarpSparseFrontier
274    ) && decision.memory_pressure_bps
275        >= checked_bps_sub(
276            MEMORY_RED_ZONE_BPS,
277            MEMORY_HYSTERESIS_BPS,
278            "memory hysteresis floor",
279        )
280    {
281        return MegakernelExecutionTopology::SparseFrontier;
282    }
283
284    match previous_topology {
285        MegakernelExecutionTopology::WarpSparseFrontier
286            if density <= WARP_SPARSE_DENSITY + FRONTIER_HYSTERESIS
287                && decision.average_degree_bps <= WARP_SPARSE_AVERAGE_DEGREE_BPS =>
288        {
289            MegakernelExecutionTopology::WarpSparseFrontier
290        }
291        MegakernelExecutionTopology::SparseFrontier
292            if density <= SPARSE_DENSITY + FRONTIER_HYSTERESIS =>
293        {
294            MegakernelExecutionTopology::SparseFrontier
295        }
296        MegakernelExecutionTopology::HybridFrontier
297            if decision.topology == MegakernelExecutionTopology::SparseFrontier
298                && density >= SPARSE_DENSITY - FRONTIER_HYSTERESIS =>
299        {
300            MegakernelExecutionTopology::HybridFrontier
301        }
302        MegakernelExecutionTopology::HybridFrontier
303            if matches!(
304                decision.topology,
305                MegakernelExecutionTopology::DenseFrontier
306                    | MegakernelExecutionTopology::BlockDenseFrontier
307            ) && density <= DENSE_DENSITY + FRONTIER_HYSTERESIS =>
308        {
309            MegakernelExecutionTopology::HybridFrontier
310        }
311        MegakernelExecutionTopology::DenseFrontier
312            if density >= DENSE_DENSITY - FRONTIER_HYSTERESIS
313                && decision.average_degree_bps >= DENSE_AVERAGE_DEGREE_BPS =>
314        {
315            MegakernelExecutionTopology::DenseFrontier
316        }
317        MegakernelExecutionTopology::BlockDenseFrontier
318            if density >= BLOCK_DENSE_DENSITY - FRONTIER_HYSTERESIS
319                && decision.average_degree_bps >= DENSE_AVERAGE_DEGREE_BPS =>
320        {
321            MegakernelExecutionTopology::BlockDenseFrontier
322        }
323        MegakernelExecutionTopology::FusedWave
324            if fusion >= FUSION_PRESSURE - FUSION_PRESSURE_HYSTERESIS
325                && decision.launch_pressure_bps
326                    >= checked_bps_sub(
327                        LAUNCH_PRESSURE_BPS,
328                        LAUNCH_HYSTERESIS_BPS,
329                        "launch hysteresis floor",
330                    )
331                && sample.readback_bytes >= FUSION_READBACK_BYTES
332                && decision.memory_pressure_bps
333                    <= checked_bps_sub(
334                        MEMORY_RED_ZONE_BPS,
335                        MEMORY_HYSTERESIS_BPS,
336                        "memory hysteresis floor",
337                    ) =>
338        {
339            MegakernelExecutionTopology::FusedWave
340        }
341        _ => decision.topology,
342    }
343}
344
345/// Compute and validate a megakernel device-memory plan.
346pub fn plan_megakernel_memory_budget(
347    topology: MegakernelExecutionTopology,
348    graph: MegakernelGraphShape,
349    bytes_per_node: u64,
350    bytes_per_edge: u64,
351    frontier_bytes: u64,
352    scratch_bytes: u64,
353    output_bytes: u64,
354    budget_bytes: u64,
355) -> Result<MegakernelMemoryPlan, MegakernelMemoryError> {
356    let node_bytes = checked_mul(graph.node_count, bytes_per_node, "node layout bytes")?;
357    let edge_bytes = checked_mul(graph.edge_count, bytes_per_edge, "edge layout bytes")?;
358    let graph_bytes = checked_add(node_bytes, edge_bytes, "graph layout bytes")?;
359    let topology_scratch_bytes = topology_scratch_bytes(topology, scratch_bytes)?;
360    let required_without_output =
361        checked_add(graph_bytes, frontier_bytes, "graph plus frontier bytes")?;
362    let required_without_output = checked_add(
363        required_without_output,
364        topology_scratch_bytes,
365        "scratch bytes",
366    )?;
367    let required_bytes = checked_add(required_without_output, output_bytes, "output bytes")?;
368    if required_bytes > budget_bytes {
369        return Err(MegakernelMemoryError::OverBudget {
370            topology,
371            required_bytes,
372            budget_bytes,
373            node_count: graph.node_count,
374            edge_count: graph.edge_count,
375        });
376    }
377    Ok(MegakernelMemoryPlan {
378        graph_bytes,
379        frontier_bytes,
380        scratch_bytes: topology_scratch_bytes,
381        output_bytes,
382        required_bytes,
383        budget_bytes,
384        memory_pressure_bps: pressure_bps(required_bytes, budget_bytes),
385    })
386}
387
388/// Select a megakernel topology and validate its device-memory plan.
389pub fn plan_megakernel_execution(
390    sample: MegakernelExecutionSample,
391    graph: MegakernelGraphShape,
392    bytes_per_node: u64,
393    bytes_per_edge: u64,
394    frontier_bytes: u64,
395    scratch_bytes: u64,
396    output_bytes: u64,
397    budget_bytes: u64,
398    launch_overhead_ns: f64,
399    fusion_pressure: f64,
400) -> Result<MegakernelExecutionPlan, MegakernelMemoryError> {
401    let sparse_memory = plan_megakernel_memory_budget(
402        MegakernelExecutionTopology::SparseFrontier,
403        graph,
404        bytes_per_node,
405        bytes_per_edge,
406        frontier_bytes,
407        scratch_bytes,
408        output_bytes,
409        budget_bytes,
410    )?;
411    let decision = select_megakernel_topology(
412        sample,
413        graph,
414        MegakernelMemoryBudget {
415            required_bytes: sparse_memory.required_bytes,
416            budget_bytes,
417        },
418        launch_overhead_ns,
419        fusion_pressure,
420    );
421    match plan_megakernel_memory_budget(
422        decision.topology,
423        graph,
424        bytes_per_node,
425        bytes_per_edge,
426        frontier_bytes,
427        scratch_bytes,
428        output_bytes,
429        budget_bytes,
430    ) {
431        Ok(memory) => Ok(MegakernelExecutionPlan {
432            topology: decision.topology,
433            memory,
434            downgraded_to_sparse: false,
435        }),
436        Err(MegakernelMemoryError::OverBudget { .. })
437            if decision.topology != MegakernelExecutionTopology::SparseFrontier =>
438        {
439            Ok(MegakernelExecutionPlan {
440                topology: MegakernelExecutionTopology::SparseFrontier,
441                memory: sparse_memory,
442                downgraded_to_sparse: true,
443            })
444        }
445        Err(error) => Err(error),
446    }
447}
448
449fn finite_unit(value: f64) -> f64 {
450    if value.is_finite() {
451        value.clamp(0.0, 1.0)
452    } else {
453        0.0
454    }
455}
456
457fn pressure_bps(numerator: u64, denominator: u64) -> u32 {
458    let clamped = pressure_bps_u64(numerator, denominator).min(10_000);
459    match u32::try_from(clamped) {
460        Ok(value) => value,
461        Err(error) => {
462            tracing::error!(
463                "megakernel pressure conversion failed after clamping value {clamped}: {error}. Fix: inspect ratio/clamp invariants before topology selection."
464            );
465            10_000
466        }
467    }
468}
469
470fn pressure_bps_u64(numerator: u64, denominator: u64) -> u64 {
471    crate::numeric::ratio_basis_points_u64_wide(
472        numerator,
473        denominator,
474        if numerator == 0 { 0 } else { u64::MAX },
475        "megakernel scheduler pressure",
476        "megakernel execution",
477    )
478}
479
480fn finite_ratio_bps(numerator: f64, denominator: f64, label: &'static str) -> u32 {
481    crate::numeric::finite_f64_ratio_basis_points_round(
482        numerator,
483        denominator,
484        u32::MAX,
485        u32::MAX,
486        label,
487        "megakernel execution",
488    )
489}
490
491fn checked_bps_sub(value: u32, margin: u32, label: &'static str) -> u32 {
492    if let Some(result) = value.checked_sub(margin) {
493        return result;
494    }
495    tracing::error!(
496        "megakernel {label} underflowed basis-point threshold. Fix: configure hysteresis below the threshold."
497    );
498    0
499}
500
501fn topology_scratch_bytes(
502    topology: MegakernelExecutionTopology,
503    base_scratch_bytes: u64,
504) -> Result<u64, MegakernelMemoryError> {
505    match topology {
506        MegakernelExecutionTopology::WarpSparseFrontier => Ok(base_scratch_bytes.max(32)),
507        MegakernelExecutionTopology::SparseFrontier => Ok(base_scratch_bytes),
508        MegakernelExecutionTopology::BlockDenseFrontier => checked_mul(
509            base_scratch_bytes.max(1024),
510            2,
511            "block dense topology scratch bytes",
512        ),
513        MegakernelExecutionTopology::DenseFrontier => {
514            checked_mul(base_scratch_bytes, 2, "dense topology scratch bytes")
515        }
516        MegakernelExecutionTopology::HybridFrontier => {
517            checked_mul(base_scratch_bytes, 3, "hybrid topology scratch bytes")
518        }
519        MegakernelExecutionTopology::FusedWave => {
520            checked_mul(base_scratch_bytes, 4, "fused topology scratch bytes")
521        }
522    }
523}
524
525fn checked_add(lhs: u64, rhs: u64, field: &'static str) -> Result<u64, MegakernelMemoryError> {
526    lhs.checked_add(rhs)
527        .ok_or(MegakernelMemoryError::ByteCountOverflow { field })
528}
529
530fn checked_mul(lhs: u64, rhs: u64, field: &'static str) -> Result<u64, MegakernelMemoryError> {
531    lhs.checked_mul(rhs)
532        .ok_or(MegakernelMemoryError::ByteCountOverflow { field })
533}
534
535#[cfg(test)]
536mod tests {
537    use super::{
538        plan_megakernel_execution, plan_megakernel_memory_budget, select_megakernel_topology,
539        select_megakernel_topology_stable, MegakernelExecutionSample, MegakernelExecutionTopology,
540        MegakernelGraphShape, MegakernelMemoryBudget, MegakernelMemoryError,
541    };
542
543    #[test]
544    fn topology_selector_uses_sparse_dense_hybrid_and_fused_bands() {
545        let graph = MegakernelGraphShape {
546            node_count: 1_000,
547            edge_count: 4_000,
548        };
549        let memory = MegakernelMemoryBudget {
550            required_bytes: 1_000,
551            budget_bytes: 10_000,
552        };
553        let warp_sparse = select_megakernel_topology(
554            MegakernelExecutionSample {
555                dispatch_cost_ns: 1_000.0,
556                frontier_density: 0.01,
557                readback_bytes: 256,
558            },
559            graph,
560            memory,
561            100.0,
562            0.0,
563        );
564        assert_eq!(
565            warp_sparse.topology,
566            MegakernelExecutionTopology::WarpSparseFrontier
567        );
568        assert_eq!(
569            warp_sparse.stable_explanation(),
570            "megakernel-topology-v1|topology=WarpSparseFrontier|memory_pressure_bps=1000|average_degree_bps=40000|launch_pressure_bps=1000|reason=ultra_sparse_warp_specialized"
571        );
572
573        let block_dense = select_megakernel_topology(
574            MegakernelExecutionSample {
575                dispatch_cost_ns: 1_000.0,
576                frontier_density: 0.90,
577                readback_bytes: 512,
578            },
579            graph,
580            memory,
581            100.0,
582            0.0,
583        );
584        assert_eq!(
585            block_dense.topology,
586            MegakernelExecutionTopology::BlockDenseFrontier
587        );
588
589        let hybrid = select_megakernel_topology(
590            MegakernelExecutionSample {
591                dispatch_cost_ns: 1_000.0,
592                frontier_density: 0.35,
593                readback_bytes: 512,
594            },
595            graph,
596            memory,
597            100.0,
598            0.0,
599        );
600        assert_eq!(hybrid.topology, MegakernelExecutionTopology::HybridFrontier);
601
602        let fused = select_megakernel_topology(
603            MegakernelExecutionSample {
604                dispatch_cost_ns: 1_000.0,
605                frontier_density: 0.50,
606                readback_bytes: 1 << 20,
607            },
608            graph,
609            memory,
610            250.0,
611            0.90,
612        );
613        assert_eq!(fused.topology, MegakernelExecutionTopology::FusedWave);
614        assert_eq!(fused.launch_pressure_bps, 2_500);
615    }
616
617    #[test]
618    fn stable_topology_selector_prevents_variant_flapping_near_thresholds() {
619        let graph = MegakernelGraphShape {
620            node_count: 1_000,
621            edge_count: 4_000,
622        };
623        let memory = MegakernelMemoryBudget {
624            required_bytes: 1_000,
625            budget_bytes: 10_000,
626        };
627        let sparse_to_hybrid = select_megakernel_topology_stable(
628            MegakernelExecutionSample {
629                dispatch_cost_ns: 1_000.0,
630                frontier_density: 0.14,
631                readback_bytes: 512,
632            },
633            graph,
634            memory,
635            100.0,
636            0.0,
637            MegakernelExecutionTopology::SparseFrontier,
638        );
639        assert_eq!(
640            sparse_to_hybrid.topology,
641            MegakernelExecutionTopology::SparseFrontier
642        );
643    }
644
645    #[test]
646    fn memory_planner_bounds_peak_bytes_by_topology() {
647        let graph = MegakernelGraphShape {
648            node_count: 1_000,
649            edge_count: 4_000,
650        };
651        let plan = plan_megakernel_memory_budget(
652            MegakernelExecutionTopology::FusedWave,
653            graph,
654            16,
655            8,
656            4_096,
657            2_048,
658            512,
659            128 * 1024,
660        )
661        .expect("Fix: valid fused plan should fit the explicit device-memory budget");
662
663        assert_eq!(plan.graph_bytes, 48_000);
664        assert_eq!(plan.scratch_bytes, 8_192);
665        assert_eq!(plan.required_bytes, 60_800);
666        assert!(plan.memory_pressure_bps > 0);
667    }
668
669    #[test]
670    fn memory_planner_rejects_budget_and_overflow_failures() {
671        let graph = MegakernelGraphShape {
672            node_count: 1_000,
673            edge_count: 4_000,
674        };
675        let err = plan_megakernel_memory_budget(
676            MegakernelExecutionTopology::DenseFrontier,
677            graph,
678            16,
679            8,
680            4_096,
681            2_048,
682            512,
683            32 * 1024,
684        )
685        .expect_err("over-budget dense plan must fail before allocation");
686        assert!(matches!(
687            err,
688            MegakernelMemoryError::OverBudget {
689                topology: MegakernelExecutionTopology::DenseFrontier,
690                ..
691            }
692        ));
693        assert!(err.to_string().contains("Fix: choose a sparse topology"));
694
695        let overflow = plan_megakernel_memory_budget(
696            MegakernelExecutionTopology::SparseFrontier,
697            MegakernelGraphShape {
698                node_count: u64::MAX,
699                edge_count: 0,
700            },
701            2,
702            0,
703            0,
704            0,
705            0,
706            u64::MAX,
707        )
708        .expect_err("overflowing graph byte count must be rejected");
709        assert!(matches!(
710            overflow,
711            MegakernelMemoryError::ByteCountOverflow {
712                field: "node layout bytes"
713            }
714        ));
715    }
716
717    #[test]
718    fn generated_execution_plans_never_exceed_budget_or_hide_overflow() {
719        let mut state = 0x4d59_5df4_d0f3_3173_u64;
720        for case_index in 0..1024usize {
721            let node_count = 1 + next_u64(&mut state) % 8_192;
722            let edge_count = node_count + next_u64(&mut state) % 65_536;
723            let bytes_per_node = 1 + next_u64(&mut state) % 64;
724            let bytes_per_edge = 1 + next_u64(&mut state) % 32;
725            let frontier_bytes = next_u64(&mut state) % 65_536;
726            let scratch_bytes = next_u64(&mut state) % 16_384;
727            let output_bytes = next_u64(&mut state) % 8_192;
728            let budget_bytes = 64 * 1024 + next_u64(&mut state) % (4 * 1024 * 1024);
729            let sample = MegakernelExecutionSample {
730                dispatch_cost_ns: 100.0 + (next_u64(&mut state) % 10_000) as f64,
731                frontier_density: (next_u64(&mut state) % 10_001) as f64 / 10_000.0,
732                readback_bytes: next_u64(&mut state) % (1 << 20),
733            };
734
735            let result = plan_megakernel_execution(
736                sample,
737                MegakernelGraphShape {
738                    node_count,
739                    edge_count,
740                },
741                bytes_per_node,
742                bytes_per_edge,
743                frontier_bytes,
744                scratch_bytes,
745                output_bytes,
746                budget_bytes,
747                250.0,
748                0.85,
749            );
750            match result {
751                Ok(plan) => {
752                    assert!(
753                        plan.memory.required_bytes <= plan.memory.budget_bytes,
754                        "case {case_index}"
755                    );
756                    assert!(plan.memory.memory_pressure_bps <= 10_000);
757                }
758                Err(MegakernelMemoryError::OverBudget {
759                    required_bytes,
760                    budget_bytes,
761                    ..
762                }) => assert!(required_bytes > budget_bytes, "case {case_index}"),
763                Err(MegakernelMemoryError::ByteCountOverflow { .. }) => {}
764            }
765        }
766    }
767
768    fn next_u64(state: &mut u64) -> u64 {
769        *state = state
770            .wrapping_mul(6_364_136_223_846_793_005)
771            .wrapping_add(1_442_695_040_888_963_407);
772        *state
773    }
774}