vyre_runtime/megakernel/planner/caps.rs
1//! Megakernel backend capability and report types.
2
3use std::time::Duration;
4
5use super::super::policy::{
6 MegakernelDispatchTopology, MegakernelExecutionMode, MegakernelQueuePressure,
7};
8
/// Capabilities surfaced by megakernel-aware backends.
///
/// A small `Copy` value pairing a support flag with a worker ceiling.
/// Construct it with [`MegakernelCaps::unsupported`] or
/// [`MegakernelCaps::supported`]. Equality is field-wise, so two capability
/// snapshots can be compared directly, matching the other value types in
/// this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MegakernelCaps {
    /// Whether the backend implements a megakernel path.
    pub supported: bool,
    /// Maximum worker-count ceiling the backend accepts.
    pub max_worker_count: u32,
}
17
18impl MegakernelCaps {
19 /// Unsupported - every method returns an explicit error.
20 #[must_use]
21 pub const fn unsupported() -> Self {
22 Self {
23 supported: false,
24 max_worker_count: 0,
25 }
26 }
27
28 /// Declare supported with the given worker ceiling.
29 #[must_use]
30 pub const fn supported(max_worker_count: u32) -> Self {
31 Self {
32 supported: true,
33 max_worker_count,
34 }
35 }
36}
37
/// One work-queue item the megakernel worker consumes.
///
/// Declared `#[repr(C)]` with four `u32` fields, so the value is exactly
/// four consecutive 32-bit words (16 bytes, no padding). The
/// `bytemuck::Pod` / `bytemuck::Zeroable` derives allow it to be viewed as
/// raw bytes and make the all-zero bit pattern a valid value. Field order is
/// part of that layout - do not reorder fields.
// NOTE(review): presumably this layout mirrors a device-side definition;
// confirm against the kernel source before changing any field.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, bytemuck::Pod, bytemuck::Zeroable)]
pub struct MegakernelWorkItem {
    /// Stable op id index into the dialect registry.
    pub op_handle: u32,
    /// Input-buffer handle.
    pub input_handle: u32,
    /// Output-buffer handle.
    pub output_handle: u32,
    /// Optional per-item parameter word. Its interpretation is not defined
    /// in this file.
    pub param: u32,
}
51
/// Production counters from one megakernel dispatch.
///
/// A plain `Copy` record combining transfer/launch counters, cache-hit
/// flags, and the launch-policy selections (topology, queue pressure,
/// execution mode, capacities) that applied to the dispatch. Fields named
/// `*_bps` are expressed in basis points, where 10000 bps = 100%.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MegakernelTelemetry {
    /// Bytes uploaded across control, ring, debug, and IO inputs.
    pub bytes_uploaded: u64,
    /// Bytes read back across all megakernel output buffers.
    pub bytes_read_back: u64,
    /// Total host/device transfer bytes attributable to this dispatch.
    // NOTE(review): presumably `bytes_uploaded + bytes_read_back`; confirm
    // with the code that fills this record.
    pub bytes_moved: u64,
    /// Resident input allocations performed before dispatch.
    pub resident_allocations: u32,
    /// Kernel launches issued for this logical dispatch.
    pub kernel_launches: u32,
    /// Host-visible synchronization/readback wait points.
    pub sync_points: u32,
    /// Approximate lane occupancy in basis points, capped at 10000.
    pub occupancy_proxy_bps: u16,
    /// Active queue/frontier density in basis points, capped at 10000.
    pub frontier_density_bps: u16,
    /// Number of output buffers read back from the backend.
    pub readback_buffers: u32,
    /// True when the direct dispatch reused a compiled megakernel pipeline.
    pub compiled_pipeline_cache_hit: bool,
    /// True when the direct dispatch reused resident input resources.
    pub resident_input_cache_hit: bool,
    /// Scale-aware topology selected by the launch policy.
    pub topology: MegakernelDispatchTopology,
    /// Queue pressure classification selected by the launch policy.
    pub pressure: MegakernelQueuePressure,
    /// Interpreter or JIT route selected by launch policy telemetry.
    pub execution_mode: MegakernelExecutionMode,
    /// Sparse-hit capacity selected by the launch policy.
    pub hit_capacity: u32,
    /// Estimated peak device bytes for the selected launch plan.
    pub estimated_peak_device_bytes: u64,
    /// Hard device-memory budget applied to the launch. Zero means unbounded.
    pub device_memory_budget_bytes: u64,
}
90
91impl Default for MegakernelTelemetry {
92 fn default() -> Self {
93 Self {
94 bytes_uploaded: 0,
95 bytes_read_back: 0,
96 bytes_moved: 0,
97 resident_allocations: 0,
98 kernel_launches: 0,
99 sync_points: 0,
100 occupancy_proxy_bps: 0,
101 frontier_density_bps: 0,
102 readback_buffers: 0,
103 compiled_pipeline_cache_hit: false,
104 resident_input_cache_hit: false,
105 topology: MegakernelDispatchTopology::Empty,
106 pressure: MegakernelQueuePressure::Empty,
107 execution_mode: MegakernelExecutionMode::Interpreter,
108 hit_capacity: 0,
109 estimated_peak_device_bytes: 0,
110 device_memory_budget_bytes: 0,
111 }
112 }
113}
114
/// Summary stats from one megakernel run.
///
/// Combines item counts, host-observed timings, the per-dispatch
/// [`MegakernelTelemetry`] counters, and optional region lineage. The derived
/// `Default` yields zeroed counters, a zero `wall_time`, default telemetry,
/// and an empty `region_lineage`.
// NOTE(review): the `*_ns` fields are presumably nanoseconds, going by the
// suffix; confirm with the code that records them.
#[derive(Debug, Clone, Default)]
pub struct MegakernelReport {
    /// Items the workers processed before exiting.
    pub items_processed: u64,
    /// Items still queued when `max_wall_time` fired.
    pub items_remaining: u64,
    /// Wall-clock time spent.
    pub wall_time: Duration,
    /// Host-side time spent shaping the queue before publication:
    /// dedupe, fusion planning, and launch-geometry preparation.
    pub queue_plan_ns: u64,
    /// Host-side time spent encoding protocol buffers and publishing
    /// queued work into ring slots.
    pub queue_publish_ns: u64,
    /// Host-observed backend dispatch latency after queue publication.
    pub backend_dispatch_ns: u64,
    /// Host-observed time spent computing optional region lineage after
    /// dispatch. Zero when lineage tracking is skipped.
    pub lineage_ns: u64,
    /// Logical work items removed by queue dedupe before publication.
    pub deduped_items: u64,
    /// Work items actually published into megakernel ring slots.
    pub published_items: u64,
    /// Number of work items included in region lineage tracking.
    pub lineage_items: u64,
    /// Production counters for performance gates and launch tuning.
    pub telemetry: MegakernelTelemetry,
    /// Per-output provenance lineage bitsets, one entry per fused
    /// region in dispatch order. `region_lineage[i]` is a 32-bit set of
    /// source-rule IDs that contributed to fused-region `i`'s output,
    /// computed via the substrate
    /// `vyre_self_substrate::scallop_provenance` Datalog
    /// closure on the rule-derivation graph. Empty `Vec` when
    /// provenance tracking was disabled for the dispatch.
    ///
    /// Lets observability collectors (Tempo, Honeycomb, Prometheus)
    /// attribute every megakernel output back to the source rules
    /// that derived it - without this, fused-region outputs lose
    /// their lineage.
    pub region_lineage: Vec<u32>,
}