1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//! Megakernel backend capability and report types.
use std::time::Duration;
use super::super::policy::{
MegakernelDispatchTopology, MegakernelExecutionMode, MegakernelQueuePressure,
};
/// Capabilities surfaced by megakernel-aware backends.
///
/// This is a small `Copy` value. It derives `PartialEq`/`Eq` like the other
/// value types in this module, so capability sets can be compared directly
/// (for example in `assert_eq!` or when checking whether caps changed).
///
/// Construct it with [`MegakernelCaps::unsupported`] or
/// [`MegakernelCaps::supported`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MegakernelCaps {
    /// Whether the backend implements a megakernel path.
    pub supported: bool,
    /// Maximum worker-count ceiling the backend accepts.
    ///
    /// [`MegakernelCaps::unsupported`] sets this to `0`.
    pub max_worker_count: u32,
}
impl MegakernelCaps {
/// Unsupported - every method returns an explicit error.
#[must_use]
pub const fn unsupported() -> Self {
Self {
supported: false,
max_worker_count: 0,
}
}
/// Declare supported with the given worker ceiling.
#[must_use]
pub const fn supported(max_worker_count: u32) -> Self {
Self {
supported: true,
max_worker_count,
}
}
}
/// One work-queue item the megakernel worker consumes.
///
/// `#[repr(C)]` together with the `bytemuck::Pod` derive pins the layout to
/// four consecutive `u32` words (16 bytes, no padding), which lets the item
/// be cast to and from raw bytes with `bytemuck`. Field order and field
/// types are therefore part of the layout: do not reorder, resize, or insert
/// fields without updating whatever reads these words on the other side.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, bytemuck::Pod, bytemuck::Zeroable)]
pub struct MegakernelWorkItem {
/// Stable op id index into the dialect registry.
pub op_handle: u32,
/// Input-buffer handle.
pub input_handle: u32,
/// Output-buffer handle.
pub output_handle: u32,
/// Optional per-item parameter word.
// NOTE(review): there is no `Option` here, so "optional" presumably means
// the word is ignored by ops that take no parameter; confirm the sentinel.
pub param: u32,
}
/// Production counters recorded for a single megakernel dispatch.
///
/// The fields fall into three groups: transfer and launch counters, cache-hit
/// flags, and the selections made by the launch policy. The type is `Copy`
/// and comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MegakernelTelemetry {
    /// Upload volume in bytes, summed over control, ring, debug, and IO inputs.
    pub bytes_uploaded: u64,
    /// Readback volume in bytes, summed over every megakernel output buffer.
    pub bytes_read_back: u64,
    /// Total host/device transfer bytes attributed to this dispatch.
    pub bytes_moved: u64,
    /// Count of resident input allocations made before the dispatch.
    pub resident_allocations: u32,
    /// Count of kernel launches issued for this one logical dispatch.
    pub kernel_launches: u32,
    /// Count of host-visible synchronization/readback wait points.
    pub sync_points: u32,
    /// Approximate lane occupancy in basis points (10000 = 100%), capped at
    /// 10000.
    pub occupancy_proxy_bps: u16,
    /// Density of the active queue/frontier in basis points (10000 = 100%),
    /// capped at 10000.
    pub frontier_density_bps: u16,
    /// How many output buffers were read back from the backend.
    pub readback_buffers: u32,
    /// `true` if the direct dispatch reused a compiled megakernel pipeline.
    pub compiled_pipeline_cache_hit: bool,
    /// `true` if the direct dispatch reused resident input resources.
    pub resident_input_cache_hit: bool,
    /// Scale-aware topology chosen by the launch policy.
    pub topology: MegakernelDispatchTopology,
    /// Queue-pressure classification chosen by the launch policy.
    pub pressure: MegakernelQueuePressure,
    /// Interpreter or JIT route reported by launch-policy telemetry.
    pub execution_mode: MegakernelExecutionMode,
    /// Sparse-hit capacity chosen by the launch policy.
    pub hit_capacity: u32,
    /// Estimated peak device memory, in bytes, for the chosen launch plan.
    pub estimated_peak_device_bytes: u64,
    /// Hard device-memory budget, in bytes, applied to the launch.
    /// A value of zero means the launch is unbounded.
    pub device_memory_budget_bytes: u64,
}
impl Default for MegakernelTelemetry {
fn default() -> Self {
Self {
bytes_uploaded: 0,
bytes_read_back: 0,
bytes_moved: 0,
resident_allocations: 0,
kernel_launches: 0,
sync_points: 0,
occupancy_proxy_bps: 0,
frontier_density_bps: 0,
readback_buffers: 0,
compiled_pipeline_cache_hit: false,
resident_input_cache_hit: false,
topology: MegakernelDispatchTopology::Empty,
pressure: MegakernelQueuePressure::Empty,
execution_mode: MegakernelExecutionMode::Interpreter,
hit_capacity: 0,
estimated_peak_device_bytes: 0,
device_memory_budget_bytes: 0,
}
}
}
/// Summary statistics for a single megakernel run.
///
/// Combines progress counts, host-side phase timings, queue statistics, the
/// dispatch [`MegakernelTelemetry`], and optional per-region lineage.
///
/// The derived `Default` zeroes every counter and timing, uses
/// `MegakernelTelemetry::default()` for `telemetry`, and leaves
/// `region_lineage` empty.
///
/// NOTE(review): the `_ns` suffix on the timing fields suggests nanoseconds;
/// confirm the unit where the values are written.
#[derive(Debug, Clone, Default)]
pub struct MegakernelReport {
    /// Number of items the workers processed before they exited.
    pub items_processed: u64,
    /// Number of items still queued when `max_wall_time` fired.
    pub items_remaining: u64,
    /// Wall-clock time spent on the run.
    pub wall_time: Duration,
    /// Host-side time spent shaping the queue before it is published:
    /// dedupe, fusion planning, and launch-geometry preparation.
    pub queue_plan_ns: u64,
    /// Host-side time spent encoding protocol buffers and publishing the
    /// queued work into ring slots.
    pub queue_publish_ns: u64,
    /// Backend dispatch latency as observed by the host, measured after
    /// queue publication.
    pub backend_dispatch_ns: u64,
    /// Host-observed time spent computing optional region lineage after the
    /// dispatch. Zero when lineage tracking is skipped.
    pub lineage_ns: u64,
    /// Logical work items that queue dedupe removed before publication.
    pub deduped_items: u64,
    /// Work items that were actually published into megakernel ring slots.
    pub published_items: u64,
    /// Work items that were included in region lineage tracking.
    pub lineage_items: u64,
    /// Production counters used for performance gates and launch tuning.
    pub telemetry: MegakernelTelemetry,
    /// Per-output provenance lineage bitsets, one entry per fused region in
    /// dispatch order.
    ///
    /// `region_lineage[i]` is a 32-bit set of the source-rule IDs that
    /// contributed to the output of fused region `i`. It is computed via the
    /// substrate `vyre_self_substrate::scallop_provenance` Datalog closure
    /// on the rule-derivation graph. The `Vec` is empty when provenance
    /// tracking was disabled for the dispatch.
    ///
    /// This lets observability collectors (Tempo, Honeycomb, Prometheus)
    /// attribute every megakernel output back to the source rules that
    /// derived it. Without it, fused-region outputs lose their lineage.
    pub region_lineage: Vec<u32>,
}