Skip to main content

vyre_runtime/megakernel/
mod.rs

1//! Persistent megakernel  -  the GPU becomes a VIR0 bytecode interpreter.
2//!
3//! One dispatch compiles the program; the kernel loops forever, pulling
4//! packed bytecode slots from a host-fed ring buffer and executing each.
5//! The host never re-dispatches  -  it only writes new slots and observes
6//! atomic counters in the control buffer.
7//!
8//! ## Layout
9//!
10//! - `protocol`  -  ring-buffer slot layout, control words, opcodes.
11//! - `handlers`  -  built-in opcode handlers + extension mechanism.
12//! - `builder`  -  IR `Program` construction (interpreted + JIT).
13//! - `execution`  -  compiled persistent-kernel handle and dispatch path.
14//! - `resident`  -  host mirrors for GPU-resident runtime buffers.
15//! - `readback`  -  strict output-buffer decoding after dispatch.
16//! - `recovery`  -  device-loss classification and pipeline rebuild.
17//!
18//! ## Coordination protocol
19//!
20//! 1. Read `control[SHUTDOWN]`; if non-zero, `Node::Return`.
21//! 2. Read this slot's `status`; skip idle slots without tenant metadata loads.
22//! 3. If PUBLISHED, read `tenant_id`; authorize via tenant-mask table.
23//! 4. CAS `ring_buffer[status]` from PUBLISHED → CLAIMED.
24//! 5. Dispatch on opcode through If-tree (or JIT fused body).
25//! 6. `atomic_add(control[DONE_COUNT], 1)`.
26//! 7. Store DONE into the status word.
27
28#[cfg(feature = "megakernel-batch")]
29pub mod advanced;
30pub mod automata_worklist;
31pub mod builder;
32pub mod descriptor;
33pub mod execution;
34pub mod handlers;
35pub mod io;
36pub mod ir_util;
37pub mod mixed_work;
38pub mod planner;
39pub mod policy;
40pub mod protocol;
41mod protocol_api;
42pub mod readback;
43pub mod recovery;
44pub mod resident;
45pub mod ring;
46#[cfg(feature = "megakernel-batch")]
47pub mod rule_catalog;
48pub mod scaling;
49pub mod scheduler;
50pub mod speculation;
51mod staging_reserve;
52pub mod task;
53pub mod telemetry;
54pub mod workspace_adapter;
55pub mod workspace_layout;
56
57use vyre_driver::backend::BackendError;
58
59// Re-export protocol constants at the megakernel level for back-compat.
60pub use automata_worklist::{
61    AutomataStateIndex, AutomataWorklistEvidence, AutomataWorklistMode, AutomataWorklistPolicy,
62    AutomataWorklistRecommendation, AutomataWorklistRequest,
63    AUTOMATA_WORKLIST_EVIDENCE_SCHEMA_VERSION,
64};
65pub use builder::{
66    build_program, build_program_jit, build_program_jit_slots, build_program_priority,
67    build_program_priority_slots, build_program_sharded, build_program_sharded_no_io,
68    build_program_sharded_once_slots, build_program_sharded_once_slots_control_report_shared,
69    build_program_sharded_once_slots_shared, build_program_sharded_slots,
70    build_program_sharded_slots_shared, build_program_sharded_with_io_polling,
71    build_program_sharded_with_workspace_adapter, persistent_body, persistent_body_jit,
72    persistent_body_priority, persistent_body_priority_slots,
73    try_build_program_with_self_loading_miss_handler,
74};
75#[cfg(any(test, feature = "legacy-infallible"))]
76pub use builder::build_program_with_self_loading_miss_handler;
77pub use descriptor::{
78    BatchDescriptor, BuiltinOpcode, PackedOpDescriptor, SlotDescriptor, SlotOpcode, WindowClass,
79    WindowDescriptor,
80};
81pub use execution::{
82    Megakernel, MegakernelDispatchOutput, MegakernelDispatchStats, MegakernelResidentHandles,
83};
84pub use handlers::OpcodeHandler;
85pub use io::{IoCompletion, IoRequest, MegakernelIoQueue, IO_SLOT_COUNT, IO_SLOT_WORDS};
86pub use mixed_work::{
87    mixed_work_protocol_evidence, validate_mixed_work_protocol, MixedWorkProtocolError,
88    MixedWorkProtocolEvidence, MixedWorkProtocolPlan, MixedWorkQueueClass, MixedWorkUnit,
89    MixedWorkUnitType, OutputSlabId, ResidentArtifactId, MIXED_WORK_PROTOCOL_SCHEMA_VERSION,
90};
91#[cfg(feature = "self-substrate-adapters")]
92pub use planner::{
93    build_bellman_tn_order_program, build_kfac_autotune_step_program,
94    build_persistent_fixpoint_program, build_scallop_lineage_with_scratch,
95    build_scallop_provenance_wide_program, build_sinkhorn_clustering_program,
96    build_sinkhorn_full_clustering_program,
97};
98pub use planner::{
99    build_scallop_lineage_with_program_and_scratch, default_worker_groups_from_limits,
100    dispatch_grid_for, padded_slot_count, plan_compact_fusion_into,
101    select_fused_subset, select_fused_subset_compact, select_fused_subset_compact_into,
102    select_fused_subset_into, select_fused_subset_with_rate, select_optimal_fused_subset,
103    try_detect_cross_arm_redundancy, try_prune_redundant_work_items_into,
104    try_prune_redundant_work_items_with_scratch_into, worker_workgroup_size,
105    CompactFusionPlanningScratch, CrossArmRedundancy, FusionSelectionScratch, MegakernelCaps,
106    MegakernelConfig, MegakernelGridLimits, MegakernelGridPlan, MegakernelGridRequest,
107    MegakernelLaunchGeometry, MegakernelReport, MegakernelSizingPolicy, MegakernelTelemetry,
108    MegakernelWorkItem, MegakernelWorkloadHints, RedundantWorkItemPruneScratch,
109};
110#[cfg(any(test, feature = "legacy-infallible"))]
111pub use planner::{prune_redundant_work_items_into, prune_redundant_work_items_with_scratch_into};
112#[cfg(any(test, feature = "legacy-infallible"))]
113pub use policy::{diffuse_priority_across_siblings, diffuse_priority_across_siblings_into};
114pub use policy::{
115    try_diffuse_priority_across_siblings, try_diffuse_priority_across_siblings_into,
116    MegakernelDispatchTopology, MegakernelExecutionMode, MegakernelGraphBlasSwitchClass,
117    MegakernelLaunchCacheStats, MegakernelLaunchPolicy, MegakernelLaunchRecommendation,
118    MegakernelLaunchRequest, MegakernelPromotionEvidence, MegakernelPromotionRoute,
119    MegakernelQueuePressure, MegakernelTopologyEvidence, PriorityDrainReason,
120    PriorityDrainRecommendation, PriorityRequeueAccounting, HOT_WINDOW_PROMOTION_EVIDENCE_SCHEMA_VERSION,
121    PRIORITY_COUNTER_DRAIN_FIX, PRIORITY_COUNTER_DRAIN_HEADROOM, TOPOLOGY_EVIDENCE_SCHEMA_VERSION,
122};
123pub use protocol::{
124    control, control_byte_len, count_done_ring_slots, debug, debug_log_byte_len, encode_control,
125    encode_empty_debug_log, encode_empty_ring, opcode, read_debug_log, read_done_count, read_epoch,
126    read_metrics, read_observable, ring_byte_len, slot, try_count_done_ring_slots,
127    try_encode_control, try_encode_control_into, try_encode_empty_debug_log,
128    try_encode_empty_debug_log_into, try_encode_empty_ring, try_encode_empty_ring_into,
129    try_read_debug_log, try_read_done_count, try_read_epoch, try_read_metrics, try_read_observable,
130    DebugRecord, ProtocolError, ARG0_WORD, ARGS_PER_SLOT, CONTROL_MIN_WORDS, OPCODE_WORD,
131    PRIORITY_WORD, SLOT_WORDS, STATUS_WORD, TENANT_WORD,
132};
133pub use protocol_api::RingSlotTransition;
134pub use readback::{MegakernelReadback, MegakernelReadbackCounters};
135pub use telemetry::{
136    MegakernelRuntimeEvidence, RuntimeEvidenceMetricCoverage, RuntimeEvidenceMetricFamily,
137    TelemetryDecodeCapacityEvidence, TelemetryDecodeScratch, RUNTIME_IO_EVIDENCE_SCHEMA_VERSION,
138    TELEMETRY_DECODE_CAPACITY_SCHEMA_VERSION,
139};
140pub use recovery::{
141    backend_error_indicates_device_loss, MegakernelRecoveryDecision, MegakernelRecoveryPolicy,
142};
143pub use resident::{MegakernelResidentBuffers, MegakernelResidentDispatchScratch};
144#[cfg(feature = "megakernel-batch")]
145pub use rule_catalog::{BatchRuleProgram, BatchRuleRejection};
146pub use scheduler::{
147    default_priority_offsets, priority_partition_active_lane_count,
148    priority_partition_probe_budget, priority_partition_probe_count, priority_scan_body,
149    priority_scan_body_with_stride, try_default_priority_offsets, write_default_priority_offsets,
150};
151pub use speculation::{PairedSpeculationSample, PairedSpeculationUpdate, PairedSpeculationWindow};
152pub use task::{TaskPriority, TaskQueueSnapshot, TaskState, TaskWorkItem};
153pub use telemetry::{
154    ControlSnapshot, CountMinSketch, MegakernelRuntimeCounters, RingOccupancy, RingSlotSnapshot,
155    RingStatus, RingTelemetry, SketchTelemetry, WindowTelemetry,
156};
157pub use workspace_adapter::MegakernelWorkspaceAdapter;
158pub use workspace_layout::{
159    build_workspace_regions, first_workspace_region, next_record_workspace_region,
160    next_workspace_region, workspace_record_words, MegakernelWorkspaceLayoutError,
161    MegakernelWorkspaceRegion, MegakernelWorkspaceRegionSpec,
162};
163/// Backend-neutral megakernel dispatch contract.
164pub trait MegakernelDispatch {
165    /// Drain the requested megakernel dispatch.
166    fn dispatch_megakernel(
167        &self,
168        work_queue: &[MegakernelWorkItem],
169        config: &MegakernelConfig,
170    ) -> Result<MegakernelReport, BackendError>;
171}