rlx_runtime/
compiled.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Compiled graph — the hot-path execution object.
17
18use crate::backend::ExecutableGraph;
19use rlx_driver::Device;
20
/// A compiled graph ready for execution.
///
/// Created by [`crate::Session::compile`]. Holds the fused + memory-planned
/// graph and all pre-allocated execution state. Call
/// [`CompiledGraph::run`] repeatedly with different inputs — zero
/// allocation per call.
///
/// NOTE(review): `run` returns owned `Vec<f32>` copies of the outputs, so
/// the "zero allocation" claim presumably refers to backend execution state
/// only — confirm, or point callers at the slot/raw entry points instead.
pub struct CompiledGraph {
    /// Backend-specific executable that the execution methods delegate to.
    inner: Box<dyn ExecutableGraph>,
    /// Device tag supplied at construction; reported by `device()` and used
    /// to reject cross-device copies between graphs.
    device: Device,
}
31
32impl Clone for CompiledGraph {
33    /// Deep-clones the underlying executable via `ExecutableGraph::clone_box`.
34    /// Backends that don't support cloning will panic at this point.
35    fn clone(&self) -> Self {
36        Self {
37            inner: self.inner.clone_box(),
38            device: self.device,
39        }
40    }
41}
42
43impl CompiledGraph {
44    pub(crate) fn new(inner: Box<dyn ExecutableGraph>, device: Device) -> Self {
45        Self { inner, device }
46    }
47
48    /// Which device this graph runs on.
49    pub fn device(&self) -> Device {
50        self.device
51    }
52
53    /// Set a named parameter (model weight).
54    /// Call once per parameter after compilation.
55    pub fn set_param(&mut self, name: &str, data: &[f32]) {
56        self.inner.set_param(name, data);
57    }
58
59    /// Execute the graph with named inputs.
60    /// Returns one `Vec<f32>` per graph output (copies from arena).
61    pub fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
62        self.inner.run(inputs)
63    }
64
65    /// Run and read back only selected outputs (logits-only decode on MLX).
66    pub fn run_read_outputs(
67        &mut self,
68        inputs: &[(&str, &[f32])],
69        read_indices: Option<&[usize]>,
70    ) -> Vec<Vec<f32>> {
71        self.inner.run_read_outputs(inputs, read_indices)
72    }
73
74    /// Read one row from a row-major output tensor after a forward pass.
75    pub fn read_output_row(
76        &self,
77        out_idx: usize,
78        row: usize,
79        row_inner: usize,
80    ) -> Option<Vec<f32>> {
81        self.inner.read_output_row(out_idx, row, row_inner)
82    }
83
84    /// Execute and return raw pointers to output data (zero-copy).
85    /// Data is valid until the next `run`/`run_raw` call.
86    ///
87    /// # Safety
88    /// The returned pointers point into the arena. Do not use after
89    /// the next call to run/run_raw (arena data will be overwritten).
90    pub fn run_raw(&mut self, inputs: &[(&str, &[f32])]) -> Vec<(*const f32, usize)> {
91        self.inner.run_raw(inputs)
92    }
93
94    /// Fastest execution: inputs by slot index (order matches graph input declaration).
95    /// Returns output (offset, len) pairs. Read data via `arena_ptr().add(offset)`.
96    /// Zero HashMap lookup, zero Vec allocation, zero name matching.
97    pub fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
98        self.inner.run_slots(inputs)
99    }
100
101    /// Arena pointer for reading output data after `run_slots`.
102    pub fn arena_ptr(&self) -> *const u8 {
103        self.inner.arena_ptr()
104    }
105
106    /// Bind a persistent buffer (KV-cache, optimizer state, etc.).
107    /// Stays alive across `run()` calls; the backend uses it as the
108    /// graph input with the matching name.
109    /// Returns true if the backend supports persistent handles.
110    pub fn bind_handle(&mut self, name: &str, data: &[f32]) -> bool {
111        self.inner.bind_handle(name, data)
112    }
113
114    /// Read the current contents of a persistent buffer.
115    pub fn read_handle(&self, name: &str) -> Option<Vec<f32>> {
116        self.inner.read_handle(name)
117    }
118
119    /// GPU-resident MLX input (no-op on non-MLX backends).
120    pub fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
121        self.inner.bind_gpu_handle(name, data)
122    }
123
124    pub fn has_gpu_handle(&self, name: &str) -> bool {
125        self.inner.has_gpu_handle(name)
126    }
127
128    pub fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
129        self.inner.set_gpu_handle_feed(handle_name, output_index)
130    }
131
132    pub fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
133        self.inner.read_gpu_handle(name)
134    }
135
136    /// Read one row from a resident GPU input handle without full-tensor D2H.
137    pub fn read_gpu_handle_row(
138        &self,
139        name: &str,
140        row: usize,
141        row_inner: usize,
142    ) -> Option<Vec<f32>> {
143        self.inner.read_gpu_handle_row(name, row, row_inner)
144    }
145
146    /// Register a targeted row feed for resident KV decode (graphs that emit the
147    /// new token at the last bucket-padded output row). No-op (false) on
148    /// backends without GPU-resident handle support. See [`Self::feed_kv_row`].
149    pub fn register_kv_row_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
150        self.inner.register_kv_row_feed(handle_name, output_index)
151    }
152
153    /// Fold each registered row feed's new-token row (`src_row` of its output)
154    /// into the resident handle slot at `dst_row` (`row_elems` = kv_dim),
155    /// in-place on device. Returns false when unsupported.
156    pub fn feed_kv_row(&mut self, src_row: usize, dst_row: usize, row_elems: usize) -> bool {
157        self.inner.feed_kv_row(src_row, dst_row, row_elems)
158    }
159
160    /// Mark a graph input as device-resident without host staging.
161    pub fn prepare_resident_gpu_handle(&mut self, name: &str) -> bool {
162        self.inner.prepare_resident_gpu_handle(name)
163    }
164
165    /// Upload bound (non-resident) GPU handle mirrors into the arena.
166    pub fn stage_bound_gpu_handles_to_arena(&mut self) -> bool {
167        self.inner.stage_bound_gpu_handles_to_arena();
168        true
169    }
170
171    /// D2D copy resident KV rows `[from_row..to_row)` from another compiled graph.
172    pub fn seed_resident_kv_prefix_from(
173        &mut self,
174        src: &CompiledGraph,
175        prefix_tokens: usize,
176        outgoing_upper: usize,
177        kv_dim: usize,
178        n_layers: usize,
179    ) -> bool {
180        if self.device != src.device {
181            return false;
182        }
183        self.inner.seed_resident_kv_prefix_from(
184            src.inner.as_ref(),
185            prefix_tokens,
186            outgoing_upper,
187            kv_dim,
188            n_layers,
189        )
190    }
191
192    /// D2D copy resident KV rows `[from_row..to_row)` from another compiled graph.
193    pub fn copy_resident_kv_rows_from(
194        &mut self,
195        src: &CompiledGraph,
196        from_row: usize,
197        to_row: usize,
198        outgoing_upper: usize,
199        kv_dim: usize,
200        n_layers: usize,
201    ) -> bool {
202        if self.device != src.device {
203            return false;
204        }
205        self.inner.copy_resident_kv_rows_from(
206            src.inner.as_ref(),
207            from_row,
208            to_row,
209            outgoing_upper,
210            kv_dim,
211            n_layers,
212        )
213    }
214
215    /// Copy parameter buffers from `src` when layouts match (same device/backend).
216    pub fn copy_params_from(&mut self, src: &CompiledGraph) -> bool {
217        if self.device != src.device {
218            return false;
219        }
220        self.inner.copy_params_from(src.inner.as_ref())
221    }
222
223    /// Run, refresh GPU handle from output, return that output vector.
224    pub fn run_feed_gpu_handle(
225        &mut self,
226        inputs: &[(&str, &[f32])],
227        handle_name: &str,
228        output_index: usize,
229    ) -> Option<Vec<f32>> {
230        self.inner
231            .run_feed_gpu_handle(inputs, handle_name, output_index)
232    }
233
234    /// Hint subsequent `run` calls to process only the first `actual`
235    /// rows along the bucket axis (out of `upper`, the compile extent).
236    /// Backends that support per-kernel active-extent dispatch honor
237    /// this; others ignore it. Pass `None` to clear.
238    ///
239    /// See `BucketedCompileCache::run_padded` for the canonical caller.
240    pub fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
241        #[cfg(feature = "cpu")]
242        if let Some((actual, _)) = extent {
243            crate::onnx_active::set_active_token_count(Some(actual))
244        }
245        self.inner.set_active_extent(extent);
246    }
247
248    /// TIDE merged MoE placement (`mask[expert]` device-resident if any layer has it).
249    pub fn set_moe_resident_experts(&mut self, mask: &[bool]) {
250        self.inner.set_moe_resident_experts(mask);
251    }
252
253    /// Per MoE layer placement (forward order). Preferred on CPU over merged mask.
254    pub fn set_moe_resident_experts_per_layer(&mut self, masks: &[&[bool]]) {
255        self.inner.set_moe_resident_experts_per_layer(masks);
256    }
257
258    /// Capture MoE router TopK on next forward (CPU). Returns false if unsupported.
259    pub fn enable_moe_topk_capture(&mut self, num_experts: usize) -> bool {
260        self.inner.enable_moe_topk_capture(num_experts)
261    }
262
263    /// Per-layer expert indices from the last forward (MoE router TopK order).
264    pub fn take_moe_topk_capture(&mut self) -> Option<Vec<Vec<u32>>> {
265        self.inner.take_moe_topk_capture()
266    }
267
268    /// GroupedMatMul GPU/CPU token accounting from the last forward (CPU).
269    pub fn take_moe_residency_stats(&mut self) -> Option<crate::MoeResidencyStats> {
270        self.inner.take_moe_residency_stats()
271    }
272
273    // ── Pipelined / async execution (Phase C) ─────────────────────────
274
275    /// Encode + commit a forward pass without waiting for the device.
276    ///
277    /// Outputs of intermediate calls are stomped — use `run_pipelined`
278    /// when you need each call's outputs back. Pair with `sync_pending`
279    /// to drain. CPU is synchronous, so this falls back to `run`.
280    pub fn commit_no_wait(&mut self, inputs: &[(&str, &[f32])]) {
281        self.inner.commit_no_wait(inputs);
282    }
283
284    /// Wait for every command queued by `commit_no_wait`. CPU is a no-op.
285    pub fn sync_pending(&mut self) {
286        self.inner.sync_pending();
287    }
288
289    /// Pipelined batch run. Issues one commit per input set, syncs once
290    /// at the end. On Metal, each commit gets its own output snapshot
291    /// (allocated + blit-copied), so subsequent commits stomping the
292    /// shared arena don't corrupt earlier runs' outputs.
293    /// Returns `out[run_idx][output_idx][element_idx]`.
294    pub fn run_pipelined(&mut self, input_sets: &[Vec<(&str, &[f32])>]) -> Vec<Vec<Vec<f32>>> {
295        self.inner.run_pipelined(input_sets)
296    }
297
298    /// Set a named parameter from raw bytes in the given dtype. The
299    /// backend handles the widen-to-f32 (or zero-widen, when supported
300    /// natively) on the way in. Lets callers feed F16/BF16 weights
301    /// without a host-side cast.
302    pub fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
303        self.inner.set_param_typed(name, data, dtype);
304    }
305
306    /// Finish param upload — warms backend caches when supported.
307    pub fn finalize_params(&mut self) {
308        self.inner.finalize_params();
309    }
310
311    /// Execute with typed inputs and return outputs in their declared
312    /// graph dtype, byte-encoded. Mirrors the wgpu / MLX zero-widen
313    /// semantics on f32-arena backends (CPU + Metal) by widening at
314    /// the boundary.
315    pub fn run_typed(
316        &mut self,
317        inputs: &[(&str, &[u8], rlx_ir::DType)],
318    ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
319        self.inner.run_typed(inputs)
320    }
321
322    /// Override RNG policy for in-graph random ops without recompiling.
323    pub fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
324        self.inner.set_rng(rng);
325    }
326
327    /// Current RNG compile/execute policy.
328    pub fn rng(&self) -> rlx_ir::RngOptions {
329        self.inner.rng()
330    }
331}
332
#[cfg(test)]
mod tests {
    use crate::*;

    /// Builds `gelu(x @ w + b)`, compiles it for the CPU device, and checks
    /// the outputs of two consecutive runs against hand-computed values.
    #[test]
    #[cfg(feature = "cpu")]
    fn end_to_end_session() {
        // Absolute-tolerance comparison shared by the numeric checks below.
        let close = |got: f32, want: f32| (got - want).abs() < 0.01;

        // Graph: [2, 4] input times [4, 3] weight, plus [3] bias, then GELU.
        let mut graph = Graph::new("matmul_bias_gelu");
        let input = graph.input("x", Shape::new(&[2, 4], DType::F32));
        let weight = graph.param("w", Shape::new(&[4, 3], DType::F32));
        let bias = graph.param("b", Shape::new(&[3], DType::F32));
        let product = graph.matmul(input, weight, Shape::new(&[2, 3], DType::F32));
        let biased = graph.binary(
            op::BinaryOp::Add,
            product,
            bias,
            Shape::new(&[2, 3], DType::F32),
        );
        let activated = graph.activation(
            op::Activation::Gelu,
            biased,
            Shape::new(&[2, 3], DType::F32),
        );
        graph.set_outputs(vec![activated]);

        let session = Session::new(Device::Cpu);
        let mut compiled = session.compile(graph);

        // Weight is a 3x3 identity stacked on a zero row, so the matmul
        // simply picks out the first three columns of each input row.
        compiled.set_param(
            "w",
            &[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        );
        compiled.set_param("b", &[0.5, -0.5, 0.0]);

        let x_data = vec![
            1.0, 0.0, 0.0, 0.0, // row 0 -> [1,0,0] + bias = [1.5, -0.5, 0]
            0.0, 1.0, 0.0, 0.0, // row 1 -> [0,1,0] + bias = [0.5,  0.5, 0]
        ];
        let outputs = compiled.run(&[("x", &x_data)]);

        assert_eq!(outputs.len(), 1);
        let result = &outputs[0];
        assert_eq!(result.len(), 6); // flattened [2, 3]

        // Row 0: gelu(1.5) ≈ 1.399, gelu(-0.5) ≈ -0.154, gelu(0) = 0.
        assert!(close(result[0], 1.399), "gelu(1.5) = {}", result[0]);
        assert!(close(result[1], -0.154), "gelu(-0.5) = {}", result[1]);
        assert!(close(result[2], 0.0), "gelu(0) = {}", result[2]);

        // Row 1: both leading entries are gelu(0.5) ≈ 0.346.
        assert!(close(result[3], 0.346), "gelu(0.5) = {}", result[3]);
        assert!(close(result[4], 0.346), "gelu(0.5) = {}", result[4]);

        // Second run on the same compiled graph: an all-zero input leaves
        // only the bias, so the first output is gelu(0 + 0.5).
        let x2 = vec![0.0f32; 8];
        let outputs2 = compiled.run(&[("x", &x2)]);
        let r2 = &outputs2[0];
        assert!(close(r2[0], 0.346), "gelu(0.5) = {}", r2[0]);
    }

    /// Checks the CPU device's display prefix and that backends whose
    /// feature is disabled report as unavailable.
    #[test]
    #[cfg(feature = "cpu")]
    fn device_display() {
        use crate::device_ext::is_available;

        let cpu_label = format!("{}", Device::Cpu);
        assert!(cpu_label.starts_with("CPU"));
        assert!(is_available(Device::Cpu));

        // Availability is feature-gated, so the negative assertion for a
        // backend is only compiled in when that backend's feature is off.
        #[cfg(not(feature = "gpu"))]
        assert!(!is_available(Device::Gpu));
        #[cfg(not(feature = "cuda"))]
        assert!(!is_available(Device::Cuda));
        #[cfg(not(feature = "rocm"))]
        assert!(!is_available(Device::Rocm));
        #[cfg(not(feature = "tpu"))]
        assert!(!is_available(Device::Tpu));
    }
}
rlx_runtime/compiled.rs

rlx_runtime/
compiled.rs