rlx_runtime/
backend.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Backend trait — abstraction over CPU/GPU/CUDA execution.
17//!
18//! Each backend implements `Backend::compile(graph, &CompileOptions)` and
19//! returns an `ExecutableGraph`. New compile knobs go in `CompileOptions`
20//! rather than as new trait methods.
21
22use crate::CompileOptions;
23use rlx_ir::Graph;
24use rlx_ir::hir::HirModule;
25use rlx_ir::lir::LirModule;
26use std::collections::HashMap;
27use std::sync::Arc;
28
29use crate::cpu_low_precision;
30
31// ── Typed I/O helpers (shared across f32-arena backends) ────────────────
32
33/// Widen a typed byte buffer to `Vec<f32>`. Used by `set_param_typed` /
34/// `run_typed` overrides on backends whose internal arena is f32-uniform
35/// (CPU, Metal, wgpu) so callers can hand in F16/BF16 without doing the
36/// host-side cast themselves. Panics on dtypes the f32 arena can't carry.
37#[allow(dead_code)]
38pub(crate) fn widen_bytes_to_f32(data: &[u8], dtype: rlx_ir::DType) -> Vec<f32> {
39    use rlx_ir::DType;
40    match dtype {
41        DType::F32 => {
42            let n = data.len() / 4;
43            let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
44            s.to_vec()
45        }
46        DType::F16 => {
47            let n = data.len() / 2;
48            let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const half::f16, n) };
49            s.iter().map(|h| h.to_f32()).collect()
50        }
51        DType::BF16 => {
52            let n = data.len() / 2;
53            let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const half::bf16, n) };
54            s.iter().map(|h| h.to_f32()).collect()
55        }
56        other => panic!(
57            "widen_bytes_to_f32: dtype {other:?} unsupported on f32-arena backends \
58             (only F32/F16/BF16 are accepted on the host I/O surface)"
59        ),
60    }
61}
62
63/// Narrow a `&[f32]` buffer down to the declared output dtype, returning
64/// the corresponding little-endian byte stream. Mirrors the bytes a
65/// backend that stores the native dtype would emit. Used by `run_typed`
66/// to keep the byte-level output contract identical across backends.
67#[allow(dead_code)]
68pub(crate) fn narrow_f32_to_bytes(v: &[f32], dt: rlx_ir::DType) -> Vec<u8> {
69    use rlx_ir::DType;
70    match dt {
71        DType::F32 => {
72            let mut bytes = Vec::with_capacity(v.len() * 4);
73            for &x in v {
74                bytes.extend_from_slice(&x.to_le_bytes());
75            }
76            bytes
77        }
78        DType::F16 => {
79            let mut bytes = Vec::with_capacity(v.len() * 2);
80            for &x in v {
81                bytes.extend_from_slice(&half::f16::from_f32(x).to_le_bytes());
82            }
83            bytes
84        }
85        DType::BF16 => {
86            let mut bytes = Vec::with_capacity(v.len() * 2);
87            for &x in v {
88                bytes.extend_from_slice(&half::bf16::from_f32(x).to_le_bytes());
89            }
90            bytes
91        }
92        DType::F64 => {
93            let mut bytes = Vec::with_capacity(v.len() * 8);
94            for &x in v {
95                bytes.extend_from_slice(&(x as f64).to_le_bytes());
96            }
97            bytes
98        }
99        DType::I8 => v.iter().map(|&x| x as i8 as u8).collect(),
100        DType::U8 => v.iter().map(|&x| x as u8).collect(),
101        DType::I16 => {
102            let mut bytes = Vec::with_capacity(v.len() * 2);
103            for &x in v {
104                bytes.extend_from_slice(&(x as i16).to_le_bytes());
105            }
106            bytes
107        }
108        DType::I32 => {
109            let mut bytes = Vec::with_capacity(v.len() * 4);
110            for &x in v {
111                bytes.extend_from_slice(&(x as i32).to_le_bytes());
112            }
113            bytes
114        }
115        DType::U32 => {
116            let mut bytes = Vec::with_capacity(v.len() * 4);
117            for &x in v {
118                bytes.extend_from_slice(&(x as u32).to_le_bytes());
119            }
120            bytes
121        }
122        DType::I64 => {
123            let mut bytes = Vec::with_capacity(v.len() * 8);
124            for &x in v {
125                bytes.extend_from_slice(&(x as i64).to_le_bytes());
126            }
127            bytes
128        }
129        DType::Bool => v
130            .iter()
131            .map(|&x| if x != 0.0 { 1u8 } else { 0u8 })
132            .collect(),
133        DType::C64 => {
134            // Complex narrow path: real part = the f32 value, imaginary
135            // part = 0. Mirrors how the backend stores narrowed f32
136            // operands when promoted to a complex op input.
137            let mut bytes = Vec::with_capacity(v.len() * 8);
138            for &x in v {
139                bytes.extend_from_slice(&x.to_le_bytes());
140                bytes.extend_from_slice(&0.0_f32.to_le_bytes());
141            }
142            bytes
143        }
144    }
145}
146
147/// A compiled, ready-to-execute graph on a specific backend.
148pub trait ExecutableGraph: Send {
149    /// Set a named parameter (weight) buffer.
150    fn set_param(&mut self, name: &str, data: &[f32]);
151
152    /// Called after all params are uploaded (`set_param` / `set_param_typed`).
153    /// Backends may warm caches (e.g. Metal QMatMul weight dequant).
154    fn finalize_params(&mut self) {}
155
156    /// Deep-clone this executable into a fresh `Box`. Lets
157    /// `CompiledGraph` implement `Clone` so callers (e.g. eda-mna's
158    /// `SensitivityContext`) can spin up N independent executor
159    /// copies for thread-parallel dispatch without paying the full
160    /// graph-compile cost N times. Default implementation panics;
161    /// backends that support cloning override.
162    fn clone_box(&self) -> Box<dyn ExecutableGraph> {
163        panic!("clone_box not implemented for this backend");
164    }
165
166    /// Execute the graph with named inputs. Returns output data (copies from arena).
167    fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>>;
168
169    /// Like [`Self::run`] but only read back outputs at `read_indices`.
170    /// GPU handle feeds still update for every output. Default: all outputs.
171    fn run_read_outputs(
172        &mut self,
173        inputs: &[(&str, &[f32])],
174        read_indices: Option<&[usize]>,
175    ) -> Vec<Vec<f32>> {
176        match read_indices {
177            None => self.run(inputs),
178            Some(ix) => {
179                // Backends without a native partial-read path still run the full
180                // graph; only clone the requested outputs on the host.
181                let all = self.run(inputs);
182                ix.iter().filter_map(|&i| all.get(i).cloned()).collect()
183            }
184        }
185    }
186
187    /// Execute and return raw pointers to output data in arena (zero-copy).
188    fn run_raw(&mut self, inputs: &[(&str, &[f32])]) -> Vec<(*const f32, usize)> {
189        let vecs = self.run(inputs);
190        vecs.iter().map(|v| (v.as_ptr(), v.len())).collect()
191    }
192
193    /// Fastest: inputs by slot index, returns output (offset, len) pairs.
194    /// Read output from arena via `arena_ptr().add(offset)`.
195    fn run_slots(&mut self, _inputs: &[&[f32]]) -> &[(usize, usize)] {
196        &[] // default: not supported
197    }
198
199    /// Get the raw arena buffer pointer for reading outputs after run_slots.
200    fn arena_ptr(&self) -> *const u8 {
201        std::ptr::null()
202    }
203
204    /// Hint the executor that subsequent `run` calls should process
205    /// only the first `actual` rows along the bucket axis (out of
206    /// `upper`, the extent the graph was compiled at). Backends that
207    /// support per-kernel active-extent dispatch honor this; others
208    /// ignore it and process the full compiled extent.
209    ///
210    /// Pass `None` to clear the hint. The hint is sticky — set it
211    /// before each `run` and clear it after, or maintain it across
212    /// runs at your discretion.
213    ///
214    /// Even when honored, callers must not rely on the contents of the
215    /// output past `actual` rows — that region may contain stale data
216    /// from earlier runs (kernels skip it).
217    ///
218    /// Default: no-op. See `BucketedCompileCache::run_padded` for the
219    /// canonical caller; backends opt in by overriding this method.
220    fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
221        let _ = extent;
222    }
223
224    /// Override RNG policy for in-graph random ops without recompiling.
225    fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
226        let _ = rng;
227    }
228
229    /// Current RNG policy (default when the backend does not override).
230    fn rng(&self) -> rlx_ir::RngOptions {
231        rlx_ir::RngOptions::default()
232    }
233
234    /// TIDE merged placement mask (union across MoE layers). CPU: stats + host path.
235    fn set_moe_resident_experts(&mut self, _mask: &[bool]) {}
236
237    /// Per MoE layer placement (`masks[layer][expert]`). Preferred over merged on CPU.
238    fn set_moe_resident_experts_per_layer(&mut self, _masks: &[&[bool]]) {}
239
240    /// Capture MoE router TopK indices on the next CPU forward (TIDE refresh).
241    fn enable_moe_topk_capture(&mut self, _num_experts: usize) -> bool {
242        false
243    }
244
245    /// Take captured per-layer expert indices (one vec per MoE TopK in order).
246    fn take_moe_topk_capture(&mut self) -> Option<Vec<Vec<u32>>> {
247        None
248    }
249
250    /// MoE GroupedMatMul residency accounting from the last forward (CPU).
251    fn take_moe_residency_stats(&mut self) -> Option<crate::MoeResidencyStats> {
252        None
253    }
254
255    /// Bind a persistent buffer handle (KV-cache, training state, etc.).
256    /// The buffer lives across run() calls and is not in the arena.
257    /// Returns true if the backend supports persistent handles.
258    fn bind_handle(&mut self, _name: &str, _data: &[f32]) -> bool {
259        false
260    }
261
262    /// Read a persistent buffer's current contents.
263    fn read_handle(&self, _name: &str) -> Option<Vec<f32>> {
264        None
265    }
266
267    /// GPU-resident input (MLX): upload once, reuse across runs.
268    fn bind_gpu_handle(&mut self, _name: &str, _data: &[f32]) -> bool {
269        false
270    }
271
272    fn has_gpu_handle(&self, _name: &str) -> bool {
273        false
274    }
275
276    fn set_gpu_handle_feed(&mut self, _handle_name: &str, _output_index: usize) -> bool {
277        false
278    }
279
280    fn read_gpu_handle(&self, _name: &str) -> Option<Vec<f32>> {
281        None
282    }
283
284    /// Read one row from a resident GPU input handle without full-tensor D2H.
285    fn read_gpu_handle_row(&self, _name: &str, _row: usize, _row_inner: usize) -> Option<Vec<f32>> {
286        None
287    }
288
289    /// Register a targeted *row* feed for resident KV decode (graphs that emit
290    /// the new token at the last bucket-padded output row). Returns false when
291    /// the backend has no GPU-resident handle support. See [`feed_kv_row`].
292    fn register_kv_row_feed(&mut self, _handle_name: &str, _output_index: usize) -> bool {
293        false
294    }
295
296    /// Fold each registered row feed's new-token row (`src_row` of its output)
297    /// into the resident handle slot at `dst_row` (`row_elems` = kv_dim),
298    /// in-place on device. Call after a logits-only run. Returns false when
299    /// unsupported (caller keeps the host KV path).
300    fn feed_kv_row(&mut self, _src_row: usize, _dst_row: usize, _row_elems: usize) -> bool {
301        false
302    }
303
304    /// Mark a graph input as a device-resident handle with no host mirror.
305    fn prepare_resident_gpu_handle(&mut self, _name: &str) -> bool {
306        false
307    }
308
309    /// Upload bound (non-resident) GPU handle mirrors into the arena.
310    fn stage_bound_gpu_handles_to_arena(&mut self) {}
311
312    /// D2D seed of resident `past_k_*` / `past_v_*` from another executable's
313    /// resident prefix (bucket rollover without host DRAM round-trip).
314    fn seed_resident_kv_prefix_from(
315        &mut self,
316        _src: &dyn ExecutableGraph,
317        _prefix_tokens: usize,
318        _outgoing_upper: usize,
319        _kv_dim: usize,
320        _n_layers: usize,
321    ) -> bool {
322        false
323    }
324
325    /// D2D copy resident KV rows `[from_row..to_row)` from another executable.
326    fn copy_resident_kv_rows_from(
327        &mut self,
328        _src: &dyn ExecutableGraph,
329        _from_row: usize,
330        _to_row: usize,
331        _outgoing_upper: usize,
332        _kv_dim: usize,
333        _n_layers: usize,
334    ) -> bool {
335        false
336    }
337
338    /// Copy named parameter storage from another executable on the same backend.
339    /// Used to avoid re-uploading packed U8 weights when compiling decode buckets.
340    fn copy_params_from(&mut self, src: &dyn ExecutableGraph) -> bool {
341        let _ = src;
342        false
343    }
344
345    /// Downcast hook for [`Self::copy_params_from`]. Backends override when supported.
346    fn executable_as_any(&self) -> Option<&dyn std::any::Any> {
347        None
348    }
349
350    /// Mutable downcast hook for [`Self::copy_params_from`].
351    fn executable_as_any_mut(&mut self) -> Option<&mut dyn std::any::Any> {
352        None
353    }
354
355    /// CUDA-only: mutable access for device KV seeding. Default `None`.
356    #[cfg(feature = "cuda")]
357    fn cuda_executable_for_kv_seed(&mut self) -> Option<&mut rlx_cuda::backend::CudaExecutable> {
358        let _ = self;
359        None
360    }
361
362    /// CUDA-only: immutable access for device KV seeding. Default `None`.
363    #[cfg(feature = "cuda")]
364    fn cuda_executable_for_kv_seed_ref(&self) -> Option<&rlx_cuda::backend::CudaExecutable> {
365        None
366    }
367
368    /// Read one row from a row-major graph output after `run` / `run_read_outputs`.
369    /// Metal reads a single row from the arena; default returns `None` (caller falls back).
370    fn read_output_row(&self, _out_idx: usize, _row: usize, _row_inner: usize) -> Option<Vec<f32>> {
371        None
372    }
373
374    /// Run and refresh a GPU handle from `output_index`; returns that output on host.
375    fn run_feed_gpu_handle(
376        &mut self,
377        inputs: &[(&str, &[f32])],
378        _handle_name: &str,
379        _output_index: usize,
380    ) -> Option<Vec<f32>> {
381        let _ = inputs;
382        None
383    }
384
385    // ── Pipelined / async execution (Phase C) ─────────────────────────
386    //
387    // These allow callers to amortize per-run sync latency on backends
388    // where it matters (Metal: ~150 µs `wait_until_completed` per commit).
389    // CPU has no such cost, so the default impls just call `run` serially.
390
391    /// Encode + commit a forward pass without waiting for completion.
392    ///
393    /// Outputs of intermediate calls are stomped — use `run_pipelined` if
394    /// you need outputs from each individual commit. Pair with
395    /// `sync_pending` to drain.
396    ///
397    /// Default: synchronous fallback (calls `run`, discards output). CPU
398    /// uses this default since BLAS is synchronous anyway.
399    fn commit_no_wait(&mut self, inputs: &[(&str, &[f32])]) {
400        let _ = self.run(inputs);
401    }
402
403    /// Wait for every command queued by `commit_no_wait`.
404    /// Default: no-op (synchronous backends have nothing pending).
405    fn sync_pending(&mut self) {}
406
407    /// Issue a batch of forward passes pipelined, returning per-run outputs.
408    ///
409    /// The Metal impl encodes a per-commit blit so each in-flight run's
410    /// outputs survive subsequent commits stomping the shared arena. The
411    /// CPU default is just sequential `run`s — equally correct, no perf
412    /// penalty (CPU has no GPU sync cost to amortize).
413    ///
414    /// Returns `out[run_idx][output_idx][element_idx]`.
415    fn run_pipelined(&mut self, input_sets: &[Vec<(&str, &[f32])>]) -> Vec<Vec<Vec<f32>>> {
416        input_sets.iter().map(|inputs| self.run(inputs)).collect()
417    }
418
419    // ── Typed (non-F32) host I/O ──────────────────────────────────
420    //
421    // `set_param` and `run` are F32 by contract. The typed entry
422    // points let callers pass and receive raw bytes in any rlx-ir
423    // dtype, avoiding the f32 widen/narrow round-trip that's
424    // wasteful for F16/BF16 weights and activations.
425    //
426    // The default impls only handle F32 — any other dtype panics.
427    // Backends that support typed I/O natively (e.g. MLX via
428    // Array::from_bytes/to_bytes) override these.
429
430    /// Set a named parameter from raw bytes in the given dtype.
431    fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
432        if dtype != rlx_ir::DType::F32 {
433            panic!(
434                "backend's default set_param_typed only handles F32; \
435                    got {dtype:?}. Override on the backend for typed support."
436            );
437        }
438        if !data.len().is_multiple_of(4) {
439            panic!(
440                "set_param_typed F32: data length {} not a multiple of 4",
441                data.len()
442            );
443        }
444        // SAFETY: F32 bytes are 4-aligned by source convention; we
445        // only widen access (read &[f32] from owned &[u8]). Failure
446        // mode if a caller hands us mis-aligned bytes is undefined,
447        // hence the % 4 length check.
448        let n = data.len() / 4;
449        let f32_slice = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
450        self.set_param(name, f32_slice);
451    }
452
453    /// Run with typed inputs and typed outputs. Returns
454    /// `(bytes, dtype)` per output; the dtype is whatever the
455    /// graph's output node was declared as.
456    fn run_typed(
457        &mut self,
458        inputs: &[(&str, &[u8], rlx_ir::DType)],
459    ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
460        // Default impl: convert each typed input to f32 (F32-only),
461        // run, then re-emit outputs as F32 bytes.
462        let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
463        for (name, data, dt) in inputs {
464            if *dt != rlx_ir::DType::F32 {
465                panic!(
466                    "backend's default run_typed only handles F32 inputs; \
467                        got {dt:?} for input '{name}'"
468                );
469            }
470            if data.len() % 4 != 0 {
471                panic!(
472                    "run_typed F32 input '{name}': len {} not multiple of 4",
473                    data.len()
474                );
475            }
476            let n = data.len() / 4;
477            let v: Vec<f32> =
478                unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) }.to_vec();
479            owned.push((name.to_string(), v));
480        }
481        let refs: Vec<(&str, &[f32])> = owned
482            .iter()
483            .map(|(n, d)| (n.as_str(), d.as_slice()))
484            .collect();
485        let outs = self.run(&refs);
486        outs.into_iter()
487            .map(|v| {
488                let bytes =
489                    unsafe { std::slice::from_raw_parts(v.as_ptr() as *const u8, v.len() * 4) }
490                        .to_vec();
491                (bytes, rlx_ir::DType::F32)
492            })
493            .collect()
494    }
495}
496
497/// Backend implementation trait.
498///
499/// Single compile entry point. New compile-time knobs are added to
500/// `CompileOptions`, not as new trait methods.
501///
502/// `Send + Sync` because backends are stateless factories — multiple
503/// threads can call `compile` concurrently. The returned
504/// `Box<dyn ExecutableGraph>` is `Send` (moveable to a worker thread)
505/// but **not** `Sync` (`run`/`run_slots` take `&mut self`).
506pub trait Backend: Send + Sync {
507    /// Compile a graph for this backend with the given options.
508    fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph>;
509
510    /// Compile pre-optimized LIR (HIR → MIR → LIR pipeline output).
511    /// Default re-enters [`Self::compile`] — backends should override
512    /// when they can reuse the embedded buffer plan.
513    fn compile_lir(&self, lir: LirModule, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
514        self.compile(lir.into_graph(), options)
515    }
516
517    /// HIR-first compile: lower blocks, run fusion pipeline, emit executable.
518    fn compile_hir(
519        &self,
520        hir: HirModule,
521        device: rlx_driver::Device,
522        options: &CompileOptions,
523    ) -> Result<Box<dyn ExecutableGraph>, rlx_ir::hir::LowerError> {
524        let result = crate::stages::compile_hir_stages(device, hir, options)?;
525        crate::stages::maybe_log_fusion(&result.fusion);
526        Ok(self.compile_lir(result.lir, options))
527    }
528
529    /// [`GraphModule`] compile — unified HIR/MIR/LIR entry.
530    fn compile_module(
531        &self,
532        module: rlx_ir::GraphModule,
533        device: rlx_driver::Device,
534        options: &CompileOptions,
535    ) -> Result<Box<dyn ExecutableGraph>, rlx_ir::hir::LowerError> {
536        let result = crate::stages::compile_module_stages(device, module, options)?;
537        crate::stages::maybe_log_fusion(&result.fusion);
538        Ok(self.compile_lir(result.lir, options))
539    }
540
541    /// PLAN L4: declare which `OpKind`s this backend can lower.
542    /// Default: empty slice = "no claim made — accept everything"
543    /// (preserves existing behavior; backends opt in by overriding).
544    /// When non-empty, the `LegalizeForBackend` pass will refuse to
545    /// compile a graph that contains an op outside this set, instead
546    /// of silently falling through to slower / wrong dispatch.
547    fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
548        &[]
549    }
550}
551
552/// Prepare a fused MIR graph from LIR for backend executable construction.
553/// Skips the fusion pipeline — LIR must come from `compile_*_stages`.
554#[allow(dead_code)]
555fn prepare_fused_graph(
556    graph: Graph,
557    options: &CompileOptions,
558    supported_ops: &[rlx_ir::OpKind],
559    backend_name: &str,
560) -> Graph {
561    let (mut graph, report) = rlx_opt::prepare_graph_for_backend_with_report(
562        graph,
563        backend_name,
564        supported_ops,
565        options.kernel_dispatch,
566    );
567    rlx_opt::maybe_log_dispatch_report(&report);
568    if !report.compile_ready {
569        panic!(
570            "{}\n{}",
571            rlx_opt::format_legalize_error(backend_name, &report.still_unsupported),
572            rlx_opt::format_dispatch_report(&report)
573        );
574    }
575    graph = crate::precompile::post_fusion_cleanup(graph, options);
576    if let Some(p) = options.policy.clone() {
577        use rlx_opt::pass::Pass as _;
578        graph = rlx_opt::AutoMixedPrecision::new(p).run(graph);
579    }
580    graph
581}
582
583#[allow(dead_code)]
584fn declared_output_dtypes(
585    manifest: &cpu_low_precision::IoDtypeManifest,
586    exec_dtypes: Vec<rlx_ir::DType>,
587) -> Vec<rlx_ir::DType> {
588    exec_dtypes
589        .into_iter()
590        .enumerate()
591        .map(|(i, exec)| manifest.output_dtype(i, exec))
592        .collect()
593}
594
595// ── Convenience helpers preserved from older API ──────────────────────
596//
597// These let existing call sites keep working unchanged while the new
598// trait is the canonical one. We provide free functions rather than
599// trait methods so adding them doesn't grow the trait surface.
600
601/// Compile at default options (F32, no policy).
602pub fn compile(backend: &dyn Backend, graph: Graph) -> Box<dyn ExecutableGraph> {
603    backend.compile(graph, &CompileOptions::default())
604}
605
606/// Compile HIR through the fusion-first pipeline.
607pub fn compile_hir(
608    backend: &dyn Backend,
609    hir: HirModule,
610    device: rlx_driver::Device,
611    options: &CompileOptions,
612) -> Result<Box<dyn ExecutableGraph>, rlx_ir::hir::LowerError> {
613    backend.compile_hir(hir, device, options)
614}
615
616/// Compile a [`GraphModule`] through the fusion-first pipeline.
617pub fn compile_module(
618    backend: &dyn Backend,
619    module: rlx_ir::GraphModule,
620    device: rlx_driver::Device,
621    options: &CompileOptions,
622) -> Result<Box<dyn ExecutableGraph>, rlx_ir::hir::LowerError> {
623    backend.compile_module(module, device, options)
624}
625
626/// Compile at a specific precision (default policy = none).
627pub fn compile_with_precision(
628    backend: &dyn Backend,
629    graph: Graph,
630    precision: crate::Precision,
631) -> Box<dyn ExecutableGraph> {
632    backend.compile(graph, &CompileOptions::new().precision(precision))
633}
634
635/// Helper retained for backward compatibility — applies the precision
636/// rewrite at the runtime layer if backends don't override their
637/// pipeline placement. Modern code: pass the policy via CompileOptions
638/// and let the backend handle ordering.
639fn _legacy_apply_policy(graph: Graph, policy: Option<rlx_opt::PrecisionPolicy>) -> Graph {
640    use rlx_opt::pass::Pass as _;
641    match policy {
642        Some(p) => rlx_opt::AutoMixedPrecision::new(p).run(graph),
643        None => graph,
644    }
645}
646
647// ── CPU Backend ─────────────────────────────────────────────────────────
648
649#[cfg(feature = "cpu")]
650pub mod cpu_backend {
651    use super::*;
652    use rlx_cpu::{arena::Arena, thunk};
653    use rlx_ir::{DType, NodeId, Op};
654    use rlx_opt::memory::{self, MemoryPlan};
655    // Arena typed read/write helpers live in `crate::arena` so every
656    // backend (CPU, Metal, future CUDA/wgpu/WASM) shares one implementation.
657    use rlx_driver::arena::{read_typed_to_f32, write_typed_from_f32};
658
659    pub struct CpuBackend;
660
661    /// PLAN L4: ops the CPU backend can lower today. Includes
662    /// DotGeneral (lowered via `LowerDotGeneral` pass) and
663    /// ElementwiseRegion (lowered natively per L2). Excludes
664    /// FusedTransformerLayer / If / While — those have IR variants
665    /// but no CPU lowering yet (see `compile_thunks` arm absence +
666    /// `subgraph.rs` "If/While executor wiring is pending" note).
667    const CPU_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
668        use rlx_ir::OpKind::*;
669        &[
670            Input,
671            Param,
672            Constant,
673            Activation,
674            Cast,
675            StopGradient,
676            Binary,
677            Compare,
678            Where,
679            Fma,
680            ElementwiseRegion,
681            MatMul,
682            DotGeneral,
683            DenseSolve,
684            BatchedDenseSolve,
685            Scan,
686            ScanBackward,
687            ScanBackwardXs,
688            LayerNorm,
689            LayerNorm2d,
690            GroupNorm,
691            BatchNormInference,
692            RmsNorm,
693            ResizeNearest2x,
694            AxialRope2d,
695            Attention,
696            Rope,
697            Reshape,
698            Transpose,
699            Narrow,
700            Concat,
701            Expand,
702            Gather,
703            Reverse,
704            Reduce,
705            Softmax,
706            Cumsum,
707            ArgMax,
708            ArgMin,
709            TopK,
710            Sample,
711            RngNormal,
712            RngUniform,
713            Conv,
714            Im2Col,
715            ConvTranspose2d,
716            Conv3d,
717            ConvTranspose3d,
718            Pool,
719            GroupedMatMul,
720            DequantGroupedMatMul,
721            DequantMoEWeights,
722            ScatterAdd,
723            LoraMatMul,
724            DequantMatMul,
725            ScaledMatMul,
726            ScaledQuantize,
727            ScaledQuantScale,
728            ScaledDequantize,
729            SelectiveScan,
730            GatedDeltaNet,
731            Lstm,
732            Gru,
733            Rnn,
734            Mamba2,
735            FusedSwiGLU,
736            FusedMatMulBiasAct,
737            FusedResidualLN,
738            FusedResidualRmsNorm,
739            FusedAttentionBlock,
740            // Backward ops emitted by `rlx_opt::autodiff::grad_with_loss`.
741            // Their thunks live in rlx-cpu/src/thunk.rs alongside the
742            // forward kernels; without these entries the legalize step
743            // below would reject any compiled gradient graph.
744            ReluBackward,
745            ActivationBackward,
746            FakeQuantize,
747            FakeQuantizeBackward,
748            // LSQ (learned step size) QAT — native CPU thunks in thunk.rs.
749            FakeQuantizeLSQ,
750            FakeQuantizeLSQBackwardX,
751            FakeQuantizeLSQBackwardScale,
752            MaxPool2dBackward,
753            Conv2dBackwardInput,
754            Conv2dBackwardWeight,
755            SoftmaxCrossEntropy,
756            SoftmaxCrossEntropyWithLogits,
757            SoftmaxCrossEntropyBackward,
758            AttentionBackward,
759            LayerNormBackwardInput,
760            LayerNormBackwardGamma,
761            BatchNormInferenceBackwardInput,
762            BatchNormInferenceBackwardGamma,
763            BatchNormInferenceBackwardBeta,
764            // GroupNorm backward (native thunks in rlx-cpu/training_bwd):
765            GroupNormBackwardInput,
766            GroupNormBackwardGamma,
767            GroupNormBackwardBeta,
768            RmsNormBackwardInput,
769            RmsNormBackwardGamma,
770            RmsNormBackwardBeta,
771            RopeBackward,
772            CumsumBackward,
773            GatherBackward,
774            // 3D Gaussian splat CPU reference render/backward (requires `rlx-cpu/splat`).
775            GaussianSplatRender,
776            GaussianSplatRenderBackward,
777            GaussianSplatPrepare,
778            GaussianSplatRasterize,
779            // User-registered custom ops dispatched through
780            // `rlx_cpu::op_registry`. Lowering panics with a clear
781            // message if the named CPU kernel isn't registered.
782            Custom,
783            // User-defined sub-graph with optional override AD rules
784            // (JAX-shaped custom_vjp / custom_jvp). Body is a regular
785            // Graph compiled recursively in compile_thunks.
786            CustomFn,
787            // FFT primitive (1D last-axis, 2N real-block layout, f64
788            // power-of-2 sizes). Other backends panic at lowering;
789            // pin FFT-containing graphs to Device::Cpu for now.
790            Fft,
791            FftButterflyStage,
792            LogMel,
793            LogMelBackward,
794            WelchPeaks,
795            // C64 Wirtinger AD surface. ComplexNormSq is the canonical
796            // real-valued loss for complex inputs; Conjugate is emitted
797            // by the new Wirtinger VJP rules for BinaryOp::Mul/Div on
798            // C64. Both have CPU thunks in rlx-cpu.
799            ComplexNormSq,
800            ComplexNormSqBackward,
801            Conjugate,
802        ]
803    };
804
805    impl Backend for CpuBackend {
806        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
807            CPU_SUPPORTED_OPS
808        }
809
810        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
811            use rlx_opt::pass::Pass as _;
812            static ONNX_KERNELS: std::sync::Once = std::sync::Once::new();
813            ONNX_KERNELS.call_once(rlx_cpu::onnx_ref::register_onnx_reference_kernels);
814            // Lower Op::If / Op::While to primitives BEFORE legalize
815            // so the supported-op check doesn't reject them — the CPU
816            // backend has no native sub-graph executor; this rewrite
817            // makes If/While invisible to the rest of the pipeline.
818            // No-op when neither op is in the graph.
819            let graph = rlx_opt::LowerControlFlow.run(graph);
820            // PLAN L4: legalize against the backend's claimed op set
821            // BEFORE running fusion (so the diagnostic points at the
822            // user's IR, not at a fused-away node).
823            if let Err(errors) = rlx_opt::legalize_for_backend(&graph, CPU_SUPPORTED_OPS) {
824                panic!("{}", rlx_opt::format_legalize_error("cpu", &errors));
825            }
826            let policy = options.policy.clone();
827            let _precision = options.precision;
828            let cfg = rlx_cpu::config::RuntimeConfig::global();
829
830            let graph = crate::precompile::precompile_cleanup(graph, options);
831
832            // Run fusion pipeline (HIR/MIR/LIR ideology — fusion is first-class).
833            let mut compile_opts = options.clone();
834            compile_opts.arena_alignment = cfg.arena_alignment;
835            let compile_result = crate::stages::compile_graph_stages_for_backend(
836                rlx_driver::Device::Cpu,
837                graph,
838                &compile_opts,
839                CPU_SUPPORTED_OPS,
840            );
841            crate::stages::maybe_log_fusion(&compile_result.fusion);
842            let fused = compile_result.lir.into_graph();
843
844            // Apply precision policy AFTER fusion — Cast nodes don't disrupt
845            // the now-flattened fused ops.
846            let fused = match policy {
847                Some(p) => rlx_opt::AutoMixedPrecision::new(p).run(fused),
848                None => fused,
849            };
850
851            let io_manifest = cpu_low_precision::IoDtypeManifest::from_graph(&fused);
852            let exec_graph = if cpu_low_precision::needs_f32_exec(&fused) {
853                cpu_low_precision::promote_to_f32(fused)
854            } else {
855                fused
856            };
857
858            // Re-plan after precision rewrites (may change dtypes / sizes).
859            let plan = memory::plan_memory_aligned(&exec_graph, cfg.arena_alignment);
860            if cfg.verbose >= 1 {
861                eprintln!(
862                    "[rlx] arena: {} bytes, {} buffers, alignment: {}",
863                    plan.arena_size,
864                    plan.assignments.len(),
865                    cfg.arena_alignment
866                );
867            }
868            Box::new(build_cpu_executable(
869                exec_graph,
870                plan,
871                io_manifest,
872                options.rng,
873            ))
874        }
875
876        fn compile_lir(
877            &self,
878            lir: LirModule,
879            options: &CompileOptions,
880        ) -> Box<dyn ExecutableGraph> {
881            let alignment = lir.buffers.alignment.max(options.arena_alignment);
882            let mut graph = lir.into_graph();
883            {
884                use rlx_opt::pass::Pass as _;
885                graph = rlx_opt::LegalizeBroadcast.run(graph);
886            }
887            if let Some(p) = options.policy.clone() {
888                use rlx_opt::pass::Pass;
889                graph = rlx_opt::AutoMixedPrecision::new(p).run(graph);
890            }
891            let io_manifest = cpu_low_precision::IoDtypeManifest::from_graph(&graph);
892            let promote = cpu_low_precision::needs_f32_exec(&graph);
893            let exec_graph = if promote {
894                cpu_low_precision::promote_to_f32(graph)
895            } else {
896                graph
897            };
898            // LegalizeBroadcast may insert Expand nodes — must replan; the
899            // embedded LIR buffer map is from before legalization.
900            let plan = memory::plan_memory_aligned(&exec_graph, alignment);
901            let cfg = rlx_cpu::config::RuntimeConfig::global();
902            if cfg.verbose >= 1 {
903                eprintln!(
904                    "[rlx] compile_lir: arena {} bytes ({} buffers, alignment {})",
905                    plan.arena_size,
906                    plan.assignments.len(),
907                    alignment,
908                );
909            }
910            Box::new(build_cpu_executable(
911                exec_graph,
912                plan,
913                io_manifest,
914                options.rng,
915            ))
916        }
917    }
918
919    fn build_cpu_executable(
920        graph: Graph,
921        plan: MemoryPlan,
922        io_manifest: cpu_low_precision::IoDtypeManifest,
923        rng: rlx_ir::RngOptions,
924    ) -> CpuExecutable {
925        let mut arena = Arena::from_plan(plan);
926        let mut input_ids = HashMap::new();
927        let mut param_ids = HashMap::new();
928        let mut node_dtypes: HashMap<NodeId, DType> = HashMap::new();
929        for node in graph.nodes() {
930            node_dtypes.insert(node.id, node.shape.dtype());
931            match &node.op {
932                Op::Input { name } => {
933                    input_ids.insert(name.clone(), node.id);
934                }
935                Op::Param { name } => {
936                    param_ids.insert(name.clone(), node.id);
937                }
938                _ => {}
939            }
940        }
941
942        let schedule = thunk::compile_thunks_with_rng(&graph, &arena, rng);
943
944        let mut input_slots = Vec::new();
945        for node in graph.nodes() {
946            if let Op::Input { name } = &node.op {
947                let off = arena.byte_offset(node.id);
948                let len = node.shape.num_elements().unwrap_or(0);
949                input_slots.push((name.clone(), off, len, node.shape.dtype()));
950            }
951        }
952
953        let output_slots: Vec<(usize, usize)> = graph
954            .outputs
955            .iter()
956            .map(|&id| {
957                let off = arena.byte_offset(id);
958                let len = graph.node(id).shape.num_elements().unwrap_or(0);
959                (off, len)
960            })
961            .collect();
962
963        for node in graph.nodes() {
964            if let Op::Constant { data } = &node.op
965                && arena.has_buffer(node.id)
966                && !data.is_empty()
967            {
968                match node.shape.dtype() {
969                    // True-width dtypes (their arena slot is sized to the real
970                    // element width, not f32): copy the raw bytes. I64/I32/U32
971                    // constants were previously caught by the f32-reinterpret
972                    // branch below, which read them in 4-byte chunks as f32 —
973                    // corrupting e.g. the VITS sequence-mask `arange` constant
974                    // (i64 [0..T-1]) so the downstream i64 `Compare` read garbage.
975                    DType::F64
976                    | DType::F16
977                    | DType::BF16
978                    | DType::I64
979                    | DType::I32
980                    | DType::U32 => {
981                        let off = arena.byte_offset(node.id);
982                        let buf = arena.raw_buf_mut();
983                        let n = buf.len().saturating_sub(off).min(data.len());
984                        buf[off..off + n].copy_from_slice(&data[..n]);
985                    }
986                    _ => {
987                        let buf = arena.slice_mut(node.id);
988                        let n_floats = data.len() / 4;
989                        let n = buf.len().min(n_floats);
990                        for i in 0..n {
991                            let bytes = [
992                                data[i * 4],
993                                data[i * 4 + 1],
994                                data[i * 4 + 2],
995                                data[i * 4 + 3],
996                            ];
997                            buf[i] = f32::from_le_bytes(bytes);
998                        }
999                    }
1000                }
1001            }
1002        }
1003
1004        CpuExecutable {
1005            graph,
1006            arena,
1007            input_ids,
1008            param_ids,
1009            node_dtypes,
1010            io_manifest,
1011            schedule,
1012            input_slots,
1013            output_slots,
1014            handles: HashMap::new(),
1015            active_extent: None,
1016            moe_resident: None,
1017            moe_resident_layers: None,
1018            moe_topk_capture: None,
1019            baseline_written: false,
1020        }
1021    }
1022
1023    #[derive(Clone)]
1024    struct CpuExecutable {
1025        graph: Graph,
1026        arena: Arena,
1027        input_ids: HashMap<String, NodeId>,
1028        param_ids: HashMap<String, NodeId>,
1029        /// Per-node arena dtype. Lets set_param/run cast f32 ↔ F16/BF16
1030        /// when AutoMixedPrecision has rewritten the graph.
1031        node_dtypes: HashMap<NodeId, DType>,
1032        /// User-facing boundary dtypes (before f32 promotion for CPU exec).
1033        io_manifest: cpu_low_precision::IoDtypeManifest,
1034        schedule: thunk::ThunkSchedule,
1035        // Pre-resolved: ordered list of (input_name, arena_byte_offset, max_elems, dtype)
1036        input_slots: Vec<(String, usize, usize, DType)>,
1037        /// Output (byte_offset, num_elements). dtype is in node_dtypes.
1038        output_slots: Vec<(usize, usize)>,
1039        /// Persistent buffer handles (KV-cache, optimizer state, etc.).
1040        /// Lives outside the arena and survives across run() calls.
1041        /// On run(): if a handle's name matches a graph input, the
1042        /// handle's data is used as the input.
1043        handles: HashMap<String, Vec<f32>>,
1044        /// Active-extent hint (`Some((actual, upper))`) for L1 bucketed
1045        /// dispatch. When set AND every thunk in the schedule is in
1046        /// `Thunk::safe_for_active_extent`, the executor processes only
1047        /// `actual / upper` of each kernel's work. Otherwise (or when
1048        /// `None`) runs at the full compiled extent. See PLAN L1.
1049        active_extent: Option<(usize, usize)>,
1050        moe_resident: Option<std::sync::Arc<[bool]>>,
1051        moe_resident_layers: Option<std::sync::Arc<Vec<std::sync::Arc<[bool]>>>>,
1052        moe_topk_capture: Option<std::sync::Arc<rlx_cpu::moe_topk_capture::MoeTopkCapture>>,
1053        /// Whether params + constants are already resident in the arena. While
1054        /// `true`, `restore_arena_baseline` zeros only the scratch buffers instead
1055        /// of re-zeroing + rewriting every param each run (which is O(params) and
1056        /// allocates a full params clone — catastrophic for multi-GB models).
1057        /// `set_param`/`set_param_typed` reset it to `false`.
1058        baseline_written: bool,
1059    }
1060
1061    unsafe impl Send for CpuExecutable {}
1062
1063    impl CpuExecutable {
1064        /// Write a f32 input slice into the arena, casting to the node's dtype.
1065        fn write_input(&mut self, id: NodeId, data: &[f32]) {
1066            let dtype = self.node_dtypes.get(&id).copied().unwrap_or(DType::F32);
1067            let off = self.arena.byte_offset(id);
1068            let buf = self.arena.raw_buf_mut();
1069            let elem_size = dtype.size_bytes();
1070            let max_elems = (buf.len() - off) / elem_size;
1071            unsafe {
1072                write_typed_from_f32(buf.as_mut_ptr().add(off), dtype, data, max_elems);
1073            }
1074        }
1075
1076        /// Read a node's arena bytes back as Vec<f32>, casting from its dtype.
1077        fn read_output(&self, id: NodeId) -> Vec<f32> {
1078            let dtype = self.node_dtypes.get(&id).copied().unwrap_or(DType::F32);
1079            let off = self.arena.byte_offset(id);
1080            let buf = self.arena.raw_buf();
1081            let n_elems = self.graph.node(id).shape.num_elements().unwrap_or(0);
1082            unsafe { read_typed_to_f32(buf.as_ptr().add(off), dtype, n_elems) }
1083        }
1084    }
1085
1086    impl ExecutableGraph for CpuExecutable {
1087        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
1088            Box::new(self.clone())
1089        }
1090        fn set_param(&mut self, name: &str, data: &[f32]) {
1091            // Params live solely in the arena (dedicated, never-aliased slots, see
1092            // the memory planner) — no redundant CPU-side copy is kept, which would
1093            // double the weight footprint for multi-GB models.
1094            // Cast f32 → arena dtype when the param has been rewritten to F16/BF16.
1095            if let Some(&id) = self.param_ids.get(name)
1096                && self.arena.has_buffer(id)
1097            {
1098                let dtype = self.node_dtypes.get(&id).copied().unwrap_or(DType::F32);
1099                let off = self.arena.byte_offset(id);
1100                let buf = self.arena.raw_buf_mut();
1101                let elem_size = dtype.size_bytes();
1102                let max_elems = (buf.len() - off) / elem_size;
1103                unsafe {
1104                    write_typed_from_f32(buf.as_mut_ptr().add(off), dtype, data, max_elems);
1105                }
1106            }
1107        }
1108
1109        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
1110            self.restore_arena_baseline();
1111            // 1. Apply persistent handles first — they act like default inputs.
1112            //    Explicit `inputs` passed to run() override matching handle names.
1113            let handle_names: Vec<String> = self.handles.keys().cloned().collect();
1114            for name in &handle_names {
1115                if let Some(&id) = self.input_ids.get(name)
1116                    && self.arena.has_buffer(id)
1117                {
1118                    let data = self.handles.get(name).cloned().unwrap_or_default();
1119                    self.write_input(id, &data);
1120                }
1121            }
1122            // 2. Explicit per-call inputs override handles.
1123            for &(name, data) in inputs {
1124                if let Some(&id) = self.input_ids.get(name)
1125                    && self.arena.has_buffer(id)
1126                {
1127                    self.write_input(id, data);
1128                }
1129            }
1130
1131            // Active-extent fast-path (PLAN L1): if hinted AND every thunk
1132            // in the schedule supports it, run scaled. Otherwise fall back
1133            // to full-extent dispatch — preserves correctness when the
1134            // schedule contains a thunk that hasn't yet been wired in.
1135            let active_used = if let Some((actual, upper)) = self.active_extent {
1136                thunk::execute_thunks_active(
1137                    &self.schedule,
1138                    self.arena.raw_buf_mut(),
1139                    actual,
1140                    upper,
1141                )
1142            } else {
1143                false
1144            };
1145            if !active_used {
1146                // Execute via pre-compiled thunks (zero per-node dispatch overhead)
1147                thunk::execute_thunks(&self.schedule, self.arena.raw_buf_mut());
1148            }
1149
1150            // 3. Sync any handle whose name matches a graph OUTPUT —
1151            //    KV-cache pattern: outputs flow back into the same-named
1152            //    handle for the next iteration.
1153            for (idx, &out_id) in self.graph.outputs.iter().enumerate() {
1154                let name = format!("out{idx}");
1155                if self.handles.contains_key(&name) {
1156                    let v = self.read_output(out_id);
1157                    self.handles.insert(name, v);
1158                }
1159            }
1160
1161            self.graph
1162                .outputs
1163                .iter()
1164                .map(|&out_id| self.read_output(out_id))
1165                .collect()
1166        }
1167
1168        fn run_raw(&mut self, inputs: &[(&str, &[f32])]) -> Vec<(*const f32, usize)> {
1169            self.restore_arena_baseline();
1170            // Copy inputs by name (HashMap lookup), casting to arena dtype.
1171            for &(name, data) in inputs {
1172                if let Some(&id) = self.input_ids.get(name)
1173                    && self.arena.has_buffer(id)
1174                {
1175                    self.write_input(id, data);
1176                }
1177            }
1178            thunk::execute_thunks(&self.schedule, self.arena.raw_buf_mut());
1179            // Note: pointers are raw arena bytes — for F16 outputs, callers
1180            // must read 2 bytes/elem, not 4. run() is the safe path for
1181            // mixed precision; run_raw() is only meaningful for F32.
1182            self.graph
1183                .outputs
1184                .iter()
1185                .map(|&out_id| {
1186                    let (ptr, len) = self.arena.raw_ptr(out_id);
1187                    (ptr as *const f32, len)
1188                })
1189                .collect()
1190        }
1191
1192        /// Fastest path: inputs by index (matching input_slots order), zero-copy output.
1193        /// No HashMap, no name matching, no Vec allocation. Casts f32 input
1194        /// to F16/BF16 if the input slot's dtype was rewritten.
1195        fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
1196            self.restore_arena_baseline();
1197            let buf = self.arena.raw_buf_mut();
1198            for (i, &data) in inputs.iter().enumerate() {
1199                if i < self.input_slots.len() {
1200                    let (_, off, max_len, dtype) = &self.input_slots[i];
1201                    unsafe {
1202                        write_typed_from_f32(buf.as_mut_ptr().add(*off), *dtype, data, *max_len);
1203                    }
1204                }
1205            }
1206            thunk::execute_thunks(&self.schedule, self.arena.raw_buf_mut());
1207            &self.output_slots
1208        }
1209
1210        fn arena_ptr(&self) -> *const u8 {
1211            self.arena.raw_buf_mut_ptr()
1212        }
1213
1214        fn bind_handle(&mut self, name: &str, data: &[f32]) -> bool {
1215            // Persistent buffer: stored separately from arena, survives run().
1216            // If the name matches a graph input, run() will use this data
1217            // as the input. If the graph also writes back to this name (via
1218            // an output binding pattern), read_handle returns the latest.
1219            self.handles.insert(name.to_string(), data.to_vec());
1220            true
1221        }
1222
1223        fn read_handle(&self, name: &str) -> Option<Vec<f32>> {
1224            self.handles.get(name).cloned()
1225        }
1226
1227        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
1228            self.active_extent = extent;
1229        }
1230
1231        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
1232            *self.schedule.rng.write().unwrap() = rng;
1233        }
1234
1235        fn rng(&self) -> rlx_ir::RngOptions {
1236            *self.schedule.rng.read().unwrap()
1237        }
1238
1239        fn set_moe_resident_experts(&mut self, mask: &[bool]) {
1240            self.moe_resident_layers = None;
1241            self.schedule.moe_resident_layers = None;
1242            self.moe_resident = Some(Arc::from(mask));
1243            self.schedule.moe_resident = self.moe_resident.clone();
1244        }
1245
1246        fn set_moe_resident_experts_per_layer(&mut self, masks: &[&[bool]]) {
1247            self.moe_resident = None;
1248            self.schedule.moe_resident = None;
1249            let layers: Vec<Arc<[bool]>> = masks.iter().map(|m| Arc::from(*m)).collect();
1250            let arc = Arc::new(layers);
1251            self.moe_resident_layers = Some(arc.clone());
1252            self.schedule.moe_resident_layers = Some(arc);
1253        }
1254
1255        fn enable_moe_topk_capture(&mut self, num_experts: usize) -> bool {
1256            let cap = rlx_cpu::moe_topk_capture::MoeTopkCapture::new(num_experts);
1257            self.moe_topk_capture = Some(cap.clone());
1258            self.schedule.moe_topk_capture = Some(cap);
1259            true
1260        }
1261
1262        fn take_moe_topk_capture(&mut self) -> Option<Vec<Vec<u32>>> {
1263            let cap = self.moe_topk_capture.as_ref()?;
1264            let layers = cap.take_layers();
1265            if layers.is_empty() {
1266                None
1267            } else {
1268                Some(layers)
1269            }
1270        }
1271
1272        fn take_moe_residency_stats(&mut self) -> Option<crate::MoeResidencyStats> {
1273            rlx_cpu::moe_residency::take_last_forward_stats()
1274        }
1275
1276        /// Typed param upload. F32 / F16 / BF16 go through the existing
1277        /// widen-to-f32 path (the CPU arena is historically f32 with
1278        /// optional half-precision rewrite). F64 (and any future
1279        /// non-widenable dtype) lands directly in the arena as bytes —
1280        /// the f32 path would lose precision.
1281        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
1282            if matches!(dtype, DType::F64 | DType::I64 | DType::I32 | DType::U32) {
1283                self.set_param_bytes(name, data, dtype);
1284                return;
1285            }
1286            // U8 / I8 raw byte tensors: opaque storage for the GGUF
1287            // K-quant `Op::DequantMatMul` path (weights stay packed
1288            // in the arena). One arena byte = one element.
1289            if matches!(dtype, DType::U8 | DType::I8) {
1290                self.set_param_bytes(name, data, dtype);
1291                return;
1292            }
1293            if dtype == DType::F32 {
1294                let n = data.len() / 4;
1295                let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
1296                self.set_param(name, s);
1297            } else {
1298                let f32_buf = super::widen_bytes_to_f32(data, dtype);
1299                self.set_param(name, &f32_buf);
1300            }
1301        }
1302
1303        /// Typed run with mixed-dtype inputs/outputs.
1304        ///
1305        /// For each input: if its declared graph dtype matches the
1306        /// caller's bytes, we write directly into the arena (zero
1307        /// precision loss — F64 stays F64). For F32 with a half-precision
1308        /// arena rewrite, we widen as before. F16/BF16 callers go
1309        /// through the existing widen path.
1310        ///
1311        /// Outputs are read straight from the arena in the graph node's
1312        /// declared dtype — F64 outputs come back as 8 bytes/element,
1313        /// F32 as 4, etc.
1314        fn run_typed(
1315            &mut self,
1316            inputs: &[(&str, &[u8], rlx_ir::DType)],
1317        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
1318            // Decide: are *all* inputs F64? If so, use the direct-byte
1319            // path for everything and skip the f32 widening machinery
1320            // entirely. Mixed dtype graphs (F32 + F64) take the
1321            // per-input dispatch route below.
1322            let all_f64 = !inputs.is_empty() && inputs.iter().all(|(_, _, dt)| *dt == DType::F64);
1323
1324            if all_f64 {
1325                for (name, data, _) in inputs {
1326                    if let Some(&id) = self.input_ids.get(*name) {
1327                        if !self.arena.has_buffer(id) {
1328                            continue;
1329                        }
1330                        let off = self.arena.byte_offset(id);
1331                        let buf = self.arena.raw_buf_mut();
1332                        let n = data.len();
1333                        debug_assert!(
1334                            off + n <= buf.len(),
1335                            "run_typed: input '{name}' overflows arena slot"
1336                        );
1337                        buf[off..off + n].copy_from_slice(data);
1338                    }
1339                }
1340                thunk::execute_thunks(&self.schedule, self.arena.raw_buf_mut());
1341            } else {
1342                // Mixed-dtype path: dtypes that survive untouched
1343                // through the f32-aliased arena (F64, I32, I64, U32)
1344                // go in as bytes; F32 and the half-precision family
1345                // route through widen-to-f32 + run.
1346                let mut f32_owned: Vec<(String, Vec<f32>)> = Vec::new();
1347                for (name, data, dt) in inputs {
1348                    let direct = matches!(
1349                        *dt,
1350                        DType::F64 | DType::I32 | DType::I64 | DType::U32 | DType::C64
1351                    );
1352                    if direct {
1353                        if let Some(&id) = self.input_ids.get(*name) {
1354                            if !self.arena.has_buffer(id) {
1355                                continue;
1356                            }
1357                            let off = self.arena.byte_offset(id);
1358                            let buf = self.arena.raw_buf_mut();
1359                            buf[off..off + data.len()].copy_from_slice(data);
1360                        }
1361                    } else {
1362                        let v = super::widen_bytes_to_f32(data, *dt);
1363                        f32_owned.push((name.to_string(), v));
1364                    }
1365                }
1366                for (name, data) in &f32_owned {
1367                    if let Some(&id) = self.input_ids.get(name.as_str()) {
1368                        if self.arena.has_buffer(id) {
1369                            self.write_input(id, data);
1370                        }
1371                    }
1372                }
1373                let active_used = if let Some((actual, upper)) = self.active_extent {
1374                    thunk::execute_thunks_active(
1375                        &self.schedule,
1376                        self.arena.raw_buf_mut(),
1377                        actual,
1378                        upper,
1379                    )
1380                } else {
1381                    false
1382                };
1383                if !active_used {
1384                    thunk::execute_thunks(&self.schedule, self.arena.raw_buf_mut());
1385                }
1386            }
1387
1388            // Read outputs in declared boundary dtypes.
1389            self.graph
1390                .outputs
1391                .iter()
1392                .enumerate()
1393                .map(|(idx, &id)| {
1394                    let exec_dtype = self.graph.node(id).shape.dtype();
1395                    let declared = self.io_manifest.output_dtype(idx, exec_dtype);
1396                    if matches!(
1397                        exec_dtype,
1398                        DType::F64
1399                            | DType::F16
1400                            | DType::BF16
1401                            | DType::I32
1402                            | DType::I64
1403                            | DType::U32
1404                            | DType::C64
1405                    ) {
1406                        let n_elems = self.graph.node(id).shape.num_elements().unwrap_or(0);
1407                        let n_bytes = n_elems * exec_dtype.size_bytes();
1408                        let off = self.arena.byte_offset(id);
1409                        let bytes = self.arena.raw_buf()[off..off + n_bytes].to_vec();
1410                        return (bytes, declared);
1411                    }
1412                    let f32_vals = self.read_output(id);
1413                    if declared != exec_dtype {
1414                        return (super::narrow_f32_to_bytes(&f32_vals, declared), declared);
1415                    }
1416                    let bytes = f32_vals.iter().flat_map(|v| v.to_le_bytes()).collect();
1417                    (bytes, declared)
1418                })
1419                .collect()
1420        }
1421    }
1422
1423    impl CpuExecutable {
1424        /// Clear ephemeral (scratch) arena slots before each `run()`. Params are
1425        /// written into their dedicated, never-aliased arena slots by `set_param`
1426        /// and live for the whole execution, so they are NOT re-zeroed/rewritten
1427        /// here — only the intermediate buffers (which carry stale data from the
1428        /// previous pass) are zeroed. Compile-time constants are written once.
1429        ///
1430        /// This keeps the per-run cost O(scratch) instead of O(params): a previous
1431        /// version cloned + rewrote the entire (multi-GB) weight region every run,
1432        /// which made large models swap-thrash.
1433        fn restore_arena_baseline(&mut self) {
1434            // Persistent slots (params + constants) — never zeroed.
1435            let persistent: std::collections::HashSet<NodeId> = {
1436                let mut s: std::collections::HashSet<NodeId> =
1437                    self.param_ids.values().copied().collect();
1438                for node in self.graph.nodes() {
1439                    if matches!(node.op, Op::Constant { .. }) {
1440                        s.insert(node.id);
1441                    }
1442                }
1443                s
1444            };
1445
1446            // Write compile-time constants into the arena once (a fresh arena is
1447            // zero-initialized; params are already resident via set_param).
1448            if !self.baseline_written {
1449                let constants: Vec<(NodeId, DType, Vec<u8>)> = self
1450                    .graph
1451                    .nodes()
1452                    .iter()
1453                    .filter_map(|node| {
1454                        if let Op::Constant { data } = &node.op
1455                            && self.arena.has_buffer(node.id)
1456                            && !data.is_empty()
1457                        {
1458                            Some((node.id, node.shape.dtype(), data.clone()))
1459                        } else {
1460                            None
1461                        }
1462                    })
1463                    .collect();
1464                for (id, dtype, data) in constants {
1465                    self.write_constant_to_arena(id, dtype, &data);
1466                }
1467                self.baseline_written = true;
1468            }
1469
1470            // Zero everything EXCEPT the persistent (param + constant) byte ranges.
1471            //
1472            // We zero the *complement* of the persistent ranges rather than each
1473            // scratch node's exact byte span. That covers inter-slot padding and
1474            // arena gaps too — a kernel that over-reads its input into adjacent
1475            // alignment padding (common in SIMD reductions) would otherwise pick up
1476            // stale bytes from a previous run, since per-node zeroing only clears
1477            // `num_elements` and leaves the padding dirty. The cost stays O(arena −
1478            // params): for a 7B the params dominate the arena and are skipped, so
1479            // the swept region is tiny.
1480            let mut keep: Vec<(usize, usize)> = self
1481                .graph
1482                .nodes()
1483                .iter()
1484                .filter_map(|node| {
1485                    let id = node.id;
1486                    if !persistent.contains(&id) || !self.arena.has_buffer(id) {
1487                        return None;
1488                    }
1489                    let dtype = self.node_dtypes.get(&id).copied().unwrap_or(DType::F32);
1490                    let nbytes = node.shape.num_elements().unwrap_or(0) * dtype.size_bytes();
1491                    let off = self.arena.byte_offset(id);
1492                    Some((off, off + nbytes))
1493                })
1494                .collect();
1495            keep.sort_unstable();
1496
1497            let buf = self.arena.raw_buf_mut();
1498            let len = buf.len();
1499            let mut cursor = 0usize;
1500            for (start, end) in keep {
1501                let start = start.min(len);
1502                if cursor < start {
1503                    buf[cursor..start].fill(0);
1504                }
1505                cursor = cursor.max(end.min(len));
1506            }
1507            if cursor < len {
1508                buf[cursor..len].fill(0);
1509            }
1510        }
1511
1512        fn write_constant_to_arena(&mut self, id: NodeId, dtype: DType, data: &[u8]) {
1513            match dtype {
1514                DType::F64 | DType::F16 | DType::BF16 | DType::U8 | DType::I8 => {
1515                    let off = self.arena.byte_offset(id);
1516                    let buf = self.arena.raw_buf_mut();
1517                    let n = buf.len().saturating_sub(off).min(data.len());
1518                    buf[off..off + n].copy_from_slice(&data[..n]);
1519                }
1520                _ => {
1521                    let buf = self.arena.slice_mut(id);
1522                    let n_floats = data.len() / 4;
1523                    let n = buf.len().min(n_floats);
1524                    for i in 0..n {
1525                        let bytes = [
1526                            data[i * 4],
1527                            data[i * 4 + 1],
1528                            data[i * 4 + 2],
1529                            data[i * 4 + 3],
1530                        ];
1531                        buf[i] = f32::from_le_bytes(bytes);
1532                    }
1533                }
1534            }
1535        }
1536
        /// Direct-byte param upload — copies the caller's bytes into the
        /// arena slot for the named param without any dtype conversion.
        /// Used by `set_param_typed` for dtypes that f32-widening would
        /// corrupt (F64). Caller is responsible for matching the param's
        /// declared graph dtype.
        ///
        /// `_dtype` is accepted but not consulted: the bytes are forwarded
        /// verbatim to `write_param_bytes_to_arena`, which does nothing when
        /// `name` is not a known param or the param has no arena buffer.
        fn set_param_bytes(&mut self, name: &str, data: &[u8], _dtype: rlx_ir::DType) {
            // Byte-backed params also live solely in the arena (no CPU-side copy).
            self.write_param_bytes_to_arena(name, data);
        }
1546
1547        fn write_param_bytes_to_arena(&mut self, name: &str, data: &[u8]) {
1548            if let Some(&id) = self.param_ids.get(name)
1549                && self.arena.has_buffer(id)
1550            {
1551                let off = self.arena.byte_offset(id);
1552                let buf = self.arena.raw_buf_mut();
1553                debug_assert!(
1554                    off + data.len() <= buf.len(),
1555                    "set_param_bytes: '{name}' would overflow arena slot"
1556                );
1557                buf[off..off + data.len()].copy_from_slice(data);
1558            }
1559        }
1560    }
1561}
1562
1563// ── Metal Backend ───────────────────────────────────────────────────────
1564
1565// ── wgpu Backend ────────────────────────────────────────────────────────
1566
1567#[cfg(feature = "gpu")]
1568pub mod wgpu_backend {
1569    use super::*;
1570    use rlx_ir::OpKind;
1571    use rlx_wgpu::backend::WgpuExecutable;
1572
    /// Unit struct that implements [`Backend`] by compiling graphs into
    /// `WgpuExecutable`s.
    pub struct WgpuBackend;
1574
    /// PLAN L4: ops the wgpu backend can lower today. The fused
    /// macro-kernels (FAB, FTL, FusedSwiGLU) get decomposed by
    /// `crate::unfuse::unfuse` upstream — they're listed here too so
    /// graphs that already contain them legalize cleanly. Conv1d/3d
    /// and Pool1d/3d are deferred (Conv2d only).
    ///
    /// NOTE(review): `Conv3d` and `ConvTranspose3d` are nevertheless listed
    /// below (their inline comment says they use CPU kernels), so the
    /// "Conv2d only" remark presumably covers native kernels only — confirm.
    ///
    /// This slice is what `WgpuBackend::supported_ops` returns, and it is the
    /// op set `compile` / `compile_lir` hand to graph legalization and
    /// preparation.
    const WGPU_SUPPORTED_OPS: &[OpKind] = &[
        OpKind::Input,
        OpKind::Param,
        OpKind::Constant,
        OpKind::Activation,
        OpKind::Cast,
        OpKind::StopGradient,
        OpKind::Binary,
        OpKind::Compare,
        OpKind::Where,
        OpKind::Fma,
        OpKind::ElementwiseRegion,
        OpKind::TransformRegion,
        OpKind::BatchElementwiseRegion,
        OpKind::MatMul,
        OpKind::DotGeneral,
        OpKind::LayerNorm,
        OpKind::LayerNorm2d,
        OpKind::GroupNorm,
        OpKind::ResizeNearest2x,
        OpKind::RmsNorm,
        OpKind::Attention,
        OpKind::AttentionBackward,
        OpKind::RmsNormBackwardInput,
        OpKind::RmsNormBackwardGamma,
        OpKind::RmsNormBackwardBeta,
        // LayerNorm backward family:
        //   * Input  — single workgroup-per-row fused kernel.
        //   * Gamma  — two-dispatch (partial + reduce) that uses a tail
        //              scratch zone in the arena to hold per-chunk
        //              partial sums; the reduce kernel sums them.
        // Both beat the autodiff-decomposed primitive chain.
        OpKind::LayerNormBackwardInput,
        OpKind::LayerNormBackwardGamma,
        OpKind::RopeBackward,
        OpKind::CumsumBackward,
        OpKind::GatherBackward,
        OpKind::Rope,
        OpKind::Reshape,
        OpKind::Transpose,
        OpKind::Narrow,
        OpKind::Concat,
        OpKind::Expand,
        OpKind::Gather,
        OpKind::Reverse,
        OpKind::Reduce,
        OpKind::Softmax,
        OpKind::SoftmaxCrossEntropy,
        OpKind::ArgMax,
        OpKind::ArgMin,
        OpKind::Cumsum,
        OpKind::TopK,
        OpKind::Sample,
        OpKind::Conv,
        OpKind::Im2Col,
        OpKind::Pool,
        OpKind::GroupedMatMul,
        OpKind::DequantGroupedMatMul,
        OpKind::DequantMoEWeights,
        OpKind::ScatterAdd,
        OpKind::SelectiveScan,
        OpKind::Lstm,
        OpKind::Gru,
        OpKind::Rnn,
        OpKind::Mamba2,
        // Transposed conv (vision U-Net decoder) — host fallback via the CPU kernel.
        OpKind::ConvTranspose2d,
        // 3-D convs (volumetric UNETR-style decoders) — CPU NCDHW kernels.
        OpKind::Conv3d,
        OpKind::ConvTranspose3d,
        OpKind::DequantMatMul,
        OpKind::FusedMatMulBiasAct,
        OpKind::FusedResidualLN,
        OpKind::FusedResidualRmsNorm,
        OpKind::FusedSwiGLU,
        OpKind::FusedAttentionBlock,
        OpKind::FusedTransformerLayer,
        // Native FFT (WGSL radix-2): f32 only, power-of-2 N ≤ 1024.
        // Anything outside that envelope panics at lowering with a
        // "pin to Device::Cpu" hint. No host fallback — WGPU has no
        // unified memory, so silent CPU round-trip would be a hidden
        // performance cliff.
        OpKind::Fft,
        // Op::Scan (arbitrary-body recurrence) via readback host fallback —
        // compile the body once, loop it on the CPU against an arena readback.
        // Enables IIR (`biquad`/`sosfilt`) on wgpu.
        OpKind::Scan,
        OpKind::LogMel,
        OpKind::LogMelBackward,
        OpKind::WelchPeaks,
        // 3D Gaussian splat: native Metal / CPU reference per backend.
        OpKind::GaussianSplatRender,
        OpKind::GaussianSplatRenderBackward,
        OpKind::GaussianSplatPrepare,
        OpKind::GaussianSplatRasterize,
        OpKind::Custom,
        OpKind::RngNormal,
        OpKind::RngUniform,
        // LoRA, If, While: not yet wired in wgpu — fail loudly.
    ];
1680
    impl Backend for WgpuBackend {
        /// Ops this backend accepts; returns `WGPU_SUPPORTED_OPS`.
        fn supported_ops(&self) -> &'static [OpKind] {
            WGPU_SUPPORTED_OPS
        }

        /// Compile a high-level `Graph` into a wgpu executable.
        ///
        /// Steps, in order: lower control flow, legalize/rewrite against
        /// `WGPU_SUPPORTED_OPS`, pre-compile cleanup, broadcast legalization,
        /// the staged backend compile, optional auto-mixed-precision (only when
        /// `options.policy` is set), and finally `prepare_f32_exec_graph`, whose
        /// I/O dtype manifest is stored on the returned wrapper.
        ///
        /// # Panics
        /// Panics with the formatted legalize error when the graph contains ops
        /// that cannot be legalized for wgpu.
        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
            use rlx_opt::pass::Pass as _;
            let graph = rlx_opt::LowerControlFlow.run(graph);
            let graph = rlx_opt::legalize_or_rewrite_for_backend(graph, WGPU_SUPPORTED_OPS)
                .unwrap_or_else(|errors| {
                    panic!("{}", rlx_opt::format_legalize_error("wgpu", &errors));
                });
            let graph = crate::precompile::precompile_cleanup(graph, options);
            // Materialize mid-axis broadcasts before MarkElementwiseRegions:
            // wgpu Binary/region kernels only handle trailing/scalar broadcast
            // via modulus; EEG patch embed uses [1,C,1,D] + [1,C,P,D].
            let graph = rlx_opt::LegalizeBroadcast.run(graph);
            // ORDER MATTERS: targeted-pattern fusions run BEFORE the
            // catch-all `MarkElementwiseRegions`. Otherwise the region
            // pass swallows the Add / Activation nodes into chains and
            // FuseMatMulBiasAct / FuseResidualLN fail to match the
            // narrower patterns they look for. (Metal pipeline at line
            // ~377 already orders these correctly; wgpu was inverted
            // and silently shipped 13 unfused LayerNorms per BERT
            // forward where 12 should have been FusedResidualLN.)
            //
            // NOTE(review): no individual fusion pass is invoked in this
            // function, so the ordering described above is presumably
            // enforced inside `compile_graph_stages_for_backend` — confirm.
            let compile_result = crate::stages::compile_graph_stages_for_backend(
                rlx_driver::Device::Gpu,
                graph,
                options,
                WGPU_SUPPORTED_OPS,
            );
            crate::stages::maybe_log_fusion(&compile_result.fusion);
            let graph = compile_result.lir.into_graph();
            // Mixed precision is opt-in: without a policy the graph is untouched.
            let graph = match options.policy.clone() {
                Some(p) => rlx_opt::AutoMixedPrecision::new(p).run(graph),
                None => graph,
            };
            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
            Box::new(WgpuExecutableWrapper {
                inner: WgpuExecutable::compile_rng(graph, options.rng),
                io_manifest,
            })
        }

        /// Compile an already-lowered `LirModule` into a wgpu executable.
        ///
        /// Converts the module back to a graph, legalizes broadcasts, runs
        /// `prepare_fused_graph` against `WGPU_SUPPORTED_OPS`, then
        /// `prepare_f32_exec_graph`, and wraps the result exactly as `compile`
        /// does.
        fn compile_lir(
            &self,
            lir: LirModule,
            options: &CompileOptions,
        ) -> Box<dyn ExecutableGraph> {
            use rlx_opt::pass::Pass as _;
            // LIR may already contain fused ElementwiseRegions; legalize
            // broadcasts on the unfused graph shape before backend prep.
            let graph = rlx_opt::LegalizeBroadcast.run(lir.into_graph());
            let graph = prepare_fused_graph(graph, options, WGPU_SUPPORTED_OPS, "wgpu");
            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
            Box::new(WgpuExecutableWrapper {
                inner: WgpuExecutable::compile_rng(graph, options.rng),
                io_manifest,
            })
        }
    }
1742
    /// Adapter that exposes a `WgpuExecutable` through the [`ExecutableGraph`]
    /// trait, carrying the I/O dtype manifest produced by
    /// `cpu_low_precision::prepare_f32_exec_graph` at compile time.
    struct WgpuExecutableWrapper {
        /// The compiled wgpu executable that trait calls are forwarded to.
        inner: WgpuExecutable,
        /// Consulted by `run_typed` to resolve each output's declared dtype.
        io_manifest: cpu_low_precision::IoDtypeManifest,
    }
1747
    // SAFETY: NOTE(review): nothing visible in this file shows why
    // `WgpuExecutable` is sound to move across threads — confirm against
    // `rlx_wgpu` and record the invariant here.
    unsafe impl Send for WgpuExecutableWrapper {}
1749
1750    impl ExecutableGraph for WgpuExecutableWrapper {
1751        fn set_param(&mut self, name: &str, data: &[f32]) {
1752            self.inner.set_param(name, data);
1753        }
1754        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
1755            self.inner.run(inputs)
1756        }
1757        fn run_read_outputs(
1758            &mut self,
1759            inputs: &[(&str, &[f32])],
1760            read_indices: Option<&[usize]>,
1761        ) -> Vec<Vec<f32>> {
1762            self.inner.run_read_outputs(inputs, read_indices)
1763        }
1764        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
1765            self.inner.bind_gpu_handle(name, data)
1766        }
1767        fn has_gpu_handle(&self, name: &str) -> bool {
1768            self.inner.has_gpu_handle(name)
1769        }
1770        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
1771            self.inner.set_gpu_handle_feed(handle_name, output_index);
1772            true
1773        }
1774        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
1775            self.inner.read_gpu_handle(name)
1776        }
1777        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
1778            self.inner.set_active_extent(extent);
1779        }
1780
1781        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
1782            self.inner.set_rng(rng);
1783        }
1784
1785        fn rng(&self) -> rlx_ir::RngOptions {
1786            self.inner.rng()
1787        }
1788
1789        /// Typed param upload: widens F16/BF16 to F32 at the host boundary,
1790        /// since the wgpu arena is f32-uniform.
1791        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
1792            match dtype {
1793                rlx_ir::DType::U8 | rlx_ir::DType::I8 => {
1794                    self.inner.set_param_bytes(name, data);
1795                }
1796                rlx_ir::DType::F32 => {
1797                    let n = data.len() / 4;
1798                    let f32_slice =
1799                        unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
1800                    self.inner.set_param(name, f32_slice);
1801                }
1802                rlx_ir::DType::F16 => {
1803                    let n = data.len() / 2;
1804                    let f16_slice =
1805                        unsafe { std::slice::from_raw_parts(data.as_ptr() as *const half::f16, n) };
1806                    let f32: Vec<f32> = f16_slice.iter().map(|h| h.to_f32()).collect();
1807                    self.inner.set_param(name, &f32);
1808                }
1809                rlx_ir::DType::BF16 => {
1810                    let n = data.len() / 2;
1811                    let bf16_slice = unsafe {
1812                        std::slice::from_raw_parts(data.as_ptr() as *const half::bf16, n)
1813                    };
1814                    let f32: Vec<f32> = bf16_slice.iter().map(|h| h.to_f32()).collect();
1815                    self.inner.set_param(name, &f32);
1816                }
1817                other => panic!(
1818                    "rlx-wgpu set_param_typed: dtype {other:?} unsupported \
1819                                 (F32, F16, BF16 only — wgpu arena is f32-uniform)"
1820                ),
1821            }
1822        }
1823
1824        /// Typed run: widen each typed input to F32, run, then narrow each
1825        /// output back to its declared dtype.
1826        fn run_typed(
1827            &mut self,
1828            inputs: &[(&str, &[u8], rlx_ir::DType)],
1829        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
1830            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
1831            for (name, data, dt) in inputs {
1832                let v: Vec<f32> = match *dt {
1833                    rlx_ir::DType::F32 => {
1834                        let n = data.len() / 4;
1835                        unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) }
1836                            .to_vec()
1837                    }
1838                    rlx_ir::DType::F16 => {
1839                        let n = data.len() / 2;
1840                        let s = unsafe {
1841                            std::slice::from_raw_parts(data.as_ptr() as *const half::f16, n)
1842                        };
1843                        s.iter().map(|h| h.to_f32()).collect()
1844                    }
1845                    rlx_ir::DType::BF16 => {
1846                        let n = data.len() / 2;
1847                        let s = unsafe {
1848                            std::slice::from_raw_parts(data.as_ptr() as *const half::bf16, n)
1849                        };
1850                        s.iter().map(|h| h.to_f32()).collect()
1851                    }
1852                    // Integer/bool inputs (e.g. embedding indices `phone_ids`) are
1853                    // widened to f32, matching the f32-arena convention shared with
1854                    // the CPU backend (Gather etc. operate on f32-encoded indices).
1855                    rlx_ir::DType::I64 => {
1856                        let n = data.len() / 8;
1857                        let s =
1858                            unsafe { std::slice::from_raw_parts(data.as_ptr() as *const i64, n) };
1859                        s.iter().map(|&x| x as f32).collect()
1860                    }
1861                    rlx_ir::DType::I32 => {
1862                        let n = data.len() / 4;
1863                        let s =
1864                            unsafe { std::slice::from_raw_parts(data.as_ptr() as *const i32, n) };
1865                        s.iter().map(|&x| x as f32).collect()
1866                    }
1867                    rlx_ir::DType::U8 | rlx_ir::DType::I8 | rlx_ir::DType::Bool => {
1868                        data.iter().map(|&b| b as f32).collect()
1869                    }
1870                    other => {
1871                        panic!("rlx-wgpu run_typed: input '{name}' dtype {other:?} unsupported")
1872                    }
1873                };
1874                owned.push((name.to_string(), v));
1875            }
1876            let refs: Vec<(&str, &[f32])> = owned
1877                .iter()
1878                .map(|(n, d)| (n.as_str(), d.as_slice()))
1879                .collect();
1880            let dtypes =
1881                super::declared_output_dtypes(&self.io_manifest, self.inner.output_dtypes());
1882            let outs = self.inner.run(&refs);
1883            outs.into_iter()
1884                .zip(
1885                    dtypes
1886                        .into_iter()
1887                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
1888                )
1889                .map(|(v, dt)| (narrow_to_dtype(&v, dt), dt))
1890                .collect()
1891        }
1892
1893        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
1894            Box::new(WgpuExecutableWrapper {
1895                inner: self.inner.clone_for_cache(),
1896                io_manifest: self.io_manifest.clone(),
1897            })
1898        }
1899    }
1900
1901    /// Cast every element of a wgpu f32 output buffer down to the
1902    /// declared output dtype, returning the corresponding byte stream.
1903    /// The arena keeps every value as f32; declared output dtypes
1904    /// (Bool, I8, I32, F16, ...) require an exit-time narrowing to be
1905    /// byte-identical with backends that store the native dtype.
1906    fn narrow_to_dtype(v: &[f32], dt: rlx_ir::DType) -> Vec<u8> {
1907        use rlx_ir::DType;
1908        match dt {
1909            DType::F32 => {
1910                let mut bytes = Vec::with_capacity(v.len() * 4);
1911                for &x in v {
1912                    bytes.extend_from_slice(&x.to_le_bytes());
1913                }
1914                bytes
1915            }
1916            DType::F16 => {
1917                let mut bytes = Vec::with_capacity(v.len() * 2);
1918                for &x in v {
1919                    bytes.extend_from_slice(&half::f16::from_f32(x).to_le_bytes());
1920                }
1921                bytes
1922            }
1923            DType::BF16 => {
1924                let mut bytes = Vec::with_capacity(v.len() * 2);
1925                for &x in v {
1926                    bytes.extend_from_slice(&half::bf16::from_f32(x).to_le_bytes());
1927                }
1928                bytes
1929            }
1930            DType::F64 => {
1931                let mut bytes = Vec::with_capacity(v.len() * 8);
1932                for &x in v {
1933                    bytes.extend_from_slice(&(x as f64).to_le_bytes());
1934                }
1935                bytes
1936            }
1937            DType::I8 => v.iter().map(|&x| x as i8 as u8).collect(),
1938            DType::U8 => v.iter().map(|&x| x as u8).collect(),
1939            DType::I16 => {
1940                let mut bytes = Vec::with_capacity(v.len() * 2);
1941                for &x in v {
1942                    bytes.extend_from_slice(&(x as i16).to_le_bytes());
1943                }
1944                bytes
1945            }
1946            DType::I32 => {
1947                let mut bytes = Vec::with_capacity(v.len() * 4);
1948                for &x in v {
1949                    bytes.extend_from_slice(&(x as i32).to_le_bytes());
1950                }
1951                bytes
1952            }
1953            DType::U32 => {
1954                let mut bytes = Vec::with_capacity(v.len() * 4);
1955                for &x in v {
1956                    bytes.extend_from_slice(&(x as u32).to_le_bytes());
1957                }
1958                bytes
1959            }
1960            DType::I64 => {
1961                let mut bytes = Vec::with_capacity(v.len() * 8);
1962                for &x in v {
1963                    bytes.extend_from_slice(&(x as i64).to_le_bytes());
1964                }
1965                bytes
1966            }
1967            DType::Bool => v
1968                .iter()
1969                .map(|&x| if x != 0.0 { 1u8 } else { 0u8 })
1970                .collect(),
1971            // C64 (complex f32 pair) — the wgpu backend's f32 arena
1972            // doesn't synthesize complex outputs today; this branch
1973            // only fires if a graph somehow asks for a C64 output and
1974            // the backend lowered it as 2N real floats. We pass the
1975            // raw f32 stream straight through; downstream code that
1976            // wants complex semantics is responsible for re-pairing.
1977            DType::C64 => {
1978                let mut bytes = Vec::with_capacity(v.len() * 4);
1979                for &x in v {
1980                    bytes.extend_from_slice(&x.to_le_bytes());
1981                }
1982                bytes
1983            }
1984        }
1985    }
1986}
1987
1988// ── Native Vulkan Backend ───────────────────────────────────────────────
1989
1990#[cfg(feature = "vulkan")]
1991pub mod vulkan_backend {
1992    use super::*;
1993    use rlx_ir::OpKind;
1994    use rlx_vulkan::backend::VulkanExecutable;
1995
    /// Unit struct that implements [`Backend`] by compiling graphs into
    /// `VulkanExecutable`s.
    pub struct VulkanBackend;
1997
    impl Backend for VulkanBackend {
        /// Ops this backend accepts, as published by `rlx_vulkan`.
        fn supported_ops(&self) -> &'static [OpKind] {
            rlx_vulkan::backend::SUPPORTED_OPS
        }

        /// Compile `graph` into a Vulkan executable. Of `options`, only
        /// `options.rng` is consulted here.
        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
            // `VulkanExecutable::compile_rng` runs the legalize/rewrite pass
            // (decomposing DotGeneral / Fma / fused ops / non-last reduce down
            // to the native primitive set) itself, so we can hand it the graph
            // directly — no fusion pre-pass that would emit ops it can't lower.
            Box::new(VulkanExecutableWrapper {
                inner: VulkanExecutable::compile_rng(graph, options.rng),
            })
        }
    }
2013
    /// Adapter that exposes a `VulkanExecutable` through the
    /// [`ExecutableGraph`] trait.
    struct VulkanExecutableWrapper {
        /// The compiled Vulkan executable that trait calls are forwarded to.
        inner: VulkanExecutable,
    }
2017
    // SAFETY: NOTE(review): nothing visible in this file shows why
    // `VulkanExecutable` is sound to move across threads — confirm against
    // `rlx_vulkan` and record the invariant here.
    unsafe impl Send for VulkanExecutableWrapper {}
2019
2020    impl ExecutableGraph for VulkanExecutableWrapper {
2021        fn set_param(&mut self, name: &str, data: &[f32]) {
2022            self.inner.set_param(name, data);
2023        }
2024
2025        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
2026            self.inner.run(inputs)
2027        }
2028
2029        fn run_read_outputs(
2030            &mut self,
2031            inputs: &[(&str, &[f32])],
2032            read_indices: Option<&[usize]>,
2033        ) -> Vec<Vec<f32>> {
2034            self.inner.run_read_outputs(inputs, read_indices)
2035        }
2036
2037        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
2038            self.inner.set_active_extent(extent);
2039        }
2040
2041        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
2042            self.inner.set_rng(rng);
2043        }
2044
2045        fn rng(&self) -> rlx_ir::RngOptions {
2046            self.inner.rng()
2047        }
2048
2049        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
2050            self.inner.bind_gpu_handle(name, data)
2051        }
2052
2053        fn has_gpu_handle(&self, name: &str) -> bool {
2054            self.inner.has_gpu_handle(name)
2055        }
2056
2057        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
2058            self.inner.set_gpu_handle_feed(handle_name, output_index);
2059            true
2060        }
2061
2062        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
2063            self.inner.read_gpu_handle(name)
2064        }
2065
2066        fn register_kv_row_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
2067            self.inner.register_kv_row_feed(handle_name, output_index);
2068            true
2069        }
2070
2071        fn feed_kv_row(&mut self, src_row: usize, dst_row: usize, row_elems: usize) -> bool {
2072            self.inner.feed_kv_row(src_row, dst_row, row_elems);
2073            true
2074        }
2075
2076        fn read_output_row(
2077            &self,
2078            out_idx: usize,
2079            row: usize,
2080            row_inner: usize,
2081        ) -> Option<Vec<f32>> {
2082            self.inner.read_output_row(out_idx, row, row_inner)
2083        }
2084
2085        fn read_gpu_handle_row(
2086            &self,
2087            name: &str,
2088            row: usize,
2089            row_inner: usize,
2090        ) -> Option<Vec<f32>> {
2091            self.inner.read_gpu_handle_row(name, row, row_inner)
2092        }
2093
2094        /// The Vulkan arena is f32-uniform: widen F16/BF16/int params to f32.
2095        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
2096            match dtype {
2097                rlx_ir::DType::U8 | rlx_ir::DType::I8 => self.inner.set_param_bytes(name, data),
2098                rlx_ir::DType::F32 => {
2099                    let n = data.len() / 4;
2100                    let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
2101                    self.inner.set_param(name, s);
2102                }
2103                other => {
2104                    let f = super::widen_bytes_to_f32(data, other);
2105                    self.inner.set_param(name, &f);
2106                }
2107            }
2108        }
2109
2110        /// Widen typed inputs to f32, run, then narrow each output back to its
2111        /// declared dtype (byte-identical with native-dtype backends).
2112        fn run_typed(
2113            &mut self,
2114            inputs: &[(&str, &[u8], rlx_ir::DType)],
2115        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
2116            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
2117            for (name, data, dt) in inputs {
2118                let v = if *dt == rlx_ir::DType::F32 {
2119                    let n = data.len() / 4;
2120                    unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) }.to_vec()
2121                } else {
2122                    super::widen_bytes_to_f32(data, *dt)
2123                };
2124                owned.push((name.to_string(), v));
2125            }
2126            let refs: Vec<(&str, &[f32])> = owned
2127                .iter()
2128                .map(|(n, d)| (n.as_str(), d.as_slice()))
2129                .collect();
2130            let dtypes = self.inner.output_dtypes();
2131            let outs = self.inner.run(&refs);
2132            outs.into_iter()
2133                .zip(
2134                    dtypes
2135                        .into_iter()
2136                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
2137                )
2138                .map(|(v, dt)| (super::narrow_f32_to_bytes(&v, dt), dt))
2139                .collect()
2140        }
2141
2142        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
2143            Box::new(VulkanExecutableWrapper {
2144                inner: self.inner.clone_for_cache(),
2145            })
2146        }
2147    }
2148}
2149
2150// ── Intel oneAPI (Level Zero) Backend ───────────────────────────────────
2151
2152#[cfg(feature = "oneapi")]
2153pub mod oneapi_backend {
2154    use super::*;
2155    use rlx_ir::OpKind;
2156    use rlx_oneapi::backend::OneApiExecutable;
2157
    /// Unit struct that implements [`Backend`] by compiling graphs into
    /// `OneApiExecutable`s.
    pub struct OneApiBackend;
2159
    impl Backend for OneApiBackend {
        /// Ops this backend accepts, as published by `rlx_oneapi`.
        fn supported_ops(&self) -> &'static [OpKind] {
            rlx_oneapi::backend::SUPPORTED_OPS
        }

        /// Compile `graph` into a oneAPI executable. Of `options`, only
        /// `options.rng` is consulted here.
        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
            // `OneApiExecutable::compile_rng` runs the legalize/rewrite pass
            // itself (decomposing DotGeneral / Fma / fused ops down to the
            // native primitive set), so hand it the graph directly.
            Box::new(OneApiExecutableWrapper {
                inner: OneApiExecutable::compile_rng(graph, options.rng),
            })
        }
    }
2174
    /// Adapter that exposes a `OneApiExecutable` through the
    /// [`ExecutableGraph`] trait.
    struct OneApiExecutableWrapper {
        /// The compiled oneAPI executable that trait calls are forwarded to.
        inner: OneApiExecutable,
    }
2178
    // SAFETY: NOTE(review): nothing visible in this file shows why
    // `OneApiExecutable` is sound to move across threads — confirm against
    // `rlx_oneapi` and record the invariant here.
    unsafe impl Send for OneApiExecutableWrapper {}
2180
2181    impl ExecutableGraph for OneApiExecutableWrapper {
        /// Upload an f32 parameter by name; forwards to the inner executable.
        fn set_param(&mut self, name: &str, data: &[f32]) {
            self.inner.set_param(name, data);
        }
2185
        /// Run the graph on f32 inputs; forwards to the inner executable.
        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
            self.inner.run(inputs)
        }
2189
        /// Run the graph, passing `read_indices` through to the inner
        /// executable unchanged.
        fn run_read_outputs(
            &mut self,
            inputs: &[(&str, &[f32])],
            read_indices: Option<&[usize]>,
        ) -> Vec<Vec<f32>> {
            self.inner.run_read_outputs(inputs, read_indices)
        }
2197
        /// Forwards `extent` to the inner executable.
        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
            self.inner.set_active_extent(extent);
        }
2201
        /// Replace the inner executable's RNG options.
        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
            self.inner.set_rng(rng);
        }
2205
        /// Current RNG options of the inner executable.
        fn rng(&self) -> rlx_ir::RngOptions {
            self.inner.rng()
        }
2209
2210        /// The oneAPI arena is f32-uniform: widen F16/BF16/int params to f32.
2211        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
2212            match dtype {
2213                rlx_ir::DType::U8 | rlx_ir::DType::I8 => self.inner.set_param_bytes(name, data),
2214                rlx_ir::DType::F32 => {
2215                    let n = data.len() / 4;
2216                    let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
2217                    self.inner.set_param(name, s);
2218                }
2219                other => {
2220                    let f = super::widen_bytes_to_f32(data, other);
2221                    self.inner.set_param(name, &f);
2222                }
2223            }
2224        }
2225
2226        /// Widen typed inputs to f32, run, then narrow each output back to its
2227        /// declared dtype (byte-identical with native-dtype backends).
2228        fn run_typed(
2229            &mut self,
2230            inputs: &[(&str, &[u8], rlx_ir::DType)],
2231        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
2232            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
2233            for (name, data, dt) in inputs {
2234                let v = if *dt == rlx_ir::DType::F32 {
2235                    let n = data.len() / 4;
2236                    unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) }.to_vec()
2237                } else {
2238                    super::widen_bytes_to_f32(data, *dt)
2239                };
2240                owned.push((name.to_string(), v));
2241            }
2242            let refs: Vec<(&str, &[f32])> = owned
2243                .iter()
2244                .map(|(n, d)| (n.as_str(), d.as_slice()))
2245                .collect();
2246            let dtypes = self.inner.output_dtypes();
2247            let outs = self.inner.run(&refs);
2248            outs.into_iter()
2249                .zip(
2250                    dtypes
2251                        .into_iter()
2252                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
2253                )
2254                .map(|(v, dt)| (super::narrow_f32_to_bytes(&v, dt), dt))
2255                .collect()
2256        }
2257
2258        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
2259            Box::new(OneApiExecutableWrapper {
2260                inner: self.inner.clone_for_cache(),
2261            })
2262        }
2263    }
2264}
2265
2266// ── MLX Backend ─────────────────────────────────────────────────────────
2267
2268#[cfg(all(feature = "mlx", rlx_mlx_host))]
2269pub mod mlx_backend {
2270    use super::*;
2271    use rlx_mlx::MlxExecutable;
2272
    /// Unit-struct entry point for the MLX backend; all behaviour lives in its
    /// `Backend` impl.
    pub struct MlxBackend;
2274
    /// PLAN L4: ops the MLX backend can lower today. MLX has the
    /// widest IR coverage of any GPU backend — handles everything
    /// including If/While via topo unrolling, and lowers
    /// ElementwiseRegion natively via the per-step composition in
    /// rlx-mlx/src/lower.rs (PLAN L2).
    ///
    /// `BatchNormInference` is intentionally omitted — lowered to primitives
    /// via [`LowerBatchNormInference`] before MLX lowering (no native MLX
    /// kernel).
    ///
    /// NOTE(review): this doc previously stated that `GroupNorm` is omitted as
    /// well (lowered via [`LowerGroupNorm`]), yet `GroupNorm` IS listed below.
    /// Confirm which is intended and drop either the entry or this note.
    const MLX_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
        use rlx_ir::OpKind::*;
        &[
            Input,
            Param,
            Constant,
            Activation,
            Cast,
            StopGradient,
            Binary,
            Compare,
            Where,
            ElementwiseRegion,
            TransformRegion,
            BatchElementwiseRegion,
            MatMul,
            DotGeneral,
            DenseSolve,
            BatchedDenseSolve,
            LayerNorm,
            LayerNorm2d,
            GroupNorm,
            ResizeNearest2x,
            RmsNorm,
            Attention,
            Rope,
            Reshape,
            Transpose,
            Narrow,
            Concat,
            Expand,
            Gather,
            Reverse,
            Reduce,
            Softmax,
            Cumsum,
            ArgMax,
            ArgMin,
            TopK,
            RngNormal,
            RngUniform,
            Sample,
            Conv,
            Im2Col,
            ConvTranspose2d,
            Pool,
            GroupedMatMul,
            DequantGroupedMatMul,
            DequantMoEWeights,
            ScatterAdd,
            LoraMatMul,
            DequantMatMul,
            SelectiveScan,
            GatedDeltaNet,
            FusedSwiGLU,
            FusedMatMulBiasAct,
            FusedResidualLN,
            FusedResidualRmsNorm,
            FusedAttentionBlock,
            FusedTransformerLayer,
            If,
            While,
            // Loop-unrolled scan (Op::Scan body is statically unrolled
            // `length` times into MLX ops; mirror of Op::While's
            // bounded-unroll lowering). ScanBackward is the AD
            // companion — handled the same way.
            Scan,
            ScanBackward,
            ScanBackwardXs,
            // Tier 1 autodiff backward ops — lowered as primitive
            // compositions in `rlx-mlx/src/lower.rs`.
            ReluBackward,
            ActivationBackward,
            SoftmaxCrossEntropy,
            SoftmaxCrossEntropyWithLogits,
            SoftmaxCrossEntropyBackward,
            AttentionBackward,
            LayerNormBackwardInput,
            LayerNormBackwardGamma,
            // GroupNorm backward — native MLX lowering in `lower.rs`
            // (group-reshape + reduce).
            GroupNormBackwardInput,
            GroupNormBackwardGamma,
            GroupNormBackwardBeta,
            // Tier 2 — conv backward via `mc::conv_general` with the
            // same parameter-mapping MLX uses inside its built-in vjp.
            // Currently groups=1 only; grouped conv backward will
            // surface as a clear error from `lower.rs`.
            Conv2dBackwardInput,
            Conv2dBackwardWeight,
            // Tier 3 — max-pool backward via slice-strided argmax over
            // pool windows + a per-kernel-slot scatter-add, matching
            // the CPU thunk's "first-hit-wins" tiebreaking.
            MaxPool2dBackward,
            // QAT — `FakeQuantize` (PerBatch + Fixed scale modes;
            // EMA returns a clear error from `lower.rs`) and the
            // `FakeQuantizeBackward` family covering all 4 STE
            // variants. Closes the last gap vs `CPU_SUPPORTED_OPS`.
            FakeQuantize,
            FakeQuantizeBackward,
            // User-registered custom ops dispatched through
            // `rlx_mlx::op_registry`. Lowering looks up the
            // registered `MlxKernel` and calls its `execute` method
            // to produce the lazy MLX `Array` for this node.
            Custom,
            // Op::Fft on MLX: native `mlx::fft::fft` via the rlx_mlx_op_fft
            // shim. 2N real-block f32/f64 and complex64 inputs supported.
            Fft,
            LogMel,
            LogMelBackward,
            WelchPeaks,
            GaussianSplatRender,
            GaussianSplatRenderBackward,
        ]
    };
2399
2400    impl Backend for MlxBackend {
2401        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
2402            MLX_SUPPORTED_OPS
2403        }
2404
2405        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
2406            let compile_result = crate::stages::compile_graph_stages_for_backend(
2407                rlx_driver::Device::Mlx,
2408                graph,
2409                options,
2410                MLX_SUPPORTED_OPS,
2411            );
2412            crate::stages::maybe_log_fusion(&compile_result.fusion);
2413            self.compile_lir(compile_result.lir, options)
2414        }
2415
2416        fn compile_lir(
2417            &self,
2418            lir: LirModule,
2419            options: &CompileOptions,
2420        ) -> Box<dyn ExecutableGraph> {
2421            use rlx_opt::pass::Pass as _;
2422            let mut graph = lir.into_graph();
2423            graph = rlx_opt::LowerControlFlow.run(graph);
2424            let graph = prepare_fused_graph(graph, options, MLX_SUPPORTED_OPS, "mlx");
2425            Box::new(build_mlx_executable(graph, options.rng))
2426        }
2427    }
2428
2429    fn build_mlx_executable(graph: Graph, rng: rlx_ir::RngOptions) -> MlxExecutableWrapper {
2430        let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
2431        let mode = mlx_mode_from_env();
2432        let mut exe = MlxExecutable::compile_from_fused_with_rng(graph, mode, rng);
2433        if mode == rlx_mlx::lower::MlxMode::Compiled {
2434            if let Err(e) = exe.warm_compile() {
2435                eprintln!(
2436                    "[rlx-runtime] MLX warm_compile failed ({e}); first run will pay the trace cost"
2437                );
2438            }
2439        }
2440        MlxExecutableWrapper {
2441            inner: exe,
2442            io_manifest,
2443        }
2444    }
2445
2446    fn mlx_mode_from_env() -> rlx_mlx::lower::MlxMode {
2447        match rlx_ir::env::var("RLX_MLX_MODE").as_deref() {
2448            Some(s) if s.eq_ignore_ascii_case("eager") => rlx_mlx::lower::MlxMode::Eager,
2449            Some(s) if s.eq_ignore_ascii_case("lazy") => rlx_mlx::lower::MlxMode::Lazy,
2450            Some(s) if s.eq_ignore_ascii_case("compiled") => rlx_mlx::lower::MlxMode::Compiled,
2451            _ => rlx_mlx::lower::MlxMode::Compiled,
2452        }
2453    }
2454
    /// Adapter that exposes an `MlxExecutable` through the runtime's
    /// `ExecutableGraph` trait.
    struct MlxExecutableWrapper {
        /// The wrapped MLX executable that the trait methods delegate to.
        inner: MlxExecutable,
        /// I/O dtype manifest returned by `prepare_f32_exec_graph` when the
        /// wrapper was built; carried along and cloned in `clone_box`.
        io_manifest: cpu_low_precision::IoDtypeManifest,
    }

    // NOTE(review): no SAFETY justification is recorded for this impl. It
    // presumably relies on the executable being exclusively owned by the
    // wrapper — confirm against `MlxExecutable` before relying on
    // cross-thread moves.
    unsafe impl Send for MlxExecutableWrapper {}
2461
2462    impl ExecutableGraph for MlxExecutableWrapper {
2463        fn set_param(&mut self, name: &str, data: &[f32]) {
2464            self.inner.set_param(name, data);
2465        }
2466        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
2467            self.inner.run(inputs)
2468        }
2469        fn run_read_outputs(
2470            &mut self,
2471            inputs: &[(&str, &[f32])],
2472            read_indices: Option<&[usize]>,
2473        ) -> Vec<Vec<f32>> {
2474            self.inner
2475                .run_read_outputs(inputs, read_indices)
2476                .unwrap_or_else(|e| panic!("MLX run_read_outputs failed: {e}"))
2477        }
2478        fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
2479            self.inner.run_slots(inputs)
2480        }
2481        fn arena_ptr(&self) -> *const u8 {
2482            self.inner.arena_ptr()
2483        }
2484        fn commit_no_wait(&mut self, inputs: &[(&str, &[f32])]) {
2485            self.inner.commit_no_wait(inputs);
2486        }
2487        fn sync_pending(&mut self) {
2488            self.inner.sync_pending();
2489        }
2490        fn run_pipelined(&mut self, input_sets: &[Vec<(&str, &[f32])>]) -> Vec<Vec<Vec<f32>>> {
2491            self.inner.run_pipelined(input_sets)
2492        }
2493        fn bind_handle(&mut self, name: &str, data: &[f32]) -> bool {
2494            self.inner.bind_handle(name, data)
2495        }
2496        fn read_handle(&self, name: &str) -> Option<Vec<f32>> {
2497            self.inner.read_handle(name)
2498        }
2499        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
2500            self.inner.bind_gpu_handle(name, data).is_ok()
2501        }
2502        fn has_gpu_handle(&self, name: &str) -> bool {
2503            self.inner.has_gpu_handle(name)
2504        }
2505        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
2506            self.inner.set_gpu_handle_feed(handle_name, output_index);
2507            true
2508        }
2509        fn register_kv_row_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
2510            self.inner.register_kv_row_feed(handle_name, output_index);
2511            true
2512        }
2513        fn feed_kv_row(&mut self, src_row: usize, dst_row: usize, row_elems: usize) -> bool {
2514            self.inner.feed_kv_row(src_row, dst_row, row_elems).is_ok()
2515        }
2516        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
2517            self.inner.read_gpu_handle(name).ok()
2518        }
2519        fn run_feed_gpu_handle(
2520            &mut self,
2521            inputs: &[(&str, &[f32])],
2522            handle_name: &str,
2523            output_index: usize,
2524        ) -> Option<Vec<f32>> {
2525            self.inner
2526                .run_feed_gpu(inputs, handle_name, output_index)
2527                .ok()
2528        }
2529        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
2530            self.inner.set_param_typed(name, data, dtype);
2531        }
2532        fn run_typed(
2533            &mut self,
2534            inputs: &[(&str, &[u8], rlx_ir::DType)],
2535        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
2536            self.inner.run_typed(inputs)
2537        }
2538        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
2539            self.inner.set_active_extent(extent);
2540        }
2541
2542        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
2543            self.inner.set_rng(rng);
2544        }
2545
2546        fn rng(&self) -> rlx_ir::RngOptions {
2547            self.inner.rng()
2548        }
2549
2550        fn copy_params_from(&mut self, src: &dyn ExecutableGraph) -> bool {
2551            let Some(src_any) = src.executable_as_any() else {
2552                return false;
2553            };
2554            let Some(src_wrap) = src_any.downcast_ref::<MlxExecutableWrapper>() else {
2555                return false;
2556            };
2557            let Some(dst_any) = self.executable_as_any_mut() else {
2558                return false;
2559            };
2560            let Some(dst_wrap) = dst_any.downcast_mut::<MlxExecutableWrapper>() else {
2561                return false;
2562            };
2563            dst_wrap.inner.copy_params_from(&src_wrap.inner)
2564        }
2565
2566        fn executable_as_any(&self) -> Option<&dyn std::any::Any> {
2567            Some(self)
2568        }
2569
2570        fn executable_as_any_mut(&mut self) -> Option<&mut dyn std::any::Any> {
2571            Some(self)
2572        }
2573
2574        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
2575            Box::new(MlxExecutableWrapper {
2576                inner: self.inner.clone_for_cache(),
2577                io_manifest: self.io_manifest.clone(),
2578            })
2579        }
2580    }
2581}
2582
/// Ops with a MIL lowering today (see `rlx_coreml::mil`).
///
/// Single source of truth, shared by `coreml_backend::CoremlBackend::
/// supported_ops` and `device_ext::supports(Device::Ane, ..)` so the two
/// never drift. Ungated so the support probe compiles on every target
/// (it's only *consulted* when the `coreml` backend is available).
pub(crate) const COREML_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
    use rlx_ir::OpKind::*;
    &[
        Input,
        Param,
        Constant,
        Activation,
        Cast,
        Binary,
        MatMul,
        LayerNorm,
        RmsNorm,
        Reduce,
        Softmax,
        Reshape,
        Transpose,
        Narrow,
        Concat,
        Gather,
        Rope,
        Attention,
        // Claimed first-class; `CoremlExecutable::compile_with_options`
        // decomposes it to the primitive chain (matmul → narrow → rope →
        // attention → matmul) since the MIL lowering has no fused-attention
        // op. The decompose targets FusedAttentionBlock only, so the native
        // LoraMatMul entry below is untouched.
        FusedAttentionBlock,
        Compare,
        Where,
        Expand,
        Cumsum,
        ScatterAdd,
        BatchNormInference,
        GroupNorm,
        LayerNorm2d,
        LoraMatMul,
        Conv,
        ConvTranspose2d,
        Pool,
        TopK,
        AxialRope2d,
        ResizeNearest2x,
        StopGradient,
        GroupedMatMul,
        DequantMatMul,
        DequantMoEWeights,
        DequantGroupedMatMul,
        Quantize,
        Dequantize,
        SelectiveScan,
        GatedDeltaNet,
        ArgMax,
        ArgMin,
        Reverse,
        Fft,
        LogMel,
        Sample,
        RngNormal,
        Lstm,
        // General Op::Scan (arbitrary-body recurrence, e.g. IIR biquad) runs on
        // the host between MIL segments via rlx-cpu's execute_scan_host.
        Scan,
        Gru,
        Rnn,
        Mamba2,
        WelchPeaks,
        Custom,
    ]
};
2657
/// Backward / training `OpKind`s the CoreML backend can run **via decomposition**
/// (`rlx_autodiff::decompose_backward_ops_except` in the legalize/rewrite pass):
/// each lowers to a chain of primitives that are all in [`COREML_SUPPORTED_OPS`].
///
/// Kept separate from `COREML_SUPPORTED_OPS` on purpose: these must NOT be in the
/// list handed to `legalize_or_rewrite_for_backend` (otherwise they'd be treated
/// as directly lowerable and skip the decompose, and the MIL lowering would choke
/// on a raw `*Backward` op). They feed only the *device-selection* probe
/// (`device_ext::coreml_supports`) so the runtime picks `Device::Ane` for a graph
/// that carries them. As native MIL backward kernels land (see `rlx_coreml::mil`),
/// the corresponding kind graduates into `COREML_SUPPORTED_OPS` and is removed
/// here. Only consulted under the `training` feature.
///
/// Excluded deliberately: `Conv2dBackwardWeight` (decomposes via `Im2Col`, which
/// has no MIL lowering yet — lands with the Phase-2 conv kernels) and the
/// conditional / domain backward ops (`ScanBackward*`, `LogMelBackward`,
/// `GaussianSplatRenderBackward`, `ComplexNormSqBackward`, `FakeQuantizeLSQ*`).
///
/// NOTE(review): `LayerNormBackwardInput`, `LayerNormBackwardGamma`,
/// `GroupNormBackwardInput`, `GroupNormBackwardGamma`, `GroupNormBackwardBeta`,
/// `AttentionBackward` and `SoftmaxCrossEntropyBackward` are listed here AND in
/// `COREML_NATIVE_BACKWARD_OPS`, which contradicts the "removed here once
/// native" rule above. Presumably harmless because this list only feeds the
/// device-selection probe — confirm against `device_ext::coreml_supports`
/// before pruning.
#[allow(dead_code)] // only read under `feature = "training"`.
pub(crate) const COREML_BACKWARD_OPS: &[rlx_ir::OpKind] = {
    use rlx_ir::OpKind::*;
    &[
        ReluBackward,
        ActivationBackward,
        LayerNormBackwardInput,
        LayerNormBackwardGamma,
        GroupNormBackwardInput,
        GroupNormBackwardGamma,
        GroupNormBackwardBeta,
        BatchNormInferenceBackwardInput,
        BatchNormInferenceBackwardGamma,
        BatchNormInferenceBackwardBeta,
        RopeBackward,
        AttentionBackward,
        SoftmaxCrossEntropyBackward,
        CumsumBackward,
        GatherBackward,
        FakeQuantizeBackward,
    ]
};
2697
/// Backward `OpKind`s the CoreML backend lowers through a **native MIL kernel**
/// (`rlx_coreml::mil`, gated by `rlx-coreml/training`) rather than decomposition.
/// Unlike [`COREML_BACKWARD_OPS`], these ARE added to the list handed to
/// `legalize_or_rewrite_for_backend` (under `training`), so the rewrite leaves
/// them intact for the lowering's dedicated arm.
///
/// These cases land here:
/// - **RMSNorm backward** — the dominant norm in modern transformers; a
///   hand-composed MIL kernel (implicit broadcasting, ~13 ops) beats the autodiff
///   decomposition (~27 ops of `Expand`-with-ones). `ReluBackward`/
///   `ActivationBackward` are NOT here: their decomposition is already `dy·f'(x)`
///   (~3 MIL ops), so a native kernel would emit identical MIL — they ride the
///   decompose route instead.
/// - **MaxPool2d backward** — MUST be native: the decomposition builds an
///   N²-sized dense scatter that blows RLX's size cap on any real CNN. The native
///   kernel is O(input) (reshape + reduce_max/min + select). Non-overlapping,
///   unpadded pooling only (the CNN-training norm, e.g. MNIST 2×2/2); other
///   configs return `Unsupported`.
/// - **Conv2d backward** (input via `conv_transpose`, weight via the transpose-conv
///   trick) — native because the autodiff input decomposition emits the wrong op.
/// - **Softmax-cross-entropy (forward `WithLogits` + backward)** — MUST be native
///   for LLM-scale training: the decompose builds the one-hot by concatenating C
///   class columns, O(C) graph nodes that explode at vocab size. MIL `one_hot` is a
///   single node. Both halves are listed so the loss op stays out of `bad` and the
///   shared `LowerSoftmaxCrossEntropy` pass never re-decomposes the backward.
///
/// NOTE(review): the list below also carries `LayerNormBackward{Input,Gamma}`,
/// `GroupNormBackward{Input,Gamma,Beta}` and `AttentionBackward`, which the
/// cases above do not describe, and which are still present in
/// `COREML_BACKWARD_OPS` as well. Confirm they have native arms in
/// `rlx_coreml::mil` and document the rationale here.
///
/// MUST stay in lock-step with the lowering arms in `rlx_coreml::mil` (a kind
/// here without an arm would skip decompose and hit the `Unsupported` fallback).
/// The norm arms mirror the autodiff decomposition's math — the RMSNorm input
/// cross-term is `inv_r³` (finite-difference-verified, the same formula every
/// backend now uses) — so ANE gradients stay consistent with the rest of the
/// training path.
#[allow(dead_code)] // only read under `feature = "training"`.
pub(crate) const COREML_NATIVE_BACKWARD_OPS: &[rlx_ir::OpKind] = {
    use rlx_ir::OpKind::*;
    &[
        RmsNormBackwardInput,
        RmsNormBackwardGamma,
        RmsNormBackwardBeta,
        LayerNormBackwardInput,
        LayerNormBackwardGamma,
        GroupNormBackwardInput,
        GroupNormBackwardGamma,
        GroupNormBackwardBeta,
        MaxPool2dBackward,
        Conv2dBackwardInput,
        Conv2dBackwardWeight,
        AttentionBackward,
        // The SCE training pair (integer-label loss + its gradient). Both native so
        // neither lands in `bad` — otherwise the shared `LowerSoftmaxCrossEntropy`
        // pass fires on the forward and re-decomposes the backward into the O(C)
        // one-hot concat. `SoftmaxCrossEntropyWithLogits` is a forward op but is
        // training-only, so it lives here rather than in the base inference set.
        SoftmaxCrossEntropyWithLogits,
        SoftmaxCrossEntropyBackward,
    ]
};
2755
/// Compile-time concatenation of `COREML_SUPPORTED_OPS` followed by
/// `COREML_NATIVE_BACKWARD_OPS` — the op claim `CoremlBackend::supported_ops`
/// returns under the `training` feature.
///
/// Why it exists: the fusion pipeline (`stages::compile_module_stages`, run by
/// the default `Backend::compile_module` *before* the backend's own lowering)
/// chooses what to decompose from `supported_ops()`. Unless the native backward
/// kernels are claimed *here*, the pipeline breaks them into primitives before
/// `rlx_coreml`'s dedicated arm can fire — which silently sent MaxPool2d
/// backward down the (rank-6, CoreML-illegal) upsample decomposition instead of
/// the native O(input) kernel.
#[cfg(feature = "training")]
pub(crate) const COREML_SUPPORTED_OPS_TRAINING: [rlx_ir::OpKind;
    COREML_SUPPORTED_OPS.len() + COREML_NATIVE_BACKWARD_OPS.len()] = {
    const BASE: usize = COREML_SUPPORTED_OPS.len();
    const TOTAL: usize = BASE + COREML_NATIVE_BACKWARD_OPS.len();

    // `Input` is only a placeholder; every slot is overwritten below.
    let mut merged = [rlx_ir::OpKind::Input; TOTAL];
    let mut slot = 0;
    while slot < TOTAL {
        merged[slot] = if slot < BASE {
            COREML_SUPPORTED_OPS[slot]
        } else {
            COREML_NATIVE_BACKWARD_OPS[slot - BASE]
        };
        slot += 1;
    }
    merged
};
2783
2784/// Apple CoreML / Neural Engine (ANE) backend wiring.
2785///
2786/// Unlike the GPU backends, CoreML compiles a *static* model with weights
2787/// baked in, so we hand the raw graph (Param nodes intact) straight to
2788/// `rlx_coreml` and skip the fusion/LIR arena pipeline. Weights arrive via
2789/// `set_param`; the `.mlpackage` is built + loaded on `finalize_params`
2790/// (or lazily on first `run`).
2791#[cfg(all(
2792    feature = "coreml",
2793    target_vendor = "apple",
2794    not(target_os = "watchos")
2795))]
2796pub mod coreml_backend {
2797    use super::*;
2798    use crate::Precision;
2799    use rlx_coreml::{CoremlExecutable, default_lower_options};
2800
    /// Unit-struct entry point for the CoreML backend; all behaviour lives in
    /// its `Backend` impl.
    pub struct CoremlBackend;
2802
2803    impl Backend for CoremlBackend {
2804        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
2805            // Under `training` the claim includes the native backward kernels so
2806            // the fusion pipeline preserves them for the dedicated MIL arm instead
2807            // of decomposing them first (decompose-route backward ops stay out, so
2808            // the pipeline still turns those into primitives). See
2809            // `COREML_SUPPORTED_OPS_TRAINING`.
2810            #[cfg(feature = "training")]
2811            {
2812                &super::COREML_SUPPORTED_OPS_TRAINING
2813            }
2814            #[cfg(not(feature = "training"))]
2815            {
2816                super::COREML_SUPPORTED_OPS
2817            }
2818        }
2819
2820        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
2821            // `supported_ops()` already includes the native backward kernels under
2822            // `training`, so the rewrite keeps them intact for their MIL arm while
2823            // decomposing the decompose-route backward ops to primitives.
2824            let graph = rlx_opt::legalize_or_rewrite_for_backend(graph, self.supported_ops())
2825                .unwrap_or_else(|errors| {
2826                    panic!("{}", rlx_opt::format_legalize_error("coreml", &errors));
2827                });
2828            // Automatic Floating Point: apply the mixed-precision policy if set.
2829            // AMP keeps boundaries (input/param/const/output) in F32 and casts at
2830            // f16 boundaries, so CoreML lowers a valid mixed f16/f32 ML Program
2831            // (per-node dtypes; consts stay F32 → no storage/type mismatch). This
2832            // is the precise way to get f16 compute — unlike a blanket
2833            // `float_dtype = F16` flip, which mis-sizes float consts.
2834            let (graph, mut lower_opts) = match options.policy.clone() {
2835                Some(policy) => {
2836                    use rlx_opt::pass::Pass as _;
2837                    let g = rlx_opt::AutoMixedPrecision::new(policy).run(graph);
2838                    let opts = default_lower_options(&g);
2839                    (g, opts)
2840                }
2841                None => {
2842                    let mut opts = default_lower_options(&graph);
2843                    // Legacy blanket-F16 path (no AMP policy): kept for inference.
2844                    if options.precision == Precision::F16 {
2845                        opts.float_dtype = rlx_ir::DType::F16;
2846                    }
2847                    (graph, opts)
2848                }
2849            };
2850            if let Some(binding) = &options.dim_binding {
2851                let _ = binding;
2852                lower_opts.flexible_inputs = false;
2853            }
2854            Box::new(CoremlExecutableWrapper {
2855                inner: CoremlExecutable::compile_with_lower_opts(graph, lower_opts),
2856            })
2857        }
2858
2859        fn compile_lir(
2860            &self,
2861            lir: LirModule,
2862            options: &CompileOptions,
2863        ) -> Box<dyn ExecutableGraph> {
2864            // No LIR arena path for CoreML; reconstruct the graph and go
2865            // through the normal compile.
2866            self.compile(lir.into_graph(), options)
2867        }
2868    }
2869
    /// Adapter that exposes a `CoremlExecutable` through the runtime's
    /// `ExecutableGraph` trait.
    struct CoremlExecutableWrapper {
        /// The wrapped CoreML executable that the trait methods delegate to.
        inner: CoremlExecutable,
    }

    // SAFETY (as stated by the original author): the loaded MLModel is owned
    // exclusively and accessed via &mut self.
    // NOTE(review): confirm `CoremlExecutable` holds no thread-affine state.
    unsafe impl Send for CoremlExecutableWrapper {}
2876
2877    impl ExecutableGraph for CoremlExecutableWrapper {
2878        fn set_param(&mut self, name: &str, data: &[f32]) {
2879            self.inner.set_param(name, data);
2880        }
2881
2882        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
2883            // GGUF-quantized weights arrive as raw bytes; the lowering
2884            // host-dequantizes them when baking the model.
2885            self.inner.set_param_typed(name, data, dtype);
2886        }
2887
2888        fn finalize_params(&mut self) {
2889            self.inner
2890                .finalize()
2891                .unwrap_or_else(|e| panic!("CoreML finalize failed: {e}"));
2892        }
2893
2894        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
2895            self.inner
2896                .run(inputs)
2897                .unwrap_or_else(|e| panic!("CoreML run failed: {e}"))
2898        }
2899
2900        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
2901            Box::new(CoremlExecutableWrapper {
2902                inner: self.inner.clone_for_cache(),
2903            })
2904        }
2905
2906        fn run_typed(
2907            &mut self,
2908            inputs: &[(&str, &[u8], rlx_ir::DType)],
2909        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
2910            use rlx_ir::DType;
2911            // The CoreML model is promoted to an f32 flow (`promote_int_to_f32`), so
2912            // widen integer/bool inputs (e.g. `phone_ids`) to f32 on the host surface.
2913            let owned: Vec<(String, Vec<f32>)> = inputs
2914                .iter()
2915                .map(|(name, data, dt)| {
2916                    let v: Vec<f32> = match dt {
2917                        DType::I64 => data
2918                            .chunks_exact(8)
2919                            .map(|c| i64::from_le_bytes(c.try_into().unwrap()) as f32)
2920                            .collect(),
2921                        DType::I32 => data
2922                            .chunks_exact(4)
2923                            .map(|c| i32::from_le_bytes(c.try_into().unwrap()) as f32)
2924                            .collect(),
2925                        DType::U8 | DType::Bool => data.iter().map(|&b| b as f32).collect(),
2926                        _ => super::widen_bytes_to_f32(data, *dt),
2927                    };
2928                    (name.to_string(), v)
2929                })
2930                .collect();
2931            let refs: Vec<(&str, &[f32])> = owned
2932                .iter()
2933                .map(|(n, d)| (n.as_str(), d.as_slice()))
2934                .collect();
2935            self.run(&refs)
2936                .into_iter()
2937                .map(|v| {
2938                    let bytes: Vec<u8> = v.iter().flat_map(|f| f.to_le_bytes()).collect();
2939                    (bytes, DType::F32)
2940                })
2941                .collect()
2942        }
2943    }
2944}
2945
2946#[cfg(all(feature = "metal", target_vendor = "apple", not(target_os = "watchos")))]
2947pub mod metal_backend {
2948    use super::*;
2949    use rlx_metal::backend::MetalExecutable;
2950
    /// Unit struct naming the Metal backend.
    // NOTE(review): its `Backend` impl is presumably further down in this
    // module — not visible from this part of the file.
    pub struct MetalBackend;
2952
    /// PLAN L4: ops the Metal backend can lower today. Includes
    /// DotGeneral (LowerDotGeneral pass) and ElementwiseRegion
    /// (decomposed by UnfuseElementwiseRegions). Excludes
    /// LoraMatMul, FusedTransformerLayer, If, While —
    /// not yet wired in `rlx-metal/src/thunk.rs`'s compile_thunks.
    /// NOTE(review): this comment used to list SelectiveScan and Sample
    /// as excluded too, yet both are claimed in the list below — confirm
    /// that they are really wired before relying on either statement.
    /// `FusedAttentionBlock` IS claimed (so it legalizes / the fusion
    /// pipeline may emit it); `MetalExecutable::compile_inner` decomposes
    /// it to the primitive chain — there is no monolithic fused-attention
    /// MSL kernel yet.
    /// DequantMatMul (GGUF K-quants) lowers to a GPU dequant kernel
    /// + MPS matmul; legacy Int8 schemes remain CPU-only.
    const METAL_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
        use rlx_ir::OpKind::*;
        &[
            Input,
            Param,
            Constant,
            Activation,
            Cast,
            StopGradient,
            Binary,
            Compare,
            Where,
            Fma,
            ElementwiseRegion,
            TransformRegion,
            BatchElementwiseRegion,
            MatMul,
            ScaledMatMul,
            ScaledQuantize,
            ScaledQuantScale,
            ScaledDequantize,
            DotGeneral,
            LayerNorm,
            LayerNorm2d,
            GroupNorm,
            RmsNorm,
            ResizeNearest2x,
            AxialRope2d,
            Attention,
            AttentionBackward,
            RmsNormBackwardInput,
            RmsNormBackwardGamma,
            RmsNormBackwardBeta,
            RopeBackward,
            Cumsum,
            CumsumBackward,
            GatherBackward,
            Conv2dBackwardInput,
            Conv2dBackwardWeight,
            MaxPool2dBackward,
            Rope,
            Reshape,
            Transpose,
            Narrow,
            Concat,
            Expand,
            Gather,
            Reverse,
            Reduce,
            Softmax,
            SoftmaxCrossEntropy,
            SoftmaxCrossEntropyWithLogits,
            SoftmaxCrossEntropyBackward,
            ArgMax,
            ArgMin,
            TopK,
            Sample,
            RngNormal,
            RngUniform,
            Conv,
            Im2Col,
            ConvTranspose2d,
            Pool,
            GroupedMatMul,
            DequantGroupedMatMul,
            DequantMoEWeights,
            ScatterAdd,
            DequantMatMul,
            GatedDeltaNet,
            SelectiveScan,
            Lstm,
            Gru,
            Rnn,
            Mamba2,
            FusedSwiGLU,
            FusedMatMulBiasAct,
            FusedResidualLN,
            FusedResidualRmsNorm,
            // Claimed so the Metal fusion pipeline may emit it;
            // `MetalExecutable::compile_inner` decomposes it back to the
            // primitive chain (no monolithic fused-attention MSL kernel
            // yet — the per-run cost is dominated by wait_until_completed,
            // not encode, so a dispatch-wrapper fusion buys nothing).
            FusedAttentionBlock,
            // User-registered custom ops dispatched through
            // `rlx_metal::op_registry`. Lowering panics with a clear
            // message if the named MetalKernel isn't registered;
            // executor inserts a sync point + runs the host kernel
            // against the unified-memory arena.
            Custom,
            // Op::Fft is supported via the same host-fallback pattern
            // as Custom: sync the GPU, run rlx-cpu's FFT against the
            // unified-memory arena, restart cmd_buf. A native Metal
            // compute kernel will replace this when a workload makes
            // the sync the bottleneck.
            Fft,
            // Op::Scan (arbitrary-body recurrence) via the same host
            // fallback: compile the body once, loop it on the CPU against
            // the unified-memory arena. Enables IIR (`biquad`/`sosfilt`).
            Scan,
            LogMel,
            LogMelBackward,
            WelchPeaks,
            // Host-fallback splat (unified-memory arena + rlx-cpu/splat).
            GaussianSplatRender,
            GaussianSplatRenderBackward,
            GaussianSplatPrepare,
            GaussianSplatRasterize,
        ]
    };
3076
3077    impl Backend for MetalBackend {
3078        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
3079            METAL_SUPPORTED_OPS
3080        }
3081
3082        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
3083            use rlx_opt::pass::Pass as _;
3084            // Same If/While → primitive rewrite as the CPU pipeline
3085            // (Metal also has no native sub-graph executor wired
3086            // through its thunk schedule).
3087            let graph = rlx_opt::LowerControlFlow.run(graph);
3088            let dispatch = options.kernel_dispatch;
3089            let graph = rlx_opt::legalize_or_rewrite_for_backend_with_config(
3090                graph,
3091                METAL_SUPPORTED_OPS,
3092                dispatch,
3093            )
3094            .unwrap_or_else(|errors| {
3095                panic!("{}", rlx_opt::format_legalize_error("metal", &errors));
3096            });
3097            let graph = crate::precompile::precompile_cleanup(graph, options);
3098
3099            // Hand the policy to MetalExecutable so the rewrite runs AFTER
3100            // its internal fusion passes (avoids breaking pattern matchers).
3101            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
3102            Box::new(MetalExecutableWrapper {
3103                inner: MetalExecutable::compile_with_policy(
3104                    graph,
3105                    options.policy.clone(),
3106                    Some(METAL_SUPPORTED_OPS),
3107                    options.rng,
3108                ),
3109                io_manifest,
3110            })
3111        }
3112
3113        fn compile_lir(
3114            &self,
3115            lir: LirModule,
3116            options: &CompileOptions,
3117        ) -> Box<dyn ExecutableGraph> {
3118            use rlx_opt::pass::Pass as _;
3119            let mut graph = lir.into_graph();
3120            graph = rlx_opt::LowerControlFlow.run(graph);
3121            let dispatch = options.kernel_dispatch;
3122            let mut graph = rlx_opt::legalize_or_rewrite_for_backend_with_config(
3123                graph,
3124                METAL_SUPPORTED_OPS,
3125                dispatch,
3126            )
3127            .unwrap_or_else(|errors| {
3128                panic!("{}", rlx_opt::format_legalize_error("metal", &errors));
3129            });
3130            graph = crate::precompile::precompile_cleanup(graph, options);
3131            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
3132            Box::new(MetalExecutableWrapper {
3133                inner: MetalExecutable::compile_from_fused(
3134                    graph,
3135                    options.policy.clone(),
3136                    Some(METAL_SUPPORTED_OPS),
3137                    options.rng,
3138                ),
3139                io_manifest,
3140            })
3141        }
3142    }
3143
    /// Adapter exposing a compiled `MetalExecutable` through the runtime's
    /// `ExecutableGraph` trait.
    struct MetalExecutableWrapper {
        /// The compiled Metal executable that the trait methods forward to.
        inner: MetalExecutable,
        /// I/O dtype manifest returned by
        /// `cpu_low_precision::prepare_f32_exec_graph` at compile time.
        io_manifest: cpu_low_precision::IoDtypeManifest,
    }

    // SAFETY: NOTE(review) — no justification is recorded here for why a
    // `MetalExecutable` may be moved to another thread; confirm against
    // `rlx_metal` and document the invariant that makes this sound.
    unsafe impl Send for MetalExecutableWrapper {}
3150
3151    impl ExecutableGraph for MetalExecutableWrapper {
3152        fn set_param(&mut self, name: &str, data: &[f32]) {
3153            self.inner.set_param(name, data);
3154        }
3155
3156        fn finalize_params(&mut self) {
3157            self.inner.preload_qmatmul_weights();
3158        }
3159
3160        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
3161            self.inner.run(inputs)
3162        }
3163        fn run_read_outputs(
3164            &mut self,
3165            inputs: &[(&str, &[f32])],
3166            read_indices: Option<&[usize]>,
3167        ) -> Vec<Vec<f32>> {
3168            self.inner.run_read_outputs(inputs, read_indices)
3169        }
3170        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
3171            self.inner.bind_gpu_handle(name, data)
3172        }
3173        fn has_gpu_handle(&self, name: &str) -> bool {
3174            self.inner.has_gpu_handle(name)
3175        }
3176        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
3177            self.inner.set_gpu_handle_feed(handle_name, output_index);
3178            true
3179        }
3180        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
3181            self.inner.read_gpu_handle(name)
3182        }
3183        fn register_kv_row_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
3184            self.inner.register_kv_row_feed(handle_name, output_index);
3185            true
3186        }
3187        fn feed_kv_row(&mut self, src_row: usize, dst_row: usize, row_elems: usize) -> bool {
3188            self.inner.feed_kv_row(src_row, dst_row, row_elems);
3189            true
3190        }
3191        fn read_output_row(
3192            &self,
3193            out_idx: usize,
3194            row: usize,
3195            row_inner: usize,
3196        ) -> Option<Vec<f32>> {
3197            Some(self.inner.read_graph_output_row(out_idx, row, row_inner))
3198        }
3199        fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
3200            self.inner.run_slots(inputs)
3201        }
3202        fn arena_ptr(&self) -> *const u8 {
3203            self.inner.arena_ptr()
3204        }
3205        fn commit_no_wait(&mut self, inputs: &[(&str, &[f32])]) {
3206            self.inner.commit_no_wait(inputs);
3207        }
3208        fn sync_pending(&mut self) {
3209            self.inner.sync_pending();
3210        }
3211        fn run_pipelined(&mut self, input_sets: &[Vec<(&str, &[f32])>]) -> Vec<Vec<Vec<f32>>> {
3212            self.inner.run_pipelined(input_sets)
3213        }
3214        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
3215            self.inner.set_active_extent(extent);
3216        }
3217
3218        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
3219            self.inner.set_rng(rng);
3220        }
3221
3222        fn rng(&self) -> rlx_ir::RngOptions {
3223            self.inner.rng()
3224        }
3225
3226        /// Typed param upload — accepts F16/BF16 host bytes by widening
3227        /// to F32 first, then routing through `set_param`. The Metal
3228        /// arena's `write_from_f32` honors per-node F16 storage when
3229        /// AutoMixedPrecision rewrote the param. U8/I8 packed weights
3230        /// copy directly into the arena for `Op::DequantMatMul`.
3231        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
3232            if matches!(
3233                dtype,
3234                rlx_ir::DType::U8
3235                    | rlx_ir::DType::I8
3236                    | rlx_ir::DType::I32
3237                    | rlx_ir::DType::I64
3238                    | rlx_ir::DType::U32
3239                    | rlx_ir::DType::F64
3240            ) {
3241                self.inner.set_param_bytes(name, data);
3242                return;
3243            }
3244            if dtype == rlx_ir::DType::F32 {
3245                let n = data.len() / 4;
3246                let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
3247                self.inner.set_param(name, s);
3248            } else {
3249                let f32_buf = super::widen_bytes_to_f32(data, dtype);
3250                self.inner.set_param(name, &f32_buf);
3251            }
3252        }
3253
3254        /// Typed run. Integer inputs (I64 token ids, etc.) are copied
3255        /// directly into the unified-memory arena; F32/F16/BF16 widen
3256        /// through the existing host path. Outputs use native arena bytes.
3257        fn run_typed(
3258            &mut self,
3259            inputs: &[(&str, &[u8], rlx_ir::DType)],
3260        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
3261            self.inner.run_typed(inputs)
3262        }
3263
3264        fn copy_params_from(&mut self, src: &dyn ExecutableGraph) -> bool {
3265            let Some(src_any) = src.executable_as_any() else {
3266                return false;
3267            };
3268            let Some(src_wrap) = src_any.downcast_ref::<MetalExecutableWrapper>() else {
3269                return false;
3270            };
3271            let Some(dst_any) = self.executable_as_any_mut() else {
3272                return false;
3273            };
3274            let Some(dst_wrap) = dst_any.downcast_mut::<MetalExecutableWrapper>() else {
3275                return false;
3276            };
3277            dst_wrap.inner.copy_params_from(&src_wrap.inner)
3278        }
3279
3280        fn executable_as_any(&self) -> Option<&dyn std::any::Any> {
3281            Some(self)
3282        }
3283
3284        fn executable_as_any_mut(&mut self) -> Option<&mut dyn std::any::Any> {
3285            Some(self)
3286        }
3287
3288        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
3289            Box::new(MetalExecutableWrapper {
3290                inner: self.inner.clone_for_cache(),
3291                io_manifest: self.io_manifest.clone(),
3292            })
3293        }
3294    }
3295}
3296
3297// ── CUDA Backend ────────────────────────────────────────────────────────
3298
3299#[cfg(feature = "cuda")]
3300pub mod cuda_backend {
3301    use super::*;
3302    use rlx_cuda::backend::CudaExecutable;
3303
    /// CUDA backend handle. Its `Backend` impl below compiles graphs into
    /// `CudaExecutable`-backed executables.
    pub struct CudaBackend;
3305
    /// PLAN L4: ops the CUDA backend can lower today.
    ///
    /// Not claimed: FusedSwiGLU, LoraMatMul, FusedTransformerLayer (no
    /// kernel) and If, While (no executor wiring).
    ///
    /// `FusedAttentionBlock` IS claimed: the `FuseAttentionBlock` pass
    /// fires, then `CudaExecutable`'s own `unfuse` decomposes it back to
    /// the primitive chain (matmul → narrow → rope → attention → matmul) —
    /// same fuse-then-unfuse the WGPU backend uses.
    ///
    /// DotGeneral is handled via LowerDotGeneral; ElementwiseRegion is
    /// lowered natively by an NVRTC interpreted-chain kernel.
    const CUDA_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
        use rlx_ir::OpKind::*;
        &[
            Input,
            Param,
            Constant,
            Activation,
            Cast,
            StopGradient,
            Binary,
            Compare,
            Where,
            ElementwiseRegion,
            TransformRegion,
            BatchElementwiseRegion,
            MatMul,
            ScaledMatMul,
            ScaledQuantize,
            ScaledQuantScale,
            ScaledDequantize,
            DotGeneral,
            LayerNorm,
            LayerNorm2d,
            GroupNorm,
            ResizeNearest2x,
            AxialRope2d,
            Reverse,
            ArgMax,
            ArgMin,
            RmsNorm,
            Attention,
            AttentionBackward,
            RmsNormBackwardInput,
            RmsNormBackwardGamma,
            RmsNormBackwardBeta,
            RopeBackward,
            CumsumBackward,
            GatherBackward,
            Conv2dBackwardInput,
            Conv2dBackwardWeight,
            MaxPool2dBackward,
            Rope,
            Reshape,
            Transpose,
            Narrow,
            Concat,
            Expand,
            Gather,
            Reduce,
            Softmax,
            Cumsum,
            TopK,
            Sample,
            Conv,
            ConvTranspose2d,
            Pool,
            GroupedMatMul,
            DequantGroupedMatMul,
            DequantMoEWeights,
            ScatterAdd,
            DequantMatMul,
            SelectiveScan,
            Lstm,
            // General Op::Scan (arbitrary-body recurrence, e.g. IIR biquad) via
            // D2H→CPU→H2D host fallback (forces eager, not graph-captured).
            Scan,
            FusedMatMulBiasAct,
            FusedResidualLN,
            FusedResidualRmsNorm,
            // Fused, then decomposed by the backend's own `unfuse` pass
            // (rlx-cuda / rlx-rocm) before lowering — no monolithic
            // fused-attention kernel yet, same fuse-then-unfuse as WGPU.
            FusedAttentionBlock,
            GaussianSplatRender,
            GaussianSplatRenderBackward,
            GaussianSplatPrepare,
            GaussianSplatRasterize,
            Custom,
            Fft,
            LogMel,
            LogMelBackward,
            WelchPeaks,
            Im2Col,
            RngNormal,
            RngUniform,
        ]
    };
3401
3402    impl Backend for CudaBackend {
3403        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
3404            CUDA_SUPPORTED_OPS
3405        }
3406
3407        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
3408            use rlx_opt::pass::Pass as _;
3409            // Decompose FusedSwiGLU / FAB / etc. before legalization (CudaExecutable
3410            // unfuses again; this pass is idempotent).
3411            let graph = rlx_cuda::unfuse::unfuse(graph);
3412            let graph = rlx_opt::legalize_or_rewrite_for_backend(graph, CUDA_SUPPORTED_OPS)
3413                .unwrap_or_else(|errors| {
3414                    panic!("{}", rlx_opt::format_legalize_error("cuda", &errors));
3415                });
3416            let graph = crate::precompile::precompile_cleanup(graph, options);
3417            // Mid-axis broadcasts (EEG patch embed) before elementwise fusion.
3418            let graph = rlx_opt::LegalizeBroadcast.run(graph);
3419            // Backend-aware fusion via the shared compile pipeline.
3420            let compile_result = crate::stages::compile_graph_stages_for_backend(
3421                rlx_driver::Device::Cuda,
3422                graph,
3423                options,
3424                CUDA_SUPPORTED_OPS,
3425            );
3426            crate::stages::maybe_log_fusion(&compile_result.fusion);
3427            let graph = compile_result.lir.into_graph();
3428            let graph = match options.policy.clone() {
3429                Some(p) => rlx_opt::AutoMixedPrecision::new(p).run(graph),
3430                None => graph,
3431            };
3432            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
3433            Box::new(CudaExecutableWrapper {
3434                inner: CudaExecutable::compile_rng(graph, options.rng),
3435                io_manifest,
3436            })
3437        }
3438
3439        fn compile_lir(
3440            &self,
3441            lir: LirModule,
3442            options: &CompileOptions,
3443        ) -> Box<dyn ExecutableGraph> {
3444            use rlx_opt::pass::Pass as _;
3445            let graph = rlx_opt::LegalizeBroadcast.run(lir.into_graph());
3446            let (graph, io_manifest) =
3447                cpu_low_precision::prepare_f32_exec_graph(prepare_fused_graph(
3448                    rlx_cuda::unfuse::unfuse(graph),
3449                    options,
3450                    CUDA_SUPPORTED_OPS,
3451                    "cuda",
3452                ));
3453            Box::new(CudaExecutableWrapper {
3454                inner: CudaExecutable::compile_rng(graph, options.rng),
3455                io_manifest,
3456            })
3457        }
3458    }
3459
    /// Adapter exposing a compiled `CudaExecutable` through the runtime's
    /// `ExecutableGraph` trait.
    struct CudaExecutableWrapper {
        /// The compiled CUDA executable that the trait methods forward to.
        inner: CudaExecutable,
        /// I/O dtype manifest returned by
        /// `cpu_low_precision::prepare_f32_exec_graph` at compile time;
        /// `run_typed` passes it to `declared_output_dtypes`.
        io_manifest: cpu_low_precision::IoDtypeManifest,
    }

    // SAFETY: CudaExecutable owns CudaContext + CudaSlice handles; cudarc
    // claims they're Send (CudaContext is Arc-wrapped, CudaSlice is
    // logically a device pointer + length). The Backend trait requires
    // Send for the executable; we honor that here.
    unsafe impl Send for CudaExecutableWrapper {}
3470
3471    impl ExecutableGraph for CudaExecutableWrapper {
3472        fn set_param(&mut self, name: &str, data: &[f32]) {
3473            self.inner.set_param(name, data);
3474        }
3475        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
3476            self.inner.run(inputs)
3477        }
3478        fn run_read_outputs(
3479            &mut self,
3480            inputs: &[(&str, &[f32])],
3481            read_indices: Option<&[usize]>,
3482        ) -> Vec<Vec<f32>> {
3483            self.inner.run_read_outputs(inputs, read_indices)
3484        }
3485        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
3486            self.inner.bind_gpu_handle(name, data)
3487        }
3488        fn has_gpu_handle(&self, name: &str) -> bool {
3489            self.inner.has_gpu_handle(name)
3490        }
3491        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
3492            self.inner.set_gpu_handle_feed(handle_name, output_index);
3493            true
3494        }
3495        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
3496            self.inner.read_gpu_handle(name)
3497        }
3498        fn register_kv_row_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
3499            self.inner.register_kv_row_feed(handle_name, output_index);
3500            true
3501        }
3502        fn feed_kv_row(&mut self, src_row: usize, dst_row: usize, row_elems: usize) -> bool {
3503            self.inner.feed_kv_row(src_row, dst_row, row_elems);
3504            true
3505        }
3506        fn read_output_row(
3507            &self,
3508            out_idx: usize,
3509            row: usize,
3510            row_inner: usize,
3511        ) -> Option<Vec<f32>> {
3512            self.inner.read_output_row(out_idx, row, row_inner)
3513        }
3514        fn read_gpu_handle_row(
3515            &self,
3516            name: &str,
3517            row: usize,
3518            row_inner: usize,
3519        ) -> Option<Vec<f32>> {
3520            self.inner.read_gpu_handle_row(name, row, row_inner)
3521        }
3522        fn prepare_resident_gpu_handle(&mut self, name: &str) -> bool {
3523            self.inner.prepare_resident_gpu_handle(name)
3524        }
3525        fn stage_bound_gpu_handles_to_arena(&mut self) {
3526            self.inner.stage_bound_gpu_handles_to_arena();
3527        }
3528        fn seed_resident_kv_prefix_from(
3529            &mut self,
3530            src: &dyn ExecutableGraph,
3531            prefix_tokens: usize,
3532            outgoing_upper: usize,
3533            kv_dim: usize,
3534            n_layers: usize,
3535        ) -> bool {
3536            let Some(dst_exe) = self.cuda_executable_for_kv_seed() else {
3537                return false;
3538            };
3539            let Some(src_exe) = src.cuda_executable_for_kv_seed_ref() else {
3540                return false;
3541            };
3542            dst_exe.seed_resident_kv_prefix_from(
3543                src_exe,
3544                prefix_tokens,
3545                outgoing_upper,
3546                kv_dim,
3547                n_layers,
3548            )
3549        }
3550        fn copy_resident_kv_rows_from(
3551            &mut self,
3552            src: &dyn ExecutableGraph,
3553            from_row: usize,
3554            to_row: usize,
3555            outgoing_upper: usize,
3556            kv_dim: usize,
3557            n_layers: usize,
3558        ) -> bool {
3559            let Some(dst_exe) = self.cuda_executable_for_kv_seed() else {
3560                return false;
3561            };
3562            let Some(src_exe) = src.cuda_executable_for_kv_seed_ref() else {
3563                return false;
3564            };
3565            dst_exe.copy_resident_kv_rows_from(
3566                src_exe,
3567                from_row,
3568                to_row,
3569                outgoing_upper,
3570                kv_dim,
3571                n_layers,
3572            )
3573        }
3574        fn cuda_executable_for_kv_seed(
3575            &mut self,
3576        ) -> Option<&mut rlx_cuda::backend::CudaExecutable> {
3577            Some(&mut self.inner)
3578        }
3579        fn cuda_executable_for_kv_seed_ref(&self) -> Option<&rlx_cuda::backend::CudaExecutable> {
3580            Some(&self.inner)
3581        }
3582        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
3583            self.inner.set_active_extent(extent);
3584        }
3585
3586        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
3587            self.inner.set_rng(rng);
3588        }
3589
3590        fn rng(&self) -> rlx_ir::RngOptions {
3591            self.inner.rng()
3592        }
3593
3594        fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
3595            self.inner.run_slots(inputs)
3596        }
3597
3598        fn arena_ptr(&self) -> *const u8 {
3599            self.inner.arena_ptr()
3600        }
3601
3602        /// Typed param upload — widens F16/BF16 host bytes to f32
3603        /// before routing through `set_param`. CUDA's arena is
3604        /// f32-uniform; the half-precision matmul tier opts in via
3605        /// the separate `set_param_half` API.
3606        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
3607            if matches!(dtype, rlx_ir::DType::U8 | rlx_ir::DType::I8) {
3608                self.inner.set_param_bytes(name, data);
3609                return;
3610            }
3611            if dtype == rlx_ir::DType::F32 {
3612                let n = data.len() / 4;
3613                let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
3614                self.inner.set_param(name, s);
3615            } else {
3616                let f32_buf = super::widen_bytes_to_f32(data, dtype);
3617                self.inner.set_param(name, &f32_buf);
3618            }
3619        }
3620
3621        /// Typed run — widen each typed input to F32, run, then narrow
3622        /// each output back to its declared graph dtype.
3623        fn run_typed(
3624            &mut self,
3625            inputs: &[(&str, &[u8], rlx_ir::DType)],
3626        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
3627            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
3628            for (name, data, dt) in inputs {
3629                let v = super::widen_bytes_to_f32(data, *dt);
3630                owned.push((name.to_string(), v));
3631            }
3632            let refs: Vec<(&str, &[f32])> = owned
3633                .iter()
3634                .map(|(n, d)| (n.as_str(), d.as_slice()))
3635                .collect();
3636            let dtypes =
3637                super::declared_output_dtypes(&self.io_manifest, self.inner.output_dtypes());
3638            let outs = self.inner.run(&refs);
3639            outs.into_iter()
3640                .zip(
3641                    dtypes
3642                        .into_iter()
3643                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
3644                )
3645                .map(|(v, dt)| (super::narrow_f32_to_bytes(&v, dt), dt))
3646                .collect()
3647        }
3648
3649        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
3650            Box::new(CudaExecutableWrapper {
3651                inner: self.inner.clone_for_cache(),
3652                io_manifest: self.io_manifest.clone(),
3653            })
3654        }
3655    }
3656}
3657
3658// ── ROCm Backend ────────────────────────────────────────────────────────
3659
3660#[cfg(feature = "rocm")]
3661pub mod rocm_backend {
3662    use super::*;
3663    use rlx_rocm::backend::RocmExecutable;
3664
    /// ROCm backend handle. Its `Backend` impl below compiles graphs into
    /// `RocmExecutable`-backed executables.
    pub struct RocmBackend;
3666
    /// PLAN L4: ROCm is the sister crate of CUDA, with an identical Step
    /// enum + dispatch shape. The claimed op set mirrors
    /// `CUDA_SUPPORTED_OPS` except that Conv2dBackwardInput,
    /// Conv2dBackwardWeight and MaxPool2dBackward are not listed here.
    /// NOTE(review): confirm whether omitting those three is intentional
    /// or the list simply lags behind the CUDA one.
    const ROCM_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
        use rlx_ir::OpKind::*;
        &[
            Input,
            Param,
            Constant,
            Activation,
            Cast,
            StopGradient,
            Binary,
            Compare,
            Where,
            ElementwiseRegion,
            TransformRegion,
            BatchElementwiseRegion,
            MatMul,
            ScaledMatMul,
            ScaledQuantize,
            ScaledQuantScale,
            ScaledDequantize,
            DotGeneral,
            LayerNorm,
            LayerNorm2d,
            GroupNorm,
            ResizeNearest2x,
            AxialRope2d,
            Reverse,
            ArgMax,
            ArgMin,
            RmsNorm,
            Attention,
            AttentionBackward,
            RmsNormBackwardInput,
            RmsNormBackwardGamma,
            RmsNormBackwardBeta,
            RopeBackward,
            CumsumBackward,
            GatherBackward,
            Rope,
            Reshape,
            Transpose,
            Narrow,
            Concat,
            Expand,
            Gather,
            Reduce,
            Softmax,
            Cumsum,
            TopK,
            Sample,
            Conv,
            ConvTranspose2d,
            Pool,
            GroupedMatMul,
            DequantGroupedMatMul,
            DequantMoEWeights,
            ScatterAdd,
            DequantMatMul,
            SelectiveScan,
            Lstm,
            // General Op::Scan (arbitrary-body recurrence, e.g. IIR biquad) via
            // D2H→CPU→H2D host fallback (forces eager, not graph-captured).
            Scan,
            FusedMatMulBiasAct,
            FusedResidualLN,
            FusedResidualRmsNorm,
            // Fused, then decomposed by the backend's own `unfuse` pass
            // (rlx-cuda / rlx-rocm) before lowering — no monolithic
            // fused-attention kernel yet, same fuse-then-unfuse as WGPU.
            FusedAttentionBlock,
            GaussianSplatRender,
            GaussianSplatRenderBackward,
            GaussianSplatPrepare,
            GaussianSplatRasterize,
            Custom,
            Fft,
            LogMel,
            LogMelBackward,
            WelchPeaks,
            Im2Col,
            RngNormal,
            RngUniform,
        ]
    };
3753
3754    impl Backend for RocmBackend {
3755        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
3756            ROCM_SUPPORTED_OPS
3757        }
3758
3759        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
3760            use rlx_opt::pass::Pass as _;
3761            let graph = rlx_rocm::unfuse::unfuse(graph);
3762            let graph = rlx_opt::legalize_or_rewrite_for_backend(graph, ROCM_SUPPORTED_OPS)
3763                .unwrap_or_else(|errors| {
3764                    panic!("{}", rlx_opt::format_legalize_error("rocm", &errors));
3765                });
3766            let graph = crate::precompile::precompile_cleanup(graph, options);
3767            let graph = rlx_opt::LegalizeBroadcast.run(graph);
3768            let compile_result = crate::stages::compile_graph_stages_for_backend(
3769                rlx_driver::Device::Rocm,
3770                graph,
3771                options,
3772                ROCM_SUPPORTED_OPS,
3773            );
3774            crate::stages::maybe_log_fusion(&compile_result.fusion);
3775            let graph = compile_result.lir.into_graph();
3776            let graph = match options.policy.clone() {
3777                Some(p) => rlx_opt::AutoMixedPrecision::new(p).run(graph),
3778                None => graph,
3779            };
3780            let (graph, io_manifest) = cpu_low_precision::prepare_f32_exec_graph(graph);
3781            Box::new(RocmExecutableWrapper {
3782                inner: RocmExecutable::compile_rng(graph, options.rng),
3783                io_manifest,
3784            })
3785        }
3786
3787        fn compile_lir(
3788            &self,
3789            lir: LirModule,
3790            options: &CompileOptions,
3791        ) -> Box<dyn ExecutableGraph> {
3792            let (graph, io_manifest) =
3793                cpu_low_precision::prepare_f32_exec_graph(prepare_fused_graph(
3794                    rlx_rocm::unfuse::unfuse(lir.into_graph()),
3795                    options,
3796                    ROCM_SUPPORTED_OPS,
3797                    "rocm",
3798                ));
3799            Box::new(RocmExecutableWrapper {
3800                inner: RocmExecutable::compile_rng(graph, options.rng),
3801                io_manifest,
3802            })
3803        }
3804    }
3805
    /// Adapter that exposes a compiled `RocmExecutable` through the
    /// runtime's `ExecutableGraph` trait.
    struct RocmExecutableWrapper {
        /// Compiled ROCm executable that the trait methods forward to.
        inner: RocmExecutable,
        /// I/O dtype manifest produced by
        /// `cpu_low_precision::prepare_f32_exec_graph` at compile time;
        /// `run_typed` consults it to pick each output's dtype.
        io_manifest: cpu_low_precision::IoDtypeManifest,
    }

    // SAFETY: Same Send-claim shape as CudaExecutableWrapper. RocmExecutable
    // owns Arc<RocmContext> + HipBuffer handles; the HipRuntime bundle
    // is internally thread-safe per AMD's documentation.
    unsafe impl Send for RocmExecutableWrapper {}
3815
3816    impl ExecutableGraph for RocmExecutableWrapper {
3817        fn set_param(&mut self, name: &str, data: &[f32]) {
3818            self.inner.set_param(name, data);
3819        }
3820        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
3821            self.inner.run(inputs)
3822        }
3823        fn run_read_outputs(
3824            &mut self,
3825            inputs: &[(&str, &[f32])],
3826            read_indices: Option<&[usize]>,
3827        ) -> Vec<Vec<f32>> {
3828            self.inner.run_read_outputs(inputs, read_indices)
3829        }
3830        fn bind_gpu_handle(&mut self, name: &str, data: &[f32]) -> bool {
3831            self.inner.bind_gpu_handle(name, data)
3832        }
3833        fn has_gpu_handle(&self, name: &str) -> bool {
3834            self.inner.has_gpu_handle(name)
3835        }
3836        fn set_gpu_handle_feed(&mut self, handle_name: &str, output_index: usize) -> bool {
3837            self.inner.set_gpu_handle_feed(handle_name, output_index);
3838            true
3839        }
3840        fn read_gpu_handle(&self, name: &str) -> Option<Vec<f32>> {
3841            self.inner.read_gpu_handle(name)
3842        }
3843        fn run_slots(&mut self, inputs: &[&[f32]]) -> &[(usize, usize)] {
3844            self.inner.run_slots(inputs)
3845        }
3846        fn arena_ptr(&self) -> *const u8 {
3847            self.inner.arena_ptr()
3848        }
3849        fn set_active_extent(&mut self, extent: Option<(usize, usize)>) {
3850            self.inner.set_active_extent(extent);
3851        }
3852
3853        fn set_rng(&mut self, rng: rlx_ir::RngOptions) {
3854            self.inner.set_rng(rng);
3855        }
3856
3857        fn rng(&self) -> rlx_ir::RngOptions {
3858            self.inner.rng()
3859        }
3860
3861        /// Typed param upload — widens F16/BF16 host bytes to f32
3862        /// before routing through `set_param`. ROCm's arena is
3863        /// f32-uniform; the half-precision matmul tier opts in via
3864        /// the separate `set_param_half` API.
3865        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
3866            if matches!(dtype, rlx_ir::DType::U8 | rlx_ir::DType::I8) {
3867                self.inner.set_param_bytes(name, data);
3868                return;
3869            }
3870            if dtype == rlx_ir::DType::F32 {
3871                let n = data.len() / 4;
3872                let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
3873                self.inner.set_param(name, s);
3874            } else {
3875                let f32_buf = super::widen_bytes_to_f32(data, dtype);
3876                self.inner.set_param(name, &f32_buf);
3877            }
3878        }
3879
3880        /// Typed run — widen each typed input to F32, run, then narrow
3881        /// each output back to its declared graph dtype.
3882        fn run_typed(
3883            &mut self,
3884            inputs: &[(&str, &[u8], rlx_ir::DType)],
3885        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
3886            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
3887            for (name, data, dt) in inputs {
3888                let v = super::widen_bytes_to_f32(data, *dt);
3889                owned.push((name.to_string(), v));
3890            }
3891            let refs: Vec<(&str, &[f32])> = owned
3892                .iter()
3893                .map(|(n, d)| (n.as_str(), d.as_slice()))
3894                .collect();
3895            let dtypes =
3896                super::declared_output_dtypes(&self.io_manifest, self.inner.output_dtypes());
3897            let outs = self.inner.run(&refs);
3898            outs.into_iter()
3899                .zip(
3900                    dtypes
3901                        .into_iter()
3902                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
3903                )
3904                .map(|(v, dt)| (super::narrow_f32_to_bytes(&v, dt), dt))
3905                .collect()
3906        }
3907
3908        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
3909            Box::new(RocmExecutableWrapper {
3910                inner: self.inner.clone_for_cache(),
3911                io_manifest: self.io_manifest.clone(),
3912            })
3913        }
3914    }
3915}
3916
3917// ── TPU Backend ─────────────────────────────────────────────────────────
3918
3919#[cfg(feature = "tpu")]
3920pub mod tpu_backend {
3921    use super::*;
3922    use rlx_tpu::TpuExecutable;
3923
    /// Stateless handle for the TPU backend; its `Backend::compile`
    /// produces executables backed by `rlx_tpu::TpuExecutable`.
    pub struct TpuBackend;
3925
    /// Ops the TPU backend lowers to HLO. Full inference parity with
    /// rlx-cuda / rlx-rocm. Composite ops (FusedSwiGLU /
    /// FusedTransformerLayer / LoraMatMul / If / While) are unfused
    /// inside `rlx_tpu::unfuse::unfuse` ahead of HLO emission, so they
    /// don't appear here. `FusedAttentionBlock` IS claimed (for legalize
    /// and op-coverage); the same `unfuse` pass decomposes it to the
    /// primitive chain before HLO, so no HLO-level FAB op is emitted.
    ///
    /// NOTE(review): the "full parity" statement should be re-confirmed —
    /// the ROCm op list in this file also claims `ConvTranspose2d`, `Lstm`,
    /// `Scan`, `Custom`, `Im2Col` and the `GaussianSplat*` kinds, none of
    /// which are listed here (the splat kinds are addressed by the trailing
    /// comment at the end of the list).
    const TPU_SUPPORTED_OPS: &[rlx_ir::OpKind] = {
        use rlx_ir::OpKind::*;
        &[
            Input,
            Param,
            Constant,
            Activation,
            Cast,
            StopGradient,
            Binary,
            Compare,
            Where,
            ElementwiseRegion,
            TransformRegion,
            BatchElementwiseRegion,
            MatMul,
            DotGeneral,
            LayerNorm,
            RmsNorm,
            Attention,
            Rope,
            Reshape,
            Transpose,
            Narrow,
            Concat,
            Expand,
            Gather,
            Reduce,
            Softmax,
            Cumsum,
            TopK,
            Sample,
            Conv,
            Pool,
            GroupedMatMul,
            DequantGroupedMatMul,
            DequantMoEWeights,
            ScatterAdd,
            DequantMatMul,
            SelectiveScan,
            // Real-INT8 path + fake-quant.
            QMatMul,
            QConv2d,
            Quantize,
            Dequantize,
            FusedMatMulBiasAct,
            FusedResidualLN,
            FusedResidualRmsNorm,
            // Claimed for legalize/coverage; `rlx_tpu::unfuse::unfuse`
            // decomposes it to the primitive chain ahead of HLO emission.
            FusedAttentionBlock,
            Fft,
            LogMel,
            LogMelBackward,
            WelchPeaks,
            RngNormal,
            RngUniform,
            // Splat: no on-chip kernel — lowered to common primitive MIR via logical_kernel.
        ]
    };
3993
3994    impl Backend for TpuBackend {
3995        fn supported_ops(&self) -> &'static [rlx_ir::OpKind] {
3996            TPU_SUPPORTED_OPS
3997        }
3998
3999        fn compile(&self, graph: Graph, options: &CompileOptions) -> Box<dyn ExecutableGraph> {
4000            let graph = rlx_opt::legalize_or_rewrite_for_backend_with_config(
4001                graph,
4002                TPU_SUPPORTED_OPS,
4003                options.kernel_dispatch,
4004            )
4005            .unwrap_or_else(|errors| {
4006                panic!("{}", rlx_opt::format_legalize_error("tpu", &errors));
4007            });
4008            // The TPU's IR-side pass pipeline (DCE, ConstFold,
4009            // FuseResidualLN, FuseMatMulBiasAct, LegalizeBroadcast,
4010            // MarkElementwiseRegions) lives inside
4011            // `TpuExecutable::compile` so the same passes run whether
4012            // a caller goes through Session or invokes the executable
4013            // directly. We only do backend-cross-cutting work here:
4014            // legalization (must precede the pipeline so we panic
4015            // early on unsupported ops) and AutoMixedPrecision.
4016            //
4017            // Default policy on TPU is `AutoMixedBf16`: BF16 is the
4018            // native compute dtype on TPU silicon and recent GPUs,
4019            // and XLA's CPU plugin handles it natively too. Callers
4020            // can opt out by passing an explicit `PrecisionPolicy`
4021            // (e.g. `AlwaysF32` for accuracy debugging or
4022            // `AlwaysF16` to match a CUDA workload's choice).
4023            use rlx_opt::pass::Pass as _;
4024            let policy = options
4025                .policy
4026                .clone()
4027                .unwrap_or(rlx_opt::PrecisionPolicy::AutoMixedBf16);
4028            let graph = rlx_opt::AutoMixedPrecision::new(policy).run(graph);
4029            let _ = options.dce;
4030            let _ = options.constant_folding;
4031            Box::new(TpuExecutableWrapper {
4032                inner: TpuExecutable::compile_rng_with_param_bytes(
4033                    graph,
4034                    options.rng,
4035                    options.quant_param_bindings.as_ref(),
4036                ),
4037            })
4038        }
4039    }
4040
    /// Adapter that exposes a compiled `TpuExecutable` through the
    /// runtime's `ExecutableGraph` trait.
    struct TpuExecutableWrapper {
        /// Compiled TPU executable that the trait methods forward to.
        inner: TpuExecutable,
    }

    // SAFETY: PJRT clients + buffers are documented as thread-safe per the
    // upstream C API. Same Send-claim shape as CudaExecutableWrapper /
    // RocmExecutableWrapper.
    unsafe impl Send for TpuExecutableWrapper {}
4049
4050    impl ExecutableGraph for TpuExecutableWrapper {
4051        fn set_param(&mut self, name: &str, data: &[f32]) {
4052            self.inner.set_param(name, data);
4053        }
4054        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
4055            self.inner.run(inputs)
4056        }
4057
4058        /// Typed param upload — widens F16/BF16/etc. host bytes to
4059        /// f32 today. Once the HLO emitter speaks bf16 natively
4060        /// (which TPUs prefer over f16), the typed path will hand
4061        /// the original bytes straight through `Buffer_FromHostBuffer`.
4062        fn set_param_typed(&mut self, name: &str, data: &[u8], dtype: rlx_ir::DType) {
4063            if dtype == rlx_ir::DType::F32 {
4064                let n = data.len() / 4;
4065                let s = unsafe { std::slice::from_raw_parts(data.as_ptr() as *const f32, n) };
4066                self.inner.set_param(name, s);
4067            } else {
4068                let f32_buf = super::widen_bytes_to_f32(data, dtype);
4069                self.inner.set_param(name, &f32_buf);
4070            }
4071        }
4072
4073        fn run_typed(
4074            &mut self,
4075            inputs: &[(&str, &[u8], rlx_ir::DType)],
4076        ) -> Vec<(Vec<u8>, rlx_ir::DType)> {
4077            let mut owned: Vec<(String, Vec<f32>)> = Vec::with_capacity(inputs.len());
4078            for (name, data, dt) in inputs {
4079                let v = super::widen_bytes_to_f32(data, *dt);
4080                owned.push((name.to_string(), v));
4081            }
4082            let refs: Vec<(&str, &[f32])> = owned
4083                .iter()
4084                .map(|(n, d)| (n.as_str(), d.as_slice()))
4085                .collect();
4086            let dtypes = self.inner.output_dtypes();
4087            let outs = self.inner.run(&refs);
4088            outs.into_iter()
4089                .zip(
4090                    dtypes
4091                        .into_iter()
4092                        .chain(std::iter::repeat(rlx_ir::DType::F32)),
4093                )
4094                .map(|(v, dt)| (super::narrow_f32_to_bytes(&v, dt), dt))
4095                .collect()
4096        }
4097
4098        fn clone_box(&self) -> Box<dyn ExecutableGraph> {
4099            Box::new(TpuExecutableWrapper {
4100                inner: self.inner.clone_for_cache(),
4101            })
4102        }
4103    }
4104}
4105
4106/// QNN (Hexagon NPU) backend adapter — wraps the `rlx-qnn` FFI runtime
4107/// (`Device::Hexagon`). Milestone 1: a single rank-2 MatMul executed in-process
4108/// on a QNN backend library (libQnnCpu.so / libQnnHtp.so) through the dynamic
4109/// QNN C API. Mirrors `coreml_backend`, the other NPU adapter.
4110#[cfg(feature = "qnn")]
4111pub mod qnn_backend {
4112    use super::*;
4113    use rlx_qnn::runtime::QnnExecutable;
4114
    /// Stateless handle for the QNN backend; its `Backend::compile`
    /// produces executables backed by `rlx_qnn::runtime::QnnExecutable`.
    pub struct QnnBackend;
4116
4117    impl Backend for QnnBackend {
4118        // `supported_ops()` keeps the trait default (empty = "accept
4119        // everything"): a non-empty list makes LegalizeForBackend reject every
4120        // other kind, including structural `Input`/`Output`. Milestone 1's
4121        // recognizer (`Model::from_graph`) already errors clearly on any graph
4122        // that isn't a single MatMul, so we don't gate via legalize yet.
4123
4124        fn compile(&self, graph: Graph, _options: &CompileOptions) -> Box<dyn ExecutableGraph> {
4125            let exec = QnnExecutable::compile_graph(&graph)
4126                .unwrap_or_else(|e| panic!("rlx-qnn compile failed: {e}"));
4127            Box::new(QnnExecutableWrapper { inner: exec })
4128        }
4129
4130        fn compile_lir(
4131            &self,
4132            lir: LirModule,
4133            options: &CompileOptions,
4134        ) -> Box<dyn ExecutableGraph> {
4135            // No LIR arena path for QNN (milestone-1 single matmul): reconstruct
4136            // the graph and go through the normal compile.
4137            self.compile(lir.into_graph(), options)
4138        }
4139    }
4140
    /// Adapter that exposes a compiled `QnnExecutable` through the
    /// runtime's `ExecutableGraph` trait.
    struct QnnExecutableWrapper {
        /// Compiled QNN executable that the trait methods forward to.
        inner: QnnExecutable,
    }
4144
4145    impl ExecutableGraph for QnnExecutableWrapper {
4146        fn set_param(&mut self, name: &str, data: &[f32]) {
4147            // Bind static weights (Param tensors) by name; unknown names are
4148            // ignored inside the executable.
4149            self.inner.set_param(name, data);
4150        }
4151
4152        fn run(&mut self, inputs: &[(&str, &[f32])]) -> Vec<Vec<f32>> {
4153            self.inner
4154                .run(inputs)
4155                .unwrap_or_else(|e| panic!("rlx-qnn run failed: {e}"))
4156        }
4157    }
4158}
rlx_runtime/backend.rs

rlx_runtime/
backend.rs