rlx_runtime/
compile_cache.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Shape-bucketed compile cache.
17//!
18//! Lets variable-shape callers (e.g., embedding-model wrappers that vary
19//! batch + seq per request) amortize the per-(shape) compile cost. Cache
20//! keys are caller-provided `u64`s — the caller decides what counts as a
21//! shape bucket. Typical recipe: `(batch as u64) << 32 | seq as u64`.
22//!
23//! The cache stores one `CompiledGraph` per key. Params loaded onto a
24//! cached entry persist for that entry — re-fetching from cache does
25//! **not** require re-running `set_param`. Eviction is FIFO, capped at
26//! `capacity` entries (good enough for the current "a handful of common
27//! shapes" usage pattern; switch to LRU if a real workload shows churn).
28//!
29//! # Example
30//!
31//! ```rust,ignore
32//! let mut cache = CompileCache::new(Device::Metal, 8);
33//! let key = ((batch as u64) << 32) | seq as u64;
34//! let mut compiled = cache.get_or_compile(key, || build_my_graph(batch, seq));
35//! // First call for `key`: compiles. Subsequent calls: cache hit.
36//! compiled.run(&[("x", &input_data)]);
37//! ```
38
39use crate::{CompiledGraph, Device, Session};
40use rlx_ir::DimBinding;
41use rlx_ir::Graph;
42use rlx_ir::hir::HirModule;
43use rlx_opt::CompileResult;
44use std::collections::HashMap;
45use std::collections::VecDeque;
46use std::ops::Range;
47
/// Named runtime input for [`BucketedCompileCache::run_padded_mixed`].
///
/// Holds only borrowed data, so it is `Copy` and can be passed around freely.
#[derive(Debug, Clone, Copy)]
pub struct CacheRunInput<'a> {
    /// Input name forwarded to the compiled graph's `run` call.
    pub name: &'a str,
    /// Flattened `f32` payload for this input.
    pub data: &'a [f32],
    /// Row inner stride for [`pad_rows`]; `None` = use data as-is (no padding).
    pub row_inner: Option<usize>,
}
55
56pub struct CompileCache {
57    device: Device,
58    capacity: usize,
59    // Per-cache precision policy. None → default (F32). Set once at
60    // construction; applies to every compile this cache performs.
61    policy: Option<rlx_opt::PrecisionPolicy>,
62    // (key, compiled). Vec keeps insertion order for FIFO eviction; the
63    // expected hit-rate at our cap (~8) makes the linear scan cheaper
64    // than a HashMap + separate eviction list.
65    entries: Vec<(u64, CompiledGraph)>,
66    // Insertion order for eviction.
67    order: VecDeque<u64>,
68}
69
70impl CompileCache {
71    pub fn new(device: Device, capacity: usize) -> Self {
72        Self::with_policy(device, capacity, None)
73    }
74
75    /// Cache that compiles every entry with the given precision policy.
76    /// Use this when the cached entries should differ from CPU-default
77    /// F32 — e.g., `PrecisionPolicy::AutoMixed` for f16 compute on Metal.
78    pub fn with_policy(
79        device: Device,
80        capacity: usize,
81        policy: Option<rlx_opt::PrecisionPolicy>,
82    ) -> Self {
83        assert!(capacity > 0, "CompileCache capacity must be ≥ 1");
84        Self {
85            device,
86            capacity,
87            policy,
88            entries: Vec::with_capacity(capacity),
89            order: VecDeque::with_capacity(capacity),
90        }
91    }
92
93    /// Compile if not present, then return a mutable reference. The borrow
94    /// lifetime is tied to `&mut self` so callers naturally serialize their
95    /// use of any one entry — the cache is single-owner today.
96    pub fn get_or_compile<F: FnOnce() -> Graph>(
97        &mut self,
98        key: u64,
99        build: F,
100    ) -> &mut CompiledGraph {
101        self.get_or_compile_with_options(key, build, &crate::CompileOptions::new())
102    }
103
104    /// Like [`Self::get_or_compile`] with explicit [`CompileOptions`].
105    pub fn get_or_compile_with_options<F: FnOnce() -> Graph>(
106        &mut self,
107        key: u64,
108        build: F,
109        options: &crate::CompileOptions,
110    ) -> &mut CompiledGraph {
111        if let Some(idx) = self.entries.iter().position(|(k, _)| *k == key) {
112            return &mut self.entries[idx].1;
113        }
114        let mut session = Session::new(self.device);
115        if let Some(p) = &self.policy {
116            session = session.with_policy(p.clone());
117        }
118        let compiled = session.compile_with(build(), options);
119
120        // Evict FIFO if at capacity.
121        if self.entries.len() >= self.capacity
122            && let Some(evict_key) = self.order.pop_front()
123        {
124            sync_evicted_entry(&mut self.entries, evict_key);
125            self.entries.retain(|(k, _)| *k != evict_key);
126        }
127        self.entries.push((key, compiled));
128        self.order.push_back(key);
129        &mut self.entries.last_mut().unwrap().1
130    }
131
132    /// Like [`Self::get_or_compile_with_options`] but builds and compiles HIR directly.
133    pub fn get_or_compile_hir_with_options<F: FnOnce() -> rlx_ir::hir::HirModule>(
134        &mut self,
135        key: u64,
136        build: F,
137        options: &crate::CompileOptions,
138    ) -> &mut CompiledGraph {
139        if let Some(idx) = self.entries.iter().position(|(k, _)| *k == key) {
140            return &mut self.entries[idx].1;
141        }
142        let mut session = Session::new(self.device);
143        if let Some(p) = &self.policy {
144            session = session.with_policy(p.clone());
145        }
146        let compiled = session
147            .compile_hir_with(build(), options)
148            .expect("HIR lower/compile in compile cache");
149
150        if self.entries.len() >= self.capacity
151            && let Some(evict_key) = self.order.pop_front()
152        {
153            sync_evicted_entry(&mut self.entries, evict_key);
154            self.entries.retain(|(k, _)| *k != evict_key);
155        }
156        self.entries.push((key, compiled));
157        self.order.push_back(key);
158        &mut self.entries.last_mut().unwrap().1
159    }
160
161    /// Number of entries currently cached. Useful for tests + diagnostics.
162    pub fn len(&self) -> usize {
163        self.entries.len()
164    }
165    pub fn is_empty(&self) -> bool {
166        self.entries.is_empty()
167    }
168    /// Was this key already compiled? Doesn't change recency.
169    pub fn contains(&self, key: u64) -> bool {
170        self.entries.iter().any(|(k, _)| *k == key)
171    }
172
173    /// Drop all cached compiled graphs (free weight params).
174    pub fn clear(&mut self) {
175        self.sync_all();
176        self.entries.clear();
177        self.order.clear();
178    }
179
180    /// Drain in-flight GPU work on every cached entry (Metal `commit_no_wait` paths).
181    pub fn sync_all(&mut self) {
182        for (_, compiled) in &mut self.entries {
183            compiled.sync_pending();
184        }
185    }
186}
187
188fn sync_evicted_entry(entries: &mut [(u64, CompiledGraph)], evict_key: u64) {
189    if let Some((_, compiled)) = entries.iter_mut().find(|(k, _)| *k == evict_key) {
190        compiled.sync_pending();
191    }
192}
193
194// ── Bucketed cache (PLAN L1) ──────────────────────────────────────────
195//
196// Variant of `CompileCache` that compiles one `CompiledGraph` per shape
197// *range* instead of per exact key. The caller declares buckets up front
198// (e.g. `1..16`, `16..64`, `64..256`); each bucket is compiled lazily at
199// its upper bound the first time a key in that bucket arrives.
200//
201// Trade vs `CompileCache`: unique keys → unique compiles becomes unique
202// buckets → unique compiles. The compiled graph is specialized for each
203// bucket's upper-bound dim. Two ways to use it:
204//
205// **Manual padding** — caller drives the pad/slice cycle:
206// ```rust,ignore
207// let buckets = vec![1..16, 16..64, 64..256];
208// let mut cache = BucketedCompileCache::new(Device::Metal, buckets);
209// let (upper, compiled) = cache
210//     .get_or_compile(seq as u64, |max_seq| build_graph(max_seq as usize))
211//     .expect("seq within buckets");
212// // pad input to `upper as usize` elements before run
213// compiled.run(&[("x", &padded)]);
214// ```
215//
216// **`run_padded` shortcut** — cache pads and slices for you:
217// ```rust,ignore
218// let (upper, outputs) = cache.run_padded(
219//     seq as u64,
220//     seq,                                    // actual rows
221//     |max_seq| build_graph(max_seq as usize),
222//     &[("x", &raw_input, hidden)],           // (name, data, inner stride)
223//     &[hidden],                              // per-output inner stride
224// ).expect("in range");
225// ```
226//
227// **How "skip compute" actually works here**: each bucket compiles at
228// its own upper bound, so kernels run at *that* extent, not at some
// global maximum. Smaller buckets ⇒ less padded compute. The
// `power_of_two_ladder` constructor builds a logarithmic schedule that
// bounds padding waste at ≤2× (for every bucket except possibly the
// smallest, which always starts at 1) in exchange for `O(log max)`
// compiled artifacts. For finer control, hand-construct the bucket list.
233//
234// True per-kernel active-extent dispatch (one big compile, runtime
235// extent override that short-circuits each kernel's inner loop) is a
236// per-backend change across `rlx-cuda`, `rlx-rocm`,
237// `rlx-cpu/src/thunk.rs`, `rlx-metal/src/thunk.rs`, `rlx-mlx`,
238// `rlx-wgpu` — multi-day project, not in this layer.
239
240pub struct BucketedCompileCache {
241    device: Device,
242    policy: Option<rlx_opt::PrecisionPolicy>,
243    buckets: Vec<Bucket>,
244}
245
246struct Bucket {
247    range: Range<u64>,
248    compiled: Option<CompiledGraph>,
249}
250
251impl BucketedCompileCache {
252    pub fn new(device: Device, buckets: Vec<Range<u64>>) -> Self {
253        Self::with_policy(device, buckets, None)
254    }
255
256    /// Power-of-two ladder over `[1, max]`, with extents
257    /// `[min_pow2, 2·min_pow2, 4·min_pow2, …, max_pow2]` where
258    /// `min_pow2 = min.next_power_of_two()` and `max_pow2` is the smallest
259    /// power of two ≥ `max`. Each bucket compiles at its upper-bound
260    /// extent, so an `actual` value in bucket `(prev_extent .. ext]` runs
261    /// kernels at extent `ext` (not at the worst case of the whole range).
262    /// Guarantees compute waste from padding ≤2× — `actual > ext / 2`
263    /// for every bucket except possibly the smallest.
264    ///
265    /// Example: `power_of_two_ladder(Device::Cpu, 8, 256)` yields buckets
266    /// `1..9, 9..17, 17..33, 33..65, 65..129, 129..257` with compile
267    /// extents `8, 16, 32, 64, 128, 256`. An `actual = 17` runs at extent
268    /// 32 instead of the 255 a single wide `1..256` bucket would compile
269    /// at — that's the "skip compute" win, paid for with `O(log max)`
270    /// compiled artifacts instead of one.
271    pub fn power_of_two_ladder(device: Device, min: u64, max: u64) -> Self {
272        Self::power_of_two_ladder_with_policy(device, min, max, None)
273    }
274
275    pub fn power_of_two_ladder_with_policy(
276        device: Device,
277        min: u64,
278        max: u64,
279        policy: Option<rlx_opt::PrecisionPolicy>,
280    ) -> Self {
281        assert!(min >= 1, "power_of_two_ladder: min must be ≥ 1, got {min}");
282        assert!(
283            max >= min,
284            "power_of_two_ladder: max ({max}) must be ≥ min ({min})"
285        );
286        let mut buckets: Vec<Range<u64>> = Vec::new();
287        let mut start = 1u64;
288        let mut extent = min.next_power_of_two();
289        loop {
290            buckets.push(start..(extent + 1));
291            if extent >= max {
292                break;
293            }
294            start = extent + 1;
295            extent = extent
296                .checked_mul(2)
297                .expect("power_of_two_ladder: extent overflow");
298        }
299        Self::with_policy(device, buckets, policy)
300    }
301
302    pub fn with_policy(
303        device: Device,
304        buckets: Vec<Range<u64>>,
305        policy: Option<rlx_opt::PrecisionPolicy>,
306    ) -> Self {
307        assert!(!buckets.is_empty(), "BucketedCompileCache needs ≥1 bucket");
308        for (i, b) in buckets.iter().enumerate() {
309            assert!(b.start < b.end, "bucket {i} ({b:?}) is empty");
310            if i + 1 < buckets.len() {
311                assert!(
312                    b.end <= buckets[i + 1].start,
313                    "buckets {i} ({b:?}) and {} ({:?}) overlap",
314                    i + 1,
315                    buckets[i + 1],
316                );
317            }
318        }
319        let buckets = buckets
320            .into_iter()
321            .map(|range| Bucket {
322                range,
323                compiled: None,
324            })
325            .collect();
326        Self {
327            device,
328            policy,
329            buckets,
330        }
331    }
332
333    /// Find the bucket containing `key`, compile if needed, return
334    /// `(upper, &mut CompiledGraph)` where `upper = range.end - 1` is the
335    /// extent the graph was compiled for. Caller pads inputs to `upper`
336    /// before calling `run`. Returns `None` if `key` is outside every
337    /// bucket — caller decides whether to fall back to a one-off compile.
338    ///
339    /// `build` receives `upper` and must return a `Graph` specialized for
340    /// that extent.
341    pub fn get_or_compile<F: FnOnce(u64) -> Graph>(
342        &mut self,
343        key: u64,
344        build: F,
345    ) -> Option<(u64, &mut CompiledGraph)> {
346        self.get_or_compile_with_options(key, build, &crate::CompileOptions::new())
347    }
348
349    /// Like [`Self::get_or_compile`] with explicit [`CompileOptions`].
350    pub fn get_or_compile_with_options<F: FnOnce(u64) -> Graph>(
351        &mut self,
352        key: u64,
353        build: F,
354        options: &crate::CompileOptions,
355    ) -> Option<(u64, &mut CompiledGraph)> {
356        let idx = self.bucket_for(key)?;
357        let upper = self.buckets[idx].range.end - 1;
358        if self.buckets[idx].compiled.is_none() {
359            let mut session = Session::new(self.device);
360            if let Some(p) = &self.policy {
361                session = session.with_policy(p.clone());
362            }
363            self.buckets[idx].compiled = Some(session.compile_with(build(upper), options));
364        }
365        Some((upper, self.buckets[idx].compiled.as_mut().unwrap()))
366    }
367
368    /// Like [`Self::get_or_compile`] but builds and compiles HIR directly
369    /// through the fusion-first pipeline (`Session::compile_hir`).
370    pub fn get_or_compile_hir<F: FnOnce(u64) -> HirModule>(
371        &mut self,
372        key: u64,
373        build: F,
374    ) -> Option<(u64, &mut CompiledGraph)> {
375        self.get_or_compile_hir_with_options(key, build, &crate::CompileOptions::new())
376    }
377
378    /// Like [`Self::get_or_compile_hir`] with explicit [`CompileOptions`] (tier-1 profile, fusion target, …).
379    pub fn get_or_compile_hir_with_options<F: FnOnce(u64) -> HirModule>(
380        &mut self,
381        key: u64,
382        build: F,
383        options: &crate::CompileOptions,
384    ) -> Option<(u64, &mut CompiledGraph)> {
385        let idx = self.bucket_for(key)?;
386        let upper = self.buckets[idx].range.end - 1;
387        if self.buckets[idx].compiled.is_none() {
388            let mut session = Session::new(self.device);
389            if let Some(p) = &self.policy {
390                session = session.with_policy(p.clone());
391            }
392            let compiled = session
393                .compile_hir_with(build(upper), options)
394                .expect("HIR lower/compile in bucketed cache");
395            self.buckets[idx].compiled = Some(compiled);
396        }
397        Some((upper, self.buckets[idx].compiled.as_mut().unwrap()))
398    }
399
400    /// Index of the bucket containing `key`, or `None` if out of range.
401    /// Linear scan — bucket counts are small in practice.
402    pub fn bucket_for(&self, key: u64) -> Option<usize> {
403        self.buckets.iter().position(|b| b.range.contains(&key))
404    }
405
406    /// Upper compile extent for `key`'s bucket (`range.end - 1`), without compiling.
407    pub fn bucket_upper_for_key(&self, key: u64) -> Option<u64> {
408        let idx = self.bucket_for(key)?;
409        Some(self.buckets[idx].range.end - 1)
410    }
411
412    pub fn buckets(&self) -> impl Iterator<Item = &Range<u64>> {
413        self.buckets.iter().map(|b| &b.range)
414    }
415
416    /// Number of buckets that have been compiled so far (≤ total buckets).
417    pub fn compiled_count(&self) -> usize {
418        self.buckets.iter().filter(|b| b.compiled.is_some()).count()
419    }
420
421    /// Mutable compiled graph for `key`'s bucket, if already compiled.
422    pub fn compiled_for_key_mut(&mut self, key: u64) -> Option<&mut CompiledGraph> {
423        let idx = self.bucket_for(key)?;
424        self.buckets[idx].compiled.as_mut()
425    }
426
427    /// Immutable compiled graph for a bucket with compile upper bound `upper`.
428    pub fn compiled_for_upper(&self, upper: u64) -> Option<&CompiledGraph> {
429        self.buckets
430            .iter()
431            .find(|b| b.range.end.saturating_sub(1) == upper)
432            .and_then(|b| b.compiled.as_ref())
433    }
434
435    /// Copy parameter storage from the bucket compiled at `src_upper` into `dst_upper`.
436    pub fn try_copy_params_between_uppers(&mut self, dst_upper: u64, src_upper: u64) -> bool {
437        if dst_upper == src_upper {
438            return true;
439        }
440        let dst_idx = self
441            .buckets
442            .iter()
443            .position(|b| b.range.end.saturating_sub(1) == dst_upper);
444        let src_idx = self
445            .buckets
446            .iter()
447            .position(|b| b.range.end.saturating_sub(1) == src_upper);
448        let (Some(dst_idx), Some(src_idx)) = (dst_idx, src_idx) else {
449            return false;
450        };
451        if dst_idx == src_idx {
452            return true;
453        }
454        let (dst, src) = if dst_idx < src_idx {
455            let (left, right) = self.buckets.split_at_mut(src_idx);
456            (&mut left[dst_idx], &right[0])
457        } else {
458            let (left, right) = self.buckets.split_at_mut(dst_idx);
459            (&mut right[0], &left[src_idx])
460        };
461        let Some(dst_c) = dst.compiled.as_mut() else {
462            return false;
463        };
464        let Some(src_c) = src.compiled.as_ref() else {
465            return false;
466        };
467        dst_c.copy_params_from(src_c)
468    }
469
470    /// D2D seed resident KV from `src_key`'s bucket into `dst_key`'s bucket.
471    /// See `rlx-cuda::CudaExecutable::copy_resident_kv_rows_from` and
472    /// `rlx-llama32/docs/cuda-gguf-decode.md`.
473    pub fn seed_resident_kv_prefix_from_keys(
474        &mut self,
475        src_key: u64,
476        dst_key: u64,
477        prefix_tokens: usize,
478        outgoing_upper: usize,
479        kv_dim: usize,
480        n_layers: usize,
481    ) -> bool {
482        let Some(src_idx) = self.bucket_for(src_key) else {
483            return false;
484        };
485        let Some(dst_idx) = self.bucket_for(dst_key) else {
486            return false;
487        };
488        if src_idx == dst_idx {
489            return true;
490        }
491        if src_idx < dst_idx {
492            let (left, right) = self.buckets.split_at_mut(dst_idx);
493            let Some(src) = left[src_idx].compiled.as_ref() else {
494                return false;
495            };
496            let Some(dst) = right[0].compiled.as_mut() else {
497                return false;
498            };
499            return dst.seed_resident_kv_prefix_from(
500                src,
501                prefix_tokens,
502                outgoing_upper,
503                kv_dim,
504                n_layers,
505            );
506        }
507        let (left, right) = self.buckets.split_at_mut(src_idx);
508        let Some(src) = right[0].compiled.as_ref() else {
509            return false;
510        };
511        let Some(dst) = left[dst_idx].compiled.as_mut() else {
512            return false;
513        };
514        dst.seed_resident_kv_prefix_from(src, prefix_tokens, outgoing_upper, kv_dim, n_layers)
515    }
516
517    /// Hybrid bucket rollover: H2D the host-known prefix once, then copy only rows
518    /// the host cache does not already have from `src` (via `copy_resident_kv_rows_from`).
519    /// Not used by `rlx-llama32` today (generator flush+bind path); kept for experiments.
520    pub fn rebind_resident_kv_hybrid_from_keys(
521        &mut self,
522        src_key: u64,
523        dst_key: u64,
524        host_k: &[Vec<f32>],
525        host_v: &[Vec<f32>],
526        prefix_tokens: usize,
527        outgoing_upper: usize,
528        upper: usize,
529        kv_dim: usize,
530        n_layers: usize,
531    ) -> bool {
532        let Some(src_idx) = self.bucket_for(src_key) else {
533            return false;
534        };
535        let Some(dst_idx) = self.bucket_for(dst_key) else {
536            return false;
537        };
538        if src_idx == dst_idx {
539            return true;
540        }
541        let host_rows = host_k.first().map(|k| k.len() / kv_dim.max(1)).unwrap_or(0);
542        let rebind = |src: &CompiledGraph, dst: &mut CompiledGraph| -> bool {
543            for i in 0..n_layers {
544                let mut kp = vec![0f32; upper * kv_dim];
545                let mut vp = vec![0f32; upper * kv_dim];
546                let nk = host_k[i].len().min(kp.len());
547                let nv = host_v[i].len().min(vp.len());
548                kp[..nk].copy_from_slice(&host_k[i][..nk]);
549                vp[..nv].copy_from_slice(&host_v[i][..nv]);
550                let k_name = format!("past_k_{i}");
551                let v_name = format!("past_v_{i}");
552                if !dst.bind_gpu_handle(&k_name, &kp) || !dst.bind_gpu_handle(&v_name, &vp) {
553                    return false;
554                }
555                dst.register_kv_row_feed(&k_name, 1 + 2 * i);
556                dst.register_kv_row_feed(&v_name, 2 + 2 * i);
557            }
558            dst.stage_bound_gpu_handles_to_arena();
559            if host_rows < prefix_tokens {
560                return dst.copy_resident_kv_rows_from(
561                    src,
562                    host_rows,
563                    prefix_tokens,
564                    outgoing_upper,
565                    kv_dim,
566                    n_layers,
567                );
568            }
569            for i in 0..n_layers {
570                let k_name = format!("past_k_{i}");
571                let v_name = format!("past_v_{i}");
572                dst.prepare_resident_gpu_handle(&k_name);
573                dst.prepare_resident_gpu_handle(&v_name);
574            }
575            true
576        };
577        if src_idx < dst_idx {
578            let (left, right) = self.buckets.split_at_mut(dst_idx);
579            let Some(src) = left[src_idx].compiled.as_ref() else {
580                return false;
581            };
582            let Some(dst) = right[0].compiled.as_mut() else {
583                return false;
584            };
585            return rebind(src, dst);
586        }
587        let (left, right) = self.buckets.split_at_mut(src_idx);
588        let Some(src) = right[0].compiled.as_ref() else {
589            return false;
590        };
591        let Some(dst) = left[dst_idx].compiled.as_mut() else {
592            return false;
593        };
594        rebind(src, dst)
595    }
596
597    pub fn total_buckets(&self) -> usize {
598        self.buckets.len()
599    }
600
601    /// Drop compiled graphs for all buckets except `keep`, freeing weight params.
602    pub fn evict_except(&mut self, keep: usize) {
603        for (i, bucket) in self.buckets.iter_mut().enumerate() {
604            if i != keep {
605                bucket.compiled = None;
606            }
607        }
608    }
609
610    /// Drop the single highest-index compiled bucket that is not `protect`,
611    /// freeing its weight params, and return its index (or `None` if no
612    /// other bucket is compiled). Pass `usize::MAX` for `protect` to allow
613    /// evicting any bucket.
614    ///
615    /// This is the incremental counterpart to [`Self::evict_except`]: rather
616    /// than collapsing to a single resident bucket, callers can free one
617    /// bucket at a time (highest index first) until a memory budget is met,
618    /// keeping the rest of the ladder cached. Highest-index-first is the
619    /// right victim order for a monotonically increasing `past_seq` sweep —
620    /// the top bucket is reached last within an utterance and the low buckets
621    /// recur first at the start of the next, so evicting from the top
622    /// maximizes cross-utterance reuse.
623    pub fn evict_one_except(&mut self, protect: usize) -> Option<usize> {
624        let victim = self
625            .buckets
626            .iter()
627            .enumerate()
628            .rev()
629            .find(|(i, b)| *i != protect && b.compiled.is_some())
630            .map(|(i, _)| i)?;
631        if let Some(compiled) = self.buckets[victim].compiled.as_mut() {
632            compiled.sync_pending();
633        }
634        self.buckets[victim].compiled = None;
635        Some(victim)
636    }
637
638    /// Drop all compiled graphs (free param storage).
639    pub fn clear_compiled(&mut self) {
640        for bucket in &mut self.buckets {
641            bucket.compiled = None;
642        }
643    }
644
645    /// "Compile at max, run at less" convenience for inputs and outputs
646    /// whose outer dimension is the bucket key:
647    ///
648    /// 1. Find or compile the bucket containing `key`.
649    /// 2. For each input, pad to `upper` rows along the outer dim using
650    ///    `pad_rows` (caller passes the inner-dim stride per input;
651    ///    `inner = 1` for purely 1D inputs).
652    /// 3. Run the compiled graph at full extent.
653    /// 4. Slice each output back to `actual_rows` along its outer dim.
654    ///    Outputs flagged with `inner = 0` in `output_inners` are
655    ///    returned unsliced (use this for extent-independent outputs
656    ///    like a pooled `[hidden]` embedding). Missing entries past
657    ///    the end of `output_inners` are also returned unsliced.
658    ///
659    /// Returns `(upper, outputs)`. Returns `None` if `key` falls outside
660    /// every bucket.
661    ///
662    /// **Compute scope:** kernels execute at the bucket's compile
663    /// extent (`upper`), not at `actual_rows`. This means smaller
664    /// buckets directly translate to less padded compute. With
665    /// [`power_of_two_ladder`](Self::power_of_two_ladder) the worst-
666    /// case waste is bounded at 2×; with hand-tuned buckets it can be
667    /// arbitrarily tight. True active-extent dispatch — one big
668    /// compile, kernels short-circuit at runtime — is a separate
669    /// per-backend change.
670    pub fn run_padded<F: FnOnce(u64) -> Graph>(
671        &mut self,
672        key: u64,
673        actual_rows: usize,
674        build: F,
675        inputs: &[(&str, &[f32], usize)],
676        output_inners: &[usize],
677    ) -> Option<(u64, Vec<Vec<f32>>)> {
678        let (upper, compiled) = self.get_or_compile(key, build)?;
679
680        // Own the padded buffers so they outlive the borrow handed to `run`.
681        let padded: Vec<(&str, Vec<f32>)> = inputs
682            .iter()
683            .map(|(name, data, inner)| (*name, pad_rows(data, *inner, upper)))
684            .collect();
685        let pairs: Vec<(&str, &[f32])> = padded.iter().map(|(n, d)| (*n, d.as_slice())).collect();
686
687        // Hint active-extent: backends that support per-kernel skip-
688        // compute (today: CPU's Activation thunk family) honor it; the
689        // default trait impl is a no-op, so other backends just process
690        // full extent and the slice_rows below still gives the user
691        // correct outputs.
692        compiled.set_active_extent(Some((actual_rows, upper as usize)));
693        let raw_outputs = compiled.run(&pairs);
694        compiled.set_active_extent(None);
695        #[cfg(feature = "cpu")]
696        crate::onnx_active::set_active_token_count(None);
697
698        let outs = raw_outputs
699            .into_iter()
700            .enumerate()
701            .map(|(i, out)| match output_inners.get(i).copied() {
702                Some(0) | None => out,
703                Some(inner) => slice_rows(&out, inner, actual_rows),
704            })
705            .collect();
706
707        Some((upper, outs))
708    }
709
710    /// Like [`Self::get_or_compile_with_options`] but also uploads `params` on first compile.
711    pub fn ensure_graph_with_params<F>(
712        &mut self,
713        key: u64,
714        build: F,
715        options: &crate::CompileOptions,
716    ) -> Option<(u64, &mut CompiledGraph)>
717    where
718        F: FnOnce(u64) -> (Graph, HashMap<String, Vec<f32>>),
719    {
720        let idx = self.bucket_for(key)?;
721        let upper = self.buckets[idx].range.end - 1;
722        if self.buckets[idx].compiled.is_none() {
723            let (graph, params) = build(upper);
724            let mut session = Session::new(self.device);
725            if let Some(p) = &self.policy {
726                session = session.with_policy(p.clone());
727            }
728            let mut compiled = session.compile_with(graph, options);
729            for (name, data) in params {
730                compiled.set_param(&name, &data);
731            }
732            self.buckets[idx].compiled = Some(compiled);
733        }
734        Some((upper, self.buckets[idx].compiled.as_mut().unwrap()))
735    }
736
737    /// HIR variant of [`Self::ensure_graph_with_params`].
738    pub fn ensure_hir_with_params<F>(
739        &mut self,
740        key: u64,
741        build: F,
742        options: &crate::CompileOptions,
743    ) -> Option<(u64, &mut CompiledGraph)>
744    where
745        F: FnOnce(u64) -> (HirModule, HashMap<String, Vec<f32>>),
746    {
747        let idx = self.bucket_for(key)?;
748        let upper = self.buckets[idx].range.end - 1;
749        if self.buckets[idx].compiled.is_none() {
750            let (hir, params) = build(upper);
751            let mut session = Session::new(self.device);
752            if let Some(p) = &self.policy {
753                session = session.with_policy(p.clone());
754            }
755            let mut compiled = session
756                .compile_hir_with(hir, options)
757                .expect("HIR lower/compile in ensure_hir_with_params");
758            for (name, data) in params {
759                compiled.set_param(&name, &data);
760            }
761            self.buckets[idx].compiled = Some(compiled);
762        }
763        Some((upper, self.buckets[idx].compiled.as_mut().unwrap()))
764    }
765
766    /// [`Self::run_padded`] with per-input optional row padding (`CacheRunInput`).
767    pub fn run_padded_mixed<F>(
768        &mut self,
769        key: u64,
770        actual_rows: usize,
771        build: F,
772        inputs: &[CacheRunInput<'_>],
773        output_inners: &[usize],
774    ) -> Option<(u64, Vec<Vec<f32>>)>
775    where
776        F: FnOnce(u64) -> Graph,
777    {
778        let (upper, compiled) = self.get_or_compile(key, build)?;
779
780        let padded: Vec<(&str, Vec<f32>)> = inputs
781            .iter()
782            .map(|inp| match inp.row_inner {
783                Some(inner) => (inp.name, pad_rows(inp.data, inner, upper)),
784                None => (inp.name, inp.data.to_vec()),
785            })
786            .collect();
787        let pairs: Vec<(&str, &[f32])> = padded.iter().map(|(n, d)| (*n, d.as_slice())).collect();
788
789        compiled.set_active_extent(Some((actual_rows, upper as usize)));
790        let raw_outputs = compiled.run(&pairs);
791        compiled.set_active_extent(None);
792        #[cfg(feature = "cpu")]
793        crate::onnx_active::set_active_token_count(None);
794
795        let outs = raw_outputs
796            .into_iter()
797            .enumerate()
798            .map(|(i, out)| match output_inners.get(i).copied() {
799                Some(0) | None => out,
800                Some(inner) => slice_rows(&out, inner, actual_rows),
801            })
802            .collect();
803
804        Some((upper, outs))
805    }
806
807    /// Drain in-flight GPU work on every compiled bucket.
808    pub fn sync_all(&mut self) {
809        for bucket in &mut self.buckets {
810            if let Some(compiled) = &mut bucket.compiled {
811                compiled.sync_pending();
812            }
813        }
814    }
815}
816
817// ── Dynamic-dim cache (plan #54) ──────────────────────────────────────
818//
819// Compile HIR once through the fusion pipeline (graph may contain
820// `Dim::Dynamic` symbols), then specialize to concrete shapes per cache
821// key and backend-compile the resulting LIR.
822
/// Compile-once / specialize-at-runtime cache for symbolic HIR modules.
///
/// The HIR module is compiled a single time into a template with
/// `dim_binding` cleared; each cache key then gets its own specialized,
/// backend-compiled graph. At most `capacity` specialized graphs are kept,
/// and the oldest inserted key is evicted first.
pub struct DynamicDimCompileCache {
    /// Device every specialization is compiled for.
    device: Device,
    /// Fallback precision policy, applied by `get_or_specialize` when the
    /// caller's compile options carry none.
    policy: Option<rlx_opt::PrecisionPolicy>,
    /// Maximum number of specialized entries kept before eviction kicks in.
    capacity: usize,
    /// Result of compiling the HIR once; `None` until first built.
    template: Option<CompileResult>,
    /// Specialized graphs stored as `(key, graph)` pairs.
    entries: Vec<(u64, CompiledGraph)>,
    /// Keys in insertion order; the front is the next eviction candidate.
    order: VecDeque<u64>,
}
832
833impl DynamicDimCompileCache {
    /// Create a cache for `device` that keeps at most `capacity` specialized
    /// graphs and has no fallback precision policy.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero (checked by [`Self::with_policy`]).
    pub fn new(device: Device, capacity: usize) -> Self {
        Self::with_policy(device, capacity, None)
    }
837
    /// Create a cache for `device` that keeps at most `capacity` specialized
    /// graphs, with an optional fallback precision `policy`.
    ///
    /// The cache starts empty: no template and no specialized entries.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is zero.
    pub fn with_policy(
        device: Device,
        capacity: usize,
        policy: Option<rlx_opt::PrecisionPolicy>,
    ) -> Self {
        assert!(capacity > 0, "DynamicDimCompileCache capacity must be ≥ 1");
        Self {
            device,
            policy,
            capacity,
            template: None,
            // Pre-size both containers; they never hold more than `capacity` keys.
            entries: Vec::with_capacity(capacity),
            order: VecDeque::with_capacity(capacity),
        }
    }
853
    /// Device this cache compiles its specializations for.
    pub fn compile_device(&self) -> Device {
        self.device
    }
857
858    /// Return a backend-compiled graph specialized for `binding`.
859    /// `build_hir` runs at most once to populate the dynamic template.
860    pub fn get_or_specialize<F: FnOnce() -> HirModule>(
861        &mut self,
862        key: u64,
863        binding: &DimBinding,
864        build_hir: F,
865        options: &crate::CompileOptions,
866    ) -> Result<&mut CompiledGraph, rlx_ir::hir::LowerError> {
867        if let Some(idx) = self.entries.iter().position(|(k, _)| *k == key) {
868            return Ok(&mut self.entries[idx].1);
869        }
870        if self.template.is_none() {
871            let mut template_opts = options.clone();
872            template_opts.dim_binding = None;
873            let pipe = crate::stages::pipeline_for(self.device, &template_opts);
874            self.template = Some(pipe.compile_hir(build_hir())?);
875        }
876        let template = self.template.as_ref().expect("template just set");
877        let mut spec_opts = options.clone();
878        spec_opts.dim_binding = None;
879        let pipe = crate::stages::pipeline_for(self.device, &spec_opts);
880        let specialized = template.specialize(&pipe, binding);
881        let backend = crate::registry::backend_for(self.device).expect("backend registered");
882        let mut compile_opts = options.clone();
883        compile_opts.dim_binding = None;
884        if compile_opts.policy.is_none() {
885            if let Some(p) = &self.policy {
886                compile_opts = compile_opts.policy(p.clone());
887            }
888        }
889        let executable = backend.compile_lir(specialized.lir, &compile_opts);
890        let compiled = CompiledGraph::new(executable, self.device);
891
892        if self.entries.len() >= self.capacity
893            && let Some(evict_key) = self.order.pop_front()
894        {
895            sync_evicted_entry(&mut self.entries, evict_key);
896            self.entries.retain(|(k, _)| *k != evict_key);
897        }
898        self.entries.push((key, compiled));
899        self.order.push_back(key);
900        Ok(&mut self.entries.last_mut().unwrap().1)
901    }
902
    /// Number of specialized graphs currently cached (the template is not counted).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
906
    /// `true` when no specialized graph is cached; a template may still exist.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
910
    /// `true` when a specialized graph is cached under `key`.
    pub fn contains(&self, key: u64) -> bool {
        self.entries.iter().any(|(k, _)| *k == key)
    }
914
    /// Reset the cache: sync every entry, then drop the template and all
    /// specialized graphs. The next lookup rebuilds the template.
    pub fn clear(&mut self) {
        // Sync pending work before the graphs are dropped.
        self.sync_all();
        self.template = None;
        self.entries.clear();
        self.order.clear();
    }
921
    /// `true` once the symbolic template has been built.
    pub fn has_template(&self) -> bool {
        self.template.is_some()
    }
925
926    /// Drain in-flight GPU work on every specialized entry.
927    pub fn sync_all(&mut self) {
928        for (_, compiled) in &mut self.entries {
929            compiled.sync_pending();
930        }
931    }
932
933    /// Build the symbolic template once (no specialization).
934    pub fn ensure_template<F: FnOnce() -> HirModule>(
935        &mut self,
936        build_hir: F,
937        options: &crate::CompileOptions,
938    ) -> Result<&CompileResult, rlx_ir::hir::LowerError> {
939        if self.template.is_none() {
940            let mut opts = options.clone();
941            opts.dim_binding = None;
942            let pipe = crate::stages::pipeline_for(self.device, &opts);
943            self.template = Some(pipe.compile_hir(build_hir())?);
944        }
945        Ok(self.template.as_ref().expect("template set"))
946    }
947
    /// Borrow the symbolic template, or `None` if it has not been built yet.
    pub fn template_result(&self) -> Option<&CompileResult> {
        self.template.as_ref()
    }
951
952    /// Specialize via on-disk LIR cache ([`CompilationMode::Aot`]).
953    /// Disk-backed specialize ([`rlx_ir::CompilationMode::Aot`]).
954    pub fn get_or_specialize_aot<F: FnOnce() -> HirModule>(
955        &mut self,
956        aot: &crate::AotCache,
957        disk_base: &str,
958        key: u64,
959        binding: &rlx_ir::DimBinding,
960        build_hir: F,
961        options: &crate::CompileOptions,
962    ) -> Result<&mut CompiledGraph, crate::AotCacheError> {
963        if let Some(idx) = self.entries.iter().position(|(k, _)| *k == key) {
964            return Ok(&mut self.entries[idx].1);
965        }
966        let device = self.device;
967        let template = self.ensure_template(build_hir, options)?;
968        let compiled = aot.specialize_cached(disk_base, binding, device, template, options)?;
969        if self.entries.len() >= self.capacity
970            && let Some(evict_key) = self.order.pop_front()
971        {
972            sync_evicted_entry(&mut self.entries, evict_key);
973            self.entries.retain(|(k, _)| *k != evict_key);
974        }
975        self.entries.push((key, compiled));
976        self.order.push_back(key);
977        Ok(&mut self.entries.last_mut().unwrap().1)
978    }
979}
980
/// Pad `data` (interpreted as `[actual, inner]` row-major) up to `upper`
/// rows by appending zeros. Returns a `Vec<f32>` of length
/// `upper * inner`. Companion of [`slice_rows`] for the
/// "compile at max, run at less" workflow with [`BucketedCompileCache`].
///
/// # Panics
///
/// Panics if `inner == 0`, if `data.len()` is not a multiple of `inner`,
/// if `data.len() / inner > upper`, if `upper` does not fit in `usize`, or
/// if `upper * inner` overflows `usize`.
pub fn pad_rows(data: &[f32], inner: usize, upper: u64) -> Vec<f32> {
    assert!(inner > 0, "pad_rows: inner stride must be ≥ 1");
    assert_eq!(
        data.len() % inner,
        0,
        "pad_rows: data len {} not a multiple of inner {inner}",
        data.len(),
    );
    // A plain `as` cast would silently truncate on targets where `usize` is
    // narrower than 64 bits; refuse instead of padding to the wrong extent.
    let upper = usize::try_from(upper)
        .unwrap_or_else(|_| panic!("pad_rows: upper bound {upper} does not fit in usize"));
    let actual = data.len() / inner;
    assert!(
        actual <= upper,
        "pad_rows: actual rows {actual} exceed upper bound {upper}",
    );
    // Unchecked multiplication wraps in release builds and would yield a
    // buffer shorter than the documented `upper * inner`.
    let padded_len = upper
        .checked_mul(inner)
        .unwrap_or_else(|| panic!("pad_rows: padded length {upper} * {inner} overflows usize"));
    let mut out = vec![0.0_f32; padded_len];
    out[..data.len()].copy_from_slice(data);
    out
}
1006
/// Copy `data` (`[actual, inner]` row-major) into the front of the
/// preallocated `out` (`[upper, inner]`) and zero every remaining element.
///
/// # Panics
///
/// Panics if `inner == 0`, if either `data.len()` or `out.len()` is not a
/// multiple of `inner`, or if `data` holds more rows than `out`.
pub fn pad_rows_into(out: &mut [f32], data: &[f32], inner: usize) {
    assert!(inner > 0, "pad_rows_into: inner stride must be ≥ 1");
    let (src_len, dst_len) = (data.len(), out.len());
    assert_eq!(
        src_len % inner,
        0,
        "pad_rows_into: data len {} not a multiple of inner {inner}",
        src_len,
    );
    assert_eq!(
        dst_len % inner,
        0,
        "pad_rows_into: out len {} not a multiple of inner {inner}",
        dst_len,
    );
    let upper = dst_len / inner;
    let actual = src_len / inner;
    assert!(
        actual <= upper,
        "pad_rows_into: actual rows {actual} exceed upper bound {upper}",
    );
    // Payload goes in the front, zeros fill whatever is left behind it.
    let (payload, padding) = out.split_at_mut(src_len);
    payload.copy_from_slice(data);
    padding.fill(0.0);
}
1031
/// Keep only the first `actual` rows of `data`, which is read as an
/// `[upper, inner]` row-major buffer. Inverse companion of [`pad_rows`].
///
/// # Panics
///
/// Panics if `inner == 0`, if `data.len()` is not a multiple of `inner`,
/// or if `actual` is larger than the number of rows in `data`.
pub fn slice_rows(data: &[f32], inner: usize, actual: usize) -> Vec<f32> {
    assert!(inner > 0, "slice_rows: inner stride must be ≥ 1");
    let total = data.len();
    assert_eq!(
        total % inner,
        0,
        "slice_rows: data len {} not a multiple of inner {inner}",
        total,
    );
    let upper = total / inner;
    assert!(
        actual <= upper,
        "slice_rows: actual rows {actual} exceed upper {upper}",
    );
    let keep = actual * inner;
    data.iter().copied().take(keep).collect()
}
1052
1053#[cfg(test)]
1054mod tests {
1055    use super::*;
1056    use rlx_ir::infer::GraphExt;
1057    use rlx_ir::*;
1058    use std::cell::Cell;
1059
1060    fn tiny_graph(n: usize) -> Graph {
1061        let mut g = Graph::new("t");
1062        let f = DType::F32;
1063        let x = g.input("x", Shape::new(&[n], f));
1064        let y = g.activation(rlx_ir::op::Activation::Relu, x, Shape::new(&[n], f));
1065        g.set_outputs(vec![y]);
1066        g
1067    }
1068
1069    #[test]
1070    fn cache_hits_avoid_recompile() {
1071        let mut cache = CompileCache::new(Device::Cpu, 4);
1072        let calls = Cell::new(0);
1073
1074        let _ = cache.get_or_compile(1, || {
1075            calls.set(calls.get() + 1);
1076            tiny_graph(8)
1077        });
1078        let _ = cache.get_or_compile(1, || {
1079            calls.set(calls.get() + 1);
1080            tiny_graph(8)
1081        });
1082        let _ = cache.get_or_compile(1, || {
1083            calls.set(calls.get() + 1);
1084            tiny_graph(8)
1085        });
1086        // Same key three times: build closure runs once.
1087        assert_eq!(calls.get(), 1);
1088        assert_eq!(cache.len(), 1);
1089    }
1090
1091    #[test]
1092    fn fifo_evicts_oldest_at_capacity() {
1093        let mut cache = CompileCache::new(Device::Cpu, 2);
1094        let _ = cache.get_or_compile(1, || tiny_graph(4));
1095        let _ = cache.get_or_compile(2, || tiny_graph(8));
1096        assert!(cache.contains(1) && cache.contains(2));
1097        // Third entry evicts key 1 (oldest).
1098        let _ = cache.get_or_compile(3, || tiny_graph(16));
1099        assert!(!cache.contains(1));
1100        assert!(cache.contains(2) && cache.contains(3));
1101    }
1102
1103    #[test]
1104    fn different_keys_keep_separate_compiles() {
1105        let mut cache = CompileCache::new(Device::Cpu, 4);
1106        let calls = Cell::new(0);
1107        let _ = cache.get_or_compile(1, || {
1108            calls.set(calls.get() + 1);
1109            tiny_graph(8)
1110        });
1111        let _ = cache.get_or_compile(2, || {
1112            calls.set(calls.get() + 1);
1113            tiny_graph(16)
1114        });
1115        let _ = cache.get_or_compile(1, || {
1116            calls.set(calls.get() + 1);
1117            tiny_graph(8)
1118        });
1119        // Two unique keys → two compiles.
1120        assert_eq!(calls.get(), 2);
1121        assert_eq!(cache.len(), 2);
1122    }
1123
1124    // ── BucketedCompileCache ──────────────────────────────────────────
1125
    /// Two keys that land in the same bucket share a single compile, and both
    /// lookups report the bucket's inclusive upper bound.
    #[test]
    fn bucket_amortizes_keys_within_range() {
        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..4, 4..16]);
        let calls = Cell::new(0);
        // Slot 0 records the `upper` seen by the first closure, slot 1 the
        // one seen by the second (which is expected never to run).
        let uppers = Cell::new((0u64, 0u64));

        // Two distinct keys (2 and 3) both fall inside bucket 0 (1..4).
        let (u1, _) = cache
            .get_or_compile(2, |upper| {
                calls.set(calls.get() + 1);
                uppers.set((upper, uppers.get().1));
                tiny_graph(upper as usize)
            })
            .expect("key 2 in range");
        let (u2, _) = cache
            .get_or_compile(3, |upper| {
                calls.set(calls.get() + 1);
                uppers.set((uppers.get().0, upper));
                tiny_graph(upper as usize)
            })
            .expect("key 3 in range");

        // One compile, both calls saw the same upper = range.end - 1 = 3.
        assert_eq!(calls.get(), 1);
        assert_eq!(u1, 3);
        assert_eq!(u2, 3);
        assert_eq!(uppers.get().0, 3);
        assert_eq!(cache.compiled_count(), 1);
        assert_eq!(cache.total_buckets(), 2);
    }
1156
1157    #[test]
1158    fn bucket_lookup_returns_none_outside_range() {
1159        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..4, 4..16]);
1160        assert!(cache.bucket_for(0).is_none());
1161        assert!(cache.bucket_for(16).is_none());
1162        assert!(cache.bucket_for(100).is_none());
1163        assert_eq!(cache.bucket_for(3), Some(0));
1164        assert_eq!(cache.bucket_for(4), Some(1));
1165        assert_eq!(cache.bucket_upper_for_key(3), Some(3));
1166        assert_eq!(cache.bucket_upper_for_key(4), Some(15));
1167        assert!(cache.bucket_upper_for_key(0).is_none());
1168
1169        let calls = Cell::new(0);
1170        let result = cache.get_or_compile(100, |u| {
1171            calls.set(calls.get() + 1);
1172            tiny_graph(u as usize)
1173        });
1174        assert!(result.is_none());
1175        assert_eq!(calls.get(), 0); // build closure must not run for OOR keys
1176        assert_eq!(cache.compiled_count(), 0);
1177    }
1178
1179    #[test]
1180    fn bucket_compiles_lazily_per_bucket() {
1181        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..4, 4..16, 16..64]);
1182        let calls = Cell::new(0);
1183
1184        let _ = cache.get_or_compile(2, |u| {
1185            calls.set(calls.get() + 1);
1186            tiny_graph(u as usize)
1187        });
1188        let _ = cache.get_or_compile(8, |u| {
1189            calls.set(calls.get() + 1);
1190            tiny_graph(u as usize)
1191        });
1192        // Two distinct buckets hit → two compiles. Third bucket untouched.
1193        assert_eq!(calls.get(), 2);
1194        assert_eq!(cache.compiled_count(), 2);
1195        assert_eq!(cache.total_buckets(), 3);
1196    }
1197
    /// Constructing a cache from overlapping ranges must panic.
    #[test]
    #[should_panic(expected = "overlap")]
    fn bucket_overlap_rejected() {
        // 1..8 and 4..16 share keys 4..=7.
        let _ = BucketedCompileCache::new(Device::Cpu, vec![1..8, 4..16]);
    }
1203
    /// Constructing a cache with no buckets at all must panic.
    #[test]
    #[should_panic(expected = "≥1 bucket")]
    fn empty_bucket_list_rejected() {
        let _ = BucketedCompileCache::new(Device::Cpu, vec![]);
    }
1209
1210    // ── pad_rows / slice_rows ─────────────────────────────────────────
1211
1212    #[test]
1213    fn pad_rows_appends_zeros() {
1214        // 1D: actual=3 → upper=5, inner=1.
1215        let p = pad_rows(&[1.0, 2.0, 3.0], 1, 5);
1216        assert_eq!(p, vec![1.0, 2.0, 3.0, 0.0, 0.0]);
1217
1218        // 2D row-major [actual=2, inner=3] → [upper=4, inner=3].
1219        let p = pad_rows(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, 4);
1220        assert_eq!(
1221            p,
1222            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
1223        );
1224
1225        // actual == upper: no-op pad.
1226        let p = pad_rows(&[7.0, 8.0], 1, 2);
1227        assert_eq!(p, vec![7.0, 8.0]);
1228    }
1229
1230    #[test]
1231    fn slice_rows_truncates_trailing() {
1232        let s = slice_rows(&[1.0, 2.0, 3.0, 0.0, 0.0], 1, 3);
1233        assert_eq!(s, vec![1.0, 2.0, 3.0]);
1234
1235        let s = slice_rows(&[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 0.0, 0.0], 3, 2);
1236        assert_eq!(s, vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
1237    }
1238
    /// Four rows cannot be padded "up" to an upper bound of three.
    #[test]
    #[should_panic(expected = "exceed upper")]
    fn pad_rows_rejects_too_long_input() {
        let _ = pad_rows(&[1.0, 2.0, 3.0, 4.0], 1, 3);
    }
1244
    /// Asking for five rows out of a three-row buffer must panic.
    #[test]
    #[should_panic(expected = "exceed upper")]
    fn slice_rows_rejects_too_large_actual() {
        let _ = slice_rows(&[1.0, 2.0, 3.0], 1, 5);
    }
1250
1251    // ── BucketedCompileCache::run_padded ──────────────────────────────
1252
1253    #[test]
1254    fn run_padded_pads_input_and_slices_output() {
1255        // tiny_graph is 1D [n] → relu → [n].
1256        // Compile bucket [1..16) at upper=15, run with actual_rows=10.
1257        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..16]);
1258        let input: Vec<f32> = vec![1.0, -1.0, 2.0, -2.0, 3.0, -3.0, 4.0, -4.0, 5.0, -5.0];
1259
1260        let (upper, outs) = cache
1261            .run_padded(
1262                10, // key
1263                10, // actual rows
1264                |max| tiny_graph(max as usize),
1265                &[("x", &input, 1)], // 1D, inner stride 1
1266                &[1],                // slice the one output to actual rows
1267            )
1268            .expect("key 10 in [1..16)");
1269
1270        assert_eq!(upper, 15);
1271        assert_eq!(outs.len(), 1);
1272        let out = &outs[0];
1273        assert_eq!(out.len(), 10, "output sliced back to actual_rows");
1274        let expected: Vec<f32> = input.iter().map(|x| x.max(0.0)).collect();
1275        assert_eq!(out, &expected);
1276    }
1277
    /// Two `run_padded` calls with different row counts that land in the same
    /// bucket must share one compiled graph.
    #[test]
    fn run_padded_reuses_bucket_across_actuals() {
        // Same bucket, two different actuals — only one compile.
        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..16]);
        let calls = Cell::new(0);

        // First call: 10 rows; this is the call that compiles the bucket.
        let (u1, o1) = cache
            .run_padded(
                10,
                10,
                |max| {
                    calls.set(calls.get() + 1);
                    tiny_graph(max as usize)
                },
                &[(
                    "x",
                    &[1.0, -1.0, 2.0, -2.0, 3.0, -3.0, 4.0, -4.0, 5.0, -5.0],
                    1,
                )],
                &[1],
            )
            .unwrap();
        assert_eq!(o1.len(), 1);
        assert_eq!(o1[0].len(), 10);
        assert_eq!(u1, 15);

        // Second call: 5 rows in the same bucket; the builder must not run again.
        let (u2, o2) = cache
            .run_padded(
                5,
                5,
                |max| {
                    calls.set(calls.get() + 1);
                    tiny_graph(max as usize)
                },
                &[("x", &[-1.0, 2.0, -3.0, 4.0, -5.0], 1)],
                &[1],
            )
            .unwrap();
        assert_eq!(o2.len(), 1);
        assert_eq!(o2[0].len(), 5);
        assert_eq!(u2, 15);
        // ReLU of the second input: negatives clip to zero.
        assert_eq!(o2[0], vec![0.0, 2.0, 0.0, 4.0, 0.0]);

        assert_eq!(calls.get(), 1, "bucket cached across actuals");
        assert_eq!(cache.compiled_count(), 1);
    }
1324
1325    #[test]
1326    fn run_padded_returns_none_out_of_range() {
1327        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..16]);
1328        let calls = Cell::new(0);
1329        let result = cache.run_padded(
1330            100,
1331            5,
1332            |u| {
1333                calls.set(calls.get() + 1);
1334                tiny_graph(u as usize)
1335            },
1336            &[("x", &[1.0, 2.0, 3.0, 4.0, 5.0], 1)],
1337            &[1],
1338        );
1339        assert!(result.is_none());
1340        assert_eq!(calls.get(), 0);
1341        assert_eq!(cache.compiled_count(), 0);
1342    }
1343
1344    // ── power_of_two_ladder ───────────────────────────────────────────
1345
1346    #[test]
1347    fn power_of_two_ladder_generates_log_buckets() {
1348        let cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 8, 64);
1349        // Expect buckets covering keys 1..=64 with extents 8, 16, 32, 64.
1350        let ranges: Vec<_> = cache.buckets().cloned().collect();
1351        assert_eq!(ranges, vec![1..9, 9..17, 17..33, 33..65]);
1352        assert_eq!(cache.total_buckets(), 4);
1353    }
1354
1355    #[test]
1356    fn power_of_two_ladder_picks_smallest_extent_for_actual() {
1357        // Ladder: extents 8, 16, 32, 64. actual=17 lands in the 32-extent
1358        // bucket, NOT the 64-extent one — that's the compute saving.
1359        let mut cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 8, 64);
1360        let captured_uppers: std::cell::RefCell<Vec<u64>> = Default::default();
1361
1362        let (u17, _) = cache
1363            .get_or_compile(17, |upper| {
1364                captured_uppers.borrow_mut().push(upper);
1365                tiny_graph(upper as usize)
1366            })
1367            .unwrap();
1368        let (u9, _) = cache
1369            .get_or_compile(9, |upper| {
1370                captured_uppers.borrow_mut().push(upper);
1371                tiny_graph(upper as usize)
1372            })
1373            .unwrap();
1374        let (u3, _) = cache
1375            .get_or_compile(3, |upper| {
1376                captured_uppers.borrow_mut().push(upper);
1377                tiny_graph(upper as usize)
1378            })
1379            .unwrap();
1380        let (u64_, _) = cache
1381            .get_or_compile(64, |upper| {
1382                captured_uppers.borrow_mut().push(upper);
1383                tiny_graph(upper as usize)
1384            })
1385            .unwrap();
1386
1387        assert_eq!(u17, 32, "key=17 → smallest extent ≥ 17 is 32");
1388        assert_eq!(u9, 16, "key=9  → smallest extent ≥ 9  is 16");
1389        assert_eq!(u3, 8, "key=3  → smallest extent ≥ 3  is 8");
1390        assert_eq!(u64_, 64, "key=64 → exact match at 64");
1391        assert_eq!(*captured_uppers.borrow(), vec![32, 16, 8, 64]);
1392        assert_eq!(cache.compiled_count(), 4);
1393    }
1394
1395    #[test]
1396    fn power_of_two_ladder_min_above_one_starts_at_one() {
1397        // First bucket always covers from key 1, even when min > 1.
1398        // (`min` controls the ladder's first extent, not the lower edge.)
1399        let cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 16, 32);
1400        let ranges: Vec<_> = cache.buckets().cloned().collect();
1401        // min=16 → first extent 16, second 32. Buckets: 1..17, 17..33.
1402        assert_eq!(ranges, vec![1..17, 17..33]);
1403    }
1404
1405    #[test]
1406    fn power_of_two_ladder_non_pow2_min_rounds_up() {
1407        // min=10 → next_power_of_two = 16.
1408        let cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 10, 64);
1409        let ranges: Vec<_> = cache.buckets().cloned().collect();
1410        assert_eq!(ranges, vec![1..17, 17..33, 33..65]);
1411    }
1412
1413    #[test]
1414    fn power_of_two_ladder_max_below_pow2_extends_up() {
1415        // max=20 needs to be covered → ladder extends to 32.
1416        let cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 8, 20);
1417        let ranges: Vec<_> = cache.buckets().cloned().collect();
1418        assert_eq!(ranges, vec![1..9, 9..17, 17..33]);
1419    }
1420
1421    #[test]
1422    fn power_of_two_ladder_min_equals_max() {
1423        let cache = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 16, 16);
1424        let ranges: Vec<_> = cache.buckets().cloned().collect();
1425        assert_eq!(ranges, vec![1..17]);
1426    }
1427
    /// A ladder with `min == 0` must panic.
    #[test]
    #[should_panic(expected = "min must be ≥ 1")]
    fn power_of_two_ladder_zero_min_rejected() {
        let _ = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 0, 16);
    }
1433
    /// A ladder whose `max` is smaller than its `min` must panic.
    #[test]
    #[should_panic(expected = "max")]
    fn power_of_two_ladder_max_below_min_rejected() {
        let _ = BucketedCompileCache::power_of_two_ladder(Device::Cpu, 32, 8);
    }
1439
1440    // ── Active-extent dispatch (true per-kernel skip-compute) ─────────
1441    //
    // The active-extent tests below assert per-thunk scaling on the CPU
1443    // backend. Today `rlx_cpu::thunk::execute_thunks_active` is documented
1444    // as a stub that returns false (rlx-cpu/src/thunk.rs:2100-2110), so
1445    // the runtime falls back to full-extent dispatch — overwrites the
1446    // tail and the tail-preservation assertions fail. They're left here
1447    // (marked `#[ignore]`) as the test-driven contract that the future
1448    // active-extent implementation must satisfy. Drop the `#[ignore]`
1449    // when the per-thunk scaling lands for Copy / ActivationInPlace /
1450    // BinaryFull / Attention.
1451
    /// Contract test (currently ignored): with an active extent of 5 out of
    /// 15, only the first 5 output elements may be recomputed; the tail must
    /// keep the values left behind by the previous full-extent run.
    #[test]
    #[ignore = "active-extent execution is a stub on CPU (thunk.rs::execute_thunks_active)"]
    fn active_extent_skips_compute_on_cpu_activation() {
        // tiny_graph(15) is `Input([15]) → Relu → Output` and lowers to
        // a Copy + ActivationInPlace pair on CPU — both are in the safe
        // set, so the active-extent path runs scaled.
        //
        // To prove kernels actually skipped: warm the arena with a prior
        // full-extent run whose output is `[1.0; 15]`, then run again
        // with a negative-only input and active=5. The first 5 outputs
        // get re-copied + re-relu'd to 0; the tail (indices 5..15) stays
        // at 1.0 because both Copy and Activation skipped it. A full-
        // extent fallback would clip every element to 0.
        let graph = tiny_graph(15);
        let mut compiled = Session::new(Device::Cpu).compile(graph);

        // Warm-up: full extent, all-positive input → output [1.0; 15].
        let warm_input: Vec<f32> = vec![1.0; 15];
        let warm_outs = compiled.run(&[("x", &warm_input)]);
        assert_eq!(warm_outs[0], vec![1.0; 15], "warm-up sanity");

        // Active-extent run: all-negative input, hint actual=5 of 15.
        // First 5: Copy(-1) + Relu → 0. Tail: kernels skip → stays 1.0.
        let neg_input: Vec<f32> = vec![-1.0; 15];
        compiled.set_active_extent(Some((5, 15)));
        let outs = compiled.run(&[("x", &neg_input)]);
        let out = &outs[0];

        // The output buffer keeps its full length; only its contents differ.
        assert_eq!(out.len(), 15);
        assert_eq!(
            out[..5],
            [0.0; 5],
            "first 5 elements processed (relu of -1)"
        );
        assert_eq!(
            out[5..],
            [1.0; 10],
            "tail untouched — proves Copy + Activation skipped indices 5..15"
        );

        // Clear the hint and run again with the negative input — full
        // extent now processes everything, every element clips to 0.
        compiled.set_active_extent(None);
        let outs = compiled.run(&[("x", &neg_input)]);
        assert_eq!(
            outs[0],
            vec![0.0; 15],
            "full-extent path must clip every negative"
        );
    }
1502
    /// Contract test (currently ignored): an element-wise add run with an
    /// active extent of 2 out of 4 may only rewrite the first 2 outputs.
    #[test]
    #[ignore = "active-extent execution is a stub on CPU (thunk.rs::execute_thunks_active)"]
    fn active_extent_skips_compute_on_binary_full() {
        // Input([4]) + Input([4]) → Output. Lowers to a BinaryFull
        // thunk with no broadcast (lhs_len == rhs_len == len), which
        // is in the safe set.
        let mut g = Graph::new("add");
        let f = DType::F32;
        let a = g.input("a", Shape::new(&[4], f));
        let b = g.input("b", Shape::new(&[4], f));
        let c = g.add(a, b);
        g.set_outputs(vec![c]);
        let mut compiled = Session::new(Device::Cpu).compile(g);

        // Warm: full extent, output buffer becomes [2.0; 4].
        let warm = compiled.run(&[("a", &[1.0f32; 4]), ("b", &[1.0f32; 4])]);
        assert_eq!(warm[0], vec![2.0; 4]);

        // Active-extent run: actual=2 of upper=4. Process first 2
        // elements only; tail (indices 2..4) stays at 2.0 from warm.
        compiled.set_active_extent(Some((2, 4)));
        let outs = compiled.run(&[("a", &[10.0f32; 4]), ("b", &[10.0f32; 4])]);
        let out = &outs[0];
        assert_eq!(out[..2], [20.0, 20.0], "first 2 = active sum");
        assert_eq!(
            out[2..],
            [2.0, 2.0],
            "tail untouched — proves BinaryFull skipped indices 2..4"
        );

        // Clear hint → full path overwrites entire output.
        compiled.set_active_extent(None);
        let outs = compiled.run(&[("a", &[10.0f32; 4]), ("b", &[10.0f32; 4])]);
        assert_eq!(outs[0], vec![20.0; 4]);
    }
1538
    /// End-to-end check (ignored by default) that running a small CPU graph
    /// with `RLX_TRACE_PERFETTO` set writes a trace file containing at least
    /// one thunk-name event.
    #[test]
    #[ignore = "process-wide STATE; runs only in isolation via `cargo test perfetto -- --ignored`"]
    fn perfetto_trace_emits_per_thunk_events() {
        // PLAN L3: end-to-end Perfetto event capture. Requires the env
        // var to be set BEFORE the perfetto module is first touched
        // (OnceLock — can't re-init). We set it here unconditionally;
        // for tests run in parallel within the same process, the
        // earliest test wins. To avoid flake we mark this `#[ignore]`
        // and the developer runs it explicitly.
        use std::env;
        use std::fs;
        // Per-process file name so separate test processes do not share a trace file.
        let path = env::temp_dir().join(format!("rlx-perfetto-e2e-{}.json", std::process::id()));
        if path.exists() {
            let _ = fs::remove_file(&path);
        }
        // SAFETY: NOTE(review) — `set_var` mutates process-global state. This
        // relies on the test being run in isolation (see the `#[ignore]`
        // reason) so that no other thread touches the environment
        // concurrently; confirm that invocation guarantees it.
        unsafe {
            env::set_var("RLX_TRACE_PERFETTO", &path);
        }

        // Build + run a small CPU graph — Add → Relu (no fusion macros).
        let f = DType::F32;
        let mut g = Graph::new("perf");
        let a = g.input("a", Shape::new(&[4], f));
        let b = g.input("b", Shape::new(&[4], f));
        let s = g.add(a, b);
        let r = g.relu(s);
        g.set_outputs(vec![r]);
        let mut compiled = Session::new(Device::Cpu).compile(g);
        let _ = compiled.run(&[("a", &[1.0; 4]), ("b", &[1.0; 4])]);

        // Force the trace file to flush its closing bracket.
        crate::perfetto::flush_and_finalize();

        let contents = fs::read_to_string(&path).expect("trace file");
        // At minimum we should see one of our thunk names.
        assert!(
            contents.contains("\"binary\"")
                || contents.contains("\"activation\"")
                || contents.contains("\"elementwise_region\""),
            "expected at least one thunk-name event in perfetto trace; got: {contents}"
        );
        // JSON shape: starts with `[` and (after flush) ends with `]`.
        // Only the opening bracket is asserted here.
        assert!(contents.trim_start().starts_with('['));
        // Best-effort cleanup; a failed removal is ignored.
        let _ = fs::remove_file(&path);
    }
1584
1585    #[test]
1586    fn elementwise_region_fused_matches_unfused() {
1587        // PLAN L2: a chain `Add(a, b) → Mul(_, c) → Relu` should fuse
1588        // into one ElementwiseRegion thunk in the CPU backend. Compare
1589        // its output against the value computed by hand to confirm the
1590        // fused execution is numerically identical.
1591        let f = DType::F32;
1592        let mut g = Graph::new("ew_e2e");
1593        let a = g.input("a", Shape::new(&[8], f));
1594        let b = g.input("b", Shape::new(&[8], f));
1595        let c = g.input("c", Shape::new(&[8], f));
1596        let s = Shape::new(&[8], f);
1597        let add = g.add(a, b);
1598        let mul = g.mul(add, c);
1599        let relu = g.relu(mul);
1600        let _ = s;
1601        g.set_outputs(vec![relu]);
1602
1603        let mut compiled = Session::new(Device::Cpu).compile(g);
1604        let av: Vec<f32> = vec![1.0, -2.0, 3.0, -4.0, 0.5, -0.5, 1.5, -1.5];
1605        let bv: Vec<f32> = vec![0.5, 1.0, 2.0, 4.0, 0.5, 0.5, 0.5, 0.5];
1606        let cv: Vec<f32> = vec![1.0, 2.0, 1.0, 1.0, 2.0, 3.0, 0.5, 4.0];
1607        let outs = compiled.run(&[("a", &av), ("b", &bv), ("c", &cv)]);
1608        let out = &outs[0];
1609
1610        let expected: Vec<f32> = (0..8)
1611            .map(|i| {
1612                let v = (av[i] + bv[i]) * cv[i];
1613                v.max(0.0)
1614            })
1615            .collect();
1616        for (i, (got, exp)) in out.iter().zip(&expected).enumerate() {
1617            assert!(
1618                (got - exp).abs() < 1e-6,
1619                "mismatch at {i}: got {got}, expected {exp}"
1620            );
1621        }
1622    }
1623
1624    #[test]
1625    #[ignore = "active-extent execution is a stub on CPU (thunk.rs::execute_thunks_active)"]
1626    fn active_extent_skips_compute_on_attention() {
1627        // Standalone Attention with kernel-synthesized MaskKind::None.
1628        // Q/K/V shape: [batch=1, seq=4, num_heads*head_dim=8].
1629        use rlx_ir::op::MaskKind;
1630        let f = DType::F32;
1631        let mut g = Graph::new("attn");
1632        let q = g.input("q", Shape::new(&[1, 4, 8], f));
1633        let k = g.input("k", Shape::new(&[1, 4, 8], f));
1634        let v = g.input("v", Shape::new(&[1, 4, 8], f));
1635        let out = g.attention_kind(q, k, v, 2, 4, MaskKind::None, Shape::new(&[1, 4, 8], f));
1636        g.set_outputs(vec![out]);
1637        let mut compiled = Session::new(Device::Cpu).compile(g);
1638
1639        // Warm: full extent. Q=K=V uniform → output uniform-ish.
1640        let warm = compiled.run(&[
1641            ("q", &[1.0f32; 32]),
1642            ("k", &[1.0f32; 32]),
1643            ("v", &[1.0f32; 32]),
1644        ]);
1645        let warm_out = warm[0].clone();
1646        assert_eq!(warm_out.len(), 32);
1647
1648        // Active: s_active=2 of s_full=4. Different inputs.
1649        // Tail rows (indices 16..32 = positions 2,3) should be untouched
1650        // — preserved from the warm run. First 16 indices recomputed.
1651        compiled.set_active_extent(Some((2, 4)));
1652        let outs = compiled.run(&[
1653            ("q", &[3.0f32; 32]),
1654            ("k", &[3.0f32; 32]),
1655            ("v", &[3.0f32; 32]),
1656        ]);
1657        let out = &outs[0];
1658        assert_eq!(out.len(), 32);
1659        assert_eq!(
1660            &out[16..],
1661            &warm_out[16..],
1662            "tail (positions 2,3) must be untouched — proves Attention skipped"
1663        );
1664        // Sanity: first 2 positions changed since input value differs (3.0 vs 1.0).
1665        assert_ne!(
1666            &out[..16],
1667            &warm_out[..16],
1668            "first 2 positions should reflect new input"
1669        );
1670    }
1671
    #[test]
    fn active_extent_falls_back_when_unsupported_thunk_in_schedule() {
        // NOTE(review): placeholder — this test has an empty body and
        // asserts nothing, so it always passes and provides no coverage
        // of its own. The text below records the intended contract only.
        //
        // Intended contract: a graph containing any thunk outside
        // `safe_for_active_extent` (e.g. Sgemm via a matmul) must fall
        // back to the full-extent executor — partial application would
        // feed garbage downstream. The check that was planned here:
        // setting an extent hint on a matmul-bearing graph still gives
        // correct outputs (i.e. the full-extent fallback path was taken).
        //
        // Explicit construction is skipped because building such a graph
        // at this layer would require pulling in matmul builders.
        // According to the original author, the safety net is the
        // `if !all(safe) return false` guard inside execute_thunks_active
        // plus the `if !active_used { execute_thunks(...) }` fallback in
        // the CPU executor, covered via the direct safety-predicate test
        // and the warm-arena test earlier in this module — none of which
        // is visible from this function, so verify before relying on it.
        //
        // TODO: replace with a real matmul-bearing graph once the builders
        // are reachable from this test module.
    }
1689
1690    #[test]
1691    fn run_padded_uses_active_extent_on_cpu() {
1692        // End-to-end: the cache wires set_active_extent before run.
1693        // Same setup as above but driven through run_padded.
1694        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..16]);
1695        let input: Vec<f32> = vec![
1696            1.0, -1.0, 2.0, -2.0, 3.0, // 5 real values
1697            -10.0, -20.0, -30.0, -40.0, -50.0, // padding zeros from pad_rows
1698        ];
1699        // pad_rows zero-pads from len=5 up to upper=15, so the arena
1700        // tail past index 5 is 0.0 going in. After active-extent run,
1701        // tail stays at 0.0 (untouched, but the value happens to match
1702        // what relu would produce). We can't observe skip via output
1703        // here — slice_rows trims to actual_rows anyway.
1704        let (upper, outs) = cache
1705            .run_padded(
1706                5,
1707                5,
1708                |max| tiny_graph(max as usize),
1709                &[("x", &input[..5], 1)],
1710                &[1],
1711            )
1712            .unwrap();
1713        assert_eq!(upper, 15);
1714        assert_eq!(outs[0].len(), 5);
1715        // Active-extent path (CPU honors): outputs match relu of the
1716        // first 5 inputs. Slicing already handled, so user-visible
1717        // result is the same whether or not the kernel skipped tail
1718        // compute. The point of this test is just to confirm the wiring
1719        // path doesn't crash and produces correct outputs end-to-end.
1720        assert_eq!(outs[0], vec![1.0, 0.0, 2.0, 0.0, 3.0]);
1721    }
1722
1723    #[test]
1724    fn run_padded_inner_zero_returns_output_unsliced() {
1725        // Marking output_inners[0] = 0 disables slicing for that output.
1726        // The compiled graph still runs at upper=15, so we expect 15 outputs back.
1727        let mut cache = BucketedCompileCache::new(Device::Cpu, vec![1..16]);
1728        let input: Vec<f32> = vec![1.0, -1.0, 2.0, -2.0, 3.0];
1729
1730        let (upper, outs) = cache
1731            .run_padded(
1732                5,
1733                5,
1734                |max| tiny_graph(max as usize),
1735                &[("x", &input, 1)],
1736                &[0], // don't slice this output
1737            )
1738            .unwrap();
1739
1740        assert_eq!(upper, 15);
1741        assert_eq!(
1742            outs[0].len(),
1743            15,
1744            "unsliced output preserves full upper extent"
1745        );
1746        // First 5 = relu of input, tail 10 = relu(0) = 0.
1747        assert_eq!(&outs[0][..5], &[1.0, 0.0, 2.0, 0.0, 3.0]);
1748        assert!(outs[0][5..].iter().all(|&v| v == 0.0));
1749    }
1750
1751    #[test]
1752    fn dynamic_dim_cache_specializes_per_key() {
1753        use rlx_ir::DType;
1754        use rlx_ir::Shape;
1755        use rlx_ir::hir::HirModule;
1756        use rlx_ir::sym;
1757
1758        let mut cache = DynamicDimCompileCache::new(Device::Cpu, 4);
1759        let opts = crate::CompileOptions::new();
1760        {
1761            let _short = cache
1762                .get_or_specialize(
1763                    8,
1764                    &rlx_ir::DimBinding::batch_seq(1, 8),
1765                    || {
1766                        let mut hir = HirModule::new("dyn_cache");
1767                        let x = hir.input_batch_seq("x", sym::BATCH, sym::SEQ, 4, DType::F32);
1768                        let w = hir.param("w", Shape::new(&[4, 2], DType::F32));
1769                        let y = hir.linear(
1770                            x,
1771                            w,
1772                            None,
1773                            None,
1774                            Shape::batch_seq(sym::BATCH, sym::SEQ, 2, DType::F32),
1775                        );
1776                        hir.set_outputs(vec![y]);
1777                        hir
1778                    },
1779                    &opts,
1780                )
1781                .expect("specialize short");
1782        }
1783        assert!(cache.has_template());
1784        assert_eq!(cache.len(), 1);
1785        cache
1786            .get_or_specialize(
1787                128,
1788                &rlx_ir::DimBinding::batch_seq(1, 128),
1789                || panic!("HIR builder must not run twice"),
1790                &opts,
1791            )
1792            .expect("specialize long");
1793        assert_eq!(cache.len(), 2);
1794    }
1795}
rlx_runtime/compile_cache.rs

rlx_runtime/
compile_cache.rs