//! [`MlxBufferPool`] — arena-style GPU buffer allocator with reuse.
//!
//! Buffers are bucketed by power-of-two sizes.  When a buffer is released back
//! to the pool, it is added to the free list for its size bucket.  A subsequent
//! `alloc` call will reuse a free buffer of compatible (>= requested) size
//! rather than allocating new Metal memory.
//!
//! Two return-path patterns are supported and **must not be mixed within a
//! single arena cycle**:
//!
//! * **Per-buffer** via [`release`](MlxBufferPool::release) — explicit return
//!   of a single buffer to the free list, suitable for ad-hoc patterns where
//!   the caller knows the precise lifetime of each buffer.
//! * **Arena bulk** via [`reset`](MlxBufferPool::reset) — bulk-return of every
//!   buffer handed out by [`alloc`](MlxBufferPool::alloc) since the previous
//!   reset.  Suitable for per-inference / per-decode-token arena patterns
//!   where no individual buffer's lifetime crosses the reset boundary.
//!
//! Internally, every `alloc` records an ARC-cloned `metal::Buffer` handle so
//! that `reset` can bulk-recycle without requiring callers to enumerate every
//! buffer individually.  ARC retain on `metal::Buffer` is cheap (refcount inc).

use std::collections::HashMap;

use crate::buffer::MlxBuffer;
use crate::device::MlxDevice;
use crate::dtypes::DType;
use crate::error::{MlxError, Result};

/// Arena-style buffer pool that reuses Metal buffer allocations.
///
/// # Design
///
/// * Buffers are bucketed by their allocated size rounded up to the nearest
///   power of two.  This reduces fragmentation at the cost of occasionally
///   over-allocating by up to 2x.
/// * `release()` returns a single buffer; `reset()` returns all outstanding
///   buffers handed out since the last reset.
/// * The `MlxDevice` is passed in at every [`alloc`] call (rather than stored
///   in the pool).  This keeps the pool free of lifetime parameters so it
///   can be embedded in any owner struct (e.g. the per-decode-token
///   `DecodeBuffers` cache in hf2q's qwen35 forward path).
///
/// # Why an arena reset matters
///
/// In the per-decode-token hot path, each token allocates ~1750 Metal buffers
/// for scratch / intermediate / parameter storage across attention, FFN, and
/// linear-attention layers.  Direct `MlxDevice::alloc_buffer()` calls hit
/// Metal's allocator each time (5-30 µs each); pooling reuses the underlying
/// `metal::Buffer` objects across token boundaries so steady-state allocation
/// cost amortizes to near zero.  See ADR-012 §Optimize / Task #15 for the
/// MoE dwq46 0.90× parity gap that motivated this work.
///
/// [`alloc`]: MlxBufferPool::alloc
pub struct MlxBufferPool {
    /// Free buffers keyed by their power-of-two bucket size.
    free: HashMap<usize, Vec<metal::Buffer>>,
    /// Buffers handed out by [`alloc`](MlxBufferPool::alloc) since the last
    /// [`reset`](MlxBufferPool::reset).  Each entry holds an ARC-cloned
    /// `metal::Buffer` so the pool's reference keeps the underlying GPU
    /// allocation alive even after the caller's `MlxBuffer` goes out of
    /// scope.  `reset` drains this into `free`.
    in_use: Vec<(usize, metal::Buffer)>,
    /// Residency set that owns the allocations registered by this pool.
    /// `None` until the first residency-enabled registration (or always,
    /// under `HF2Q_NO_RESIDENCY=1`).
    residency_set: Option<crate::residency::ResidencySet>,
    /// Unique Metal buffers this pool added to the residency set, keyed by
    /// their stable contents pointer. This avoids double-removing buffers if
    /// callers mix release/reset despite that pattern being unsupported.
    resident_buffers: HashMap<usize, metal::Buffer>,
}

impl Default for MlxBufferPool {
    /// Equivalent to [`MlxBufferPool::new`]: an empty pool with no free
    /// buffers, no in-use tracking, and no residency set attached.
    fn default() -> Self {
        Self::new()
    }
}

75impl MlxBufferPool {
76    /// Create a new empty buffer pool.  The Metal device is passed to
77    /// [`alloc`] at every call site, so the pool itself is lifetime-free.
78    pub fn new() -> Self {
79        Self {
80            free: HashMap::new(),
81            in_use: Vec::new(),
82            residency_set: None,
83            resident_buffers: HashMap::new(),
84        }
85    }
86
87    /// Allocate a buffer from the pool.
88    ///
89    /// If a free buffer of compatible size exists in the pool, it is reused
90    /// (with updated dtype/shape metadata).  Otherwise a new Metal buffer is
91    /// allocated from `device` at the bucket size so future reuse is
92    /// possible for any request up to that bucket.
93    ///
94    /// Each successful `alloc` registers the buffer in the pool's in-use
95    /// list (ARC clone — cheap), so a subsequent [`reset`] returns it to
96    /// the free list automatically.
97    pub fn alloc(
98        &mut self,
99        device: &MlxDevice,
100        byte_len: usize,
101        dtype: DType,
102        shape: Vec<usize>,
103    ) -> Result<MlxBuffer> {
104        let (buffer, added_residency) = self.alloc_inner(device, byte_len, dtype, shape)?;
105        if added_residency {
106            if let Some(set) = self.residency_set.as_ref() {
107                set.commit();
108            }
109        }
110        Ok(buffer)
111    }
112
113    /// Allocate several buffers and commit residency-set updates once.
114    pub fn alloc_batch<I>(&mut self, device: &MlxDevice, requests: I) -> Result<Vec<MlxBuffer>>
115    where
116        I: IntoIterator<Item = (usize, DType, Vec<usize>)>,
117    {
118        let mut buffers = Vec::new();
119        let mut added_residency = false;
120
121        for (byte_len, dtype, shape) in requests {
122            let (buffer, added) = self.alloc_inner(device, byte_len, dtype, shape)?;
123            added_residency |= added;
124            buffers.push(buffer);
125        }
126
127        if added_residency {
128            if let Some(set) = self.residency_set.as_ref() {
129                set.commit();
130            }
131        }
132
133        Ok(buffers)
134    }
135
136    fn alloc_inner(
137        &mut self,
138        device: &MlxDevice,
139        byte_len: usize,
140        dtype: DType,
141        shape: Vec<usize>,
142    ) -> Result<(MlxBuffer, bool)> {
143        let bucket = bucket_size(byte_len);
144        let mut added_residency = false;
145
146        // Try to reuse a free buffer from this bucket.
147        let metal_buf = self
148            .free
149            .get_mut(&bucket)
150            .and_then(|free_list| free_list.pop());
151
152        let metal_buf = match metal_buf {
153            Some(b) => b,
154            None => {
155                // Fresh allocation at bucket size.
156                let raw = device
157                    .metal_device()
158                    .new_buffer(bucket as u64, metal::MTLResourceOptions::StorageModeShared);
159                if raw.contents().is_null() {
160                    return Err(MlxError::BufferAllocationError { bytes: bucket });
161                }
162                // ADR-015 iter61a-2 (broken-window B-W-1 residual fix): zero-init
163                // every fresh pool allocation. The same MTLResourceOptions::
164                // StorageModeShared recycling that affects MlxDevice::alloc_buffer
165                // (closed in iter61a, src/device.rs) ALSO affects the pool's
166                // fresh-allocation path. iter61a closed device-direct allocations
167                // but the per-decode-token / per-prefill-chunk arena pool grew its
168                // free list via this `new_buffer` call without zero-init — so on
169                // the FIRST cold-process prefill (where the pool is empty and
170                // every alloc takes the fresh path), kernels reading scratch /
171                // intermediate buffers before fully populating them propagated
172                // recycled-page garbage into logits. Empirically: 5/5 cold-run
173                // first-token logit dumps on 27B-dwq46 produced 5 distinct
174                // hashes, with max abs logit diff up to 5.06 across runs and
175                // 248044/248044 logits differing — far above kernel-reduction
176                // ULP noise, consistent with structural memory contamination.
177                //
178                // Cost: one memset per fresh allocation. Reused buffers (the
179                // steady-state hot path after warm-up) skip this entirely
180                // because their bytes are valid producer outputs from prior
181                // pool cycles. Per `feedback_no_broken_windows` + mantra
182                // "No fallback. No stub. Just pure excellence." — fix at the
183                // source.
184                //
185                // Safety: `raw.contents()` is non-null (verified above), points
186                // to exactly `bucket` bytes of `StorageModeShared` memory we
187                // just allocated and have exclusive access to. The buffer is
188                // not yet wrapped in `MlxBuffer` and not yet in `in_use` /
189                // residency set, so no other thread or GPU dispatch references
190                // it. Writing zero bytes is well-defined for any DType.
191                unsafe {
192                    std::ptr::write_bytes(raw.contents() as *mut u8, 0, bucket);
193                }
194                added_residency = self.register_residency_allocation(device, &raw)?;
195                raw
196            }
197        };
198
199        // Track the handout so reset() can recycle it.  ARC clone is cheap.
200        self.in_use.push((bucket, metal_buf.clone()));
201
202        Ok((MlxBuffer::from_raw(metal_buf, dtype, shape), added_residency))
203    }
204
205    /// Return a single buffer to the pool's free list for future reuse.
206    ///
207    /// The Metal memory is **not** deallocated — it stays resident on the GPU
208    /// for fast reuse.  `release` is the per-buffer alternative to [`reset`];
209    /// see the module docs for guidance on which to use.
210    ///
211    /// **Mixing `release` and `reset` within the same arena cycle is not
212    /// supported** — the pool's in-use list does not deduplicate, so a buffer
213    /// returned via `release` and then bulk-returned via `reset` would land in
214    /// the free list twice (each entry holds an ARC clone of the same Metal
215    /// buffer; the duplication wastes a free-list slot but is not a memory
216    /// leak — both clones drop together once popped).  Pick one pattern per
217    /// arena cycle.
218    pub fn release(&mut self, buffer: MlxBuffer) {
219        let bucket = bucket_size(buffer.byte_len());
220        let metal_buf = buffer.into_inner();
221        self.free.entry(bucket).or_default().push(metal_buf);
222    }
223
224    /// Bulk-return every buffer handed out by [`alloc`] since the last reset
225    /// to the pool's free list.
226    ///
227    /// # Caller contract
228    ///
229    /// All `MlxBuffer` values returned by `alloc` since the last reset must be
230    /// out-of-scope (dropped) at the time `reset` is called.  Reset transfers
231    /// the pool's ARC clones to the free list, where they become available to
232    /// subsequent [`alloc`] calls.  If a caller is still holding an `MlxBuffer`
233    /// and a later `alloc` re-issues the underlying buffer, the two callers
234    /// will share GPU memory (aliasing).  The Metal ARC keeps the storage
235    /// alive in either case, but writes from the new caller will be visible
236    /// to the stale caller — a correctness bug, not a memory error.
237    ///
238    /// In Rust's ownership model, locally-bound `MlxBuffer` values fall out of
239    /// scope at the end of their lexical block, making the per-decode-token
240    /// arena pattern safe by construction:
241    ///
242    /// ```ignore
243    /// loop {
244    ///     pool.reset();          // start of token — recycle previous token's buffers
245    ///     forward_pass(&pool);   // many alloc(), no explicit release
246    /// }                          // forward_pass returns; locals dropped
247    /// ```
248    pub fn reset(&mut self) {
249        for (bucket, metal_buf) in self.in_use.drain(..) {
250            self.free.entry(bucket).or_default().push(metal_buf);
251        }
252    }
253
254    /// Register an externally-allocated buffer with this pool's residency set
255    /// without taking ownership.
256    ///
257    /// # Why this exists
258    ///
259    /// [`alloc`](Self::alloc) bucket-rounds requests up to the next power of
260    /// two, which is acceptable for transient per-token scratch (the worst
261    /// case is ~2× over-allocation on a few megabytes) but unacceptable for
262    /// large static weight tensors.  hf2q's Qwen3.5-MoE weight set totals
263    /// ~17.26 GB; bucket-rounding would balloon that to ~25.55 GB
264    /// (+8.3 GB / +48% blowup) — unshippable on a 128 GB unified-memory
265    /// M5 Max once KV cache and intermediates are layered on top.
266    ///
267    /// `register_existing` provides a *residency-only* path: the caller
268    /// allocates the buffer at its exact size via
269    /// [`MlxDevice::alloc_buffer`](crate::MlxDevice::alloc_buffer) (or
270    /// loads it via [`GgufFile::load_tensor_into_pool`](crate::GgufFile::load_tensor_into_pool)),
271    /// retains the [`MlxBuffer`] handle, and asks the pool to add the
272    /// underlying Metal allocation to its residency set so it gets the
273    /// MTLResidencySet hint on the next dispatch.
274    ///
275    /// # Ownership semantics
276    ///
277    /// * The pool **does not** take ownership of the buffer.  The caller's
278    ///   `MlxBuffer` handle remains the canonical owner.
279    /// * The pool **does not** recycle this buffer on [`reset`](Self::reset)
280    ///   (it is not added to `in_use`).
281    /// * The pool **does** include this buffer in its residency set so it
282    ///   is hinted-resident on the next encoder dispatch.
283    /// * On pool [`Drop`], the residency-set membership is removed but the
284    ///   underlying Metal buffer is **not** freed — the caller's `MlxBuffer`
285    ///   handle keeps the ARC alive.
286    ///
287    /// # `HF2Q_NO_RESIDENCY=1` escape hatch
288    ///
289    /// When the environment variable `HF2Q_NO_RESIDENCY=1` is set, the
290    /// process boots its [`MlxDevice`](crate::MlxDevice) without any
291    /// residency set (see `device.rs`).  In that mode this method returns
292    /// `Ok(())` without touching anything — operators who suspect a
293    /// residency-induced regression can opt out without recompiling.
294    ///
295    /// # Idempotence
296    ///
297    /// Registering the same buffer twice (identified by its
298    /// `metal::Buffer.contents()` pointer) is a no-op on the second call —
299    /// the residency set membership is tracked in a `HashMap` keyed by
300    /// contents pointer.
301    ///
302    /// # Errors
303    ///
304    /// Returns `MlxError::InvalidArgument` if the buffer was allocated on a
305    /// different `MlxDevice` than any previously registered buffer.
306    pub fn register_existing(
307        &mut self,
308        device: &MlxDevice,
309        buffer: &MlxBuffer,
310    ) -> Result<()> {
311        // ADR-015 iter8e (Phase 3b): MlxDevice::alloc_buffer now
312        // auto-registers each new buffer with the device's residency set
313        // via Arc<MlxBufferStorage>. If this caller's buffer already owns
314        // its registration, short-circuit — re-registering would double-add
315        // and the pool's Drop would issue a stray removeAllocation: against
316        // a buffer the storage's RAII path will also remove.
317        if let Some(buffer_set) = buffer.residency_set() {
318            let Some(device_set) = device.residency_set() else {
319                return Err(MlxError::InvalidArgument(
320                    "MlxBuffer is registered with a residency set, but device has none".into(),
321                ));
322            };
323            if !buffer_set.same_owner(device_set) {
324                return Err(MlxError::InvalidArgument(
325                    "MlxBufferPool cannot register a buffer from a different residency-enabled device"
326                        .into(),
327                ));
328            }
329            // Adopt the buffer's residency set so the pool's same_owner
330            // checks downstream agree, but do NOT add the buffer — it's
331            // already in the set via its own Arc<MlxBufferStorage>.
332            match self.residency_set.as_ref() {
333                Some(pool_set) if !pool_set.same_owner(device_set) => {
334                    return Err(MlxError::InvalidArgument(
335                        "MlxBufferPool cannot mix residency-enabled devices".into(),
336                    ));
337                }
338                Some(_) => {}
339                None => {
340                    self.residency_set = Some(device_set.clone());
341                }
342            }
343            return Ok(());
344        }
345
346        let added = self.register_residency_allocation(device, buffer.metal_buffer())?;
347        if added {
348            if let Some(set) = self.residency_set.as_ref() {
349                // Batched-add path: explicit commit (counts in the
350                // commit-call counter) preserves the
351                // `commit_called_after_alloc_batch`-style semantics.
352                set.commit();
353            }
354        }
355        Ok(())
356    }
357
358    /// Return all free buffers' count (for diagnostics).
359    pub fn free_count(&self) -> usize {
360        self.free.values().map(|v| v.len()).sum()
361    }
362
363    /// Total number of bytes held in the free list.
364    pub fn free_bytes(&self) -> usize {
365        self.free
366            .iter()
367            .map(|(&bucket, bufs)| bucket * bufs.len())
368            .sum()
369    }
370
371    /// Number of buffers currently in-use (alloc'd but not yet reset).
372    pub fn in_use_count(&self) -> usize {
373        self.in_use.len()
374    }
375
376    /// Clear all free buffers, releasing Metal memory.  Does not affect
377    /// in-use tracking.
378    pub fn clear(&mut self) {
379        let mut removed_any = false;
380
381        if let Some(set) = self.residency_set.as_ref() {
382            for metal_buf in self.free.values().flatten() {
383                let key = buffer_key(metal_buf);
384                if let Some(resident_buf) = self.resident_buffers.remove(&key) {
385                    set.remove_allocation(&resident_buf);
386                    removed_any = true;
387                }
388            }
389
390            if removed_any {
391                set.commit();
392            }
393        }
394
395        self.free.clear();
396    }
397
398    fn register_residency_allocation(
399        &mut self,
400        device: &MlxDevice,
401        buffer: &metal::Buffer,
402    ) -> Result<bool> {
403        let Some(device_set) = device.residency_set() else {
404            return Ok(false);
405        };
406
407        match self.residency_set.as_ref() {
408            Some(pool_set) if !pool_set.same_owner(device_set) => {
409                return Err(MlxError::InvalidArgument(
410                    "MlxBufferPool cannot mix residency-enabled devices".into(),
411                ));
412            }
413            Some(_) => {}
414            None => {
415                self.residency_set = Some(device_set.clone());
416            }
417        }
418
419        let key = buffer_key(buffer);
420
421        // 2026-05-03 — HF2Q_PROFILE_RESIDENCY_ABORT instrumentation gate.
422        // Falsifies/confirms the host-pointer-collision hypothesis behind
423        // the SIGABRT inside `-[IOGPUMetalResidencySet addAllocation:]`
424        // (6 macOS DiagnosticReports captured 2026-05-02 22:30 → 2026-05-03 07:15,
425        // all identical stack: abort ← addAllocation ← register_residency_allocation
426        // ← MlxBufferPool::alloc ← qwen35 forward-pass alloc site after long
427        // decode). When set, prints one line per call:
428        //   [RESIDENCY] N=<resident_count> key=<host_ptr> mtl=<obj_ptr> dup=<bool>
429        // dup=true means the host_ptr (`buffer.contents() as usize`) collides
430        // with a previously-registered allocation, possibly representing a
431        // DIFFERENT MTLBuffer ARC (Apple recycled the host page). In that
432        // case the existing dedup HashMap returns "skip" and Apple sees the
433        // new MTLBuffer as never-added — which is fine. The interesting case
434        // is dup=false but Apple aborts on the addAllocation: that means the
435        // MTLBuffer is somehow already-known to Apple's set despite our
436        // HashMap saying it isn't. Logged BEFORE the addAllocation so the
437        // log line precedes any abort.
438        let mtl_ptr = (&**buffer as *const metal::BufferRef as *const std::ffi::c_void) as usize;
439        let dup = self.resident_buffers.contains_key(&key);
440        if std::env::var("HF2Q_PROFILE_RESIDENCY_ABORT").is_ok() {
441            eprintln!(
442                "[RESIDENCY] N={} key=0x{:x} mtl=0x{:x} dup={}",
443                self.resident_buffers.len(),
444                key,
445                mtl_ptr,
446                dup,
447            );
448        }
449
450        if !dup {
451            device_set.add_allocation(buffer);
452            self.resident_buffers.insert(key, buffer.clone());
453            return Ok(true);
454        }
455
456        Ok(false)
457    }
458
459    fn remove_all_residency_allocations(&mut self) {
460        let Some(set) = self.residency_set.as_ref() else {
461            return;
462        };
463
464        if self.resident_buffers.is_empty() {
465            return;
466        }
467
468        for buffer in self.resident_buffers.values() {
469            set.remove_allocation(buffer);
470        }
471        set.commit();
472        self.resident_buffers.clear();
473    }
474}
475
impl Drop for MlxBufferPool {
    fn drop(&mut self) {
        // Remove (and commit) every residency-set membership this pool
        // added.  The underlying Metal buffers are NOT freed here — any
        // externally-held `MlxBuffer` ARC handles keep them alive.
        self.remove_all_residency_allocations();
    }
}

/// Round `n` up to the nearest power of two.
///
/// Returns 1 for n == 0 (though callers should never request 0 bytes);
/// `usize::next_power_of_two` already maps 0 to 1, so the 0/1 arm is kept
/// only to make the boundary behavior explicit to readers.
fn bucket_size(n: usize) -> usize {
    match n {
        0 | 1 => 1,
        larger => larger.next_power_of_two(),
    }
}

/// Stable identity key for a Metal buffer: its host-visible contents pointer.
///
/// NOTE(review): valid as a dedup key only while the buffer is alive — Metal
/// may recycle host pages after deallocation, so a dead buffer's key can
/// collide with a live one's (this is why `resident_buffers` holds an ARC
/// clone keeping each keyed buffer alive).
#[inline]
fn buffer_key(buffer: &metal::Buffer) -> usize {
    buffer.contents() as usize
}

#[cfg(test)]
mod tests {
    use super::*;

    // Pure arithmetic — exercises the power-of-two bucketing boundaries.
    #[test]
    fn test_bucket_size_powers() {
        assert_eq!(bucket_size(0), 1);
        assert_eq!(bucket_size(1), 1);
        assert_eq!(bucket_size(2), 2);
        assert_eq!(bucket_size(3), 4);
        assert_eq!(bucket_size(4), 4);
        assert_eq!(bucket_size(5), 8);
        assert_eq!(bucket_size(1023), 1024);
        assert_eq!(bucket_size(1024), 1024);
        assert_eq!(bucket_size(1025), 2048);
    }

    #[test]
    fn test_pool_arena_reset_recycles_in_use() {
        // Per-decode-token arena pattern: alloc many, drop locals, reset, alloc again.
        // Subsequent allocs must reuse the same Metal buffers (verified by ARC-cloned
        // contents pointer).
        let device = MlxDevice::new().expect("device");
        let mut pool = MlxBufferPool::new();

        // Cycle 1: allocate three buffers in different buckets, then drop them
        // (locals fall out of scope at the end of the block).
        let (ptr_a, ptr_b, ptr_c) = {
            let buf_a = pool.alloc(&device, 1024, DType::F32, vec![256]).expect("alloc a");
            let buf_b = pool.alloc(&device, 2048, DType::F32, vec![512]).expect("alloc b");
            let buf_c = pool.alloc(&device, 1024, DType::F32, vec![256]).expect("alloc c");
            (buf_a.contents_ptr(), buf_b.contents_ptr(), buf_c.contents_ptr())
        };
        assert_eq!(pool.in_use_count(), 3);
        assert_eq!(pool.free_count(), 0);

        // Reset returns all three to free.
        pool.reset();
        assert_eq!(pool.in_use_count(), 0);
        assert_eq!(pool.free_count(), 3);

        // Cycle 2: allocate compatible-bucket buffers, must reuse the same
        // underlying Metal buffers (contents_ptr equal).
        let buf_d = pool.alloc(&device, 1024, DType::F32, vec![256]).expect("alloc d");
        let buf_e = pool.alloc(&device, 2048, DType::F32, vec![512]).expect("alloc e");
        let ptr_d = buf_d.contents_ptr();
        let ptr_e = buf_e.contents_ptr();

        // Pointers must come from {a, b, c} — bucket 1024 reuse for d (matches a or c),
        // bucket 2048 reuse for e (matches b).
        assert!(
            ptr_d == ptr_a || ptr_d == ptr_c,
            "buf_d {:?} must reuse one of a {:?} / c {:?}",
            ptr_d, ptr_a, ptr_c,
        );
        assert_eq!(ptr_e, ptr_b, "buf_e must reuse b (only 2048-bucket buffer)");

        // After cycle-2 alloc, free has 1 (the unused 1024-bucket buffer) + in_use 2.
        assert_eq!(pool.in_use_count(), 2);
        assert_eq!(pool.free_count(), 1);
    }

    #[test]
    fn test_pool_reset_with_no_alloc_is_idempotent() {
        // Empty reset must be a no-op.  No MlxDevice required — pool
        // operations on an empty pool don't touch the device; the
        // smoke check used to live here was incidental and triggered
        // the unused-variable warning since `device` was bound but
        // never consumed.
        let mut pool = MlxBufferPool::new();
        pool.reset();
        assert_eq!(pool.in_use_count(), 0);
        assert_eq!(pool.free_count(), 0);
        // Multiple resets without intervening alloc — still no-op.
        pool.reset();
        pool.reset();
        assert_eq!(pool.in_use_count(), 0);
    }

    #[test]
    fn test_register_existing_does_not_recycle_on_reset() {
        // Externally-allocated buffer registered via register_existing must
        // NOT be added to the in_use list — reset() should leave the caller's
        // ownership intact and the buffer must remain valid after the pool
        // is dropped.
        let device = MlxDevice::new().expect("device");
        let mut pool = MlxBufferPool::new();

        // Allocate the buffer EXTERNALLY (via device.alloc_buffer, not
        // pool.alloc) — this is the no-bucket-rounding path hf2q uses for
        // static weight tensors.
        let external = device
            .alloc_buffer(4096, DType::U8, vec![4096])
            .expect("alloc external");
        let external_ptr = external.contents_ptr();

        // Register with the pool's residency set.
        pool.register_existing(&device, &external)
            .expect("register_existing");

        // in_use must remain empty (external buffer is not arena-recycled).
        assert_eq!(pool.in_use_count(), 0);

        // reset() must be a no-op for externally-registered buffers.
        pool.reset();
        assert_eq!(pool.in_use_count(), 0);
        assert_eq!(pool.free_count(), 0);

        // Drop the pool. The external MlxBuffer must still be valid — its
        // metal::Buffer ARC is held by `external`, not by the pool.
        drop(pool);
        assert_eq!(external.contents_ptr(), external_ptr);
        // Confirm the buffer is still accessible (no UAF).
        let slice: &[u8] = external.as_slice().expect("slice still valid");
        assert_eq!(slice.len(), 4096);
    }

    #[test]
    fn test_register_existing_idempotent() {
        // Registering the same buffer twice must not duplicate the residency
        // membership (resident_buffers HashMap is keyed by contents pointer).
        let device = MlxDevice::new().expect("device");
        let mut pool = MlxBufferPool::new();

        let external = device
            .alloc_buffer(2048, DType::U8, vec![2048])
            .expect("alloc external");

        pool.register_existing(&device, &external)
            .expect("register 1");
        pool.register_existing(&device, &external)
            .expect("register 2 (idempotent)");

        // Drop the pool (Drop::drop runs remove_all_residency_allocations).
        // No double-remove panic is the actual assertion here.
        drop(pool);
        // Buffer still valid.
        let _slice: &[u8] = external.as_slice().expect("still valid");
    }

    #[test]
    fn test_register_existing_no_residency_env_is_noop() {
        // With HF2Q_NO_RESIDENCY=1 the device boots without a residency set,
        // so register_existing has no set to register against and must
        // return Ok(()) as a no-op without touching anything.
        //
        // This test runs serially with other residency-env tests via the
        // shared TEST_LOCK in tests/test_residency_set.rs — but unit tests
        // here run in the same process and could race with that integration
        // test if both are running. We mitigate by:
        //   1. Reading + restoring the original env value.
        //   2. Resetting the residency env-cache flag before AND after.
        //
        // The unit-test name is uniquely keyed; cargo test by default
        // single-threads tests within the same binary only when --test-threads=1
        // is set. We accept that this test could flake under -j > 1 with
        // the integration tests; in practice cargo test schedules unit and
        // integration test binaries separately.
        let prev = std::env::var("HF2Q_NO_RESIDENCY").ok();
        crate::residency::reset_residency_env_cache_for_test();
        std::env::set_var("HF2Q_NO_RESIDENCY", "1");

        let device = MlxDevice::new().expect("device");
        assert!(
            !device.residency_sets_enabled(),
            "device should boot without residency under HF2Q_NO_RESIDENCY=1",
        );

        let mut pool = MlxBufferPool::new();
        let external = device
            .alloc_buffer(1024, DType::U8, vec![1024])
            .expect("alloc external");

        // register_existing must succeed as a no-op.
        pool.register_existing(&device, &external)
            .expect("register_existing under HF2Q_NO_RESIDENCY=1 should succeed");

        // Pool's internal residency_set must remain None.
        assert!(pool.residency_set.is_none());
        assert!(pool.resident_buffers.is_empty());

        // Cleanup env.
        match prev {
            Some(v) => std::env::set_var("HF2Q_NO_RESIDENCY", v),
            None => std::env::remove_var("HF2Q_NO_RESIDENCY"),
        }
        crate::residency::reset_residency_env_cache_for_test();
    }

    #[test]
    fn test_pool_release_remains_supported_for_compat() {
        // The existing per-buffer release() pattern still works.  Mixing
        // release+reset within the same arena cycle is documented as
        // unsupported but technically lands a duplicate clone in free —
        // verify the duplicate is harmless (alloc still picks up a buffer).
        let device = MlxDevice::new().expect("device");
        let mut pool = MlxBufferPool::new();

        let buf = pool.alloc(&device, 1024, DType::F32, vec![256]).expect("alloc");
        assert_eq!(pool.in_use_count(), 1);
        pool.release(buf);
        // release() does NOT remove from in_use; that's acceptable per the
        // documented contract (don't mix patterns).  Free has the released one.
        assert_eq!(pool.free_count(), 1);
        assert_eq!(pool.in_use_count(), 1);

        // Allocating again pulls from free first.
        let _buf2 = pool.alloc(&device, 1024, DType::F32, vec![256]).expect("alloc 2");
        assert_eq!(pool.free_count(), 0);
        assert_eq!(pool.in_use_count(), 2);
    }
}