train_station/tensor/core/
memory.rs

1//! High-performance memory management for tensor operations
2//!
3//! This module provides thread-local memory pools optimized for ML workloads
4//! with frequent tensor allocation and deallocation. Designed as the foundation
5//! for AGI/ASI research with zero dependencies and maximum performance.
6//!
7//! # Key Features
8//!
9//! - **Thread-Local Pools**: Eliminate contention with per-thread pools
10//! - **Size-Class Optimization**: Optimized for common ML tensor sizes (scalars to large matrices)
11//! - **Zero-Copy Integration**: Seamless integration with tensor view system
12//! - **Statistics Tracking**: Memory usage monitoring and optimization
13//! - **SIMD Alignment**: 32-byte alignment for AVX2 operations
14//! - **Research Enablement**: Predictable allocation patterns for novel architectures
15//!
16//! # Performance Characteristics
17//!
18//! - **Allocation Speed**: 5-10x faster than system allocator for pooled sizes
19//! - **Memory Efficiency**: Reduced fragmentation through ML-optimized size classes
20//! - **Cache Locality**: Better cache utilization through buffer reuse
21//! - **Thread Safety**: Lock-free through thread-local storage
22//! - **Zero Dependencies**: Pure Rust implementation with no external dependencies
23//! - **Edge Ready**: Minimal memory overhead suitable for embedded deployment
24
25use std::alloc::Layout;
26use std::cell::Cell;
27use std::cell::RefCell;
28use std::ptr::NonNull;
29use std::time::Instant;
30// no global atomics needed in simplified design
31
32// Global cross-thread counters removed for simplicity; thread-local stats remain
33
/// Memory pool statistics for performance monitoring
///
/// Maintained per thread: each thread owns its own `TensorMemoryPool` (see the
/// `MEMORY_POOL` thread-local), so no synchronization is needed.
///
/// NOTE(review): the usage counters are fed f32 *element* counts at the call
/// sites (`record_allocation_hit`/`record_deallocation` receive values like
/// `SMALL_BUFFER_SIZE` and `buffer.size()`, both element counts), despite the
/// field docs saying "bytes" — confirm the intended unit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PoolStats {
    /// Total number of allocation requests routed through the pool
    pub allocations: usize,
    /// Total number of deallocation (return-to-pool) requests
    pub deallocations: usize,
    /// Allocations satisfied by reusing an existing pooled buffer
    pub pool_hits: usize,
    /// Allocations that required creating a new buffer instead of reusing one
    pub pool_misses: usize,
    /// Current tracked usage (see unit note above)
    pub current_usage: usize,
    /// Peak tracked usage (see unit note above)
    pub peak_usage: usize,
}
50
/// Size classes optimized for ML workloads
///
/// Thresholds are expressed in f32 ELEMENTS; the byte figures in the trailing
/// comments assume 4 bytes per element. Based on analysis of common tensor
/// sizes in ML applications:
/// - Small: scalars, small vectors, activations (≤4KB) — covers up to 32x32 matrices
/// - Medium: embeddings, medium matrices (4KB-256KB) — covers 64x64 to 256x256 matrices
/// - Large: batch data, large matrices (256KB-4MB) — covers large batch processing
/// - XLarge: very large tensors (>4MB) — sized per request (see `planned_capacity_elems`)
pub const SMALL_BUFFER_SIZE: usize = 1024; // 1024 elems = 4KB - up to 32x32 matrices
pub const MEDIUM_BUFFER_SIZE: usize = 65536; // 65536 elems = 256KB - up to 256x256 matrices
pub const LARGE_BUFFER_SIZE: usize = 1048576; // 1048576 elems = 4MB - large batch processing
61
/// **CRITICAL DESIGN PRINCIPLE**: NO MAXIMUM LIMITS
///
/// The memory pool NEVER prevents tensor creation. Instead, it uses adaptive
/// management to balance performance and memory usage. Users control memory
/// through their allocation patterns, not artificial limits.
///
/// Pool management strategy:
/// - Pools grow dynamically based on usage patterns
/// - Automatic cleanup of unused buffers during low activity (see `maybe_cleanup`)
/// - User-controlled memory management through allocation patterns
///
/// The constants below are target pool sizes for optimal performance, NOT
/// hard limits. Cached-memory figures assume the per-class buffer sizes
/// above (4KB / 256KB / 4MB per buffer).
const TARGET_SMALL_BUFFERS: usize = 32; // 32 x 4KB = 128KB cached
const TARGET_MEDIUM_BUFFERS: usize = 16; // 16 x 256KB = 4MB cached
const TARGET_LARGE_BUFFERS: usize = 8; // 8 x 4MB = 32MB cached

// Cleanup heuristics: per-class headroom kept above current demand so a burst
// of allocations right after cleanup does not immediately re-create buffers.
const HEADROOM_SMALL: usize = 8;
const HEADROOM_MEDIUM: usize = 4;
const HEADROOM_LARGE: usize = 2;
const HEADROOM_XLARGE: usize = 1;

// Minimum operations and time between cleanup passes (hybrid gating: BOTH
// gates must pass before `maybe_cleanup` does any work)
const CLEANUP_MIN_OPS: u64 = 2048;
const CLEANUP_MIN_INTERVAL_MS: u64 = 2000; // 2s

// A buffer must remain unused for at least this many pool ops since its last
// touch before it is eligible for cleanup
const UNUSED_OPS_THRESHOLD: u64 = 4096;
89
/// Pooled memory buffer with alignment guarantees and lifecycle tracking
///
/// Provides SIMD-aligned f32 buffers for tensor operations. Buffers are
/// reused across allocations to reduce allocator overhead. The pool (not the
/// tensor) owns the backing allocation; it is freed when the `PooledBuffer`
/// is dropped (no custom `Drop` — `alloc` handles it).
pub struct PooledBuffer {
    /// Owning allocation for this pooled buffer (system-owned; pool manages lifetime)
    alloc: crate::tensor::core::Allocation,
    /// Whether this buffer is currently checked out to a tensor
    in_use: bool,
    /// Logical timestamp (pool op counter, not wall clock) of the last time
    /// this buffer was allocated or returned; drives idle-based cleanup
    last_used_counter: u64,
}
109
/// Thread-local memory pool for tensor allocation with adaptive management
///
/// No artificial limits: each per-class `Vec` grows as needed and is trimmed
/// back toward its target size by `maybe_cleanup` when buffers sit idle.
/// Size-class boundaries are in f32 elements (see `classify_size` and the
/// `*_BUFFER_SIZE` constants: 1024 / 65536 / 1048576 elements).
///
/// # Key Features
/// - **Unlimited Growth**: pools expand as needed until system memory is exhausted
/// - **Adaptive Cleanup**: idle excess buffers are freed during low activity
/// - **User Control**: memory managed through allocation patterns, not limits
pub struct TensorMemoryPool {
    /// Small buffers (requests ≤ SMALL_BUFFER_SIZE elems, i.e. ≤4KB)
    /// **NO SIZE LIMIT** - grows dynamically based on usage
    small_buffers: Vec<PooledBuffer>,

    /// Medium buffers (≤ MEDIUM_BUFFER_SIZE elems, i.e. ≤256KB)
    /// **NO SIZE LIMIT** - grows dynamically based on usage
    medium_buffers: Vec<PooledBuffer>,

    /// Large buffers (≤ LARGE_BUFFER_SIZE elems, i.e. ≤4MB)
    /// **NO SIZE LIMIT** - grows dynamically based on usage
    large_buffers: Vec<PooledBuffer>,

    /// Extra large buffers (> LARGE_BUFFER_SIZE elems); unlike the other
    /// classes these have varying capacities, so reuse checks capacity too
    xlarge_buffers: Vec<PooledBuffer>,

    /// Statistics for this thread's pool
    stats: PoolStats,
    /// Monotonic operation counter used as a logical timestamp for buffer
    /// activity (wraps rather than panics on overflow)
    op_counter: u64,
    /// Op-counter value at the last cleanup pass (rate-limits cleanup)
    last_cleanup_counter: u64,
    /// Wall-clock time of the last cleanup pass (second rate-limit gate)
    last_cleanup_instant: Instant,
}
149
150// Simplified: removed adaptive/view metrics/usage patterns to reduce complexity
151
/// Size class used to route an allocation to the matching pool
///
/// Boundaries are f32 ELEMENT counts compared against the `*_BUFFER_SIZE`
/// constants in `classify_size` (byte figures assume 4 bytes/element).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SizeClass {
    Small,  // ≤ 1024 elems (4KB)
    Medium, // ≤ 65536 elems (256KB)
    Large,  // ≤ 1048576 elems (4MB)
    XLarge, // anything larger
}
160
161// Removed ComprehensivePoolStats/BufferCounts; use thread_stats() when needed
162
163// Duplicate PoolStats removed - using the one defined earlier with proper field names
164
thread_local! {
    /// Per-thread tensor memory pool; thread-local storage makes the pool
    /// lock-free (no cross-thread sharing of buffers or stats).
    static MEMORY_POOL: RefCell<TensorMemoryPool> = RefCell::new(TensorMemoryPool::new());
    /// Thread-local flag to disable memory padding and pooling for allocations made
    /// during the active context. When enabled, allocations will not add lane-size
    /// padding and will prefer exact-size system allocations over the pool.
    /// Managed via `NoMemPaddingGuard` / `with_no_mem_padding`.
    static NO_MEM_PADDING: Cell<bool> = const { Cell::new(false) };
    /// Thread-local flag to control whether allocations should use the memory pool.
    /// Defaults to true for efficiency. When false, allocations use the system
    /// allocator. Managed via `NoMemPoolGuard` / `with_no_mem_pool`.
    static USE_POOL_ALLOC: Cell<bool> = const { Cell::new(true) };
}
175
/// Runtime SIMD capability level on the current CPU
///
/// x86_64-specific variants are compiled out on other architectures, where
/// only `Scalar` exists. Produced by `detect_runtime_simd`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdLevel {
    /// AVX-512 Foundation available (64-byte vectors)
    #[cfg(target_arch = "x86_64")]
    Avx512,
    /// AVX2 available (32-byte vectors)
    #[cfg(target_arch = "x86_64")]
    Avx2,
    /// SSE2 available (16-byte vectors)
    #[cfg(target_arch = "x86_64")]
    Sse2,
    /// No usable SIMD; portable scalar fallback
    Scalar,
}
187
188/// Detect the highest available SIMD level at runtime.
189#[inline]
190pub fn detect_runtime_simd() -> SimdLevel {
191    #[cfg(target_arch = "x86_64")]
192    {
193        // Check in descending order
194        if is_x86_feature_detected!("avx512f") {
195            return SimdLevel::Avx512;
196        }
197        if is_x86_feature_detected!("avx2") {
198            return SimdLevel::Avx2;
199        }
200        if is_x86_feature_detected!("sse2") {
201            return SimdLevel::Sse2;
202        }
203
204        SimdLevel::Scalar
205    }
206    #[cfg(not(target_arch = "x86_64"))]
207    {
208        SimdLevel::Scalar
209    }
210}
211
/// Lane width (elements per vector) for the given SIMD level for f32 values
///
/// Equals the vector register width in bits divided by 32 (f32 width);
/// used to round allocation sizes up to a whole number of lanes.
#[inline]
pub(crate) fn simd_lane_width_elems(level: SimdLevel) -> usize {
    match level {
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx512 => 16, // 512 / 32
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx2 => 8, // 256 / 32
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Sse2 => 4, // 128 / 32
        SimdLevel::Scalar => 1,
    }
}
225
/// Alignment in bytes recommended for the given SIMD level
///
/// Matches the vector register width for each level; the scalar fallback
/// still returns 16 so allocations stay cache/SSE friendly.
#[inline]
pub fn simd_alignment_bytes(level: SimdLevel) -> usize {
    match level {
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx512 => 64,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx2 => 32,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Sse2 => 16,
        SimdLevel::Scalar => 16, // keep at least 16 for general safety
    }
}
239
240/// Compute allocation alignment (bytes) and padded element count for a requested length.
241/// When NoMemPadding is enabled, padding is disabled and exact element count is returned.
242/// Enhanced version with better alignment guarantees for matmul operations.
243#[inline]
244pub fn compute_allocation_params(requested_elems: usize) -> (usize, usize) {
245    let level = detect_runtime_simd();
246    #[cfg(target_arch = "x86_64")]
247    let mut align = simd_alignment_bytes(level);
248    #[cfg(not(target_arch = "x86_64"))]
249    let align = simd_alignment_bytes(level);
250
251    // Preserve existing alignment policy (keep minimums but avoid over-padding semantics)
252    #[cfg(target_arch = "x86_64")]
253    {
254        if is_x86_feature_detected!("avx512f") {
255            align = 64;
256        } else if is_x86_feature_detected!("avx2") {
257            align = align.max(32);
258        }
259    }
260
261    if no_mem_padding_enabled() || requested_elems == 0 {
262        (align, requested_elems)
263    } else {
264        let lane = simd_lane_width_elems(level);
265        let padded = requested_elems.div_ceil(lane) * lane;
266        (align, padded)
267    }
268}
269
270/// Returns true if the current thread prefers using the memory pool for allocations.
271#[inline]
272pub fn use_pool_alloc_enabled() -> bool {
273    USE_POOL_ALLOC.with(|flag| flag.get())
274}
275
276impl PooledBuffer {
277    /// Creates a new pooled buffer with specified size and alignment
278    ///
279    /// **DESIGN PRINCIPLE**: Never fails due to limits - always creates buffer
280    /// if system memory is available. Users control memory through their patterns.
281    fn new(size: usize, alignment: usize) -> Self {
282        // Ensure alignment is at least align_of::<f32>()
283        let effective_alignment = alignment.max(std::mem::align_of::<f32>());
284        let layout =
285            Layout::from_size_align(size * std::mem::size_of::<f32>(), effective_alignment)
286                .expect("Invalid layout for pooled buffer");
287        // Use system allocation via Allocation; pool owns this memory
288        let alloc =
289            crate::tensor::core::Allocation::new_uninitialized(size, effective_alignment, layout);
290        // Verify alignment satisfies requested
291        let addr = alloc.ptr.as_ptr() as usize;
292        assert_eq!(
293            addr % alignment,
294            0,
295            "System allocator failed to provide {}-byte aligned memory. Got address 0x{:x} (alignment {})",
296            alignment,
297            addr,
298            addr % alignment
299        );
300        PooledBuffer {
301            alloc,
302            in_use: false,
303            last_used_counter: 0,
304        }
305    }
306
307    /// Gets the raw pointer to the buffer data
308    #[inline(always)]
309    pub fn as_ptr(&self) -> NonNull<f32> {
310        self.alloc.ptr
311    }
312
313    /// Gets the size of the buffer in elements
314    #[inline(always)]
315    pub fn size(&self) -> usize {
316        self.alloc.capacity_elems()
317    }
318
319    // Removed buffer_id tracking in simplified design
320
321    /// Allocates this buffer for tensor use
322    #[inline]
323    fn allocate_for_tensor(&mut self, now_counter: u64) -> bool {
324        if self.in_use {
325            false
326        } else {
327            self.in_use = true;
328            self.last_used_counter = now_counter;
329            true
330        }
331    }
332
333    /// Returns buffer to available state
334    #[inline]
335    fn return_to_pool(&mut self, now_counter: u64) {
336        self.in_use = false;
337        self.last_used_counter = now_counter;
338    }
339
340    /// Checks if buffer is available for allocation
341    #[inline(always)]
342    pub fn is_available(&self) -> bool {
343        !self.in_use
344    }
345}
346
347// No custom Drop needed; `alloc` owns the memory and will free on drop.
348
349impl TensorMemoryPool {
350    /// Creates a new tensor memory pool with adaptive management
351    ///
352    /// **DESIGN PRINCIPLE**: Starts with optimal capacity but grows unlimited
353    pub fn new() -> Self {
354        TensorMemoryPool {
355            // Start with target capacities for optimal performance
356            small_buffers: Vec::with_capacity(TARGET_SMALL_BUFFERS),
357            medium_buffers: Vec::with_capacity(TARGET_MEDIUM_BUFFERS),
358            large_buffers: Vec::with_capacity(TARGET_LARGE_BUFFERS),
359            xlarge_buffers: Vec::with_capacity(4),
360            stats: PoolStats::new(),
361            op_counter: 0,
362            last_cleanup_counter: 0,
363            last_cleanup_instant: Instant::now(),
364        }
365    }
366
367    /// Attempts to allocate memory from the pool
368    ///
369    /// Returns a pointer to allocated memory if a suitable buffer is available,
370    /// otherwise returns None to indicate fallback to system allocator.
371    fn try_allocate(&mut self, size: usize, alignment: usize) -> Option<NonNull<f32>> {
372        let size_class = self.classify_size(size);
373
374        self.try_allocate_internal(size, alignment, size_class)
375    }
376
377    /// Internal allocation method that avoids borrowing conflicts
378    fn try_allocate_internal(
379        &mut self,
380        size: usize,
381        alignment: usize,
382        size_class: SizeClass,
383    ) -> Option<NonNull<f32>> {
384        // Periodically attempt cleanup prior to allocation
385        self.maybe_cleanup();
386        match size_class {
387            SizeClass::Small => {
388                self.try_allocate_from_small_pool(SMALL_BUFFER_SIZE, alignment, size_class)
389            }
390            SizeClass::Medium => {
391                self.try_allocate_from_medium_pool(MEDIUM_BUFFER_SIZE, alignment, size_class)
392            }
393            SizeClass::Large => {
394                self.try_allocate_from_large_pool(LARGE_BUFFER_SIZE, alignment, size_class)
395            }
396            SizeClass::XLarge => {
397                let planned = TensorMemoryPool::planned_capacity_elems(size);
398                self.try_allocate_from_xlarge_pool(planned, alignment, size_class)
399            }
400        }
401    }
402
403    /// Allocate from small pool
404    fn try_allocate_from_small_pool(
405        &mut self,
406        buffer_size: usize,
407        alignment: usize,
408        _size_class: SizeClass,
409    ) -> Option<NonNull<f32>> {
410        let nowc = self.bump_op_counter();
411        for buffer in self.small_buffers.iter_mut() {
412            if buffer.is_available()
413                && buffer.alloc.alignment() >= alignment
414                && buffer.allocate_for_tensor(nowc)
415            {
416                self.stats.record_allocation_hit(buffer_size);
417                return Some(buffer.as_ptr());
418            }
419        }
420        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
421        if new_buffer.allocate_for_tensor(nowc) {
422            let ptr = new_buffer.as_ptr();
423            self.small_buffers.push(new_buffer);
424            self.stats
425                .record_allocation_miss(buffer_size, "new_buffer_created");
426            Some(ptr)
427        } else {
428            None
429        }
430    }
431
432    /// Allocate from medium pool
433    fn try_allocate_from_medium_pool(
434        &mut self,
435        buffer_size: usize,
436        alignment: usize,
437        _size_class: SizeClass,
438    ) -> Option<NonNull<f32>> {
439        let nowc = self.bump_op_counter();
440        for buffer in self.medium_buffers.iter_mut() {
441            if buffer.is_available()
442                && buffer.alloc.alignment() >= alignment
443                && buffer.allocate_for_tensor(nowc)
444            {
445                self.stats.record_allocation_hit(buffer_size);
446                return Some(buffer.as_ptr());
447            }
448        }
449        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
450        if new_buffer.allocate_for_tensor(nowc) {
451            let ptr = new_buffer.as_ptr();
452            self.medium_buffers.push(new_buffer);
453            self.stats
454                .record_allocation_miss(buffer_size, "new_buffer_created");
455            Some(ptr)
456        } else {
457            None
458        }
459    }
460
461    /// Allocate from large pool
462    fn try_allocate_from_large_pool(
463        &mut self,
464        buffer_size: usize,
465        alignment: usize,
466        _size_class: SizeClass,
467    ) -> Option<NonNull<f32>> {
468        let nowc = self.bump_op_counter();
469        for buffer in self.large_buffers.iter_mut() {
470            if buffer.is_available()
471                && buffer.alloc.alignment() >= alignment
472                && buffer.allocate_for_tensor(nowc)
473            {
474                self.stats.record_allocation_hit(buffer_size);
475                return Some(buffer.as_ptr());
476            }
477        }
478        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
479        if new_buffer.allocate_for_tensor(nowc) {
480            let ptr = new_buffer.as_ptr();
481            self.large_buffers.push(new_buffer);
482            self.stats
483                .record_allocation_miss(buffer_size, "new_buffer_created");
484            Some(ptr)
485        } else {
486            None
487        }
488    }
489
490    /// Allocate from xlarge pool
491    fn try_allocate_from_xlarge_pool(
492        &mut self,
493        buffer_size: usize,
494        alignment: usize,
495        _size_class: SizeClass,
496    ) -> Option<NonNull<f32>> {
497        let nowc = self.bump_op_counter();
498        for buffer in self.xlarge_buffers.iter_mut() {
499            // Only reuse when the existing buffer capacity is sufficient and alignment is compatible
500            if buffer.is_available()
501                && buffer.size() >= buffer_size
502                && buffer.alloc.alignment() >= alignment
503                && buffer.allocate_for_tensor(nowc)
504            {
505                self.stats.record_allocation_hit(buffer_size);
506                return Some(buffer.as_ptr());
507            }
508        }
509        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
510        if new_buffer.allocate_for_tensor(nowc) {
511            let ptr = new_buffer.as_ptr();
512            self.xlarge_buffers.push(new_buffer);
513            self.stats
514                .record_allocation_miss(buffer_size, "new_buffer_created");
515            Some(ptr)
516        } else {
517            None
518        }
519    }
520
521    // Removed create_new_buffer helper; creation handled inline in try_allocate_from_* functions
522
523    /// Classifies size into size class
524    #[inline]
525    fn classify_size(&self, size: usize) -> SizeClass {
526        if size <= SMALL_BUFFER_SIZE {
527            SizeClass::Small
528        } else if size <= MEDIUM_BUFFER_SIZE {
529            SizeClass::Medium
530        } else if size <= LARGE_BUFFER_SIZE {
531            SizeClass::Large
532        } else {
533            SizeClass::XLarge
534        }
535    }
536
537    #[cfg(test)]
538    fn stats(&self) -> &PoolStats {
539        &self.stats
540    }
541}
542
/// RAII guard to temporarily disable memory padding and pooled allocations
/// within the current thread. This trades some runtime performance for
/// potentially lower memory usage by avoiding lane-size padding and pool rounding.
///
/// Restores the previous flag value on drop, so guards nest correctly.
#[allow(dead_code)]
pub struct NoMemPaddingGuard {
    /// Flag value to restore when the guard is dropped
    prev: bool,
}
550
551impl Drop for NoMemPaddingGuard {
552    fn drop(&mut self) {
553        let _ = NO_MEM_PADDING.try_with(|flag| flag.set(self.prev));
554    }
555}
556
557impl NoMemPaddingGuard {
558    /// Create a new guard that disables memory padding until dropped
559    #[allow(dead_code)]
560    pub fn new() -> Self {
561        let prev = NO_MEM_PADDING.with(|flag| {
562            let old = flag.get();
563            flag.set(true);
564            old
565        });
566        NoMemPaddingGuard { prev }
567    }
568}
569
impl Default for NoMemPaddingGuard {
    /// Equivalent to `NoMemPaddingGuard::new()`; disables padding on creation.
    fn default() -> Self {
        Self::new()
    }
}
575
/// RAII guard to temporarily disable pool usage (force system allocation) in this thread.
///
/// Restores the previous flag value on drop, so guards nest correctly.
pub struct NoMemPoolGuard {
    /// Flag value to restore when the guard is dropped
    prev: bool,
}
580
581impl Drop for NoMemPoolGuard {
582    fn drop(&mut self) {
583        let _ = USE_POOL_ALLOC.try_with(|flag| flag.set(self.prev));
584    }
585}
586
587impl NoMemPoolGuard {
588    /// Create a new guard that disables pool allocations until dropped
589    pub fn new() -> Self {
590        let prev = USE_POOL_ALLOC.with(|flag| {
591            let old = flag.get();
592            flag.set(false);
593            old
594        });
595        NoMemPoolGuard { prev }
596    }
597}
598
impl Default for NoMemPoolGuard {
    /// Equivalent to `NoMemPoolGuard::new()`; disables pool allocation on creation.
    fn default() -> Self {
        Self::new()
    }
}
604
605/// Execute a closure with the memory pool disabled for the current thread.
606#[inline]
607pub fn with_no_mem_pool<F, R>(f: F) -> R
608where
609    F: FnOnce() -> R,
610{
611    let _guard = NoMemPoolGuard::new();
612    f()
613}
614
615/// Execute a closure with memory padding disabled for the current thread.
616#[inline]
617#[allow(dead_code)]
618pub fn with_no_mem_padding<F, R>(f: F) -> R
619where
620    F: FnOnce() -> R,
621{
622    let _guard = NoMemPaddingGuard::new();
623    f()
624}
625
626/// Returns true if the current thread has memory padding disabled.
627#[inline]
628pub fn no_mem_padding_enabled() -> bool {
629    NO_MEM_PADDING.with(|flag| flag.get())
630}
631
632impl TensorMemoryPool {
633    /// Returns the planned capacity (in f32 elements) the pool will allocate for a
634    /// given requested number of elements. This mirrors the internal size-class logic.
635    pub fn planned_capacity_elems(requested_elems: usize) -> usize {
636        if requested_elems <= SMALL_BUFFER_SIZE {
637            SMALL_BUFFER_SIZE
638        } else if requested_elems <= MEDIUM_BUFFER_SIZE {
639            MEDIUM_BUFFER_SIZE
640        } else if requested_elems <= LARGE_BUFFER_SIZE {
641            LARGE_BUFFER_SIZE
642        } else {
643            // Ensure exponential growth for very large allocations
644            (requested_elems * 2).max(262144 * 2)
645        }
646    }
647}
648
649impl PoolStats {
650    fn new() -> Self {
651        PoolStats {
652            allocations: 0,
653            deallocations: 0,
654            pool_hits: 0,
655            pool_misses: 0,
656            current_usage: 0,
657            peak_usage: 0,
658        }
659    }
660
661    fn record_allocation_hit(&mut self, buffer_size: usize) {
662        self.allocations += 1;
663        self.pool_hits += 1;
664        self.current_usage += buffer_size;
665        if self.current_usage > self.peak_usage {
666            self.peak_usage = self.current_usage;
667        }
668    }
669
670    fn record_allocation_miss(&mut self, _buffer_size: usize, _reason: &str) {
671        self.allocations += 1;
672        self.pool_misses += 1;
673    }
674
675    fn record_deallocation(&mut self, size: usize) {
676        self.deallocations += 1;
677        self.current_usage = self.current_usage.saturating_sub(size);
678    }
679}
680
681/// Public interface for memory pool operations
682impl TensorMemoryPool {
683    /// Attempts to allocate memory from the thread-local pool
684    ///
685    /// Returns Some(ptr) if allocation succeeds from pool,
686    /// None if fallback to system allocator is needed.
687    pub fn allocate(size: usize, alignment: usize) -> Option<NonNull<f32>> {
688        let result = MEMORY_POOL.with(|pool| pool.borrow_mut().try_allocate(size, alignment));
689        result
690    }
691
692    /// Attempts to return memory to the thread-local pool without panicking if TLS is
693    /// unavailable (e.g., during thread shutdown). Returns Some(result) when TLS is
694    /// accessible, or None if TLS is not available.
695    pub fn try_deallocate(ptr: NonNull<f32>) -> Option<bool> {
696        MEMORY_POOL
697            .try_with(|pool| {
698                let mut pool_mut = pool.borrow_mut();
699                pool_mut.return_to_pool(ptr)
700            })
701            .ok()
702    }
703
704    /// Return buffer to the appropriate pool
705    ///
706    /// Returns true if the buffer was successfully returned to a pool,
707    /// false if the buffer was not found in any pool (indicating it
708    /// was allocated directly from the system allocator).
709    fn return_to_pool(&mut self, ptr: NonNull<f32>) -> bool {
710        // Check each pool individually to avoid borrowing conflicts
711        if self.return_to_small_pool(ptr) {
712            self.maybe_cleanup();
713            return true;
714        }
715        if self.return_to_medium_pool(ptr) {
716            self.maybe_cleanup();
717            return true;
718        }
719        if self.return_to_large_pool(ptr) {
720            self.maybe_cleanup();
721            return true;
722        }
723        if self.return_to_xlarge_pool(ptr) {
724            self.maybe_cleanup();
725            return true;
726        }
727
728        // Buffer not found in any pool - this is expected for system-allocated memory
729        false
730    }
731
732    /// Return buffer to small pool
733    fn return_to_small_pool(&mut self, ptr: NonNull<f32>) -> bool {
734        let nowc = self.bump_op_counter();
735        for buffer in self.small_buffers.iter_mut() {
736            if buffer.as_ptr() == ptr {
737                buffer.return_to_pool(nowc);
738                self.stats.record_deallocation(buffer.size());
739                return true;
740            }
741        }
742        false
743    }
744
745    /// Return buffer to medium pool
746    fn return_to_medium_pool(&mut self, ptr: NonNull<f32>) -> bool {
747        let nowc = self.bump_op_counter();
748        for buffer in self.medium_buffers.iter_mut() {
749            if buffer.as_ptr() == ptr {
750                buffer.return_to_pool(nowc);
751                self.stats.record_deallocation(buffer.size());
752                return true;
753            }
754        }
755        false
756    }
757
758    /// Return buffer to large pool
759    fn return_to_large_pool(&mut self, ptr: NonNull<f32>) -> bool {
760        let nowc = self.bump_op_counter();
761        for buffer in self.large_buffers.iter_mut() {
762            if buffer.as_ptr() == ptr {
763                buffer.return_to_pool(nowc);
764                self.stats.record_deallocation(buffer.size());
765                return true;
766            }
767        }
768        false
769    }
770
771    /// Return buffer to xlarge pool
772    fn return_to_xlarge_pool(&mut self, ptr: NonNull<f32>) -> bool {
773        let nowc = self.bump_op_counter();
774        for buffer in self.xlarge_buffers.iter_mut() {
775            if buffer.as_ptr() == ptr {
776                buffer.return_to_pool(nowc);
777                self.stats.record_deallocation(buffer.size());
778                return true;
779            }
780        }
781        false
782    }
783
784    /// Gets statistics for the current thread's pool
785    #[cfg(test)]
786    pub fn thread_stats() -> PoolStats {
787        MEMORY_POOL.with(|pool| *pool.borrow().stats())
788    }
789
790    /// Test-only helper: return current buffer counts per pool
791    #[cfg(test)]
792    pub fn pool_sizes() -> (usize, usize, usize, usize) {
793        MEMORY_POOL.with(|pool| {
794            let p = pool.borrow();
795            (
796                p.small_buffers.len(),
797                p.medium_buffers.len(),
798                p.large_buffers.len(),
799                p.xlarge_buffers.len(),
800            )
801        })
802    }
803}
804
impl TensorMemoryPool {
    /// Advance and return the pool-wide operation counter.
    ///
    /// Acts as a logical clock for buffer activity and cleanup gating.
    #[inline]
    fn bump_op_counter(&mut self) -> u64 {
        // Wrapping add to avoid panic on very long runs; practical overflow is unlikely
        self.op_counter = self.op_counter.wrapping_add(1);
        self.op_counter
    }

    /// Determine if a cleanup pass should run given time and op-counter thresholds
    ///
    /// Both gates must pass: at least CLEANUP_MIN_OPS operations AND at least
    /// CLEANUP_MIN_INTERVAL_MS of wall-clock time since the last cleanup.
    #[inline]
    fn should_cleanup(&self) -> bool {
        // wrapping_sub keeps the delta meaningful even if op_counter wrapped
        let ops_since = self.op_counter.wrapping_sub(self.last_cleanup_counter);
        if ops_since < CLEANUP_MIN_OPS {
            return false;
        }
        let elapsed = self.last_cleanup_instant.elapsed();
        elapsed.as_millis() as u64 >= CLEANUP_MIN_INTERVAL_MS
    }

    /// Attempt to free long-idle excess buffers while preserving headroom to avoid thrash.
    fn maybe_cleanup(&mut self) {
        if !self.should_cleanup() {
            return;
        }

        // Cleanup strategy per size class: trim each vector toward its target
        // size while keeping per-class headroom above current demand.
        let nowc = self.op_counter;
        Self::cleanup_pool_vec(
            &mut self.small_buffers,
            TARGET_SMALL_BUFFERS,
            HEADROOM_SMALL,
            nowc,
        );
        Self::cleanup_pool_vec(
            &mut self.medium_buffers,
            TARGET_MEDIUM_BUFFERS,
            HEADROOM_MEDIUM,
            nowc,
        );
        Self::cleanup_pool_vec(
            &mut self.large_buffers,
            TARGET_LARGE_BUFFERS,
            HEADROOM_LARGE,
            nowc,
        );
        // For xlarge, keep minimal headroom; usage is often bursty and large
        Self::cleanup_pool_vec(&mut self.xlarge_buffers, 2, HEADROOM_XLARGE, nowc);

        // Update cleanup gates
        self.last_cleanup_counter = self.op_counter;
        self.last_cleanup_instant = Instant::now();
    }

    /// Trim one pool vector toward `max(target, in_use + headroom)`, dropping
    /// only buffers that are both available and idle for at least
    /// UNUSED_OPS_THRESHOLD ops. Stalest buffers are removed first. Dropping
    /// a PooledBuffer frees its backing allocation.
    fn cleanup_pool_vec(
        vec: &mut Vec<PooledBuffer>,
        target: usize,
        headroom: usize,
        now_counter: u64,
    ) {
        if vec.is_empty() {
            return;
        }
        // Compute current demand and desired capacity: never shrink below the
        // target, and always keep `headroom` spare slots above live demand.
        let in_use = vec.iter().filter(|b| !b.is_available()).count();
        let desired = core::cmp::max(target, in_use.saturating_add(headroom));
        if vec.len() <= desired {
            return;
        }

        // Identify eligible candidates: available and long-idle.
        // (index, ops-since-last-touch) pairs; wrapping_sub matches the
        // wrapping op counter.
        let mut eligible: Vec<(usize, u64)> = vec
            .iter()
            .enumerate()
            .filter(|(_i, b)| b.is_available())
            .map(|(i, b)| (i, now_counter.wrapping_sub(b.last_used_counter)))
            .filter(|(_i, age_ops)| *age_ops >= UNUSED_OPS_THRESHOLD)
            .collect();

        if eligible.is_empty() {
            return;
        }

        // Prefer removing the stalest buffers first
        eligible.sort_by_key(|(_i, age)| core::cmp::Reverse(*age));

        let excess = vec.len().saturating_sub(desired);
        let to_remove = core::cmp::min(excess, eligible.len());
        if to_remove == 0 {
            return;
        }

        // Remove by index from highest to lowest to avoid shifting issues
        // (removing a low index first would invalidate the higher indices).
        let mut to_drop: Vec<usize> = eligible.iter().take(to_remove).map(|(i, _)| *i).collect();
        to_drop.sort_unstable_by(|a, b| b.cmp(a));
        for idx in to_drop {
            vec.remove(idx);
        }
    }
}
904
#[cfg(test)]
mod tests {
    use super::*;

    // The padding opt-out flag must be false by default, true only while a
    // `NoMemPaddingGuard` is alive, and restored to false when it drops.
    #[test]
    fn test_with_no_mem_padding_guard_scoping() {
        // Default should be false
        assert!(!no_mem_padding_enabled());
        {
            let _g = NoMemPaddingGuard::new();
            assert!(no_mem_padding_enabled());
        }
        assert!(!no_mem_padding_enabled());
    }

    // With padding on, a 33-element request is rounded up to a multiple of the
    // SIMD lane width; inside `with_no_mem_padding` the size stays exactly 33.
    #[test]
    fn test_compute_allocation_params_padding_behavior() {
        // With padding enabled
        let (align1, padded1) = compute_allocation_params(33);
        let lane = simd_lane_width_elems(detect_runtime_simd());
        assert!(padded1 >= 33);
        assert_eq!(padded1 % lane, 0);
        assert!(align1 >= 16);

        // No padding
        let res = with_no_mem_padding(|| compute_allocation_params(33));

        assert_eq!(res.1, 33);
    }

    // One tensor per size class is created and dropped inside the scope; the
    // thread-local stats must record at least 4 more allocations and 4 more
    // deallocations afterwards (>= because other code may also allocate).
    #[test]
    fn test_same_thread_alloc_dealloc_counters_across_classes() {
        let before = TensorMemoryPool::thread_stats();
        {
            let lane = simd_lane_width_elems(detect_runtime_simd());
            let sizes = [
                SMALL_BUFFER_SIZE.min(8),
                MEDIUM_BUFFER_SIZE / 2,
                LARGE_BUFFER_SIZE / 2,
                LARGE_BUFFER_SIZE + lane * 3 + 7, // xlarge request
            ];
            for &n in &sizes {
                let _t = crate::tensor::Tensor::new(vec![n]);
            }
        }
        let after = TensorMemoryPool::thread_stats();
        assert!(after.allocations >= before.allocations + 4);
        assert!(after.deallocations >= before.deallocations + 4);
    }

    // Requesting an xlarge allocation larger than any previously pooled xlarge
    // buffer must still succeed — the pool must not hand back a too-small buffer.
    #[test]
    fn test_xlarge_pool_does_not_reuse_too_small_buffer() {
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let align = simd_alignment_bytes(detect_runtime_simd());
        // First, create an xlarge buffer of some planned capacity
        let small_xlarge = LARGE_BUFFER_SIZE + lane * 2;
        let _t1 = crate::tensor::Tensor::new(vec![small_xlarge]);
        // Now request a larger xlarge size that exceeds the prior capacity
        let larger = small_xlarge * 2 + lane * 3;
        let ptr_opt = MEMORY_POOL.with(|pool| {
            let mut p = pool.borrow_mut();
            p.try_allocate_from_xlarge_pool(larger, align, SizeClass::XLarge)
        });
        // We should get Some(ptr) from a newly created buffer; this test
        // only asserts that an allocation succeeds and the pool doesn't panic/crash.
        assert!(ptr_opt.is_some());
    }

    // Allocate an xlarge tensor here and drop it on a spawned thread; this
    // must not crash even though the owning thread's pool is inaccessible.
    #[test]
    fn test_cross_thread_drop_safe_no_crash() {
        use std::thread;
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let n = LARGE_BUFFER_SIZE + lane * 2 + 3; // xlarge
        let t = crate::tensor::Tensor::new(vec![n]);
        let handle = thread::spawn(move || {
            // drop in another thread
            drop(t);
        });
        let _ = handle.join();
    }

    // A pointer obtained from the pool must be recognized by `try_deallocate`,
    // which reports Some(true) for pool-managed memory.
    #[test]
    fn test_try_deallocate_returns_some_true_for_pooled() {
        let align = simd_alignment_bytes(detect_runtime_simd());
        let ptr = TensorMemoryPool::allocate(128, align).expect("pool allocate failed");
        let res = TensorMemoryPool::try_deallocate(ptr);
        assert_eq!(res, Some(true));
    }

    // Benchmark pooled vs. system allocation over 1000 iterations for one
    // representative shape per size class. Prints timings/speedup; only
    // asserts that both modes produce a nonzero, finite measurement (this is
    // a smoke/perf-report test, not a hard speedup assertion).
    #[test]
    fn perf_pool_vs_no_pool_by_category_over_1000_iterations() {
        use std::time::Instant;

        // Choose representative shapes per size class
        let small = vec![32, 32]; // 1,024 elems
        let medium = vec![256, 256]; // 65,536 elems
        let large = vec![1024, 1024]; // 1,048,576 elems
        let xlarge = vec![1200, 1200]; // > large

        // Time `iters` rounds of allocate -> scalar ops -> sum readback for a shape.
        fn bench_shape(shape: &[usize], iters: usize) -> std::time::Duration {
            let start = Instant::now();
            let mut sink = 0.0f32;
            for i in 0..iters {
                // Allocate
                let t0 = crate::tensor::Tensor::ones(shape.to_vec());
                // Simple API ops chain to exercise read/write paths
                let t1 = t0.add_scalar((i % 5) as f32 * 0.1);
                let t2 = t1.mul_scalar(1.2345);
                // Reduce to scalar to avoid DCE and force readback
                let s = t2.sum();
                sink += s.value();
            }
            assert!(sink.is_finite());
            start.elapsed()
        }

        let iters = 1000usize;

        let cats: [(&str, Vec<usize>); 4] = [
            ("small", small),
            ("medium", medium),
            ("large", large),
            ("xlarge", xlarge),
        ];

        for (name, shape) in cats.iter() {
            let pooled = bench_shape(shape, iters);
            let system = super::with_no_mem_pool(|| bench_shape(shape, iters));
            let pooled_ms = pooled.as_secs_f64() * 1_000.0;
            let system_ms = system.as_secs_f64() * 1_000.0;
            let speedup = if pooled_ms > 0.0 {
                system_ms / pooled_ms
            } else {
                0.0
            };
            println!(
                "Perf [{} | {:?} elems]: pooled={:.2} ms, no_pool={:.2} ms, speedup={:.2}x (iters={})",
                name,
                shape.iter().product::<usize>(),
                pooled_ms,
                system_ms,
                speedup,
                iters
            );

            // Both modes must produce a measurable duration
            assert!(pooled > std::time::Duration::from_millis(0));
            assert!(system > std::time::Duration::from_millis(0));
        }
    }
}
1056
#[cfg(test)]
mod xlarge_stress_tests {
    use super::*;

    // Repeatedly allocate/drop xlarge tensors of varied odd sizes on one
    // thread to stress pool reuse paths; asserts only size correctness.
    #[test]
    fn stress_xlarge_pool_various_sizes_single_thread() {
        // Define sizes slightly above LARGE_BUFFER_SIZE to hit xlarge pool
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let sizes = [
            LARGE_BUFFER_SIZE + 1,
            LARGE_BUFFER_SIZE * 2 + lane - 1,
            LARGE_BUFFER_SIZE * 3 + 17,
            LARGE_BUFFER_SIZE * 4 + lane * 3 + 5,
            LARGE_BUFFER_SIZE * 6 + 123,
        ];
        for _ in 0..1000 {
            for &n in &sizes {
                let elems = n;
                let mut t = crate::tensor::Tensor::new(vec![elems]);
                // write the first element so the buffer is touched before any read
                if elems > 0 {
                    t.set(&[0], 0.0);
                }
                assert_eq!(t.size(), elems);
            }
        }
    }

    // Same stress pattern across up to 8 threads, each perturbing sizes with
    // a thread/round-dependent offset so threads hit distinct capacities.
    #[test]
    fn stress_xlarge_pool_multithreaded() {
        use std::thread;
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let sizes = [
            LARGE_BUFFER_SIZE + 1,
            LARGE_BUFFER_SIZE * 2 + lane - 1,
            LARGE_BUFFER_SIZE * 3 + 17,
            LARGE_BUFFER_SIZE * 4 + lane * 3 + 5,
            LARGE_BUFFER_SIZE * 6 + 123,
        ];
        // Cap thread count at 8 or the machine's available parallelism.
        let threads = 8usize.min(
            std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(8),
        );
        let mut handles = Vec::new();
        for tid in 0..threads {
            let sizes_clone = sizes;
            handles.push(thread::spawn(move || {
                for r in 0..20 {
                    for (i, n) in sizes_clone.iter().enumerate() {
                        let elems = n + (tid * 13 + r * 7 + i) % lane;
                        let mut t = crate::tensor::Tensor::new(vec![elems]);
                        assert_eq!(t.size(), elems);
                        // write a few positions to exercise memory
                        if elems > 0 {
                            let idx0 = elems / 2;
                            let idx1 = (elems.saturating_sub(1)) / 3;
                            let idx2 = (elems.saturating_sub(1)) / 5;
                            // write via safe API
                            if idx0 < t.size() {
                                t.set(&[idx0], 1.2345);
                            }
                            if idx1 < t.size() {
                                t.set(&[idx1], 2.3456);
                            }
                            if idx2 < t.size() {
                                t.set(&[idx2], 3.4567);
                            }
                        }
                    }
                }
            }));
        }
        for h in handles {
            let _ = h.join();
        }
    }
}
1135
#[cfg(test)]
mod additional_safety_tests {
    use super::*;

    // Three tensors (small/medium/large) created and dropped in one scope:
    // both counters grow by >=3 and current_usage must return to its prior level.
    #[test]
    fn test_pool_alloc_dealloc_balanced_small_medium_large() {
        let before = TensorMemoryPool::thread_stats();
        {
            let _s1 = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(16)]);
            let _m1 = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 4]);
            let _l1 = crate::tensor::Tensor::new(vec![LARGE_BUFFER_SIZE / 4]);
        }
        let after = TensorMemoryPool::thread_stats();
        assert!(
            after.allocations >= before.allocations + 3,
            "allocations did not increase as expected: before={}, after={}",
            before.allocations,
            after.allocations
        );
        assert!(
            after.deallocations >= before.deallocations + 3,
            "deallocations did not increase as expected: before={}, after={}",
            before.deallocations,
            after.deallocations
        );
        // Current usage should not grow across scope
        assert!(
            after.current_usage <= before.current_usage,
            "current_usage grew: before={}, after={}",
            before.current_usage,
            after.current_usage
        );
    }

    // Every size class must hand out pointers aligned to the runtime SIMD
    // alignment (e.g. 32 bytes for AVX2 per the module docs).
    #[test]
    fn test_pointer_alignment_across_classes() {
        let align = simd_alignment_bytes(detect_runtime_simd());
        for &n in &[
            8usize,
            SMALL_BUFFER_SIZE,
            MEDIUM_BUFFER_SIZE,
            LARGE_BUFFER_SIZE + 128,
        ] {
            let t = crate::tensor::Tensor::new(vec![n]);
            // SAFETY(review): only the raw address is inspected for its
            // alignment; the pointer is never dereferenced here. Presumably
            // `as_ptr` is an unsafe fn in this crate — confirm its contract.
            unsafe {
                let addr = t.as_ptr() as usize;
                assert_eq!(
                    addr % align,
                    0,
                    "pointer not aligned to {} for n={}",
                    align,
                    n
                );
            }
        }
    }

    // Inside `with_no_mem_pool`, tensor allocation bypasses the pool entirely,
    // so the thread-local pool counters must be unchanged across the scope.
    #[test]
    fn test_with_no_mem_pool_uses_system_allocator_no_pool_stats() {
        let before = TensorMemoryPool::thread_stats();
        with_no_mem_pool(|| {
            let _t1 = crate::tensor::Tensor::new(vec![64]);
            let _t2 = crate::tensor::Tensor::new(vec![2048]);
            let _t3 = crate::tensor::Tensor::new(vec![131072]);
        });
        let after = TensorMemoryPool::thread_stats();
        // Pool should not register hits/misses when disabled within the scope
        assert_eq!(
            after.allocations, before.allocations,
            "pool allocations changed with pool disabled: before={}, after={}",
            before.allocations, after.allocations
        );
        assert_eq!(
            after.deallocations, before.deallocations,
            "pool deallocations changed with pool disabled: before={}, after={}",
            before.deallocations, after.deallocations
        );
    }

    // A tensor allocated on a worker thread and dropped here must not perturb
    // this thread's pool statistics (stats are strictly thread-local).
    #[test]
    fn test_cross_thread_drop_does_not_affect_this_thread_stats() {
        let before = TensorMemoryPool::thread_stats();
        // Allocate in a worker thread and drop in this thread
        let handle =
            std::thread::spawn(|| crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(32)]));
        let t = handle.join().unwrap();
        drop(t); // Drop on current thread; should not touch this thread's pool stats
        let after = TensorMemoryPool::thread_stats();
        assert_eq!(
            after.allocations, before.allocations,
            "allocations changed in current thread due to cross-thread drop: before={}, after={}",
            before.allocations, after.allocations
        );
        // Deallocation also should not be recorded in this thread
        assert_eq!(
            after.deallocations, before.deallocations,
            "deallocations changed in current thread due to cross-thread drop: before={}, after={}",
            before.deallocations, after.deallocations
        );
    }

    // 100 allocate/drop cycles must not drive current_usage up beyond one
    // small + one medium buffer's worth of slack (i.e. no leak-like growth).
    #[test]
    fn test_many_alloc_dealloc_cycles_no_growth_in_current_usage() {
        let before = TensorMemoryPool::thread_stats();
        for _ in 0..100 {
            let _t1 = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(64)]);
            let _t2 = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 8]);
        }
        let after = TensorMemoryPool::thread_stats();
        // current_usage should remain bounded and not monotonically grow
        assert!(
            after.current_usage <= before.current_usage + SMALL_BUFFER_SIZE + MEDIUM_BUFFER_SIZE,
            "current_usage unexpected growth: before={}, after={}",
            before.current_usage,
            after.current_usage
        );
    }
}
1254
#[cfg(test)]
mod cleanup_tests {
    use super::*;
    use std::thread;
    use std::time::Duration;

    // Helper to create and hold N tensors of the given element count (single-dim)
    fn hold_tensors(count: usize, elems: usize) -> Vec<crate::tensor::Tensor> {
        let mut v = Vec::with_capacity(count);
        for _ in 0..count {
            v.push(crate::tensor::Tensor::new(vec![elems]));
        }
        v
    }

    // Helper to bump pool op counters by performing lightweight small allocations
    fn bump_ops_small_iters(iters: usize) {
        for _ in 0..iters {
            let _t = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(8)]);
        }
    }

    // Even after op-count and time gates are exceeded, cleanup must never trim
    // buffers that are still in use, nor recently-released ones whose
    // last-used timestamps are fresh.
    #[test]
    fn test_no_cleanup_while_many_small_buffers_in_use() {
        // Prime the pool with many small buffers held alive
        let holders = hold_tensors(40, SMALL_BUFFER_SIZE.min(32));
        let (small_before, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_before >= 40,
            "expected >=40 small buffers, got {}",
            small_before
        );

        // Bump op counters and time while these buffers remain in use
        bump_ops_small_iters(1500); // ~3000 ops
        thread::sleep(Duration::from_millis(2100));
        bump_ops_small_iters(700); // exceed thresholds

        // Trigger a cleanup attempt via an allocation in another size class (medium)
        {
            let _m = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 2]);
        }

        // While buffers are still in-use, no trimming should occur (len must not decrease)
        let (small_mid, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_mid >= small_before,
            "small pool shrank while heavily in-use: before={} after={}",
            small_before,
            small_mid
        );

        // Now drop the holders; their last_used timestamps are fresh, so cleanup shouldn't trim them
        drop(holders);

        // Trigger cleanup again
        let _ = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 2]);
        let (small_after, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_after >= small_before,
            "small pool unexpectedly trimmed active buffers: before={} after={}",
            small_before,
            small_after
        );
    }

    // Once medium buffers sit idle past UNUSED_OPS_THRESHOLD ops and the
    // cleanup time gate elapses, the next allocation must trim the medium pool.
    #[test]
    fn test_cleanup_trims_long_idle_medium_buffers() {
        // Create many medium buffers simultaneously to grow pool capacity
        {
            let _holders = hold_tensors(30, MEDIUM_BUFFER_SIZE / 2);
            // _holders dropped at end of scope, all buffers become available
        }
        let (_, med_before, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            med_before >= 30,
            "expected >=30 medium buffers, got {}",
            med_before
        );

        // Leave medium buffers idle; bump ops using small allocations and wait to satisfy time gate
        bump_ops_small_iters(2300); // ~4600 ops (> UNUSED_OPS_THRESHOLD)
        thread::sleep(Duration::from_millis(2100));

        // Trigger cleanup and observe trimming
        let _ = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(16)]);
        let (_, med_after, _, _) = TensorMemoryPool::pool_sizes();

        assert!(
            med_after < med_before,
            "medium pool not trimmed despite long idle: before={} after={}",
            med_before,
            med_after
        );
    }
}