//! Thread-local memory pool for f32 tensor buffers, plus runtime SIMD
//! detection helpers used to size and align those buffers.

use std::alloc::Layout;
use std::cell::Cell;
use std::cell::RefCell;
use std::ptr::NonNull;
use std::time::Instant;
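/// Allocation counters tracked per thread for the tensor memory pool.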
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PoolStats {
    /// Total allocation requests served (pool hits and misses combined).
    pub allocations: usize,
    /// Buffers returned to the pool.
    pub deallocations: usize,
    /// Requests satisfied by reusing an existing pooled buffer.
    pub pool_hits: usize,
    /// Requests that required creating a new buffer.
    pub pool_misses: usize,
    /// Elements currently checked out via pool hits (misses are not counted).
    pub current_usage: usize,
    /// Peak value of `current_usage`.
    pub peak_usage: usize,
}

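// Size-class capacities are in f32 elements. TARGET_* and HEADROOM_* bound how
// many buffers each class retains during cleanup; CLEANUP_* and
// UNUSED_OPS_THRESHOLD gate when idle buffers may be trimmed.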
pub const SMALL_BUFFER_SIZE: usize = 1024;
pub const MEDIUM_BUFFER_SIZE: usize = 65536;
pub const LARGE_BUFFER_SIZE: usize = 1048576;
const TARGET_SMALL_BUFFERS: usize = 32;
const TARGET_MEDIUM_BUFFERS: usize = 16;
const TARGET_LARGE_BUFFERS: usize = 8;
const HEADROOM_SMALL: usize = 8;
const HEADROOM_MEDIUM: usize = 4;
const HEADROOM_LARGE: usize = 2;
const HEADROOM_XLARGE: usize = 1;

const CLEANUP_MIN_OPS: u64 = 2048;
const CLEANUP_MIN_INTERVAL_MS: u64 = 2000;
const UNUSED_OPS_THRESHOLD: u64 = 4096;

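/// A single aligned f32 allocation owned by the pool and handed out to at most
/// one tensor at a time.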
pub struct PooledBuffer {
    /// The underlying aligned allocation.
    alloc: crate::tensor::core::Allocation,
    /// Whether the buffer is currently checked out to a tensor.
    in_use: bool,
    /// Value of the pool's op counter when this buffer was last allocated or returned.
    last_used_counter: u64,
}

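/// Thread-local pool of reusable buffers, bucketed by size class.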
pub struct TensorMemoryPool {
    /// Buffers of `SMALL_BUFFER_SIZE` elements.
    small_buffers: Vec<PooledBuffer>,
    /// Buffers of `MEDIUM_BUFFER_SIZE` elements.
    medium_buffers: Vec<PooledBuffer>,
    /// Buffers of `LARGE_BUFFER_SIZE` elements.
    large_buffers: Vec<PooledBuffer>,
    /// Oversized buffers, each sized for the request that created it.
    xlarge_buffers: Vec<PooledBuffer>,
    /// Counters for allocations, deallocations, hits, and misses.
    stats: PoolStats,
    /// Monotonic counter bumped on every pool operation.
    op_counter: u64,
    /// Op counter value at the last cleanup pass.
    last_cleanup_counter: u64,
    /// Wall-clock time of the last cleanup pass.
    last_cleanup_instant: Instant,
}

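/// Size classes used to route a request to the matching buffer list.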
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SizeClass {
    Small,
    Medium,
    Large,
    XLarge,
}

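// Thread-local pool instance and the flags that control padding and pooling.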
thread_local! {
    static MEMORY_POOL: RefCell<TensorMemoryPool> = RefCell::new(TensorMemoryPool::new());
    static NO_MEM_PADDING: Cell<bool> = const { Cell::new(false) };
    static USE_POOL_ALLOC: Cell<bool> = const { Cell::new(true) };
}

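/// SIMD instruction-set level detected at runtime.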
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SimdLevel {
    #[cfg(target_arch = "x86_64")]
    Avx512,
    #[cfg(target_arch = "x86_64")]
    Avx2,
    #[cfg(target_arch = "x86_64")]
    Sse2,
    Scalar,
}

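/// Detects the best available SIMD level on the current CPU, falling back to
/// `Scalar` on non-x86_64 targets.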
#[inline]
pub fn detect_runtime_simd() -> SimdLevel {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            return SimdLevel::Avx512;
        }
        if is_x86_feature_detected!("avx2") {
            return SimdLevel::Avx2;
        }
        if is_x86_feature_detected!("sse2") {
            return SimdLevel::Sse2;
        }

        SimdLevel::Scalar
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        SimdLevel::Scalar
    }
}

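/// Number of f32 lanes processed per SIMD register at the given level.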
#[inline]
pub(crate) fn simd_lane_width_elems(level: SimdLevel) -> usize {
    match level {
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx512 => 16,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx2 => 8,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Sse2 => 4,
        SimdLevel::Scalar => 1,
    }
}

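/// Byte alignment required for aligned loads/stores at the given SIMD level.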
#[inline]
pub fn simd_alignment_bytes(level: SimdLevel) -> usize {
    match level {
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx512 => 64,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Avx2 => 32,
        #[cfg(target_arch = "x86_64")]
        SimdLevel::Sse2 => 16,
        SimdLevel::Scalar => 16,
    }
}

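/// Computes `(alignment_bytes, padded_elems)` for a requested element count:
/// the alignment follows the detected SIMD level, and the element count is
/// rounded up to a multiple of the SIMD lane width unless padding is disabled
/// via `NoMemPaddingGuard` or the request is empty.
///
/// Illustrative doc example (marked `ignore`; exact values depend on the CPU):
/// ```ignore
/// let (align, padded) = compute_allocation_params(33);
/// let lane = simd_lane_width_elems(detect_runtime_simd());
/// assert!(align >= 16 && padded >= 33 && padded % lane == 0);
/// ```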
#[inline]
pub fn compute_allocation_params(requested_elems: usize) -> (usize, usize) {
    let level = detect_runtime_simd();
    #[cfg(target_arch = "x86_64")]
    let mut align = simd_alignment_bytes(level);
    #[cfg(not(target_arch = "x86_64"))]
    let align = simd_alignment_bytes(level);

    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx512f") {
            align = 64;
        } else if is_x86_feature_detected!("avx2") {
            align = align.max(32);
        }
    }

    if no_mem_padding_enabled() || requested_elems == 0 {
        (align, requested_elems)
    } else {
        let lane = simd_lane_width_elems(level);
        let padded = requested_elems.div_ceil(lane) * lane;
        (align, padded)
    }
}

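/// Returns whether pool-backed allocation is currently enabled on this thread.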
#[inline]
pub fn use_pool_alloc_enabled() -> bool {
    USE_POOL_ALLOC.with(|flag| flag.get())
}

impl PooledBuffer {
    fn new(size: usize, alignment: usize) -> Self {
        let effective_alignment = alignment.max(std::mem::align_of::<f32>());
        let layout =
            Layout::from_size_align(size * std::mem::size_of::<f32>(), effective_alignment)
                .expect("Invalid layout for pooled buffer");
        let alloc =
            crate::tensor::core::Allocation::new_uninitialized(size, effective_alignment, layout);
        let addr = alloc.ptr.as_ptr() as usize;
        assert_eq!(
            addr % alignment,
            0,
            "System allocator failed to provide {}-byte aligned memory. Got address 0x{:x} (misalignment {})",
            alignment,
            addr,
            addr % alignment
        );
        PooledBuffer {
            alloc,
            in_use: false,
            last_used_counter: 0,
        }
    }

    #[inline(always)]
    pub fn as_ptr(&self) -> NonNull<f32> {
        self.alloc.ptr
    }

    #[inline(always)]
    pub fn size(&self) -> usize {
        self.alloc.capacity_elems()
    }

    /// Marks the buffer as checked out; returns `false` if it is already in use.
    #[inline]
    fn allocate_for_tensor(&mut self, now_counter: u64) -> bool {
        if self.in_use {
            false
        } else {
            self.in_use = true;
            self.last_used_counter = now_counter;
            true
        }
    }

    /// Marks the buffer as free again and records when it was returned.
    #[inline]
    fn return_to_pool(&mut self, now_counter: u64) {
        self.in_use = false;
        self.last_used_counter = now_counter;
    }

    #[inline(always)]
    pub fn is_available(&self) -> bool {
        !self.in_use
    }
}

impl TensorMemoryPool {
    pub fn new() -> Self {
        TensorMemoryPool {
            small_buffers: Vec::with_capacity(TARGET_SMALL_BUFFERS),
            medium_buffers: Vec::with_capacity(TARGET_MEDIUM_BUFFERS),
            large_buffers: Vec::with_capacity(TARGET_LARGE_BUFFERS),
            xlarge_buffers: Vec::with_capacity(4),
            stats: PoolStats::new(),
            op_counter: 0,
            last_cleanup_counter: 0,
            last_cleanup_instant: Instant::now(),
        }
    }

    fn try_allocate(&mut self, size: usize, alignment: usize) -> Option<NonNull<f32>> {
        let size_class = self.classify_size(size);

        self.try_allocate_internal(size, alignment, size_class)
    }

    fn try_allocate_internal(
        &mut self,
        size: usize,
        alignment: usize,
        size_class: SizeClass,
    ) -> Option<NonNull<f32>> {
        self.maybe_cleanup();
        match size_class {
            SizeClass::Small => {
                self.try_allocate_from_small_pool(SMALL_BUFFER_SIZE, alignment, size_class)
            }
            SizeClass::Medium => {
                self.try_allocate_from_medium_pool(MEDIUM_BUFFER_SIZE, alignment, size_class)
            }
            SizeClass::Large => {
                self.try_allocate_from_large_pool(LARGE_BUFFER_SIZE, alignment, size_class)
            }
            SizeClass::XLarge => {
                let planned = TensorMemoryPool::planned_capacity_elems(size);
                self.try_allocate_from_xlarge_pool(planned, alignment, size_class)
            }
        }
    }

    fn try_allocate_from_small_pool(
        &mut self,
        buffer_size: usize,
        alignment: usize,
        _size_class: SizeClass,
    ) -> Option<NonNull<f32>> {
        let nowc = self.bump_op_counter();
        for buffer in self.small_buffers.iter_mut() {
            if buffer.is_available()
                && buffer.alloc.alignment() >= alignment
                && buffer.allocate_for_tensor(nowc)
            {
                self.stats.record_allocation_hit(buffer_size);
                return Some(buffer.as_ptr());
            }
        }
        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
        if new_buffer.allocate_for_tensor(nowc) {
            let ptr = new_buffer.as_ptr();
            self.small_buffers.push(new_buffer);
            self.stats
                .record_allocation_miss(buffer_size, "new_buffer_created");
            Some(ptr)
        } else {
            None
        }
    }

    fn try_allocate_from_medium_pool(
        &mut self,
        buffer_size: usize,
        alignment: usize,
        _size_class: SizeClass,
    ) -> Option<NonNull<f32>> {
        let nowc = self.bump_op_counter();
        for buffer in self.medium_buffers.iter_mut() {
            if buffer.is_available()
                && buffer.alloc.alignment() >= alignment
                && buffer.allocate_for_tensor(nowc)
            {
                self.stats.record_allocation_hit(buffer_size);
                return Some(buffer.as_ptr());
            }
        }
        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
        if new_buffer.allocate_for_tensor(nowc) {
            let ptr = new_buffer.as_ptr();
            self.medium_buffers.push(new_buffer);
            self.stats
                .record_allocation_miss(buffer_size, "new_buffer_created");
            Some(ptr)
        } else {
            None
        }
    }

    fn try_allocate_from_large_pool(
        &mut self,
        buffer_size: usize,
        alignment: usize,
        _size_class: SizeClass,
    ) -> Option<NonNull<f32>> {
        let nowc = self.bump_op_counter();
        for buffer in self.large_buffers.iter_mut() {
            if buffer.is_available()
                && buffer.alloc.alignment() >= alignment
                && buffer.allocate_for_tensor(nowc)
            {
                self.stats.record_allocation_hit(buffer_size);
                return Some(buffer.as_ptr());
            }
        }
        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
        if new_buffer.allocate_for_tensor(nowc) {
            let ptr = new_buffer.as_ptr();
            self.large_buffers.push(new_buffer);
            self.stats
                .record_allocation_miss(buffer_size, "new_buffer_created");
            Some(ptr)
        } else {
            None
        }
    }

    fn try_allocate_from_xlarge_pool(
        &mut self,
        buffer_size: usize,
        alignment: usize,
        _size_class: SizeClass,
    ) -> Option<NonNull<f32>> {
        let nowc = self.bump_op_counter();
        for buffer in self.xlarge_buffers.iter_mut() {
            if buffer.is_available()
                && buffer.size() >= buffer_size
                && buffer.alloc.alignment() >= alignment
                && buffer.allocate_for_tensor(nowc)
            {
                self.stats.record_allocation_hit(buffer_size);
                return Some(buffer.as_ptr());
            }
        }
        let mut new_buffer = PooledBuffer::new(buffer_size, alignment);
        if new_buffer.allocate_for_tensor(nowc) {
            let ptr = new_buffer.as_ptr();
            self.xlarge_buffers.push(new_buffer);
            self.stats
                .record_allocation_miss(buffer_size, "new_buffer_created");
            Some(ptr)
        } else {
            None
        }
    }

    #[inline]
    fn classify_size(&self, size: usize) -> SizeClass {
        if size <= SMALL_BUFFER_SIZE {
            SizeClass::Small
        } else if size <= MEDIUM_BUFFER_SIZE {
            SizeClass::Medium
        } else if size <= LARGE_BUFFER_SIZE {
            SizeClass::Large
        } else {
            SizeClass::XLarge
        }
    }

    #[cfg(test)]
    fn stats(&self) -> &PoolStats {
        &self.stats
    }
}

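/// RAII guard that disables SIMD padding for the current thread and restores
/// the previous setting on drop.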
#[allow(dead_code)]
pub struct NoMemPaddingGuard {
    prev: bool,
}

impl Drop for NoMemPaddingGuard {
    fn drop(&mut self) {
        let _ = NO_MEM_PADDING.try_with(|flag| flag.set(self.prev));
    }
}

impl NoMemPaddingGuard {
    #[allow(dead_code)]
    pub fn new() -> Self {
        let prev = NO_MEM_PADDING.with(|flag| {
            let old = flag.get();
            flag.set(true);
            old
        });
        NoMemPaddingGuard { prev }
    }
}

impl Default for NoMemPaddingGuard {
    fn default() -> Self {
        Self::new()
    }
}

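/// RAII guard that disables pool-backed allocation for the current thread and
/// restores the previous setting on drop.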
pub struct NoMemPoolGuard {
    prev: bool,
}

impl Drop for NoMemPoolGuard {
    fn drop(&mut self) {
        let _ = USE_POOL_ALLOC.try_with(|flag| flag.set(self.prev));
    }
}

impl NoMemPoolGuard {
    pub fn new() -> Self {
        let prev = USE_POOL_ALLOC.with(|flag| {
            let old = flag.get();
            flag.set(false);
            old
        });
        NoMemPoolGuard { prev }
    }
}

impl Default for NoMemPoolGuard {
    fn default() -> Self {
        Self::new()
    }
}

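/// Runs `f` with pool-backed allocation disabled on this thread, restoring the
/// previous setting afterwards.
///
/// Illustrative use, mirroring the tests in this module:
/// ```ignore
/// let _t = with_no_mem_pool(|| crate::tensor::Tensor::new(vec![64]));
/// ```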
#[inline]
pub fn with_no_mem_pool<F, R>(f: F) -> R
where
    F: FnOnce() -> R,
{
    let _guard = NoMemPoolGuard::new();
    f()
}

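/// Runs `f` with SIMD padding disabled on this thread, restoring the previous
/// setting afterwards.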
#[inline]
#[allow(dead_code)]
pub fn with_no_mem_padding<F, R>(f: F) -> R
where
    F: FnOnce() -> R,
{
    let _guard = NoMemPaddingGuard::new();
    f()
}

#[inline]
pub fn no_mem_padding_enabled() -> bool {
    NO_MEM_PADDING.with(|flag| flag.get())
}

impl TensorMemoryPool {
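    /// Capacity (in f32 elements) the pool actually reserves for a request:
    /// the fixed class size for small/medium/large requests, and a doubled,
    /// lower-bounded capacity for oversized (xlarge) requests.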
    pub fn planned_capacity_elems(requested_elems: usize) -> usize {
        if requested_elems <= SMALL_BUFFER_SIZE {
            SMALL_BUFFER_SIZE
        } else if requested_elems <= MEDIUM_BUFFER_SIZE {
            MEDIUM_BUFFER_SIZE
        } else if requested_elems <= LARGE_BUFFER_SIZE {
            LARGE_BUFFER_SIZE
        } else {
            (requested_elems * 2).max(262144 * 2)
        }
    }
}

impl PoolStats {
    fn new() -> Self {
        PoolStats {
            allocations: 0,
            deallocations: 0,
            pool_hits: 0,
            pool_misses: 0,
            current_usage: 0,
            peak_usage: 0,
        }
    }

    fn record_allocation_hit(&mut self, buffer_size: usize) {
        self.allocations += 1;
        self.pool_hits += 1;
        self.current_usage += buffer_size;
        if self.current_usage > self.peak_usage {
            self.peak_usage = self.current_usage;
        }
    }

    fn record_allocation_miss(&mut self, _buffer_size: usize, _reason: &str) {
        self.allocations += 1;
        self.pool_misses += 1;
    }

    fn record_deallocation(&mut self, size: usize) {
        self.deallocations += 1;
        self.current_usage = self.current_usage.saturating_sub(size);
    }
}

impl TensorMemoryPool {
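    /// Allocates an aligned buffer of at least `size` f32 elements from the
    /// current thread's pool, creating a new pooled buffer on a miss.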
    pub fn allocate(size: usize, alignment: usize) -> Option<NonNull<f32>> {
        MEMORY_POOL.with(|pool| pool.borrow_mut().try_allocate(size, alignment))
    }

    /// Attempts to return `ptr` to the current thread's pool. `None` means the
    /// thread-local pool was already torn down; `Some(false)` means the pointer
    /// was not found in any size class.
    pub fn try_deallocate(ptr: NonNull<f32>) -> Option<bool> {
        MEMORY_POOL
            .try_with(|pool| {
                let mut pool_mut = pool.borrow_mut();
                pool_mut.return_to_pool(ptr)
            })
            .ok()
    }

    fn return_to_pool(&mut self, ptr: NonNull<f32>) -> bool {
        if self.return_to_small_pool(ptr) {
            self.maybe_cleanup();
            return true;
        }
        if self.return_to_medium_pool(ptr) {
            self.maybe_cleanup();
            return true;
        }
        if self.return_to_large_pool(ptr) {
            self.maybe_cleanup();
            return true;
        }
        if self.return_to_xlarge_pool(ptr) {
            self.maybe_cleanup();
            return true;
        }

        false
    }

    fn return_to_small_pool(&mut self, ptr: NonNull<f32>) -> bool {
        let nowc = self.bump_op_counter();
        for buffer in self.small_buffers.iter_mut() {
            if buffer.as_ptr() == ptr {
                buffer.return_to_pool(nowc);
                self.stats.record_deallocation(buffer.size());
                return true;
            }
        }
        false
    }

    fn return_to_medium_pool(&mut self, ptr: NonNull<f32>) -> bool {
        let nowc = self.bump_op_counter();
        for buffer in self.medium_buffers.iter_mut() {
            if buffer.as_ptr() == ptr {
                buffer.return_to_pool(nowc);
                self.stats.record_deallocation(buffer.size());
                return true;
            }
        }
        false
    }

    fn return_to_large_pool(&mut self, ptr: NonNull<f32>) -> bool {
        let nowc = self.bump_op_counter();
        for buffer in self.large_buffers.iter_mut() {
            if buffer.as_ptr() == ptr {
                buffer.return_to_pool(nowc);
                self.stats.record_deallocation(buffer.size());
                return true;
            }
        }
        false
    }

    fn return_to_xlarge_pool(&mut self, ptr: NonNull<f32>) -> bool {
        let nowc = self.bump_op_counter();
        for buffer in self.xlarge_buffers.iter_mut() {
            if buffer.as_ptr() == ptr {
                buffer.return_to_pool(nowc);
                self.stats.record_deallocation(buffer.size());
                return true;
            }
        }
        false
    }

    #[cfg(test)]
    pub fn thread_stats() -> PoolStats {
        MEMORY_POOL.with(|pool| *pool.borrow().stats())
    }

    #[cfg(test)]
    pub fn pool_sizes() -> (usize, usize, usize, usize) {
        MEMORY_POOL.with(|pool| {
            let p = pool.borrow();
            (
                p.small_buffers.len(),
                p.medium_buffers.len(),
                p.large_buffers.len(),
                p.xlarge_buffers.len(),
            )
        })
    }
}

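// Cleanup policy: trimming only runs after at least CLEANUP_MIN_OPS pool
// operations and CLEANUP_MIN_INTERVAL_MS of wall-clock time since the last
// pass, and it only removes idle buffers untouched for UNUSED_OPS_THRESHOLD
// operations, keeping each class's target count plus headroom for in-use buffers.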
impl TensorMemoryPool {
    #[inline]
    fn bump_op_counter(&mut self) -> u64 {
        self.op_counter = self.op_counter.wrapping_add(1);
        self.op_counter
    }

    #[inline]
    fn should_cleanup(&self) -> bool {
        let ops_since = self.op_counter.wrapping_sub(self.last_cleanup_counter);
        if ops_since < CLEANUP_MIN_OPS {
            return false;
        }
        let elapsed = self.last_cleanup_instant.elapsed();
        elapsed.as_millis() as u64 >= CLEANUP_MIN_INTERVAL_MS
    }

    fn maybe_cleanup(&mut self) {
        if !self.should_cleanup() {
            return;
        }

        let nowc = self.op_counter;
        Self::cleanup_pool_vec(
            &mut self.small_buffers,
            TARGET_SMALL_BUFFERS,
            HEADROOM_SMALL,
            nowc,
        );
        Self::cleanup_pool_vec(
            &mut self.medium_buffers,
            TARGET_MEDIUM_BUFFERS,
            HEADROOM_MEDIUM,
            nowc,
        );
        Self::cleanup_pool_vec(
            &mut self.large_buffers,
            TARGET_LARGE_BUFFERS,
            HEADROOM_LARGE,
            nowc,
        );
        Self::cleanup_pool_vec(&mut self.xlarge_buffers, 2, HEADROOM_XLARGE, nowc);

        self.last_cleanup_counter = self.op_counter;
        self.last_cleanup_instant = Instant::now();
    }

    fn cleanup_pool_vec(
        vec: &mut Vec<PooledBuffer>,
        target: usize,
        headroom: usize,
        now_counter: u64,
    ) {
        if vec.is_empty() {
            return;
        }
        let in_use = vec.iter().filter(|b| !b.is_available()).count();
        let desired = core::cmp::max(target, in_use.saturating_add(headroom));
        if vec.len() <= desired {
            return;
        }

        let mut eligible: Vec<(usize, u64)> = vec
            .iter()
            .enumerate()
            .filter(|(_i, b)| b.is_available())
            .map(|(i, b)| (i, now_counter.wrapping_sub(b.last_used_counter)))
            .filter(|(_i, age_ops)| *age_ops >= UNUSED_OPS_THRESHOLD)
            .collect();

        if eligible.is_empty() {
            return;
        }

        eligible.sort_by_key(|(_i, age)| core::cmp::Reverse(*age));

        let excess = vec.len().saturating_sub(desired);
        let to_remove = core::cmp::min(excess, eligible.len());
        if to_remove == 0 {
            return;
        }

        let mut to_drop: Vec<usize> = eligible.iter().take(to_remove).map(|(i, _)| *i).collect();
        to_drop.sort_unstable_by(|a, b| b.cmp(a));
        for idx in to_drop {
            vec.remove(idx);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_with_no_mem_padding_guard_scoping() {
        assert!(!no_mem_padding_enabled());
        {
            let _g = NoMemPaddingGuard::new();
            assert!(no_mem_padding_enabled());
        }
        assert!(!no_mem_padding_enabled());
    }

    #[test]
    fn test_compute_allocation_params_padding_behavior() {
        let (align1, padded1) = compute_allocation_params(33);
        let lane = simd_lane_width_elems(detect_runtime_simd());
        assert!(padded1 >= 33);
        assert_eq!(padded1 % lane, 0);
        assert!(align1 >= 16);

        let res = with_no_mem_padding(|| compute_allocation_params(33));

        assert_eq!(res.1, 33);
    }

    #[test]
    fn test_same_thread_alloc_dealloc_counters_across_classes() {
        let before = TensorMemoryPool::thread_stats();
        {
            let lane = simd_lane_width_elems(detect_runtime_simd());
            let sizes = [
                SMALL_BUFFER_SIZE.min(8),
                MEDIUM_BUFFER_SIZE / 2,
                LARGE_BUFFER_SIZE / 2,
                LARGE_BUFFER_SIZE + lane * 3 + 7,
            ];
            for &n in &sizes {
                let _t = crate::tensor::Tensor::new(vec![n]);
            }
        }
        let after = TensorMemoryPool::thread_stats();
        assert!(after.allocations >= before.allocations + 4);
        assert!(after.deallocations >= before.deallocations + 4);
    }

    #[test]
    fn test_xlarge_pool_does_not_reuse_too_small_buffer() {
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let align = simd_alignment_bytes(detect_runtime_simd());
        let small_xlarge = LARGE_BUFFER_SIZE + lane * 2;
        let _t1 = crate::tensor::Tensor::new(vec![small_xlarge]);
        let larger = small_xlarge * 2 + lane * 3;
        let ptr_opt = MEMORY_POOL.with(|pool| {
            let mut p = pool.borrow_mut();
            p.try_allocate_from_xlarge_pool(larger, align, SizeClass::XLarge)
        });
        assert!(ptr_opt.is_some());
    }

    #[test]
    fn test_cross_thread_drop_safe_no_crash() {
        use std::thread;
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let n = LARGE_BUFFER_SIZE + lane * 2 + 3;
        let t = crate::tensor::Tensor::new(vec![n]);
        let handle = thread::spawn(move || {
            drop(t);
        });
        let _ = handle.join();
    }

    #[test]
    fn test_try_deallocate_returns_some_true_for_pooled() {
        let align = simd_alignment_bytes(detect_runtime_simd());
        let ptr = TensorMemoryPool::allocate(128, align).expect("pool allocate failed");
        let res = TensorMemoryPool::try_deallocate(ptr);
        assert_eq!(res, Some(true));
    }

    #[test]
    fn perf_pool_vs_no_pool_by_category_over_1000_iterations() {
        use std::time::Instant;

        let small = vec![32, 32];
        let medium = vec![256, 256];
        let large = vec![1024, 1024];
        let xlarge = vec![1200, 1200];

        fn bench_shape(shape: &[usize], iters: usize) -> std::time::Duration {
            let start = Instant::now();
            let mut sink = 0.0f32;
            for i in 0..iters {
                let t0 = crate::tensor::Tensor::ones(shape.to_vec());
                let t1 = t0.add_scalar((i % 5) as f32 * 0.1);
                let t2 = t1.mul_scalar(1.2345);
                let s = t2.sum();
                sink += s.value();
            }
            assert!(sink.is_finite());
            start.elapsed()
        }

        let iters = 1000usize;

        let cats: [(&str, Vec<usize>); 4] = [
            ("small", small),
            ("medium", medium),
            ("large", large),
            ("xlarge", xlarge),
        ];

        for (name, shape) in cats.iter() {
            let pooled = bench_shape(shape, iters);
            let system = super::with_no_mem_pool(|| bench_shape(shape, iters));
            let pooled_ms = pooled.as_secs_f64() * 1_000.0;
            let system_ms = system.as_secs_f64() * 1_000.0;
            let speedup = if pooled_ms > 0.0 {
                system_ms / pooled_ms
            } else {
                0.0
            };
            println!(
                "Perf [{} | {:?} elems]: pooled={:.2} ms, no_pool={:.2} ms, speedup={:.2}x (iters={})",
                name,
                shape.iter().product::<usize>(),
                pooled_ms,
                system_ms,
                speedup,
                iters
            );

            assert!(pooled > std::time::Duration::from_millis(0));
            assert!(system > std::time::Duration::from_millis(0));
        }
    }
}

#[cfg(test)]
mod xlarge_stress_tests {
    use super::*;

    #[test]
    fn stress_xlarge_pool_various_sizes_single_thread() {
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let sizes = [
            LARGE_BUFFER_SIZE + 1,
            LARGE_BUFFER_SIZE * 2 + lane - 1,
            LARGE_BUFFER_SIZE * 3 + 17,
            LARGE_BUFFER_SIZE * 4 + lane * 3 + 5,
            LARGE_BUFFER_SIZE * 6 + 123,
        ];
        for _ in 0..1000 {
            for &n in &sizes {
                let elems = n;
                let mut t = crate::tensor::Tensor::new(vec![elems]);
                if elems > 0 {
                    t.set(&[0], 0.0);
                }
                assert_eq!(t.size(), elems);
            }
        }
    }

    #[test]
    fn stress_xlarge_pool_multithreaded() {
        use std::thread;
        let lane = simd_lane_width_elems(detect_runtime_simd());
        let sizes = [
            LARGE_BUFFER_SIZE + 1,
            LARGE_BUFFER_SIZE * 2 + lane - 1,
            LARGE_BUFFER_SIZE * 3 + 17,
            LARGE_BUFFER_SIZE * 4 + lane * 3 + 5,
            LARGE_BUFFER_SIZE * 6 + 123,
        ];
        let threads = 8usize.min(
            std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(8),
        );
        let mut handles = Vec::new();
        for tid in 0..threads {
            let sizes_clone = sizes;
            handles.push(thread::spawn(move || {
                for r in 0..20 {
                    for (i, n) in sizes_clone.iter().enumerate() {
                        let elems = n + (tid * 13 + r * 7 + i) % lane;
                        let mut t = crate::tensor::Tensor::new(vec![elems]);
                        assert_eq!(t.size(), elems);
                        if elems > 0 {
                            let idx0 = elems / 2;
                            let idx1 = (elems.saturating_sub(1)) / 3;
                            let idx2 = (elems.saturating_sub(1)) / 5;
                            if idx0 < t.size() {
                                t.set(&[idx0], 1.2345);
                            }
                            if idx1 < t.size() {
                                t.set(&[idx1], 2.3456);
                            }
                            if idx2 < t.size() {
                                t.set(&[idx2], 3.4567);
                            }
                        }
                    }
                }
            }));
        }
        for h in handles {
            let _ = h.join();
        }
    }
}

#[cfg(test)]
mod additional_safety_tests {
    use super::*;

    #[test]
    fn test_pool_alloc_dealloc_balanced_small_medium_large() {
        let before = TensorMemoryPool::thread_stats();
        {
            let _s1 = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(16)]);
            let _m1 = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 4]);
            let _l1 = crate::tensor::Tensor::new(vec![LARGE_BUFFER_SIZE / 4]);
        }
        let after = TensorMemoryPool::thread_stats();
        assert!(
            after.allocations >= before.allocations + 3,
            "allocations did not increase as expected: before={}, after={}",
            before.allocations,
            after.allocations
        );
        assert!(
            after.deallocations >= before.deallocations + 3,
            "deallocations did not increase as expected: before={}, after={}",
            before.deallocations,
            after.deallocations
        );
        assert!(
            after.current_usage <= before.current_usage,
            "current_usage grew: before={}, after={}",
            before.current_usage,
            after.current_usage
        );
    }

    #[test]
    fn test_pointer_alignment_across_classes() {
        let align = simd_alignment_bytes(detect_runtime_simd());
        for &n in &[
            8usize,
            SMALL_BUFFER_SIZE,
            MEDIUM_BUFFER_SIZE,
            LARGE_BUFFER_SIZE + 128,
        ] {
            let t = crate::tensor::Tensor::new(vec![n]);
            unsafe {
                let addr = t.as_ptr() as usize;
                assert_eq!(
                    addr % align,
                    0,
                    "pointer not aligned to {} for n={}",
                    align,
                    n
                );
            }
        }
    }

    #[test]
    fn test_with_no_mem_pool_uses_system_allocator_no_pool_stats() {
        let before = TensorMemoryPool::thread_stats();
        with_no_mem_pool(|| {
            let _t1 = crate::tensor::Tensor::new(vec![64]);
            let _t2 = crate::tensor::Tensor::new(vec![2048]);
            let _t3 = crate::tensor::Tensor::new(vec![131072]);
        });
        let after = TensorMemoryPool::thread_stats();
        assert_eq!(
            after.allocations, before.allocations,
            "pool allocations changed with pool disabled: before={}, after={}",
            before.allocations, after.allocations
        );
        assert_eq!(
            after.deallocations, before.deallocations,
            "pool deallocations changed with pool disabled: before={}, after={}",
            before.deallocations, after.deallocations
        );
    }

    #[test]
    fn test_cross_thread_drop_does_not_affect_this_thread_stats() {
        let before = TensorMemoryPool::thread_stats();
        let handle =
            std::thread::spawn(|| crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(32)]));
        let t = handle.join().unwrap();
        drop(t);
        let after = TensorMemoryPool::thread_stats();
        assert_eq!(
            after.allocations, before.allocations,
            "allocations changed in current thread due to cross-thread drop: before={}, after={}",
            before.allocations, after.allocations
        );
        assert_eq!(
            after.deallocations, before.deallocations,
            "deallocations changed in current thread due to cross-thread drop: before={}, after={}",
            before.deallocations, after.deallocations
        );
    }

    #[test]
    fn test_many_alloc_dealloc_cycles_no_growth_in_current_usage() {
        let before = TensorMemoryPool::thread_stats();
        for _ in 0..100 {
            let _t1 = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(64)]);
            let _t2 = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 8]);
        }
        let after = TensorMemoryPool::thread_stats();
        assert!(
            after.current_usage <= before.current_usage + SMALL_BUFFER_SIZE + MEDIUM_BUFFER_SIZE,
            "current_usage unexpected growth: before={}, after={}",
            before.current_usage,
            after.current_usage
        );
    }
}

#[cfg(test)]
mod cleanup_tests {
    use super::*;
    use std::thread;
    use std::time::Duration;

    fn hold_tensors(count: usize, elems: usize) -> Vec<crate::tensor::Tensor> {
        let mut v = Vec::with_capacity(count);
        for _ in 0..count {
            v.push(crate::tensor::Tensor::new(vec![elems]));
        }
        v
    }

    fn bump_ops_small_iters(iters: usize) {
        for _ in 0..iters {
            let _t = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(8)]);
        }
    }

    #[test]
    fn test_no_cleanup_while_many_small_buffers_in_use() {
        let holders = hold_tensors(40, SMALL_BUFFER_SIZE.min(32));
        let (small_before, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_before >= 40,
            "expected >=40 small buffers, got {}",
            small_before
        );

        bump_ops_small_iters(1500);
        thread::sleep(Duration::from_millis(2100));
        bump_ops_small_iters(700);
        {
            let _m = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 2]);
        }

        let (small_mid, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_mid >= small_before,
            "small pool shrank while heavily in-use: before={} after={}",
            small_before,
            small_mid
        );

        drop(holders);

        let _ = crate::tensor::Tensor::new(vec![MEDIUM_BUFFER_SIZE / 2]);
        let (small_after, _, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            small_after >= small_before,
            "small pool unexpectedly trimmed active buffers: before={} after={}",
            small_before,
            small_after
        );
    }

    #[test]
    fn test_cleanup_trims_long_idle_medium_buffers() {
        {
            let _holders = hold_tensors(30, MEDIUM_BUFFER_SIZE / 2);
        }
        let (_, med_before, _, _) = TensorMemoryPool::pool_sizes();
        assert!(
            med_before >= 30,
            "expected >=30 medium buffers, got {}",
            med_before
        );

        bump_ops_small_iters(2300);
        thread::sleep(Duration::from_millis(2100));

        let _ = crate::tensor::Tensor::new(vec![SMALL_BUFFER_SIZE.min(16)]);
        let (_, med_after, _, _) = TensorMemoryPool::pool_sizes();

        assert!(
            med_after < med_before,
            "medium pool not trimmed despite long idle: before={} after={}",
            med_before,
            med_after
        );
    }
}