// cjc_runtime/aligned_pool.rs
//! Aligned memory pool -- 16-byte aligned allocation for SIMD readiness.
//!
//! Provides [`AlignedPool`] for raw 16-byte-aligned byte storage, and
//! [`AlignedByteSlice`] for transparent alignment of tensor weight data.
//! When source bytes are already aligned, no copy is performed; when
//! misaligned, a one-time aligned copy is made into the pool.
//!
//! # SIMD relevance
//!
//! AVX2 `_mm256_load_pd` requires 32-byte alignment (a misaligned address
//! faults), while 16-byte alignment avoids cache-line-split penalties on
//! x86-64 CPUs. The [`tensor_simd`](crate::tensor_simd) module uses
//! unaligned loads (`_mm256_loadu_pd`) so alignment is not strictly
//! required, but aligned data is faster on older microarchitectures.

16use std::fmt;
17use std::rc::Rc;
18
19use crate::error::RuntimeError;
20use crate::tensor::Tensor;
21
22// ---------------------------------------------------------------------------
23// 2c. AlignedPool — 16-byte aligned allocation for SIMD readiness
24// ---------------------------------------------------------------------------
25
/// A pre-allocated memory pool with 16-byte alignment guarantee.
///
/// Used by `AlignedByteSlice` to ensure that f32/f64 data mapped from raw
/// bytes starts on a SIMD-friendly boundary. When source bytes are already
/// aligned, no copy is needed; when misaligned, a one-time aligned copy is
/// performed into the pool.
///
/// `Clone` is implemented manually (see below): the clone gets a fresh
/// allocation whose base address generally differs from the original's, so
/// `aligned_offset` must be recomputed rather than copied.
#[derive(Debug)]
pub struct AlignedPool {
    /// Backing storage. The Vec itself is heap-allocated with alignment ≥ 8.
    /// We over-allocate by 15 bytes and track the aligned offset.
    storage: Vec<u8>,
    /// Byte offset into `storage` where the aligned region begins.
    aligned_offset: usize,
    /// Usable capacity (bytes) from the aligned offset.
    capacity: usize,
    /// Number of bytes currently written.
    len: usize,
}

impl Clone for AlignedPool {
    /// Clone into a fresh allocation, recomputing the aligned offset.
    ///
    /// A derived `Clone` would copy `aligned_offset` verbatim while the
    /// cloned `Vec` may start at a different address mod 16, silently
    /// breaking the 16-byte alignment invariant of the clone.
    fn clone(&self) -> Self {
        // Same over-allocation scheme as `new`: +15 bytes guarantees a
        // 16-byte boundary exists inside the buffer.
        let storage = vec![0u8; self.capacity + 15];
        let aligned_offset = (16 - (storage.as_ptr() as usize % 16)) % 16;
        let mut cloned = AlignedPool {
            storage,
            aligned_offset,
            capacity: self.capacity,
            len: self.len,
        };
        // Copy the live bytes from the source's aligned region into ours.
        let src = &self.storage[self.aligned_offset..self.aligned_offset + self.len];
        cloned.storage[aligned_offset..aligned_offset + self.len].copy_from_slice(src);
        cloned
    }
}
44
45impl AlignedPool {
46 /// Create a new pool with capacity for at least `capacity_bytes` of
47 /// 16-byte-aligned data. The actual allocation may be slightly larger.
48 pub fn new(capacity_bytes: usize) -> Self {
49 // Over-allocate by 15 bytes so we can always find a 16-byte boundary.
50 let alloc_size = capacity_bytes + 15;
51 let storage = vec![0u8; alloc_size];
52 let base_ptr = storage.as_ptr() as usize;
53 let aligned_offset = (16 - (base_ptr % 16)) % 16;
54 AlignedPool {
55 storage,
56 aligned_offset,
57 capacity: capacity_bytes,
58 len: 0,
59 }
60 }
61
62 /// Returns a pointer to the aligned region.
63 pub fn as_ptr(&self) -> *const u8 {
64 // SAFETY: aligned_offset is always within bounds by construction.
65 unsafe { self.storage.as_ptr().add(self.aligned_offset) }
66 }
67
68 /// Returns a mutable pointer to the aligned region.
69 pub fn as_mut_ptr(&mut self) -> *mut u8 {
70 unsafe { self.storage.as_mut_ptr().add(self.aligned_offset) }
71 }
72
73 /// Returns the aligned region as a byte slice.
74 pub fn as_bytes(&self) -> &[u8] {
75 &self.storage[self.aligned_offset..self.aligned_offset + self.len]
76 }
77
78 /// Check if a raw pointer is 16-byte aligned.
79 pub fn is_aligned_16(ptr: *const u8) -> bool {
80 (ptr as usize) % 16 == 0
81 }
82
83 /// Copy `data` into the pool, returning the aligned byte slice.
84 /// Returns an error if data exceeds pool capacity.
85 pub fn copy_from(&mut self, data: &[u8]) -> Result<(), RuntimeError> {
86 if data.len() > self.capacity {
87 return Err(RuntimeError::InvalidOperation(
88 format!(
89 "AlignedPool: data length {} exceeds capacity {}",
90 data.len(),
91 self.capacity
92 ),
93 ));
94 }
95 let dest = &mut self.storage[self.aligned_offset..self.aligned_offset + data.len()];
96 dest.copy_from_slice(data);
97 self.len = data.len();
98 Ok(())
99 }
100
101 /// Current number of bytes stored.
102 pub fn len(&self) -> usize {
103 self.len
104 }
105
106 /// Whether the pool is empty.
107 pub fn is_empty(&self) -> bool {
108 self.len == 0
109 }
110
111 /// Total capacity in bytes.
112 pub fn capacity(&self) -> usize {
113 self.capacity
114 }
115
116 /// Verify that the aligned pointer is indeed 16-byte aligned.
117 pub fn check_alignment(&self) -> bool {
118 Self::is_aligned_16(self.as_ptr())
119 }
120}
121
122impl fmt::Display for AlignedPool {
123 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124 write!(
125 f,
126 "AlignedPool(len={}, capacity={}, aligned={})",
127 self.len, self.capacity, self.check_alignment()
128 )
129 }
130}
131
/// An alignment-aware byte slice that guarantees 16-byte alignment for
/// tensor weight mapping. If the source bytes are already aligned, it
/// wraps them directly. If misaligned, it copies into an `AlignedPool`.
///
/// NOTE(review): when a copy is made, both the aligned copy and the
/// original `Rc<Vec<u8>>` stay alive, so memory for that buffer is held
/// twice — presumably acceptable for one-time weight loading; confirm
/// against callers.
#[derive(Debug, Clone)]
pub struct AlignedByteSlice {
    /// The aligned copy, present only when the source was misaligned;
    /// `None` means the original bytes are used in place.
    pool: Option<AlignedPool>,
    /// Original bytes (kept for reference / fallback; also the source of
    /// truth for `len` / `is_empty`).
    original: Rc<Vec<u8>>,
    /// Whether a copy was performed (true = source was misaligned).
    was_copied: bool,
}
144
145impl AlignedByteSlice {
146 /// Create an aligned byte slice from raw bytes.
147 ///
148 /// If the data is already 16-byte aligned, no copy is performed.
149 /// If misaligned, the data is copied into an aligned pool and a
150 /// warning flag is set.
151 pub fn from_bytes(data: Rc<Vec<u8>>) -> Self {
152 let ptr = data.as_ptr();
153 if AlignedPool::is_aligned_16(ptr) {
154 AlignedByteSlice {
155 pool: None,
156 original: data,
157 was_copied: false,
158 }
159 } else {
160 let mut pool = AlignedPool::new(data.len());
161 // This cannot fail: pool capacity == data.len()
162 pool.copy_from(&data).unwrap();
163 AlignedByteSlice {
164 pool: Some(pool),
165 original: data,
166 was_copied: true,
167 }
168 }
169 }
170
171 /// Get the aligned bytes. If a copy was needed, returns the pool's
172 /// bytes; otherwise returns the original directly.
173 pub fn as_bytes(&self) -> &[u8] {
174 match &self.pool {
175 Some(pool) => pool.as_bytes(),
176 None => &self.original,
177 }
178 }
179
180 /// Whether a copy was required for alignment.
181 pub fn was_realigned(&self) -> bool {
182 self.was_copied
183 }
184
185 /// Length in bytes.
186 pub fn len(&self) -> usize {
187 self.original.len()
188 }
189
190 /// Whether empty.
191 pub fn is_empty(&self) -> bool {
192 self.original.is_empty()
193 }
194
195 /// Map these aligned bytes to a Tensor, identical to Tensor::from_bytes
196 /// but with alignment guarantee.
197 pub fn as_tensor(&self, shape: &[usize], dtype: &str) -> Result<Tensor, RuntimeError> {
198 Tensor::from_bytes(self.as_bytes(), shape, dtype)
199 }
200}
201
202impl fmt::Display for AlignedByteSlice {
203 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204 write!(
205 f,
206 "AlignedByteSlice(len={}, realigned={})",
207 self.len(),
208 self.was_copied
209 )
210 }
211}
212