// cjc_runtime/aligned_pool.rs

//! Aligned memory pool -- 16-byte aligned allocation for SIMD readiness.
//!
//! Provides [`AlignedPool`] for raw 16-byte-aligned byte storage, and
//! [`AlignedByteSlice`] for transparent alignment of tensor weight data.
//! When source bytes are already aligned, no copy is performed; when
//! misaligned, a one-time aligned copy is made into the pool.
//!
//! # SIMD relevance
//!
//! AVX2 `_mm256_load_pd` requires 32-byte alignment (it faults on
//! misaligned addresses), while 16-byte alignment avoids page-split
//! penalties on all x86-64 CPUs. The [`tensor_simd`](crate::tensor_simd)
//! module uses unaligned loads (`_mm256_loadu_pd`) so alignment is not
//! strictly required, but aligned data is faster on older
//! microarchitectures.

use std::fmt;
use std::rc::Rc;

use crate::error::RuntimeError;
use crate::tensor::Tensor;

// ---------------------------------------------------------------------------
// 2c. AlignedPool — 16-byte aligned allocation for SIMD readiness
// ---------------------------------------------------------------------------

26/// A pre-allocated memory pool with 16-byte alignment guarantee.
27///
28/// Used by `AlignedByteSlice` to ensure that f32/f64 data mapped from raw
29/// bytes starts on a SIMD-friendly boundary. When source bytes are already
30/// aligned, no copy is needed; when misaligned, a one-time aligned copy is
31/// performed into the pool.
32#[derive(Debug, Clone)]
33pub struct AlignedPool {
34    /// Backing storage. The Vec itself is heap-allocated with alignment ≥ 8.
35    /// We over-allocate by 15 bytes and track the aligned offset.
36    storage: Vec<u8>,
37    /// Byte offset into `storage` where the aligned region begins.
38    aligned_offset: usize,
39    /// Usable capacity (bytes) from the aligned offset.
40    capacity: usize,
41    /// Number of bytes currently written.
42    len: usize,
43}
44
45impl AlignedPool {
46    /// Create a new pool with capacity for at least `capacity_bytes` of
47    /// 16-byte-aligned data. The actual allocation may be slightly larger.
48    pub fn new(capacity_bytes: usize) -> Self {
49        // Over-allocate by 15 bytes so we can always find a 16-byte boundary.
50        let alloc_size = capacity_bytes + 15;
51        let storage = vec![0u8; alloc_size];
52        let base_ptr = storage.as_ptr() as usize;
53        let aligned_offset = (16 - (base_ptr % 16)) % 16;
54        AlignedPool {
55            storage,
56            aligned_offset,
57            capacity: capacity_bytes,
58            len: 0,
59        }
60    }
61
62    /// Returns a pointer to the aligned region.
63    pub fn as_ptr(&self) -> *const u8 {
64        // SAFETY: aligned_offset is always within bounds by construction.
65        unsafe { self.storage.as_ptr().add(self.aligned_offset) }
66    }
67
68    /// Returns a mutable pointer to the aligned region.
69    pub fn as_mut_ptr(&mut self) -> *mut u8 {
70        unsafe { self.storage.as_mut_ptr().add(self.aligned_offset) }
71    }
72
73    /// Returns the aligned region as a byte slice.
74    pub fn as_bytes(&self) -> &[u8] {
75        &self.storage[self.aligned_offset..self.aligned_offset + self.len]
76    }
77
78    /// Check if a raw pointer is 16-byte aligned.
79    pub fn is_aligned_16(ptr: *const u8) -> bool {
80        (ptr as usize) % 16 == 0
81    }
82
83    /// Copy `data` into the pool, returning the aligned byte slice.
84    /// Returns an error if data exceeds pool capacity.
85    pub fn copy_from(&mut self, data: &[u8]) -> Result<(), RuntimeError> {
86        if data.len() > self.capacity {
87            return Err(RuntimeError::InvalidOperation(
88                format!(
89                    "AlignedPool: data length {} exceeds capacity {}",
90                    data.len(),
91                    self.capacity
92                ),
93            ));
94        }
95        let dest = &mut self.storage[self.aligned_offset..self.aligned_offset + data.len()];
96        dest.copy_from_slice(data);
97        self.len = data.len();
98        Ok(())
99    }
100
101    /// Current number of bytes stored.
102    pub fn len(&self) -> usize {
103        self.len
104    }
105
106    /// Whether the pool is empty.
107    pub fn is_empty(&self) -> bool {
108        self.len == 0
109    }
110
111    /// Total capacity in bytes.
112    pub fn capacity(&self) -> usize {
113        self.capacity
114    }
115
116    /// Verify that the aligned pointer is indeed 16-byte aligned.
117    pub fn check_alignment(&self) -> bool {
118        Self::is_aligned_16(self.as_ptr())
119    }
120}
121
122impl fmt::Display for AlignedPool {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        write!(
125            f,
126            "AlignedPool(len={}, capacity={}, aligned={})",
127            self.len, self.capacity, self.check_alignment()
128        )
129    }
130}
131
132/// An alignment-aware byte slice that guarantees 16-byte alignment for
133/// tensor weight mapping. If the source bytes are already aligned, it
134/// wraps them directly. If misaligned, it copies into an `AlignedPool`.
135#[derive(Debug, Clone)]
136pub struct AlignedByteSlice {
137    /// The pool holds the aligned copy (if a copy was needed).
138    pool: Option<AlignedPool>,
139    /// Original bytes (kept for reference / fallback).
140    original: Rc<Vec<u8>>,
141    /// Whether a copy was performed (true = was misaligned).
142    was_copied: bool,
143}
144
145impl AlignedByteSlice {
146    /// Create an aligned byte slice from raw bytes.
147    ///
148    /// If the data is already 16-byte aligned, no copy is performed.
149    /// If misaligned, the data is copied into an aligned pool and a
150    /// warning flag is set.
151    pub fn from_bytes(data: Rc<Vec<u8>>) -> Self {
152        let ptr = data.as_ptr();
153        if AlignedPool::is_aligned_16(ptr) {
154            AlignedByteSlice {
155                pool: None,
156                original: data,
157                was_copied: false,
158            }
159        } else {
160            let mut pool = AlignedPool::new(data.len());
161            // This cannot fail: pool capacity == data.len()
162            pool.copy_from(&data).unwrap();
163            AlignedByteSlice {
164                pool: Some(pool),
165                original: data,
166                was_copied: true,
167            }
168        }
169    }
170
171    /// Get the aligned bytes. If a copy was needed, returns the pool's
172    /// bytes; otherwise returns the original directly.
173    pub fn as_bytes(&self) -> &[u8] {
174        match &self.pool {
175            Some(pool) => pool.as_bytes(),
176            None => &self.original,
177        }
178    }
179
180    /// Whether a copy was required for alignment.
181    pub fn was_realigned(&self) -> bool {
182        self.was_copied
183    }
184
185    /// Length in bytes.
186    pub fn len(&self) -> usize {
187        self.original.len()
188    }
189
190    /// Whether empty.
191    pub fn is_empty(&self) -> bool {
192        self.original.is_empty()
193    }
194
195    /// Map these aligned bytes to a Tensor, identical to Tensor::from_bytes
196    /// but with alignment guarantee.
197    pub fn as_tensor(&self, shape: &[usize], dtype: &str) -> Result<Tensor, RuntimeError> {
198        Tensor::from_bytes(self.as_bytes(), shape, dtype)
199    }
200}
201
202impl fmt::Display for AlignedByteSlice {
203    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204        write!(
205            f,
206            "AlignedByteSlice(len={}, realigned={})",
207            self.len(),
208            self.was_copied
209        )
210    }
211}
212