//! Cache-aware buffers, blocked matrix multiplication, and simple memory
//! reuse helpers (a buffer pool, a bump arena, and per-layer scratch space)
//! for `f32` data.

/// Assumed cache line size in bytes (64 is typical for x86_64; some AArch64
/// cores use 128).
const CACHE_LINE_SIZE: usize = 64;
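/// An `f32` buffer whose first exposed element starts on a cache-line
/// boundary. The backing `Vec` is over-allocated by up to one cache line and
/// the exposed region is offset so that `as_slice`/`as_mut_slice` begin at a
/// `CACHE_LINE_SIZE`-aligned address.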
#[derive(Debug)]
pub struct CacheAlignedBuffer {
data: Vec<f32>,
offset: usize,
len: usize,
}
impl CacheAlignedBuffer {
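    /// Allocates `len` elements plus up to one cache line of padding, then
    /// records the element offset at which the cache-line-aligned region
    /// begins. The `Vec` allocation itself is only guaranteed to be aligned
    /// for `f32`, so alignment is achieved by offsetting into the
    /// over-allocation rather than by using a custom allocator.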
#[must_use]
pub fn new(len: usize) -> Self {
let align_elements = CACHE_LINE_SIZE / std::mem::size_of::<f32>();
let extra = align_elements - 1;
let data = vec![0.0f32; len + extra];
let ptr = data.as_ptr() as usize;
let misalignment = ptr % CACHE_LINE_SIZE;
let offset = if misalignment == 0 {
0
} else {
(CACHE_LINE_SIZE - misalignment) / std::mem::size_of::<f32>()
};
Self { data, offset, len }
}
#[must_use]
pub fn is_aligned(&self, alignment: usize) -> bool {
let ptr = self.as_slice().as_ptr() as usize;
ptr.is_multiple_of(alignment)
}
#[must_use]
pub fn len(&self) -> usize {
self.len
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.len == 0
}
#[must_use]
pub fn as_slice(&self) -> &[f32] {
&self.data[self.offset..self.offset + self.len]
}
pub fn as_mut_slice(&mut self) -> &mut [f32] {
let offset = self.offset;
let len = self.len;
&mut self.data[offset..offset + len]
}
}
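/// Touches `data[position + distance]` with a volatile read so that its cache
/// line is pulled into cache before the main loop reaches it. This is a
/// portable, safe stand-in for an explicit prefetch hint such as
/// `_mm_prefetch`; a volatile load is still a demand load, so it may stall
/// rather than overlap with computation. Out-of-range positions are ignored.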
#[inline]
pub fn prefetch_read(data: &[f32], position: usize, distance: usize) {
let prefetch_pos = position + distance;
if prefetch_pos < data.len() {
let _ = unsafe { std::ptr::read_volatile(&raw const data[prefetch_pos]) };
}
}
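/// Plain sequential reduction over the slice; useful as a baseline when
/// measuring `sum_with_prefetch`.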
#[must_use]
pub fn sequential_sum(data: &[f32]) -> f32 {
data.iter().sum()
}
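/// Sums the slice while touching the element `prefetch_distance` positions
/// ahead of the current one, so that later cache lines are ideally already
/// resident when the loop reaches them.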
#[must_use]
pub fn sum_with_prefetch(data: &[f32], prefetch_distance: usize) -> f32 {
let mut sum = 0.0f32;
let len = data.len();
for i in 0..len {
if i + prefetch_distance < len {
prefetch_read(data, i, prefetch_distance);
}
sum += data[i];
}
sum
}
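/// Textbook triple-loop multiplication of a row-major `rows x inner` matrix
/// `mat_a` by a row-major `inner x cols` matrix `mat_b`, producing a
/// row-major `rows x cols` result. The innermost loop walks `mat_b` down a
/// column (stride `cols`), which is the access pattern the blocked version
/// below improves on.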
#[must_use]
pub fn naive_matmul(
mat_a: &[f32],
mat_b: &[f32],
rows: usize,
inner: usize,
cols: usize,
) -> Vec<f32> {
let mut result = vec![0.0f32; rows * cols];
for row in 0..rows {
for col in 0..cols {
let mut sum = 0.0f32;
for idx in 0..inner {
sum += mat_a[row * inner + idx] * mat_b[idx * cols + col];
}
result[row * cols + col] = sum;
}
}
result
}
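/// Accumulates one `row_range` x `col_range` tile of the output using only
/// the `inner_range` portion of the shared dimension. Partial sums are read
/// from and written back to `result`, so the same tile can be revisited once
/// per inner block.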
#[inline]
fn compute_block(
result: &mut [f32],
mat_a: &[f32],
mat_b: &[f32],
row_range: std::ops::Range<usize>,
col_range: std::ops::Range<usize>,
inner_range: std::ops::Range<usize>,
inner: usize,
cols: usize,
) {
for row in row_range {
for col in col_range.clone() {
let mut sum = result[row * cols + col];
for idx in inner_range.clone() {
sum += mat_a[row * inner + idx] * mat_b[idx * cols + col];
}
result[row * cols + col] = sum;
}
}
}
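/// Cache-blocked (tiled) matrix multiplication with the same layout and
/// dimension conventions as [`naive_matmul`]. The three loops are split into
/// `block_size`-sized tiles so that each tile of `mat_a`, `mat_b`, and the
/// output stays hot in cache while it is reused.
///
/// # Panics
///
/// Panics if `block_size` is zero (`step_by` requires a non-zero step).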
#[must_use]
#[allow(clippy::many_single_char_names)]
pub fn blocked_matmul(
mat_a: &[f32],
mat_b: &[f32],
rows: usize,
inner: usize,
cols: usize,
block_size: usize,
) -> Vec<f32> {
let mut result = vec![0.0f32; rows * cols];
for row_blk in (0..rows).step_by(block_size) {
let row_end = (row_blk + block_size).min(rows);
for col_blk in (0..cols).step_by(block_size) {
let col_end = (col_blk + block_size).min(cols);
for inner_blk in (0..inner).step_by(block_size) {
let inner_end = (inner_blk + block_size).min(inner);
compute_block(
&mut result,
mat_a,
mat_b,
row_blk..row_end,
col_blk..col_end,
inner_blk..inner_end,
inner,
cols,
);
}
}
}
result
}
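/// A bounded free list of reusable `Vec<f32>` buffers. `acquire` prefers an
/// existing buffer whose capacity is large enough; `release` returns a buffer
/// to the pool, or drops it once `capacity` buffers are already retained.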
#[derive(Debug)]
pub struct TensorPool {
capacity: usize,
buffers: Vec<Vec<f32>>,
}
impl TensorPool {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
capacity,
buffers: Vec::with_capacity(capacity),
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.capacity
}
#[must_use]
pub fn available(&self) -> usize {
self.buffers.len()
}
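    /// Returns a zero-filled buffer of exactly `size` elements, reusing a
    /// pooled allocation with sufficient capacity when one is available and
    /// allocating a fresh one otherwise.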
    pub fn acquire(&mut self, size: usize) -> Vec<f32> {
        if let Some(idx) = self.buffers.iter().position(|b| b.capacity() >= size) {
            let mut buffer = self.buffers.swap_remove(idx);
            // Clear before resizing so a reused buffer comes back fully zeroed,
            // matching the freshly allocated case below (resize alone would
            // leave stale values in elements that already existed).
            buffer.clear();
            buffer.resize(size, 0.0);
            buffer
        } else {
            vec![0.0f32; size]
        }
    }
pub fn release(&mut self, buffer: Vec<f32>) {
if self.buffers.len() < self.capacity {
self.buffers.push(buffer);
}
}
pub fn clear(&mut self) {
self.buffers.clear();
}
}
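/// A bump allocator over one preallocated `f32` region. `alloc` hands out
/// consecutive sub-slices and `reset` rewinds the whole arena at once, e.g.
/// between forward passes, without freeing or reallocating memory.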
#[derive(Debug)]
pub struct ForwardArena {
data: Vec<f32>,
offset: usize,
}
impl ForwardArena {
#[must_use]
pub fn new(capacity: usize) -> Self {
Self {
data: vec![0.0f32; capacity],
offset: 0,
}
}
#[must_use]
pub fn capacity(&self) -> usize {
self.data.len()
}
#[must_use]
pub fn used(&self) -> usize {
self.offset
}
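    /// Reserves the next `size` elements and returns them as a mutable slice.
    /// The slice is not re-zeroed, so it may still contain values written
    /// before the last `reset`.
    ///
    /// # Panics
    ///
    /// Panics if fewer than `size` elements of capacity remain.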
pub fn alloc(&mut self, size: usize) -> &mut [f32] {
let start = self.offset;
let end = start + size;
assert!(
end <= self.data.len(),
"ForwardArena: insufficient capacity (need {}, have {})",
end,
self.data.len()
);
self.offset = end;
&mut self.data[start..end]
}
pub fn reset(&mut self) {
self.offset = 0;
}
}
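/// Per-layer scratch space backed by one contiguous, zero-initialized
/// allocation of `num_layers * layer_size` elements, exposed as fixed-size
/// layer slices via `get_layer`/`get_layer_mut`. `reset` zeroes everything.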
#[derive(Debug)]
pub struct ScratchBuffer {
num_layers: usize,
layer_size: usize,
data: Vec<f32>,
}
impl ScratchBuffer {
#[must_use]
pub fn new(num_layers: usize, layer_size: usize) -> Self {
Self {
num_layers,
layer_size,
data: vec![0.0f32; num_layers * layer_size],
}
}
#[must_use]
pub fn num_layers(&self) -> usize {
self.num_layers
}
#[must_use]
pub fn layer_size(&self) -> usize {
self.layer_size
}
#[must_use]
pub fn total_size(&self) -> usize {
self.num_layers * self.layer_size
}
#[must_use]
pub fn get_layer(&self, layer_idx: usize) -> &[f32] {
assert!(
layer_idx < self.num_layers,
"ScratchBuffer: layer index {} out of bounds (max {})",
layer_idx,
self.num_layers
);
let start = layer_idx * self.layer_size;
let end = start + self.layer_size;
&self.data[start..end]
}
pub fn get_layer_mut(&mut self, layer_idx: usize) -> &mut [f32] {
assert!(
layer_idx < self.num_layers,
"ScratchBuffer: layer index {} out of bounds (max {})",
layer_idx,
self.num_layers
);
let start = layer_idx * self.layer_size;
let end = start + self.layer_size;
&mut self.data[start..end]
}
pub fn reset(&mut self) {
self.data.fill(0.0);
}
}
include!("allocator_cache_aligned.rs");
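// A minimal usage sketch, written as tests. It exercises only items defined
// above and assumes this file is compiled as a module inside a crate (so
// `super::*` resolves) and that `allocator_cache_aligned.rs` does not already
// define a module named `usage_examples`.
#[cfg(test)]
mod usage_examples {
    use super::*;

    #[test]
    fn blocked_matmul_matches_naive() {
        // 4x3 times 3x2, row-major; all values are small integers, and both
        // routines accumulate in the same order, so exact equality is fine.
        let a: Vec<f32> = (0..12).map(|v| v as f32).collect();
        let b: Vec<f32> = (0..6).map(|v| v as f32).collect();
        assert_eq!(
            naive_matmul(&a, &b, 4, 3, 2),
            blocked_matmul(&a, &b, 4, 3, 2, 2)
        );
    }

    #[test]
    fn aligned_buffer_starts_on_a_cache_line() {
        let mut buf = CacheAlignedBuffer::new(100);
        assert!(buf.is_aligned(CACHE_LINE_SIZE));
        assert_eq!(buf.len(), 100);
        buf.as_mut_slice()[0] = 1.0;
        assert_eq!(buf.as_slice()[0], 1.0);
    }

    #[test]
    fn pool_reuses_released_buffers() {
        let mut pool = TensorPool::new(2);
        let v = pool.acquire(8);
        assert_eq!(pool.available(), 0);
        pool.release(v);
        assert_eq!(pool.available(), 1);
        let reused = pool.acquire(4);
        assert_eq!(reused.len(), 4);
        assert_eq!(pool.available(), 0);
    }
}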