#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use std::arch::x86_64::*;
use std::mem;
use std::ptr;
use std::slice;
use crate::eval::value::Value;
use crate::diagnostics::{Error, Result};
#[derive(Debug, Clone, PartialEq)]
pub enum SimdOperationType {
DenseUniform,
Sparse,
MixedTypes,
Streaming,
Small,
}
pub struct AlignedBuffer<T> {
ptr: *mut T,
capacity: usize,
len: usize,
alignment: usize,
}
impl<T> AlignedBuffer<T> {
pub fn new(capacity: usize, alignment: usize) -> Result<Self> {
if !alignment.is_power_of_two() || alignment < mem::align_of::<T>() {
return Err(Box::new(Error::runtime_error(
format!("Invalid alignment: {} (must be power of 2 and >= {})",
alignment, mem::align_of::<T>()),
None
)));
}
let layout = std::alloc::Layout::from_size_align(
capacity * mem::size_of::<T>(),
alignment,
).map_err(|e| Error::runtime_error(
format!("Failed to create layout: {}", e),
None
))?;
let ptr = unsafe { std::alloc::alloc(layout) as *mut T };
if ptr.is_null() {
return Err(Box::new(Error::runtime_error(
"Failed to allocate aligned memory".to_string(),
None
)));
}
Ok(AlignedBuffer {
ptr,
capacity,
len: 0,
alignment,
})
}
pub fn as_mut_slice(&mut self) -> &mut [T] {
unsafe { slice::from_raw_parts_mut(self.ptr, self.len) }
}
pub fn as_slice(&self) -> &[T] {
unsafe { slice::from_raw_parts(self.ptr, self.len) }
}
pub fn resize(&mut self, new_len: usize) -> Result<()> {
if new_len > self.capacity {
return Err(Box::new(Error::runtime_error(
format!("Buffer resize {} exceeds capacity {}", new_len, self.capacity),
None
)));
}
self.len = new_len;
Ok(())
}
pub fn extend_from_slice(&mut self, source: &[T]) -> Result<()>
where
T: Copy
{
if self.len + source.len() > self.capacity {
return Err(Box::new(Error::runtime_error(
"Buffer extension would exceed capacity".to_string(),
None
)));
}
unsafe {
ptr::copy_nonoverlapping(
source.as_ptr(),
self.ptr.add(self.len),
source.len()
);
}
self.len += source.len();
Ok(())
}
}
impl<T> Drop for AlignedBuffer<T> {
fn drop(&mut self) {
if !self.ptr.is_null() {
let layout = std::alloc::Layout::from_size_align(
self.capacity * mem::size_of::<T>(),
self.alignment,
).unwrap();
unsafe {
std::alloc::dealloc(self.ptr as *mut u8, layout);
}
}
}
}
unsafe impl<T: Send> Send for AlignedBuffer<T> {}
unsafe impl<T: Sync> Sync for AlignedBuffer<T> {}
pub struct SimdNumericOps {
cpu_features: CpuFeatures,
f64_pool_avx512: Vec<AlignedBuffer<f64>>,
f64_pool_avx2: Vec<AlignedBuffer<f64>>,
f64_pool_sse2: Vec<AlignedBuffer<f64>>,
perf_stats: SimdPerfStats,
}
#[derive(Debug, Clone)]
pub struct CpuFeatures {
pub avx512f: bool,
pub avx2: bool,
pub sse2: bool,
pub fma: bool,
pub neon: bool, }
#[derive(Debug, Default)]
struct SimdPerfStats {
total_ops: u64,
total_time_ns: u64,
dense_ops: u64,
sparse_ops: u64,
mixed_ops: u64,
streaming_ops: u64,
small_ops: u64,
}
impl SimdNumericOps {
pub fn new() -> Self {
let cpu_features = Self::detect_cpu_features();
SimdNumericOps {
cpu_features,
f64_pool_avx512: Vec::new(),
f64_pool_avx2: Vec::new(),
f64_pool_sse2: Vec::new(),
perf_stats: SimdPerfStats::default(),
}
}
fn detect_cpu_features() -> CpuFeatures {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if is_x86_feature_detected!("avx512f") {
CpuFeatures {
avx512f: true,
avx2: is_x86_feature_detected!("avx2"),
sse2: is_x86_feature_detected!("sse2"),
fma: is_x86_feature_detected!("fma"),
neon: false,
}
} else if is_x86_feature_detected!("avx2") {
CpuFeatures {
avx512f: false,
avx2: true,
sse2: is_x86_feature_detected!("sse2"),
fma: is_x86_feature_detected!("fma"),
neon: false,
}
} else {
CpuFeatures {
avx512f: false,
avx2: false,
sse2: is_x86_feature_detected!("sse2"),
fma: false,
neon: false,
}
}
}
#[cfg(target_arch = "aarch64")]
{
CpuFeatures {
avx512f: false,
avx2: false,
sse2: false,
fma: false,
neon: true,
}
}
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
{
CpuFeatures {
avx512f: false,
avx2: false,
sse2: false,
fma: false,
neon: false,
}
}
}
pub fn analyze_operation_type(&self, data: &[f64]) -> SimdOperationType {
let len = data.len();
if len < 8 {
return SimdOperationType::Small;
}
if len > 8192 {
return SimdOperationType::Streaming;
}
let zero_count = data.iter().filter(|&&x| x == 0.0).count();
let sparsity_ratio = zero_count as f64 / len as f64;
if sparsity_ratio > 0.7 {
SimdOperationType::Sparse
} else {
SimdOperationType::DenseUniform
}
}
pub fn add_f64_arrays(&mut self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
if a.len() != b.len() || a.len() != result.len() {
return Err(Box::new(Error::runtime_error(
"Array length mismatch in SIMD addition".to_string(),
None
)));
}
let start_time = std::time::Instant::now();
let op_type = self.analyze_operation_type(a);
let result_code = match op_type {
SimdOperationType::Small => {
self.add_f64_arrays_scalar(a, b, result)
},
SimdOperationType::DenseUniform => {
if self.cpu_features.avx512f {
unsafe { self.add_f64_arrays_avx512(a, b, result) }
} else if self.cpu_features.avx2 {
unsafe { self.add_f64_arrays_avx2(a, b, result) }
} else if self.cpu_features.sse2 {
unsafe { self.add_f64_arrays_sse2(a, b, result) }
} else {
self.add_f64_arrays_scalar(a, b, result)
}
},
SimdOperationType::Sparse => {
self.add_f64_arrays_sparse(a, b, result)
},
SimdOperationType::Streaming => {
self.add_f64_arrays_streaming(a, b, result)
},
SimdOperationType::MixedTypes => {
self.add_f64_arrays_scalar(a, b, result)
}
};
let elapsed = start_time.elapsed();
self.perf_stats.total_ops += 1;
self.perf_stats.total_time_ns += elapsed.as_nanos() as u64;
match op_type {
SimdOperationType::DenseUniform => self.perf_stats.dense_ops += 1,
SimdOperationType::Sparse => self.perf_stats.sparse_ops += 1,
SimdOperationType::MixedTypes => self.perf_stats.mixed_ops += 1,
SimdOperationType::Streaming => self.perf_stats.streaming_ops += 1,
SimdOperationType::Small => self.perf_stats.small_ops += 1,
}
result_code
}
fn add_f64_arrays_scalar(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
for i in 0..a.len() {
result[i] = a[i] + b[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn add_f64_arrays_avx512(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
let len = a.len();
let chunks = len / 8;
let remainder = len % 8;
for i in 0..chunks {
let offset = i * 8;
let a_chunk = _mm512_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm512_loadu_pd(b.as_ptr().add(offset));
let sum = _mm512_add_pd(a_chunk, b_chunk);
_mm512_storeu_pd(result.as_mut_ptr().add(offset), sum);
}
if remainder > 0 {
let offset = chunks * 8;
for i in 0..remainder {
result[offset + i] = a[offset + i] + b[offset + i];
}
}
Ok(())
}
#[target_feature(enable = "avx2")]
unsafe fn add_f64_arrays_avx2(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
let len = a.len();
let chunks = len / 4;
let remainder = len % 4;
for i in 0..chunks {
let offset = i * 4;
let a_chunk = _mm256_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm256_loadu_pd(b.as_ptr().add(offset));
let sum = _mm256_add_pd(a_chunk, b_chunk);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), sum);
}
if remainder > 0 {
let offset = chunks * 4;
for i in 0..remainder {
result[offset + i] = a[offset + i] + b[offset + i];
}
}
Ok(())
}
#[target_feature(enable = "sse2")]
unsafe fn add_f64_arrays_sse2(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
let len = a.len();
let chunks = len / 2;
let remainder = len % 2;
for i in 0..chunks {
let offset = i * 2;
let a_chunk = _mm_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm_loadu_pd(b.as_ptr().add(offset));
let sum = _mm_add_pd(a_chunk, b_chunk);
_mm_storeu_pd(result.as_mut_ptr().add(offset), sum);
}
if remainder > 0 {
let offset = chunks * 2;
result[offset] = a[offset] + b[offset];
}
Ok(())
}
fn add_f64_arrays_sparse(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
for i in 0..a.len() {
if a[i] == 0.0 && b[i] == 0.0 {
result[i] = 0.0;
} else if a[i] == 0.0 {
result[i] = b[i];
} else if b[i] == 0.0 {
result[i] = a[i];
} else {
result[i] = a[i] + b[i];
}
}
Ok(())
}
fn add_f64_arrays_streaming(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
const CHUNK_SIZE: usize = 1024;
let chunks = a.len() / CHUNK_SIZE;
let remainder = a.len() % CHUNK_SIZE;
for chunk in 0..chunks {
let start = chunk * CHUNK_SIZE;
let end = start + CHUNK_SIZE;
let a_chunk = &a[start..end];
let b_chunk = &b[start..end];
let result_chunk = &mut result[start..end];
if self.cpu_features.avx2 {
unsafe {
self.add_f64_arrays_avx2(a_chunk, b_chunk, result_chunk)?;
}
} else if self.cpu_features.sse2 {
unsafe {
self.add_f64_arrays_sse2(a_chunk, b_chunk, result_chunk)?;
}
} else {
self.add_f64_arrays_scalar(a_chunk, b_chunk, result_chunk)?;
}
}
if remainder > 0 {
let start = chunks * CHUNK_SIZE;
let a_rem = &a[start..];
let b_rem = &b[start..];
let result_rem = &mut result[start..];
self.add_f64_arrays_scalar(a_rem, b_rem, result_rem)?;
}
Ok(())
}
pub fn multiply_f64_arrays(&mut self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
if a.len() != b.len() || a.len() != result.len() {
return Err(Box::new(Error::runtime_error(
"Array length mismatch in SIMD multiplication".to_string(),
None
)));
}
let start_time = std::time::Instant::now();
let op_type = self.analyze_operation_type(a);
let result_code = match op_type {
SimdOperationType::Small => {
self.multiply_f64_arrays_scalar(a, b, result)
},
SimdOperationType::DenseUniform => {
if self.cpu_features.avx2 {
unsafe { self.multiply_f64_arrays_avx2(a, b, result) }
} else if self.cpu_features.sse2 {
unsafe { self.multiply_f64_arrays_sse2(a, b, result) }
} else {
self.multiply_f64_arrays_scalar(a, b, result)
}
},
_ => {
self.multiply_f64_arrays_scalar(a, b, result)
}
};
let elapsed = start_time.elapsed();
self.perf_stats.total_ops += 1;
self.perf_stats.total_time_ns += elapsed.as_nanos() as u64;
result_code
}
fn multiply_f64_arrays_scalar(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
for i in 0..a.len() {
result[i] = a[i] * b[i];
}
Ok(())
}
#[target_feature(enable = "avx2")]
unsafe fn multiply_f64_arrays_avx2(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
let len = a.len();
let chunks = len / 4;
let remainder = len % 4;
for i in 0..chunks {
let offset = i * 4;
let a_chunk = _mm256_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm256_loadu_pd(b.as_ptr().add(offset));
let product = _mm256_mul_pd(a_chunk, b_chunk);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), product);
}
if remainder > 0 {
let offset = chunks * 4;
for i in 0..remainder {
result[offset + i] = a[offset + i] * b[offset + i];
}
}
Ok(())
}
#[target_feature(enable = "sse2")]
unsafe fn multiply_f64_arrays_sse2(&self, a: &[f64], b: &[f64], result: &mut [f64]) -> Result<()> {
let len = a.len();
let chunks = len / 2;
let remainder = len % 2;
for i in 0..chunks {
let offset = i * 2;
let a_chunk = _mm_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm_loadu_pd(b.as_ptr().add(offset));
let product = _mm_mul_pd(a_chunk, b_chunk);
_mm_storeu_pd(result.as_mut_ptr().add(offset), product);
}
if remainder > 0 {
let offset = chunks * 2;
result[offset] = a[offset] * b[offset];
}
Ok(())
}
pub fn dot_product_f64(&mut self, a: &[f64], b: &[f64]) -> Result<f64> {
if a.len() != b.len() {
return Err(Box::new(Error::runtime_error(
"Array length mismatch in dot product".to_string(),
None
)));
}
let result = if self.cpu_features.avx2 {
unsafe { self.dot_product_f64_avx2(a, b) }
} else if self.cpu_features.sse2 {
unsafe { self.dot_product_f64_sse2(a, b) }
} else {
self.dot_product_f64_scalar(a, b)
};
Ok(result)
}
fn dot_product_f64_scalar(&self, a: &[f64], b: &[f64]) -> f64 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
#[target_feature(enable = "avx2")]
unsafe fn dot_product_f64_avx2(&self, a: &[f64], b: &[f64]) -> f64 {
let len = a.len();
let chunks = len / 4;
let remainder = len % 4;
let mut sum_vec = _mm256_setzero_pd();
for i in 0..chunks {
let offset = i * 4;
let a_chunk = _mm256_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm256_loadu_pd(b.as_ptr().add(offset));
let product = _mm256_mul_pd(a_chunk, b_chunk);
sum_vec = _mm256_add_pd(sum_vec, product);
}
let sum_high = _mm256_extractf128_pd(sum_vec, 1);
let sum_low = _mm256_castpd256_pd128(sum_vec);
let sum_combined = _mm_add_pd(sum_low, sum_high);
let sum_final = _mm_add_pd(sum_combined, _mm_shuffle_pd(sum_combined, sum_combined, 1));
let mut result = _mm_cvtsd_f64(sum_final);
if remainder > 0 {
let offset = chunks * 4;
for i in 0..remainder {
result += a[offset + i] * b[offset + i];
}
}
result
}
#[target_feature(enable = "sse2")]
unsafe fn dot_product_f64_sse2(&self, a: &[f64], b: &[f64]) -> f64 {
let len = a.len();
let chunks = len / 2;
let remainder = len % 2;
let mut sum_vec = _mm_setzero_pd();
for i in 0..chunks {
let offset = i * 2;
let a_chunk = _mm_loadu_pd(a.as_ptr().add(offset));
let b_chunk = _mm_loadu_pd(b.as_ptr().add(offset));
let product = _mm_mul_pd(a_chunk, b_chunk);
sum_vec = _mm_add_pd(sum_vec, product);
}
let sum_final = _mm_add_pd(sum_vec, _mm_shuffle_pd(sum_vec, sum_vec, 1));
let mut result = _mm_cvtsd_f64(sum_final);
if remainder > 0 {
let offset = chunks * 2;
result += a[offset] * b[offset];
}
result
}
pub fn get_performance_stats(&self) -> &SimdPerfStats {
&self.perf_stats
}
pub fn reset_performance_stats(&mut self) {
self.perf_stats = SimdPerfStats::default();
}
pub fn optimize_scheme_numeric_operation(&mut self, op: &str, args: &[Value]) -> Result<Option<Value>> {
match op {
"+" | "add" => self.optimize_scheme_addition(args),
"*" | "multiply" => self.optimize_scheme_multiplication(args),
"dot-product" => self.optimize_scheme_dot_product(args),
_ => Ok(None), }
}
fn optimize_scheme_addition(&mut self, args: &[Value]) -> Result<Option<Value>> {
if args.len() != 2 {
return Ok(None);
}
let (vec_a, vec_b) = match (&args[0], &args[1]) {
(Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => {
let a_f64: Result<Vec<f64>> = a.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
let b_f64: Result<Vec<f64>> = b.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
match (a_f64, b_f64) {
(Ok(a_vals), Ok(b_vals)) => (a_vals, b_vals),
_ => return Ok(None), }
},
_ => return Ok(None), };
let mut result = vec![0.0; vec_a.len()];
self.add_f64_arrays(&vec_a, &vec_b, &mut result)?;
let scheme_result: Vec<Value> = result.into_iter()
.map(Value::real)
.collect();
Ok(Some(Value::vector(scheme_result)))
}
fn optimize_scheme_multiplication(&mut self, args: &[Value]) -> Result<Option<Value>> {
if args.len() != 2 {
return Ok(None);
}
let (vec_a, vec_b) = match (&args[0], &args[1]) {
(Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => {
let a_f64: Result<Vec<f64>> = a.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
let b_f64: Result<Vec<f64>> = b.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
match (a_f64, b_f64) {
(Ok(a_vals), Ok(b_vals)) => (a_vals, b_vals),
_ => return Ok(None),
}
},
_ => return Ok(None),
};
let mut result = vec![0.0; vec_a.len()];
self.multiply_f64_arrays(&vec_a, &vec_b, &mut result)?;
let scheme_result: Vec<Value> = result.into_iter()
.map(Value::real)
.collect();
Ok(Some(Value::vector(scheme_result)))
}
fn optimize_scheme_dot_product(&mut self, args: &[Value]) -> Result<Option<Value>> {
if args.len() != 2 {
return Ok(None);
}
let (vec_a, vec_b) = match (&args[0], &args[1]) {
(Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => {
let a_f64: Result<Vec<f64>> = a.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
let b_f64: Result<Vec<f64>> = b.iter()
.map(|v| v.as_f64().ok_or_else(|| Error::runtime_error(
"Vector element not convertible to f64".to_string(), None)))
.collect();
match (a_f64, b_f64) {
(Ok(a_vals), Ok(b_vals)) => (a_vals, b_vals),
_ => return Ok(None),
}
},
_ => return Ok(None),
};
let result = self.dot_product_f64(&vec_a, &vec_b)?;
Ok(Some(Value::real(result)))
}
}
impl Default for SimdNumericOps {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cpu_feature_detection() {
let features = SimdNumericOps::detect_cpu_features();
if cfg!(target_arch = "x86_64") {
assert!(features.sse2);
}
if cfg!(target_arch = "aarch64") {
assert!(features.neon);
}
}
#[test]
fn test_aligned_buffer_creation() {
let buffer: Result<AlignedBuffer<f64>> = AlignedBuffer::new(16, 64);
assert!(buffer.is_ok());
let mut buf = buffer.unwrap();
assert_eq!(buf.as_slice().len(), 0);
buf.resize(8).unwrap();
assert_eq!(buf.as_slice().len(), 8);
}
#[test]
fn test_operation_type_analysis() {
let simd = SimdNumericOps::new();
let small = vec![1.0, 2.0, 3.0];
assert_eq!(simd.analyze_operation_type(&small), SimdOperationType::Small);
let large = vec![1.0; 10000];
assert_eq!(simd.analyze_operation_type(&large), SimdOperationType::Streaming);
let sparse = vec![0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
assert_eq!(simd.analyze_operation_type(&sparse), SimdOperationType::Sparse);
let dense = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
assert_eq!(simd.analyze_operation_type(&dense), SimdOperationType::DenseUniform);
}
#[test]
fn test_scalar_addition() {
let simd = SimdNumericOps::new();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let mut result = vec![0.0; 4];
simd.add_f64_arrays_scalar(&a, &b, &mut result).unwrap();
assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);
}
#[test]
fn test_simd_addition() {
let mut simd = SimdNumericOps::new();
let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = vec![8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0];
let mut result = vec![0.0; 8];
simd.add_f64_arrays(&a, &b, &mut result).unwrap();
assert_eq!(result, vec![9.0; 8]);
}
#[test]
fn test_dot_product() {
let mut simd = SimdNumericOps::new();
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![2.0, 3.0, 4.0, 5.0];
let result = simd.dot_product_f64(&a, &b).unwrap();
assert_eq!(result, 40.0); }
#[test]
fn test_scheme_integration() {
let mut simd = SimdNumericOps::new();
let vec_a = Value::vector(vec![Value::real(1.0), Value::real(2.0), Value::real(3.0)]);
let vec_b = Value::vector(vec![Value::real(4.0), Value::real(5.0), Value::real(6.0)]);
let args = vec![vec_a, vec_b];
let result = simd.optimize_scheme_numeric_operation("+", &args).unwrap();
assert!(result.is_some());
if let Some(Value::Vector(result_vec)) = result {
assert_eq!(result_vec.len(), 3);
assert_eq!(result_vec[0].as_f64().unwrap(), 5.0);
assert_eq!(result_vec[1].as_f64().unwrap(), 7.0);
assert_eq!(result_vec[2].as_f64().unwrap(), 9.0);
} else {
panic!("Expected vector result");
}
}
#[test]
fn test_performance_stats() {
let mut simd = SimdNumericOps::new();
let initial_stats = simd.get_performance_stats();
assert_eq!(initial_stats.total_ops, 0);
let a = vec![1.0; 100];
let b = vec![2.0; 100];
let mut result = vec![0.0; 100];
simd.add_f64_arrays(&a, &b, &mut result).unwrap();
let updated_stats = simd.get_performance_stats();
assert_eq!(updated_stats.total_ops, 1);
assert!(updated_stats.total_time_ns > 0);
}
}