#![allow(dead_code)]
#![allow(missing_docs)]
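//! SIMD-oriented helpers for I/O workloads: sample-format conversion, CSV
//! scanning, simple pre-compression encodings, cache-blocked matrix kernels,
//! descriptive statistics, and byte-level utilities. Hot paths dispatch on
//! runtime-detected CPU features via [`PlatformCapabilities`] and fall back
//! to portable scalar code elsewhere.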
use crate::error::{IoError, Result};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
use scirs2_core::simd_ops::PlatformCapabilities;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
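/// Stateless collection of vectorizer-friendly I/O transforms. These loops
/// are written so the compiler can autovectorize them; explicit intrinsics
/// live in [`AdvancedSimdProcessor`].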
pub struct SimdIoProcessor;
impl SimdIoProcessor {
/// Converts an `f64` view to a freshly allocated `f32` array.
pub fn convert_f64_to_f32(input: &ArrayView1<f64>) -> Array1<f32> {
    // A single `mapv` pass; the element-wise cast autovectorizes readily.
    // (The previous length-based branches were identical, so they are
    // collapsed here.)
    input.mapv(|x| x as f32)
}
/// Normalizes audio in place so the peak magnitude becomes 1.0.
/// All-zero buffers are left untouched.
pub fn normalize_audio_simd(data: &mut ArrayViewMut1<f32>) {
    // Find the peak magnitude, then rescale by its reciprocal.
    let max_val = data.iter().map(|&x| x.abs()).fold(0.0f32, f32::max);
    if max_val > 0.0 {
        let scale = 1.0 / max_val;
        data.mapv_inplace(|x| x * scale);
    }
}
/// Multiplies every sample by `gain` in place.
pub fn apply_gain_simd(data: &mut ArrayViewMut1<f32>, gain: f32) {
    data.mapv_inplace(|x| x * gain);
}
/// Converts 16-bit PCM samples to `f32` in `[-1.0, 1.0)`.
pub fn int16_to_float_simd(input: &[i16]) -> Array1<f32> {
    // 1/32768 is exactly representable, so the scaling is lossless.
    let scale = 1.0 / 32768.0;
    input.iter().map(|&sample| sample as f32 * scale).collect()
}
/// Converts `f32` samples to 16-bit PCM, clamping out-of-range values.
pub fn float_to_int16_simd(input: &ArrayView1<f32>) -> Vec<i16> {
    let scale = 32767.0;
    input
        .iter()
        .map(|&sample| (sample * scale).clamp(-32768.0, 32767.0) as i16)
        .collect()
}
/// Byte-swaps each `f32` in place (e.g., for endianness conversion) by
/// round-tripping through the raw bit pattern.
pub fn byteswap_f32_simd(data: &mut [f32]) {
    // The previous manual chunking performed the same scalar swap per
    // element, so a single loop is equivalent and clearer.
    for item in data.iter_mut() {
        *item = f32::from_bits(item.to_bits().swap_bytes());
    }
}
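/// Computes a simple wrapping-add checksum: the buffer is consumed as
/// little-endian `u32` words in 64-byte chunks, with any trailing bytes
/// added individually. Note this is a plain sum, not a CRC.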
pub fn checksum_simd(data: &[u8]) -> u32 {
let mut sum = 0u32;
let chunk_size = 64;
let chunks = data.chunks_exact(chunk_size);
let remainder = chunks.remainder();
for chunk in chunks {
let mut chunk_sum = 0u32;
for i in (0..chunk_size).step_by(4) {
chunk_sum = chunk_sum.wrapping_add(u32::from_le_bytes([
chunk[i],
chunk[i + 1],
chunk[i + 2],
chunk[i + 3],
]));
}
sum = sum.wrapping_add(chunk_sum);
}
for &byte in remainder {
sum = sum.wrapping_add(byte as u32);
}
sum
}
}
pub mod csv_simd {
use super::*;
/// Returns the index of every occurrence of `delimiter` in `buffer`.
pub fn find_delimiters_simd(buffer: &[u8], delimiter: u8) -> Vec<usize> {
    // A single linear scan; byte-compare loops like this are readily
    // autovectorized, and chunking adds nothing for a scalar inner loop.
    buffer
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == delimiter)
        .map(|(i, _)| i)
        .collect()
}
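/// Parses every field as `f64`, failing fast on the first invalid value.
///
/// A usage sketch (marked `ignore` because the doctest path depends on
/// where this module is mounted in the crate):
///
/// ```ignore
/// let vals = csv_simd::parse_floats_simd(&["1.5", "2.5"]).unwrap();
/// assert_eq!(vals, vec![1.5, 2.5]);
/// ```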
pub fn parse_floats_simd(fields: &[&str]) -> Result<Vec<f64>> {
let mut results = Vec::with_capacity(fields.len());
for field in fields {
match field.parse::<f64>() {
Ok(val) => results.push(val),
Err(_) => return Err(IoError::ParseError(format!("Invalid float: {}", field))),
}
}
Ok(results)
}
}
pub mod compression_simd {
use super::*;
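/// Delta-encodes a sequence: `result[0] = data[0]` and
/// `result[i] = data[i] - data[i - 1]` for `i > 0`.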
pub fn delta_encode_simd(data: &ArrayView1<f64>) -> Array1<f64> {
if data.is_empty() {
return Array1::zeros(0);
}
let mut result = Array1::zeros(data.len());
result[0] = data[0];
for i in 1..data.len() {
result[i] = data[i] - data[i - 1];
}
result
}
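/// Inverts [`delta_encode_simd`] via a running prefix sum, so a
/// round-trip reproduces the input up to floating-point rounding.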
pub fn delta_decode_simd(data: &ArrayView1<f64>) -> Array1<f64> {
if data.is_empty() {
return Array1::zeros(0);
}
let mut result = Array1::zeros(data.len());
result[0] = data[0];
for i in 1..data.len() {
result[i] = result[i - 1] + data[i];
}
result
}
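/// Run-length encodes the input as `(value, run_length)` pairs.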
pub fn rle_encode_simd(data: &[u8]) -> Vec<(u8, usize)> {
if data.is_empty() {
return Vec::new();
}
let mut runs = Vec::new();
let mut current_val = data[0];
let mut count = 1;
for &val in &data[1..] {
if val == current_val {
count += 1;
} else {
runs.push((current_val, count));
current_val = val;
count = 1;
}
}
runs.push((current_val, count));
runs
}
}
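/// Cache-aware matrix kernels: blocked transpose, blocked GEMM, and
/// matrix-vector products.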
pub mod matrix_simd {
use super::*;
use scirs2_core::ndarray::{Array2, ArrayView2, ArrayViewMut2};
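/// Tiles matrix loops around assumed cache sizes. The defaults set in
/// `new()` are typical for modern x86_64 cores, not queried from hardware.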
pub struct CacheOptimizedMatrixProcessor {
capabilities: PlatformCapabilities,
l1_cache_size: usize,
l2_cache_size: usize,
l3_cache_size: usize,
cache_line_size: usize,
}
impl Default for CacheOptimizedMatrixProcessor {
fn default() -> Self {
Self::new()
}
}
impl CacheOptimizedMatrixProcessor {
pub fn new() -> Self {
let capabilities = PlatformCapabilities::detect();
Self {
capabilities,
l1_cache_size: 32 * 1024,
l2_cache_size: 256 * 1024,
l3_cache_size: 8 * 1024 * 1024,
cache_line_size: 64,
}
}
/// Cache-blocked transpose. Small matrices take a simple double loop;
/// larger ones are split into blocks sized by `calculate_optimal_block_size`.
pub fn transpose_advanced_fast<T>(&self, input: &ArrayView2<T>) -> Array2<T>
where
    T: Copy + Default + Send + Sync,
{
    let (rows, cols) = input.dim();
    let mut output = Array2::default((cols, rows));
    if rows < 32 || cols < 32 {
        // Below the blocking threshold the tiling overhead isn't worth it.
        for i in 0..rows {
            for j in 0..cols {
                output[[j, i]] = input[[i, j]];
            }
        }
        return output;
    }
    let element_size = std::mem::size_of::<T>();
    let block_size = self.calculate_optimal_block_size(element_size);
    for i in (0..rows).step_by(block_size) {
        for j in (0..cols).step_by(block_size) {
            self.transpose_block(input, &mut output, i, j, block_size, rows, cols);
        }
    }
    output
}
fn transpose_block<T>(
    &self,
    input: &ArrayView2<T>,
    output: &mut Array2<T>,
    start_i: usize,
    start_j: usize,
    block_size: usize,
    rows: usize,
    cols: usize,
) where
    T: Copy + Default,
{
    let end_i = (start_i + block_size).min(rows);
    let end_j = (start_j + block_size).min(cols);
    if block_size <= 8 {
        self.transpose_micro_kernel_8x8(input, output, start_i, start_j, end_i, end_j);
    } else {
        // Recursively split into sub-blocks until they fit the micro kernel.
        let half_block = block_size / 2;
        for i in (start_i..end_i).step_by(half_block) {
            for j in (start_j..end_j).step_by(half_block) {
                self.transpose_block(input, output, i, j, half_block, rows, cols);
            }
        }
    }
}
fn transpose_micro_kernel_8x8<T>(
    &self,
    input: &ArrayView2<T>,
    output: &mut Array2<T>,
    start_i: usize,
    start_j: usize,
    end_i: usize,
    end_j: usize,
) where
    T: Copy + Default,
{
    // Safe indexed copies: `ndarray` indexing respects arbitrary input
    // strides. This replaces raw pointer arithmetic that assumed contiguous
    // row-major data and wrote through a shared reference, which is
    // undefined behavior.
    for i in start_i..end_i {
        for j in start_j..end_j {
            output[[j, i]] = input[[i, j]];
        }
    }
}
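/// Blocked matrix multiplication (GEMM) with panel sizes derived from the
/// assumed cache hierarchy; errors if the inner dimensions don't match.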
pub fn matrix_multiply_blocked<T>(
&self,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
) -> Result<Array2<T>>
where
T: Copy + Default + Send + Sync + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
{
let (m, k) = a.dim();
let (k2, n) = b.dim();
if k != k2 {
return Err(IoError::ValidationError(
"Matrix dimensions don't match for multiplication".to_string(),
));
}
let mut c = Array2::default((m, n));
let element_size = std::mem::size_of::<T>();
let (mc, kc, nc) = self.calculate_gemm_block_sizes(element_size);
for i in (0..m).step_by(mc) {
let m_end = (i + mc).min(m);
for p in (0..k).step_by(kc) {
let k_end = (p + kc).min(k);
for j in (0..n).step_by(nc) {
let n_end = (j + nc).min(n);
self.gemm_micro_kernel(a, b, &mut c, i, m_end, p, k_end, j, n_end);
}
}
}
Ok(c)
}
fn gemm_micro_kernel<T>(
&self,
a: &ArrayView2<T>,
b: &ArrayView2<T>,
c: &mut Array2<T>,
i_start: usize,
i_end: usize,
k_start: usize,
k_end: usize,
j_start: usize,
j_end: usize,
) where
T: Copy + Default + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
{
for i in i_start..i_end {
for j in j_start..j_end {
let mut sum = c[[i, j]];
for kk in k_start..k_end {
sum = sum + a[[i, kk]] * b[[kk, j]];
}
c[[i, j]] = sum;
}
}
}
fn calculate_gemm_block_sizes(&self, elementsize: usize) -> (usize, usize, usize) {
    // Heuristic tiling: size each panel so its working set roughly fits the
    // corresponding cache level. Clamping keeps every divisor nonzero.
    let mc_elements = self.l2_cache_size / (2 * elementsize);
    let mc = ((mc_elements as f64).sqrt() as usize).max(8);
    let kc_elements = self.l3_cache_size / (3 * elementsize);
    let kc = (kc_elements / mc).clamp(8, 384);
    let nc_elements = self.l1_cache_size / elementsize;
    let nc = (nc_elements / kc).clamp(8, 64);
    (mc, kc, nc)
}
/// Matrix-vector product; errors if the matrix column count doesn't match
/// the vector length.
pub fn matrix_vector_multiply_simd<T>(
    &self,
    matrix: &ArrayView2<T>,
    vector: &ArrayView1<T>,
) -> Result<Array1<T>>
where
    T: Copy + Default + Send + Sync + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
{
    let (rows, cols) = matrix.dim();
    if cols != vector.len() {
        return Err(IoError::ValidationError(
            "Matrix columns must match vector length".to_string(),
        ));
    }
    // One dot product per row; the inner loop autovectorizes for primitive
    // element types. (The previous row-count branches were identical.)
    let mut result = Array1::default(rows);
    for (i, res) in result.iter_mut().enumerate() {
        let mut sum = T::default();
        for j in 0..cols {
            sum = sum + matrix[[i, j]] * vector[j];
        }
        *res = sum;
    }
    Ok(result)
}
fn calculate_optimal_block_size(&self, elementsize: usize) -> usize {
    // Target half of L2 for the working set, rounded to cache-line
    // multiples. The `max(1)` guards against division by zero for element
    // types larger than a cache line.
    let target_working_set = self.l2_cache_size / 2;
    let elements_per_block = target_working_set / elementsize;
    let block_size = (elements_per_block as f64).sqrt() as usize;
    let elements_per_line = (self.cache_line_size / elementsize).max(1);
    ((block_size / elements_per_line) * elements_per_line).max(8)
}
/// Walks the matrix in cache-sized blocks, prefetching one element per
/// cache line to warm the cache ahead of subsequent passes.
pub fn optimize_memory_access<T>(&self, data: &mut ArrayViewMut2<T>)
where
    T: Copy + Default,
{
    let (rows, cols) = data.dim();
    if rows < 32 || cols < 32 {
        return; // Too small for prefetching to pay off.
    }
    // The pointer arithmetic below is only valid for a contiguous,
    // standard-layout (row-major) view; bail out otherwise.
    let Some(slice) = data.as_slice_mut() else {
        return;
    };
    let block_size = self.calculate_optimal_block_size(std::mem::size_of::<T>());
    let line_stride = (self.cache_line_size / std::mem::size_of::<T>()).max(1);
    for i in (0..rows).step_by(block_size) {
        for j in (0..cols).step_by(block_size) {
            let end_i = (i + block_size).min(rows);
            let end_j = (j + block_size).min(cols);
            for ii in i..end_i {
                for jj in (j..end_j).step_by(line_stride) {
                    let idx = ii * cols + jj;
                    if idx < slice.len() {
                        unsafe {
                            let ptr = slice.as_ptr().add(idx);
                            #[cfg(target_arch = "x86_64")]
                            _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8);
                            #[cfg(not(target_arch = "x86_64"))]
                            {
                                // No portable prefetch: touch the line instead.
                                let _ = std::ptr::read_volatile(ptr);
                            }
                        }
                    }
                }
            }
        }
    }
}
}
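/// Convenience wrapper: transposes via a fresh [`CacheOptimizedMatrixProcessor`].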
pub fn transpose_simd<T: Copy + Default + Send + Sync>(input: &ArrayView2<T>) -> Array2<T> {
let processor = CacheOptimizedMatrixProcessor::new();
processor.transpose_advanced_fast(input)
}
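/// Blocked `f32` matrix multiply with a fixed 64-wide tile; errors if the
/// inner dimensions don't match.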
pub fn matmul_simd(a: &ArrayView2<f32>, b: &ArrayView2<f32>) -> Result<Array2<f32>> {
let (m, k) = a.dim();
let (k2, n) = b.dim();
if k != k2 {
return Err(IoError::ValidationError(
"Matrix dimensions don't match".to_string(),
));
}
let mut c = Array2::zeros((m, n));
let block_size = 64;
for i_block in (0..m).step_by(block_size) {
for j_block in (0..n).step_by(block_size) {
for k_block in (0..k).step_by(block_size) {
let i_end = (i_block + block_size).min(m);
let j_end = (j_block + block_size).min(n);
let k_end = (k_block + block_size).min(k);
for i in i_block..i_end {
for j in j_block..j_end {
let mut sum = 0.0f32;
for kk in k_block..k_end {
sum += a[[i, kk]] * b[[kk, j]];
}
c[[i, j]] += sum;
}
}
}
}
}
Ok(c)
}
/// Element-wise addition; errors if the shapes differ.
pub fn elementwise_add_simd(a: &ArrayView2<f32>, b: &ArrayView2<f32>) -> Result<Array2<f32>> {
    if a.dim() != b.dim() {
        return Err(IoError::ValidationError(
            "Array dimensions don't match".to_string(),
        ));
    }
    let mut result = Array2::zeros(a.dim());
    // Take the flat-slice fast path only when both inputs are contiguous;
    // otherwise fall back to indexed iteration, which handles any layout
    // instead of panicking.
    if let (Some(a_s), Some(b_s)) = (a.as_slice(), b.as_slice()) {
        let r = result
            .as_slice_mut()
            .expect("freshly allocated arrays are contiguous");
        for ((r, &a_val), &b_val) in r.iter_mut().zip(a_s.iter()).zip(b_s.iter()) {
            *r = a_val + b_val;
        }
    } else {
        for ((i, j), &a_val) in a.indexed_iter() {
            result[[i, j]] = a_val + b[[i, j]];
        }
    }
    Ok(result)
}
}
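/// Descriptive statistics over `f64` views.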
pub mod stats_simd {
use super::*;
use std::f64;
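/// Arithmetic mean; returns `0.0` for an empty input.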
pub fn mean_simd(data: &ArrayView1<f64>) -> f64 {
if data.is_empty() {
return 0.0;
}
// Iterate the view directly: this handles any stride and cannot panic,
// unlike `as_slice().expect(...)` on a non-contiguous view.
let sum: f64 = data.iter().sum();
sum / data.len() as f64
}
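/// Sample variance with the `n - 1` denominator; returns `0.0` when there
/// are fewer than two elements.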
pub fn variance_simd(data: &ArrayView1<f64>) -> f64 {
if data.len() < 2 {
return 0.0;
}
let mean = mean_simd(data);
let sum_sq_diff: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum();
sum_sq_diff / (data.len() - 1) as f64
}
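/// Returns `(min, max)`, or `(NaN, NaN)` for an empty input.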
pub fn minmax_simd(data: &ArrayView1<f64>) -> (f64, f64) {
if data.is_empty() {
return (f64::NAN, f64::NAN);
}
let (min, max) = data
    .iter()
    .fold((f64::INFINITY, f64::NEG_INFINITY), |acc, &x| {
        (acc.0.min(x), acc.1.max(x))
    });
(min, max)
}
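/// Linearly interpolated quantile for `q` in `[0, 1]`; returns `NaN` for an
/// empty input or an out-of-range `q`.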
pub fn quantile_simd(data: &ArrayView1<f64>, q: f64) -> f64 {
if data.is_empty() || !(0.0..=1.0).contains(&q) {
return f64::NAN;
}
let mut sorted_data = data.to_vec();
// `total_cmp` is a total order, so NaN input cannot panic the sort.
sorted_data.sort_by(|a, b| a.total_cmp(b));
let index = q * (sorted_data.len() - 1) as f64;
let lower = index.floor() as usize;
let upper = index.ceil() as usize;
if lower == upper {
sorted_data[lower]
} else {
let weight = index - lower as f64;
sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
}
}
}
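/// Byte-level utilities: copies, XOR masking, popcount, and pattern search.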
pub mod binary_simd {
use super::*;
/// Copies `src` into `dst`; errors if the lengths differ.
pub fn fast_memcopy(src: &[u8], dst: &mut [u8]) -> Result<()> {
    if src.len() != dst.len() {
        return Err(IoError::ValidationError(
            "Source and destination lengths don't match".to_string(),
        ));
    }
    // `copy_from_slice` lowers to memcpy, which is already optimal at any
    // size; the previous manual byte loop for large buffers was slower.
    dst.copy_from_slice(src);
    Ok(())
}
/// XORs `data` in place with `key`, cycling the key as needed.
/// An empty key leaves the data unchanged.
pub fn xor_simd(data: &mut [u8], key: &[u8]) {
    if key.is_empty() {
        return; // Avoid a modulo-by-zero panic below.
    }
    let key_len = key.len();
    data.iter_mut().enumerate().for_each(|(i, byte)| {
        *byte ^= key[i % key_len];
    });
}
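/// Counts set bits across the buffer via `u8::count_ones`, which lowers to
/// a hardware popcount where available.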
pub fn popcount_simd(data: &[u8]) -> usize {
data.iter().map(|&byte| byte.count_ones() as usize).sum()
}
/// Returns the start index of every (possibly overlapping) occurrence of
/// `needle` in `haystack`.
pub fn find_pattern_simd(haystack: &[u8], needle: &[u8]) -> Vec<usize> {
    if needle.is_empty() || haystack.len() < needle.len() {
        return Vec::new();
    }
    // Scan overlapping windows over the whole haystack. The previous
    // chunked scan missed matches that straddled a 1024-byte chunk boundary.
    haystack
        .windows(needle.len())
        .enumerate()
        .filter(|(_, window)| *window == needle)
        .map(|(i, _)| i)
        .collect()
}
}
pub struct AdvancedSimdProcessor {
capabilities: PlatformCapabilities,
optimal_chunk_size: usize,
vector_width: usize,
}
impl Default for AdvancedSimdProcessor {
fn default() -> Self {
Self::new()
}
}
impl AdvancedSimdProcessor {
pub fn new() -> Self {
let capabilities = PlatformCapabilities::detect();
let vector_width = Self::detect_optimal_vector_width(&capabilities);
let optimal_chunk_size = Self::calculate_optimal_chunk_size(vector_width);
Self {
capabilities,
optimal_chunk_size,
vector_width,
}
}
fn detect_optimal_vector_width(capabilities: &PlatformCapabilities) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        if capabilities.avx512_available {
            64 // 512-bit vectors
        } else if capabilities.avx2_available {
            32 // 256-bit vectors
        } else if capabilities.simd_available {
            16 // 128-bit SSE vectors
        } else {
            8 // scalar fallback
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if capabilities.neon_available {
            16 // 128-bit NEON vectors
        } else {
            8
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        8
    }
}
fn calculate_optimal_chunk_size(vector_width: usize) -> usize {
    // Derive a chunk size from a 24 KiB working-set target, rounded down
    // to a multiple of the vector width.
    let target_size = 24 * 1024;
    let chunk_size = target_size / vector_width;
    (chunk_size / vector_width) * vector_width
}
/// Converts `f64` to `f32` using the widest SIMD path detected at runtime.
/// Small or non-contiguous inputs take a scalar path instead.
pub fn convert_f64_to_f32_advanced(&self, input: &ArrayView1<f64>) -> Array1<f32> {
    let len = input.len();
    let mut output = Array1::<f32>::zeros(len);
    if len < 64 {
        for (i, &val) in input.iter().enumerate() {
            output[i] = val as f32;
        }
        return output;
    }
    // The vector kernels need flat slices; a view with non-unit stride has
    // none, so fall back to a stride-aware scalar loop in that case.
    let Some(input_slice) = input.as_slice() else {
        for (out, &val) in output.iter_mut().zip(input.iter()) {
            *out = val as f32;
        }
        return output;
    };
    let output_slice = output
        .as_slice_mut()
        .expect("freshly allocated array is contiguous");
    #[cfg(target_arch = "x86_64")]
    {
        if self.capabilities.avx2_available {
            // SAFETY: this path is only taken after runtime AVX2 detection.
            unsafe { self.convert_f64_to_f32_avx2(input_slice, output_slice) };
        } else if self.capabilities.simd_available {
            self.convert_f64_to_f32_sse(input_slice, output_slice);
        } else {
            self.convert_f64_to_f32_scalar(input_slice, output_slice);
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if self.capabilities.neon_available {
            self.convert_f64_to_f32_neon(input_slice, output_slice);
        } else {
            self.convert_f64_to_f32_scalar(input_slice, output_slice);
        }
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        self.convert_f64_to_f32_scalar(input_slice, output_slice);
    }
    output
}
#[cfg(target_arch = "x86_64")]
fn convert_f64_to_f32_avx2(&self, input: &[f64], output: &mut [f32]) {
let len = input.len().min(output.len());
let simd_end = (len / 8) * 8;
unsafe {
let mut i = 0;
while i < simd_end {
if i + 7 < input.len() && i + 7 < output.len() {
let input_ptr = input.as_ptr().add(i);
let v1 = _mm256_loadu_pd(input_ptr);
let v2 = _mm256_loadu_pd(input_ptr.add(4));
let f32_lo = _mm256_cvtpd_ps(v1);
let f32_hi = _mm256_cvtpd_ps(v2);
let combined = _mm256_insertf128_ps(_mm256_castps128_ps256(f32_lo), f32_hi, 1);
let output_ptr = output.as_mut_ptr().add(i);
_mm256_storeu_ps(output_ptr, combined);
}
i += 8;
}
}
for i in simd_end..len {
output[i] = input[i] as f32;
}
}
#[cfg(target_arch = "x86_64")]
fn convert_f64_to_f32_sse(&self, input: &[f64], output: &mut [f32]) {
let len = input.len().min(output.len());
let simd_end = (len / 4) * 4;
unsafe {
let mut i = 0;
while i < simd_end {
let input_ptr = input.as_ptr().add(i);
let v1 = _mm_loadu_pd(input_ptr);
let v2 = _mm_loadu_pd(input_ptr.add(2));
let f32_1 = _mm_cvtpd_ps(v1);
let f32_2 = _mm_cvtpd_ps(v2);
let combined = _mm_movelh_ps(f32_1, f32_2);
let output_ptr = output.as_mut_ptr().add(i);
_mm_storeu_ps(output_ptr, combined);
i += 4;
}
}
for i in simd_end..len {
output[i] = input[i] as f32;
}
}
#[cfg(target_arch = "aarch64")]
fn convert_f64_to_f32_neon(&self, input: &[f64], output: &mut [f32]) {
let len = input.len().min(output.len());
let simd_end = (len / 4) * 4;
unsafe {
let mut i = 0;
while i < simd_end {
let input_ptr = input.as_ptr().add(i);
let v1 = vld1q_f64(input_ptr);
let v2 = vld1q_f64(input_ptr.add(2));
let f32_1 = vcvt_f32_f64(v1);
let f32_2 = vcvt_f32_f64(v2);
let combined = vcombine_f32(f32_1, f32_2);
let output_ptr = output.as_mut_ptr().add(i);
vst1q_f32(output_ptr, combined);
i += 4;
}
}
for i in simd_end..len {
output[i] = input[i] as f32;
}
}
fn convert_f64_to_f32_scalar(&self, input: &[f64], output: &mut [f32]) {
    // Plain element-wise cast; `zip` stops at the shorter slice, and the
    // compiler can still autovectorize this loop.
    for (out, &inp) in output.iter_mut().zip(input.iter()) {
        *out = inp as f32;
    }
}
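/// Cache-blocked transpose; matrices smaller than 64x64 use a plain double loop.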
pub fn transpose_matrix_simd<T>(&self, input: &ArrayView2<T>) -> Array2<T>
where
T: Copy + Default + Send + Sync,
{
let (rows, cols) = input.dim();
let mut output = Array2::default((cols, rows));
if rows < 64 || cols < 64 {
for i in 0..rows {
for j in 0..cols {
output[[j, i]] = input[[i, j]];
}
}
return output;
}
let block_size = self.calculate_transpose_block_size(std::mem::size_of::<T>());
for i in (0..rows).step_by(block_size) {
for j in (0..cols).step_by(block_size) {
let row_end = (i + block_size).min(rows);
let col_end = (j + block_size).min(cols);
for ii in i..row_end {
for jj in j..col_end {
output[[jj, ii]] = input[[ii, jj]];
}
}
}
}
output
}
fn calculate_transpose_block_size(&self, elementsize: usize) -> usize {
    // Size blocks for a 128 KiB working set, aligned to 64-byte cache
    // lines. Clamping keeps the divisor nonzero and the resulting step
    // usable with `step_by`.
    let target_cache_size = 128 * 1024;
    let elements_per_cache_line = (64 / elementsize).max(1);
    let block_elements = target_cache_size / elementsize;
    let block_size = (block_elements as f64).sqrt() as usize;
    ((block_size / elements_per_cache_line) * elements_per_cache_line).max(8)
}
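/// Copies `src` into `dst` with explicit SIMD loads/stores (and software
/// prefetch on x86_64); errors if the lengths differ.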
pub fn memcopy_simd(&self, src: &[u8], dst: &mut [u8]) -> Result<()> {
if src.len() != dst.len() {
return Err(IoError::ValidationError(
"Source and destination lengths don't match".to_string(),
));
}
let len = src.len();
if len < 64 {
dst.copy_from_slice(src);
return Ok(());
}
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.avx2_available {
    // SAFETY: this path is only taken after runtime AVX2 detection.
    unsafe { self.memcopy_avx2(src, dst) };
} else if self.capabilities.simd_available {
self.memcopy_sse(src, dst);
} else {
self.memcopy_parallel(src, dst);
}
}
#[cfg(target_arch = "aarch64")]
{
if self.capabilities.neon_available {
self.memcopy_neon(src, dst);
} else {
self.memcopy_parallel(src, dst);
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
self.memcopy_parallel(src, dst);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
fn memcopy_avx2(&self, src: &[u8], dst: &mut [u8]) {
let len = src.len();
let simd_end = (len / 32) * 32;
unsafe {
let mut i = 0;
while i < simd_end {
if i + 64 < len {
_mm_prefetch::<_MM_HINT_T0>(src.as_ptr().add(i + 64) as *const i8);
}
let data = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
_mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, data);
i += 32;
}
}
if simd_end < len {
dst[simd_end..].copy_from_slice(&src[simd_end..]);
}
}
#[cfg(target_arch = "x86_64")]
fn memcopy_sse(&self, src: &[u8], dst: &mut [u8]) {
let len = src.len();
let simd_end = (len / 16) * 16;
unsafe {
let mut i = 0;
while i < simd_end {
if i + 64 < len {
_mm_prefetch::<_MM_HINT_T0>(src.as_ptr().add(i + 64) as *const i8);
}
let data = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
_mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, data);
i += 16;
}
}
if simd_end < len {
dst[simd_end..].copy_from_slice(&src[simd_end..]);
}
}
#[cfg(target_arch = "aarch64")]
fn memcopy_neon(&self, src: &[u8], dst: &mut [u8]) {
let len = src.len();
let simd_end = (len / 16) * 16;
unsafe {
let mut i = 0;
while i < simd_end {
let data = vld1q_u8(src.as_ptr().add(i));
vst1q_u8(dst.as_mut_ptr().add(i), data);
i += 16;
}
}
if simd_end < len {
dst[simd_end..].copy_from_slice(&src[simd_end..]);
}
}
fn memcopy_parallel(&self, src: &[u8], dst: &mut [u8]) {
    // `copy_from_slice` lowers to memcpy; the previous per-byte loop for
    // buffers over 1 MiB was strictly slower.
    dst.copy_from_slice(src);
}
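/// Reports the vector width, chunk size, and SIMD threshold chosen at startup.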
pub fn get_optimization_info(&self) -> SimdOptimizationInfo {
SimdOptimizationInfo {
vector_width: self.vector_width,
optimal_chunk_size: self.optimal_chunk_size,
platform_features: PlatformCapabilities::detect(),
recommended_threshold: self.calculate_simd_threshold(),
}
}
fn calculate_simd_threshold(&self) -> usize {
    // Below about eight vectors' worth of data, SIMD setup overhead dominates.
    self.vector_width * 8
}
}
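/// Snapshot of the SIMD configuration selected at runtime.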
pub struct SimdOptimizationInfo {
pub vector_width: usize,
pub optimal_chunk_size: usize,
pub platform_features: PlatformCapabilities,
pub recommended_threshold: usize,
}
impl std::fmt::Debug for SimdOptimizationInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SimdOptimizationInfo")
.field("vector_width", &self.vector_width)
.field("optimal_chunk_size", &self.optimal_chunk_size)
.field("platform_features", &"<PlatformCapabilities>")
.field("recommended_threshold", &self.recommended_threshold)
.finish()
}
}
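/// High-level entry points combining I/O with SIMD-processed transforms.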
pub struct SimdIoAccelerator;
impl SimdIoAccelerator {
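/// Applies `processor` to data nominally read from `_path`.
///
/// Note: file reading is not implemented yet; this currently feeds the
/// processor a synthetic 1000-element ramp regardless of the path.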
pub fn read_and_process_f64(
_path: &std::path::Path,
processor: impl Fn(&ArrayView1<f64>) -> Array1<f64>,
) -> Result<Array1<f64>> {
let mock_data = Array1::from_vec((0..1000).map(|x| x as f64).collect());
Ok(processor(&mock_data.view()))
}
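/// Runs `preprocessor` over `data` and validates that the length is
/// preserved. The actual write to `_path` is not implemented yet.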
pub fn preprocess_and_write_f64(
data: &ArrayView1<f64>,
_path: &std::path::Path,
preprocessor: impl Fn(&ArrayView1<f64>) -> Array1<f64>,
) -> Result<()> {
let processed = preprocessor(data);
if processed.len() == data.len() {
Ok(())
} else {
Err(IoError::Other(
"Preprocessing changed data length".to_string(),
))
}
}
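/// Applies `processor` to each array in sequence; the `Send + Sync` bounds
/// leave room for a parallel implementation later.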
pub fn batch_process<T>(
arrays: &[ArrayView1<T>],
processor: impl Fn(&ArrayView1<T>) -> Array1<T> + Send + Sync,
) -> Vec<Array1<T>>
where
T: Copy + Send + Sync,
{
arrays.iter().map(processor).collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::array;
#[test]
fn test_convert_f64_to_f32() {
let input = array![1.0, 2.0, 3.0, 4.0, 5.0];
let result = SimdIoProcessor::convert_f64_to_f32(&input.view());
assert_eq!(result.len(), 5);
assert!((result[0] - 1.0f32).abs() < 1e-6);
assert!((result[4] - 5.0f32).abs() < 1e-6);
}
#[test]
fn test_normalize_audio() {
    // Exercise the actual API rather than an inline reimplementation.
    let mut data = array![0.5f32, -1.0, 0.25, -0.75];
    SimdIoProcessor::normalize_audio_simd(&mut data.view_mut());
    assert!((data[1] - (-1.0f32)).abs() < 1e-6f32);
    assert!((data[0] - 0.5f32).abs() < 1e-6f32);
}
#[test]
fn test_checksum() {
let data = b"Hello, World!";
let checksum = SimdIoProcessor::checksum_simd(data);
assert!(checksum > 0);
}
#[test]
fn test_advanced_simd_processor() {
let processor = AdvancedSimdProcessor::new();
let info = processor.get_optimization_info();
assert!(info.vector_width >= 8);
assert!(info.optimal_chunk_size > 0);
assert!(info.recommended_threshold > 0);
}
#[test]
fn test_optimized_conversion() {
let processor = AdvancedSimdProcessor::new();
let input = Array1::from_vec((0..1000).map(|x| x as f64 * 0.1).collect());
let result = processor.convert_f64_to_f32_advanced(&input.view());
assert_eq!(result.len(), 1000);
assert!((result[0] - 0.0f32).abs() < 1e-6);
assert!((result[999] - 99.9f32).abs() < 1e-3);
}
#[test]
fn test_simd_matrix_transpose() {
let processor = AdvancedSimdProcessor::new();
let input = Array2::from_shape_fn((100, 80), |(i, j)| (i * 80 + j) as f64);
let result = processor.transpose_matrix_simd(&input.view());
assert_eq!(result.dim(), (80, 100));
assert_eq!(result[[0, 0]], input[[0, 0]]);
assert_eq!(result[[79, 99]], input[[99, 79]]);
}
#[test]
fn test_simd_memcopy() {
let processor = AdvancedSimdProcessor::new();
let src: Vec<u8> = (0..1024).map(|x| (x % 256) as u8).collect();
let mut dst = vec![0u8; 1024];
processor
.memcopy_simd(&src, &mut dst)
.expect("Operation failed");
assert_eq!(src, dst);
}
#[test]
fn test_cache_optimized_matrix_processor() {
let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();
let input = Array2::from_shape_fn((64, 48), |(i, j)| (i * 48 + j) as f64);
let result = processor.transpose_advanced_fast(&input.view());
assert_eq!(result.dim(), (48, 64));
assert_eq!(result[[0, 0]], input[[0, 0]]);
assert_eq!(result[[47, 63]], input[[63, 47]]);
}
#[test]
fn test_blocked_matrix_multiply() {
let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();
let a = Array2::from_shape_fn((32, 24), |(i, j)| (i + j) as f64);
let b = Array2::from_shape_fn((24, 16), |(i, j)| (i * j + 1) as f64);
let result = processor
.matrix_multiply_blocked(&a.view(), &b.view())
.expect("Operation failed");
assert_eq!(result.dim(), (32, 16));
let expected_00 = (0..24).map(|k| a[[0, k]] * b[[k, 0]]).sum::<f64>();
assert!((result[[0, 0]] - expected_00).abs() < 1e-10);
}
#[test]
fn test_matrix_vector_multiply_simd() {
let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();
let matrix = Array2::from_shape_fn((10, 8), |(i, j)| (i + j + 1) as f64);
let vector = Array1::from_shape_fn(8, |i| (i + 1) as f64);
let result = processor
.matrix_vector_multiply_simd(&matrix.view(), &vector.view())
.expect("Operation failed");
assert_eq!(result.len(), 10);
let expected_0 = (0..8).map(|j| matrix[[0, j]] * vector[j]).sum::<f64>();
assert!((result[0] - expected_0).abs() < 1e-10);
}
}