#![allow(dead_code)]
#![allow(missing_docs)]
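//! SIMD-oriented helpers for I/O-heavy workloads: sample format conversion, checksums,
//! delimiter scanning, lightweight compression transforms, cache-blocked matrix kernels,
//! and basic statistics. Explicit AVX2/SSE/NEON paths are chosen at runtime through
//! `PlatformCapabilities`; everything else falls back to scalar loops.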

use crate::error::{IoError, Result};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1};
use scirs2_core::simd_ops::PlatformCapabilities;

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

/// Element-wise conversion, scaling, and byte-level helpers used by the I/O paths.
pub struct SimdIoProcessor;

impl SimdIoProcessor {
    /// Convert an `f64` array to `f32`, narrowing each element.
    pub fn convert_f64_to_f32(input: &ArrayView1<f64>) -> Array1<f32> {
        let mut output = Array1::<f32>::zeros(input.len());

        if input.len() > 1000 {
            // Iterator form for large inputs; the optimizer can auto-vectorize this loop.
            output.iter_mut().zip(input.iter()).for_each(|(out, &inp)| {
                *out = inp as f32;
            });
        } else {
            for (out, &inp) in output.iter_mut().zip(input.iter()) {
                *out = inp as f32;
            }
        }

        output
    }

    /// Normalize audio samples in place so that the peak magnitude becomes 1.0.
    pub fn normalize_audio_simd(data: &mut ArrayViewMut1<f32>) {
        // Find the peak magnitude; the same reduction is used regardless of input size.
        let max_val = data.iter().map(|&x| x.abs()).fold(0.0f32, f32::max);

        if max_val > 0.0 {
            let scale = 1.0 / max_val;

            if data.len() > 1000 {
                data.iter_mut().for_each(|x| *x *= scale);
            } else {
                data.mapv_inplace(|x| x * scale);
            }
        }
    }

    pub fn apply_gain_simd(data: &mut ArrayViewMut1<f32>, gain: f32) {
        if data.len() > 1000 {
            data.iter_mut().for_each(|x| *x *= gain);
        } else {
            data.mapv_inplace(|x| x * gain);
        }
    }

    pub fn int16_to_float_simd(input: &[i16]) -> Array1<f32> {
        let mut output = Array1::<f32>::zeros(input.len());
        // Map i16 samples into roughly [-1.0, 1.0].
        let scale = 1.0 / 32768.0;

        if input.len() > 1000 {
            output
                .iter_mut()
                .zip(input.iter())
                .for_each(|(out, &sample)| {
                    *out = sample as f32 * scale;
                });
        } else {
            for (out, &sample) in output.iter_mut().zip(input.iter()) {
                *out = sample as f32 * scale;
            }
        }

        output
    }

    pub fn float_to_int16_simd(input: &ArrayView1<f32>) -> Vec<i16> {
        let scale = 32767.0;

        // Scale, clamp to the i16 range, and truncate; the same scalar path is used for
        // all input sizes.
        input
            .iter()
            .map(|&sample| {
                let scaled = sample * scale;
                let clamped = scaled.clamp(-32768.0, 32767.0);
                clamped as i16
            })
            .collect()
    }

    pub fn byteswap_f32_simd(data: &mut [f32]) {
        let chunk_size = 8;
        let full_chunks = data.len() / chunk_size;

        for i in 0..full_chunks {
            let start = i * chunk_size;
            let end = start + chunk_size;

            for item in data.iter_mut().take(end).skip(start) {
                *item = f32::from_bits(item.to_bits().swap_bytes());
            }
        }

        for item in data.iter_mut().skip(full_chunks * chunk_size) {
            *item = f32::from_bits(item.to_bits().swap_bytes());
        }
    }

    pub fn checksum_simd(data: &[u8]) -> u32 {
        let mut sum = 0u32;
        let chunk_size = 64;
        let chunks = data.chunks_exact(chunk_size);
        let remainder = chunks.remainder();

        for chunk in chunks {
            let mut chunk_sum = 0u32;
            // Sum each 64-byte chunk as sixteen little-endian u32 words.
            for i in (0..chunk_size).step_by(4) {
                chunk_sum = chunk_sum.wrapping_add(u32::from_le_bytes([
                    chunk[i],
                    chunk[i + 1],
                    chunk[i + 2],
                    chunk[i + 3],
                ]));
            }
            sum = sum.wrapping_add(chunk_sum);
        }

        for &byte in remainder {
            sum = sum.wrapping_add(byte as u32);
        }

        sum
    }
}

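/// CSV scanning helpers: delimiter position search and float field parsing.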
pub mod csv_simd {
    use super::*;

    pub fn find_delimiters_simd(buffer: &[u8], delimiter: u8) -> Vec<usize> {
        let mut positions = Vec::new();
        let chunk_size = 64;

        let chunks = buffer.chunks_exact(chunk_size);
        let mut offset = 0;

        for chunk in chunks {
            for (i, &byte) in chunk.iter().enumerate() {
                if byte == delimiter {
                    positions.push(offset + i);
                }
            }
            offset += chunk_size;
        }

        let remainder = buffer.len() % chunk_size;
        let start = buffer.len() - remainder;
        for (i, &byte) in buffer[start..].iter().enumerate() {
            if byte == delimiter {
                positions.push(start + i);
            }
        }

        positions
    }

    pub fn parse_floats_simd(fields: &[&str]) -> Result<Vec<f64>> {
        let mut results = Vec::with_capacity(fields.len());

        for field in fields {
            match field.parse::<f64>() {
                Ok(val) => results.push(val),
                Err(_) => return Err(IoError::ParseError(format!("Invalid float: {}", field))),
            }
        }

        Ok(results)
    }
}

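/// Lightweight pre-compression transforms: delta coding for numeric arrays and
/// run-length encoding for byte streams.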
pub mod compression_simd {
    use super::*;

    pub fn delta_encode_simd(data: &ArrayView1<f64>) -> Array1<f64> {
        if data.is_empty() {
            return Array1::zeros(0);
        }

        let mut result = Array1::zeros(data.len());
        result[0] = data[0];

        for i in 1..data.len() {
            result[i] = data[i] - data[i - 1];
        }

        result
    }

    pub fn delta_decode_simd(data: &ArrayView1<f64>) -> Array1<f64> {
        if data.is_empty() {
            return Array1::zeros(0);
        }

        let mut result = Array1::zeros(data.len());
        result[0] = data[0];

        for i in 1..data.len() {
            result[i] = result[i - 1] + data[i];
        }

        result
    }

    pub fn rle_encode_simd(data: &[u8]) -> Vec<(u8, usize)> {
        if data.is_empty() {
            return Vec::new();
        }

        let mut runs = Vec::new();
        let mut current_val = data[0];
        let mut count = 1;

        for &val in &data[1..] {
            if val == current_val {
                count += 1;
            } else {
                runs.push((current_val, count));
                current_val = val;
                count = 1;
            }
        }
        runs.push((current_val, count));

        runs
    }
}

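/// Cache-blocked matrix kernels (transpose and GEMM-style multiply). Block sizes are
/// derived from assumed, not probed, cache sizes.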
pub mod matrix_simd {
    use super::*;
    use scirs2_core::ndarray::{Array2, ArrayView2, ArrayViewMut2};

    pub struct CacheOptimizedMatrixProcessor {
        capabilities: PlatformCapabilities,
        l1_cache_size: usize,
        l2_cache_size: usize,
        l3_cache_size: usize,
        cache_line_size: usize,
    }

    impl Default for CacheOptimizedMatrixProcessor {
        fn default() -> Self {
            Self::new()
        }
    }

    impl CacheOptimizedMatrixProcessor {
        pub fn new() -> Self {
            let capabilities = PlatformCapabilities::detect();

            Self {
                capabilities,
                // Typical desktop cache hierarchy; assumed values, not probed from hardware.
                l1_cache_size: 32 * 1024,
                l2_cache_size: 256 * 1024,
                l3_cache_size: 8 * 1024 * 1024,
                cache_line_size: 64,
            }
        }

        /// Cache-blocked transpose; falls back to a simple double loop for small matrices.
        pub fn transpose_advanced_fast<T>(&self, input: &ArrayView2<T>) -> Array2<T>
        where
            T: Copy + Default + Send + Sync,
        {
            let (rows, cols) = input.dim();
            let mut output = Array2::default((cols, rows));

            if rows < 32 || cols < 32 {
                for i in 0..rows {
                    for j in 0..cols {
                        output[[j, i]] = input[[i, j]];
                    }
                }
                return output;
            }

            let element_size = std::mem::size_of::<T>();
            let block_size = self.calculate_optimal_block_size(element_size);

            for i in (0..rows).step_by(block_size) {
                for j in (0..cols).step_by(block_size) {
                    self.transpose_block(input, &mut output, i, j, block_size, rows, cols);
                }
            }

            output
        }

        fn transpose_block<T>(
            &self,
            input: &ArrayView2<T>,
            output: &mut Array2<T>,
            start_i: usize,
            start_j: usize,
            block_size: usize,
            rows: usize,
            cols: usize,
        ) where
            T: Copy + Default,
        {
            let end_i = (start_i + block_size).min(rows);
            let end_j = (start_j + block_size).min(cols);

            if block_size <= 8 {
                self.transpose_micro_kernel_8x8(input, output, start_i, start_j, end_i, end_j);
            } else {
                // Recursively split the block to keep the working set cache-resident.
                let half_block = block_size / 2;

                for i in (start_i..end_i).step_by(half_block) {
                    for j in (start_j..end_j).step_by(half_block) {
                        self.transpose_block(input, output, i, j, half_block, rows, cols);
                    }
                }
            }
        }

        fn transpose_micro_kernel_8x8<T>(
            &self,
            input: &ArrayView2<T>,
            output: &mut Array2<T>,
            start_i: usize,
            start_j: usize,
            end_i: usize,
            end_j: usize,
        ) where
            T: Copy + Default,
        {
            // `end_i`/`end_j` are clamped to the matrix bounds by the caller, so indexed
            // stores stay in range regardless of the view's memory layout.
            for i in start_i..end_i {
                for j in start_j..end_j {
                    output[[j, i]] = input[[i, j]];
                }
            }
        }

        pub fn matrix_multiply_blocked<T>(
            &self,
            a: &ArrayView2<T>,
            b: &ArrayView2<T>,
        ) -> Result<Array2<T>>
        where
            T: Copy + Default + Send + Sync + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
        {
            let (m, k) = a.dim();
            let (k2, n) = b.dim();

            if k != k2 {
                return Err(IoError::ValidationError(
                    "Matrix dimensions don't match for multiplication".to_string(),
                ));
            }

            let mut c = Array2::default((m, n));
            let element_size = std::mem::size_of::<T>();

            let (mc, kc, nc) = self.calculate_gemm_block_sizes(element_size);

            for i in (0..m).step_by(mc) {
                let m_end = (i + mc).min(m);

                for p in (0..k).step_by(kc) {
                    let k_end = (p + kc).min(k);

                    for j in (0..n).step_by(nc) {
                        let n_end = (j + nc).min(n);

                        self.gemm_micro_kernel(a, b, &mut c, i, m_end, p, k_end, j, n_end);
                    }
                }
            }

            Ok(c)
        }

        fn gemm_micro_kernel<T>(
            &self,
            a: &ArrayView2<T>,
            b: &ArrayView2<T>,
            c: &mut Array2<T>,
            i_start: usize,
            i_end: usize,
            k_start: usize,
            k_end: usize,
            j_start: usize,
            j_end: usize,
        ) where
            T: Copy + Default + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
        {
            for i in i_start..i_end {
                for j in j_start..j_end {
                    let mut sum = c[[i, j]];

                    for kk in k_start..k_end {
                        sum = sum + a[[i, kk]] * b[[kk, j]];
                    }

                    c[[i, j]] = sum;
                }
            }
        }

        fn calculate_gemm_block_sizes(&self, elementsize: usize) -> (usize, usize, usize) {
            // Heuristic block sizes: mc from half of L2, kc from a third of L3 (capped at
            // 384), nc from L1 (capped at 64); every dimension is floored at 8.
            let mc_elements = self.l2_cache_size / (2 * elementsize);
            let mc = ((mc_elements as f64).sqrt() as usize).max(8);

            let kc_elements = self.l3_cache_size / (3 * elementsize);
            let kc = (kc_elements / mc).clamp(8, 384);

            let nc_elements = self.l1_cache_size / elementsize;
            let nc = (nc_elements / kc).clamp(8, 64);

            (mc, kc, nc)
        }

        pub fn matrix_vector_multiply_simd<T>(
            &self,
            matrix: &ArrayView2<T>,
            vector: &ArrayView1<T>,
        ) -> Result<Array1<T>>
        where
            T: Copy + Default + Send + Sync + std::ops::Add<Output = T> + std::ops::Mul<Output = T>,
        {
            let (rows, cols) = matrix.dim();

            if cols != vector.len() {
                return Err(IoError::ValidationError(
                    "Matrix columns must match vector length".to_string(),
                ));
            }

            let mut result = Array1::default(rows);

            if rows > 100 {
                for (i, res) in result.iter_mut().enumerate() {
                    let mut sum = T::default();
                    for j in 0..cols {
                        sum = sum + matrix[[i, j]] * vector[j];
                    }
                    *res = sum;
                }
            } else {
                for i in 0..rows {
                    let mut sum = T::default();
                    for j in 0..cols {
                        sum = sum + matrix[[i, j]] * vector[j];
                    }
                    result[i] = sum;
                }
            }

            Ok(result)
        }

        fn calculate_optimal_block_size(&self, elementsize: usize) -> usize {
            // Keep one block's working set within roughly half of L2.
            let target_working_set = self.l2_cache_size / 2;
            let elements_per_block = target_working_set / elementsize;
            let block_size = (elements_per_block as f64).sqrt() as usize;

            // Round down to a whole number of cache lines; `.max(1)` guards against
            // element types larger than a cache line.
            let elements_per_line = (self.cache_line_size / elementsize).max(1);
            ((block_size / elements_per_line) * elements_per_line).max(8)
        }

        /// Touch/prefetch a matrix block-by-block to warm the cache before heavy access.
        pub fn optimize_memory_access<T>(&self, data: &mut ArrayViewMut2<T>)
        where
            T: Copy + Default,
        {
            let (rows, cols) = data.dim();

            if rows < 32 || cols < 32 {
                return;
            }

            // The pointer arithmetic below assumes a contiguous row-major view.
            if !data.is_standard_layout() {
                return;
            }

            let block_size = self.calculate_optimal_block_size(std::mem::size_of::<T>());
            let line_step = (self.cache_line_size / std::mem::size_of::<T>()).max(1);

            for i in (0..rows).step_by(block_size) {
                for j in (0..cols).step_by(block_size) {
                    let end_i = (i + block_size).min(rows);
                    let end_j = (j + block_size).min(cols);

                    for ii in i..end_i {
                        for jj in (j..end_j).step_by(line_step) {
                            if ii < rows && jj < cols && (ii * cols + jj) < data.len() {
                                unsafe {
                                    let ptr = data.as_ptr().add(ii * cols + jj);
                                    #[cfg(target_arch = "x86_64")]
                                    {
                                        _mm_prefetch(ptr as *const i8, _MM_HINT_T0);
                                    }
                                    #[cfg(not(target_arch = "x86_64"))]
                                    {
                                        // No portable prefetch; reading the element has a similar effect.
                                        let _ = *ptr;
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    pub fn transpose_simd<T: Copy + Default + Send + Sync>(input: &ArrayView2<T>) -> Array2<T> {
        let processor = CacheOptimizedMatrixProcessor::new();
        processor.transpose_advanced_fast(input)
    }

    pub fn matmul_simd(a: &ArrayView2<f32>, b: &ArrayView2<f32>) -> Result<Array2<f32>> {
        let (m, k) = a.dim();
        let (k2, n) = b.dim();

        if k != k2 {
            return Err(IoError::ValidationError(
                "Matrix dimensions don't match".to_string(),
            ));
        }

        let mut c = Array2::zeros((m, n));
        let block_size = 64;

        for i_block in (0..m).step_by(block_size) {
            for j_block in (0..n).step_by(block_size) {
                for k_block in (0..k).step_by(block_size) {
                    let i_end = (i_block + block_size).min(m);
                    let j_end = (j_block + block_size).min(n);
                    let k_end = (k_block + block_size).min(k);

                    for i in i_block..i_end {
                        for j in j_block..j_end {
                            let mut sum = 0.0f32;
                            for kk in k_block..k_end {
                                sum += a[[i, kk]] * b[[kk, j]];
                            }
                            c[[i, j]] += sum;
                        }
                    }
                }
            }
        }

        Ok(c)
    }

    pub fn elementwise_add_simd(a: &ArrayView2<f32>, b: &ArrayView2<f32>) -> Result<Array2<f32>> {
        if a.dim() != b.dim() {
            return Err(IoError::ValidationError(
                "Array dimensions don't match".to_string(),
            ));
        }

        let mut result = Array2::zeros(a.dim());

        // The slice fast path requires contiguous (standard-layout) inputs.
        if a.len() > 1024 && a.is_standard_layout() && b.is_standard_layout() {
            for ((r, &a_val), &b_val) in result
                .as_slice_mut()
                .unwrap()
                .iter_mut()
                .zip(a.as_slice().unwrap().iter())
                .zip(b.as_slice().unwrap().iter())
            {
                *r = a_val + b_val;
            }
        } else {
            for ((i, j), &a_val) in a.indexed_iter() {
                result[[i, j]] = a_val + b[[i, j]];
            }
        }

        Ok(result)
    }
}

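/// Descriptive statistics over 1-D `f64` arrays: mean, sample variance, min/max, and quantiles.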
pub mod stats_simd {
    use super::*;
    use std::f64;

    pub fn mean_simd(data: &ArrayView1<f64>) -> f64 {
        if data.is_empty() {
            return 0.0;
        }

        // Iterate the view directly so strided (non-contiguous) inputs also work.
        let sum: f64 = data.iter().sum();

        sum / data.len() as f64
    }

    pub fn variance_simd(data: &ArrayView1<f64>) -> f64 {
        if data.len() < 2 {
            return 0.0;
        }

        let mean = mean_simd(data);

        // Sample variance (n - 1 denominator).
        let sum_sq_diff: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum();

        sum_sq_diff / (data.len() - 1) as f64
    }

    pub fn minmax_simd(data: &ArrayView1<f64>) -> (f64, f64) {
        if data.is_empty() {
            return (f64::NAN, f64::NAN);
        }

        let (min, max) = data
            .iter()
            .fold((f64::INFINITY, f64::NEG_INFINITY), |acc, &x| {
                (acc.0.min(x), acc.1.max(x))
            });

        (min, max)
    }

    pub fn quantile_simd(data: &ArrayView1<f64>, q: f64) -> f64 {
        if data.is_empty() || !(0.0..=1.0).contains(&q) {
            return f64::NAN;
        }

        let mut sorted_data = data.to_vec();
        // `total_cmp` gives a total order, so NaN input cannot panic the sort.
        sorted_data.sort_by(|a, b| a.total_cmp(b));

        let index = q * (sorted_data.len() - 1) as f64;
        let lower = index.floor() as usize;
        let upper = index.ceil() as usize;

        if lower == upper {
            sorted_data[lower]
        } else {
            // Linear interpolation between neighbouring order statistics.
            let weight = index - lower as f64;
            sorted_data[lower] * (1.0 - weight) + sorted_data[upper] * weight
        }
    }
}

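/// Byte-level utilities: bounded copies, XOR masking, popcount, and pattern search.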
pub mod binary_simd {
    use super::*;

    pub fn fast_memcopy(src: &[u8], dst: &mut [u8]) -> Result<()> {
        if src.len() != dst.len() {
            return Err(IoError::ValidationError(
                "Source and destination lengths don't match".to_string(),
            ));
        }

        if src.len() > 4096 {
            // Large-buffer path; currently a plain element-wise copy.
            dst.iter_mut().zip(src.iter()).for_each(|(d, &s)| *d = s);
        } else {
            dst.copy_from_slice(src);
        }

        Ok(())
    }

    /// Repeating-key XOR applied in place.
    pub fn xor_simd(data: &mut [u8], key: &[u8]) {
        if key.is_empty() {
            return;
        }
        let key_len = key.len();

        data.iter_mut().enumerate().for_each(|(i, byte)| {
            *byte ^= key[i % key_len];
        });
    }

    pub fn popcount_simd(data: &[u8]) -> usize {
        data.iter().map(|&byte| byte.count_ones() as usize).sum()
    }

    pub fn find_pattern_simd(haystack: &[u8], needle: &[u8]) -> Vec<usize> {
        if needle.is_empty() || haystack.len() < needle.len() {
            return Vec::new();
        }

        let mut positions = Vec::new();

        // Scan every window of the haystack; scanning fixed-size chunks independently
        // would miss matches that straddle a chunk boundary.
        for i in 0..=(haystack.len() - needle.len()) {
            if haystack[i..].starts_with(needle) {
                positions.push(i);
            }
        }

        positions
    }
}

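/// Runtime-dispatched SIMD helper: selects AVX2/SSE/NEON code paths from detected
/// `PlatformCapabilities`, with scalar fallbacks everywhere else.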
pub struct AdvancedSimdProcessor {
    capabilities: PlatformCapabilities,
    optimal_chunk_size: usize,
    vector_width: usize,
}

impl Default for AdvancedSimdProcessor {
    fn default() -> Self {
        Self::new()
    }
}

impl AdvancedSimdProcessor {
    pub fn new() -> Self {
        let capabilities = PlatformCapabilities::detect();
        let vector_width = Self::detect_optimal_vector_width(&capabilities);
        let optimal_chunk_size = Self::calculate_optimal_chunk_size(vector_width);

        Self {
            capabilities,
            optimal_chunk_size,
            vector_width,
        }
    }

    fn detect_optimal_vector_width(capabilities: &PlatformCapabilities) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            if capabilities.avx512_available {
                64 // AVX-512: 64-byte vectors
            } else if capabilities.avx2_available {
                32 // AVX2: 32-byte vectors
            } else if capabilities.simd_available {
                16 // SSE: 16-byte vectors
            } else {
                8 // scalar fallback
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            if capabilities.neon_available {
                16 // NEON: 16-byte vectors
            } else {
                8
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            8
        }
    }

    fn calculate_optimal_chunk_size(vector_width: usize) -> usize {
        // Derive a chunk size from a 24 KiB target, rounded down to a multiple of the
        // vector width.
        let target_size = 24 * 1024;
        let chunk_size = target_size / vector_width;

        (chunk_size / vector_width) * vector_width
    }

    pub fn convert_f64_to_f32_advanced(&self, input: &ArrayView1<f64>) -> Array1<f32> {
        let len = input.len();
        let mut output = Array1::<f32>::zeros(len);

        if len < 64 {
            for (i, &val) in input.iter().enumerate() {
                output[i] = val as f32;
            }
            return output;
        }

        // The vectorized paths need contiguous storage; strided views fall back to an
        // element-wise copy instead of panicking on `unwrap`.
        let input_slice = match input.as_slice() {
            Some(slice) => slice,
            None => {
                for (out, &val) in output.iter_mut().zip(input.iter()) {
                    *out = val as f32;
                }
                return output;
            }
        };
        let output_slice = output.as_slice_mut().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            if self.capabilities.avx2_available {
                self.convert_f64_to_f32_avx2(input_slice, output_slice);
            } else if self.capabilities.simd_available {
                self.convert_f64_to_f32_sse(input_slice, output_slice);
            } else {
                self.convert_f64_to_f32_scalar(input_slice, output_slice);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            if self.capabilities.neon_available {
                self.convert_f64_to_f32_neon(input_slice, output_slice);
            } else {
                self.convert_f64_to_f32_scalar(input_slice, output_slice);
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            self.convert_f64_to_f32_scalar(input_slice, output_slice);
        }

        output
    }

    #[cfg(target_arch = "x86_64")]
    fn convert_f64_to_f32_avx2(&self, input: &[f64], output: &mut [f32]) {
        let len = input.len().min(output.len());
        // Process eight f64 values (two 256-bit loads) per iteration.
        let simd_end = (len / 8) * 8;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                if i + 7 < input.len() && i + 7 < output.len() {
                    let input_ptr = input.as_ptr().add(i);
                    let v1 = _mm256_loadu_pd(input_ptr);
                    let v2 = _mm256_loadu_pd(input_ptr.add(4));

                    // Narrow each 4-lane f64 vector to a 4-lane f32 vector.
                    let f32_lo = _mm256_cvtpd_ps(v1);
                    let f32_hi = _mm256_cvtpd_ps(v2);

                    let combined = _mm256_insertf128_ps(_mm256_castps128_ps256(f32_lo), f32_hi, 1);

                    let output_ptr = output.as_mut_ptr().add(i);
                    _mm256_storeu_ps(output_ptr, combined);
                }
                i += 8;
            }
        }

        for i in simd_end..len {
            output[i] = input[i] as f32;
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn convert_f64_to_f32_sse(&self, input: &[f64], output: &mut [f32]) {
        let len = input.len().min(output.len());
        // Process four f64 values (two 128-bit loads) per iteration.
        let simd_end = (len / 4) * 4;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                let input_ptr = input.as_ptr().add(i);
                let v1 = _mm_loadu_pd(input_ptr);
                let v2 = _mm_loadu_pd(input_ptr.add(2));

                let f32_1 = _mm_cvtpd_ps(v1);
                let f32_2 = _mm_cvtpd_ps(v2);

                // Each conversion fills only the low two lanes; pack them into one vector.
                let combined = _mm_movelh_ps(f32_1, f32_2);
                let output_ptr = output.as_mut_ptr().add(i);
                _mm_storeu_ps(output_ptr, combined);

                i += 4;
            }
        }

        for i in simd_end..len {
            output[i] = input[i] as f32;
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn convert_f64_to_f32_neon(&self, input: &[f64], output: &mut [f32]) {
        let len = input.len().min(output.len());
        // Process four f64 values (two 128-bit loads) per iteration.
        let simd_end = (len / 4) * 4;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                let input_ptr = input.as_ptr().add(i);
                let v1 = vld1q_f64(input_ptr);
                let v2 = vld1q_f64(input_ptr.add(2));

                let f32_1 = vcvt_f32_f64(v1);
                let f32_2 = vcvt_f32_f64(v2);

                let combined = vcombine_f32(f32_1, f32_2);
                let output_ptr = output.as_mut_ptr().add(i);
                vst1q_f32(output_ptr, combined);

                i += 4;
            }
        }

        for i in simd_end..len {
            output[i] = input[i] as f32;
        }
    }

    fn convert_f64_to_f32_scalar(&self, input: &[f64], output: &mut [f32]) {
        let len = input.len().min(output.len());

        if len > 1024 {
            input[..len]
                .iter()
                .zip(output[..len].iter_mut())
                .for_each(|(&inp, out)| {
                    *out = inp as f32;
                });
        } else {
            for (i, &inp) in input.iter().enumerate().take(len) {
                output[i] = inp as f32;
            }
        }
    }

    pub fn transpose_matrix_simd<T>(&self, input: &ArrayView2<T>) -> Array2<T>
    where
        T: Copy + Default + Send + Sync,
    {
        let (rows, cols) = input.dim();
        let mut output = Array2::default((cols, rows));

        if rows < 64 || cols < 64 {
            for i in 0..rows {
                for j in 0..cols {
                    output[[j, i]] = input[[i, j]];
                }
            }
            return output;
        }

        let block_size = self.calculate_transpose_block_size(std::mem::size_of::<T>());

        for i in (0..rows).step_by(block_size) {
            for j in (0..cols).step_by(block_size) {
                let row_end = (i + block_size).min(rows);
                let col_end = (j + block_size).min(cols);

                for ii in i..row_end {
                    for jj in j..col_end {
                        output[[jj, ii]] = input[[ii, jj]];
                    }
                }
            }
        }

        output
    }

    fn calculate_transpose_block_size(&self, elementsize: usize) -> usize {
        // Size blocks against a 128 KiB target and align them to whole cache lines; the
        // `.max` guards keep the result non-zero for unusually large element types.
        let target_cache_size = 128 * 1024;
        let elements_per_cache_line = (64 / elementsize).max(1);
        let block_elements = target_cache_size / elementsize;
        let block_size = (block_elements as f64).sqrt() as usize;

        ((block_size / elements_per_cache_line) * elements_per_cache_line).max(8)
    }

    pub fn memcopy_simd(&self, src: &[u8], dst: &mut [u8]) -> Result<()> {
        if src.len() != dst.len() {
            return Err(IoError::ValidationError(
                "Source and destination lengths don't match".to_string(),
            ));
        }

        let len = src.len();

        if len < 64 {
            dst.copy_from_slice(src);
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        {
            if self.capabilities.avx2_available {
                self.memcopy_avx2(src, dst);
            } else if self.capabilities.simd_available {
                self.memcopy_sse(src, dst);
            } else {
                self.memcopy_parallel(src, dst);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            if self.capabilities.neon_available {
                self.memcopy_neon(src, dst);
            } else {
                self.memcopy_parallel(src, dst);
            }
        }
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            self.memcopy_parallel(src, dst);
        }

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn memcopy_avx2(&self, src: &[u8], dst: &mut [u8]) {
        let len = src.len();
        // Copy 32 bytes per iteration.
        let simd_end = (len / 32) * 32;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                if i + 64 < len {
                    // Prefetch the next cache line of the source.
                    _mm_prefetch(src.as_ptr().add(i + 64) as *const i8, _MM_HINT_T0);
                }

                let data = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
                _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, data);

                i += 32;
            }
        }

        if simd_end < len {
            dst[simd_end..].copy_from_slice(&src[simd_end..]);
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn memcopy_sse(&self, src: &[u8], dst: &mut [u8]) {
        let len = src.len();
        // Copy 16 bytes per iteration.
        let simd_end = (len / 16) * 16;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                if i + 64 < len {
                    _mm_prefetch(src.as_ptr().add(i + 64) as *const i8, _MM_HINT_T0);
                }

                let data = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
                _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, data);

                i += 16;
            }
        }

        if simd_end < len {
            dst[simd_end..].copy_from_slice(&src[simd_end..]);
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn memcopy_neon(&self, src: &[u8], dst: &mut [u8]) {
        let len = src.len();
        // Copy 16 bytes per iteration.
        let simd_end = (len / 16) * 16;

        unsafe {
            let mut i = 0;
            while i < simd_end {
                let data = vld1q_u8(src.as_ptr().add(i));
                vst1q_u8(dst.as_mut_ptr().add(i), data);

                i += 16;
            }
        }

        if simd_end < len {
            dst[simd_end..].copy_from_slice(&src[simd_end..]);
        }
    }

    fn memcopy_parallel(&self, src: &[u8], dst: &mut [u8]) {
        let len = src.len();

        if len > 1024 * 1024 {
            // Large-buffer path; currently a plain element-wise copy.
            dst.iter_mut().zip(src.iter()).for_each(|(d, &s)| *d = s);
        } else {
            dst.copy_from_slice(src);
        }
    }

    pub fn get_optimization_info(&self) -> SimdOptimizationInfo {
        SimdOptimizationInfo {
            vector_width: self.vector_width,
            optimal_chunk_size: self.optimal_chunk_size,
            platform_features: PlatformCapabilities::detect(),
            recommended_threshold: self.calculate_simd_threshold(),
        }
    }

    fn calculate_simd_threshold(&self) -> usize {
        // Below roughly eight vectors' worth of elements, SIMD setup overhead dominates.
        self.vector_width * 8
    }
}

pub struct SimdOptimizationInfo {
    pub vector_width: usize,
    pub optimal_chunk_size: usize,
    pub platform_features: PlatformCapabilities,
    pub recommended_threshold: usize,
}

impl std::fmt::Debug for SimdOptimizationInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SimdOptimizationInfo")
            .field("vector_width", &self.vector_width)
            .field("optimal_chunk_size", &self.optimal_chunk_size)
            .field("platform_features", &"<PlatformCapabilities>")
            .field("recommended_threshold", &self.recommended_threshold)
            .finish()
    }
}

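/// Convenience wrappers that pair (currently stubbed) file I/O with a caller-supplied
/// processing closure.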
pub struct SimdIoAccelerator;

impl SimdIoAccelerator {
    pub fn read_and_process_f64(
        _path: &std::path::Path,
        processor: impl Fn(&ArrayView1<f64>) -> Array1<f64>,
    ) -> Result<Array1<f64>> {
        // Placeholder: synthesizes data instead of reading from `_path`.
        let mock_data = Array1::from_vec((0..1000).map(|x| x as f64).collect());
        Ok(processor(&mock_data.view()))
    }

    pub fn preprocess_and_write_f64(
        data: &ArrayView1<f64>,
        _path: &std::path::Path,
        preprocessor: impl Fn(&ArrayView1<f64>) -> Array1<f64>,
    ) -> Result<()> {
        let processed = preprocessor(data);
        // Placeholder: validates the preprocessed length but does not write to `_path`.
        if processed.len() == data.len() {
            Ok(())
        } else {
            Err(IoError::Other(
                "Preprocessing changed data length".to_string(),
            ))
        }
    }

    pub fn batch_process<T>(
        arrays: &[ArrayView1<T>],
        processor: impl Fn(&ArrayView1<T>) -> Array1<T> + Send + Sync,
    ) -> Vec<Array1<T>>
    where
        T: Copy + Send + Sync,
    {
        arrays.iter().map(processor).collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::array;

    #[test]
    fn test_convert_f64_to_f32() {
        let input = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let result = SimdIoProcessor::convert_f64_to_f32(&input.view());
        assert_eq!(result.len(), 5);
        assert!((result[0] - 1.0f32).abs() < 1e-6);
        assert!((result[4] - 5.0f32).abs() < 1e-6);
    }

    #[test]
    fn test_normalize_audio() {
        let mut data = array![0.5, -1.0, 0.25, -0.75];

        let max_val = data.iter().map(|&x: &f32| x.abs()).fold(0.0f32, f32::max);
        if max_val > 0.0 {
            let scale = 1.0f32 / max_val;
            data.mapv_inplace(|x| x * scale);
        }

        // The peak sample (-1.0) keeps its magnitude; the rest scale proportionally.
        assert!((data[1] - (-1.0f32)).abs() < 1e-6f32);
        assert!((data[0] - 0.5f32).abs() < 1e-6f32);
    }

    #[test]
    fn test_checksum() {
        let data = b"Hello, World!";
        let checksum = SimdIoProcessor::checksum_simd(data);
        assert!(checksum > 0);
    }
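
    // Added example: round-trip and run-length checks for `compression_simd`, using only
    // the public helpers defined above; illustrative rather than exhaustive.
    #[test]
    fn test_delta_roundtrip_and_rle() {
        let original = array![1.0, 3.0, 6.0, 10.0];
        let encoded = compression_simd::delta_encode_simd(&original.view());
        let decoded = compression_simd::delta_decode_simd(&encoded.view());
        assert_eq!(decoded, original);

        let runs = compression_simd::rle_encode_simd(b"aaabbc");
        assert_eq!(runs, vec![(b'a', 3), (b'b', 2), (b'c', 1)]);
    }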

    #[test]
    fn test_advanced_simd_processor() {
        let processor = AdvancedSimdProcessor::new();
        let info = processor.get_optimization_info();

        assert!(info.vector_width >= 8);
        assert!(info.optimal_chunk_size > 0);
        assert!(info.recommended_threshold > 0);
    }

    #[test]
    fn test_optimized_conversion() {
        let processor = AdvancedSimdProcessor::new();
        let input = Array1::from_vec((0..1000).map(|x| x as f64 * 0.1).collect());
        let result = processor.convert_f64_to_f32_advanced(&input.view());

        assert_eq!(result.len(), 1000);
        assert!((result[0] - 0.0f32).abs() < 1e-6);
        assert!((result[999] - 99.9f32).abs() < 1e-3);
    }
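
    // Added example: sanity-check the `stats_simd` helpers on a small, hand-computable
    // dataset (mean 3.0, sample variance 2.5, min/max 1.0/5.0, median 3.0).
    #[test]
    fn test_stats_simd_basic() {
        let data = array![1.0, 2.0, 3.0, 4.0, 5.0];
        assert!((stats_simd::mean_simd(&data.view()) - 3.0).abs() < 1e-12);
        assert!((stats_simd::variance_simd(&data.view()) - 2.5).abs() < 1e-12);
        assert_eq!(stats_simd::minmax_simd(&data.view()), (1.0, 5.0));
        assert!((stats_simd::quantile_simd(&data.view(), 0.5) - 3.0).abs() < 1e-12);
    }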

    #[test]
    fn test_simd_matrix_transpose() {
        let processor = AdvancedSimdProcessor::new();
        let input = Array2::from_shape_fn((100, 80), |(i, j)| (i * 80 + j) as f64);
        let result = processor.transpose_matrix_simd(&input.view());

        assert_eq!(result.dim(), (80, 100));
        assert_eq!(result[[0, 0]], input[[0, 0]]);
        assert_eq!(result[[79, 99]], input[[99, 79]]);
    }

    #[test]
    fn test_simd_memcopy() {
        let processor = AdvancedSimdProcessor::new();
        let src: Vec<u8> = (0..1024).map(|x| (x % 256) as u8).collect();
        let mut dst = vec![0u8; 1024];

        processor.memcopy_simd(&src, &mut dst).unwrap();
        assert_eq!(src, dst);
    }
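
    // Added example: delimiter scanning and float parsing from `csv_simd`; positions and
    // values are easy to verify by hand.
    #[test]
    fn test_csv_simd_helpers() {
        let positions = csv_simd::find_delimiters_simd(b"a,b,c,d", b',');
        assert_eq!(positions, vec![1, 3, 5]);

        let values = csv_simd::parse_floats_simd(&["1.5", "-2.0", "3.25"]).unwrap();
        assert_eq!(values, vec![1.5, -2.0, 3.25]);
        assert!(csv_simd::parse_floats_simd(&["not-a-number"]).is_err());
    }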

    #[test]
    fn test_cache_optimized_matrix_processor() {
        let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();

        let input = Array2::from_shape_fn((64, 48), |(i, j)| (i * 48 + j) as f64);
        let result = processor.transpose_advanced_fast(&input.view());

        assert_eq!(result.dim(), (48, 64));
        assert_eq!(result[[0, 0]], input[[0, 0]]);
        assert_eq!(result[[47, 63]], input[[63, 47]]);
    }
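
    // Added example: byte utilities from `binary_simd` (XOR round trip, popcount, and
    // pattern search), checked against hand-computed expectations.
    #[test]
    fn test_binary_simd_helpers() {
        let mut data = vec![1u8, 2, 3, 4];
        let key = [0xAAu8, 0x55];
        binary_simd::xor_simd(&mut data, &key);
        binary_simd::xor_simd(&mut data, &key);
        assert_eq!(data, vec![1, 2, 3, 4]);

        assert_eq!(binary_simd::popcount_simd(&[0xFFu8, 0x0F]), 12);
        assert_eq!(binary_simd::find_pattern_simd(b"abcabcab", b"abc"), vec![0, 3]);
    }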

    #[test]
    fn test_blocked_matrix_multiply() {
        let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();

        let a = Array2::from_shape_fn((32, 24), |(i, j)| (i + j) as f64);
        let b = Array2::from_shape_fn((24, 16), |(i, j)| (i * j + 1) as f64);

        let result = processor
            .matrix_multiply_blocked(&a.view(), &b.view())
            .unwrap();
        assert_eq!(result.dim(), (32, 16));

        let expected_00 = (0..24).map(|k| a[[0, k]] * b[[k, 0]]).sum::<f64>();
        assert!((result[[0, 0]] - expected_00).abs() < 1e-10);
    }

    #[test]
    fn test_matrix_vector_multiply_simd() {
        let processor = matrix_simd::CacheOptimizedMatrixProcessor::new();

        let matrix = Array2::from_shape_fn((10, 8), |(i, j)| (i + j + 1) as f64);
        let vector = Array1::from_shape_fn(8, |i| (i + 1) as f64);

        let result = processor
            .matrix_vector_multiply_simd(&matrix.view(), &vector.view())
            .unwrap();
        assert_eq!(result.len(), 10);

        let expected_0 = (0..8).map(|j| matrix[[0, j]] * vector[j]).sum::<f64>();
        assert!((result[0] - expected_0).abs() < 1e-10);
    }
}